objstr.c 24.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
#include <stdlib.h>
#include <stdint.h>
#include <stdarg.h>
#include <string.h>
#include <assert.h>

#include "nlr.h"
#include "misc.h"
#include "mpconfig.h"
10
#include "qstr.h"
11
12
13
14
15
16
#include "obj.h"
#include "runtime0.h"
#include "runtime.h"

typedef struct _mp_obj_str_t {
    mp_obj_base_t base;
17
18
19
    machine_uint_t hash : 16; // XXX here we assume the hash size is 16 bits (it is at the moment; see qstr.c)
    machine_uint_t len : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
    byte data[];
20
21
} mp_obj_str_t;

22
23
24
25
26
27
28
29
30
// NOTE: each of these macros expands to variable declarations followed by an
// if/else statement, so it must be used as a statement at block scope and the
// names passed in must not already exist in that scope.  The object argument
// is parenthesised inside the casts so that any expression may be passed.

// use this macro to extract the string hash
// (declares `uint str_hash`)
#define GET_STR_HASH(str_obj_in, str_hash) \
    uint str_hash; \
    if (MP_OBJ_IS_QSTR(str_obj_in)) { \
        str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); \
    } else { \
        str_hash = ((mp_obj_str_t*)(str_obj_in))->hash; \
    }

// use this macro to extract the string length
// (declares `uint str_len`)
#define GET_STR_LEN(str_obj_in, str_len) \
    uint str_len; \
    if (MP_OBJ_IS_QSTR(str_obj_in)) { \
        str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); \
    } else { \
        str_len = ((mp_obj_str_t*)(str_obj_in))->len; \
    }

// use this macro to extract the string data and length
// (declares `const byte *str_data` and `uint str_len`)
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) \
    const byte *str_data; \
    uint str_len; \
    if (MP_OBJ_IS_QSTR(str_obj_in)) { \
        str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); \
    } else { \
        str_len = ((mp_obj_str_t*)(str_obj_in))->len; \
        str_data = ((mp_obj_str_t*)(str_obj_in))->data; \
    }

31
32
// Forward declarations of the iterator constructors; they are defined at the
// end of this file but referenced by the str and bytes type tables before that.
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
xyb's avatar
xyb committed
33
34
35
36

/******************************************************************************/
/* str                                                                        */

37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
// Print str_data[0..str_len) as a quoted, escaped literal through `print`.
// The quote character is ' unless the data contains ' but no ", in which case
// " is used.  The quote character and backslash are backslash-escaped, newline
// becomes \n, other bytes outside 32..126 become \xNN.
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
    // this escapes characters, but it will be very slow to print (calling print many times)
    const byte *const end = str_data + str_len;

    // look for quote characters; stop as soon as both kinds have been seen
    bool seen_single = false;
    bool seen_double = false;
    for (const byte *p = str_data; !(seen_single && seen_double) && p < end; p++) {
        switch (*p) {
            case '\'':
                seen_single = true;
                break;
            case '"':
                seen_double = true;
                break;
            default:
                break;
        }
    }

    const int quote_char = (seen_single && !seen_double) ? '"' : '\'';

    print(env, "%c", quote_char);
    for (const byte *p = str_data; p < end; p++) {
        const byte c = *p;
        if (c == quote_char) {
            print(env, "\\%c", quote_char);
        } else if (c == '\\') {
            print(env, "\\\\");
        } else if (c >= 32 && c <= 126) {
            print(env, "%c", c);
        } else if (c == '\n') {
            print(env, "\\n");
        // TODO add more escape codes here if we want to match CPython
        } else {
            print(env, "\\x%02x", c);
        }
    }
    print(env, "%c", quote_char);
}

70
// Print handler shared by str and bytes.  A str printed with PRINT_STR is
// emitted raw; everything else is emitted quoted, with a leading b for bytes.
STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
    GET_STR_DATA_LEN(self_in, str_data, str_len);
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &bytes_type);

    if (!is_bytes && kind == PRINT_STR) {
        // plain text output: no quotes, no escaping
        print(env, "%.*s", str_len, str_data);
        return;
    }

    if (is_bytes) {
        print(env, "b");
    }
    mp_str_print_quoted(print, env, str_data, str_len);
}

83
84
// like strstr but with specified length and allows \0 bytes
// TODO replace with something more efficient/standard
85
STATIC const byte *find_subbytes(const byte *haystack, uint hlen, const byte *needle, uint nlen) {
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
    if (hlen >= nlen) {
        for (uint i = 0; i <= hlen - nlen; i++) {
            bool found = true;
            for (uint j = 0; j < nlen; j++) {
                if (haystack[i + j] != needle[j]) {
                    found = false;
                    break;
                }
            }
            if (found) {
                return haystack + i;
            }
        }
    }
    return NULL;
}

103
// Binary operator handler shared by str and bytes.
//   op      one of the RT_BINARY_OP_* codes
//   lhs_in  the str/bytes object the operator is applied to
//   rhs_in  the other operand
// Returns the result object, or MP_OBJ_NULL if the operation is not supported
// for the given operand.  Subscripting with something that is neither a small
// int nor (when enabled) a slice raises TypeError.
STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
    switch (op) {
        case RT_BINARY_OP_SUBSCR:
            // TODO: need predicate to check for int-like type (bools are such for example)
            // ["no", "yes"][1 == 2] is common idiom
            if (MP_OBJ_IS_SMALL_INT(rhs_in)) {
                uint index = mp_get_index(mp_obj_get_type(lhs_in), lhs_len, rhs_in, false);
                if (MP_OBJ_IS_TYPE(lhs_in, &bytes_type)) {
                    // indexing bytes gives the byte value as an integer
                    return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)lhs_data[index]);
                } else {
                    // indexing str gives a 1-character string
                    return mp_obj_new_str(lhs_data + index, 1, true);
                }
#if MICROPY_ENABLE_SLICE
            } else if (MP_OBJ_IS_TYPE(rhs_in, &slice_type)) {
                machine_uint_t start, stop;
                if (!m_seq_get_fast_slice_indexes(lhs_len, rhs_in, &start, &stop)) {
                    assert(0);
                }
                return mp_obj_new_str(lhs_data + start, stop - start, false);
#endif
            } else {
                // Message doesn't match CPython, but we don't have so much bytes as they
                // to spend them on verbose wording
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "index must be int"));
            }

        case RT_BINARY_OP_ADD:
        case RT_BINARY_OP_INPLACE_ADD:
            if (MP_OBJ_IS_STR(rhs_in)) {
                // add 2 strings
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                int alloc_len = lhs_len + rhs_len;

                // The result is built as a non-qstr object (the alternative
                // would be qstr_build_start/qstr_build_end).
                byte *data;
                mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), alloc_len, &data);
                memcpy(data, lhs_data, lhs_len);
                memcpy(data + lhs_len, rhs_data, rhs_len);
                return mp_obj_str_builder_end(s);
            }
            break;

        case RT_BINARY_OP_IN:
            /* NOTE `a in b` is `b.__contains__(a)` */
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                return MP_BOOL(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len) != NULL);
            }
            break;

        case RT_BINARY_OP_MULTIPLY:
        {
            if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
                return MP_OBJ_NULL; // op not supported
            }
            int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
            if (n < 0) {
                // A negative repeat count must behave like 0 (empty result).
                // Without this clamp lhs_len * n wraps to a huge unsigned
                // allocation length.
                n = 0;
            }
            byte *data;
            mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), lhs_len * n, &data);
            mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, data);
            return mp_obj_str_builder_end(s);
        }

        // These 2 are never passed here, dealt with as a special case in rt_binary_op().
        //case RT_BINARY_OP_EQUAL:
        //case RT_BINARY_OP_NOT_EQUAL:
        case RT_BINARY_OP_LESS:
        case RT_BINARY_OP_LESS_EQUAL:
        case RT_BINARY_OP_MORE:
        case RT_BINARY_OP_MORE_EQUAL:
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                return MP_BOOL(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));
            }
            break;
    }

    return MP_OBJ_NULL; // op not supported
}

191
// str.join(arg): concatenate the items of a tuple or list, placing self
// between consecutive items.  Raises TypeError if arg is not a tuple/list or
// if any item fails MP_OBJ_IS_STR.
STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
    assert(MP_OBJ_IS_STR(self_in));

    // self is the separator
    GET_STR_DATA_LEN(self_in, sep_str, sep_len);

    // fetch the items to join
    uint seq_len;
    mp_obj_t *seq_items;
    if (MP_OBJ_IS_TYPE(arg, &tuple_type)) {
        mp_obj_tuple_get(arg, &seq_len, &seq_items);
    } else if (MP_OBJ_IS_TYPE(arg, &list_type)) {
        mp_obj_list_get(arg, &seq_len, &seq_items);
    } else {
        goto bad_arg;
    }

    // first pass: validate the items and add up the space needed
    int required_len = 0;
    for (int idx = 0; idx < seq_len; idx++) {
        mp_obj_t item = seq_items[idx];
        if (!MP_OBJ_IS_STR(item)) {
            goto bad_arg;
        }
        if (idx > 0) {
            required_len += sep_len;
        }
        GET_STR_LEN(item, item_len);
        required_len += item_len;
    }

    // second pass: copy separator and items into the new string
    byte *data;
    mp_obj_t joined_str = mp_obj_str_builder_start(mp_obj_get_type(self_in), required_len, &data);
    byte *dest = data;
    for (int idx = 0; idx < seq_len; idx++) {
        if (idx > 0) {
            memcpy(dest, sep_str, sep_len);
            dest += sep_len;
        }
        GET_STR_DATA_LEN(seq_items[idx], item_data, item_len);
        memcpy(dest, item_data, item_len);
        dest += item_len;
    }

    return mp_obj_str_builder_end(joined_str);

bad_arg:
    nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "?str.join expecting a list of str's"));
}

Paul Sokolovsky's avatar
Paul Sokolovsky committed
241
242
// True if c is an ASCII whitespace character.  This is the same set that
// str_strip removes by default, so split() also splits on newlines etc.
// NOTE: evaluates its argument several times; do not pass expressions with
// side effects.
#define is_ws(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r' || (c) == '\v' || (c) == '\f')

243
// str.split([sep[, maxsplit]]): split on runs of whitespace and return a list.
// Only sep == None is implemented (asserted).  args[2], if given, limits the
// number of splits; the unsplit remainder becomes the last list item.
STATIC mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
    int splits = -1;
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
    }
    if (n_args > 2) {
        splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
    }
    assert(sep == mp_const_none);
    (void)sep; // unused; to hush compiler warning

    mp_obj_t res = mp_obj_new_list(0, NULL);
    GET_STR_DATA_LEN(args[0], str_data, str_len);
    const byte *cur = str_data;
    const byte *const top = str_data + str_len;

    // Initial whitespace is not counted as split, so we pre-do it
    for (; cur < top && is_ws(*cur); cur++) {
    }

    while (cur < top && splits != 0) {
        // collect one word
        const byte *word = cur;
        for (; cur < top && !is_ws(*cur); cur++) {
        }
        rt_list_append(res, mp_obj_new_str(word, cur - word, false));
        if (cur >= top) {
            break;
        }
        // skip the whitespace run that follows it
        for (; cur < top && is_ws(*cur); cur++) {
        }
        if (splits > 0) {
            splits--;
        }
    }

    // whatever is left once the split budget is used up goes in unsplit
    if (cur < top) {
        rt_list_append(res, mp_obj_new_str(cur, top - cur, false));
    }

    return res;
}

281
// str.find(sub[, start[, end]]): index of the first occurrence of sub at or
// after start that ends no later than end, or -1 if there is none.
STATIC mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

    size_t start = 0;
    size_t end = haystack_len;
    /* TODO use a non-exception-throwing mp_get_index */
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3], true);
    }

    // search from start to the end of the string, then reject a hit that
    // extends beyond the requested end
    machine_int_t pos = -1;
    const byte *hit = find_subbytes(haystack + start, haystack_len - start, needle, needle_len);
    if (hit != NULL) {
        pos = hit - haystack;
        if (pos + needle_len > end) {
            pos = -1;
        }
    }
    return MP_OBJ_NEW_SMALL_INT(pos);
}

313
// TODO: (Much) more variety in args
314
STATIC mp_obj_t str_startswith(mp_obj_t self_in, mp_obj_t arg) {
315
316
317
318
319
320
321
322
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, prefix, prefix_len);
    if (prefix_len > str_len) {
        return mp_const_false;
    }
    return MP_BOOL(memcmp(str, prefix, prefix_len) == 0);
}

323
// Return true if any of the str_len bytes starting at str equals c.
STATIC bool chr_in_str(const byte* const str, const size_t str_len, int c) {
    const byte *const end = str + str_len;
    for (const byte *p = str; p < end; p++) {
        if (*p == c) {
            return true;
        }
    }
    return false;
}

332
// str.strip([chars]): return the string with leading and trailing characters
// removed.  With one argument the characters removed are ASCII whitespace;
// otherwise they are the characters of args[1].  Returns '' when nothing
// remains.
STATIC mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
    assert(1 <= n_args && n_args <= 2);
    assert(MP_OBJ_IS_STR(args[0]));

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";

    if (n_args == 1) {
        chars_to_del = whitespace;
        // -1 to leave out the literal's terminating NUL; otherwise '\0' bytes
        // in the string would be treated as whitespace and stripped
        chars_to_del_len = sizeof(whitespace) - 1;
    } else {
        assert(MP_OBJ_IS_STR(args[1]));
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
    }

    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);

    // find the first and last characters that are to be kept
    size_t first_good_char_pos = 0;
    bool first_good_char_pos_set = false;
    size_t last_good_char_pos = 0;
    for (size_t i = 0; i < orig_str_len; i++) {
        if (!chr_in_str(chars_to_del, chars_to_del_len, orig_str[i])) {
            last_good_char_pos = i;
            if (!first_good_char_pos_set) {
                first_good_char_pos = i;
                first_good_char_pos_set = true;
            }
        }
    }

    // Test the flag rather than the positions: both positions are also 0 when
    // the only character to keep sits at index 0 (e.g. "a  ").
    if (!first_good_char_pos_set) {
        // string is empty or consists only of characters to delete, return ''
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
    }

    assert(last_good_char_pos >= first_good_char_pos);
    //+1 to accommodate the last character
    size_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
    return mp_obj_new_str(orig_str + first_good_char_pos, stripped_len, false);
}

376
// str.format(*args): args[0] is the format string.  "{{" produces a literal
// '{'.  Any other '{' starts a replacement field: everything up to and
// including the next '}' is skipped (its contents are currently ignored) and
// the next positional argument is printed in its place.  Raises IndexError
// when there are more fields than arguments.
mp_obj_t str_format(uint n_args, const mp_obj_t *args) {
    assert(MP_OBJ_IS_STR(args[0]));

    GET_STR_DATA_LEN(args[0], str, len);
    const byte *const top = str + len;
    uint arg_i = 1;
    vstr_t *vstr = vstr_new();
    while (str < top) {
        const byte c = *str++;
        if (c != '{') {
            // ordinary character, copied through
            vstr_add_char(vstr, c);
            continue;
        }
        if (str < top && *str == '{') {
            // escaped brace
            vstr_add_char(vstr, '{');
            str++;
            continue;
        }

        // replacement field: skip to the closing '}'
        while (str < top && *str != '}') {
            str++;
        }
        if (str < top) {
            // Step over the '}' only if one was found, so that an
            // unterminated field never moves str beyond top.
            str++;
        }

        if (arg_i >= n_args) {
            nlr_jump(mp_obj_new_exception_msg(&mp_type_IndexError, "tuple index out of range"));
        }
        // TODO: may be PRINT_REPR depending on formatting code
        mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[arg_i], PRINT_STR);
        arg_i++;
    }

    mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
    vstr_free(vstr);
    return s;
}

406
// str.replace(old, new[, count]): return a copy of the string with occurrences
// of old replaced by new.  A count of 0 returns the original object; a
// negative or missing count replaces every occurrence.  The original object is
// also returned when old does not occur at all.  An empty old matches before
// every character and at the end of the string.
STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
    assert(MP_OBJ_IS_STR(args[2]));

    machine_int_t max_rep = 0;
    if (n_args == 4) {
        assert(MP_OBJ_IS_SMALL_INT(args[3]));
        max_rep = MP_OBJ_SMALL_INT_VALUE(args[3]);
        if (max_rep == 0) {
            return args[0];
        } else if (max_rep < 0) {
            max_rep = 0;
        }
    }

    // if max_rep is still 0 by this point we will need to do all possible replacements

    GET_STR_DATA_LEN(args[0], str, str_len);
    GET_STR_DATA_LEN(args[1], old, old_len);
    GET_STR_DATA_LEN(args[2], new, new_len);

    // old won't exist in str if it's longer, so nothing to replace
    if (old_len > str_len) {
        return args[0];
    }

    // data for the replaced string
    byte *data = NULL;
    mp_obj_t replaced_str = MP_OBJ_NULL;

    // do 2 passes over the string:
    //   first pass computes the required length of the replaced string
    //   second pass does the replacements
    for (;;) {
        machine_uint_t replaced_str_index = 0;
        machine_uint_t num_replacements_done = 0;
        const byte *offset_ptr = str;
        machine_uint_t offset_num = 0;
        for (;;) {
            const byte *old_occurrence;
            if (old_len == 0) {
                // the empty pattern matches right at the current position
                old_occurrence = offset_ptr;
            } else {
                old_occurrence = find_subbytes(offset_ptr, str_len - offset_num, old, old_len);
                if (old_occurrence == NULL) {
                    break;
                }
            }

            // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
            if (data != NULL) {
                memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
            }
            replaced_str_index += old_occurrence - offset_ptr;
            // copy the replacement string
            if (data != NULL) {
                memcpy(data + replaced_str_index, new, new_len);
            }
            replaced_str_index += new_len;
            offset_ptr = old_occurrence + old_len;
            offset_num = offset_ptr - str;

            num_replacements_done++;
            if (max_rep != 0 && num_replacements_done == max_rep) {
                break;
            }

            if (old_len == 0) {
                // An empty match consumes nothing, so without this step the
                // scan would never advance.  Stop after the match at the end
                // of the string, otherwise pass one byte through unchanged.
                if (offset_num == str_len) {
                    break;
                }
                if (data != NULL) {
                    data[replaced_str_index] = *offset_ptr;
                }
                replaced_str_index++;
                offset_ptr++;
                offset_num++;
            }
        }

        // copy from just after end of last occurrence of to-be-replaced string to end of old string
        if (data != NULL) {
            memcpy(data + replaced_str_index, offset_ptr, str_len - offset_num);
        }
        replaced_str_index += str_len - offset_num;

        if (data == NULL) {
            // first pass
            if (num_replacements_done == 0) {
                // no substr found, return original string
                return args[0];
            } else {
                // substr found, allocate new string
                replaced_str = mp_obj_str_builder_start(mp_obj_get_type(args[0]), replaced_str_index, &data);
            }
        } else {
            // second pass, we are done
            break;
        }
    }

    return mp_obj_str_builder_end(replaced_str);
}

490
491
492
493
494
495
496
497
// str.count(sub[, start[, end]]): number of non-overlapping occurrences of sub
// that lie entirely within [start, end).  An empty sub counts every gap
// between characters in that range, i.e. end - start + 1, or 0 when the range
// is inverted.
STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
    /* TODO use a non-exception-throwing mp_get_index */
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3], true);
    }

    // if needle_len is zero then we count each gap between characters as an occurrence
    if (needle_len == 0) {
        if (start > end) {
            // inverted range: the unsigned subtraction below would wrap around
            return MP_OBJ_NEW_SMALL_INT(0);
        }
        return MP_OBJ_NEW_SMALL_INT(end - start + 1);
    }

    // count the occurrences
    machine_int_t num_occurrences = 0;
    for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) {
        if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) {
            num_occurrences++;
            // jump past this match (the loop increment adds the final +1) so
            // that overlapping matches are not counted
            haystack_index += needle_len - 1;
        }
    }

    return MP_OBJ_NEW_SMALL_INT(num_occurrences);
}

525
526
527
528
529
530
531
532
533
534
535
536
537
538
// Buffer-protocol hook.  For BUFFER_READ it fills bufinfo with the string's
// data pointer and length and returns 0; for any other flags value it clears
// bufinfo and returns 1.
STATIC machine_int_t str_get_buffer(mp_obj_t self_in, buffer_info_t *bufinfo, int flags) {
    if (flags != BUFFER_READ) {
        // can't write to a string
        bufinfo->buf = NULL;
        bufinfo->len = 0;
        return 1;
    }

    GET_STR_DATA_LEN(self_in, str_data, str_len);
    bufinfo->buf = (void*)str_data;
    bufinfo->len = str_len;
    return 0;
}

539
540
541
542
543
544
545
// Function objects wrapping the method implementations above (alphabetical).
// The numbers are the argument-count arguments of the defining macros; the
// string itself is always args[0].
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, str_format);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_startswith_obj, str_startswith);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
547

548
STATIC const mp_method_t str_type_methods[] = {
549
    { "find", &str_find_obj },
ian-v's avatar
ian-v committed
550
    { "join", &str_join_obj },
Paul Sokolovsky's avatar
Paul Sokolovsky committed
551
    { "split", &str_split_obj },
552
    { "startswith", &str_startswith_obj },
xbe's avatar
xbe committed
553
    { "strip", &str_strip_obj },
ian-v's avatar
ian-v committed
554
    { "format", &str_format_obj },
555
    { "replace", &str_replace_obj },
556
    { "count", &str_count_obj },
ian-v's avatar
ian-v committed
557
558
    { NULL, NULL }, // end-of-list sentinel
};
559

560
const mp_obj_type_t str_type = {
561
    { &mp_type_type },
562
    .name = MP_QSTR_str,
563
564
    .print = str_print,
    .binary_op = str_binary_op,
565
566
    .getiter = mp_obj_new_str_iterator,
    .methods = str_type_methods,
567
    .buffer_p = { .get_buffer = str_get_buffer },
568
569
570
571
};

// Reuses most of methods from str
const mp_obj_type_t bytes_type = {
572
    { &mp_type_type },
573
    .name = MP_QSTR_bytes,
574
575
576
    .print = str_print,
    .binary_op = str_binary_op,
    .getiter = mp_obj_new_bytes_iterator,
ian-v's avatar
ian-v committed
577
    .methods = str_type_methods,
578
579
};

580
// Begin building a string object of the given type with room for len bytes.
// *data is set to the object's data area for the caller to fill in; the hash
// and the trailing null byte are written later by mp_obj_str_builder_end.
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
    // +1 leaves room for the trailing null byte
    mp_obj_str_t *str_obj = m_new_obj_var(mp_obj_str_t, byte, len + 1);
    str_obj->len = len;
    str_obj->base.type = type;
    *data = str_obj->data;
    return str_obj;
}

// Finish a string started with mp_obj_str_builder_start: compute the hash over
// the data the caller filled in and append the null terminator.  Returns the
// same object.
mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
    // The object is accessed as an mp_obj_str_t below, so it must not be a
    // qstr value.  Builders are started with either the str or the bytes type
    // (str_binary_op passes the type of its left operand), so both are valid.
    assert(!MP_OBJ_IS_QSTR(o_in) && (MP_OBJ_IS_TYPE(o_in, &str_type) || MP_OBJ_IS_TYPE(o_in, &bytes_type)));
    mp_obj_str_t *o = o_in;
    o->hash = qstr_compute_hash(o->data, o->len);
    o->data[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    return o;
}

596
// Allocate a new string object of the given type holding a copy of
// data[0..len), with its hash computed and a null byte appended.
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len) {
    mp_obj_str_t *str_obj = m_new_obj_var(mp_obj_str_t, byte, len + 1);
    str_obj->base.type = type;
    str_obj->len = len;
    str_obj->hash = qstr_compute_hash(data, len);
    memcpy(str_obj->data, data, len * sizeof(byte));
    // for now we add null for compatibility with C ASCIIZ strings
    str_obj->data[len] = '\0';
    return str_obj;
}

606
607
608
609
610
611
612
613
614
615
// Create a str object for data[0..len).  If a qstr with this data already
// exists it is returned; otherwise a new qstr is made when
// make_qstr_if_not_already is true, or a separate string object when false.
mp_obj_t mp_obj_new_str(const byte* data, uint len, bool make_qstr_if_not_already) {
    qstr existing = qstr_find_strn(data, len);
    if (existing != MP_QSTR_NULL) {
        // qstr with this data already exists
        return MP_OBJ_NEW_QSTR(existing);
    }
    if (!make_qstr_if_not_already) {
        // no existing qstr, don't make one
        return str_new(&str_type, data, len);
    }
    // no existing qstr, make a new one
    return MP_OBJ_NEW_QSTR(qstr_from_strn((const char*)data, len));
}

620
621
622
623
// Create a bytes object holding a copy of data[0..len).
mp_obj_t mp_obj_new_bytes(const byte* data, uint len) {
    mp_obj_t bytes_obj = str_new(&bytes_type, data, len);
    return bytes_obj;
}

624
625
626
627
628
629
630
631
632
633
634
635
636
637
// Return true if the two string objects hold the same bytes.
bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
    if (MP_OBJ_IS_QSTR(s1) && MP_OBJ_IS_QSTR(s2)) {
        // both are qstrs: compare the object values directly
        return s1 == s2;
    }

    // cheap rejections first: differing hash, then differing length
    GET_STR_HASH(s1, h1);
    GET_STR_HASH(s2, h2);
    if (h1 != h2) {
        return false;
    }
    GET_STR_DATA_LEN(s1, d1, l1);
    GET_STR_DATA_LEN(s2, d2, l2);
    return l1 == l2 && memcmp(d1, d2, l1) == 0;
}

642
643
// Raise TypeError naming the type of self_in; used when a non-string is passed
// where a string is required.  Declared noreturn.
void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn));
void bad_implicit_conversion(mp_obj_t self_in) {
    const char *type_name = mp_obj_get_type_str(self_in);
    nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "Can't convert '%s' object to str implicitly", type_name));
}

647
648
649
650
651
// Return the hash of a string object; raises TypeError for anything else.
uint mp_obj_str_get_hash(mp_obj_t self_in) {
    if (!MP_OBJ_IS_STR(self_in)) {
        bad_implicit_conversion(self_in); // does not return
    }
    GET_STR_HASH(self_in, h);
    return h;
}

// Return the length in bytes of a string object; raises TypeError for
// anything else.
uint mp_obj_str_get_len(mp_obj_t self_in) {
    if (!MP_OBJ_IS_STR(self_in)) {
        bad_implicit_conversion(self_in); // does not return
    }
    GET_STR_LEN(self_in, l);
    return l;
}

// use this if you will anyway convert the string to a qstr
// will be more efficient for the case where it's already a qstr
// Raises TypeError if self_in is neither a qstr nor a str_type object.
qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
    if (MP_OBJ_IS_QSTR(self_in)) {
        return MP_OBJ_QSTR_VALUE(self_in);
    }
    if (!MP_OBJ_IS_TYPE(self_in, &str_type)) {
        bad_implicit_conversion(self_in); // does not return
    }
    mp_obj_str_t *self = self_in;
    return qstr_from_strn((char*)self->data, self->len);
}

// only use this function if you need the str data to be zero terminated
// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
// Raises TypeError if self_in is not a string object.
const char *mp_obj_str_get_str(mp_obj_t self_in) {
    if (!MP_OBJ_IS_STR(self_in)) {
        bad_implicit_conversion(self_in); // does not return
    }
    GET_STR_DATA_LEN(self_in, s, l);
    (void)l; // len unused
    return (const char*)s;
}

690
// Return a pointer to the string's data and store its length in *len.
// Raises TypeError if self_in is not a string object.
const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
    if (!MP_OBJ_IS_STR(self_in)) {
        bad_implicit_conversion(self_in); // does not return
    }
    GET_STR_DATA_LEN(self_in, s, l);
    *len = l;
    return (const char*)s;
}
xyb's avatar
xyb committed
699
700
701
702
703
704

/******************************************************************************/
/* str iterator                                                               */

typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;
705
    mp_obj_t str;
xyb's avatar
xyb committed
706
707
708
    machine_uint_t cur;
} mp_obj_str_it_t;

709
// Return the next item of a str iterator as a 1-byte string, or
// mp_const_stop_iteration once the end has been reached.
STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
    mp_obj_str_it_t *it = self_in;
    GET_STR_DATA_LEN(it->str, str, len);
    if (it->cur >= len) {
        return mp_const_stop_iteration;
    }
    // create the item first, then advance
    mp_obj_t ch_obj = mp_obj_new_str(str + it->cur, 1, true);
    it->cur += 1;
    return ch_obj;
}

721
STATIC const mp_obj_type_t str_it_type = {
722
    { &mp_type_type },
723
    .name = MP_QSTR_iterator,
724
    .iternext = str_it_iternext,
xyb's avatar
xyb committed
725
726
};

727
// Return the next item of a bytes iterator as a small int holding the byte
// value, or mp_const_stop_iteration once the end has been reached.
STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
    mp_obj_str_it_t *it = self_in;
    GET_STR_DATA_LEN(it->str, str, len);
    if (it->cur >= len) {
        return mp_const_stop_iteration;
    }
    mp_obj_t value_obj = MP_OBJ_NEW_SMALL_INT((mp_small_int_t)str[it->cur]);
    it->cur += 1;
    return value_obj;
}

739
STATIC const mp_obj_type_t bytes_it_type = {
740
    { &mp_type_type },
741
    .name = MP_QSTR_iterator,
742
743
744
745
    .iternext = bytes_it_iternext,
};

// Create an iterator over str, positioned at its first byte; each step yields
// a 1-byte string (see str_it_iternext).
mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) {
    mp_obj_str_it_t *it = m_new_obj(mp_obj_str_it_t);
    it->cur = 0;
    it->str = str;
    it->base.type = &str_it_type;
    return it;
}

// Create an iterator over a bytes object, positioned at its first byte; each
// step yields the byte value as a small int (see bytes_it_iternext).
mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) {
    mp_obj_str_it_t *it = m_new_obj(mp_obj_str_it_t);
    it->cur = 0;
    it->str = str;
    it->base.type = &bytes_it_type;
    return it;
}