objstr.c 30.1 KB
Newer Older
xbe's avatar
xbe committed
1
#include <stdbool.h>
2
3
4
5
6
7
#include <string.h>
#include <assert.h>

#include "nlr.h"
#include "misc.h"
#include "mpconfig.h"
8
#include "qstr.h"
9
10
11
12
13
14
#include "obj.h"
#include "runtime0.h"
#include "runtime.h"

typedef struct _mp_obj_str_t {
    mp_obj_base_t base;
15
16
    machine_uint_t hash : 16; // XXX here we assume the hash size is 16 bits (it is at the moment; see qstr.c)
    machine_uint_t len : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
17
    const byte *data;
18
19
} mp_obj_str_t;

20
21
const mp_obj_t mp_const_empty_bytes;

22
23
24
25
26
27
28
29
30
// use this macro to extract the string hash
#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; }

// use this macro to extract the string length
#define GET_STR_LEN(str_obj_in, str_len) uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; }

// use this macro to extract the string data and length
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; }

31
32
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
33
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len);
xyb's avatar
xyb committed
34
35
36
37

/******************************************************************************/
/* str                                                                        */

38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
    // this escapes characters, but it will be very slow to print (calling print many times)
    bool has_single_quote = false;
    bool has_double_quote = false;
    for (const byte *s = str_data, *top = str_data + str_len; (!has_single_quote || !has_double_quote) && s < top; s++) {
        if (*s == '\'') {
            has_single_quote = true;
        } else if (*s == '"') {
            has_double_quote = true;
        }
    }
    int quote_char = '\'';
    if (has_single_quote && !has_double_quote) {
        quote_char = '"';
    }
    print(env, "%c", quote_char);
    for (const byte *s = str_data, *top = str_data + str_len; s < top; s++) {
        if (*s == quote_char) {
            print(env, "\\%c", quote_char);
        } else if (*s == '\\') {
            print(env, "\\\\");
        } else if (32 <= *s && *s <= 126) {
            print(env, "%c", *s);
        } else if (*s == '\n') {
            print(env, "\\n");
        // TODO add more escape codes here if we want to match CPython
        } else {
            print(env, "\\x%02x", *s);
        }
    }
    print(env, "%c", quote_char);
}

71
STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
72
    GET_STR_DATA_LEN(self_in, str_data, str_len);
73
74
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &bytes_type);
    if (kind == PRINT_STR && !is_bytes) {
75
        print(env, "%.*s", str_len, str_data);
76
    } else {
77
78
79
        if (is_bytes) {
            print(env, "b");
        }
80
        mp_str_print_quoted(print, env, str_data, str_len);
81
    }
82
83
}

84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
STATIC mp_obj_t str_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    switch (n_args) {
        case 0:
            return MP_OBJ_NEW_QSTR(MP_QSTR_);

        case 1:
        {
            vstr_t *vstr = vstr_new();
            mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[0], PRINT_STR);
            mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
            vstr_free(vstr);
            return s;
        }

        case 2:
        case 3:
        {
            // TODO: validate 2nd/3rd args
            if (!MP_OBJ_IS_TYPE(args[0], &bytes_type)) {
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "bytes expected"));
            }
            GET_STR_DATA_LEN(args[0], str_data, str_len);
            GET_STR_HASH(args[0], str_hash);
            mp_obj_str_t *o = str_new(&str_type, NULL, str_len);
            o->data = str_data;
            o->hash = str_hash;
            return o;
        }

        default:
            nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "str takes at most 3 arguments"));
    }
}

118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
STATIC mp_obj_t bytes_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    if (n_args == 0) {
        return mp_const_empty_bytes;
    }

    if (MP_OBJ_IS_STR(args[0])) {
        if (n_args < 2 || n_args > 3) {
            goto wrong_args;
        }
        GET_STR_DATA_LEN(args[0], str_data, str_len);
        GET_STR_HASH(args[0], str_hash);
        mp_obj_str_t *o = str_new(&bytes_type, NULL, str_len);
        o->data = str_data;
        o->hash = str_hash;
        return o;
    }

    if (n_args > 1) {
        goto wrong_args;
    }

    if (MP_OBJ_IS_SMALL_INT(args[0])) {
        uint len = MP_OBJ_SMALL_INT_VALUE(args[0]);
        byte *data;

        mp_obj_t o = mp_obj_str_builder_start(&bytes_type, len, &data);
        memset(data, 0, len);
        return mp_obj_str_builder_end(o);
    }

    int len;
    byte *data;
    vstr_t *vstr = NULL;
    mp_obj_t o = NULL;
    // Try to create array of exact len if initializer len is known
    mp_obj_t len_in = mp_obj_len_maybe(args[0]);
    if (len_in == MP_OBJ_NULL) {
        len = -1;
        vstr = vstr_new();
    } else {
        len = MP_OBJ_SMALL_INT_VALUE(len_in);
        o = mp_obj_str_builder_start(&bytes_type, len, &data);
    }

    mp_obj_t iterable = rt_getiter(args[0]);
    mp_obj_t item;
    while ((item = rt_iternext(iterable)) != mp_const_stop_iteration) {
        if (len == -1) {
            vstr_add_char(vstr, MP_OBJ_SMALL_INT_VALUE(item));
        } else {
            *data++ = MP_OBJ_SMALL_INT_VALUE(item);
        }
    }

    if (len == -1) {
        vstr_shrink(vstr);
        // TODO: Optimize, borrow buffer from vstr
        len = vstr_len(vstr);
        o = mp_obj_str_builder_start(&bytes_type, len, &data);
        memcpy(data, vstr_str(vstr), len);
        vstr_free(vstr);
    }

    return mp_obj_str_builder_end(o);

wrong_args:
        nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "wrong number of arguments"));
}

187
188
// like strstr but with specified length and allows \0 bytes
// TODO replace with something more efficient/standard
189
STATIC const byte *find_subbytes(const byte *haystack, uint hlen, const byte *needle, uint nlen) {
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
    if (hlen >= nlen) {
        for (uint i = 0; i <= hlen - nlen; i++) {
            bool found = true;
            for (uint j = 0; j < nlen; j++) {
                if (haystack[i + j] != needle[j]) {
                    found = false;
                    break;
                }
            }
            if (found) {
                return haystack + i;
            }
        }
    }
    return NULL;
}

207
STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
208
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
209
210
    switch (op) {
        case RT_BINARY_OP_SUBSCR:
211
212
213
            // TODO: need predicate to check for int-like type (bools are such for example)
            // ["no", "yes"][1 == 2] is common idiom
            if (MP_OBJ_IS_SMALL_INT(rhs_in)) {
214
                uint index = mp_get_index(mp_obj_get_type(lhs_in), lhs_len, rhs_in, false);
215
                if (MP_OBJ_IS_TYPE(lhs_in, &bytes_type)) {
216
                    return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)lhs_data[index]);
217
218
219
                } else {
                    return mp_obj_new_str(lhs_data + index, 1, true);
                }
220
#if MICROPY_ENABLE_SLICE
221
            } else if (MP_OBJ_IS_TYPE(rhs_in, &slice_type)) {
222
                machine_uint_t start, stop;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
223
224
225
                if (!m_seq_get_fast_slice_indexes(lhs_len, rhs_in, &start, &stop)) {
                    assert(0);
                }
226
                return mp_obj_new_str(lhs_data + start, stop - start, false);
227
#endif
228
            } else {
229
230
                // Message doesn't match CPython, but we don't have so much bytes as they
                // to spend them on verbose wording
231
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "index must be int"));
232
            }
233
234
235

        case RT_BINARY_OP_ADD:
        case RT_BINARY_OP_INPLACE_ADD:
236
            if (MP_OBJ_IS_STR(rhs_in)) {
237
                // add 2 strings
238
239

                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
240
                int alloc_len = lhs_len + rhs_len;
241
242

                /* code for making qstr
243
244
245
246
                byte *q_ptr;
                byte *val = qstr_build_start(alloc_len, &q_ptr);
                memcpy(val, lhs_data, lhs_len);
                memcpy(val + lhs_len, rhs_data, rhs_len);
247
248
249
250
251
                return MP_OBJ_NEW_QSTR(qstr_build_end(q_ptr));
                */

                // code for non-qstr
                byte *data;
252
                mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), alloc_len, &data);
253
254
255
                memcpy(data, lhs_data, lhs_len);
                memcpy(data + lhs_len, rhs_data, rhs_len);
                return mp_obj_str_builder_end(s);
256
257
            }
            break;
258

259
        case RT_BINARY_OP_IN:
260
            /* NOTE `a in b` is `b.__contains__(a)` */
261
262
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
263
                return MP_BOOL(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len) != NULL);
264
265
            }
            break;
266

267
268
269
270
271
272
        case RT_BINARY_OP_MULTIPLY:
        {
            if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
                return NULL;
            }
            int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
273
            byte *data;
274
            mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), lhs_len * n, &data);
275
276
            mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, data);
            return mp_obj_str_builder_end(s);
277
        }
278
279
280
281
282
283
284
285
286
287
288
289

        // These 2 are never passed here, dealt with as a special case in rt_binary_op().
        //case RT_BINARY_OP_EQUAL:
        //case RT_BINARY_OP_NOT_EQUAL:
        case RT_BINARY_OP_LESS:
        case RT_BINARY_OP_LESS_EQUAL:
        case RT_BINARY_OP_MORE:
        case RT_BINARY_OP_MORE_EQUAL:
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                return MP_BOOL(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));
            }
290
291
292
293
294
    }

    return MP_OBJ_NULL; // op not supported
}

295
STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
296
    assert(MP_OBJ_IS_STR(self_in));
297

298
    // get separation string
299
    GET_STR_DATA_LEN(self_in, sep_str, sep_len);
300
301

    // process args
302
303
304
305
306
307
308
309
310
    uint seq_len;
    mp_obj_t *seq_items;
    if (MP_OBJ_IS_TYPE(arg, &tuple_type)) {
        mp_obj_tuple_get(arg, &seq_len, &seq_items);
    } else if (MP_OBJ_IS_TYPE(arg, &list_type)) {
        mp_obj_list_get(arg, &seq_len, &seq_items);
    } else {
        goto bad_arg;
    }
311
312
313

    // count required length
    int required_len = 0;
314
    for (int i = 0; i < seq_len; i++) {
315
        if (!MP_OBJ_IS_STR(seq_items[i])) {
316
317
            goto bad_arg;
        }
318
319
320
        if (i > 0) {
            required_len += sep_len;
        }
321
322
        GET_STR_LEN(seq_items[i], l);
        required_len += l;
323
324
325
    }

    // make joined string
326
    byte *data;
327
    mp_obj_t joined_str = mp_obj_str_builder_start(mp_obj_get_type(self_in), required_len, &data);
328
329
    for (int i = 0; i < seq_len; i++) {
        if (i > 0) {
330
331
            memcpy(data, sep_str, sep_len);
            data += sep_len;
332
        }
333
334
335
        GET_STR_DATA_LEN(seq_items[i], s, l);
        memcpy(data, s, l);
        data += l;
336
    }
337
338

    // return joined string
339
    return mp_obj_str_builder_end(joined_str);
340
341

bad_arg:
342
    nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "?str.join expecting a list of str's"));
343
344
}

Paul Sokolovsky's avatar
Paul Sokolovsky committed
345
346
#define is_ws(c) ((c) == ' ' || (c) == '\t')

347
STATIC mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
348
349
350
351
352
353
354
355
356
    int splits = -1;
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
            splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
        }
    }
    assert(sep == mp_const_none);
357
    (void)sep; // unused; to hush compiler warning
Paul Sokolovsky's avatar
Paul Sokolovsky committed
358
    mp_obj_t res = mp_obj_new_list(0, NULL);
359
360
361
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;
    const byte *start;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
362
363

    // Initial whitespace is not counted as split, so we pre-do it
364
365
    while (s < top && is_ws(*s)) s++;
    while (s < top && splits != 0) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
366
        start = s;
367
368
369
        while (s < top && !is_ws(*s)) s++;
        rt_list_append(res, mp_obj_new_str(start, s - start, false));
        if (s >= top) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
370
371
            break;
        }
372
        while (s < top && is_ws(*s)) s++;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
373
374
375
376
377
        if (splits > 0) {
            splits--;
        }
    }

378
379
    if (s < top) {
        rt_list_append(res, mp_obj_new_str(s, top - s, false));
Paul Sokolovsky's avatar
Paul Sokolovsky committed
380
381
382
383
384
    }

    return res;
}

385
STATIC mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
386
    assert(2 <= n_args && n_args <= 4);
387
388
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
389

390
391
    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);
392

393
394
    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
395
    if (n_args >= 3 && args[2] != mp_const_none) {
396
        start = mp_get_index(&str_type, haystack_len, args[2], true);
397
398
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
399
        end = mp_get_index(&str_type, haystack_len, args[3], true);
400
401
    }

402
    const byte *p = find_subbytes(haystack + start, haystack_len - start, needle, needle_len);
403
404
405
406
407
408
    if (p == NULL) {
        // not found
        return MP_OBJ_NEW_SMALL_INT(-1);
    } else {
        // found
        machine_int_t pos = p - haystack;
409
410
411
        if (pos + needle_len > end) {
            pos = -1;
        }
412
        return MP_OBJ_NEW_SMALL_INT(pos);
413
414
415
    }
}

416
// TODO: (Much) more variety in args
417
STATIC mp_obj_t str_startswith(mp_obj_t self_in, mp_obj_t arg) {
418
419
420
421
422
423
424
425
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, prefix, prefix_len);
    if (prefix_len > str_len) {
        return mp_const_false;
    }
    return MP_BOOL(memcmp(str, prefix, prefix_len) == 0);
}

426
427
STATIC bool chr_in_str(const byte* const str, const machine_uint_t str_len, int c) {
    for (machine_uint_t i = 0; i < str_len; i++) {
428
429
430
431
432
433
434
        if (str[i] == c) {
            return true;
        }
    }
    return false;
}

435
STATIC mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
xbe's avatar
xbe committed
436
    assert(1 <= n_args && n_args <= 2);
437
438
439
440
441
    assert(MP_OBJ_IS_STR(args[0]));

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";
xbe's avatar
xbe committed
442
443
444

    if (n_args == 1) {
        chars_to_del = whitespace;
445
        chars_to_del_len = sizeof(whitespace);
xbe's avatar
xbe committed
446
    } else {
447
448
449
450
        assert(MP_OBJ_IS_STR(args[1]));
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
xbe's avatar
xbe committed
451
452
    }

453
    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);
xbe's avatar
xbe committed
454

455
    machine_uint_t first_good_char_pos = 0;
xbe's avatar
xbe committed
456
    bool first_good_char_pos_set = false;
457
458
    machine_uint_t last_good_char_pos = 0;
    for (machine_uint_t i = 0; i < orig_str_len; i++) {
xbe's avatar
xbe committed
459
460
461
462
463
464
465
466
467
468
        if (!chr_in_str(chars_to_del, chars_to_del_len, orig_str[i])) {
            last_good_char_pos = i;
            if (!first_good_char_pos_set) {
                first_good_char_pos = i;
                first_good_char_pos_set = true;
            }
        }
    }

    if (first_good_char_pos == 0 && last_good_char_pos == 0) {
469
470
        // string is all whitespace, return ''
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
xbe's avatar
xbe committed
471
472
473
474
    }

    assert(last_good_char_pos >= first_good_char_pos);
    //+1 to accomodate the last character
475
    machine_uint_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
476
    return mp_obj_new_str(orig_str + first_good_char_pos, stripped_len, false);
xbe's avatar
xbe committed
477
478
}

479
mp_obj_t str_format(uint n_args, const mp_obj_t *args) {
480
    assert(MP_OBJ_IS_STR(args[0]));
481

482
    GET_STR_DATA_LEN(args[0], str, len);
483
484
    int arg_i = 1;
    vstr_t *vstr = vstr_new();
485
    for (const byte *top = str + len; str < top; str++) {
486
487
        if (*str == '{') {
            str++;
488
            if (str < top && *str == '{') {
489
                vstr_add_char(vstr, '{');
490
            } else {
491
                while (str < top && *str != '}') str++;
492
                if (arg_i >= n_args) {
493
                    nlr_jump(mp_obj_new_exception_msg(&mp_type_IndexError, "tuple index out of range"));
494
                }
495
                // TODO: may be PRINT_REPR depending on formatting code
496
                mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[arg_i], PRINT_STR);
497
498
499
500
501
502
503
                arg_i++;
            }
        } else {
            vstr_add_char(vstr, *str);
        }
    }

504
505
506
    mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
    vstr_free(vstr);
    return s;
507
508
}

509
STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
510
511
512
513
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
    assert(MP_OBJ_IS_STR(args[2]));

514
    machine_int_t max_rep = 0;
515
    if (n_args == 4) {
516
517
518
519
520
521
522
        assert(MP_OBJ_IS_SMALL_INT(args[3]));
        max_rep = MP_OBJ_SMALL_INT_VALUE(args[3]);
        if (max_rep == 0) {
            return args[0];
        } else if (max_rep < 0) {
            max_rep = 0;
        }
523
    }
524
525

    // if max_rep is still 0 by this point we will need to do all possible replacements
526
527
528
529

    GET_STR_DATA_LEN(args[0], str, str_len);
    GET_STR_DATA_LEN(args[1], old, old_len);
    GET_STR_DATA_LEN(args[2], new, new_len);
530
531

    // old won't exist in str if it's longer, so nothing to replace
532
    if (old_len > str_len) {
533
        return args[0];
534
535
    }

536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
    // data for the replaced string
    byte *data = NULL;
    mp_obj_t replaced_str = MP_OBJ_NULL;

    // do 2 passes over the string:
    //   first pass computes the required length of the replaced string
    //   second pass does the replacements
    for (;;) {
        machine_uint_t replaced_str_index = 0;
        machine_uint_t num_replacements_done = 0;
        const byte *old_occurrence;
        const byte *offset_ptr = str;
        machine_uint_t offset_num = 0;
        while ((old_occurrence = find_subbytes(offset_ptr, str_len - offset_num, old, old_len)) != NULL) {
            // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
            if (data != NULL) {
                memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
            }
            replaced_str_index += old_occurrence - offset_ptr;
            // copy the replacement string
            if (data != NULL) {
                memcpy(data + replaced_str_index, new, new_len);
            }
            replaced_str_index += new_len;
            offset_ptr = old_occurrence + old_len;
            offset_num = offset_ptr - str;

            num_replacements_done++;
            if (max_rep != 0 && num_replacements_done == max_rep){
                break;
            }
        }

        // copy from just after end of last occurrence of to-be-replaced string to end of old string
        if (data != NULL) {
            memcpy(data + replaced_str_index, offset_ptr, str_len - offset_num);
        }
        replaced_str_index += str_len - offset_num;

        if (data == NULL) {
            // first pass
            if (num_replacements_done == 0) {
                // no substr found, return original string
                return args[0];
            } else {
                // substr found, allocate new string
                replaced_str = mp_obj_str_builder_start(mp_obj_get_type(args[0]), replaced_str_index, &data);
            }
        } else {
            // second pass, we are done
            break;
        }
588
    }
589

590
591
592
    return mp_obj_str_builder_end(replaced_str);
}

593
594
595
596
597
598
599
600
STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

601
602
    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
603
604
605
606
607
608
609
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3], true);
    }

610
611
612
    // if needle_len is zero then we count each gap between characters as an occurrence
    if (needle_len == 0) {
        return MP_OBJ_NEW_SMALL_INT(end - start + 1);
613
614
    }

615
616
    // count the occurrences
    machine_int_t num_occurrences = 0;
xbe's avatar
xbe committed
617
618
619
620
621
    for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) {
        if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) {
            num_occurrences++;
            haystack_index += needle_len - 1;
        }
622
623
624
625
626
    }

    return MP_OBJ_NEW_SMALL_INT(num_occurrences);
}

627
STATIC mp_obj_t str_partitioner(mp_obj_t self_in, mp_obj_t arg, machine_int_t direction) {
628
629
630
631
632
    assert(MP_OBJ_IS_STR(self_in));
    if (!MP_OBJ_IS_STR(arg)) {
        nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError,
                                               "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(arg)));
    }
633

634
635
636
637
638
639
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, sep, sep_len);

    if (sep_len == 0) {
        nlr_jump(mp_obj_new_exception_msg(&mp_type_ValueError, "empty separator"));
    }
640
641
642
643
644

    mp_obj_t result[] = {MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_)};

    if (direction > 0) {
        result[0] = self_in;
645
    } else {
646
        result[2] = self_in;
647
    }
648

649
650
651
652
653
654
655
656
657
658
659
660
661
662
    if (str_len >= sep_len) {
        machine_uint_t str_index, str_index_end;
        if (direction > 0) {
            str_index = 0;
            str_index_end = str_len - sep_len;
        } else {
            str_index = str_len - sep_len;
            str_index_end = 0;
        }
        for (;;) {
            if (memcmp(&str[str_index], sep, sep_len) == 0) {
                result[0] = mp_obj_new_str(str, str_index, false);
                result[1] = arg;
                result[2] = mp_obj_new_str(str + str_index + sep_len, str_len - str_index - sep_len, false);
663
664
                break;
            }
665
666
667
668
            if (str_index == str_index_end) {
                break;
            }
            str_index += direction;
669
670
        }
    }
671

672
    return mp_obj_new_tuple(3, result);
673
674
}

675
676
STATIC mp_obj_t str_partition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, 1);
677
}
678

679
680
STATIC mp_obj_t str_rpartition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, -1);
681
682
}

683
684
685
686
687
688
689
690
691
692
693
694
695
696
STATIC machine_int_t str_get_buffer(mp_obj_t self_in, buffer_info_t *bufinfo, int flags) {
    if (flags == BUFFER_READ) {
        GET_STR_DATA_LEN(self_in, str_data, str_len);
        bufinfo->buf = (void*)str_data;
        bufinfo->len = str_len;
        return 0;
    } else {
        // can't write to a string
        bufinfo->buf = NULL;
        bufinfo->len = 0;
        return 1;
    }
}

697
698
699
700
701
702
703
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_startswith_obj, str_startswith);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, str_format);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace);
704
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count);
705
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition);
706
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition);
707

708
STATIC const mp_method_t str_type_methods[] = {
709
    { "find", &str_find_obj },
ian-v's avatar
ian-v committed
710
    { "join", &str_join_obj },
Paul Sokolovsky's avatar
Paul Sokolovsky committed
711
    { "split", &str_split_obj },
712
    { "startswith", &str_startswith_obj },
xbe's avatar
xbe committed
713
    { "strip", &str_strip_obj },
ian-v's avatar
ian-v committed
714
    { "format", &str_format_obj },
715
    { "replace", &str_replace_obj },
716
    { "count", &str_count_obj },
717
    { "partition", &str_partition_obj },
718
    { "rpartition", &str_rpartition_obj },
ian-v's avatar
ian-v committed
719
720
    { NULL, NULL }, // end-of-list sentinel
};
721

722
const mp_obj_type_t str_type = {
723
    { &mp_type_type },
724
    .name = MP_QSTR_str,
725
    .print = str_print,
726
    .make_new = str_make_new,
727
    .binary_op = str_binary_op,
728
729
    .getiter = mp_obj_new_str_iterator,
    .methods = str_type_methods,
730
    .buffer_p = { .get_buffer = str_get_buffer },
731
732
733
734
};

// Reuses most of methods from str
const mp_obj_type_t bytes_type = {
735
    { &mp_type_type },
736
    .name = MP_QSTR_bytes,
737
    .print = str_print,
738
    .make_new = bytes_make_new,
739
740
    .binary_op = str_binary_op,
    .getiter = mp_obj_new_bytes_iterator,
ian-v's avatar
ian-v committed
741
    .methods = str_type_methods,
742
743
};

744
745
746
747
// the zero-length bytes
STATIC const mp_obj_str_t empty_bytes_obj = {{&bytes_type}, 0, 0, NULL};
const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj;

748
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
749
    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
750
    o->base.type = type;
751
    o->len = len;
752
753
754
    byte *p = m_new(byte, len + 1);
    o->data = p;
    *data = p;
755
756
757
758
759
760
    return o;
}

mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
    mp_obj_str_t *o = o_in;
    o->hash = qstr_compute_hash(o->data, o->len);
761
762
    byte *p = (byte*)o->data;
    p[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
763
764
765
    return o;
}

766
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len) {
767
    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
768
769
    o->base.type = type;
    o->len = len;
770
771
772
773
774
775
776
    if (data) {
        o->hash = qstr_compute_hash(data, len);
        byte *p = m_new(byte, len + 1);
        o->data = p;
        memcpy(p, data, len * sizeof(byte));
        p[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    }
777
778
779
    return o;
}

780
781
782
783
784
785
786
787
788
789
mp_obj_t mp_obj_new_str(const byte* data, uint len, bool make_qstr_if_not_already) {
    qstr q = qstr_find_strn(data, len);
    if (q != MP_QSTR_NULL) {
        // qstr with this data already exists
        return MP_OBJ_NEW_QSTR(q);
    } else if (make_qstr_if_not_already) {
        // no existing qstr, make a new one
        return MP_OBJ_NEW_QSTR(qstr_from_strn((const char*)data, len));
    } else {
        // no existing qstr, don't make one
790
        return str_new(&str_type, data, len);
791
792
793
    }
}

794
795
796
797
mp_obj_t mp_obj_new_bytes(const byte* data, uint len) {
    return str_new(&bytes_type, data, len);
}

798
799
800
801
802
803
804
805
806
807
808
809
810
811
bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
    if (MP_OBJ_IS_QSTR(s1) && MP_OBJ_IS_QSTR(s2)) {
        return s1 == s2;
    } else {
        GET_STR_HASH(s1, h1);
        GET_STR_HASH(s2, h2);
        if (h1 != h2) {
            return false;
        }
        GET_STR_DATA_LEN(s1, d1, l1);
        GET_STR_DATA_LEN(s2, d2, l2);
        if (l1 != l2) {
            return false;
        }
812
        return memcmp(d1, d2, l1) == 0;
813
814
815
    }
}

816
817
void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn));
void bad_implicit_conversion(mp_obj_t self_in) {
818
    nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(self_in)));
819
820
}

821
822
823
824
825
uint mp_obj_str_get_hash(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_HASH(self_in, h);
        return h;
    } else {
826
        bad_implicit_conversion(self_in);
827
    }
828
829
830
831
832
833
834
}

uint mp_obj_str_get_len(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_LEN(self_in, l);
        return l;
    } else {
835
836
837
838
839
840
841
842
843
844
845
846
847
848
        bad_implicit_conversion(self_in);
    }
}

// use this if you will anyway convert the string to a qstr
// will be more efficient for the case where it's already a qstr
qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
    if (MP_OBJ_IS_QSTR(self_in)) {
        return MP_OBJ_QSTR_VALUE(self_in);
    } else if (MP_OBJ_IS_TYPE(self_in, &str_type)) {
        mp_obj_str_t *self = self_in;
        return qstr_from_strn((char*)self->data, self->len);
    } else {
        bad_implicit_conversion(self_in);
849
850
851
852
853
854
855
856
857
858
859
    }
}

// only use this function if you need the str data to be zero terminated
// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
const char *mp_obj_str_get_str(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        (void)l; // len unused
        return (const char*)s;
    } else {
860
        bad_implicit_conversion(self_in);
861
862
863
    }
}

864
const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
865
866
867
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        *len = l;
868
        return (const char*)s;
869
    } else {
870
        bad_implicit_conversion(self_in);
871
    }
872
}
xyb's avatar
xyb committed
873
874
875
876
877
878

/******************************************************************************/
/* str iterator                                                               */

typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;
879
    mp_obj_t str;
xyb's avatar
xyb committed
880
881
882
    machine_uint_t cur;
} mp_obj_str_it_t;

883
STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
xyb's avatar
xyb committed
884
    mp_obj_str_it_t *self = self_in;
885
886
887
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
        mp_obj_t o_out = mp_obj_new_str(str + self->cur, 1, true);
xyb's avatar
xyb committed
888
889
890
891
892
893
894
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

895
STATIC const mp_obj_type_t str_it_type = {
896
    { &mp_type_type },
897
    .name = MP_QSTR_iterator,
898
    .iternext = str_it_iternext,
xyb's avatar
xyb committed
899
900
};

901
STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
902
903
904
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
905
        mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT((mp_small_int_t)str[self->cur]);
906
907
908
909
910
911
912
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

913
STATIC const mp_obj_type_t bytes_it_type = {
914
    { &mp_type_type },
915
    .name = MP_QSTR_iterator,
916
917
918
919
    .iternext = bytes_it_iternext,
};

mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) {
xyb's avatar
xyb committed
920
921
922
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &str_it_type;
    o->str = str;
923
924
925
926
927
928
929
930
931
    o->cur = 0;
    return o;
}

mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) {
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &bytes_it_type;
    o->str = str;
    o->cur = 0;
xyb's avatar
xyb committed
932
933
    return o;
}