objstr.c 30.2 KB
Newer Older
xbe's avatar
xbe committed
1
#include <stdbool.h>
2
3
4
5
6
7
#include <string.h>
#include <assert.h>

#include "nlr.h"
#include "misc.h"
#include "mpconfig.h"
8
#include "qstr.h"
9
10
11
12
13
14
#include "obj.h"
#include "runtime0.h"
#include "runtime.h"

typedef struct _mp_obj_str_t {
    mp_obj_base_t base;
15
16
    machine_uint_t hash : 16; // XXX here we assume the hash size is 16 bits (it is at the moment; see qstr.c)
    machine_uint_t len : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
17
    const byte *data;
18
19
} mp_obj_str_t;

20
21
const mp_obj_t mp_const_empty_bytes;

22
23
24
25
26
27
28
29
30
// use this macro to extract the string hash
#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; }

// use this macro to extract the string length
#define GET_STR_LEN(str_obj_in, str_len) uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; }

// use this macro to extract the string data and length
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; }

31
32
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
33
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len);
xyb's avatar
xyb committed
34
35
36
37

/******************************************************************************/
/* str                                                                        */

38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
    // this escapes characters, but it will be very slow to print (calling print many times)
    bool has_single_quote = false;
    bool has_double_quote = false;
    for (const byte *s = str_data, *top = str_data + str_len; (!has_single_quote || !has_double_quote) && s < top; s++) {
        if (*s == '\'') {
            has_single_quote = true;
        } else if (*s == '"') {
            has_double_quote = true;
        }
    }
    int quote_char = '\'';
    if (has_single_quote && !has_double_quote) {
        quote_char = '"';
    }
    print(env, "%c", quote_char);
    for (const byte *s = str_data, *top = str_data + str_len; s < top; s++) {
        if (*s == quote_char) {
            print(env, "\\%c", quote_char);
        } else if (*s == '\\') {
            print(env, "\\\\");
        } else if (32 <= *s && *s <= 126) {
            print(env, "%c", *s);
        } else if (*s == '\n') {
            print(env, "\\n");
        // TODO add more escape codes here if we want to match CPython
        } else {
            print(env, "\\x%02x", *s);
        }
    }
    print(env, "%c", quote_char);
}

71
STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
72
    GET_STR_DATA_LEN(self_in, str_data, str_len);
73
74
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &bytes_type);
    if (kind == PRINT_STR && !is_bytes) {
75
        print(env, "%.*s", str_len, str_data);
76
    } else {
77
78
79
        if (is_bytes) {
            print(env, "b");
        }
80
        mp_str_print_quoted(print, env, str_data, str_len);
81
    }
82
83
}

84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
STATIC mp_obj_t str_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    switch (n_args) {
        case 0:
            return MP_OBJ_NEW_QSTR(MP_QSTR_);

        case 1:
        {
            vstr_t *vstr = vstr_new();
            mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[0], PRINT_STR);
            mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
            vstr_free(vstr);
            return s;
        }

        case 2:
        case 3:
        {
            // TODO: validate 2nd/3rd args
            if (!MP_OBJ_IS_TYPE(args[0], &bytes_type)) {
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "bytes expected"));
            }
            GET_STR_DATA_LEN(args[0], str_data, str_len);
            GET_STR_HASH(args[0], str_hash);
            mp_obj_str_t *o = str_new(&str_type, NULL, str_len);
            o->data = str_data;
            o->hash = str_hash;
            return o;
        }

        default:
            nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "str takes at most 3 arguments"));
    }
}

118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
STATIC mp_obj_t bytes_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    if (n_args == 0) {
        return mp_const_empty_bytes;
    }

    if (MP_OBJ_IS_STR(args[0])) {
        if (n_args < 2 || n_args > 3) {
            goto wrong_args;
        }
        GET_STR_DATA_LEN(args[0], str_data, str_len);
        GET_STR_HASH(args[0], str_hash);
        mp_obj_str_t *o = str_new(&bytes_type, NULL, str_len);
        o->data = str_data;
        o->hash = str_hash;
        return o;
    }

    if (n_args > 1) {
        goto wrong_args;
    }

    if (MP_OBJ_IS_SMALL_INT(args[0])) {
        uint len = MP_OBJ_SMALL_INT_VALUE(args[0]);
        byte *data;

        mp_obj_t o = mp_obj_str_builder_start(&bytes_type, len, &data);
        memset(data, 0, len);
        return mp_obj_str_builder_end(o);
    }

    int len;
    byte *data;
    vstr_t *vstr = NULL;
    mp_obj_t o = NULL;
    // Try to create array of exact len if initializer len is known
    mp_obj_t len_in = mp_obj_len_maybe(args[0]);
    if (len_in == MP_OBJ_NULL) {
        len = -1;
        vstr = vstr_new();
    } else {
        len = MP_OBJ_SMALL_INT_VALUE(len_in);
        o = mp_obj_str_builder_start(&bytes_type, len, &data);
    }

    mp_obj_t iterable = rt_getiter(args[0]);
    mp_obj_t item;
    while ((item = rt_iternext(iterable)) != mp_const_stop_iteration) {
        if (len == -1) {
            vstr_add_char(vstr, MP_OBJ_SMALL_INT_VALUE(item));
        } else {
            *data++ = MP_OBJ_SMALL_INT_VALUE(item);
        }
    }

    if (len == -1) {
        vstr_shrink(vstr);
        // TODO: Optimize, borrow buffer from vstr
        len = vstr_len(vstr);
        o = mp_obj_str_builder_start(&bytes_type, len, &data);
        memcpy(data, vstr_str(vstr), len);
        vstr_free(vstr);
    }

    return mp_obj_str_builder_end(o);

wrong_args:
        nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "wrong number of arguments"));
}

187
188
// like strstr but with specified length and allows \0 bytes
// TODO replace with something more efficient/standard
189
STATIC const byte *find_subbytes(const byte *haystack, uint hlen, const byte *needle, uint nlen) {
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
    if (hlen >= nlen) {
        for (uint i = 0; i <= hlen - nlen; i++) {
            bool found = true;
            for (uint j = 0; j < nlen; j++) {
                if (haystack[i + j] != needle[j]) {
                    found = false;
                    break;
                }
            }
            if (found) {
                return haystack + i;
            }
        }
    }
    return NULL;
}

207
STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
208
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
209
210
    switch (op) {
        case RT_BINARY_OP_SUBSCR:
211
212
213
            // TODO: need predicate to check for int-like type (bools are such for example)
            // ["no", "yes"][1 == 2] is common idiom
            if (MP_OBJ_IS_SMALL_INT(rhs_in)) {
214
                uint index = mp_get_index(mp_obj_get_type(lhs_in), lhs_len, rhs_in, false);
215
                if (MP_OBJ_IS_TYPE(lhs_in, &bytes_type)) {
216
                    return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)lhs_data[index]);
217
218
219
                } else {
                    return mp_obj_new_str(lhs_data + index, 1, true);
                }
220
#if MICROPY_ENABLE_SLICE
221
            } else if (MP_OBJ_IS_TYPE(rhs_in, &slice_type)) {
222
                machine_uint_t start, stop;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
223
224
225
                if (!m_seq_get_fast_slice_indexes(lhs_len, rhs_in, &start, &stop)) {
                    assert(0);
                }
226
                return mp_obj_new_str(lhs_data + start, stop - start, false);
227
#endif
228
            } else {
229
230
                // Message doesn't match CPython, but we don't have so much bytes as they
                // to spend them on verbose wording
231
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "index must be int"));
232
            }
233
234
235

        case RT_BINARY_OP_ADD:
        case RT_BINARY_OP_INPLACE_ADD:
236
            if (MP_OBJ_IS_STR(rhs_in)) {
237
                // add 2 strings
238
239

                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
240
                int alloc_len = lhs_len + rhs_len;
241
242

                /* code for making qstr
243
244
245
246
                byte *q_ptr;
                byte *val = qstr_build_start(alloc_len, &q_ptr);
                memcpy(val, lhs_data, lhs_len);
                memcpy(val + lhs_len, rhs_data, rhs_len);
247
248
249
250
251
                return MP_OBJ_NEW_QSTR(qstr_build_end(q_ptr));
                */

                // code for non-qstr
                byte *data;
252
                mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), alloc_len, &data);
253
254
255
                memcpy(data, lhs_data, lhs_len);
                memcpy(data + lhs_len, rhs_data, rhs_len);
                return mp_obj_str_builder_end(s);
256
257
            }
            break;
258

259
        case RT_BINARY_OP_IN:
260
            /* NOTE `a in b` is `b.__contains__(a)` */
261
262
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
263
                return MP_BOOL(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len) != NULL);
264
265
            }
            break;
266

267
268
269
270
271
272
        case RT_BINARY_OP_MULTIPLY:
        {
            if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
                return NULL;
            }
            int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
273
            byte *data;
274
            mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), lhs_len * n, &data);
275
276
            mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, data);
            return mp_obj_str_builder_end(s);
277
        }
278
279
280
281
282
283
284
285
286
287
288
289

        // These 2 are never passed here, dealt with as a special case in rt_binary_op().
        //case RT_BINARY_OP_EQUAL:
        //case RT_BINARY_OP_NOT_EQUAL:
        case RT_BINARY_OP_LESS:
        case RT_BINARY_OP_LESS_EQUAL:
        case RT_BINARY_OP_MORE:
        case RT_BINARY_OP_MORE_EQUAL:
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                return MP_BOOL(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));
            }
290
291
292
293
294
    }

    return MP_OBJ_NULL; // op not supported
}

295
STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
296
    assert(MP_OBJ_IS_STR(self_in));
297

298
    // get separation string
299
    GET_STR_DATA_LEN(self_in, sep_str, sep_len);
300
301

    // process args
302
303
304
305
306
307
308
309
310
    uint seq_len;
    mp_obj_t *seq_items;
    if (MP_OBJ_IS_TYPE(arg, &tuple_type)) {
        mp_obj_tuple_get(arg, &seq_len, &seq_items);
    } else if (MP_OBJ_IS_TYPE(arg, &list_type)) {
        mp_obj_list_get(arg, &seq_len, &seq_items);
    } else {
        goto bad_arg;
    }
311
312
313

    // count required length
    int required_len = 0;
314
    for (int i = 0; i < seq_len; i++) {
315
        if (!MP_OBJ_IS_STR(seq_items[i])) {
316
317
            goto bad_arg;
        }
318
319
320
        if (i > 0) {
            required_len += sep_len;
        }
321
322
        GET_STR_LEN(seq_items[i], l);
        required_len += l;
323
324
325
    }

    // make joined string
326
    byte *data;
327
    mp_obj_t joined_str = mp_obj_str_builder_start(mp_obj_get_type(self_in), required_len, &data);
328
329
    for (int i = 0; i < seq_len; i++) {
        if (i > 0) {
330
331
            memcpy(data, sep_str, sep_len);
            data += sep_len;
332
        }
333
334
335
        GET_STR_DATA_LEN(seq_items[i], s, l);
        memcpy(data, s, l);
        data += l;
336
    }
337
338

    // return joined string
339
    return mp_obj_str_builder_end(joined_str);
340
341

bad_arg:
342
    nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "?str.join expecting a list of str's"));
343
344
}

Paul Sokolovsky's avatar
Paul Sokolovsky committed
345
346
#define is_ws(c) ((c) == ' ' || (c) == '\t')

347
STATIC mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
348
349
350
351
352
353
354
355
356
    int splits = -1;
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
            splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
        }
    }
    assert(sep == mp_const_none);
357
    (void)sep; // unused; to hush compiler warning
Paul Sokolovsky's avatar
Paul Sokolovsky committed
358
    mp_obj_t res = mp_obj_new_list(0, NULL);
359
360
361
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;
    const byte *start;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
362
363

    // Initial whitespace is not counted as split, so we pre-do it
364
365
    while (s < top && is_ws(*s)) s++;
    while (s < top && splits != 0) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
366
        start = s;
367
368
369
        while (s < top && !is_ws(*s)) s++;
        rt_list_append(res, mp_obj_new_str(start, s - start, false));
        if (s >= top) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
370
371
            break;
        }
372
        while (s < top && is_ws(*s)) s++;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
373
374
375
376
377
        if (splits > 0) {
            splits--;
        }
    }

378
379
    if (s < top) {
        rt_list_append(res, mp_obj_new_str(s, top - s, false));
Paul Sokolovsky's avatar
Paul Sokolovsky committed
380
381
382
383
384
    }

    return res;
}

385
STATIC mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
386
    assert(2 <= n_args && n_args <= 4);
387
388
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
389

390
391
    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);
392

393
394
    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
395
396
    /* TODO use a non-exception-throwing mp_get_index */
    if (n_args >= 3 && args[2] != mp_const_none) {
397
        start = mp_get_index(&str_type, haystack_len, args[2], true);
398
399
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
400
        end = mp_get_index(&str_type, haystack_len, args[3], true);
401
402
    }

403
    const byte *p = find_subbytes(haystack + start, haystack_len - start, needle, needle_len);
404
405
406
407
408
409
    if (p == NULL) {
        // not found
        return MP_OBJ_NEW_SMALL_INT(-1);
    } else {
        // found
        machine_int_t pos = p - haystack;
410
411
412
        if (pos + needle_len > end) {
            pos = -1;
        }
413
        return MP_OBJ_NEW_SMALL_INT(pos);
414
415
416
    }
}

417
// TODO: (Much) more variety in args
418
STATIC mp_obj_t str_startswith(mp_obj_t self_in, mp_obj_t arg) {
419
420
421
422
423
424
425
426
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, prefix, prefix_len);
    if (prefix_len > str_len) {
        return mp_const_false;
    }
    return MP_BOOL(memcmp(str, prefix, prefix_len) == 0);
}

427
428
STATIC bool chr_in_str(const byte* const str, const machine_uint_t str_len, int c) {
    for (machine_uint_t i = 0; i < str_len; i++) {
429
430
431
432
433
434
435
        if (str[i] == c) {
            return true;
        }
    }
    return false;
}

436
STATIC mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
xbe's avatar
xbe committed
437
    assert(1 <= n_args && n_args <= 2);
438
439
440
441
442
    assert(MP_OBJ_IS_STR(args[0]));

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";
xbe's avatar
xbe committed
443
444
445

    if (n_args == 1) {
        chars_to_del = whitespace;
446
        chars_to_del_len = sizeof(whitespace);
xbe's avatar
xbe committed
447
    } else {
448
449
450
451
        assert(MP_OBJ_IS_STR(args[1]));
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
xbe's avatar
xbe committed
452
453
    }

454
    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);
xbe's avatar
xbe committed
455

456
    machine_uint_t first_good_char_pos = 0;
xbe's avatar
xbe committed
457
    bool first_good_char_pos_set = false;
458
459
    machine_uint_t last_good_char_pos = 0;
    for (machine_uint_t i = 0; i < orig_str_len; i++) {
xbe's avatar
xbe committed
460
461
462
463
464
465
466
467
468
469
        if (!chr_in_str(chars_to_del, chars_to_del_len, orig_str[i])) {
            last_good_char_pos = i;
            if (!first_good_char_pos_set) {
                first_good_char_pos = i;
                first_good_char_pos_set = true;
            }
        }
    }

    if (first_good_char_pos == 0 && last_good_char_pos == 0) {
470
471
        // string is all whitespace, return ''
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
xbe's avatar
xbe committed
472
473
474
475
    }

    assert(last_good_char_pos >= first_good_char_pos);
    //+1 to accomodate the last character
476
    machine_uint_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
477
    return mp_obj_new_str(orig_str + first_good_char_pos, stripped_len, false);
xbe's avatar
xbe committed
478
479
}

480
mp_obj_t str_format(uint n_args, const mp_obj_t *args) {
481
    assert(MP_OBJ_IS_STR(args[0]));
482

483
    GET_STR_DATA_LEN(args[0], str, len);
484
485
    int arg_i = 1;
    vstr_t *vstr = vstr_new();
486
    for (const byte *top = str + len; str < top; str++) {
487
488
        if (*str == '{') {
            str++;
489
            if (str < top && *str == '{') {
490
                vstr_add_char(vstr, '{');
491
            } else {
492
                while (str < top && *str != '}') str++;
493
                if (arg_i >= n_args) {
494
                    nlr_jump(mp_obj_new_exception_msg(&mp_type_IndexError, "tuple index out of range"));
495
                }
496
                // TODO: may be PRINT_REPR depending on formatting code
497
                mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[arg_i], PRINT_STR);
498
499
500
501
502
503
504
                arg_i++;
            }
        } else {
            vstr_add_char(vstr, *str);
        }
    }

505
506
507
    mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
    vstr_free(vstr);
    return s;
508
509
}

510
STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
511
512
513
514
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
    assert(MP_OBJ_IS_STR(args[2]));

515
    machine_int_t max_rep = 0;
516
    if (n_args == 4) {
517
518
519
520
521
522
523
        assert(MP_OBJ_IS_SMALL_INT(args[3]));
        max_rep = MP_OBJ_SMALL_INT_VALUE(args[3]);
        if (max_rep == 0) {
            return args[0];
        } else if (max_rep < 0) {
            max_rep = 0;
        }
524
    }
525
526

    // if max_rep is still 0 by this point we will need to do all possible replacements
527
528
529
530

    GET_STR_DATA_LEN(args[0], str, str_len);
    GET_STR_DATA_LEN(args[1], old, old_len);
    GET_STR_DATA_LEN(args[2], new, new_len);
531
532

    // old won't exist in str if it's longer, so nothing to replace
533
    if (old_len > str_len) {
534
        return args[0];
535
536
    }

537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
    // data for the replaced string
    byte *data = NULL;
    mp_obj_t replaced_str = MP_OBJ_NULL;

    // do 2 passes over the string:
    //   first pass computes the required length of the replaced string
    //   second pass does the replacements
    for (;;) {
        machine_uint_t replaced_str_index = 0;
        machine_uint_t num_replacements_done = 0;
        const byte *old_occurrence;
        const byte *offset_ptr = str;
        machine_uint_t offset_num = 0;
        while ((old_occurrence = find_subbytes(offset_ptr, str_len - offset_num, old, old_len)) != NULL) {
            // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
            if (data != NULL) {
                memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
            }
            replaced_str_index += old_occurrence - offset_ptr;
            // copy the replacement string
            if (data != NULL) {
                memcpy(data + replaced_str_index, new, new_len);
            }
            replaced_str_index += new_len;
            offset_ptr = old_occurrence + old_len;
            offset_num = offset_ptr - str;

            num_replacements_done++;
            if (max_rep != 0 && num_replacements_done == max_rep){
                break;
            }
        }

        // copy from just after end of last occurrence of to-be-replaced string to end of old string
        if (data != NULL) {
            memcpy(data + replaced_str_index, offset_ptr, str_len - offset_num);
        }
        replaced_str_index += str_len - offset_num;

        if (data == NULL) {
            // first pass
            if (num_replacements_done == 0) {
                // no substr found, return original string
                return args[0];
            } else {
                // substr found, allocate new string
                replaced_str = mp_obj_str_builder_start(mp_obj_get_type(args[0]), replaced_str_index, &data);
            }
        } else {
            // second pass, we are done
            break;
        }
589
    }
590

591
592
593
    return mp_obj_str_builder_end(replaced_str);
}

594
595
596
597
598
599
600
601
STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

602
603
    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
604
605
606
607
608
609
610
611
    /* TODO use a non-exception-throwing mp_get_index */
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3], true);
    }

612
613
614
    // if needle_len is zero then we count each gap between characters as an occurrence
    if (needle_len == 0) {
        return MP_OBJ_NEW_SMALL_INT(end - start + 1);
615
616
    }

617
618
    // count the occurrences
    machine_int_t num_occurrences = 0;
xbe's avatar
xbe committed
619
620
621
622
623
    for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) {
        if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) {
            num_occurrences++;
            haystack_index += needle_len - 1;
        }
624
625
626
627
628
    }

    return MP_OBJ_NEW_SMALL_INT(num_occurrences);
}

629
STATIC mp_obj_t str_partitioner(mp_obj_t self_in, mp_obj_t arg, machine_int_t direction) {
630
631
632
633
634
    assert(MP_OBJ_IS_STR(self_in));
    if (!MP_OBJ_IS_STR(arg)) {
        nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError,
                                               "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(arg)));
    }
635

636
637
638
639
640
641
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, sep, sep_len);

    if (sep_len == 0) {
        nlr_jump(mp_obj_new_exception_msg(&mp_type_ValueError, "empty separator"));
    }
642
643
644
645
646

    mp_obj_t result[] = {MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_)};

    if (direction > 0) {
        result[0] = self_in;
647
    } else {
648
        result[2] = self_in;
649
    }
650

651
652
653
654
655
656
657
658
659
660
661
662
663
664
    if (str_len >= sep_len) {
        machine_uint_t str_index, str_index_end;
        if (direction > 0) {
            str_index = 0;
            str_index_end = str_len - sep_len;
        } else {
            str_index = str_len - sep_len;
            str_index_end = 0;
        }
        for (;;) {
            if (memcmp(&str[str_index], sep, sep_len) == 0) {
                result[0] = mp_obj_new_str(str, str_index, false);
                result[1] = arg;
                result[2] = mp_obj_new_str(str + str_index + sep_len, str_len - str_index - sep_len, false);
665
666
                break;
            }
667
668
669
670
            if (str_index == str_index_end) {
                break;
            }
            str_index += direction;
671
672
        }
    }
673

674
    return mp_obj_new_tuple(3, result);
675
676
}

677
678
STATIC mp_obj_t str_partition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, 1);
679
}
680

681
682
STATIC mp_obj_t str_rpartition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, -1);
683
684
}

685
686
687
688
689
690
691
692
693
694
695
696
697
698
STATIC machine_int_t str_get_buffer(mp_obj_t self_in, buffer_info_t *bufinfo, int flags) {
    if (flags == BUFFER_READ) {
        GET_STR_DATA_LEN(self_in, str_data, str_len);
        bufinfo->buf = (void*)str_data;
        bufinfo->len = str_len;
        return 0;
    } else {
        // can't write to a string
        bufinfo->buf = NULL;
        bufinfo->len = 0;
        return 1;
    }
}

699
700
701
702
703
704
705
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_startswith_obj, str_startswith);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, str_format);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace);
706
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count);
707
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition);
708
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition);
709

710
STATIC const mp_method_t str_type_methods[] = {
711
    { "find", &str_find_obj },
ian-v's avatar
ian-v committed
712
    { "join", &str_join_obj },
Paul Sokolovsky's avatar
Paul Sokolovsky committed
713
    { "split", &str_split_obj },
714
    { "startswith", &str_startswith_obj },
xbe's avatar
xbe committed
715
    { "strip", &str_strip_obj },
ian-v's avatar
ian-v committed
716
    { "format", &str_format_obj },
717
    { "replace", &str_replace_obj },
718
    { "count", &str_count_obj },
719
    { "partition", &str_partition_obj },
720
    { "rpartition", &str_rpartition_obj },
ian-v's avatar
ian-v committed
721
722
    { NULL, NULL }, // end-of-list sentinel
};
723

724
const mp_obj_type_t str_type = {
725
    { &mp_type_type },
726
    .name = MP_QSTR_str,
727
    .print = str_print,
728
    .make_new = str_make_new,
729
    .binary_op = str_binary_op,
730
731
    .getiter = mp_obj_new_str_iterator,
    .methods = str_type_methods,
732
    .buffer_p = { .get_buffer = str_get_buffer },
733
734
735
736
};

// Reuses most of methods from str
const mp_obj_type_t bytes_type = {
737
    { &mp_type_type },
738
    .name = MP_QSTR_bytes,
739
    .print = str_print,
740
    .make_new = bytes_make_new,
741
742
    .binary_op = str_binary_op,
    .getiter = mp_obj_new_bytes_iterator,
ian-v's avatar
ian-v committed
743
    .methods = str_type_methods,
744
745
};

746
747
748
749
// the zero-length bytes
STATIC const mp_obj_str_t empty_bytes_obj = {{&bytes_type}, 0, 0, NULL};
const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj;

750
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
751
    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
752
    o->base.type = type;
753
    o->len = len;
754
755
756
    byte *p = m_new(byte, len + 1);
    o->data = p;
    *data = p;
757
758
759
760
761
762
    return o;
}

mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
    mp_obj_str_t *o = o_in;
    o->hash = qstr_compute_hash(o->data, o->len);
763
764
    byte *p = (byte*)o->data;
    p[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
765
766
767
    return o;
}

768
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len) {
769
    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
770
771
    o->base.type = type;
    o->len = len;
772
773
774
775
776
777
778
    if (data) {
        o->hash = qstr_compute_hash(data, len);
        byte *p = m_new(byte, len + 1);
        o->data = p;
        memcpy(p, data, len * sizeof(byte));
        p[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    }
779
780
781
    return o;
}

782
783
784
785
786
787
788
789
790
791
mp_obj_t mp_obj_new_str(const byte* data, uint len, bool make_qstr_if_not_already) {
    qstr q = qstr_find_strn(data, len);
    if (q != MP_QSTR_NULL) {
        // qstr with this data already exists
        return MP_OBJ_NEW_QSTR(q);
    } else if (make_qstr_if_not_already) {
        // no existing qstr, make a new one
        return MP_OBJ_NEW_QSTR(qstr_from_strn((const char*)data, len));
    } else {
        // no existing qstr, don't make one
792
        return str_new(&str_type, data, len);
793
794
795
    }
}

796
797
798
799
mp_obj_t mp_obj_new_bytes(const byte* data, uint len) {
    return str_new(&bytes_type, data, len);
}

800
801
802
803
804
805
806
807
808
809
810
811
812
813
bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
    if (MP_OBJ_IS_QSTR(s1) && MP_OBJ_IS_QSTR(s2)) {
        return s1 == s2;
    } else {
        GET_STR_HASH(s1, h1);
        GET_STR_HASH(s2, h2);
        if (h1 != h2) {
            return false;
        }
        GET_STR_DATA_LEN(s1, d1, l1);
        GET_STR_DATA_LEN(s2, d2, l2);
        if (l1 != l2) {
            return false;
        }
814
        return memcmp(d1, d2, l1) == 0;
815
816
817
    }
}

818
819
void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn));
void bad_implicit_conversion(mp_obj_t self_in) {
820
    nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(self_in)));
821
822
}

823
824
825
826
827
uint mp_obj_str_get_hash(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_HASH(self_in, h);
        return h;
    } else {
828
        bad_implicit_conversion(self_in);
829
    }
830
831
832
833
834
835
836
}

uint mp_obj_str_get_len(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_LEN(self_in, l);
        return l;
    } else {
837
838
839
840
841
842
843
844
845
846
847
848
849
850
        bad_implicit_conversion(self_in);
    }
}

// use this if you will anyway convert the string to a qstr
// will be more efficient for the case where it's already a qstr
qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
    if (MP_OBJ_IS_QSTR(self_in)) {
        return MP_OBJ_QSTR_VALUE(self_in);
    } else if (MP_OBJ_IS_TYPE(self_in, &str_type)) {
        mp_obj_str_t *self = self_in;
        return qstr_from_strn((char*)self->data, self->len);
    } else {
        bad_implicit_conversion(self_in);
851
852
853
854
855
856
857
858
859
860
861
    }
}

// only use this function if you need the str data to be zero terminated
// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
const char *mp_obj_str_get_str(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        (void)l; // len unused
        return (const char*)s;
    } else {
862
        bad_implicit_conversion(self_in);
863
864
865
    }
}

866
const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
867
868
869
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        *len = l;
870
        return (const char*)s;
871
    } else {
872
        bad_implicit_conversion(self_in);
873
    }
874
}
xyb's avatar
xyb committed
875
876
877
878
879
880

/******************************************************************************/
/* str iterator                                                               */

typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;
881
    mp_obj_t str;
xyb's avatar
xyb committed
882
883
884
    machine_uint_t cur;
} mp_obj_str_it_t;

885
STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
xyb's avatar
xyb committed
886
    mp_obj_str_it_t *self = self_in;
887
888
889
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
        mp_obj_t o_out = mp_obj_new_str(str + self->cur, 1, true);
xyb's avatar
xyb committed
890
891
892
893
894
895
896
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

897
STATIC const mp_obj_type_t str_it_type = {
898
    { &mp_type_type },
899
    .name = MP_QSTR_iterator,
900
    .iternext = str_it_iternext,
xyb's avatar
xyb committed
901
902
};

903
STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
904
905
906
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
907
        mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT((mp_small_int_t)str[self->cur]);
908
909
910
911
912
913
914
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

915
STATIC const mp_obj_type_t bytes_it_type = {
916
    { &mp_type_type },
917
    .name = MP_QSTR_iterator,
918
919
920
921
    .iternext = bytes_it_iternext,
};

mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) {
xyb's avatar
xyb committed
922
923
924
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &str_it_type;
    o->str = str;
925
926
927
928
929
930
931
932
933
    o->cur = 0;
    return o;
}

mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) {
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &bytes_it_type;
    o->str = str;
    o->cur = 0;
xyb's avatar
xyb committed
934
935
    return o;
}