objstr.c 30.5 KB
Newer Older
xbe's avatar
xbe committed
1
#include <stdbool.h>
2
3
4
5
6
7
#include <string.h>
#include <assert.h>

#include "nlr.h"
#include "misc.h"
#include "mpconfig.h"
8
#include "qstr.h"
9
10
11
12
13
14
#include "obj.h"
#include "runtime0.h"
#include "runtime.h"

typedef struct _mp_obj_str_t {
    mp_obj_base_t base;
15
16
    machine_uint_t hash : 16; // XXX here we assume the hash size is 16 bits (it is at the moment; see qstr.c)
    machine_uint_t len : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
17
    const byte *data;
18
19
} mp_obj_str_t;

20
21
const mp_obj_t mp_const_empty_bytes;

22
23
24
25
26
27
28
29
30
// use this macro to extract the string hash
#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; }

// use this macro to extract the string length
#define GET_STR_LEN(str_obj_in, str_len) uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; }

// use this macro to extract the string data and length
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; }

31
32
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
33
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len);
xyb's avatar
xyb committed
34
35
36
37

/******************************************************************************/
/* str                                                                        */

38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
    // this escapes characters, but it will be very slow to print (calling print many times)
    bool has_single_quote = false;
    bool has_double_quote = false;
    for (const byte *s = str_data, *top = str_data + str_len; (!has_single_quote || !has_double_quote) && s < top; s++) {
        if (*s == '\'') {
            has_single_quote = true;
        } else if (*s == '"') {
            has_double_quote = true;
        }
    }
    int quote_char = '\'';
    if (has_single_quote && !has_double_quote) {
        quote_char = '"';
    }
    print(env, "%c", quote_char);
    for (const byte *s = str_data, *top = str_data + str_len; s < top; s++) {
        if (*s == quote_char) {
            print(env, "\\%c", quote_char);
        } else if (*s == '\\') {
            print(env, "\\\\");
        } else if (32 <= *s && *s <= 126) {
            print(env, "%c", *s);
        } else if (*s == '\n') {
            print(env, "\\n");
        // TODO add more escape codes here if we want to match CPython
        } else {
            print(env, "\\x%02x", *s);
        }
    }
    print(env, "%c", quote_char);
}

71
STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
72
    GET_STR_DATA_LEN(self_in, str_data, str_len);
73
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &mp_type_bytes);
74
    if (kind == PRINT_STR && !is_bytes) {
75
        print(env, "%.*s", str_len, str_data);
76
    } else {
77
78
79
        if (is_bytes) {
            print(env, "b");
        }
80
        mp_str_print_quoted(print, env, str_data, str_len);
81
    }
82
83
}

84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
STATIC mp_obj_t str_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    switch (n_args) {
        case 0:
            return MP_OBJ_NEW_QSTR(MP_QSTR_);

        case 1:
        {
            vstr_t *vstr = vstr_new();
            mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[0], PRINT_STR);
            mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
            vstr_free(vstr);
            return s;
        }

        case 2:
        case 3:
        {
            // TODO: validate 2nd/3rd args
102
            if (!MP_OBJ_IS_TYPE(args[0], &mp_type_bytes)) {
103
104
105
106
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "bytes expected"));
            }
            GET_STR_DATA_LEN(args[0], str_data, str_len);
            GET_STR_HASH(args[0], str_hash);
107
            mp_obj_str_t *o = str_new(&mp_type_str, NULL, str_len);
108
109
110
111
112
113
114
115
116
117
            o->data = str_data;
            o->hash = str_hash;
            return o;
        }

        default:
            nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "str takes at most 3 arguments"));
    }
}

118
119
120
121
122
123
124
125
126
127
128
STATIC mp_obj_t bytes_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    if (n_args == 0) {
        return mp_const_empty_bytes;
    }

    if (MP_OBJ_IS_STR(args[0])) {
        if (n_args < 2 || n_args > 3) {
            goto wrong_args;
        }
        GET_STR_DATA_LEN(args[0], str_data, str_len);
        GET_STR_HASH(args[0], str_hash);
129
        mp_obj_str_t *o = str_new(&mp_type_bytes, NULL, str_len);
130
131
132
133
134
135
136
137
138
139
140
141
142
        o->data = str_data;
        o->hash = str_hash;
        return o;
    }

    if (n_args > 1) {
        goto wrong_args;
    }

    if (MP_OBJ_IS_SMALL_INT(args[0])) {
        uint len = MP_OBJ_SMALL_INT_VALUE(args[0]);
        byte *data;

143
        mp_obj_t o = mp_obj_str_builder_start(&mp_type_bytes, len, &data);
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
        memset(data, 0, len);
        return mp_obj_str_builder_end(o);
    }

    int len;
    byte *data;
    vstr_t *vstr = NULL;
    mp_obj_t o = NULL;
    // Try to create array of exact len if initializer len is known
    mp_obj_t len_in = mp_obj_len_maybe(args[0]);
    if (len_in == MP_OBJ_NULL) {
        len = -1;
        vstr = vstr_new();
    } else {
        len = MP_OBJ_SMALL_INT_VALUE(len_in);
159
        o = mp_obj_str_builder_start(&mp_type_bytes, len, &data);
160
161
    }

Damien George's avatar
Damien George committed
162
    mp_obj_t iterable = mp_getiter(args[0]);
163
    mp_obj_t item;
Damien George's avatar
Damien George committed
164
    while ((item = mp_iternext(iterable)) != MP_OBJ_NULL) {
165
166
167
168
169
170
171
172
173
174
175
        if (len == -1) {
            vstr_add_char(vstr, MP_OBJ_SMALL_INT_VALUE(item));
        } else {
            *data++ = MP_OBJ_SMALL_INT_VALUE(item);
        }
    }

    if (len == -1) {
        vstr_shrink(vstr);
        // TODO: Optimize, borrow buffer from vstr
        len = vstr_len(vstr);
176
        o = mp_obj_str_builder_start(&mp_type_bytes, len, &data);
177
178
179
180
181
182
183
184
185
186
        memcpy(data, vstr_str(vstr), len);
        vstr_free(vstr);
    }

    return mp_obj_str_builder_end(o);

wrong_args:
        nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "wrong number of arguments"));
}

187
188
// like strstr but with specified length and allows \0 bytes
// TODO replace with something more efficient/standard
189
STATIC const byte *find_subbytes(const byte *haystack, machine_uint_t hlen, const byte *needle, machine_uint_t nlen, machine_int_t direction) {
190
    if (hlen >= nlen) {
191
192
193
194
195
196
197
198
199
200
201
202
        machine_uint_t str_index, str_index_end;
        if (direction > 0) {
            str_index = 0;
            str_index_end = hlen - nlen;
        } else {
            str_index = hlen - nlen;
            str_index_end = 0;
        }
        for (;;) {
            if (memcmp(&haystack[str_index], needle, nlen) == 0) {
                //found
                return haystack + str_index;
203
            }
204
205
206
            if (str_index == str_index_end) {
                //not found
                break;
207
            }
208
            str_index += direction;
209
210
211
212
213
        }
    }
    return NULL;
}

214
STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
215
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
216
    switch (op) {
Damien George's avatar
Damien George committed
217
        case MP_BINARY_OP_SUBSCR:
218
219
220
            // TODO: need predicate to check for int-like type (bools are such for example)
            // ["no", "yes"][1 == 2] is common idiom
            if (MP_OBJ_IS_SMALL_INT(rhs_in)) {
221
                uint index = mp_get_index(mp_obj_get_type(lhs_in), lhs_len, rhs_in, false);
222
                if (MP_OBJ_IS_TYPE(lhs_in, &mp_type_bytes)) {
223
                    return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)lhs_data[index]);
224
225
226
                } else {
                    return mp_obj_new_str(lhs_data + index, 1, true);
                }
227
#if MICROPY_ENABLE_SLICE
228
            } else if (MP_OBJ_IS_TYPE(rhs_in, &mp_type_slice)) {
229
                machine_uint_t start, stop;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
230
231
232
                if (!m_seq_get_fast_slice_indexes(lhs_len, rhs_in, &start, &stop)) {
                    assert(0);
                }
233
                return mp_obj_new_str(lhs_data + start, stop - start, false);
234
#endif
235
            } else {
236
237
                // Message doesn't match CPython, but we don't have so much bytes as they
                // to spend them on verbose wording
238
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "index must be int"));
239
            }
240

Damien George's avatar
Damien George committed
241
242
        case MP_BINARY_OP_ADD:
        case MP_BINARY_OP_INPLACE_ADD:
243
            if (MP_OBJ_IS_STR(rhs_in)) {
244
                // add 2 strings
245
246

                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
247
                int alloc_len = lhs_len + rhs_len;
248
249

                /* code for making qstr
250
251
252
253
                byte *q_ptr;
                byte *val = qstr_build_start(alloc_len, &q_ptr);
                memcpy(val, lhs_data, lhs_len);
                memcpy(val + lhs_len, rhs_data, rhs_len);
254
255
256
257
258
                return MP_OBJ_NEW_QSTR(qstr_build_end(q_ptr));
                */

                // code for non-qstr
                byte *data;
259
                mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), alloc_len, &data);
260
261
262
                memcpy(data, lhs_data, lhs_len);
                memcpy(data + lhs_len, rhs_data, rhs_len);
                return mp_obj_str_builder_end(s);
263
264
            }
            break;
265

Damien George's avatar
Damien George committed
266
        case MP_BINARY_OP_IN:
267
            /* NOTE `a in b` is `b.__contains__(a)` */
268
269
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
270
                return MP_BOOL(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len, 1) != NULL);
271
272
            }
            break;
273

Damien George's avatar
Damien George committed
274
        case MP_BINARY_OP_MULTIPLY:
275
276
277
278
279
        {
            if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
                return NULL;
            }
            int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
280
            byte *data;
281
            mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), lhs_len * n, &data);
282
283
            mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, data);
            return mp_obj_str_builder_end(s);
284
        }
285

Damien George's avatar
Damien George committed
286
287
288
289
290
291
292
        // These 2 are never passed here, dealt with as a special case in mp_binary_op().
        //case MP_BINARY_OP_EQUAL:
        //case MP_BINARY_OP_NOT_EQUAL:
        case MP_BINARY_OP_LESS:
        case MP_BINARY_OP_LESS_EQUAL:
        case MP_BINARY_OP_MORE:
        case MP_BINARY_OP_MORE_EQUAL:
293
294
295
296
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                return MP_BOOL(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));
            }
297
298
299
300
301
    }

    return MP_OBJ_NULL; // op not supported
}

302
STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
303
    assert(MP_OBJ_IS_STR(self_in));
304

305
    // get separation string
306
    GET_STR_DATA_LEN(self_in, sep_str, sep_len);
307
308

    // process args
309
310
    uint seq_len;
    mp_obj_t *seq_items;
311
    if (MP_OBJ_IS_TYPE(arg, &mp_type_tuple)) {
312
        mp_obj_tuple_get(arg, &seq_len, &seq_items);
313
    } else if (MP_OBJ_IS_TYPE(arg, &mp_type_list)) {
314
315
316
317
        mp_obj_list_get(arg, &seq_len, &seq_items);
    } else {
        goto bad_arg;
    }
318
319
320

    // count required length
    int required_len = 0;
321
    for (int i = 0; i < seq_len; i++) {
322
        if (!MP_OBJ_IS_STR(seq_items[i])) {
323
324
            goto bad_arg;
        }
325
326
327
        if (i > 0) {
            required_len += sep_len;
        }
328
329
        GET_STR_LEN(seq_items[i], l);
        required_len += l;
330
331
332
    }

    // make joined string
333
    byte *data;
334
    mp_obj_t joined_str = mp_obj_str_builder_start(mp_obj_get_type(self_in), required_len, &data);
335
336
    for (int i = 0; i < seq_len; i++) {
        if (i > 0) {
337
338
            memcpy(data, sep_str, sep_len);
            data += sep_len;
339
        }
340
341
342
        GET_STR_DATA_LEN(seq_items[i], s, l);
        memcpy(data, s, l);
        data += l;
343
    }
344
345

    // return joined string
346
    return mp_obj_str_builder_end(joined_str);
347
348

bad_arg:
349
    nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "?str.join expecting a list of str's"));
350
351
}

Paul Sokolovsky's avatar
Paul Sokolovsky committed
352
353
#define is_ws(c) ((c) == ' ' || (c) == '\t')

354
STATIC mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
355
356
357
358
359
360
361
362
363
    int splits = -1;
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
            splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
        }
    }
    assert(sep == mp_const_none);
364
    (void)sep; // unused; to hush compiler warning
Paul Sokolovsky's avatar
Paul Sokolovsky committed
365
    mp_obj_t res = mp_obj_new_list(0, NULL);
366
367
368
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;
    const byte *start;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
369
370

    // Initial whitespace is not counted as split, so we pre-do it
371
372
    while (s < top && is_ws(*s)) s++;
    while (s < top && splits != 0) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
373
        start = s;
374
        while (s < top && !is_ws(*s)) s++;
Damien George's avatar
Damien George committed
375
        mp_list_append(res, mp_obj_new_str(start, s - start, false));
376
        if (s >= top) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
377
378
            break;
        }
379
        while (s < top && is_ws(*s)) s++;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
380
381
382
383
384
        if (splits > 0) {
            splits--;
        }
    }

385
    if (s < top) {
Damien George's avatar
Damien George committed
386
        mp_list_append(res, mp_obj_new_str(s, top - s, false));
Paul Sokolovsky's avatar
Paul Sokolovsky committed
387
388
389
390
391
    }

    return res;
}

392
STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t direction) {
393
    assert(2 <= n_args && n_args <= 4);
394
395
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
396

397
398
    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);
399

400
401
    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
402
    if (n_args >= 3 && args[2] != mp_const_none) {
403
        start = mp_get_index(&mp_type_str, haystack_len, args[2], true);
404
405
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
406
        end = mp_get_index(&mp_type_str, haystack_len, args[3], true);
407
408
    }

409
    const byte *p = find_subbytes(haystack + start, end - start, needle, needle_len, direction);
410
411
412
413
414
    if (p == NULL) {
        // not found
        return MP_OBJ_NEW_SMALL_INT(-1);
    } else {
        // found
415
        return MP_OBJ_NEW_SMALL_INT(p - haystack);
416
417
418
    }
}

419
420
421
422
423
424
425
426
STATIC mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
    return str_finder(n_args, args, 1);
}

STATIC mp_obj_t str_rfind(uint n_args, const mp_obj_t *args) {
    return str_finder(n_args, args, -1);
}

427
// TODO: (Much) more variety in args
428
STATIC mp_obj_t str_startswith(mp_obj_t self_in, mp_obj_t arg) {
429
430
431
432
433
434
435
436
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, prefix, prefix_len);
    if (prefix_len > str_len) {
        return mp_const_false;
    }
    return MP_BOOL(memcmp(str, prefix, prefix_len) == 0);
}

437
STATIC mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
xbe's avatar
xbe committed
438
    assert(1 <= n_args && n_args <= 2);
439
440
441
442
443
    assert(MP_OBJ_IS_STR(args[0]));

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";
xbe's avatar
xbe committed
444
445
446

    if (n_args == 1) {
        chars_to_del = whitespace;
447
        chars_to_del_len = sizeof(whitespace);
xbe's avatar
xbe committed
448
    } else {
449
450
451
452
        assert(MP_OBJ_IS_STR(args[1]));
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
xbe's avatar
xbe committed
453
454
    }

455
    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);
xbe's avatar
xbe committed
456

457
    machine_uint_t first_good_char_pos = 0;
xbe's avatar
xbe committed
458
    bool first_good_char_pos_set = false;
459
460
    machine_uint_t last_good_char_pos = 0;
    for (machine_uint_t i = 0; i < orig_str_len; i++) {
461
        if (find_subbytes(chars_to_del, chars_to_del_len, &orig_str[i], 1, 1) == NULL) {
xbe's avatar
xbe committed
462
463
464
465
466
467
468
469
470
            last_good_char_pos = i;
            if (!first_good_char_pos_set) {
                first_good_char_pos = i;
                first_good_char_pos_set = true;
            }
        }
    }

    if (first_good_char_pos == 0 && last_good_char_pos == 0) {
471
472
        // string is all whitespace, return ''
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
xbe's avatar
xbe committed
473
474
475
476
    }

    assert(last_good_char_pos >= first_good_char_pos);
    //+1 to accomodate the last character
477
    machine_uint_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
478
    return mp_obj_new_str(orig_str + first_good_char_pos, stripped_len, false);
xbe's avatar
xbe committed
479
480
}

481
mp_obj_t str_format(uint n_args, const mp_obj_t *args) {
482
    assert(MP_OBJ_IS_STR(args[0]));
483

484
    GET_STR_DATA_LEN(args[0], str, len);
485
486
    int arg_i = 1;
    vstr_t *vstr = vstr_new();
487
    for (const byte *top = str + len; str < top; str++) {
488
489
        if (*str == '{') {
            str++;
490
            if (str < top && *str == '{') {
491
                vstr_add_char(vstr, '{');
492
            } else {
493
                while (str < top && *str != '}') str++;
494
                if (arg_i >= n_args) {
495
                    nlr_jump(mp_obj_new_exception_msg(&mp_type_IndexError, "tuple index out of range"));
496
                }
497
                // TODO: may be PRINT_REPR depending on formatting code
498
                mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[arg_i], PRINT_STR);
499
500
501
502
503
504
505
                arg_i++;
            }
        } else {
            vstr_add_char(vstr, *str);
        }
    }

506
507
508
    mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
    vstr_free(vstr);
    return s;
509
510
}

511
STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
512
513
514
515
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
    assert(MP_OBJ_IS_STR(args[2]));

516
    machine_int_t max_rep = 0;
517
    if (n_args == 4) {
518
519
520
521
522
523
524
        assert(MP_OBJ_IS_SMALL_INT(args[3]));
        max_rep = MP_OBJ_SMALL_INT_VALUE(args[3]);
        if (max_rep == 0) {
            return args[0];
        } else if (max_rep < 0) {
            max_rep = 0;
        }
525
    }
526
527

    // if max_rep is still 0 by this point we will need to do all possible replacements
528
529
530
531

    GET_STR_DATA_LEN(args[0], str, str_len);
    GET_STR_DATA_LEN(args[1], old, old_len);
    GET_STR_DATA_LEN(args[2], new, new_len);
532
533

    // old won't exist in str if it's longer, so nothing to replace
534
    if (old_len > str_len) {
535
        return args[0];
536
537
    }

538
539
540
541
542
543
544
545
546
547
548
549
550
    // data for the replaced string
    byte *data = NULL;
    mp_obj_t replaced_str = MP_OBJ_NULL;

    // do 2 passes over the string:
    //   first pass computes the required length of the replaced string
    //   second pass does the replacements
    for (;;) {
        machine_uint_t replaced_str_index = 0;
        machine_uint_t num_replacements_done = 0;
        const byte *old_occurrence;
        const byte *offset_ptr = str;
        machine_uint_t offset_num = 0;
551
        while ((old_occurrence = find_subbytes(offset_ptr, str_len - offset_num, old, old_len, 1)) != NULL) {
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
            // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
            if (data != NULL) {
                memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
            }
            replaced_str_index += old_occurrence - offset_ptr;
            // copy the replacement string
            if (data != NULL) {
                memcpy(data + replaced_str_index, new, new_len);
            }
            replaced_str_index += new_len;
            offset_ptr = old_occurrence + old_len;
            offset_num = offset_ptr - str;

            num_replacements_done++;
            if (max_rep != 0 && num_replacements_done == max_rep){
                break;
            }
        }

        // copy from just after end of last occurrence of to-be-replaced string to end of old string
        if (data != NULL) {
            memcpy(data + replaced_str_index, offset_ptr, str_len - offset_num);
        }
        replaced_str_index += str_len - offset_num;

        if (data == NULL) {
            // first pass
            if (num_replacements_done == 0) {
                // no substr found, return original string
                return args[0];
            } else {
                // substr found, allocate new string
                replaced_str = mp_obj_str_builder_start(mp_obj_get_type(args[0]), replaced_str_index, &data);
            }
        } else {
            // second pass, we are done
            break;
        }
590
    }
591

592
593
594
    return mp_obj_str_builder_end(replaced_str);
}

595
596
597
598
599
600
601
602
STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

603
604
    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
605
    if (n_args >= 3 && args[2] != mp_const_none) {
606
        start = mp_get_index(&mp_type_str, haystack_len, args[2], true);
607
608
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
609
        end = mp_get_index(&mp_type_str, haystack_len, args[3], true);
610
611
    }

612
613
614
    // if needle_len is zero then we count each gap between characters as an occurrence
    if (needle_len == 0) {
        return MP_OBJ_NEW_SMALL_INT(end - start + 1);
615
616
    }

617
618
    // count the occurrences
    machine_int_t num_occurrences = 0;
xbe's avatar
xbe committed
619
620
621
622
623
    for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) {
        if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) {
            num_occurrences++;
            haystack_index += needle_len - 1;
        }
624
625
626
627
628
    }

    return MP_OBJ_NEW_SMALL_INT(num_occurrences);
}

629
STATIC mp_obj_t str_partitioner(mp_obj_t self_in, mp_obj_t arg, machine_int_t direction) {
630
631
632
633
634
    assert(MP_OBJ_IS_STR(self_in));
    if (!MP_OBJ_IS_STR(arg)) {
        nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError,
                                               "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(arg)));
    }
635

636
637
638
639
640
641
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, sep, sep_len);

    if (sep_len == 0) {
        nlr_jump(mp_obj_new_exception_msg(&mp_type_ValueError, "empty separator"));
    }
642
643
644
645
646

    mp_obj_t result[] = {MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_)};

    if (direction > 0) {
        result[0] = self_in;
647
    } else {
648
        result[2] = self_in;
649
    }
650

651
652
653
654
655
656
    const byte *position_ptr = find_subbytes(str, str_len, sep, sep_len, direction);
    if (position_ptr != NULL) {
        machine_uint_t position = position_ptr - str;
        result[0] = mp_obj_new_str(str, position, false);
        result[1] = arg;
        result[2] = mp_obj_new_str(str + position + sep_len, str_len - position - sep_len, false);
657
    }
658

659
    return mp_obj_new_tuple(3, result);
660
661
}

662
663
STATIC mp_obj_t str_partition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, 1);
664
}
665

666
667
STATIC mp_obj_t str_rpartition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, -1);
668
669
}

670
671
672
673
674
675
676
677
678
679
680
681
682
683
STATIC machine_int_t str_get_buffer(mp_obj_t self_in, buffer_info_t *bufinfo, int flags) {
    if (flags == BUFFER_READ) {
        GET_STR_DATA_LEN(self_in, str_data, str_len);
        bufinfo->buf = (void*)str_data;
        bufinfo->len = str_len;
        return 0;
    } else {
        // can't write to a string
        bufinfo->buf = NULL;
        bufinfo->len = 0;
        return 1;
    }
}

684
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
685
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rfind_obj, 2, 4, str_rfind);
686
687
688
689
690
691
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_startswith_obj, str_startswith);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, str_format);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace);
692
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count);
693
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition);
694
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition);
695

696
697
698
699
700
701
702
703
704
705
706
707
STATIC const mp_map_elem_t str_locals_dict_table[] = {
    { MP_OBJ_NEW_QSTR(MP_QSTR_find), (mp_obj_t)&str_find_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_rfind), (mp_obj_t)&str_rfind_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_join), (mp_obj_t)&str_join_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_split), (mp_obj_t)&str_split_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_startswith), (mp_obj_t)&str_startswith_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_strip), (mp_obj_t)&str_strip_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_format), (mp_obj_t)&str_format_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_replace), (mp_obj_t)&str_replace_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_count), (mp_obj_t)&str_count_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_partition), (mp_obj_t)&str_partition_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_rpartition), (mp_obj_t)&str_rpartition_obj },
ian-v's avatar
ian-v committed
708
};
709

710
711
STATIC MP_DEFINE_CONST_DICT(str_locals_dict, str_locals_dict_table);

712
const mp_obj_type_t mp_type_str = {
713
    { &mp_type_type },
714
    .name = MP_QSTR_str,
715
    .print = str_print,
716
    .make_new = str_make_new,
717
    .binary_op = str_binary_op,
718
    .getiter = mp_obj_new_str_iterator,
719
    .buffer_p = { .get_buffer = str_get_buffer },
720
    .locals_dict = (mp_obj_t)&str_locals_dict,
721
722
723
};

// Reuses most of methods from str
724
const mp_obj_type_t mp_type_bytes = {
725
    { &mp_type_type },
726
    .name = MP_QSTR_bytes,
727
    .print = str_print,
728
    .make_new = bytes_make_new,
729
730
    .binary_op = str_binary_op,
    .getiter = mp_obj_new_bytes_iterator,
731
    .locals_dict = (mp_obj_t)&str_locals_dict,
732
733
};

734
// the zero-length bytes
735
STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, NULL};
736
737
const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj;

738
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
739
    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
740
    o->base.type = type;
741
    o->len = len;
742
743
744
    byte *p = m_new(byte, len + 1);
    o->data = p;
    *data = p;
745
746
747
748
749
750
    return o;
}

mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
    mp_obj_str_t *o = o_in;
    o->hash = qstr_compute_hash(o->data, o->len);
751
752
    byte *p = (byte*)o->data;
    p[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
753
754
755
    return o;
}

756
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len) {
757
    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
758
759
    o->base.type = type;
    o->len = len;
760
761
762
763
764
765
766
    if (data) {
        o->hash = qstr_compute_hash(data, len);
        byte *p = m_new(byte, len + 1);
        o->data = p;
        memcpy(p, data, len * sizeof(byte));
        p[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    }
767
768
769
    return o;
}

770
771
772
773
774
775
776
777
778
779
mp_obj_t mp_obj_new_str(const byte* data, uint len, bool make_qstr_if_not_already) {
    qstr q = qstr_find_strn(data, len);
    if (q != MP_QSTR_NULL) {
        // qstr with this data already exists
        return MP_OBJ_NEW_QSTR(q);
    } else if (make_qstr_if_not_already) {
        // no existing qstr, make a new one
        return MP_OBJ_NEW_QSTR(qstr_from_strn((const char*)data, len));
    } else {
        // no existing qstr, don't make one
780
        return str_new(&mp_type_str, data, len);
781
782
783
    }
}

784
mp_obj_t mp_obj_new_bytes(const byte* data, uint len) {
785
    return str_new(&mp_type_bytes, data, len);
786
787
}

788
789
790
791
792
793
794
795
796
797
798
799
800
801
bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
    if (MP_OBJ_IS_QSTR(s1) && MP_OBJ_IS_QSTR(s2)) {
        return s1 == s2;
    } else {
        GET_STR_HASH(s1, h1);
        GET_STR_HASH(s2, h2);
        if (h1 != h2) {
            return false;
        }
        GET_STR_DATA_LEN(s1, d1, l1);
        GET_STR_DATA_LEN(s2, d2, l2);
        if (l1 != l2) {
            return false;
        }
802
        return memcmp(d1, d2, l1) == 0;
803
804
805
    }
}

806
807
void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn));
void bad_implicit_conversion(mp_obj_t self_in) {
808
    nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(self_in)));
809
810
}

811
812
813
814
815
uint mp_obj_str_get_hash(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_HASH(self_in, h);
        return h;
    } else {
816
        bad_implicit_conversion(self_in);
817
    }
818
819
820
821
822
823
824
}

uint mp_obj_str_get_len(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_LEN(self_in, l);
        return l;
    } else {
825
826
827
828
829
830
831
832
833
        bad_implicit_conversion(self_in);
    }
}

// use this if you will anyway convert the string to a qstr
// will be more efficient for the case where it's already a qstr
qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
    if (MP_OBJ_IS_QSTR(self_in)) {
        return MP_OBJ_QSTR_VALUE(self_in);
834
    } else if (MP_OBJ_IS_TYPE(self_in, &mp_type_str)) {
835
836
837
838
        mp_obj_str_t *self = self_in;
        return qstr_from_strn((char*)self->data, self->len);
    } else {
        bad_implicit_conversion(self_in);
839
840
841
842
843
844
845
846
847
848
849
    }
}

// only use this function if you need the str data to be zero terminated
// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
const char *mp_obj_str_get_str(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        (void)l; // len unused
        return (const char*)s;
    } else {
850
        bad_implicit_conversion(self_in);
851
852
853
    }
}

854
const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
855
856
857
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        *len = l;
858
        return (const char*)s;
859
    } else {
860
        bad_implicit_conversion(self_in);
861
    }
862
}
xyb's avatar
xyb committed
863
864
865
866
867
868

/******************************************************************************/
/* str iterator                                                               */

typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;
869
    mp_obj_t str;
xyb's avatar
xyb committed
870
871
872
    machine_uint_t cur;
} mp_obj_str_it_t;

873
STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
xyb's avatar
xyb committed
874
    mp_obj_str_it_t *self = self_in;
875
876
877
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
        mp_obj_t o_out = mp_obj_new_str(str + self->cur, 1, true);
xyb's avatar
xyb committed
878
879
880
        self->cur += 1;
        return o_out;
    } else {
881
        return MP_OBJ_NULL;
xyb's avatar
xyb committed
882
883
884
    }
}

885
STATIC const mp_obj_type_t mp_type_str_it = {
886
    { &mp_type_type },
887
    .name = MP_QSTR_iterator,
888
    .iternext = str_it_iternext,
xyb's avatar
xyb committed
889
890
};

891
STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
892
893
894
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
895
        mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT((mp_small_int_t)str[self->cur]);
896
897
898
        self->cur += 1;
        return o_out;
    } else {
899
        return MP_OBJ_NULL;
900
901
902
    }
}

903
STATIC const mp_obj_type_t mp_type_bytes_it = {
904
    { &mp_type_type },
905
    .name = MP_QSTR_iterator,
906
907
908
909
    .iternext = bytes_it_iternext,
};

mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) {
xyb's avatar
xyb committed
910
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
911
    o->base.type = &mp_type_str_it;
xyb's avatar
xyb committed
912
    o->str = str;
913
914
915
916
917
918
    o->cur = 0;
    return o;
}

mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) {
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
919
    o->base.type = &mp_type_bytes_it;
920
921
    o->str = str;
    o->cur = 0;
xyb's avatar
xyb committed
922
923
    return o;
}