objstr.c 30.4 KB
Newer Older
xbe's avatar
xbe committed
1
#include <stdbool.h>
2
3
4
5
6
7
#include <string.h>
#include <assert.h>

#include "nlr.h"
#include "misc.h"
#include "mpconfig.h"
8
#include "qstr.h"
9
#include "obj.h"
10
#include "map.h"
11
12
13
14
15
#include "runtime0.h"
#include "runtime.h"

typedef struct _mp_obj_str_t {
    mp_obj_base_t base;
16
17
    machine_uint_t hash : 16; // XXX here we assume the hash size is 16 bits (it is at the moment; see qstr.c)
    machine_uint_t len : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
18
    const byte *data;
19
20
} mp_obj_str_t;

21
22
const mp_obj_t mp_const_empty_bytes;

23
24
25
26
27
28
29
30
31
// use this macro to extract the string hash
#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; }

// use this macro to extract the string length
#define GET_STR_LEN(str_obj_in, str_len) uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; }

// use this macro to extract the string data and length
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; }

32
33
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
34
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len);
xyb's avatar
xyb committed
35
36
37
38

/******************************************************************************/
/* str                                                                        */

39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
    // this escapes characters, but it will be very slow to print (calling print many times)
    bool has_single_quote = false;
    bool has_double_quote = false;
    for (const byte *s = str_data, *top = str_data + str_len; (!has_single_quote || !has_double_quote) && s < top; s++) {
        if (*s == '\'') {
            has_single_quote = true;
        } else if (*s == '"') {
            has_double_quote = true;
        }
    }
    int quote_char = '\'';
    if (has_single_quote && !has_double_quote) {
        quote_char = '"';
    }
    print(env, "%c", quote_char);
    for (const byte *s = str_data, *top = str_data + str_len; s < top; s++) {
        if (*s == quote_char) {
            print(env, "\\%c", quote_char);
        } else if (*s == '\\') {
            print(env, "\\\\");
        } else if (32 <= *s && *s <= 126) {
            print(env, "%c", *s);
        } else if (*s == '\n') {
            print(env, "\\n");
        // TODO add more escape codes here if we want to match CPython
        } else {
            print(env, "\\x%02x", *s);
        }
    }
    print(env, "%c", quote_char);
}

72
STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
73
    GET_STR_DATA_LEN(self_in, str_data, str_len);
74
75
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &bytes_type);
    if (kind == PRINT_STR && !is_bytes) {
76
        print(env, "%.*s", str_len, str_data);
77
    } else {
78
79
80
        if (is_bytes) {
            print(env, "b");
        }
81
        mp_str_print_quoted(print, env, str_data, str_len);
82
    }
83
84
}

85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
STATIC mp_obj_t str_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    switch (n_args) {
        case 0:
            return MP_OBJ_NEW_QSTR(MP_QSTR_);

        case 1:
        {
            vstr_t *vstr = vstr_new();
            mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[0], PRINT_STR);
            mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
            vstr_free(vstr);
            return s;
        }

        case 2:
        case 3:
        {
            // TODO: validate 2nd/3rd args
            if (!MP_OBJ_IS_TYPE(args[0], &bytes_type)) {
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "bytes expected"));
            }
            GET_STR_DATA_LEN(args[0], str_data, str_len);
            GET_STR_HASH(args[0], str_hash);
            mp_obj_str_t *o = str_new(&str_type, NULL, str_len);
            o->data = str_data;
            o->hash = str_hash;
            return o;
        }

        default:
            nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "str takes at most 3 arguments"));
    }
}

119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
STATIC mp_obj_t bytes_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    if (n_args == 0) {
        return mp_const_empty_bytes;
    }

    if (MP_OBJ_IS_STR(args[0])) {
        if (n_args < 2 || n_args > 3) {
            goto wrong_args;
        }
        GET_STR_DATA_LEN(args[0], str_data, str_len);
        GET_STR_HASH(args[0], str_hash);
        mp_obj_str_t *o = str_new(&bytes_type, NULL, str_len);
        o->data = str_data;
        o->hash = str_hash;
        return o;
    }

    if (n_args > 1) {
        goto wrong_args;
    }

    if (MP_OBJ_IS_SMALL_INT(args[0])) {
        uint len = MP_OBJ_SMALL_INT_VALUE(args[0]);
        byte *data;

        mp_obj_t o = mp_obj_str_builder_start(&bytes_type, len, &data);
        memset(data, 0, len);
        return mp_obj_str_builder_end(o);
    }

    int len;
    byte *data;
    vstr_t *vstr = NULL;
    mp_obj_t o = NULL;
    // Try to create array of exact len if initializer len is known
    mp_obj_t len_in = mp_obj_len_maybe(args[0]);
    if (len_in == MP_OBJ_NULL) {
        len = -1;
        vstr = vstr_new();
    } else {
        len = MP_OBJ_SMALL_INT_VALUE(len_in);
        o = mp_obj_str_builder_start(&bytes_type, len, &data);
    }

    mp_obj_t iterable = rt_getiter(args[0]);
    mp_obj_t item;
165
    while ((item = rt_iternext(iterable)) != MP_OBJ_NULL) {
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
        if (len == -1) {
            vstr_add_char(vstr, MP_OBJ_SMALL_INT_VALUE(item));
        } else {
            *data++ = MP_OBJ_SMALL_INT_VALUE(item);
        }
    }

    if (len == -1) {
        vstr_shrink(vstr);
        // TODO: Optimize, borrow buffer from vstr
        len = vstr_len(vstr);
        o = mp_obj_str_builder_start(&bytes_type, len, &data);
        memcpy(data, vstr_str(vstr), len);
        vstr_free(vstr);
    }

    return mp_obj_str_builder_end(o);

wrong_args:
        nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "wrong number of arguments"));
}

188
189
// like strstr but with specified length and allows \0 bytes
// TODO replace with something more efficient/standard
190
STATIC const byte *find_subbytes(const byte *haystack, machine_uint_t hlen, const byte *needle, machine_uint_t nlen, machine_int_t direction) {
191
    if (hlen >= nlen) {
192
193
194
195
196
197
198
199
200
201
202
203
        machine_uint_t str_index, str_index_end;
        if (direction > 0) {
            str_index = 0;
            str_index_end = hlen - nlen;
        } else {
            str_index = hlen - nlen;
            str_index_end = 0;
        }
        for (;;) {
            if (memcmp(&haystack[str_index], needle, nlen) == 0) {
                //found
                return haystack + str_index;
204
            }
205
206
207
            if (str_index == str_index_end) {
                //not found
                break;
208
            }
209
            str_index += direction;
210
211
212
213
214
        }
    }
    return NULL;
}

215
STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
216
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
217
218
    switch (op) {
        case RT_BINARY_OP_SUBSCR:
219
220
221
            // TODO: need predicate to check for int-like type (bools are such for example)
            // ["no", "yes"][1 == 2] is common idiom
            if (MP_OBJ_IS_SMALL_INT(rhs_in)) {
222
                uint index = mp_get_index(mp_obj_get_type(lhs_in), lhs_len, rhs_in, false);
223
                if (MP_OBJ_IS_TYPE(lhs_in, &bytes_type)) {
224
                    return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)lhs_data[index]);
225
226
227
                } else {
                    return mp_obj_new_str(lhs_data + index, 1, true);
                }
228
#if MICROPY_ENABLE_SLICE
229
            } else if (MP_OBJ_IS_TYPE(rhs_in, &slice_type)) {
230
                machine_uint_t start, stop;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
231
232
233
                if (!m_seq_get_fast_slice_indexes(lhs_len, rhs_in, &start, &stop)) {
                    assert(0);
                }
234
                return mp_obj_new_str(lhs_data + start, stop - start, false);
235
#endif
236
            } else {
237
238
                // Message doesn't match CPython, but we don't have so much bytes as they
                // to spend them on verbose wording
239
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "index must be int"));
240
            }
241
242
243

        case RT_BINARY_OP_ADD:
        case RT_BINARY_OP_INPLACE_ADD:
244
            if (MP_OBJ_IS_STR(rhs_in)) {
245
                // add 2 strings
246
247

                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
248
                int alloc_len = lhs_len + rhs_len;
249
250

                /* code for making qstr
251
252
253
254
                byte *q_ptr;
                byte *val = qstr_build_start(alloc_len, &q_ptr);
                memcpy(val, lhs_data, lhs_len);
                memcpy(val + lhs_len, rhs_data, rhs_len);
255
256
257
258
259
                return MP_OBJ_NEW_QSTR(qstr_build_end(q_ptr));
                */

                // code for non-qstr
                byte *data;
260
                mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), alloc_len, &data);
261
262
263
                memcpy(data, lhs_data, lhs_len);
                memcpy(data + lhs_len, rhs_data, rhs_len);
                return mp_obj_str_builder_end(s);
264
265
            }
            break;
266

267
        case RT_BINARY_OP_IN:
268
            /* NOTE `a in b` is `b.__contains__(a)` */
269
270
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
271
                return MP_BOOL(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len, 1) != NULL);
272
273
            }
            break;
274

275
276
277
278
279
280
        case RT_BINARY_OP_MULTIPLY:
        {
            if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
                return NULL;
            }
            int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
281
            byte *data;
282
            mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), lhs_len * n, &data);
283
284
            mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, data);
            return mp_obj_str_builder_end(s);
285
        }
286
287
288
289
290
291
292
293
294
295
296
297

        // These 2 are never passed here, dealt with as a special case in rt_binary_op().
        //case RT_BINARY_OP_EQUAL:
        //case RT_BINARY_OP_NOT_EQUAL:
        case RT_BINARY_OP_LESS:
        case RT_BINARY_OP_LESS_EQUAL:
        case RT_BINARY_OP_MORE:
        case RT_BINARY_OP_MORE_EQUAL:
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                return MP_BOOL(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));
            }
298
299
300
301
302
    }

    return MP_OBJ_NULL; // op not supported
}

303
STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
304
    assert(MP_OBJ_IS_STR(self_in));
305

306
    // get separation string
307
    GET_STR_DATA_LEN(self_in, sep_str, sep_len);
308
309

    // process args
310
311
312
313
314
315
316
317
318
    uint seq_len;
    mp_obj_t *seq_items;
    if (MP_OBJ_IS_TYPE(arg, &tuple_type)) {
        mp_obj_tuple_get(arg, &seq_len, &seq_items);
    } else if (MP_OBJ_IS_TYPE(arg, &list_type)) {
        mp_obj_list_get(arg, &seq_len, &seq_items);
    } else {
        goto bad_arg;
    }
319
320
321

    // count required length
    int required_len = 0;
322
    for (int i = 0; i < seq_len; i++) {
323
        if (!MP_OBJ_IS_STR(seq_items[i])) {
324
325
            goto bad_arg;
        }
326
327
328
        if (i > 0) {
            required_len += sep_len;
        }
329
330
        GET_STR_LEN(seq_items[i], l);
        required_len += l;
331
332
333
    }

    // make joined string
334
    byte *data;
335
    mp_obj_t joined_str = mp_obj_str_builder_start(mp_obj_get_type(self_in), required_len, &data);
336
337
    for (int i = 0; i < seq_len; i++) {
        if (i > 0) {
338
339
            memcpy(data, sep_str, sep_len);
            data += sep_len;
340
        }
341
342
343
        GET_STR_DATA_LEN(seq_items[i], s, l);
        memcpy(data, s, l);
        data += l;
344
    }
345
346

    // return joined string
347
    return mp_obj_str_builder_end(joined_str);
348
349

bad_arg:
350
    nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "?str.join expecting a list of str's"));
351
352
}

Paul Sokolovsky's avatar
Paul Sokolovsky committed
353
354
#define is_ws(c) ((c) == ' ' || (c) == '\t')

355
STATIC mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
356
357
358
359
360
361
362
363
364
    int splits = -1;
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
            splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
        }
    }
    assert(sep == mp_const_none);
365
    (void)sep; // unused; to hush compiler warning
Paul Sokolovsky's avatar
Paul Sokolovsky committed
366
    mp_obj_t res = mp_obj_new_list(0, NULL);
367
368
369
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;
    const byte *start;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
370
371

    // Initial whitespace is not counted as split, so we pre-do it
372
373
    while (s < top && is_ws(*s)) s++;
    while (s < top && splits != 0) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
374
        start = s;
375
376
377
        while (s < top && !is_ws(*s)) s++;
        rt_list_append(res, mp_obj_new_str(start, s - start, false));
        if (s >= top) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
378
379
            break;
        }
380
        while (s < top && is_ws(*s)) s++;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
381
382
383
384
385
        if (splits > 0) {
            splits--;
        }
    }

386
387
    if (s < top) {
        rt_list_append(res, mp_obj_new_str(s, top - s, false));
Paul Sokolovsky's avatar
Paul Sokolovsky committed
388
389
390
391
392
    }

    return res;
}

393
STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t direction) {
394
    assert(2 <= n_args && n_args <= 4);
395
396
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
397

398
399
    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);
400

401
402
    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
403
    if (n_args >= 3 && args[2] != mp_const_none) {
404
        start = mp_get_index(&str_type, haystack_len, args[2], true);
405
406
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
407
        end = mp_get_index(&str_type, haystack_len, args[3], true);
408
409
    }

410
    const byte *p = find_subbytes(haystack + start, end - start, needle, needle_len, direction);
411
412
413
414
415
    if (p == NULL) {
        // not found
        return MP_OBJ_NEW_SMALL_INT(-1);
    } else {
        // found
416
        return MP_OBJ_NEW_SMALL_INT(p - haystack);
417
418
419
    }
}

420
421
422
423
424
425
426
427
STATIC mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
    return str_finder(n_args, args, 1);
}

STATIC mp_obj_t str_rfind(uint n_args, const mp_obj_t *args) {
    return str_finder(n_args, args, -1);
}

428
// TODO: (Much) more variety in args
429
STATIC mp_obj_t str_startswith(mp_obj_t self_in, mp_obj_t arg) {
430
431
432
433
434
435
436
437
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, prefix, prefix_len);
    if (prefix_len > str_len) {
        return mp_const_false;
    }
    return MP_BOOL(memcmp(str, prefix, prefix_len) == 0);
}

438
STATIC mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
xbe's avatar
xbe committed
439
    assert(1 <= n_args && n_args <= 2);
440
441
442
443
444
    assert(MP_OBJ_IS_STR(args[0]));

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";
xbe's avatar
xbe committed
445
446
447

    if (n_args == 1) {
        chars_to_del = whitespace;
448
        chars_to_del_len = sizeof(whitespace);
xbe's avatar
xbe committed
449
    } else {
450
451
452
453
        assert(MP_OBJ_IS_STR(args[1]));
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
xbe's avatar
xbe committed
454
455
    }

456
    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);
xbe's avatar
xbe committed
457

458
    machine_uint_t first_good_char_pos = 0;
xbe's avatar
xbe committed
459
    bool first_good_char_pos_set = false;
460
461
    machine_uint_t last_good_char_pos = 0;
    for (machine_uint_t i = 0; i < orig_str_len; i++) {
462
        if (find_subbytes(chars_to_del, chars_to_del_len, &orig_str[i], 1, 1) == NULL) {
xbe's avatar
xbe committed
463
464
465
466
467
468
469
470
471
            last_good_char_pos = i;
            if (!first_good_char_pos_set) {
                first_good_char_pos = i;
                first_good_char_pos_set = true;
            }
        }
    }

    if (first_good_char_pos == 0 && last_good_char_pos == 0) {
472
473
        // string is all whitespace, return ''
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
xbe's avatar
xbe committed
474
475
476
477
    }

    assert(last_good_char_pos >= first_good_char_pos);
    //+1 to accomodate the last character
478
    machine_uint_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
479
    return mp_obj_new_str(orig_str + first_good_char_pos, stripped_len, false);
xbe's avatar
xbe committed
480
481
}

482
mp_obj_t str_format(uint n_args, const mp_obj_t *args) {
483
    assert(MP_OBJ_IS_STR(args[0]));
484

485
    GET_STR_DATA_LEN(args[0], str, len);
486
487
    int arg_i = 1;
    vstr_t *vstr = vstr_new();
488
    for (const byte *top = str + len; str < top; str++) {
489
490
        if (*str == '{') {
            str++;
491
            if (str < top && *str == '{') {
492
                vstr_add_char(vstr, '{');
493
            } else {
494
                while (str < top && *str != '}') str++;
495
                if (arg_i >= n_args) {
496
                    nlr_jump(mp_obj_new_exception_msg(&mp_type_IndexError, "tuple index out of range"));
497
                }
498
                // TODO: may be PRINT_REPR depending on formatting code
499
                mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[arg_i], PRINT_STR);
500
501
502
503
504
505
506
                arg_i++;
            }
        } else {
            vstr_add_char(vstr, *str);
        }
    }

507
508
509
    mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
    vstr_free(vstr);
    return s;
510
511
}

512
STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
513
514
515
516
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
    assert(MP_OBJ_IS_STR(args[2]));

517
    machine_int_t max_rep = 0;
518
    if (n_args == 4) {
519
520
521
522
523
524
525
        assert(MP_OBJ_IS_SMALL_INT(args[3]));
        max_rep = MP_OBJ_SMALL_INT_VALUE(args[3]);
        if (max_rep == 0) {
            return args[0];
        } else if (max_rep < 0) {
            max_rep = 0;
        }
526
    }
527
528

    // if max_rep is still 0 by this point we will need to do all possible replacements
529
530
531
532

    GET_STR_DATA_LEN(args[0], str, str_len);
    GET_STR_DATA_LEN(args[1], old, old_len);
    GET_STR_DATA_LEN(args[2], new, new_len);
533
534

    // old won't exist in str if it's longer, so nothing to replace
535
    if (old_len > str_len) {
536
        return args[0];
537
538
    }

539
540
541
542
543
544
545
546
547
548
549
550
551
    // data for the replaced string
    byte *data = NULL;
    mp_obj_t replaced_str = MP_OBJ_NULL;

    // do 2 passes over the string:
    //   first pass computes the required length of the replaced string
    //   second pass does the replacements
    for (;;) {
        machine_uint_t replaced_str_index = 0;
        machine_uint_t num_replacements_done = 0;
        const byte *old_occurrence;
        const byte *offset_ptr = str;
        machine_uint_t offset_num = 0;
552
        while ((old_occurrence = find_subbytes(offset_ptr, str_len - offset_num, old, old_len, 1)) != NULL) {
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
            // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
            if (data != NULL) {
                memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
            }
            replaced_str_index += old_occurrence - offset_ptr;
            // copy the replacement string
            if (data != NULL) {
                memcpy(data + replaced_str_index, new, new_len);
            }
            replaced_str_index += new_len;
            offset_ptr = old_occurrence + old_len;
            offset_num = offset_ptr - str;

            num_replacements_done++;
            if (max_rep != 0 && num_replacements_done == max_rep){
                break;
            }
        }

        // copy from just after end of last occurrence of to-be-replaced string to end of old string
        if (data != NULL) {
            memcpy(data + replaced_str_index, offset_ptr, str_len - offset_num);
        }
        replaced_str_index += str_len - offset_num;

        if (data == NULL) {
            // first pass
            if (num_replacements_done == 0) {
                // no substr found, return original string
                return args[0];
            } else {
                // substr found, allocate new string
                replaced_str = mp_obj_str_builder_start(mp_obj_get_type(args[0]), replaced_str_index, &data);
            }
        } else {
            // second pass, we are done
            break;
        }
591
    }
592

593
594
595
    return mp_obj_str_builder_end(replaced_str);
}

596
597
598
599
600
601
602
603
STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

604
605
    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
606
607
608
609
610
611
612
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3], true);
    }

613
614
615
    // if needle_len is zero then we count each gap between characters as an occurrence
    if (needle_len == 0) {
        return MP_OBJ_NEW_SMALL_INT(end - start + 1);
616
617
    }

618
619
    // count the occurrences
    machine_int_t num_occurrences = 0;
xbe's avatar
xbe committed
620
621
622
623
624
    for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) {
        if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) {
            num_occurrences++;
            haystack_index += needle_len - 1;
        }
625
626
627
628
629
    }

    return MP_OBJ_NEW_SMALL_INT(num_occurrences);
}

630
STATIC mp_obj_t str_partitioner(mp_obj_t self_in, mp_obj_t arg, machine_int_t direction) {
631
632
633
634
635
    assert(MP_OBJ_IS_STR(self_in));
    if (!MP_OBJ_IS_STR(arg)) {
        nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError,
                                               "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(arg)));
    }
636

637
638
639
640
641
642
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, sep, sep_len);

    if (sep_len == 0) {
        nlr_jump(mp_obj_new_exception_msg(&mp_type_ValueError, "empty separator"));
    }
643
644
645
646
647

    mp_obj_t result[] = {MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_)};

    if (direction > 0) {
        result[0] = self_in;
648
    } else {
649
        result[2] = self_in;
650
    }
651

652
653
654
655
656
657
    const byte *position_ptr = find_subbytes(str, str_len, sep, sep_len, direction);
    if (position_ptr != NULL) {
        machine_uint_t position = position_ptr - str;
        result[0] = mp_obj_new_str(str, position, false);
        result[1] = arg;
        result[2] = mp_obj_new_str(str + position + sep_len, str_len - position - sep_len, false);
658
    }
659

660
    return mp_obj_new_tuple(3, result);
661
662
}

663
664
STATIC mp_obj_t str_partition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, 1);
665
}
666

667
668
STATIC mp_obj_t str_rpartition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, -1);
669
670
}

671
672
673
674
675
676
677
678
679
680
681
682
683
684
STATIC machine_int_t str_get_buffer(mp_obj_t self_in, buffer_info_t *bufinfo, int flags) {
    if (flags == BUFFER_READ) {
        GET_STR_DATA_LEN(self_in, str_data, str_len);
        bufinfo->buf = (void*)str_data;
        bufinfo->len = str_len;
        return 0;
    } else {
        // can't write to a string
        bufinfo->buf = NULL;
        bufinfo->len = 0;
        return 1;
    }
}

685
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
686
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rfind_obj, 2, 4, str_rfind);
687
688
689
690
691
692
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_startswith_obj, str_startswith);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, str_format);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace);
693
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count);
694
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition);
695
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition);
696

697
698
699
700
701
702
703
704
705
706
707
708
STATIC const mp_map_elem_t str_locals_dict_table[] = {
    { MP_OBJ_NEW_QSTR(MP_QSTR_find), (mp_obj_t)&str_find_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_rfind), (mp_obj_t)&str_rfind_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_join), (mp_obj_t)&str_join_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_split), (mp_obj_t)&str_split_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_startswith), (mp_obj_t)&str_startswith_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_strip), (mp_obj_t)&str_strip_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_format), (mp_obj_t)&str_format_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_replace), (mp_obj_t)&str_replace_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_count), (mp_obj_t)&str_count_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_partition), (mp_obj_t)&str_partition_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_rpartition), (mp_obj_t)&str_rpartition_obj },
ian-v's avatar
ian-v committed
709
};
710

711
712
STATIC MP_DEFINE_CONST_DICT(str_locals_dict, str_locals_dict_table);

713
const mp_obj_type_t str_type = {
714
    { &mp_type_type },
715
    .name = MP_QSTR_str,
716
    .print = str_print,
717
    .make_new = str_make_new,
718
    .binary_op = str_binary_op,
719
    .getiter = mp_obj_new_str_iterator,
720
    .buffer_p = { .get_buffer = str_get_buffer },
721
    .locals_dict = (mp_obj_t)&str_locals_dict,
722
723
724
725
};

// Reuses most of methods from str
const mp_obj_type_t bytes_type = {
726
    { &mp_type_type },
727
    .name = MP_QSTR_bytes,
728
    .print = str_print,
729
    .make_new = bytes_make_new,
730
731
    .binary_op = str_binary_op,
    .getiter = mp_obj_new_bytes_iterator,
732
    .locals_dict = (mp_obj_t)&str_locals_dict,
733
734
};

735
736
737
738
// the zero-length bytes
STATIC const mp_obj_str_t empty_bytes_obj = {{&bytes_type}, 0, 0, NULL};
const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj;

739
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
740
    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
741
    o->base.type = type;
742
    o->len = len;
743
744
745
    byte *p = m_new(byte, len + 1);
    o->data = p;
    *data = p;
746
747
748
749
750
751
    return o;
}

mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
    mp_obj_str_t *o = o_in;
    o->hash = qstr_compute_hash(o->data, o->len);
752
753
    byte *p = (byte*)o->data;
    p[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
754
755
756
    return o;
}

757
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len) {
758
    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
759
760
    o->base.type = type;
    o->len = len;
761
762
763
764
765
766
767
    if (data) {
        o->hash = qstr_compute_hash(data, len);
        byte *p = m_new(byte, len + 1);
        o->data = p;
        memcpy(p, data, len * sizeof(byte));
        p[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    }
768
769
770
    return o;
}

771
772
773
774
775
776
777
778
779
780
mp_obj_t mp_obj_new_str(const byte* data, uint len, bool make_qstr_if_not_already) {
    qstr q = qstr_find_strn(data, len);
    if (q != MP_QSTR_NULL) {
        // qstr with this data already exists
        return MP_OBJ_NEW_QSTR(q);
    } else if (make_qstr_if_not_already) {
        // no existing qstr, make a new one
        return MP_OBJ_NEW_QSTR(qstr_from_strn((const char*)data, len));
    } else {
        // no existing qstr, don't make one
781
        return str_new(&str_type, data, len);
782
783
784
    }
}

785
786
787
788
mp_obj_t mp_obj_new_bytes(const byte* data, uint len) {
    return str_new(&bytes_type, data, len);
}

789
790
791
792
793
794
795
796
797
798
799
800
801
802
bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
    if (MP_OBJ_IS_QSTR(s1) && MP_OBJ_IS_QSTR(s2)) {
        return s1 == s2;
    } else {
        GET_STR_HASH(s1, h1);
        GET_STR_HASH(s2, h2);
        if (h1 != h2) {
            return false;
        }
        GET_STR_DATA_LEN(s1, d1, l1);
        GET_STR_DATA_LEN(s2, d2, l2);
        if (l1 != l2) {
            return false;
        }
803
        return memcmp(d1, d2, l1) == 0;
804
805
806
    }
}

807
808
void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn));
void bad_implicit_conversion(mp_obj_t self_in) {
809
    nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(self_in)));
810
811
}

812
813
814
815
816
uint mp_obj_str_get_hash(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_HASH(self_in, h);
        return h;
    } else {
817
        bad_implicit_conversion(self_in);
818
    }
819
820
821
822
823
824
825
}

uint mp_obj_str_get_len(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_LEN(self_in, l);
        return l;
    } else {
826
827
828
829
830
831
832
833
834
835
836
837
838
839
        bad_implicit_conversion(self_in);
    }
}

// use this if you will anyway convert the string to a qstr
// will be more efficient for the case where it's already a qstr
qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
    if (MP_OBJ_IS_QSTR(self_in)) {
        return MP_OBJ_QSTR_VALUE(self_in);
    } else if (MP_OBJ_IS_TYPE(self_in, &str_type)) {
        mp_obj_str_t *self = self_in;
        return qstr_from_strn((char*)self->data, self->len);
    } else {
        bad_implicit_conversion(self_in);
840
841
842
843
844
845
846
847
848
849
850
    }
}

// only use this function if you need the str data to be zero terminated
// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
const char *mp_obj_str_get_str(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        (void)l; // len unused
        return (const char*)s;
    } else {
851
        bad_implicit_conversion(self_in);
852
853
854
    }
}

855
const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
856
857
858
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        *len = l;
859
        return (const char*)s;
860
    } else {
861
        bad_implicit_conversion(self_in);
862
    }
863
}
xyb's avatar
xyb committed
864
865
866
867
868
869

/******************************************************************************/
/* str iterator                                                               */

typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;
870
    mp_obj_t str;
xyb's avatar
xyb committed
871
872
873
    machine_uint_t cur;
} mp_obj_str_it_t;

874
STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
xyb's avatar
xyb committed
875
    mp_obj_str_it_t *self = self_in;
876
877
878
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
        mp_obj_t o_out = mp_obj_new_str(str + self->cur, 1, true);
xyb's avatar
xyb committed
879
880
881
        self->cur += 1;
        return o_out;
    } else {
882
        return MP_OBJ_NULL;
xyb's avatar
xyb committed
883
884
885
    }
}

886
STATIC const mp_obj_type_t str_it_type = {
887
    { &mp_type_type },
888
    .name = MP_QSTR_iterator,
889
    .iternext = str_it_iternext,
xyb's avatar
xyb committed
890
891
};

892
STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
893
894
895
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
896
        mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT((mp_small_int_t)str[self->cur]);
897
898
899
        self->cur += 1;
        return o_out;
    } else {
900
        return MP_OBJ_NULL;
901
902
903
    }
}

904
STATIC const mp_obj_type_t bytes_it_type = {
905
    { &mp_type_type },
906
    .name = MP_QSTR_iterator,
907
908
909
910
    .iternext = bytes_it_iternext,
};

mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) {
xyb's avatar
xyb committed
911
912
913
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &str_it_type;
    o->str = str;
914
915
916
917
918
919
920
921
922
    o->cur = 0;
    return o;
}

mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) {
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &bytes_it_type;
    o->str = str;
    o->cur = 0;
xyb's avatar
xyb committed
923
924
    return o;
}