objstr.c 30.1 KB
Newer Older
xbe's avatar
xbe committed
1
#include <stdbool.h>
2
3
4
5
6
7
#include <string.h>
#include <assert.h>

#include "nlr.h"
#include "misc.h"
#include "mpconfig.h"
8
#include "qstr.h"
9
10
11
12
13
14
#include "obj.h"
#include "runtime0.h"
#include "runtime.h"

typedef struct _mp_obj_str_t {
    mp_obj_base_t base;
15
16
    machine_uint_t hash : 16; // XXX here we assume the hash size is 16 bits (it is at the moment; see qstr.c)
    machine_uint_t len : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
17
    const byte *data;
18
19
} mp_obj_str_t;

20
21
const mp_obj_t mp_const_empty_bytes;

22
23
24
25
26
27
28
29
30
// use this macro to extract the string hash
#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; }

// use this macro to extract the string length
#define GET_STR_LEN(str_obj_in, str_len) uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; }

// use this macro to extract the string data and length
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; }

31
32
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
33
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len);
xyb's avatar
xyb committed
34
35
36
37

/******************************************************************************/
/* str                                                                        */

38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
    // this escapes characters, but it will be very slow to print (calling print many times)
    bool has_single_quote = false;
    bool has_double_quote = false;
    for (const byte *s = str_data, *top = str_data + str_len; (!has_single_quote || !has_double_quote) && s < top; s++) {
        if (*s == '\'') {
            has_single_quote = true;
        } else if (*s == '"') {
            has_double_quote = true;
        }
    }
    int quote_char = '\'';
    if (has_single_quote && !has_double_quote) {
        quote_char = '"';
    }
    print(env, "%c", quote_char);
    for (const byte *s = str_data, *top = str_data + str_len; s < top; s++) {
        if (*s == quote_char) {
            print(env, "\\%c", quote_char);
        } else if (*s == '\\') {
            print(env, "\\\\");
        } else if (32 <= *s && *s <= 126) {
            print(env, "%c", *s);
        } else if (*s == '\n') {
            print(env, "\\n");
        // TODO add more escape codes here if we want to match CPython
        } else {
            print(env, "\\x%02x", *s);
        }
    }
    print(env, "%c", quote_char);
}

71
STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
72
    GET_STR_DATA_LEN(self_in, str_data, str_len);
73
74
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &bytes_type);
    if (kind == PRINT_STR && !is_bytes) {
75
        print(env, "%.*s", str_len, str_data);
76
    } else {
77
78
79
        if (is_bytes) {
            print(env, "b");
        }
80
        mp_str_print_quoted(print, env, str_data, str_len);
81
    }
82
83
}

84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
STATIC mp_obj_t str_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    switch (n_args) {
        case 0:
            return MP_OBJ_NEW_QSTR(MP_QSTR_);

        case 1:
        {
            vstr_t *vstr = vstr_new();
            mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[0], PRINT_STR);
            mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
            vstr_free(vstr);
            return s;
        }

        case 2:
        case 3:
        {
            // TODO: validate 2nd/3rd args
            if (!MP_OBJ_IS_TYPE(args[0], &bytes_type)) {
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "bytes expected"));
            }
            GET_STR_DATA_LEN(args[0], str_data, str_len);
            GET_STR_HASH(args[0], str_hash);
            mp_obj_str_t *o = str_new(&str_type, NULL, str_len);
            o->data = str_data;
            o->hash = str_hash;
            return o;
        }

        default:
            nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "str takes at most 3 arguments"));
    }
}

118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
STATIC mp_obj_t bytes_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    if (n_args == 0) {
        return mp_const_empty_bytes;
    }

    if (MP_OBJ_IS_STR(args[0])) {
        if (n_args < 2 || n_args > 3) {
            goto wrong_args;
        }
        GET_STR_DATA_LEN(args[0], str_data, str_len);
        GET_STR_HASH(args[0], str_hash);
        mp_obj_str_t *o = str_new(&bytes_type, NULL, str_len);
        o->data = str_data;
        o->hash = str_hash;
        return o;
    }

    if (n_args > 1) {
        goto wrong_args;
    }

    if (MP_OBJ_IS_SMALL_INT(args[0])) {
        uint len = MP_OBJ_SMALL_INT_VALUE(args[0]);
        byte *data;

        mp_obj_t o = mp_obj_str_builder_start(&bytes_type, len, &data);
        memset(data, 0, len);
        return mp_obj_str_builder_end(o);
    }

    int len;
    byte *data;
    vstr_t *vstr = NULL;
    mp_obj_t o = NULL;
    // Try to create array of exact len if initializer len is known
    mp_obj_t len_in = mp_obj_len_maybe(args[0]);
    if (len_in == MP_OBJ_NULL) {
        len = -1;
        vstr = vstr_new();
    } else {
        len = MP_OBJ_SMALL_INT_VALUE(len_in);
        o = mp_obj_str_builder_start(&bytes_type, len, &data);
    }

    mp_obj_t iterable = rt_getiter(args[0]);
    mp_obj_t item;
    while ((item = rt_iternext(iterable)) != mp_const_stop_iteration) {
        if (len == -1) {
            vstr_add_char(vstr, MP_OBJ_SMALL_INT_VALUE(item));
        } else {
            *data++ = MP_OBJ_SMALL_INT_VALUE(item);
        }
    }

    if (len == -1) {
        vstr_shrink(vstr);
        // TODO: Optimize, borrow buffer from vstr
        len = vstr_len(vstr);
        o = mp_obj_str_builder_start(&bytes_type, len, &data);
        memcpy(data, vstr_str(vstr), len);
        vstr_free(vstr);
    }

    return mp_obj_str_builder_end(o);

wrong_args:
        nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "wrong number of arguments"));
}

187
188
// like strstr but with specified length and allows \0 bytes
// TODO replace with something more efficient/standard
189
STATIC const byte *find_subbytes(const byte *haystack, machine_uint_t hlen, const byte *needle, machine_uint_t nlen, machine_int_t direction) {
190
    if (hlen >= nlen) {
191
192
193
194
195
196
197
198
199
200
201
202
        machine_uint_t str_index, str_index_end;
        if (direction > 0) {
            str_index = 0;
            str_index_end = hlen - nlen;
        } else {
            str_index = hlen - nlen;
            str_index_end = 0;
        }
        for (;;) {
            if (memcmp(&haystack[str_index], needle, nlen) == 0) {
                //found
                return haystack + str_index;
203
            }
204
205
206
            if (str_index == str_index_end) {
                //not found
                break;
207
            }
208
            str_index += direction;
209
210
211
212
213
        }
    }
    return NULL;
}

214
STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
215
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
216
217
    switch (op) {
        case RT_BINARY_OP_SUBSCR:
218
219
220
            // TODO: need predicate to check for int-like type (bools are such for example)
            // ["no", "yes"][1 == 2] is common idiom
            if (MP_OBJ_IS_SMALL_INT(rhs_in)) {
221
                uint index = mp_get_index(mp_obj_get_type(lhs_in), lhs_len, rhs_in, false);
222
                if (MP_OBJ_IS_TYPE(lhs_in, &bytes_type)) {
223
                    return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)lhs_data[index]);
224
225
226
                } else {
                    return mp_obj_new_str(lhs_data + index, 1, true);
                }
227
#if MICROPY_ENABLE_SLICE
228
            } else if (MP_OBJ_IS_TYPE(rhs_in, &slice_type)) {
229
                machine_uint_t start, stop;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
230
231
232
                if (!m_seq_get_fast_slice_indexes(lhs_len, rhs_in, &start, &stop)) {
                    assert(0);
                }
233
                return mp_obj_new_str(lhs_data + start, stop - start, false);
234
#endif
235
            } else {
236
237
                // Message doesn't match CPython, but we don't have so much bytes as they
                // to spend them on verbose wording
238
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "index must be int"));
239
            }
240
241
242

        case RT_BINARY_OP_ADD:
        case RT_BINARY_OP_INPLACE_ADD:
243
            if (MP_OBJ_IS_STR(rhs_in)) {
244
                // add 2 strings
245
246

                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
247
                int alloc_len = lhs_len + rhs_len;
248
249

                /* code for making qstr
250
251
252
253
                byte *q_ptr;
                byte *val = qstr_build_start(alloc_len, &q_ptr);
                memcpy(val, lhs_data, lhs_len);
                memcpy(val + lhs_len, rhs_data, rhs_len);
254
255
256
257
258
                return MP_OBJ_NEW_QSTR(qstr_build_end(q_ptr));
                */

                // code for non-qstr
                byte *data;
259
                mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), alloc_len, &data);
260
261
262
                memcpy(data, lhs_data, lhs_len);
                memcpy(data + lhs_len, rhs_data, rhs_len);
                return mp_obj_str_builder_end(s);
263
264
            }
            break;
265

266
        case RT_BINARY_OP_IN:
267
            /* NOTE `a in b` is `b.__contains__(a)` */
268
269
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
270
                return MP_BOOL(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len, 1) != NULL);
271
272
            }
            break;
273

274
275
276
277
278
279
        case RT_BINARY_OP_MULTIPLY:
        {
            if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
                return NULL;
            }
            int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
280
            byte *data;
281
            mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), lhs_len * n, &data);
282
283
            mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, data);
            return mp_obj_str_builder_end(s);
284
        }
285
286
287
288
289
290
291
292
293
294
295
296

        // These 2 are never passed here, dealt with as a special case in rt_binary_op().
        //case RT_BINARY_OP_EQUAL:
        //case RT_BINARY_OP_NOT_EQUAL:
        case RT_BINARY_OP_LESS:
        case RT_BINARY_OP_LESS_EQUAL:
        case RT_BINARY_OP_MORE:
        case RT_BINARY_OP_MORE_EQUAL:
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                return MP_BOOL(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));
            }
297
298
299
300
301
    }

    return MP_OBJ_NULL; // op not supported
}

302
STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
303
    assert(MP_OBJ_IS_STR(self_in));
304

305
    // get separation string
306
    GET_STR_DATA_LEN(self_in, sep_str, sep_len);
307
308

    // process args
309
310
311
312
313
314
315
316
317
    uint seq_len;
    mp_obj_t *seq_items;
    if (MP_OBJ_IS_TYPE(arg, &tuple_type)) {
        mp_obj_tuple_get(arg, &seq_len, &seq_items);
    } else if (MP_OBJ_IS_TYPE(arg, &list_type)) {
        mp_obj_list_get(arg, &seq_len, &seq_items);
    } else {
        goto bad_arg;
    }
318
319
320

    // count required length
    int required_len = 0;
321
    for (int i = 0; i < seq_len; i++) {
322
        if (!MP_OBJ_IS_STR(seq_items[i])) {
323
324
            goto bad_arg;
        }
325
326
327
        if (i > 0) {
            required_len += sep_len;
        }
328
329
        GET_STR_LEN(seq_items[i], l);
        required_len += l;
330
331
332
    }

    // make joined string
333
    byte *data;
334
    mp_obj_t joined_str = mp_obj_str_builder_start(mp_obj_get_type(self_in), required_len, &data);
335
336
    for (int i = 0; i < seq_len; i++) {
        if (i > 0) {
337
338
            memcpy(data, sep_str, sep_len);
            data += sep_len;
339
        }
340
341
342
        GET_STR_DATA_LEN(seq_items[i], s, l);
        memcpy(data, s, l);
        data += l;
343
    }
344
345

    // return joined string
346
    return mp_obj_str_builder_end(joined_str);
347
348

bad_arg:
349
    nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "?str.join expecting a list of str's"));
350
351
}

Paul Sokolovsky's avatar
Paul Sokolovsky committed
352
353
#define is_ws(c) ((c) == ' ' || (c) == '\t')

354
STATIC mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
355
356
357
358
359
360
361
362
363
    int splits = -1;
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
            splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
        }
    }
    assert(sep == mp_const_none);
364
    (void)sep; // unused; to hush compiler warning
Paul Sokolovsky's avatar
Paul Sokolovsky committed
365
    mp_obj_t res = mp_obj_new_list(0, NULL);
366
367
368
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;
    const byte *start;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
369
370

    // Initial whitespace is not counted as split, so we pre-do it
371
372
    while (s < top && is_ws(*s)) s++;
    while (s < top && splits != 0) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
373
        start = s;
374
375
376
        while (s < top && !is_ws(*s)) s++;
        rt_list_append(res, mp_obj_new_str(start, s - start, false));
        if (s >= top) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
377
378
            break;
        }
379
        while (s < top && is_ws(*s)) s++;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
380
381
382
383
384
        if (splits > 0) {
            splits--;
        }
    }

385
386
    if (s < top) {
        rt_list_append(res, mp_obj_new_str(s, top - s, false));
Paul Sokolovsky's avatar
Paul Sokolovsky committed
387
388
389
390
391
    }

    return res;
}

392
STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t direction) {
393
    assert(2 <= n_args && n_args <= 4);
394
395
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
396

397
398
    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);
399

400
401
    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
402
    if (n_args >= 3 && args[2] != mp_const_none) {
403
        start = mp_get_index(&str_type, haystack_len, args[2], true);
404
405
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
406
        end = mp_get_index(&str_type, haystack_len, args[3], true);
407
408
    }

409
    const byte *p = find_subbytes(haystack + start, end - start, needle, needle_len, direction);
410
411
412
413
414
    if (p == NULL) {
        // not found
        return MP_OBJ_NEW_SMALL_INT(-1);
    } else {
        // found
415
        return MP_OBJ_NEW_SMALL_INT(p - haystack);
416
417
418
    }
}

419
420
421
422
423
424
425
426
STATIC mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
    return str_finder(n_args, args, 1);
}

STATIC mp_obj_t str_rfind(uint n_args, const mp_obj_t *args) {
    return str_finder(n_args, args, -1);
}

427
// TODO: (Much) more variety in args
428
STATIC mp_obj_t str_startswith(mp_obj_t self_in, mp_obj_t arg) {
429
430
431
432
433
434
435
436
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, prefix, prefix_len);
    if (prefix_len > str_len) {
        return mp_const_false;
    }
    return MP_BOOL(memcmp(str, prefix, prefix_len) == 0);
}

437
STATIC mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
xbe's avatar
xbe committed
438
    assert(1 <= n_args && n_args <= 2);
439
440
441
442
443
    assert(MP_OBJ_IS_STR(args[0]));

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";
xbe's avatar
xbe committed
444
445
446

    if (n_args == 1) {
        chars_to_del = whitespace;
447
        chars_to_del_len = sizeof(whitespace);
xbe's avatar
xbe committed
448
    } else {
449
450
451
452
        assert(MP_OBJ_IS_STR(args[1]));
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
xbe's avatar
xbe committed
453
454
    }

455
    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);
xbe's avatar
xbe committed
456

457
    machine_uint_t first_good_char_pos = 0;
xbe's avatar
xbe committed
458
    bool first_good_char_pos_set = false;
459
460
    machine_uint_t last_good_char_pos = 0;
    for (machine_uint_t i = 0; i < orig_str_len; i++) {
461
        if (find_subbytes(chars_to_del, chars_to_del_len, &orig_str[i], 1, 1) == NULL) {
xbe's avatar
xbe committed
462
463
464
465
466
467
468
469
470
            last_good_char_pos = i;
            if (!first_good_char_pos_set) {
                first_good_char_pos = i;
                first_good_char_pos_set = true;
            }
        }
    }

    if (first_good_char_pos == 0 && last_good_char_pos == 0) {
471
472
        // string is all whitespace, return ''
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
xbe's avatar
xbe committed
473
474
475
476
    }

    assert(last_good_char_pos >= first_good_char_pos);
    //+1 to accomodate the last character
477
    machine_uint_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
478
    return mp_obj_new_str(orig_str + first_good_char_pos, stripped_len, false);
xbe's avatar
xbe committed
479
480
}

481
mp_obj_t str_format(uint n_args, const mp_obj_t *args) {
482
    assert(MP_OBJ_IS_STR(args[0]));
483

484
    GET_STR_DATA_LEN(args[0], str, len);
485
486
    int arg_i = 1;
    vstr_t *vstr = vstr_new();
487
    for (const byte *top = str + len; str < top; str++) {
488
489
        if (*str == '{') {
            str++;
490
            if (str < top && *str == '{') {
491
                vstr_add_char(vstr, '{');
492
            } else {
493
                while (str < top && *str != '}') str++;
494
                if (arg_i >= n_args) {
495
                    nlr_jump(mp_obj_new_exception_msg(&mp_type_IndexError, "tuple index out of range"));
496
                }
497
                // TODO: may be PRINT_REPR depending on formatting code
498
                mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[arg_i], PRINT_STR);
499
500
501
502
503
504
505
                arg_i++;
            }
        } else {
            vstr_add_char(vstr, *str);
        }
    }

506
507
508
    mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
    vstr_free(vstr);
    return s;
509
510
}

511
STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
512
513
514
515
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
    assert(MP_OBJ_IS_STR(args[2]));

516
    machine_int_t max_rep = 0;
517
    if (n_args == 4) {
518
519
520
521
522
523
524
        assert(MP_OBJ_IS_SMALL_INT(args[3]));
        max_rep = MP_OBJ_SMALL_INT_VALUE(args[3]);
        if (max_rep == 0) {
            return args[0];
        } else if (max_rep < 0) {
            max_rep = 0;
        }
525
    }
526
527

    // if max_rep is still 0 by this point we will need to do all possible replacements
528
529
530
531

    GET_STR_DATA_LEN(args[0], str, str_len);
    GET_STR_DATA_LEN(args[1], old, old_len);
    GET_STR_DATA_LEN(args[2], new, new_len);
532
533

    // old won't exist in str if it's longer, so nothing to replace
534
    if (old_len > str_len) {
535
        return args[0];
536
537
    }

538
539
540
541
542
543
544
545
546
547
548
549
550
    // data for the replaced string
    byte *data = NULL;
    mp_obj_t replaced_str = MP_OBJ_NULL;

    // do 2 passes over the string:
    //   first pass computes the required length of the replaced string
    //   second pass does the replacements
    for (;;) {
        machine_uint_t replaced_str_index = 0;
        machine_uint_t num_replacements_done = 0;
        const byte *old_occurrence;
        const byte *offset_ptr = str;
        machine_uint_t offset_num = 0;
551
        while ((old_occurrence = find_subbytes(offset_ptr, str_len - offset_num, old, old_len, 1)) != NULL) {
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
            // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
            if (data != NULL) {
                memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
            }
            replaced_str_index += old_occurrence - offset_ptr;
            // copy the replacement string
            if (data != NULL) {
                memcpy(data + replaced_str_index, new, new_len);
            }
            replaced_str_index += new_len;
            offset_ptr = old_occurrence + old_len;
            offset_num = offset_ptr - str;

            num_replacements_done++;
            if (max_rep != 0 && num_replacements_done == max_rep){
                break;
            }
        }

        // copy from just after end of last occurrence of to-be-replaced string to end of old string
        if (data != NULL) {
            memcpy(data + replaced_str_index, offset_ptr, str_len - offset_num);
        }
        replaced_str_index += str_len - offset_num;

        if (data == NULL) {
            // first pass
            if (num_replacements_done == 0) {
                // no substr found, return original string
                return args[0];
            } else {
                // substr found, allocate new string
                replaced_str = mp_obj_str_builder_start(mp_obj_get_type(args[0]), replaced_str_index, &data);
            }
        } else {
            // second pass, we are done
            break;
        }
590
    }
591

592
593
594
    return mp_obj_str_builder_end(replaced_str);
}

595
596
597
598
599
600
601
602
STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

603
604
    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
605
606
607
608
609
610
611
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3], true);
    }

612
613
614
    // if needle_len is zero then we count each gap between characters as an occurrence
    if (needle_len == 0) {
        return MP_OBJ_NEW_SMALL_INT(end - start + 1);
615
616
    }

617
618
    // count the occurrences
    machine_int_t num_occurrences = 0;
xbe's avatar
xbe committed
619
620
621
622
623
    for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) {
        if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) {
            num_occurrences++;
            haystack_index += needle_len - 1;
        }
624
625
626
627
628
    }

    return MP_OBJ_NEW_SMALL_INT(num_occurrences);
}

629
STATIC mp_obj_t str_partitioner(mp_obj_t self_in, mp_obj_t arg, machine_int_t direction) {
630
631
632
633
634
    assert(MP_OBJ_IS_STR(self_in));
    if (!MP_OBJ_IS_STR(arg)) {
        nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError,
                                               "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(arg)));
    }
635

636
637
638
639
640
641
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, sep, sep_len);

    if (sep_len == 0) {
        nlr_jump(mp_obj_new_exception_msg(&mp_type_ValueError, "empty separator"));
    }
642
643
644
645
646

    mp_obj_t result[] = {MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_)};

    if (direction > 0) {
        result[0] = self_in;
647
    } else {
648
        result[2] = self_in;
649
    }
650

651
652
653
654
655
656
    const byte *position_ptr = find_subbytes(str, str_len, sep, sep_len, direction);
    if (position_ptr != NULL) {
        machine_uint_t position = position_ptr - str;
        result[0] = mp_obj_new_str(str, position, false);
        result[1] = arg;
        result[2] = mp_obj_new_str(str + position + sep_len, str_len - position - sep_len, false);
657
    }
658

659
    return mp_obj_new_tuple(3, result);
660
661
}

662
663
STATIC mp_obj_t str_partition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, 1);
664
}
665

666
667
STATIC mp_obj_t str_rpartition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, -1);
668
669
}

670
671
672
673
674
675
676
677
678
679
680
681
682
683
STATIC machine_int_t str_get_buffer(mp_obj_t self_in, buffer_info_t *bufinfo, int flags) {
    if (flags == BUFFER_READ) {
        GET_STR_DATA_LEN(self_in, str_data, str_len);
        bufinfo->buf = (void*)str_data;
        bufinfo->len = str_len;
        return 0;
    } else {
        // can't write to a string
        bufinfo->buf = NULL;
        bufinfo->len = 0;
        return 1;
    }
}

684
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
685
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rfind_obj, 2, 4, str_rfind);
686
687
688
689
690
691
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_startswith_obj, str_startswith);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, str_format);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace);
692
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count);
693
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition);
694
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition);
695

696
STATIC const mp_method_t str_type_methods[] = {
697
    { "find", &str_find_obj },
698
    { "rfind", &str_rfind_obj },
ian-v's avatar
ian-v committed
699
    { "join", &str_join_obj },
Paul Sokolovsky's avatar
Paul Sokolovsky committed
700
    { "split", &str_split_obj },
701
    { "startswith", &str_startswith_obj },
xbe's avatar
xbe committed
702
    { "strip", &str_strip_obj },
ian-v's avatar
ian-v committed
703
    { "format", &str_format_obj },
704
    { "replace", &str_replace_obj },
705
    { "count", &str_count_obj },
706
    { "partition", &str_partition_obj },
707
    { "rpartition", &str_rpartition_obj },
ian-v's avatar
ian-v committed
708
709
    { NULL, NULL }, // end-of-list sentinel
};
710

711
const mp_obj_type_t str_type = {
712
    { &mp_type_type },
713
    .name = MP_QSTR_str,
714
    .print = str_print,
715
    .make_new = str_make_new,
716
    .binary_op = str_binary_op,
717
718
    .getiter = mp_obj_new_str_iterator,
    .methods = str_type_methods,
719
    .buffer_p = { .get_buffer = str_get_buffer },
720
721
722
723
};

// Reuses most of methods from str
const mp_obj_type_t bytes_type = {
724
    { &mp_type_type },
725
    .name = MP_QSTR_bytes,
726
    .print = str_print,
727
    .make_new = bytes_make_new,
728
729
    .binary_op = str_binary_op,
    .getiter = mp_obj_new_bytes_iterator,
ian-v's avatar
ian-v committed
730
    .methods = str_type_methods,
731
732
};

733
734
735
736
// the zero-length bytes
STATIC const mp_obj_str_t empty_bytes_obj = {{&bytes_type}, 0, 0, NULL};
const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj;

737
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
738
    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
739
    o->base.type = type;
740
    o->len = len;
741
742
743
    byte *p = m_new(byte, len + 1);
    o->data = p;
    *data = p;
744
745
746
747
748
749
    return o;
}

mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
    mp_obj_str_t *o = o_in;
    o->hash = qstr_compute_hash(o->data, o->len);
750
751
    byte *p = (byte*)o->data;
    p[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
752
753
754
    return o;
}

755
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len) {
756
    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
757
758
    o->base.type = type;
    o->len = len;
759
760
761
762
763
764
765
    if (data) {
        o->hash = qstr_compute_hash(data, len);
        byte *p = m_new(byte, len + 1);
        o->data = p;
        memcpy(p, data, len * sizeof(byte));
        p[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    }
766
767
768
    return o;
}

769
770
771
772
773
774
775
776
777
778
mp_obj_t mp_obj_new_str(const byte* data, uint len, bool make_qstr_if_not_already) {
    qstr q = qstr_find_strn(data, len);
    if (q != MP_QSTR_NULL) {
        // qstr with this data already exists
        return MP_OBJ_NEW_QSTR(q);
    } else if (make_qstr_if_not_already) {
        // no existing qstr, make a new one
        return MP_OBJ_NEW_QSTR(qstr_from_strn((const char*)data, len));
    } else {
        // no existing qstr, don't make one
779
        return str_new(&str_type, data, len);
780
781
782
    }
}

783
784
785
786
mp_obj_t mp_obj_new_bytes(const byte* data, uint len) {
    return str_new(&bytes_type, data, len);
}

787
788
789
790
791
792
793
794
795
796
797
798
799
800
bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
    if (MP_OBJ_IS_QSTR(s1) && MP_OBJ_IS_QSTR(s2)) {
        return s1 == s2;
    } else {
        GET_STR_HASH(s1, h1);
        GET_STR_HASH(s2, h2);
        if (h1 != h2) {
            return false;
        }
        GET_STR_DATA_LEN(s1, d1, l1);
        GET_STR_DATA_LEN(s2, d2, l2);
        if (l1 != l2) {
            return false;
        }
801
        return memcmp(d1, d2, l1) == 0;
802
803
804
    }
}

805
806
void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn));
void bad_implicit_conversion(mp_obj_t self_in) {
807
    nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(self_in)));
808
809
}

810
811
812
813
814
uint mp_obj_str_get_hash(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_HASH(self_in, h);
        return h;
    } else {
815
        bad_implicit_conversion(self_in);
816
    }
817
818
819
820
821
822
823
}

uint mp_obj_str_get_len(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_LEN(self_in, l);
        return l;
    } else {
824
825
826
827
828
829
830
831
832
833
834
835
836
837
        bad_implicit_conversion(self_in);
    }
}

// use this if you will anyway convert the string to a qstr
// will be more efficient for the case where it's already a qstr
qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
    if (MP_OBJ_IS_QSTR(self_in)) {
        return MP_OBJ_QSTR_VALUE(self_in);
    } else if (MP_OBJ_IS_TYPE(self_in, &str_type)) {
        mp_obj_str_t *self = self_in;
        return qstr_from_strn((char*)self->data, self->len);
    } else {
        bad_implicit_conversion(self_in);
838
839
840
841
842
843
844
845
846
847
848
    }
}

// only use this function if you need the str data to be zero terminated
// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
const char *mp_obj_str_get_str(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        (void)l; // len unused
        return (const char*)s;
    } else {
849
        bad_implicit_conversion(self_in);
850
851
852
    }
}

853
const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
854
855
856
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        *len = l;
857
        return (const char*)s;
858
    } else {
859
        bad_implicit_conversion(self_in);
860
    }
861
}
xyb's avatar
xyb committed
862
863
864
865
866
867

/******************************************************************************/
/* str iterator                                                               */

typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;
868
    mp_obj_t str;
xyb's avatar
xyb committed
869
870
871
    machine_uint_t cur;
} mp_obj_str_it_t;

872
STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
xyb's avatar
xyb committed
873
    mp_obj_str_it_t *self = self_in;
874
875
876
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
        mp_obj_t o_out = mp_obj_new_str(str + self->cur, 1, true);
xyb's avatar
xyb committed
877
878
879
880
881
882
883
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

884
STATIC const mp_obj_type_t str_it_type = {
885
    { &mp_type_type },
886
    .name = MP_QSTR_iterator,
887
    .iternext = str_it_iternext,
xyb's avatar
xyb committed
888
889
};

890
STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
891
892
893
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
894
        mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT((mp_small_int_t)str[self->cur]);
895
896
897
898
899
900
901
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

902
STATIC const mp_obj_type_t bytes_it_type = {
903
    { &mp_type_type },
904
    .name = MP_QSTR_iterator,
905
906
907
908
    .iternext = bytes_it_iternext,
};

mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) {
xyb's avatar
xyb committed
909
910
911
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &str_it_type;
    o->str = str;
912
913
914
915
916
917
918
919
920
    o->cur = 0;
    return o;
}

mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) {
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &bytes_it_type;
    o->str = str;
    o->cur = 0;
xyb's avatar
xyb committed
921
922
    return o;
}