objstr.c 28.1 KB
Newer Older
xbe's avatar
xbe committed
1
#include <stdbool.h>
2
3
4
5
6
7
#include <string.h>
#include <assert.h>

#include "nlr.h"
#include "misc.h"
#include "mpconfig.h"
8
#include "qstr.h"
9
10
11
12
13
14
#include "obj.h"
#include "runtime0.h"
#include "runtime.h"

typedef struct _mp_obj_str_t {
    mp_obj_base_t base;
15
16
    machine_uint_t hash : 16; // XXX here we assume the hash size is 16 bits (it is at the moment; see qstr.c)
    machine_uint_t len : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
17
    const byte *data;
18
19
} mp_obj_str_t;

20
21
22
23
24
25
26
27
28
// NOTE: each of the macros below expands to variable declarations followed by an
// if/else statement, so it must be used where a declaration is allowed and the
// names it is given must not already exist in the enclosing scope.

// Declares `uint str_hash` and stores the hash of the string in it: looked up
// through qstr_hash() for a qstr, read from the object's hash field otherwise.
#define GET_STR_HASH(str_obj_in, str_hash) \
    uint str_hash; \
    if (MP_OBJ_IS_QSTR(str_obj_in)) { \
        str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); \
    } else { \
        str_hash = ((mp_obj_str_t*)str_obj_in)->hash; \
    }

// Declares `uint str_len` and stores the length in bytes of the string in it.
#define GET_STR_LEN(str_obj_in, str_len) \
    uint str_len; \
    if (MP_OBJ_IS_QSTR(str_obj_in)) { \
        str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); \
    } else { \
        str_len = ((mp_obj_str_t*)str_obj_in)->len; \
    }

// Declares `const byte *str_data` and `uint str_len` and stores the string's
// data pointer and length in bytes in them.
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) \
    const byte *str_data; \
    uint str_len; \
    if (MP_OBJ_IS_QSTR(str_obj_in)) { \
        str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); \
    } else { \
        str_len = ((mp_obj_str_t*)str_obj_in)->len; \
        str_data = ((mp_obj_str_t*)str_obj_in)->data; \
    }

29
30
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
31
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len);
xyb's avatar
xyb committed
32
33
34
35

/******************************************************************************/
/* str                                                                        */

36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
// Writes str_data[0..str_len) as a quoted literal through the print callback.
// The delimiter is a single quote, unless the data contains a single quote but
// no double quote, in which case a double quote is used.  The delimiter and the
// backslash are backslash-escaped, newline is written as \n, other bytes in the
// range 32..126 are written unchanged and everything else is written as \xNN.
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
    // NOTE: the output is produced with many small print() calls, so this is slow
    const byte *const end = str_data + str_len;

    // find out which quote characters occur; stop as soon as both were seen
    bool seen_single = false;
    bool seen_double = false;
    const byte *p = str_data;
    while (p < end && !(seen_single && seen_double)) {
        switch (*p) {
            case '\'':
                seen_single = true;
                break;
            case '"':
                seen_double = true;
                break;
            default:
                break;
        }
        p++;
    }

    int quote_char = (seen_single && !seen_double) ? '"' : '\'';

    print(env, "%c", quote_char);
    for (p = str_data; p < end; p++) {
        int ch = *p;
        if (ch == quote_char) {
            print(env, "\\%c", quote_char);
        } else if (ch == '\\') {
            print(env, "\\\\");
        } else if (ch >= 32 && ch <= 126) {
            print(env, "%c", ch);
        } else if (ch == '\n') {
            print(env, "\\n");
        // TODO add more escape codes here if we want to match CPython
        } else {
            print(env, "\\x%02x", ch);
        }
    }
    print(env, "%c", quote_char);
}

69
// Print handler used by both str_type and bytes_type.
// A str printed with PRINT_STR is written as its raw data; in every other case a
// quoted representation is written, preceded by 'b' for a bytes object.
STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
    GET_STR_DATA_LEN(self_in, str_data, str_len);
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &bytes_type);
    if (kind == PRINT_STR && !is_bytes) {
        // the precision consumed by %.*s has to be passed as an int
        print(env, "%.*s", (int)str_len, str_data);
    } else {
        if (is_bytes) {
            print(env, "b");
        }
        mp_str_print_quoted(print, env, str_data, str_len);
    }
}

82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
// Constructor for str, dispatching on the number of positional arguments:
//   0 args    -> the empty qstr
//   1 arg     -> the text produced by printing the argument with PRINT_STR
//   2-3 args  -> first argument must be a bytes object (TypeError otherwise); the
//                result is a str object that points at the same data and reuses
//                its hash.  The 2nd/3rd arguments are not looked at yet.
//   otherwise -> TypeError
STATIC mp_obj_t str_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    if (n_args == 0) {
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
    } else if (n_args == 1) {
        vstr_t *buf = vstr_new();
        mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, buf, args[0], PRINT_STR);
        mp_obj_t result = mp_obj_new_str((byte*)buf->buf, buf->len, false);
        vstr_free(buf);
        return result;
    } else if (n_args == 2 || n_args == 3) {
        // TODO: validate 2nd/3rd args
        if (!MP_OBJ_IS_TYPE(args[0], &bytes_type)) {
            nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "bytes expected"));
        }
        GET_STR_DATA_LEN(args[0], src_data, src_len);
        GET_STR_HASH(args[0], src_hash);
        // passing NULL makes str_new skip the copy; data and hash are filled in here
        mp_obj_str_t *self = str_new(&str_type, NULL, src_len);
        self->data = src_data;
        self->hash = src_hash;
        return self;
    } else {
        nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "str takes at most 3 arguments"));
    }
}

116
117
// like strstr but with specified length and allows \0 bytes
// Returns a pointer to the first occurrence of needle[0..nlen) within
// haystack[0..hlen), or NULL when there is none.  An empty needle matches at
// the very start of the haystack.
// TODO replace with something more efficient/standard
STATIC const byte *find_subbytes(const byte *haystack, uint hlen, const byte *needle, uint nlen) {
    if (hlen < nlen) {
        // the needle cannot fit
        return NULL;
    }
    if (nlen == 0) {
        return haystack;
    }
    // hlen >= nlen here, so the subtraction cannot wrap
    const uint last_start = hlen - nlen;
    for (uint i = 0; i <= last_start; i++) {
        if (memcmp(haystack + i, needle, nlen) == 0) {
            return haystack + i;
        }
    }
    return NULL;
}

136
// Binary operator handler used by both str_type and bytes_type; lhs_in is the
// str/bytes object.  Handles subscription (small-int index, and slice objects
// when MICROPY_ENABLE_SLICE is set), concatenation, containment, repetition by a
// small int and the four ordering comparisons.
// Returns MP_OBJ_NULL when the operator or the operand type is not supported.
STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
    switch (op) {
        case RT_BINARY_OP_SUBSCR:
            // TODO: need predicate to check for int-like type (bools are such for example)
            // ["no", "yes"][1 == 2] is common idiom
            if (MP_OBJ_IS_SMALL_INT(rhs_in)) {
                uint index = mp_get_index(mp_obj_get_type(lhs_in), lhs_len, rhs_in, false);
                if (MP_OBJ_IS_TYPE(lhs_in, &bytes_type)) {
                    // indexing bytes gives the byte value as an int
                    return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)lhs_data[index]);
                } else {
                    // indexing a str gives a one-byte string
                    return mp_obj_new_str(lhs_data + index, 1, true);
                }
#if MICROPY_ENABLE_SLICE
            } else if (MP_OBJ_IS_TYPE(rhs_in, &slice_type)) {
                machine_uint_t start, stop;
                if (!m_seq_get_fast_slice_indexes(lhs_len, rhs_in, &start, &stop)) {
                    assert(0);
                }
                return mp_obj_new_str(lhs_data + start, stop - start, false);
#endif
            } else {
                // Message doesn't match CPython, but we don't have as many bytes as
                // they do to spend on verbose wording
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "index must be int"));
            }

        case RT_BINARY_OP_ADD:
        case RT_BINARY_OP_INPLACE_ADD:
            if (MP_OBJ_IS_STR(rhs_in)) {
                // add 2 strings

                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                int alloc_len = lhs_len + rhs_len;

                /* code for making qstr
                byte *q_ptr;
                byte *val = qstr_build_start(alloc_len, &q_ptr);
                memcpy(val, lhs_data, lhs_len);
                memcpy(val + lhs_len, rhs_data, rhs_len);
                return MP_OBJ_NEW_QSTR(qstr_build_end(q_ptr));
                */

                // code for non-qstr
                byte *data;
                mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), alloc_len, &data);
                memcpy(data, lhs_data, lhs_len);
                memcpy(data + lhs_len, rhs_data, rhs_len);
                return mp_obj_str_builder_end(s);
            }
            break;

        case RT_BINARY_OP_IN:
            /* NOTE `a in b` is `b.__contains__(a)` */
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                return MP_BOOL(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len) != NULL);
            }
            break;

        case RT_BINARY_OP_MULTIPLY:
        {
            if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
                return MP_OBJ_NULL;
            }
            int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
            if (n < 0) {
                // a negative repeat count yields an empty result (as in CPython);
                // without the clamp lhs_len * n would wrap to a huge allocation size
                n = 0;
            }
            byte *data;
            mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), lhs_len * n, &data);
            mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, data);
            return mp_obj_str_builder_end(s);
        }

        // These 2 are never passed here, dealt with as a special case in rt_binary_op().
        //case RT_BINARY_OP_EQUAL:
        //case RT_BINARY_OP_NOT_EQUAL:
        case RT_BINARY_OP_LESS:
        case RT_BINARY_OP_LESS_EQUAL:
        case RT_BINARY_OP_MORE:
        case RT_BINARY_OP_MORE_EQUAL:
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                return MP_BOOL(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));
            }
            break;
    }

    return MP_OBJ_NULL; // op not supported
}

224
// str.join(arg): concatenates the items of a tuple or list, inserting self
// between consecutive items.  The result has the same type as self.
// Raises TypeError if arg is neither a tuple nor a list, or if one of its items
// is not a str.
STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
    assert(MP_OBJ_IS_STR(self_in));

    // get separation string
    GET_STR_DATA_LEN(self_in, sep_str, sep_len);

    // process args
    uint seq_len;
    mp_obj_t *seq_items;
    if (MP_OBJ_IS_TYPE(arg, &tuple_type)) {
        mp_obj_tuple_get(arg, &seq_len, &seq_items);
    } else if (MP_OBJ_IS_TYPE(arg, &list_type)) {
        mp_obj_list_get(arg, &seq_len, &seq_items);
    } else {
        goto bad_arg;
    }

    // count required length (and check every item before anything is allocated)
    uint required_len = 0;
    for (uint i = 0; i < seq_len; i++) {
        if (!MP_OBJ_IS_STR(seq_items[i])) {
            goto bad_arg;
        }
        if (i > 0) {
            required_len += sep_len;
        }
        GET_STR_LEN(seq_items[i], l);
        required_len += l;
    }

    // make joined string
    byte *data;
    mp_obj_t joined_str = mp_obj_str_builder_start(mp_obj_get_type(self_in), required_len, &data);
    for (uint i = 0; i < seq_len; i++) {
        if (i > 0) {
            memcpy(data, sep_str, sep_len);
            data += sep_len;
        }
        GET_STR_DATA_LEN(seq_items[i], s, l);
        memcpy(data, s, l);
        data += l;
    }

    // return joined string
    return mp_obj_str_builder_end(joined_str);

bad_arg:
    nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "?str.join expecting a list of str's"));
}

Paul Sokolovsky's avatar
Paul Sokolovsky committed
274
275
#define is_ws(c) ((c) == ' ' || (c) == '\t')

276
STATIC mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
277
278
279
280
281
282
283
284
285
    int splits = -1;
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
            splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
        }
    }
    assert(sep == mp_const_none);
286
    (void)sep; // unused; to hush compiler warning
Paul Sokolovsky's avatar
Paul Sokolovsky committed
287
    mp_obj_t res = mp_obj_new_list(0, NULL);
288
289
290
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;
    const byte *start;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
291
292

    // Initial whitespace is not counted as split, so we pre-do it
293
294
    while (s < top && is_ws(*s)) s++;
    while (s < top && splits != 0) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
295
        start = s;
296
297
298
        while (s < top && !is_ws(*s)) s++;
        rt_list_append(res, mp_obj_new_str(start, s - start, false));
        if (s >= top) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
299
300
            break;
        }
301
        while (s < top && is_ws(*s)) s++;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
302
303
304
305
306
        if (splits > 0) {
            splits--;
        }
    }

307
308
    if (s < top) {
        rt_list_append(res, mp_obj_new_str(s, top - s, false));
Paul Sokolovsky's avatar
Paul Sokolovsky committed
309
310
311
312
313
    }

    return res;
}

314
// str.find(sub[, start[, end]]): returns, as a small int, the index of the first
// occurrence of args[1] in args[0] that begins at or after start and finishes at
// or before end; returns -1 if there is no such occurrence.
// start/end default to 0 and the string length; a None argument keeps the default.
STATIC mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
    /* TODO use a non-exception-throwing mp_get_index */
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3], true);
    }

    const byte *p = find_subbytes(haystack + start, haystack_len - start, needle, needle_len);
    if (p == NULL) {
        // not found
        return MP_OBJ_NEW_SMALL_INT(-1);
    } else {
        // found
        machine_int_t pos = p - haystack;
        // the first match is the leftmost one, so if it runs past end no match fits
        if (pos + needle_len > end) {
            pos = -1;
        }
        return MP_OBJ_NEW_SMALL_INT(pos);
    }
}

346
// TODO: (Much) more variety in args
347
STATIC mp_obj_t str_startswith(mp_obj_t self_in, mp_obj_t arg) {
348
349
350
351
352
353
354
355
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, prefix, prefix_len);
    if (prefix_len > str_len) {
        return mp_const_false;
    }
    return MP_BOOL(memcmp(str, prefix, prefix_len) == 0);
}

356
357
// Returns true if the value c occurs among the first str_len bytes of str.
STATIC bool chr_in_str(const byte* const str, const machine_uint_t str_len, int c) {
    for (machine_uint_t i = 0; i < str_len; i++) {
        if (str[i] == c) {
            return true;
        }
    }
    return false;
}

365
// str.strip([chars]): returns args[0] with leading and trailing characters
// removed.  The characters to remove are those of args[1] if given, otherwise
// the whitespace set " \t\n\r\v\f".  If nothing remains the empty qstr is returned.
STATIC mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
    assert(1 <= n_args && n_args <= 2);
    assert(MP_OBJ_IS_STR(args[0]));

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";

    if (n_args == 1) {
        chars_to_del = whitespace;
        // -1 so that the terminating NUL of the literal is not treated as strippable
        chars_to_del_len = sizeof(whitespace) - 1;
    } else {
        assert(MP_OBJ_IS_STR(args[1]));
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
    }

    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);

    // index of the first character to keep
    machine_uint_t first_good_char_pos = 0;
    while (first_good_char_pos < orig_str_len
        && chr_in_str(chars_to_del, chars_to_del_len, orig_str[first_good_char_pos])) {
        first_good_char_pos++;
    }

    if (first_good_char_pos == orig_str_len) {
        // nothing to keep (empty input or every character is stripped), return ''
        // NOTE: this must not be decided from the positions being 0, because a
        // string whose only kept character sits at index 0 also has both at 0
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
    }

    // one past the last character to keep; the loop stops at the latest on the
    // character found above, so keep_end > first_good_char_pos always holds
    machine_uint_t keep_end = orig_str_len;
    while (chr_in_str(chars_to_del, chars_to_del_len, orig_str[keep_end - 1])) {
        keep_end--;
    }

    assert(keep_end > first_good_char_pos);
    machine_uint_t stripped_len = keep_end - first_good_char_pos;
    return mp_obj_new_str(orig_str + first_good_char_pos, stripped_len, false);
}

409
// str.format(*args): builds a new string from the template args[0].
//   "{{" and "}}" produce a literal '{' and '}'
//   "{...}" is replaced by the next positional argument printed with PRINT_STR;
//           whatever is between the braces is currently ignored
// Raises IndexError when the template has more fields than there are arguments.
mp_obj_t str_format(uint n_args, const mp_obj_t *args) {
    assert(MP_OBJ_IS_STR(args[0]));

    GET_STR_DATA_LEN(args[0], str, len);
    uint arg_i = 1; // args[0] is the template itself
    vstr_t *vstr = vstr_new();
    for (const byte *top = str + len; str < top; str++) {
        if (*str == '{') {
            str++;
            if (str < top && *str == '{') {
                vstr_add_char(vstr, '{');
            } else {
                // skip the (unused) field contents up to the closing brace
                while (str < top && *str != '}') {
                    str++;
                }
                if (arg_i >= n_args) {
                    nlr_jump(mp_obj_new_exception_msg(&mp_type_IndexError, "tuple index out of range"));
                }
                // TODO: may be PRINT_REPR depending on formatting code
                mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[arg_i], PRINT_STR);
                arg_i++;
                if (str >= top) {
                    // field was not terminated; stop here so that the loop increment
                    // does not move str beyond one-past-the-end of the data
                    break;
                }
            }
        } else if (*str == '}' && str + 1 < top && str[1] == '}') {
            // escaped closing brace
            vstr_add_char(vstr, '}');
            str++;
        } else {
            vstr_add_char(vstr, *str);
        }
    }

    mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
    vstr_free(vstr);
    return s;
}

439
// str.replace(old, new[, count]): returns a copy of args[0] in which occurrences
// of args[1] are replaced by args[2], scanning left to right.
//   count == 0         -> args[0] is returned unchanged
//   count  < 0 / absent -> all occurrences are replaced
//   count  > 0         -> at most count occurrences are replaced
// An empty `old` matches before every character and at the end of the string.
// args[0] itself is returned when nothing gets replaced.
STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
    assert(MP_OBJ_IS_STR(args[2]));

    machine_int_t max_rep = 0;
    if (n_args == 4) {
        assert(MP_OBJ_IS_SMALL_INT(args[3]));
        max_rep = MP_OBJ_SMALL_INT_VALUE(args[3]);
        if (max_rep == 0) {
            return args[0];
        } else if (max_rep < 0) {
            max_rep = 0;
        }
    }

    // if max_rep is still 0 by this point we will need to do all possible replacements

    GET_STR_DATA_LEN(args[0], str, str_len);
    GET_STR_DATA_LEN(args[1], old, old_len);
    GET_STR_DATA_LEN(args[2], new, new_len);

    // old won't exist in str if it's longer, so nothing to replace
    if (old_len > str_len) {
        return args[0];
    }

    const byte *const str_end = str + str_len;

    // data for the replaced string
    byte *data = NULL;
    mp_obj_t replaced_str = MP_OBJ_NULL;

    // do 2 passes over the string:
    //   first pass computes the required length of the replaced string
    //   second pass does the replacements
    // (data stays NULL during the first pass, which disables all the copying)
    for (;;) {
        machine_uint_t replaced_str_index = 0;
        machine_uint_t num_replacements_done = 0;
        const byte *old_occurrence;
        const byte *offset_ptr = str;
        while ((old_occurrence = find_subbytes(offset_ptr, str_end - offset_ptr, old, old_len)) != NULL) {
            // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
            if (data != NULL) {
                memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
            }
            replaced_str_index += old_occurrence - offset_ptr;
            // copy the replacement string
            if (data != NULL) {
                memcpy(data + replaced_str_index, new, new_len);
            }
            replaced_str_index += new_len;
            offset_ptr = old_occurrence + old_len;

            num_replacements_done++;
            if (max_rep != 0 && num_replacements_done == max_rep) {
                break;
            }

            if (old_len == 0) {
                // An empty pattern matches without consuming anything, so the search
                // position has to be advanced by hand or this loop would never end.
                if (offset_ptr == str_end) {
                    break;
                }
                if (data != NULL) {
                    data[replaced_str_index] = *offset_ptr;
                }
                replaced_str_index++;
                offset_ptr++;
            }
        }

        // copy from just after end of last occurrence of to-be-replaced string to end of old string
        if (data != NULL) {
            memcpy(data + replaced_str_index, offset_ptr, str_end - offset_ptr);
        }
        replaced_str_index += str_end - offset_ptr;

        if (data == NULL) {
            // first pass
            if (num_replacements_done == 0) {
                // no substr found, return original string
                return args[0];
            } else {
                // substr found, allocate new string
                replaced_str = mp_obj_str_builder_start(mp_obj_get_type(args[0]), replaced_str_index, &data);
            }
        } else {
            // second pass, we are done
            break;
        }
    }

    return mp_obj_str_builder_end(replaced_str);
}

523
524
525
526
527
528
529
530
// str.count(sub[, start[, end]]): returns, as a small int, the number of
// non-overlapping occurrences of args[1] that lie completely inside
// args[0][start:end].  start/end default to 0 and the string length; a None
// argument keeps the default.  An empty sub counts every gap between
// characters, including both ends of the range.
STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
    /* TODO use a non-exception-throwing mp_get_index */
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3], true);
    }

    // if needle_len is zero then we count each gap between characters as an occurrence
    if (needle_len == 0) {
        if (start > end) {
            // empty range: nothing to count (and end - start would wrap, both being unsigned)
            return MP_OBJ_NEW_SMALL_INT(0);
        }
        return MP_OBJ_NEW_SMALL_INT(end - start + 1);
    }

    // count the occurrences
    machine_int_t num_occurrences = 0;
    for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) {
        if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) {
            num_occurrences++;
            // jump over the match (the loop increment supplies the final +1) so
            // that occurrences never overlap
            haystack_index += needle_len - 1;
        }
    }

    return MP_OBJ_NEW_SMALL_INT(num_occurrences);
}

558
// Shared implementation of str.partition (direction > 0, search from the left)
// and str.rpartition (direction < 0, search from the right).
// Returns the 3-tuple (part before sep, sep, part after sep).  When sep is not
// found the tuple is (self, '', '') for direction > 0 and ('', '', self) otherwise.
// Raises TypeError if arg is not a str and ValueError if it is empty.
STATIC mp_obj_t str_partitioner(mp_obj_t self_in, mp_obj_t arg, machine_int_t direction) {
    assert(MP_OBJ_IS_STR(self_in));
    if (!MP_OBJ_IS_STR(arg)) {
        nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError,
                                               "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(arg)));
    }

    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, sep, sep_len);

    if (sep_len == 0) {
        nlr_jump(mp_obj_new_exception_msg(&mp_type_ValueError, "empty separator"));
    }

    // start out with the "separator not found" result
    mp_obj_t result[] = {MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_)};

    if (direction > 0) {
        result[0] = self_in;
    } else {
        result[2] = self_in;
    }

    if (str_len >= sep_len) {
        // candidate positions run from str_index to str_index_end (inclusive),
        // stepping by direction
        machine_uint_t str_index, str_index_end;
        if (direction > 0) {
            str_index = 0;
            str_index_end = str_len - sep_len;
        } else {
            str_index = str_len - sep_len;
            str_index_end = 0;
        }
        for (;;) {
            if (memcmp(&str[str_index], sep, sep_len) == 0) {
                result[0] = mp_obj_new_str(str, str_index, false);
                result[1] = arg;
                result[2] = mp_obj_new_str(str + str_index + sep_len, str_len - str_index - sep_len, false);
                break;
            }
            // test for the last position before stepping so the index never leaves the range
            if (str_index == str_index_end) {
                break;
            }
            str_index += direction;
        }
    }

    return mp_obj_new_tuple(3, result);
}

606
607
// str.partition(sep): calls str_partitioner with direction 1.
STATIC mp_obj_t str_partition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, 1);
}
609

610
611
// str.rpartition(sep): calls str_partitioner with direction -1.
STATIC mp_obj_t str_rpartition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, -1);
}

614
615
616
617
618
619
620
621
622
623
624
625
626
627
// Buffer handler installed in str_type.buffer_p.
// For flags == BUFFER_READ it stores the string's data pointer and length in
// *bufinfo and returns 0.  For any other flags value it stores a NULL/0 buffer
// and returns 1.
STATIC machine_int_t str_get_buffer(mp_obj_t self_in, buffer_info_t *bufinfo, int flags) {
    if (flags != BUFFER_READ) {
        // can't write to a string
        bufinfo->buf = NULL;
        bufinfo->len = 0;
        return 1;
    }

    GET_STR_DATA_LEN(self_in, data, data_len);
    bufinfo->buf = (void*)data;
    bufinfo->len = data_len;
    return 0;
}

628
629
630
631
632
633
634
// Function objects for the method implementations above; they are referenced
// from the str_type_methods table.
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_startswith_obj, str_startswith);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, str_format);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition);
638

639
STATIC const mp_method_t str_type_methods[] = {
640
    { "find", &str_find_obj },
ian-v's avatar
ian-v committed
641
    { "join", &str_join_obj },
Paul Sokolovsky's avatar
Paul Sokolovsky committed
642
    { "split", &str_split_obj },
643
    { "startswith", &str_startswith_obj },
xbe's avatar
xbe committed
644
    { "strip", &str_strip_obj },
ian-v's avatar
ian-v committed
645
    { "format", &str_format_obj },
646
    { "replace", &str_replace_obj },
647
    { "count", &str_count_obj },
648
    { "partition", &str_partition_obj },
649
    { "rpartition", &str_rpartition_obj },
ian-v's avatar
ian-v committed
650
651
    { NULL, NULL }, // end-of-list sentinel
};
652

653
const mp_obj_type_t str_type = {
654
    { &mp_type_type },
655
    .name = MP_QSTR_str,
656
    .print = str_print,
657
    .make_new = str_make_new,
658
    .binary_op = str_binary_op,
659
660
    .getiter = mp_obj_new_str_iterator,
    .methods = str_type_methods,
661
    .buffer_p = { .get_buffer = str_get_buffer },
662
663
664
665
};

// Reuses most of methods from str
const mp_obj_type_t bytes_type = {
666
    { &mp_type_type },
667
    .name = MP_QSTR_bytes,
668
669
670
    .print = str_print,
    .binary_op = str_binary_op,
    .getiter = mp_obj_new_bytes_iterator,
ian-v's avatar
ian-v committed
671
    .methods = str_type_methods,
672
673
};

674
// Starts building a str/bytes object of the given type with room for len bytes.
// A buffer of len + 1 bytes is allocated and returned through *data so the
// caller can fill in the contents; the object must then be passed to
// mp_obj_str_builder_end, which computes the hash and adds the terminating null.
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
    o->base.type = type;
    o->len = len;
    byte *p = m_new(byte, len + 1); // +1 for the null byte appended by builder_end
    o->data = p;
    *data = p;
    return o;
}

// Finishes an object created by mp_obj_str_builder_start: computes the hash of
// the data and writes the terminating null byte.  Returns the same object.
mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
    assert(MP_OBJ_IS_STR(o_in));
    mp_obj_str_t *o = o_in;
    o->hash = qstr_compute_hash(o->data, o->len);
    // data is declared const in the object, but the buffer is still being built here
    byte *p = (byte*)o->data;
    p[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    return o;
}

693
// Allocates a new str/bytes object of the given type and length.
// If data is non-NULL the bytes are copied (plus a terminating null) and the
// hash is computed.  If data is NULL no buffer is allocated: data is set to
// NULL and hash to 0, and the caller is expected to fill in both fields itself.
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len) {
    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
    o->base.type = type;
    o->len = len;
    if (data) {
        o->hash = qstr_compute_hash(data, len);
        byte *p = m_new(byte, len + 1);
        o->data = p;
        memcpy(p, data, len * sizeof(byte));
        p[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    } else {
        // do not hand out an object with indeterminate fields
        o->hash = 0;
        o->data = NULL;
    }
    return o;
}

707
708
709
710
711
712
713
714
715
716
// Returns a str for data[0..len).
// If a qstr with this data already exists it is returned.  Otherwise a new qstr
// is created when make_qstr_if_not_already is true, and a new str object holding
// a copy of the data is created when it is false.
mp_obj_t mp_obj_new_str(const byte* data, uint len, bool make_qstr_if_not_already) {
    qstr q = qstr_find_strn(data, len);
    if (q != MP_QSTR_NULL) {
        // qstr with this data already exists
        return MP_OBJ_NEW_QSTR(q);
    } else if (make_qstr_if_not_already) {
        // no existing qstr, make a new one
        return MP_OBJ_NEW_QSTR(qstr_from_strn((const char*)data, len));
    } else {
        // no existing qstr, don't make one
        return str_new(&str_type, data, len);
    }
}

721
722
723
724
// Creates a new bytes object through str_new with bytes_type, passing data and
// len through unchanged.
mp_obj_t mp_obj_new_bytes(const byte* data, uint len) {
    mp_obj_t bytes_obj = str_new(&bytes_type, data, len);
    return bytes_obj;
}

725
726
727
728
729
730
731
732
733
734
735
736
737
738
// Returns true if the two strings hold the same bytes.
// Two qstrs are compared by identity; otherwise the hashes and lengths are
// compared first and the data is only compared when both match.
bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
    if (MP_OBJ_IS_QSTR(s1) && MP_OBJ_IS_QSTR(s2)) {
        return s1 == s2;
    } else {
        GET_STR_HASH(s1, h1);
        GET_STR_HASH(s2, h2);
        if (h1 != h2) {
            return false;
        }
        GET_STR_DATA_LEN(s1, d1, l1);
        GET_STR_DATA_LEN(s2, d2, l2);
        if (l1 != l2) {
            return false;
        }
        return memcmp(d1, d2, l1) == 0;
    }
}

743
744
// Raises the TypeError used by the accessors below when they are handed an
// object that is not a string.  Declared noreturn.
void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn));
void bad_implicit_conversion(mp_obj_t self_in) {
    nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(self_in)));
}

748
749
750
751
752
// Returns the hash of a string object; raises TypeError (through
// bad_implicit_conversion) if self_in is not a string.
uint mp_obj_str_get_hash(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_HASH(self_in, h);
        return h;
    } else {
        bad_implicit_conversion(self_in);
    }
}

// Returns the length in bytes of a string object; raises TypeError (through
// bad_implicit_conversion) if self_in is not a string.
uint mp_obj_str_get_len(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_LEN(self_in, l);
        return l;
    } else {
        bad_implicit_conversion(self_in);
    }
}

// use this if you will anyway convert the string to a qstr
// will be more efficient for the case where it's already a qstr
// A str_type object is converted with qstr_from_strn; any other object raises
// TypeError through bad_implicit_conversion.
qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
    if (MP_OBJ_IS_QSTR(self_in)) {
        return MP_OBJ_QSTR_VALUE(self_in);
    } else if (MP_OBJ_IS_TYPE(self_in, &str_type)) {
        mp_obj_str_t *self = self_in;
        return qstr_from_strn((char*)self->data, self->len);
    } else {
        bad_implicit_conversion(self_in);
    }
}

// only use this function if you need the str data to be zero terminated
// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
// Returns the data pointer of the string; raises TypeError (through
// bad_implicit_conversion) if self_in is not a string.
const char *mp_obj_str_get_str(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        (void)l; // len unused
        return (const char*)s;
    } else {
        bad_implicit_conversion(self_in);
    }
}

791
// Returns the data pointer of the string and stores its length in bytes in
// *len; raises TypeError (through bad_implicit_conversion) if self_in is not a
// string.
const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        *len = l;
        return (const char*)s;
    } else {
        bad_implicit_conversion(self_in);
    }
}
xyb's avatar
xyb committed
800
801
802
803
804
805

/******************************************************************************/
/* str iterator                                                               */

typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;
806
    mp_obj_t str;
xyb's avatar
xyb committed
807
808
809
    machine_uint_t cur;
} mp_obj_str_it_t;

810
// iternext for the str iterator: returns the next byte of the string as a
// one-byte qstr, or mp_const_stop_iteration once the end has been reached.
STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
        mp_obj_t o_out = mp_obj_new_str(str + self->cur, 1, true);
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

822
STATIC const mp_obj_type_t str_it_type = {
823
    { &mp_type_type },
824
    .name = MP_QSTR_iterator,
825
    .iternext = str_it_iternext,
xyb's avatar
xyb committed
826
827
};

828
// iternext for the bytes iterator: returns the next byte as a small int, or
// mp_const_stop_iteration once the end has been reached.
STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
        mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT((mp_small_int_t)str[self->cur]);
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

840
STATIC const mp_obj_type_t bytes_it_type = {
841
    { &mp_type_type },
842
    .name = MP_QSTR_iterator,
843
844
845
846
    .iternext = bytes_it_iternext,
};

// getiter handler of str_type: creates an iterator positioned at the start of
// the given string.
mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) {
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &str_it_type;
    o->str = str;
    o->cur = 0;
    return o;
}

// getiter handler of bytes_type: creates an iterator positioned at the start of
// the given bytes object.
mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) {
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &bytes_it_type;
    o->str = str;
    o->cur = 0;
    return o;
}