objstr.c 26.6 KB
Newer Older
xbe's avatar
xbe committed
1
#include <stdbool.h>
2
3
4
5
6
7
#include <string.h>
#include <assert.h>

#include "nlr.h"
#include "misc.h"
#include "mpconfig.h"
8
#include "qstr.h"
9
10
11
12
13
14
#include "obj.h"
#include "runtime0.h"
#include "runtime.h"

typedef struct _mp_obj_str_t {
    mp_obj_base_t base;
15
16
17
    machine_uint_t hash : 16; // XXX here we assume the hash size is 16 bits (it is at the moment; see qstr.c)
    machine_uint_t len : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
    byte data[];
18
19
} mp_obj_str_t;

20
21
22
23
24
25
26
27
28
// Use this macro to extract the string hash.
// It declares `uint str_hash` in the current scope and fills it either from the
// qstr (qstr_hash) or from the hash field of the heap object. Because it expands
// to a declaration followed by an if/else, use it as a standalone statement and
// pick a variable name that is not already declared in that scope.
#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; }

// Use this macro to extract the string length.
// Declares `uint str_len` and fills it from qstr_len or from the object's len field.
// The caller must already know that str_obj_in is a string: no type check is done.
#define GET_STR_LEN(str_obj_in, str_len) uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; }

// Use this macro to extract the string data and length.
// Declares `const byte *str_data` and `uint str_len`; for a qstr both come from
// qstr_data, otherwise they are read straight from the mp_obj_str_t fields.
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; }

29
30
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
xyb's avatar
xyb committed
31
32
33
34

/******************************************************************************/
/* str                                                                        */

35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
// Print str_data[0..str_len) as a quoted, escaped literal through `print`.
// Single quotes delimit the output unless the data contains a single quote and
// no double quote, in which case double quotes are used instead.
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
    // this escapes characters, but it will be very slow to print (calling print many times)
    const byte *end = str_data + str_len;

    // look for quote characters; stop scanning once both kinds have been seen
    bool seen_single = false;
    bool seen_double = false;
    const byte *p = str_data;
    while (p < end && !(seen_single && seen_double)) {
        switch (*p) {
            case '\'':
                seen_single = true;
                break;
            case '"':
                seen_double = true;
                break;
            default:
                break;
        }
        p++;
    }
    int quote_char = (seen_single && !seen_double) ? '"' : '\'';

    print(env, "%c", quote_char);
    for (p = str_data; p < end; p++) {
        int c = *p;
        if (c == quote_char) {
            // the delimiter itself must be escaped
            print(env, "\\%c", quote_char);
        } else if (c == '\\') {
            print(env, "\\\\");
        } else if (c >= 32 && c <= 126) {
            // printable ASCII goes out unchanged
            print(env, "%c", c);
        } else if (c == '\n') {
            print(env, "\\n");
        // TODO add more escape codes here if we want to match CPython
        } else {
            print(env, "\\x%02x", c);
        }
    }
    print(env, "%c", quote_char);
}

68
// Print handler installed in both str_type and bytes_type.
// A str printed with PRINT_STR is emitted raw; every other case is emitted as a
// quoted literal, prefixed with "b" for bytes objects.
STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
    GET_STR_DATA_LEN(self_in, str_data, str_len);
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &bytes_type);
    if (kind == PRINT_STR && !is_bytes) {
        // printf-style "%.*s" reads its precision as an int, so pass one explicitly
        print(env, "%.*s", (int)str_len, str_data);
    } else {
        if (is_bytes) {
            print(env, "b");
        }
        mp_str_print_quoted(print, env, str_data, str_len);
    }
}

81
82
// like strstr but with specified length and allows \0 bytes
// TODO replace with something more efficient/standard
83
STATIC const byte *find_subbytes(const byte *haystack, uint hlen, const byte *needle, uint nlen) {
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
    if (hlen >= nlen) {
        for (uint i = 0; i <= hlen - nlen; i++) {
            bool found = true;
            for (uint j = 0; j < nlen; j++) {
                if (haystack[i + j] != needle[j]) {
                    found = false;
                    break;
                }
            }
            if (found) {
                return haystack + i;
            }
        }
    }
    return NULL;
}

101
// Binary operator handler installed in both str_type and bytes_type.
// Implements subscription, concatenation, containment, repetition and the
// ordering comparisons. Returns MP_OBJ_NULL when the operator or the right-hand
// operand is not supported; raises TypeError for a bad subscript type.
STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
    switch (op) {
        case RT_BINARY_OP_SUBSCR:
            // TODO: need predicate to check for int-like type (bools are such for example)
            // ["no", "yes"][1 == 2] is common idiom
            if (MP_OBJ_IS_SMALL_INT(rhs_in)) {
                uint index = mp_get_index(mp_obj_get_type(lhs_in), lhs_len, rhs_in, false);
                if (MP_OBJ_IS_TYPE(lhs_in, &bytes_type)) {
                    // indexing bytes yields the byte value as an int
                    return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)lhs_data[index]);
                } else {
                    // indexing str yields a one-byte string
                    return mp_obj_new_str(lhs_data + index, 1, true);
                }
#if MICROPY_ENABLE_SLICE
            } else if (MP_OBJ_IS_TYPE(rhs_in, &slice_type)) {
                machine_uint_t start, stop;
                if (!m_seq_get_fast_slice_indexes(lhs_len, rhs_in, &start, &stop)) {
                    assert(0);
                }
                return mp_obj_new_str(lhs_data + start, stop - start, false);
#endif
            } else {
                // Message doesn't match CPython, but we don't have so much bytes as they
                // to spend them on verbose wording
                nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "index must be int"));
            }
            // every branch above returns or raises; never fall into the next case
            break;

        case RT_BINARY_OP_ADD:
        case RT_BINARY_OP_INPLACE_ADD:
            if (MP_OBJ_IS_STR(rhs_in)) {
                // add 2 strings

                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                uint alloc_len = lhs_len + rhs_len;

                /* code for making qstr
                byte *q_ptr;
                byte *val = qstr_build_start(alloc_len, &q_ptr);
                memcpy(val, lhs_data, lhs_len);
                memcpy(val + lhs_len, rhs_data, rhs_len);
                return MP_OBJ_NEW_QSTR(qstr_build_end(q_ptr));
                */

                // code for non-qstr
                byte *data;
                mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), alloc_len, &data);
                memcpy(data, lhs_data, lhs_len);
                memcpy(data + lhs_len, rhs_data, rhs_len);
                return mp_obj_str_builder_end(s);
            }
            break;

        case RT_BINARY_OP_IN:
            /* NOTE `a in b` is `b.__contains__(a)` */
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                return MP_BOOL(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len) != NULL);
            }
            break;

        case RT_BINARY_OP_MULTIPLY:
        {
            if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
                return MP_OBJ_NULL;
            }
            int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
            if (n < 0) {
                // a negative count must give an empty result, not a huge
                // unsigned allocation length
                n = 0;
            }
            byte *data;
            mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), lhs_len * n, &data);
            if (n > 0) {
                mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, data);
            }
            return mp_obj_str_builder_end(s);
        }

        // These 2 are never passed here, dealt with as a special case in rt_binary_op().
        //case RT_BINARY_OP_EQUAL:
        //case RT_BINARY_OP_NOT_EQUAL:
        case RT_BINARY_OP_LESS:
        case RT_BINARY_OP_LESS_EQUAL:
        case RT_BINARY_OP_MORE:
        case RT_BINARY_OP_MORE_EQUAL:
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                return MP_BOOL(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));
            }
            break;
    }

    return MP_OBJ_NULL; // op not supported
}

189
// str.join(arg): concatenate the items of a tuple or list, inserting self
// between consecutive items. The result has the same type as self.
// Raises TypeError when arg is neither a tuple nor a list, or when an item
// fails MP_OBJ_IS_STR.
STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
    assert(MP_OBJ_IS_STR(self_in));

    // get separation string
    GET_STR_DATA_LEN(self_in, sep_str, sep_len);

    // process args
    uint seq_len;
    mp_obj_t *seq_items;
    if (MP_OBJ_IS_TYPE(arg, &tuple_type)) {
        mp_obj_tuple_get(arg, &seq_len, &seq_items);
    } else if (MP_OBJ_IS_TYPE(arg, &list_type)) {
        mp_obj_list_get(arg, &seq_len, &seq_items);
    } else {
        goto bad_arg;
    }

    // count required length (first pass also validates every item)
    uint required_len = 0;
    for (uint i = 0; i < seq_len; i++) {
        if (!MP_OBJ_IS_STR(seq_items[i])) {
            goto bad_arg;
        }
        if (i > 0) {
            required_len += sep_len;
        }
        GET_STR_LEN(seq_items[i], l);
        required_len += l;
    }

    // make joined string
    byte *data;
    mp_obj_t joined_str = mp_obj_str_builder_start(mp_obj_get_type(self_in), required_len, &data);
    for (uint i = 0; i < seq_len; i++) {
        if (i > 0) {
            memcpy(data, sep_str, sep_len);
            data += sep_len;
        }
        GET_STR_DATA_LEN(seq_items[i], s, l);
        memcpy(data, s, l);
        data += l;
    }

    // return joined string
    return mp_obj_str_builder_end(joined_str);

bad_arg:
    nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "?str.join expecting a list of str's"));
}

Paul Sokolovsky's avatar
Paul Sokolovsky committed
239
240
#define is_ws(c) ((c) == ' ' || (c) == '\t')

241
STATIC mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
242
243
244
245
246
247
248
249
250
    int splits = -1;
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
            splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
        }
    }
    assert(sep == mp_const_none);
251
    (void)sep; // unused; to hush compiler warning
Paul Sokolovsky's avatar
Paul Sokolovsky committed
252
    mp_obj_t res = mp_obj_new_list(0, NULL);
253
254
255
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;
    const byte *start;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
256
257

    // Initial whitespace is not counted as split, so we pre-do it
258
259
    while (s < top && is_ws(*s)) s++;
    while (s < top && splits != 0) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
260
        start = s;
261
262
263
        while (s < top && !is_ws(*s)) s++;
        rt_list_append(res, mp_obj_new_str(start, s - start, false));
        if (s >= top) {
Paul Sokolovsky's avatar
Paul Sokolovsky committed
264
265
            break;
        }
266
        while (s < top && is_ws(*s)) s++;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
267
268
269
270
271
        if (splits > 0) {
            splits--;
        }
    }

272
273
    if (s < top) {
        rt_list_append(res, mp_obj_new_str(s, top - s, false));
Paul Sokolovsky's avatar
Paul Sokolovsky committed
274
275
276
277
278
    }

    return res;
}

279
// str.find(sub[, start[, end]]): index of the first occurrence of sub at or
// after start that ends no later than end, or -1 when there is none.
// start/end default to 0 and len(self); passing None keeps the default.
STATIC mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
    /* TODO use a non-exception-throwing mp_get_index */
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3], true);
    }

    const byte *p = find_subbytes(haystack + start, haystack_len - start, needle, needle_len);
    if (p == NULL) {
        // not found
        return MP_OBJ_NEW_SMALL_INT(-1);
    } else {
        // found
        machine_int_t pos = p - haystack;
        // the earliest match is the only candidate: if it runs past end,
        // every later match would too
        if (pos + needle_len > end) {
            pos = -1;
        }
        return MP_OBJ_NEW_SMALL_INT(pos);
    }
}

311
// TODO: (Much) more variety in args
312
STATIC mp_obj_t str_startswith(mp_obj_t self_in, mp_obj_t arg) {
313
314
315
316
317
318
319
320
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, prefix, prefix_len);
    if (prefix_len > str_len) {
        return mp_const_false;
    }
    return MP_BOOL(memcmp(str, prefix, prefix_len) == 0);
}

321
322
// Return true when the value c occurs among the first str_len bytes of str.
STATIC bool chr_in_str(const byte* const str, const machine_uint_t str_len, int c) {
    for (machine_uint_t i = 0; i < str_len; i++) {
        if (str[i] == c) {
            return true;
        }
    }
    return false;
}

330
// str.strip([chars]): return self without the leading and trailing bytes that
// appear in chars. With no argument the ASCII whitespace set " \t\n\r\v\f" is
// stripped. Returns the empty qstr when every byte is stripped.
STATIC mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
    assert(1 <= n_args && n_args <= 2);
    assert(MP_OBJ_IS_STR(args[0]));

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";

    if (n_args == 1) {
        chars_to_del = whitespace;
        // sizeof counts the literal's terminating null byte; leave it out so
        // that '\0' bytes in the input are not treated as whitespace
        chars_to_del_len = sizeof(whitespace) - 1;
    } else {
        assert(MP_OBJ_IS_STR(args[1]));
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
    }

    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);

    // find the first and last bytes that must be kept
    machine_uint_t first_good_char_pos = 0;
    bool first_good_char_pos_set = false;
    machine_uint_t last_good_char_pos = 0;
    for (machine_uint_t i = 0; i < orig_str_len; i++) {
        if (!chr_in_str(chars_to_del, chars_to_del_len, orig_str[i])) {
            last_good_char_pos = i;
            if (!first_good_char_pos_set) {
                first_good_char_pos = i;
                first_good_char_pos_set = true;
            }
        }
    }

    // Both positions being 0 is also what a single kept byte at index 0 looks
    // like (e.g. "a" or "a  "), so the flag is the only reliable signal here.
    if (!first_good_char_pos_set) {
        // string is all whitespace, return ''
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
    }

    assert(last_good_char_pos >= first_good_char_pos);
    // +1 to accommodate the last character
    machine_uint_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
    return mp_obj_new_str(orig_str + first_good_char_pos, stripped_len, false);
}

374
// str.format(*args): minimal positional formatting.
// "{{" emits a literal '{'. Any other '{' starts a replacement field: the text
// up to the next '}' is skipped (its contents are ignored) and the next
// positional argument is printed with PRINT_STR. Raises IndexError when there
// are more fields than arguments.
mp_obj_t str_format(uint n_args, const mp_obj_t *args) {
    assert(MP_OBJ_IS_STR(args[0]));

    GET_STR_DATA_LEN(args[0], str, len);
    uint arg_i = 1; // args[0] is the format string itself
    vstr_t *vstr = vstr_new();
    for (const byte *top = str + len; str < top; str++) {
        if (*str == '{') {
            str++;
            if (str < top && *str == '{') {
                vstr_add_char(vstr, '{');
            } else {
                while (str < top && *str != '}') {
                    str++;
                }
                if (arg_i >= n_args) {
                    nlr_jump(mp_obj_new_exception_msg(&mp_type_IndexError, "tuple index out of range"));
                }
                // TODO: may be PRINT_REPR depending on formatting code
                mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[arg_i], PRINT_STR);
                arg_i++;
                if (str >= top) {
                    // unterminated field at the end of the string: stop now so
                    // the loop increment does not step past the end of the data
                    break;
                }
            }
        } else {
            vstr_add_char(vstr, *str);
        }
    }

    mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
    vstr_free(vstr);
    return s;
}

404
// str.replace(old, new[, count]): return a copy of self with occurrences of old
// replaced by new. count == 0 returns self unchanged; a negative or missing
// count replaces every occurrence. When nothing is replaced, self is returned.
// An empty old matches before every byte and at the end of the string.
STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
    assert(MP_OBJ_IS_STR(args[2]));

    machine_int_t max_rep = 0;
    if (n_args == 4) {
        assert(MP_OBJ_IS_SMALL_INT(args[3]));
        max_rep = MP_OBJ_SMALL_INT_VALUE(args[3]);
        if (max_rep == 0) {
            return args[0];
        } else if (max_rep < 0) {
            max_rep = 0;
        }
    }

    // if max_rep is still 0 by this point we will need to do all possible replacements

    GET_STR_DATA_LEN(args[0], str, str_len);
    GET_STR_DATA_LEN(args[1], old, old_len);
    GET_STR_DATA_LEN(args[2], new, new_len);

    // old won't exist in str if it's longer, so nothing to replace
    if (old_len > str_len) {
        return args[0];
    }

    // data for the replaced string
    byte *data = NULL;
    mp_obj_t replaced_str = MP_OBJ_NULL;

    // do 2 passes over the string:
    //   first pass computes the required length of the replaced string
    //   second pass does the replacements
    for (;;) {
        machine_uint_t replaced_str_index = 0;
        machine_uint_t num_replacements_done = 0;
        const byte *old_occurrence;
        const byte *offset_ptr = str;
        machine_uint_t offset_num = 0;

        if (old_len == 0) {
            // an empty pattern first matches before the first byte
            if (data != NULL) {
                memcpy(data, new, new_len);
            }
            replaced_str_index += new_len;
            num_replacements_done++;
        }

        while ((max_rep == 0 || num_replacements_done < (machine_uint_t)max_rep)
               && offset_num < str_len
               && (old_occurrence = find_subbytes(offset_ptr, str_len - offset_num, old, old_len)) != NULL) {
            if (old_len == 0) {
                // find_subbytes matches an empty pattern at offset_ptr itself;
                // step over one byte so the scan always makes progress (without
                // this the loop would never terminate)
                old_occurrence += 1;
            }
            // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
            if (data != NULL) {
                memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
            }
            replaced_str_index += old_occurrence - offset_ptr;
            // copy the replacement string
            if (data != NULL) {
                memcpy(data + replaced_str_index, new, new_len);
            }
            replaced_str_index += new_len;
            offset_ptr = old_occurrence + old_len;
            offset_num = offset_ptr - str;

            num_replacements_done++;
        }

        // copy from just after end of last occurrence of to-be-replaced string to end of old string
        if (data != NULL) {
            memcpy(data + replaced_str_index, offset_ptr, str_len - offset_num);
        }
        replaced_str_index += str_len - offset_num;

        if (data == NULL) {
            // first pass
            if (num_replacements_done == 0) {
                // no substr found, return original string
                return args[0];
            } else {
                // substr found, allocate new string
                replaced_str = mp_obj_str_builder_start(mp_obj_get_type(args[0]), replaced_str_index, &data);
            }
        } else {
            // second pass, we are done
            break;
        }
    }

    return mp_obj_str_builder_end(replaced_str);
}

488
489
490
491
492
493
494
495
// str.count(sub[, start[, end]]): number of non-overlapping occurrences of sub
// that lie entirely within [start, end). start/end default to 0 and len(self);
// passing None keeps the default.
STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

    machine_uint_t start = 0;
    machine_uint_t end = haystack_len;
    /* TODO use a non-exception-throwing mp_get_index */
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3], true);
    }

    // if needle_len is zero then we count each gap between characters as an occurrence
    if (needle_len == 0) {
        if (start > end) {
            // empty window: the unsigned subtraction below would wrap around
            return MP_OBJ_NEW_SMALL_INT(0);
        }
        return MP_OBJ_NEW_SMALL_INT(end - start + 1);
    }

    // count the occurrences
    machine_int_t num_occurrences = 0;
    for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) {
        if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) {
            num_occurrences++;
            // jump past this match (the loop increment supplies the last +1)
            // so that occurrences never overlap
            haystack_index += needle_len - 1;
        }
    }

    return MP_OBJ_NEW_SMALL_INT(num_occurrences);
}

523
// Shared implementation of str.partition and str.rpartition.
// Returns the 3-tuple (before, sep, after) split around the first occurrence of
// arg, or around the last one when rpartition is true. When arg does not occur,
// partition gives (self, '', '') and rpartition gives ('', '', self).
// Raises TypeError when arg is not a string and ValueError when it is empty.
STATIC mp_obj_t str_partitioner(mp_obj_t self_in, mp_obj_t arg, bool rpartition) {
    assert(MP_OBJ_IS_STR(self_in));
    if (!MP_OBJ_IS_STR(arg)) {
        nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError,
                                               "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(arg)));
    }
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, sep, sep_len);
    mp_obj_t result[] = {MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_)};

    if (sep_len == 0) {
        nlr_jump(mp_obj_new_exception_msg(&mp_type_ValueError, "empty separator"));
    }

    // Locate the split point first and build the result strings once, rather
    // than allocating new strings for every match while scanning.
    bool found = false;
    machine_uint_t match_index = 0;
    for (machine_uint_t str_index = 0; str_index + sep_len <= str_len; str_index++) {
        if (memcmp(&str[str_index], sep, sep_len) == 0) {
            found = true;
            match_index = str_index;
            if (!rpartition) {
                // partition wants the first match; rpartition keeps scanning
                // so that the last match wins
                break;
            }
        }
    }

    if (found) {
        result[0] = mp_obj_new_str(str, match_index, false);
        result[1] = arg;
        result[2] = mp_obj_new_str(str + match_index + sep_len, str_len - match_index - sep_len, false);
    } else if (rpartition) {
        result[2] = mp_obj_new_str(str, str_len, false);
    } else {
        result[0] = mp_obj_new_str(str, str_len, false);
    }

    return mp_obj_new_tuple(3, result);
}

555
556
557
// str.partition(sep): split around the first occurrence of sep.
// Takes exactly two arguments, matching its MP_DEFINE_CONST_FUN_OBJ_2
// registration; the former third parameter was never used.
STATIC mp_obj_t str_partition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, false);
}
558

559
560
// str.rpartition(sep): split around the last occurrence of sep.
// Takes exactly two arguments, matching its MP_DEFINE_CONST_FUN_OBJ_2
// registration; the former third parameter was never used.
STATIC mp_obj_t str_rpartition(mp_obj_t self_in, mp_obj_t arg) {
    return str_partitioner(self_in, arg, true);
}

563
564
565
566
567
568
569
570
571
572
573
574
575
576
// Buffer-protocol hook: expose the string's bytes for reading.
// Returns 0 and fills bufinfo for BUFFER_READ; for any other flags value it
// clears bufinfo and returns 1.
STATIC machine_int_t str_get_buffer(mp_obj_t self_in, buffer_info_t *bufinfo, int flags) {
    if (flags != BUFFER_READ) {
        // can't write to a string
        bufinfo->buf = NULL;
        bufinfo->len = 0;
        return 1;
    }

    GET_STR_DATA_LEN(self_in, payload, payload_len);
    bufinfo->buf = (void*)payload;
    bufinfo->len = payload_len;
    return 0;
}

577
578
579
580
581
582
583
// Function objects wrapping the method implementations above; they are what
// str_type_methods refers to. The numeric arguments are presumably the allowed
// argument counts (they match the n_args asserts in the wrapped functions).
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_startswith_obj, str_startswith);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, str_format);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace);
STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition);
STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition);
587

588
STATIC const mp_method_t str_type_methods[] = {
589
    { "find", &str_find_obj },
ian-v's avatar
ian-v committed
590
    { "join", &str_join_obj },
Paul Sokolovsky's avatar
Paul Sokolovsky committed
591
    { "split", &str_split_obj },
592
    { "startswith", &str_startswith_obj },
xbe's avatar
xbe committed
593
    { "strip", &str_strip_obj },
ian-v's avatar
ian-v committed
594
    { "format", &str_format_obj },
595
    { "replace", &str_replace_obj },
596
    { "count", &str_count_obj },
597
    { "partition", &str_partition_obj },
598
    { "rpartition", &str_rpartition_obj },
ian-v's avatar
ian-v committed
599
600
    { NULL, NULL }, // end-of-list sentinel
};
601

602
const mp_obj_type_t str_type = {
603
    { &mp_type_type },
604
    .name = MP_QSTR_str,
605
606
    .print = str_print,
    .binary_op = str_binary_op,
607
608
    .getiter = mp_obj_new_str_iterator,
    .methods = str_type_methods,
609
    .buffer_p = { .get_buffer = str_get_buffer },
610
611
612
613
};

// Reuses most of methods from str
const mp_obj_type_t bytes_type = {
614
    { &mp_type_type },
615
    .name = MP_QSTR_bytes,
616
617
618
    .print = str_print,
    .binary_op = str_binary_op,
    .getiter = mp_obj_new_bytes_iterator,
ian-v's avatar
ian-v committed
619
    .methods = str_type_methods,
620
621
};

622
// Begin building a string object of the given type with room for len bytes
// (plus one byte for the trailing null). *data is set to the writable payload;
// the caller fills exactly len bytes and then calls mp_obj_str_builder_end,
// which computes the hash and writes the terminator.
// NOTE: the len field of mp_obj_str_t is a 16-bit bitfield, so larger lengths
// do not fit in it.
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
    mp_obj_str_t *o = m_new_obj_var(mp_obj_str_t, byte, len + 1);
    o->base.type = type;
    o->len = len;
    *data = o->data;
    return o;
}

// Finish a string started with mp_obj_str_builder_start: compute the hash over
// the filled payload and append the null terminator. Returns the same object.
mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
    assert(MP_OBJ_IS_STR(o_in));
    mp_obj_str_t *o = o_in;
    o->hash = qstr_compute_hash(o->data, o->len);
    o->data[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    return o;
}

638
// Allocate a new heap string object of the given type holding a copy of
// data[0..len), with its hash computed and a trailing null byte appended.
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len) {
    mp_obj_str_t *o = m_new_obj_var(mp_obj_str_t, byte, len + 1);
    o->base.type = type;
    o->hash = qstr_compute_hash(data, len);
    o->len = len;
    memcpy(o->data, data, len * sizeof(byte));
    o->data[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    return o;
}

648
649
650
651
652
653
654
655
656
657
// Create a str object for data[0..len).
// If a qstr with this content already exists it is reused. Otherwise a new qstr
// is created when make_qstr_if_not_already is true, and a heap str object is
// allocated when it is false.
mp_obj_t mp_obj_new_str(const byte* data, uint len, bool make_qstr_if_not_already) {
    qstr q = qstr_find_strn(data, len);
    if (q != MP_QSTR_NULL) {
        // qstr with this data already exists
        return MP_OBJ_NEW_QSTR(q);
    } else if (make_qstr_if_not_already) {
        // no existing qstr, make a new one
        return MP_OBJ_NEW_QSTR(qstr_from_strn((const char*)data, len));
    } else {
        // no existing qstr, don't make one
        return str_new(&str_type, data, len);
    }
}

662
663
664
665
// Create a bytes object holding a copy of data[0..len); always a heap object.
mp_obj_t mp_obj_new_bytes(const byte* data, uint len) {
    mp_obj_t bytes_obj = str_new(&bytes_type, data, len);
    return bytes_obj;
}

666
667
668
669
670
671
672
673
674
675
676
677
678
679
// Compare two string objects for equality.
// Two qstrs are compared by value directly. Otherwise hashes are compared
// first as a cheap reject, then lengths, then the bytes themselves.
bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
    if (MP_OBJ_IS_QSTR(s1) && MP_OBJ_IS_QSTR(s2)) {
        return s1 == s2;
    } else {
        GET_STR_HASH(s1, h1);
        GET_STR_HASH(s2, h2);
        if (h1 != h2) {
            return false;
        }
        GET_STR_DATA_LEN(s1, d1, l1);
        GET_STR_DATA_LEN(s2, d2, l2);
        if (l1 != l2) {
            return false;
        }
        return memcmp(d1, d2, l1) == 0;
    }
}

684
685
// Raise TypeError for an object that was used where a string is required.
// Never returns, which lets callers omit a return after calling it.
void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn));
void bad_implicit_conversion(mp_obj_t self_in) {
    nlr_jump(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(self_in)));
}

689
690
691
692
693
// Return the hash of a string object; raises TypeError for any other object.
uint mp_obj_str_get_hash(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_HASH(self_in, h);
        return h;
    } else {
        bad_implicit_conversion(self_in);
    }
}

// Return the length in bytes of a string object; raises TypeError for any
// other object.
uint mp_obj_str_get_len(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_LEN(self_in, l);
        return l;
    } else {
        bad_implicit_conversion(self_in);
    }
}

// use this if you will anyway convert the string to a qstr
// will be more efficient for the case where it's already a qstr
// Accepts a qstr or a str_type object; anything else raises TypeError.
qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
    if (MP_OBJ_IS_QSTR(self_in)) {
        return MP_OBJ_QSTR_VALUE(self_in);
    } else if (MP_OBJ_IS_TYPE(self_in, &str_type)) {
        mp_obj_str_t *self = self_in;
        return qstr_from_strn((char*)self->data, self->len);
    } else {
        bad_implicit_conversion(self_in);
    }
}

// only use this function if you need the str data to be zero terminated
// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
// Returns a pointer to the object's own data (no copy); raises TypeError when
// self_in is not a string.
const char *mp_obj_str_get_str(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        (void)l; // len unused
        return (const char*)s;
    } else {
        bad_implicit_conversion(self_in);
    }
}

732
// Return a pointer to the string's data and store its length in *len.
// The pointer refers to the object's own data (no copy). Raises TypeError when
// self_in is not a string.
const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        *len = l;
        return (const char*)s;
    } else {
        bad_implicit_conversion(self_in);
    }
}
xyb's avatar
xyb committed
741
742
743
744
745
746

/******************************************************************************/
/* str iterator                                                               */

typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;
747
    mp_obj_t str;
xyb's avatar
xyb committed
748
749
750
    machine_uint_t cur;
} mp_obj_str_it_t;

751
// Yield the next one-byte string of the iterated str, or
// mp_const_stop_iteration once the end has been reached.
STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
        mp_obj_t o_out = mp_obj_new_str(str + self->cur, 1, true);
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

763
STATIC const mp_obj_type_t str_it_type = {
764
    { &mp_type_type },
765
    .name = MP_QSTR_iterator,
766
    .iternext = str_it_iternext,
xyb's avatar
xyb committed
767
768
};

769
// Yield the next byte of the iterated bytes object as a small int, or
// mp_const_stop_iteration once the end has been reached.
STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
        mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT((mp_small_int_t)str[self->cur]);
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

781
STATIC const mp_obj_type_t bytes_it_type = {
782
    { &mp_type_type },
783
    .name = MP_QSTR_iterator,
784
785
786
787
    .iternext = bytes_it_iternext,
};

mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) {
xyb's avatar
xyb committed
788
789
790
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &str_it_type;
    o->str = str;
791
792
793
794
795
796
797
798
799
    o->cur = 0;
    return o;
}

mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) {
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &bytes_it_type;
    o->str = str;
    o->cur = 0;
xyb's avatar
xyb committed
800
801
    return o;
}