objstr.c 23 KB
Newer Older
1
2
3
4
5
6
7
8
9
#include <stdlib.h>
#include <stdint.h>
#include <stdarg.h>
#include <string.h>
#include <assert.h>

#include "nlr.h"
#include "misc.h"
#include "mpconfig.h"
10
#include "qstr.h"
11
12
13
14
15
16
#include "obj.h"
#include "runtime0.h"
#include "runtime.h"

typedef struct _mp_obj_str_t {
    mp_obj_base_t base;
17
18
19
    machine_uint_t hash : 16; // XXX here we assume the hash size is 16 bits (it is at the moment; see qstr.c)
    machine_uint_t len : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
    byte data[];
20
21
} mp_obj_str_t;

22
23
24
25
26
27
28
29
30
// Extracts the hash of a string object.
// Declares `uint str_hash` in the enclosing scope and fills it from the qstr
// table when str_obj_in is a qstr, otherwise from the hash field of the
// mp_obj_str_t.  The expansion is a declaration plus an if/else (not wrapped in
// do/while) and evaluates str_obj_in more than once, so use it as a statement
// on its own inside a braced block, with a side-effect-free argument.
#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; }

// Extracts the length (in bytes) of a string object.
// Declares `uint str_len` in the enclosing scope; same usage caveats as
// GET_STR_HASH.
#define GET_STR_LEN(str_obj_in, str_len) uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; }

// Extracts both the data pointer and the length of a string object.
// Declares `const byte *str_data` and `uint str_len` in the enclosing scope;
// same usage caveats as GET_STR_HASH.
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; }

31
32
// Iterator constructors.  They are defined further down in this file and are
// declared here so that the str and bytes type tables can reference them.
static mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
static mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
xyb's avatar
xyb committed
33
34
35
36

/******************************************************************************/
/* str                                                                        */

37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
// Prints the bytes str_data[0 .. str_len) as a quoted string literal through
// the print callback (env is passed back to the callback untouched).
// The literal is delimited by single quotes, unless the data contains a single
// quote and no double quote, in which case double quotes are used.
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
    // this escapes characters, but it will be very slow to print (calling print many times)
    const byte *const end = str_data + str_len;

    // look for quote characters; stop early once both kinds have been seen
    bool seen_single = false;
    bool seen_double = false;
    const byte *p = str_data;
    while (p < end && !(seen_single && seen_double)) {
        switch (*p) {
            case '\'':
                seen_single = true;
                break;
            case '"':
                seen_double = true;
                break;
            default:
                break;
        }
        p++;
    }
    int quote_char = (seen_single && !seen_double) ? '"' : '\'';

    print(env, "%c", quote_char);
    for (p = str_data; p < end; p++) {
        const byte c = *p;
        if (c == quote_char) {
            // the delimiter itself must be escaped
            print(env, "\\%c", quote_char);
        } else if (c == '\\') {
            print(env, "\\\\");
        } else if (c == '\n') {
            print(env, "\\n");
        // TODO add more escape codes here if we want to match CPython
        } else if (c >= 32 && c <= 126) {
            // printable ASCII goes out unchanged
            print(env, "%c", c);
        } else {
            print(env, "\\x%02x", c);
        }
    }
    print(env, "%c", quote_char);
}

// Print handler shared by str and bytes.
// A str printed with PRINT_STR is emitted raw; everything else (any other print
// kind, and bytes in all cases) is emitted as a quoted literal, with a leading
// "b" for bytes.
static void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
    GET_STR_DATA_LEN(self_in, str_data, str_len);
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &bytes_type);

    if (is_bytes || kind != PRINT_STR) {
        if (is_bytes) {
            print(env, "b");
        }
        mp_str_print_quoted(print, env, str_data, str_len);
        return;
    }

    // plain str(): output the data as-is
    print(env, "%.*s", str_len, str_data);
}

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
// like strstr but with specified length and allows \0 bytes
// Returns a pointer to the first occurrence of needle[0 .. nlen) inside
// haystack[0 .. hlen), or NULL when there is none.  An empty needle matches at
// the start of the haystack.
// TODO replace with something more efficient/standard
static const byte *find_subbytes(const byte *haystack, uint hlen, const byte *needle, uint nlen) {
    if (nlen > hlen) {
        // the needle cannot fit anywhere
        return NULL;
    }

    // last position at which a full-length match could still start
    const byte *const last_start = haystack + (hlen - nlen);
    for (const byte *candidate = haystack; candidate <= last_start; candidate++) {
        if (memcmp(candidate, needle, nlen) == 0) {
            return candidate;
        }
    }
    return NULL;
}

103
// Binary operator handler installed as .binary_op for both str and bytes.
//
//   op      - the operator code (RT_BINARY_OP_* / RT_COMPARE_OP_*)
//   lhs_in  - the str/bytes object the operator is applied to
//   rhs_in  - the right-hand operand
//
// Supported operators: subscript (small int index, and slices when
// MICROPY_ENABLE_SLICE is set), + and +=, in / not in, and * by a small int.
// Returns the result object, or MP_OBJ_NULL when the operation is not supported
// for the given operands.  Subscripting with any other index type raises
// TypeError via nlr_jump.
mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
    switch (op) {
        case RT_BINARY_OP_SUBSCR:
            // TODO: need predicate to check for int-like type (bools are such for example)
            // ["no", "yes"][1 == 2] is common idiom
            if (MP_OBJ_IS_SMALL_INT(rhs_in)) {
                uint index = mp_get_index(mp_obj_get_type(lhs_in), lhs_len, rhs_in);
                if (MP_OBJ_IS_TYPE(lhs_in, &bytes_type)) {
                    // indexing bytes gives the byte value as an int
                    return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)lhs_data[index]);
                } else {
                    // indexing str gives a one-character string
                    return mp_obj_new_str(lhs_data + index, 1, true);
                }
#if MICROPY_ENABLE_SLICE
            } else if (MP_OBJ_IS_TYPE(rhs_in, &slice_type)) {
                machine_int_t start, stop, step;
                mp_obj_slice_get(rhs_in, &start, &stop, &step);
                assert(step == 1);
                // negative indices count from the end; clamp into [0, lhs_len]
                if (start < 0) {
                    start = lhs_len + start;
                    if (start < 0) {
                        start = 0;
                    }
                } else if (start > lhs_len) {
                    start = lhs_len;
                }
                if (stop <= 0) {
                    stop = lhs_len + stop;
                    // CPython returns empty string in such case
                    if (stop < 0) {
                        stop = start;
                    }
                } else if (stop > lhs_len) {
                    stop = lhs_len;
                }
                // A slice that ends before it starts (e.g. s[3:1]) is empty.
                // Without this clamp stop - start would be negative and turn
                // into a huge unsigned length.
                if (stop < start) {
                    stop = start;
                }
                // NOTE(review): the slice is always built with mp_obj_new_str,
                // also when lhs_in is a bytes object - confirm this is intended.
                return mp_obj_new_str(lhs_data + start, stop - start, false);
#endif
            } else {
                // Message doesn't match CPython, but we don't have so much bytes as they
                // to spend them on verbose wording
                nlr_jump(mp_obj_new_exception_msg(MP_QSTR_TypeError, "index must be int"));
            }

        case RT_BINARY_OP_ADD:
        case RT_BINARY_OP_INPLACE_ADD:
            if (MP_OBJ_IS_STR(rhs_in)) {
                // add 2 strings: the result is a new non-qstr object of the
                // same type as the left operand
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                int alloc_len = lhs_len + rhs_len;
                byte *data;
                mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), alloc_len, &data);
                memcpy(data, lhs_data, lhs_len);
                memcpy(data + lhs_len, rhs_data, rhs_len);
                return mp_obj_str_builder_end(s);
            }
            break;

        case RT_COMPARE_OP_IN:
        case RT_COMPARE_OP_NOT_IN:
            /* NOTE `a in b` is `b.__contains__(a)` */
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                // XOR flips the "found" result for NOT_IN
                return MP_BOOL((op == RT_COMPARE_OP_IN) ^ (find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len) == NULL));
            }
            break;

        case RT_BINARY_OP_MULTIPLY:
        {
            if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
                return MP_OBJ_NULL; // op not supported
            }
            int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
            // Repeating a sequence a negative number of times gives an empty
            // sequence.  Without this clamp lhs_len * n wraps around to a huge
            // unsigned allocation size.
            if (n < 0) {
                n = 0;
            }
            byte *data;
            mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), lhs_len * n, &data);
            mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, data);
            return mp_obj_str_builder_end(s);
        }

        default:
            break;
    }

    return MP_OBJ_NULL; // op not supported
}

// str.join(arg): concatenates the strings in arg, inserting self_in between
// consecutive items.  arg must be a tuple or a list whose items are all
// strings, otherwise TypeError is raised.  The result has the type of self_in.
mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
    assert(MP_OBJ_IS_STR(self_in));

    // the separator is the object the method was called on
    GET_STR_DATA_LEN(self_in, sep_str, sep_len);

    // fetch the items of the sequence argument
    uint seq_len;
    mp_obj_t *seq_items;
    if (MP_OBJ_IS_TYPE(arg, &list_type)) {
        mp_obj_list_get(arg, &seq_len, &seq_items);
    } else if (MP_OBJ_IS_TYPE(arg, &tuple_type)) {
        mp_obj_tuple_get(arg, &seq_len, &seq_items);
    } else {
        goto bad_arg;
    }

    // first pass: validate the items and add up the length of the result
    int required_len = 0;
    for (int i = 0; i < seq_len; i++) {
        if (!MP_OBJ_IS_STR(seq_items[i])) {
            goto bad_arg;
        }
        GET_STR_LEN(seq_items[i], item_len);
        required_len += item_len;
        if (i != 0) {
            // one separator in front of every item but the first
            required_len += sep_len;
        }
    }

    // second pass: copy separators and items into the new string
    byte *data;
    mp_obj_t joined_str = mp_obj_str_builder_start(mp_obj_get_type(self_in), required_len, &data);
    byte *dest = data;
    for (int i = 0; i < seq_len; i++) {
        if (i != 0) {
            memcpy(dest, sep_str, sep_len);
            dest += sep_len;
        }
        GET_STR_DATA_LEN(seq_items[i], item_data, item_len);
        memcpy(dest, item_data, item_len);
        dest += item_len;
    }

    return mp_obj_str_builder_end(joined_str);

bad_arg:
    nlr_jump(mp_obj_new_exception_msg(MP_QSTR_TypeError, "?str.join expecting a list of str's"));
}

Paul Sokolovsky's avatar
Paul Sokolovsky committed
246
247
248
249
250
251
252
253
254
255
256
257
// Whitespace test used by str.split() when no separator is given.  It covers
// the same characters that str.strip() removes by default in this file
// (space, \t, \n, \r, \v, \f).  The argument is evaluated several times, so it
// must not have side effects.
#define is_ws(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r' || (c) == '\v' || (c) == '\f')

// str.split([sep[, maxsplit]]): splits args[0] on runs of whitespace and
// returns a list of the pieces.
//
//   args[0] - the string to split
//   args[1] - separator; only None (split on whitespace) is supported so far
//   args[2] - maximum number of splits (small int); negative or absent means
//             no limit
//
// Leading whitespace never produces an item.  Once the split limit is reached
// the rest of the string is appended as the final item, unmodified.
static mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
    int splits = -1;
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
            // the value is decoded as a small int below, so it must be one
            assert(MP_OBJ_IS_SMALL_INT(args[2]));
            splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
        }
    }
    assert(sep == mp_const_none);
    (void)sep; // unused; to hush compiler warning
    mp_obj_t res = mp_obj_new_list(0, NULL);
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;
    const byte *start;

    // Initial whitespace is not counted as split, so we pre-do it
    while (s < top && is_ws(*s)) s++;
    while (s < top && splits != 0) {
        // collect one word
        start = s;
        while (s < top && !is_ws(*s)) s++;
        rt_list_append(res, mp_obj_new_str(start, s - start, false));
        if (s >= top) {
            break;
        }
        // skip the whitespace run that follows the word
        while (s < top && is_ws(*s)) s++;
        if (splits > 0) {
            splits--;
        }
    }

    // split limit reached with input left over: the remainder is the last item
    if (s < top) {
        rt_list_append(res, mp_obj_new_str(s, top - s, false));
    }

    return res;
}

286
// str.find(sub[, start[, end]]): returns, as a small int, the index of the
// first occurrence of args[1] in args[0] at or after start, or -1 when there is
// no occurrence or the occurrence would extend past end.  start and end are
// optional and may be None; they are converted with mp_get_index.
static mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

    // search window, defaulting to the whole string
    size_t start = 0;
    size_t end = haystack_len;
    /* TODO use a non-exception-throwing mp_get_index */
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2]);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3]);
    }

    const byte *match = find_subbytes(haystack + start, haystack_len - start, needle, needle_len);
    if (match == NULL) {
        // not found
        return MP_OBJ_NEW_SMALL_INT(-1);
    }

    // found; only the first occurrence is checked against the end limit
    machine_int_t pos = match - haystack;
    if (pos + needle_len > end) {
        pos = -1;
    }
    return MP_OBJ_NEW_SMALL_INT(pos);
}

318
319
320
321
322
323
324
325
326
327
// str.startswith(prefix): true when the data of self_in begins with the data
// of arg.
// TODO: (Much) more variety in args
static mp_obj_t str_startswith(mp_obj_t self_in, mp_obj_t arg) {
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, prefix, prefix_len);

    // a prefix longer than the string can never match
    if (str_len < prefix_len) {
        return mp_const_false;
    }

    bool is_prefix = (memcmp(str, prefix, prefix_len) == 0);
    return MP_BOOL(is_prefix);
}

328
329
330
331
332
333
334
335
336
// Returns true when the value c occurs among the first str_len bytes of str.
static bool chr_in_str(const byte* const str, const size_t str_len, int c) {
    const byte *const end = str + str_len;
    const byte *p = str;
    while (p != end) {
        if (*p == c) {
            return true;
        }
        p++;
    }
    return false;
}

337
// str.strip([chars]): returns args[0] with leading and trailing characters
// removed.
//
//   args[0] - the string to strip
//   args[1] - optional string holding the set of characters to remove; when
//             absent, whitespace (space, \t, \n, \r, \v, \f) is removed
//
// Returns the empty string when every character of args[0] is in the removal
// set (this includes an empty args[0]).
mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
    assert(1 <= n_args && n_args <= 2);
    assert(MP_OBJ_IS_STR(args[0]));

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";

    if (n_args == 1) {
        chars_to_del = whitespace;
        // sizeof counts the terminating null byte of the literal; leave it out,
        // otherwise '\0' bytes would be stripped as if they were whitespace
        chars_to_del_len = sizeof(whitespace) - 1;
    } else {
        assert(MP_OBJ_IS_STR(args[1]));
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
    }

    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);

    // find the first and the last character that must be kept
    size_t first_good_char_pos = 0;
    bool first_good_char_pos_set = false;
    size_t last_good_char_pos = 0;
    for (size_t i = 0; i < orig_str_len; i++) {
        if (!chr_in_str(chars_to_del, chars_to_del_len, orig_str[i])) {
            last_good_char_pos = i;
            if (!first_good_char_pos_set) {
                first_good_char_pos = i;
                first_good_char_pos_set = true;
            }
        }
    }

    // Test the flag, not the positions: both positions are also 0 when the only
    // character to keep sits at index 0 (e.g. "a" or "a  ").
    if (!first_good_char_pos_set) {
        // string is all whitespace, return ''
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
    }

    assert(last_good_char_pos >= first_good_char_pos);
    // +1 to accommodate the last character
    size_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
    return mp_obj_new_str(orig_str + first_good_char_pos, stripped_len, false);
}

381
// str.format(*args): builds a new string from the format string args[0].
// "{{" produces a literal '{'.  Any other '{' starts a replacement field: the
// text up to the next '}' is skipped (not interpreted) and the next positional
// argument is printed in its place with PRINT_STR.  Raises IndexError when
// there are more fields than arguments.
mp_obj_t str_format(uint n_args, const mp_obj_t *args) {
    assert(MP_OBJ_IS_STR(args[0]));

    GET_STR_DATA_LEN(args[0], str, len);
    int arg_i = 1; // args[0] is the format string itself
    vstr_t *vstr = vstr_new();

    const byte *cur = str;
    const byte *const top = str + len;
    while (cur < top) {
        const byte c = *cur++;
        if (c != '{') {
            // ordinary character, copied through
            vstr_add_char(vstr, c);
            continue;
        }

        // cur now points just past the '{'
        if (cur < top && *cur == '{') {
            // escaped brace
            vstr_add_char(vstr, '{');
            cur++;
            continue;
        }

        // replacement field: skip its contents up to the closing brace
        while (cur < top && *cur != '}') {
            cur++;
        }
        if (arg_i >= n_args) {
            nlr_jump(mp_obj_new_exception_msg(MP_QSTR_IndexError, "tuple index out of range"));
        }
        // TODO: may be PRINT_REPR depending on formatting code
        mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[arg_i], PRINT_STR);
        arg_i++;
        if (cur < top) {
            // step over the '}'
            cur++;
        }
    }

    mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
    vstr_free(vstr);
    return s;
}

411
412
413
414
415
// str.replace(old, new[, count]): returns args[0] with occurrences of args[1]
// replaced by args[2].
//
//   args[0] - the source string
//   args[1] - the substring to look for ("old")
//   args[2] - the replacement ("new")
//   args[3] - optional small int limiting the number of replacements; a
//             negative value means no limit, 0 means no replacement at all
//
// An empty "old" matches before every character and at the end of the string,
// so "abc".replace("", "x") gives "xaxbxcx".
// Returns args[0] itself when nothing is replaced, otherwise a new object of
// the same type as args[0].
mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
    assert(MP_OBJ_IS_STR(args[2]));

    machine_int_t max_rep = 0;
    if (n_args == 4) {
        assert(MP_OBJ_IS_SMALL_INT(args[3]));
        max_rep = MP_OBJ_SMALL_INT_VALUE(args[3]);
        if (max_rep == 0) {
            return args[0];
        } else if (max_rep < 0) {
            max_rep = 0;
        }
    }

    // if max_rep is still 0 by this point we will need to do all possible replacements

    GET_STR_DATA_LEN(args[0], str, str_len);
    GET_STR_DATA_LEN(args[1], old, old_len);
    GET_STR_DATA_LEN(args[2], new, new_len);

    // old won't exist in str if it's longer, so nothing to replace
    if (old_len > str_len) {
        return args[0];
    }

    // data for the replaced string
    byte *data = NULL;
    mp_obj_t replaced_str = MP_OBJ_NULL;

    // do 2 passes over the string:
    //   first pass computes the required length of the replaced string
    //   second pass does the replacements
    for (;;) {
        machine_uint_t replaced_str_index = 0;
        machine_uint_t num_replacements_done = 0;
        const byte *old_occurrence;
        const byte *offset_ptr = str;
        machine_uint_t offset_num = 0;
        while ((old_occurrence = find_subbytes(offset_ptr, str_len - offset_num, old, old_len)) != NULL) {
            // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
            if (data != NULL) {
                memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
            }
            replaced_str_index += old_occurrence - offset_ptr;
            // copy the replacement string
            if (data != NULL) {
                memcpy(data + replaced_str_index, new, new_len);
            }
            replaced_str_index += new_len;
            offset_ptr = old_occurrence + old_len;
            offset_num = offset_ptr - str;

            num_replacements_done++;

            if (old_len == 0) {
                // An empty pattern matches without consuming any input, so the
                // search position has to be advanced by hand or the loop would
                // never terminate.
                if (offset_num >= str_len) {
                    // that was the match at the very end of the string
                    break;
                }
                // carry one source byte over and resume the search after it
                if (data != NULL) {
                    data[replaced_str_index] = *offset_ptr;
                }
                replaced_str_index++;
                offset_ptr++;
                offset_num++;
            }

            if (max_rep != 0 && num_replacements_done == max_rep) {
                break;
            }
        }

        // copy from just after end of last occurrence of to-be-replaced string to end of old string
        if (data != NULL) {
            memcpy(data + replaced_str_index, offset_ptr, str_len - offset_num);
        }
        replaced_str_index += str_len - offset_num;

        if (data == NULL) {
            // first pass
            if (num_replacements_done == 0) {
                // no substr found, return original string
                return args[0];
            } else {
                // substr found, allocate new string
                replaced_str = mp_obj_str_builder_start(mp_obj_get_type(args[0]), replaced_str_index, &data);
            }
        } else {
            // second pass, we are done
            break;
        }
    }

    return mp_obj_str_builder_end(replaced_str);
}

495
// Function objects wrapping the C implementations of the str methods.
// For the VAR_BETWEEN/VAR forms the numbers are the argument-count limits and
// match the n_args asserts inside the functions (the string itself counts as
// the first argument).

// methods taking a fixed pair of arguments
static MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
static MP_DEFINE_CONST_FUN_OBJ_2(str_startswith_obj, str_startswith);

// methods with optional arguments
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace);

// methods with an open-ended argument list
static MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, str_format);
502

ian-v's avatar
ian-v committed
503
static const mp_method_t str_type_methods[] = {
504
    { "find", &str_find_obj },
ian-v's avatar
ian-v committed
505
    { "join", &str_join_obj },
Paul Sokolovsky's avatar
Paul Sokolovsky committed
506
    { "split", &str_split_obj },
507
    { "startswith", &str_startswith_obj },
xbe's avatar
xbe committed
508
    { "strip", &str_strip_obj },
ian-v's avatar
ian-v committed
509
    { "format", &str_format_obj },
510
    { "replace", &str_replace_obj },
ian-v's avatar
ian-v committed
511
512
    { NULL, NULL }, // end-of-list sentinel
};
513

514
515
516
const mp_obj_type_t str_type = {
    { &mp_const_type },
    "str",
517
518
    .print = str_print,
    .binary_op = str_binary_op,
519
520
521
522
523
524
525
526
527
528
529
    .getiter = mp_obj_new_str_iterator,
    .methods = str_type_methods,
};

// Reuses most of methods from str
const mp_obj_type_t bytes_type = {
    { &mp_const_type },
    "bytes",
    .print = str_print,
    .binary_op = str_binary_op,
    .getiter = mp_obj_new_bytes_iterator,
ian-v's avatar
ian-v committed
530
    .methods = str_type_methods,
531
532
};

533
// Starts building a string object of the given type with room for len bytes
// (plus one extra byte for the terminator written by mp_obj_str_builder_end).
// *data is set to the object's data buffer, which the caller fills with exactly
// len bytes before calling mp_obj_str_builder_end on the returned object.
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
    mp_obj_str_t *str_obj = m_new_obj_var(mp_obj_str_t, byte, len + 1);
    str_obj->len = len;
    str_obj->base.type = type;
    // hand the buffer to the caller; the hash is filled in by the _end call
    *data = str_obj->data;
    return str_obj;
}

// Finishes an object started with mp_obj_str_builder_start, after its data has
// been filled in: computes the hash and null-terminates the data.
mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
    assert(MP_OBJ_IS_STR(o_in));
    mp_obj_str_t *str_obj = o_in;
    // for now we add null for compatibility with C ASCIIZ strings
    str_obj->data[str_obj->len] = '\0';
    str_obj->hash = qstr_compute_hash(str_obj->data, str_obj->len);
    return str_obj;
}

549
550
551
552
553
554
555
556
557
558
// Allocates a new string object of the given type holding a copy of
// data[0 .. len), with its hash computed and a null byte appended.
static mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len) {
    mp_obj_str_t *str_obj = m_new_obj_var(mp_obj_str_t, byte, len + 1);
    str_obj->base.type = type;
    str_obj->len = len;
    str_obj->hash = qstr_compute_hash(data, len);
    memcpy(str_obj->data, data, len * sizeof(byte));
    // for now we add null for compatibility with C ASCIIZ strings
    str_obj->data[len] = '\0';
    return str_obj;
}

559
560
561
562
563
564
565
566
567
568
// Creates a str object for data[0 .. len).
// An existing qstr with the same contents is always reused.  Otherwise a new
// qstr is created when make_qstr_if_not_already is true, and a heap string
// object holding a copy of the data when it is false.
mp_obj_t mp_obj_new_str(const byte* data, uint len, bool make_qstr_if_not_already) {
    qstr existing = qstr_find_strn(data, len);
    if (existing != MP_QSTR_NULL) {
        // qstr with this data already exists
        return MP_OBJ_NEW_QSTR(existing);
    }

    if (!make_qstr_if_not_already) {
        // no existing qstr, don't make one
        return str_new(&str_type, data, len);
    }

    // no existing qstr, make a new one
    return MP_OBJ_NEW_QSTR(qstr_from_strn((const char*)data, len));
}

573
574
575
576
// Creates a bytes object holding a copy of data[0 .. len).
mp_obj_t mp_obj_new_bytes(const byte* data, uint len) {
    mp_obj_t bytes_obj = str_new(&bytes_type, data, len);
    return bytes_obj;
}

577
578
579
580
581
582
583
584
585
586
587
588
589
590
// Returns true when the two string objects hold the same bytes.
bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
    // two qstrs can be compared directly
    if (MP_OBJ_IS_QSTR(s1) && MP_OBJ_IS_QSTR(s2)) {
        return s1 == s2;
    }

    // cheap rejection first: different hashes mean different strings
    GET_STR_HASH(s1, h1);
    GET_STR_HASH(s2, h2);
    if (h1 != h2) {
        return false;
    }

    // hashes agree, so compare length and then contents
    GET_STR_DATA_LEN(s1, d1, l1);
    GET_STR_DATA_LEN(s2, d2, l2);
    return l1 == l2 && memcmp(d1, d2, l1) == 0;
}

595
596
597
598
599
// Raises TypeError (via nlr_jump) reporting that self_in cannot be converted to
// str implicitly; the message includes the type name of self_in.
// Declared noreturn so that callers may use it as the final statement of a
// branch in a non-void function.
void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn));
void bad_implicit_conversion(mp_obj_t self_in) {
    nlr_jump(mp_obj_new_exception_msg_varg(MP_QSTR_TypeError, "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(self_in)));
}

600
601
602
603
604
// Returns the hash of a string object; raises TypeError for any other object.
uint mp_obj_str_get_hash(mp_obj_t self_in) {
    if (!MP_OBJ_IS_STR(self_in)) {
        bad_implicit_conversion(self_in); // does not return
    }
    GET_STR_HASH(self_in, h);
    return h;
}

// Returns the length in bytes of a string object; raises TypeError for any
// other object.
uint mp_obj_str_get_len(mp_obj_t self_in) {
    if (!MP_OBJ_IS_STR(self_in)) {
        bad_implicit_conversion(self_in); // does not return
    }
    GET_STR_LEN(self_in, l);
    return l;
}

// Returns the qstr for a str object, creating one from the object's data when
// the object is not already a qstr.  Raises TypeError for any other object.
// use this if you will anyway convert the string to a qstr
// will be more efficient for the case where it's already a qstr
qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
    if (MP_OBJ_IS_QSTR(self_in)) {
        return MP_OBJ_QSTR_VALUE(self_in);
    }
    if (!MP_OBJ_IS_TYPE(self_in, &str_type)) {
        bad_implicit_conversion(self_in); // does not return
    }
    mp_obj_str_t *self = self_in;
    return qstr_from_strn((char*)self->data, self->len);
}

// Returns the data of a string object as a C string; raises TypeError for any
// other object.
// only use this function if you need the str data to be zero terminated
// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
const char *mp_obj_str_get_str(mp_obj_t self_in) {
    if (!MP_OBJ_IS_STR(self_in)) {
        bad_implicit_conversion(self_in); // does not return
    }
    GET_STR_DATA_LEN(self_in, s, l);
    (void)l; // len unused
    return (const char*)s;
}

// Returns a pointer to the data of a string object and stores its length in
// *len; raises TypeError for any other object.
const byte *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
    if (!MP_OBJ_IS_STR(self_in)) {
        bad_implicit_conversion(self_in); // does not return
    }
    GET_STR_DATA_LEN(self_in, s, l);
    *len = l;
    return s;
}
xyb's avatar
xyb committed
652
653
654
655
656
657

/******************************************************************************/
/* str iterator                                                               */

typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;
658
    mp_obj_t str;
xyb's avatar
xyb committed
659
660
661
662
663
    machine_uint_t cur;
} mp_obj_str_it_t;

// Returns the next element of a str iterator as a one-byte string, or
// mp_const_stop_iteration once all bytes have been handed out.
mp_obj_t str_it_iternext(mp_obj_t self_in) {
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur >= len) {
        return mp_const_stop_iteration;
    }
    mp_obj_t o_out = mp_obj_new_str(str + self->cur, 1, true);
    self->cur += 1;
    return o_out;
}

static const mp_obj_type_t str_it_type = {
    { &mp_const_type },
    "str_iterator",
677
    .iternext = str_it_iternext,
xyb's avatar
xyb committed
678
679
};

680
681
682
683
// Returns the next element of a bytes iterator as a small int holding the byte
// value, or mp_const_stop_iteration once all bytes have been handed out.
mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur >= len) {
        return mp_const_stop_iteration;
    }
    mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT((mp_small_int_t)str[self->cur]);
    self->cur += 1;
    return o_out;
}

// Type object for the iterator returned when iterating a bytes object; its
// iternext handler yields the byte values as small ints.
static const mp_obj_type_t bytes_it_type = {
    { &mp_const_type },
    "bytes_iterator",
    .iternext = bytes_it_iternext,
};

// Creates an iterator over str, positioned at its first byte; each step yields
// a one-byte string.
mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) {
    mp_obj_str_it_t *it = m_new_obj(mp_obj_str_it_t);
    it->base.type = &str_it_type;
    it->cur = 0;
    it->str = str;
    return it;
}

// Creates an iterator over a bytes object, positioned at its first byte; each
// step yields a byte value as a small int.
mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) {
    mp_obj_str_it_t *it = m_new_obj(mp_obj_str_it_t);
    it->base.type = &bytes_it_type;
    it->cur = 0;
    it->str = str;
    return it;
}