objstr.c 22.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
#include <stdlib.h>
#include <stdint.h>
#include <stdarg.h>
#include <string.h>
#include <assert.h>

#include "nlr.h"
#include "misc.h"
#include "mpconfig.h"
10
#include "qstr.h"
11
12
13
14
15
16
#include "obj.h"
#include "runtime0.h"
#include "runtime.h"

typedef struct _mp_obj_str_t {
    mp_obj_base_t base;
17
18
19
    machine_uint_t hash : 16; // XXX here we assume the hash size is 16 bits (it is at the moment; see qstr.c)
    machine_uint_t len : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
    byte data[];
20
21
} mp_obj_str_t;

22
23
24
25
26
27
28
29
30
// Use this macro to extract the string hash.
// It DECLARES `uint str_hash` in the enclosing scope and then fills it in:
// from qstr_hash() when str_obj_in is a qstr, otherwise from the hash field
// of the mp_obj_str_t.  Because it expands to a declaration followed by an
// if/else, use it as a stand-alone statement inside a braced block.
#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; }

// Use this macro to extract the string length (in bytes).
// It DECLARES `uint str_len` in the enclosing scope; the value comes from
// qstr_len() for a qstr, otherwise from the len field of the mp_obj_str_t.
#define GET_STR_LEN(str_obj_in, str_len) uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; }

// Use this macro to extract the string data and length.
// It DECLARES `const byte *str_data` and `uint str_len` in the enclosing
// scope; both come from qstr_data() for a qstr, otherwise from the data/len
// fields of the mp_obj_str_t.  str_obj_in is evaluated more than once.
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; }

31
32
static mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
static mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
xyb's avatar
xyb committed
33
34
35
36

/******************************************************************************/
/* str                                                                        */

37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
// Print str_data[0..str_len) as a quoted, escaped literal through `print`.
// Single quotes are used unless the data contains a single quote and no
// double quote, in which case double quotes are used.  The quote character
// and backslash are backslash-escaped, printable ASCII (32..126) is emitted
// verbatim, newline becomes \n and every other byte becomes \xNN.
// Note: this calls print once per byte, so it is slow for long strings.
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
    const byte *const end = str_data + str_len;

    // Find out which quote characters occur; stop once both have been seen.
    bool seen_single = false;
    bool seen_double = false;
    for (const byte *p = str_data; !(seen_single && seen_double) && p < end; p++) {
        switch (*p) {
            case '\'':
                seen_single = true;
                break;
            case '"':
                seen_double = true;
                break;
            default:
                break;
        }
    }

    int quote_char = (seen_single && !seen_double) ? '"' : '\'';

    print(env, "%c", quote_char);
    for (const byte *p = str_data; p < end; p++) {
        int c = *p;
        if (c == quote_char) {
            print(env, "\\%c", quote_char);
        } else if (c == '\\') {
            print(env, "\\\\");
        } else if (c >= 32 && c <= 126) {
            print(env, "%c", c);
        } else if (c == '\n') {
            print(env, "\\n");
        // TODO add more escape codes here if we want to match CPython
        } else {
            print(env, "\\x%02x", c);
        }
    }
    print(env, "%c", quote_char);
}

// Print handler shared by str and bytes.
// A str printed with PRINT_STR is emitted raw; everything else (repr of a
// str, or any bytes object) is emitted quoted, with a leading "b" for bytes.
static void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
    GET_STR_DATA_LEN(self_in, str_data, str_len);
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &bytes_type);
    if (kind == PRINT_STR && !is_bytes) {
        // the precision argument of "%.*s" is an int, not an unsigned
        print(env, "%.*s", (int)str_len, str_data);
    } else {
        if (is_bytes) {
            print(env, "b");
        }
        mp_str_print_quoted(print, env, str_data, str_len);
    }
}

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
// Length-delimited substring search (like strstr, but \0 bytes are allowed).
// Returns a pointer to the first occurrence of needle[0..nlen) inside
// haystack[0..hlen), or NULL if there is none.  An empty needle matches at
// the start of the haystack.
// TODO replace with something more efficient/standard
static const byte *find_subbytes(const byte *haystack, uint hlen, const byte *needle, uint nlen) {
    if (hlen < nlen) {
        return NULL;
    }

    const uint last_start = hlen - nlen;
    for (uint off = 0; off <= last_start; off++) {
        // count how many leading bytes agree at this offset
        uint matched = 0;
        while (matched < nlen && haystack[off + matched] == needle[matched]) {
            matched++;
        }
        if (matched == nlen) {
            return haystack + off;
        }
    }
    return NULL;
}

103
// Binary operator handler shared by str and bytes.
//   SUBSCR   : integer index (str -> 1-char str, bytes -> small int) or,
//              when MICROPY_ENABLE_SLICE is set, a slice object
//   ADD      : concatenation with another string
//   IN       : substring containment (lhs is the container)
//   MULTIPLY : repetition by a small int
// Returns MP_OBJ_NULL when the operation is not supported for the operands.
mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
    switch (op) {
        case RT_BINARY_OP_SUBSCR:
            // TODO: need predicate to check for int-like type (bools are such for example)
            // ["no", "yes"][1 == 2] is common idiom
            if (MP_OBJ_IS_SMALL_INT(rhs_in)) {
                uint index = mp_get_index(mp_obj_get_type(lhs_in), lhs_len, rhs_in);
                if (MP_OBJ_IS_TYPE(lhs_in, &bytes_type)) {
                    return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)lhs_data[index]);
                } else {
                    return mp_obj_new_str(lhs_data + index, 1, true);
                }
#if MICROPY_ENABLE_SLICE
            } else if (MP_OBJ_IS_TYPE(rhs_in, &slice_type)) {
                machine_uint_t start, stop;
                // The call must not live inside assert(): with NDEBUG it would
                // be compiled out and start/stop would be used uninitialised.
                if (!m_seq_get_fast_slice_indexes(lhs_len, rhs_in, &start, &stop)) {
                    assert(0);
                    return MP_OBJ_NULL;
                }
                return mp_obj_new_str(lhs_data + start, stop - start, false);
#endif
            } else {
                // Message doesn't match CPython, but we don't have so much bytes as they
                // to spend them on verbose wording
                // (nlr_jump is expected not to return, so there is no fall-through)
                nlr_jump(mp_obj_new_exception_msg(MP_QSTR_TypeError, "index must be int"));
            }

        case RT_BINARY_OP_ADD:
        case RT_BINARY_OP_INPLACE_ADD:
            if (MP_OBJ_IS_STR(rhs_in)) {
                // add 2 strings

                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                int alloc_len = lhs_len + rhs_len;

                /* code for making qstr
                byte *q_ptr;
                byte *val = qstr_build_start(alloc_len, &q_ptr);
                memcpy(val, lhs_data, lhs_len);
                memcpy(val + lhs_len, rhs_data, rhs_len);
                return MP_OBJ_NEW_QSTR(qstr_build_end(q_ptr));
                */

                // code for non-qstr
                byte *data;
                mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), alloc_len, &data);
                memcpy(data, lhs_data, lhs_len);
                memcpy(data + lhs_len, rhs_data, rhs_len);
                return mp_obj_str_builder_end(s);
            }
            break;

        case RT_BINARY_OP_IN:
            /* NOTE `a in b` is `b.__contains__(a)` */
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                return MP_BOOL(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len) != NULL);
            }
            break;

        case RT_BINARY_OP_MULTIPLY:
        {
            if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
                return MP_OBJ_NULL;
            }
            int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
            if (n < 0) {
                // A negative count yields an empty string; without this clamp
                // lhs_len * n wraps around to a huge unsigned length.
                n = 0;
            }
            byte *data;
            mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(lhs_in), lhs_len * n, &data);
            mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, data);
            return mp_obj_str_builder_end(s);
        }
    }

    return MP_OBJ_NULL; // op not supported
}

// sep.join(seq): concatenate the strings in `arg` (a tuple or a list),
// inserting the separator `self_in` between consecutive items.
// Raises TypeError if arg is neither a tuple nor a list, or if any item is
// not a string.  The result has the same type as the separator.
mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
    assert(MP_OBJ_IS_STR(self_in));

    // get separation string
    GET_STR_DATA_LEN(self_in, sep_str, sep_len);

    // process args
    uint seq_len;
    mp_obj_t *seq_items;
    if (MP_OBJ_IS_TYPE(arg, &tuple_type)) {
        mp_obj_tuple_get(arg, &seq_len, &seq_items);
    } else if (MP_OBJ_IS_TYPE(arg, &list_type)) {
        mp_obj_list_get(arg, &seq_len, &seq_items);
    } else {
        goto bad_arg;
    }

    // count required length (and validate every item before allocating)
    int required_len = 0;
    for (uint i = 0; i < seq_len; i++) {
        if (!MP_OBJ_IS_STR(seq_items[i])) {
            goto bad_arg;
        }
        if (i > 0) {
            required_len += sep_len;
        }
        GET_STR_LEN(seq_items[i], l);
        required_len += l;
    }

    // make joined string
    byte *data;
    mp_obj_t joined_str = mp_obj_str_builder_start(mp_obj_get_type(self_in), required_len, &data);
    for (uint i = 0; i < seq_len; i++) {
        if (i > 0) {
            memcpy(data, sep_str, sep_len);
            data += sep_len;
        }
        GET_STR_DATA_LEN(seq_items[i], s, l);
        memcpy(data, s, l);
        data += l;
    }

    // return joined string
    return mp_obj_str_builder_end(joined_str);

bad_arg:
    nlr_jump(mp_obj_new_exception_msg(MP_QSTR_TypeError, "?str.join expecting a list of str's"));
}

Paul Sokolovsky's avatar
Paul Sokolovsky committed
227
228
229
230
231
232
233
234
235
236
237
238
// Whitespace test used by str_split.  Covers the same ASCII whitespace set
// that str_strip uses (" \t\n\r\v\f").  Note: evaluates its argument more
// than once.
#define is_ws(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r' || (c) == '\v' || (c) == '\f')

// str.split([sep[, maxsplit]]): split on runs of whitespace and return a list.
// Only sep == None is implemented (asserted).  maxsplit < 0 (the default)
// means no limit; once the limit is reached the unsplit remainder becomes the
// final list item.
static mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
    int splits = -1;
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
            assert(MP_OBJ_IS_SMALL_INT(args[2]));
            splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
        }
    }
    assert(sep == mp_const_none);
    (void)sep; // unused; to hush compiler warning
    mp_obj_t res = mp_obj_new_list(0, NULL);
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;

    // Initial whitespace is not counted as split, so we pre-do it
    while (s < top && is_ws(*s)) {
        s++;
    }
    while (s < top && splits != 0) {
        // collect one word
        const byte *start = s;
        while (s < top && !is_ws(*s)) {
            s++;
        }
        rt_list_append(res, mp_obj_new_str(start, s - start, false));
        if (s >= top) {
            break;
        }
        // skip the separator run following the word
        while (s < top && is_ws(*s)) {
            s++;
        }
        if (splits > 0) {
            splits--;
        }
    }

    // split limit reached with text left over: it becomes the last item
    if (s < top) {
        rt_list_append(res, mp_obj_new_str(s, top - s, false));
    }

    return res;
}

267
// str.find(sub[, start[, end]]): return the lowest index at which `sub`
// occurs within self[start:end] as a small int, or -1 if it does not occur.
// start/end may be None to keep the defaults (0 and len(self)).
static mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

    size_t start = 0;
    size_t end = haystack_len;
    /* TODO use a non-exception-throwing mp_get_index */
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2]);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3]);
    }

    const byte *p = find_subbytes(haystack + start, haystack_len - start, needle, needle_len);
    if (p == NULL) {
        // not found
        return MP_OBJ_NEW_SMALL_INT(-1);
    }

    // The search ran to the end of the string.  The first match is also the
    // leftmost one, so if it does not fit before `end` no other match can.
    size_t pos = (size_t)(p - haystack);
    if (pos + needle_len > end) {
        return MP_OBJ_NEW_SMALL_INT(-1);
    }
    return MP_OBJ_NEW_SMALL_INT((machine_int_t)pos);
}

299
300
301
302
303
304
305
306
307
308
// str.startswith(prefix): true iff self begins with `prefix`.
// Raises TypeError when prefix is not a string object; previously such an
// argument was dereferenced as if it were one.
// TODO: (Much) more variety in args
static mp_obj_t str_startswith(mp_obj_t self_in, mp_obj_t arg) {
    if (!MP_OBJ_IS_STR(arg)) {
        nlr_jump(mp_obj_new_exception_msg(MP_QSTR_TypeError, "startswith arg must be str"));
    }
    GET_STR_DATA_LEN(self_in, str, str_len);
    GET_STR_DATA_LEN(arg, prefix, prefix_len);
    if (prefix_len > str_len) {
        return mp_const_false;
    }
    return MP_BOOL(memcmp(str, prefix, prefix_len) == 0);
}

309
310
311
312
313
314
315
316
317
// Return true iff the value c occurs among the first str_len bytes of str.
static bool chr_in_str(const byte* const str, const size_t str_len, int c) {
    const byte *cursor = str;
    for (size_t remaining = str_len; remaining > 0; remaining--, cursor++) {
        if (*cursor == c) {
            return true;
        }
    }
    return false;
}

318
// str.strip([chars]): return self with leading and trailing characters that
// belong to `chars` removed.  Without `chars`, ASCII whitespace is removed.
// Returns the empty qstr when every character is stripped.
mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
    assert(1 <= n_args && n_args <= 2);
    assert(MP_OBJ_IS_STR(args[0]));

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";

    if (n_args == 1) {
        chars_to_del = whitespace;
        // -1: do not treat the literal's terminating NUL as a strippable char
        chars_to_del_len = sizeof(whitespace) - 1;
    } else {
        assert(MP_OBJ_IS_STR(args[1]));
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
    }

    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);

    // locate the first and last characters that must be kept
    size_t first_good_char_pos = 0;
    bool first_good_char_pos_set = false;
    size_t last_good_char_pos = 0;
    for (size_t i = 0; i < orig_str_len; i++) {
        if (!chr_in_str(chars_to_del, chars_to_del_len, orig_str[i])) {
            last_good_char_pos = i;
            if (!first_good_char_pos_set) {
                first_good_char_pos = i;
                first_good_char_pos_set = true;
            }
        }
    }

    // Testing the positions for 0 is not enough: "a  " also has both at 0.
    if (!first_good_char_pos_set) {
        // nothing to keep (empty input or every char stripped), return ''
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
    }

    assert(last_good_char_pos >= first_good_char_pos);
    //+1 to accommodate the last character
    size_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
    return mp_obj_new_str(orig_str + first_good_char_pos, stripped_len, false);
}

362
// str.format(*args): minimal implementation.
// Each "{...}" field is replaced by the next positional argument printed
// with PRINT_STR (the field contents are ignored); "{{" produces a literal
// '{'.  Raises IndexError when there are more fields than arguments.
mp_obj_t str_format(uint n_args, const mp_obj_t *args) {
    assert(MP_OBJ_IS_STR(args[0]));

    GET_STR_DATA_LEN(args[0], str, len);
    uint arg_i = 1;
    vstr_t *vstr = vstr_new();
    const byte *top = str + len;
    while (str < top) {
        if (*str != '{') {
            // ordinary character
            vstr_add_char(vstr, *str);
            str++;
            continue;
        }

        str++; // step over '{'
        if (str < top && *str == '{') {
            // "{{" is an escaped brace
            vstr_add_char(vstr, '{');
            str++;
            continue;
        }

        // skip the field contents up to the closing brace (or end of string)
        while (str < top && *str != '}') {
            str++;
        }
        if (arg_i >= n_args) {
            vstr_free(vstr); // don't leak the buffer when raising
            nlr_jump(mp_obj_new_exception_msg(MP_QSTR_IndexError, "tuple index out of range"));
        }
        // TODO: may be PRINT_REPR depending on formatting code
        mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[arg_i], PRINT_STR);
        arg_i++;

        // step over '}' only if present, so str never moves past `top`
        if (str < top) {
            str++;
        }
    }

    mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
    vstr_free(vstr);
    return s;
}

392
393
394
395
396
// str.replace(old, new[, count]): return a copy of self with occurrences of
// `old` replaced by `new`.  A count of 0 returns self unchanged; a negative
// or missing count replaces every occurrence.  self is also returned as-is
// when `old` does not occur.  An empty `old` matches before every character
// and at the end of the string (as in CPython).
mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));
    assert(MP_OBJ_IS_STR(args[2]));

    machine_int_t max_rep = 0;
    if (n_args == 4) {
        assert(MP_OBJ_IS_SMALL_INT(args[3]));
        max_rep = MP_OBJ_SMALL_INT_VALUE(args[3]);
        if (max_rep == 0) {
            return args[0];
        } else if (max_rep < 0) {
            max_rep = 0;
        }
    }

    // if max_rep is still 0 by this point we will need to do all possible replacements

    GET_STR_DATA_LEN(args[0], str, str_len);
    GET_STR_DATA_LEN(args[1], old, old_len);
    GET_STR_DATA_LEN(args[2], repl, repl_len);

    // old won't exist in str if it's longer, so nothing to replace
    if (old_len > str_len) {
        return args[0];
    }

    // data for the replaced string
    byte *data = NULL;
    mp_obj_t replaced_str = MP_OBJ_NULL;

    // do 2 passes over the string:
    //   first pass computes the required length of the replaced string
    //   second pass does the replacements
    for (;;) {
        machine_uint_t replaced_str_index = 0;
        machine_uint_t num_replacements_done = 0;
        const byte *old_occurrence;
        const byte *offset_ptr = str;
        machine_uint_t offset_num = 0;
        while ((old_occurrence = find_subbytes(offset_ptr, str_len - offset_num, old, old_len)) != NULL) {
            // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
            if (data != NULL) {
                memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
            }
            replaced_str_index += old_occurrence - offset_ptr;
            // copy the replacement string
            if (data != NULL) {
                memcpy(data + replaced_str_index, repl, repl_len);
            }
            replaced_str_index += repl_len;
            offset_ptr = old_occurrence + old_len;
            offset_num = offset_ptr - str;

            num_replacements_done++;
            if (max_rep != 0 && num_replacements_done == (machine_uint_t)max_rep) {
                break;
            }

            if (old_len == 0) {
                // An empty pattern matches without consuming input, so the
                // loop would never advance.  Stop after the match at the end
                // of the string, otherwise carry one byte over unchanged.
                if (offset_num >= str_len) {
                    break;
                }
                if (data != NULL) {
                    data[replaced_str_index] = *offset_ptr;
                }
                replaced_str_index++;
                offset_ptr++;
                offset_num++;
            }
        }

        // copy from just after end of last occurrence of to-be-replaced string to end of old string
        if (data != NULL) {
            memcpy(data + replaced_str_index, offset_ptr, str_len - offset_num);
        }
        replaced_str_index += str_len - offset_num;

        if (data == NULL) {
            // first pass
            if (num_replacements_done == 0) {
                // no substr found, return original string
                return args[0];
            } else {
                // substr found, allocate new string
                replaced_str = mp_obj_str_builder_start(mp_obj_get_type(args[0]), replaced_str_index, &data);
            }
        } else {
            // second pass, we are done
            break;
        }
    }

    return mp_obj_str_builder_end(replaced_str);
}

476
// Function objects wrapping the method implementations above.  The numeric
// macro arguments are the argument-count bounds (self included).
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
static MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
static MP_DEFINE_CONST_FUN_OBJ_2(str_startswith_obj, str_startswith);
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
static MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, str_format);
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace);
483

ian-v's avatar
ian-v committed
484
static const mp_method_t str_type_methods[] = {
485
    { "find", &str_find_obj },
ian-v's avatar
ian-v committed
486
    { "join", &str_join_obj },
Paul Sokolovsky's avatar
Paul Sokolovsky committed
487
    { "split", &str_split_obj },
488
    { "startswith", &str_startswith_obj },
xbe's avatar
xbe committed
489
    { "strip", &str_strip_obj },
ian-v's avatar
ian-v committed
490
    { "format", &str_format_obj },
491
    { "replace", &str_replace_obj },
ian-v's avatar
ian-v committed
492
493
    { NULL, NULL }, // end-of-list sentinel
};
494

495
496
497
const mp_obj_type_t str_type = {
    { &mp_const_type },
    "str",
498
499
    .print = str_print,
    .binary_op = str_binary_op,
500
501
502
503
504
505
506
507
508
509
510
    .getiter = mp_obj_new_str_iterator,
    .methods = str_type_methods,
};

// Reuses most of methods from str
const mp_obj_type_t bytes_type = {
    { &mp_const_type },
    "bytes",
    .print = str_print,
    .binary_op = str_binary_op,
    .getiter = mp_obj_new_bytes_iterator,
ian-v's avatar
ian-v committed
511
    .methods = str_type_methods,
512
513
};

514
// Begin building a string object of the given type with room for `len` bytes
// (plus one extra byte for the terminator written by the _end call).
// On return *data points at the object's buffer, which the caller must fill
// with exactly `len` bytes before calling mp_obj_str_builder_end().
// The hash field is not set here; mp_obj_str_builder_end() computes it.
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
    mp_obj_str_t *o = m_new_obj_var(mp_obj_str_t, byte, len + 1);
    o->base.type = type;
    o->len = len;
    *data = o->data;
    return o;
}

// Finish a string started with mp_obj_str_builder_start(): compute the hash
// over the filled-in data and append the terminating null byte.
// Returns the same object that was passed in.
mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
    assert(MP_OBJ_IS_STR(o_in));
    mp_obj_str_t *o = o_in;
    o->hash = qstr_compute_hash(o->data, o->len);
    o->data[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    return o;
}

530
531
532
533
534
535
536
537
538
539
// Allocate a new heap string object of the given type holding a private copy
// of data[0..len), with its hash computed and a trailing null byte appended.
static mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len) {
    // one extra byte for the terminator
    mp_obj_str_t *str_obj = m_new_obj_var(mp_obj_str_t, byte, len + 1);

    // payload first ...
    memcpy(str_obj->data, data, len * sizeof(byte));
    str_obj->data[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings

    // ... then the header fields
    str_obj->len = len;
    str_obj->hash = qstr_compute_hash(data, len);
    str_obj->base.type = type;
    return str_obj;
}

540
541
542
543
544
545
546
547
548
549
// Create a str object for data[0..len).
// If a qstr with this content already exists it is returned.  Otherwise a
// new qstr is made when make_qstr_if_not_already is true, or a heap str
// object (holding a copy of the data) when it is false.
mp_obj_t mp_obj_new_str(const byte* data, uint len, bool make_qstr_if_not_already) {
    qstr q = qstr_find_strn(data, len);
    if (q != MP_QSTR_NULL) {
        // qstr with this data already exists
        return MP_OBJ_NEW_QSTR(q);
    } else if (make_qstr_if_not_already) {
        // no existing qstr, make a new one
        return MP_OBJ_NEW_QSTR(qstr_from_strn((const char*)data, len));
    } else {
        // no existing qstr, don't make one
        return str_new(&str_type, data, len);
    }
}

554
555
556
557
// Create a bytes object holding a copy of data[0..len).
mp_obj_t mp_obj_new_bytes(const byte* data, uint len) {
    mp_obj_t bytes_obj = str_new(&bytes_type, data, len);
    return bytes_obj;
}

558
559
560
561
562
563
564
565
566
567
568
569
570
571
// Compare two string objects for equality of content.
// Two qstrs are compared by identity.  Otherwise the hashes are compared
// first as a cheap rejection test, then the lengths, then the bytes.
bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
    if (MP_OBJ_IS_QSTR(s1) && MP_OBJ_IS_QSTR(s2)) {
        return s1 == s2;
    } else {
        GET_STR_HASH(s1, h1);
        GET_STR_HASH(s2, h2);
        if (h1 != h2) {
            return false;
        }
        GET_STR_DATA_LEN(s1, d1, l1);
        GET_STR_DATA_LEN(s2, d2, l2);
        if (l1 != l2) {
            return false;
        }
        return memcmp(d1, d2, l1) == 0;
    }
}

576
577
578
579
580
// Raise TypeError reporting that self_in (named by its type) cannot be used
// where a str is required.  Declared noreturn.
void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn));
void bad_implicit_conversion(mp_obj_t self_in) {
    const char *type_name = mp_obj_get_type_str(self_in);
    nlr_jump(mp_obj_new_exception_msg_varg(MP_QSTR_TypeError, "Can't convert '%s' object to str implicitly", type_name));
}

581
582
583
584
585
// Return the hash of a string object; raises TypeError (via
// bad_implicit_conversion) if self_in is not a string.
uint mp_obj_str_get_hash(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_HASH(self_in, h);
        return h;
    } else {
        bad_implicit_conversion(self_in);
    }
}

// Return the length in bytes of a string object; raises TypeError (via
// bad_implicit_conversion) if self_in is not a string.
uint mp_obj_str_get_len(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_LEN(self_in, l);
        return l;
    } else {
        bad_implicit_conversion(self_in);
    }
}

// use this if you will anyway convert the string to a qstr
// will be more efficient for the case where it's already a qstr
// Accepts a qstr or a heap object of str_type; anything else raises
// TypeError via bad_implicit_conversion.
qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
    if (MP_OBJ_IS_QSTR(self_in)) {
        return MP_OBJ_QSTR_VALUE(self_in);
    } else if (MP_OBJ_IS_TYPE(self_in, &str_type)) {
        mp_obj_str_t *self = self_in;
        return qstr_from_strn((char*)self->data, self->len);
    } else {
        bad_implicit_conversion(self_in);
    }
}

// only use this function if you need the str data to be zero terminated
// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
// Raises TypeError (via bad_implicit_conversion) if self_in is not a string.
const char *mp_obj_str_get_str(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        (void)l; // len unused
        return (const char*)s;
    } else {
        bad_implicit_conversion(self_in);
    }
}

// Return a pointer to the bytes of a string object and store its length in
// *len.  Raises TypeError (via bad_implicit_conversion) if self_in is not a
// string.
const byte *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        *len = l;
        return s;
    } else {
        bad_implicit_conversion(self_in);
    }
}
xyb's avatar
xyb committed
633
634
635
636
637
638

/******************************************************************************/
/* str iterator                                                               */

typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;
639
    mp_obj_t str;
xyb's avatar
xyb committed
640
641
642
643
644
    machine_uint_t cur;
} mp_obj_str_it_t;

// Yield the next element of a str iterator as a one-byte str (interned as a
// qstr), or mp_const_stop_iteration when the string is exhausted.
mp_obj_t str_it_iternext(mp_obj_t self_in) {
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
        mp_obj_t o_out = mp_obj_new_str(str + self->cur, 1, true);
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

static const mp_obj_type_t str_it_type = {
    { &mp_const_type },
    "str_iterator",
658
    .iternext = str_it_iternext,
xyb's avatar
xyb committed
659
660
};

661
662
663
664
// Yield the next element of a bytes iterator as a small int holding the byte
// value, or mp_const_stop_iteration when the data is exhausted.
mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur < len) {
        mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT((mp_small_int_t)str[self->cur]);
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

// Type object for the bytes iterator; its iternext handler yields small ints.
static const mp_obj_type_t bytes_it_type = {
    { &mp_const_type },
    "bytes_iterator",
    .iternext = bytes_it_iternext,
};

// Create an iterator over `str` that yields one-byte str objects.
// Marked static to match the forward declaration near the top of the file.
static mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) {
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &str_it_type;
    o->str = str;
    o->cur = 0;
    return o;
}

// Create an iterator over `str` that yields each byte as a small int.
// Marked static to match the forward declaration near the top of the file.
static mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) {
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &bytes_it_type;
    o->str = str;
    o->cur = 0;
    return o;
}