objstr.c 16.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
#include <stdlib.h>
#include <stdint.h>
#include <stdarg.h>
#include <string.h>
#include <assert.h>

#include "nlr.h"
#include "misc.h"
#include "mpconfig.h"
10
#include "qstr.h"
11
12
13
14
15
16
#include "obj.h"
#include "runtime0.h"
#include "runtime.h"

typedef struct _mp_obj_str_t {
    mp_obj_base_t base;
17
18
19
    machine_uint_t hash : 16; // XXX here we assume the hash size is 16 bits (it is at the moment; see qstr.c)
    machine_uint_t len : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
    byte data[];
20
21
} mp_obj_str_t;

22
23
24
25
26
27
28
29
30
31
// The GET_STR_* macros below work on both representations of a str object:
// an interned qstr (MP_OBJ_IS_QSTR) and a heap-allocated mp_obj_str_t.
// NOTE: each macro expands to local variable declaration(s) followed by an
// if/else statement, so it must be used where a declaration is allowed, the
// chosen variable names must be unused in the enclosing scope, and it cannot
// serve as the single unbraced body of an if/for/while.

// use this macro to extract the string hash
// (declares `uint str_hash` in the enclosing scope)
#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; }

// use this macro to extract the string length
// (declares `uint str_len` in the enclosing scope)
#define GET_STR_LEN(str_obj_in, str_len) uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; }

// use this macro to extract the string data and length
// (declares `const byte *str_data` and `uint str_len` in the enclosing scope)
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; }

// forward declaration; the iterator constructor is defined at the end of the file
static mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, int cur);
xyb's avatar
xyb committed
32
33
34
35

/******************************************************************************/
/* str                                                                        */

36
37
// Print handler for str objects (installed as str_type.print).
//   print    printf-like output callback
//   env      opaque environment forwarded to `print`
//   self_in  the str object to print (qstr or heap str)
//   kind     PRINT_STR emits the raw bytes; any other kind wraps the bytes
//            in single quotes (no escaping is done yet)
void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
    GET_STR_DATA_LEN(self_in, str_data, str_len);
    // the '*' precision of "%.*s" consumes an int argument, so convert explicitly
    const int precision = (int)str_len;
    if (kind == PRINT_STR) {
        print(env, "%.*s", precision, str_data);
    } else {
        // TODO need to escape chars etc
        print(env, "'%.*s'", precision, str_data);
    }
}

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
// Length-delimited substring search: like strstr, but both buffers carry an
// explicit length and may contain \0 bytes.
// Returns a pointer to the first occurrence of needle[0..nlen) inside
// haystack[0..hlen), or NULL when there is none.  An empty needle matches at
// the start of the haystack.
// TODO replace with something more efficient/standard
static const byte *find_subbytes(const byte *haystack, uint hlen, const byte *needle, uint nlen) {
    if (nlen > hlen) {
        // needle cannot fit anywhere
        return NULL;
    }
    const uint last_start = hlen - nlen;
    for (uint off = 0; off <= last_start; off++) {
        if (memcmp(haystack + off, needle, nlen) == 0) {
            return haystack + off;
        }
    }
    return NULL;
}

66
// Binary-operator handler for str objects (installed as str_type.binary_op).
//   op      one of the RT_BINARY_OP_* / RT_COMPARE_OP_* codes
//   lhs_in  the str operand (qstr or heap str)
//   rhs_in  the other operand
// Handles subscription (small-int index, and slices when
// MICROPY_ENABLE_SLICE is set), concatenation, `in` / `not in` and
// repetition.  Returns MP_OBJ_NULL when the operation is not supported for
// the given operands; raises TypeError via nlr_jump for a subscript that is
// neither a small int nor a slice.
mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
    switch (op) {
        case RT_BINARY_OP_SUBSCR:
            // TODO: need predicate to check for int-like type (bools are such for example)
            // ["no", "yes"][1 == 2] is common idiom
            if (MP_OBJ_IS_SMALL_INT(rhs_in)) {
                uint index = mp_get_index(mp_obj_get_type(lhs_in), lhs_len, rhs_in);
                return mp_obj_new_str(lhs_data + index, 1, true);
#if MICROPY_ENABLE_SLICE
            } else if (MP_OBJ_IS_TYPE(rhs_in, &slice_type)) {
                machine_int_t start, stop, step;
                mp_obj_slice_get(rhs_in, &start, &stop, &step);
                assert(step == 1);
                // do the bounds arithmetic in a signed type
                const machine_int_t len = lhs_len;
                if (start < 0) {
                    start += len;
                    if (start < 0) {
                        start = 0;
                    }
                } else if (start > len) {
                    start = len;
                }
                // NOTE(review): stop == 0 ends up meaning "up to the end of the
                // string"; presumably that is how an omitted stop is reported
                // by mp_obj_slice_get -- confirm against the slice object.
                if (stop <= 0) {
                    stop += len;
                    // CPython returns empty string in such case
                    if (stop < 0) {
                        stop = start;
                    }
                } else if (stop > len) {
                    stop = len;
                }
                // an inverted range (e.g. s[2:1]) is the empty string; without
                // this clamp the negative difference would be passed on as a
                // huge unsigned length
                if (stop < start) {
                    stop = start;
                }
                return mp_obj_new_str(lhs_data + start, stop - start, false);
#endif
            } else {
                // Message doesn't match CPython, but we don't have so much bytes as they
                // to spend them on verbose wording
                nlr_jump(mp_obj_new_exception_msg(MP_QSTR_TypeError, "index must be int"));
            }

        case RT_BINARY_OP_ADD:
        case RT_BINARY_OP_INPLACE_ADD:
            if (MP_OBJ_IS_STR(rhs_in)) {
                // add 2 strings
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                int alloc_len = lhs_len + rhs_len;

                // the result is built as a heap str (it is not interned as a qstr)
                byte *data;
                mp_obj_t s = mp_obj_str_builder_start(alloc_len, &data);
                memcpy(data, lhs_data, lhs_len);
                memcpy(data + lhs_len, rhs_data, rhs_len);
                return mp_obj_str_builder_end(s);
            }
            break;

        case RT_COMPARE_OP_IN:
        case RT_COMPARE_OP_NOT_IN:
            /* NOTE `a in b` is `b.__contains__(a)` */
            if (MP_OBJ_IS_STR(rhs_in)) {
                GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
                // lhs is the container being searched, rhs the substring
                return MP_BOOL((op == RT_COMPARE_OP_IN) ^ (find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len) == NULL));
            }
            break;

        case RT_BINARY_OP_MULTIPLY:
        {
            if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
                return MP_OBJ_NULL; // op not supported
            }
            int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
            if (n <= 0) {
                // repeating zero or a negative number of times gives '';
                // a negative count must never reach the allocation below
                return MP_OBJ_NEW_QSTR(MP_QSTR_);
            }
            byte *data;
            mp_obj_t s = mp_obj_str_builder_start(lhs_len * n, &data);
            mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, data);
            return mp_obj_str_builder_end(s);
        }
    }

    return MP_OBJ_NULL; // op not supported
}

xyb's avatar
xyb committed
155
156
157
158
// getiter handler for str objects (installed as str_type.getiter):
// returns a new str iterator over o_in, positioned at index 0.
static mp_obj_t str_getiter(mp_obj_t o_in) {
    return mp_obj_new_str_iterator(o_in, 0);
}

159
// str.join(seq): concatenate the items of `arg`, inserting the separator
// string `self_in` between consecutive items.
//   self_in  the separator (must be a str)
//   arg      a tuple or list whose items are all str objects
// Returns a new heap str.  Raises TypeError (via nlr_jump) when `arg` is not
// a tuple/list or when one of its items is not a str.
mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
    assert(MP_OBJ_IS_STR(self_in));

    // get separation string
    GET_STR_DATA_LEN(self_in, sep_str, sep_len);

    // process args
    uint seq_len;
    mp_obj_t *seq_items;
    if (MP_OBJ_IS_TYPE(arg, &tuple_type)) {
        mp_obj_tuple_get(arg, &seq_len, &seq_items);
    } else if (MP_OBJ_IS_TYPE(arg, &list_type)) {
        mp_obj_list_get(arg, &seq_len, &seq_items);
    } else {
        goto bad_arg;
    }

    // first pass: validate the items and count the required length
    // (unsigned counters, matching the type of seq_len and the item lengths)
    uint required_len = 0;
    for (uint i = 0; i < seq_len; i++) {
        if (!MP_OBJ_IS_STR(seq_items[i])) {
            goto bad_arg;
        }
        if (i > 0) {
            required_len += sep_len;
        }
        GET_STR_LEN(seq_items[i], item_len);
        required_len += item_len;
    }

    // second pass: make joined string
    byte *data;
    mp_obj_t joined_str = mp_obj_str_builder_start(required_len, &data);
    for (uint i = 0; i < seq_len; i++) {
        if (i > 0) {
            memcpy(data, sep_str, sep_len);
            data += sep_len;
        }
        GET_STR_DATA_LEN(seq_items[i], item_data, item_len);
        memcpy(data, item_data, item_len);
        data += item_len;
    }

    // return joined string
    return mp_obj_str_builder_end(joined_str);

bad_arg:
    nlr_jump(mp_obj_new_exception_msg(MP_QSTR_TypeError, "?str.join expecting a list of str's"));
}

Paul Sokolovsky's avatar
Paul Sokolovsky committed
209
210
211
212
213
214
215
216
217
218
219
220
// Whitespace test used by str_split.  Covers the same ASCII whitespace set
// that str_strip uses (space, \t, \n, \r, \v, \f).
// NOTE: evaluates its argument several times; pass side-effect-free
// expressions only.
#define is_ws(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r' || (c) == '\v' || (c) == '\f')

// str.split([sep[, maxsplit]]): split on runs of whitespace.
//   args[0]  the string to split
//   args[1]  separator; only None (the default) is supported at the moment
//   args[2]  maximum number of splits; negative (the default, -1) means no
//            limit.  NOTE(review): read with MP_OBJ_SMALL_INT_VALUE without
//            a type check.
// Returns a new list of str objects.  Leading whitespace never produces an
// empty item; once the split limit is reached the unsplit remainder is
// appended as the final item.
static mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
    int splits = -1;
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
            splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
        }
    }
    assert(sep == mp_const_none);
    (void)sep; // unused; to hush compiler warning
    mp_obj_t res = mp_obj_new_list(0, NULL);
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;

    // Initial whitespace is not counted as split, so we pre-do it
    while (s < top && is_ws(*s)) {
        s++;
    }
    while (s < top && splits != 0) {
        // collect one word
        const byte *start = s;
        while (s < top && !is_ws(*s)) {
            s++;
        }
        rt_list_append(res, mp_obj_new_str(start, s - start, false));
        if (s >= top) {
            break;
        }
        // skip the whitespace run that follows the word
        while (s < top && is_ws(*s)) {
            s++;
        }
        // a negative count means "unlimited" and is never decremented
        if (splits > 0) {
            splits--;
        }
    }

    // split limit reached with input left over: keep the rest as one item
    if (s < top) {
        rt_list_append(res, mp_obj_new_str(s, top - s, false));
    }

    return res;
}

249
// str.find(sub[, start[, end]]): index of the first occurrence of `sub`
// that lies entirely within [start, end), or -1 when there is none.
//   args[0]  the string to search
//   args[1]  the substring to look for
//   args[2]  optional start index (None means 0)
//   args[3]  optional end index (None means the string length)
// The indices are converted with mp_get_index.
static mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);
    assert(MP_OBJ_IS_STR(args[0]));
    assert(MP_OBJ_IS_STR(args[1]));

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

    size_t start = 0;
    size_t end = haystack_len;
    /* TODO use a non-exception-throwing mp_get_index */
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2]);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3]);
    }

    if (end < start) {
        // empty search window: nothing can match
        return MP_OBJ_NEW_SMALL_INT(-1);
    }

    // search only inside the window, so a hit never extends past `end`
    const byte *p = find_subbytes(haystack + start, end - start, needle, needle_len);
    if (p == NULL) {
        // not found
        return MP_OBJ_NEW_SMALL_INT(-1);
    }

    // found: report the position relative to the whole string
    machine_int_t pos = p - haystack;
    return MP_OBJ_NEW_SMALL_INT(pos);
}

281
282
283
284
285
286
287
288
289
// Returns true when the value c occurs among the first str_len bytes of str,
// false otherwise (including for an empty buffer).
static bool chr_in_str(const byte* const str, const size_t str_len, int c) {
    const byte *const stop = str + str_len;
    const byte *cursor = str;
    while (cursor != stop) {
        if (*cursor == c) {
            return true;
        }
        ++cursor;
    }
    return false;
}

290
// str.strip([chars]): return the string with leading and trailing
// characters removed.
//   args[0]  the string to strip
//   args[1]  optional str holding the set of characters to remove; when
//            omitted, ASCII whitespace (space, \t, \n, \r, \v, \f) is removed
// Returns the empty qstr when nothing is left, otherwise a str holding the
// kept span.
mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
    assert(1 <= n_args && n_args <= 2);
    assert(MP_OBJ_IS_STR(args[0]));

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";

    if (n_args == 1) {
        chars_to_del = whitespace;
        // -1: do not count the literal's terminating null byte, otherwise
        // '\0' bytes would be treated as whitespace and stripped as well
        chars_to_del_len = sizeof(whitespace) - 1;
    } else {
        assert(MP_OBJ_IS_STR(args[1]));
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
    }

    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);

    // index of the first character to keep
    size_t first_good = 0;
    while (first_good < orig_str_len
           && chr_in_str(chars_to_del, chars_to_del_len, orig_str[first_good])) {
        first_good++;
    }

    if (first_good == orig_str_len) {
        // empty string, or every character is in the strip set: return ''
        // (tested on the scan result itself, so a string whose only kept
        // character sits at index 0, e.g. "a", is not mistaken for this case)
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
    }

    // one past the last character to keep; the loop cannot run below
    // first_good because orig_str[first_good] is known to be kept
    size_t end_good = orig_str_len;
    while (end_good > first_good
           && chr_in_str(chars_to_del, chars_to_del_len, orig_str[end_good - 1])) {
        end_good--;
    }

    return mp_obj_new_str(orig_str + first_good, end_good - first_good, false);
}

334
// str.format(*args): minimal positional formatting.
//   args[0]   the format string
//   args[1..] values substituted, in order, for each "{...}" field
// "{{" emits a literal '{'.  Whatever is written between '{' and the next
// '}' is skipped (no format spec is interpreted yet) and the next unused
// argument is printed with PRINT_STR.  Raises IndexError (via nlr_jump) when
// there are more fields than arguments.
mp_obj_t str_format(uint n_args, const mp_obj_t *args) {
    assert(MP_OBJ_IS_STR(args[0]));

    GET_STR_DATA_LEN(args[0], str, len);
    uint arg_i = 1; // unsigned, like n_args; args[0] is the format string itself
    vstr_t *vstr = vstr_new();
    for (const byte *top = str + len; str < top; str++) {
        if (*str == '{') {
            str++;
            if (str < top && *str == '{') {
                // escaped brace
                vstr_add_char(vstr, '{');
            } else {
                // skip the field contents up to the closing brace
                while (str < top && *str != '}') {
                    str++;
                }
                if (arg_i >= n_args) {
                    // release the scratch buffer before leaving non-locally
                    vstr_free(vstr);
                    nlr_jump(mp_obj_new_exception_msg(MP_QSTR_IndexError, "tuple index out of range"));
                }
                // TODO: may be PRINT_REPR depending on formatting code
                mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[arg_i], PRINT_STR);
                arg_i++;
            }
        } else {
            vstr_add_char(vstr, *str);
        }
    }

    mp_obj_t s = mp_obj_new_str((byte*)vstr->buf, vstr->len, false);
    vstr_free(vstr);
    return s;
}

364
// Function objects wrapping the str methods implemented above.  The numeric
// arguments mirror the argument counts each implementation asserts or reads
// (the count includes the str itself).
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
static MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
static MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, str_format);

ian-v's avatar
ian-v committed
370
static const mp_method_t str_type_methods[] = {
371
    { "find", &str_find_obj },
ian-v's avatar
ian-v committed
372
    { "join", &str_join_obj },
Paul Sokolovsky's avatar
Paul Sokolovsky committed
373
    { "split", &str_split_obj },
xbe's avatar
xbe committed
374
    { "strip", &str_strip_obj },
ian-v's avatar
ian-v committed
375
376
377
    { "format", &str_format_obj },
    { NULL, NULL }, // end-of-list sentinel
};
378

379
380
381
const mp_obj_type_t str_type = {
    { &mp_const_type },
    "str",
382
383
384
    .print = str_print,
    .binary_op = str_binary_op,
    .getiter = str_getiter,
ian-v's avatar
ian-v committed
385
    .methods = str_type_methods,
386
387
};

388
389
// Begin building a heap str of exactly `len` bytes.
//   len   number of string bytes the caller is going to write
//   data  out: receives a pointer to the object's data buffer
// Allocates the object with room for len + 1 bytes and sets its type and
// length; the hash is not set here.  The caller fills (*data)[0..len) and
// then calls mp_obj_str_builder_end on the returned object.
// NOTE: the length is stored in the 16-bit `len` bit-field of mp_obj_str_t.
mp_obj_t mp_obj_str_builder_start(uint len, byte **data) {
    mp_obj_str_t *o = m_new_obj_var(mp_obj_str_t, byte, len + 1);
    o->base.type = &str_type;
    o->len = len;
    *data = o->data;
    return o;
}

// Finish a str started with mp_obj_str_builder_start: computes and stores
// the hash of the bytes the caller wrote and appends the terminating null
// byte.  Returns the same object.
mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
    assert(MP_OBJ_IS_STR(o_in));
    mp_obj_str_t *o = o_in;
    o->hash = qstr_compute_hash(o->data, o->len);
    o->data[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    return o;
}

404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
// Create a str object from `len` bytes at `data`.
//   data                      bytes of the string (need not be terminated)
//   len                       number of bytes
//   make_qstr_if_not_already  when true, intern the bytes as a new qstr if
//                             no matching qstr exists yet
// An already-existing qstr with the same bytes is always reused.  Otherwise
// the result is either a freshly interned qstr or a heap str holding a copy
// of the bytes, depending on the flag.
mp_obj_t mp_obj_new_str(const byte* data, uint len, bool make_qstr_if_not_already) {
    qstr interned = qstr_find_strn(data, len);

    // qstr with this data already exists
    if (interned != MP_QSTR_NULL) {
        return MP_OBJ_NEW_QSTR(interned);
    }

    // no existing qstr, make a new one
    if (make_qstr_if_not_already) {
        return MP_OBJ_NEW_QSTR(qstr_from_strn((const char*)data, len));
    }

    // no existing qstr, don't make one: copy the bytes into a heap str
    mp_obj_str_t *str_obj = m_new_obj_var(mp_obj_str_t, byte, len + 1);
    str_obj->base.type = &str_type;
    str_obj->hash = qstr_compute_hash(data, len);
    str_obj->len = len;
    memcpy(str_obj->data, data, len * sizeof(byte));
    str_obj->data[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
    return str_obj;
}

// Returns true when the two str objects hold the same bytes.
// Two qstrs are compared by identity.  Otherwise the hashes and the lengths
// are compared first as cheap rejections, and the bytes are compared last.
bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
    if (MP_OBJ_IS_QSTR(s1) && MP_OBJ_IS_QSTR(s2)) {
        return s1 == s2;
    } else {
        GET_STR_HASH(s1, h1);
        GET_STR_HASH(s2, h2);
        if (h1 != h2) {
            return false;
        }
        GET_STR_DATA_LEN(s1, d1, l1);
        GET_STR_DATA_LEN(s2, d2, l2);
        if (l1 != l2) {
            return false;
        }
        // memcmp, not strncmp: str data may contain \0 bytes, and strncmp
        // would stop comparing at the first one
        return memcmp(d1, d2, l1) == 0;
    }
}

// Returns the hash of a str object (qstr or heap str).
// Raises TypeError (via nlr_jump) when self_in is not a str.
uint mp_obj_str_get_hash(mp_obj_t self_in) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_HASH(self_in, h);
        return h;
    } else {
        nlr_jump(mp_obj_new_exception_msg_varg(MP_QSTR_TypeError, "Can't convert '%s' object to str implicitly",
                 mp_obj_get_type_str(self_in)));
    }
}

// Returns the length in bytes of a str object (qstr or heap str).
// Raises TypeError (via nlr_jump) when self_in is not a str.
uint mp_obj_str_get_len(mp_obj_t self_in) {
    if (!MP_OBJ_IS_STR(self_in)) {
        nlr_jump(mp_obj_new_exception_msg_varg(MP_QSTR_TypeError, "Can't convert '%s' object to str implicitly",
                 mp_obj_get_type_str(self_in)));
    }
    GET_STR_LEN(self_in, n_bytes);
    return n_bytes;
}

// Returns the bytes of a str object as a C string.
// only use this function if you need the str data to be zero terminated
// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
// Raises TypeError (via nlr_jump) when self_in is not a str.
const char *mp_obj_str_get_str(mp_obj_t self_in) {
    if (!MP_OBJ_IS_STR(self_in)) {
        nlr_jump(mp_obj_new_exception_msg_varg(MP_QSTR_TypeError, "Can't convert '%s' object to str implicitly",
                 mp_obj_get_type_str(self_in)));
    }
    GET_STR_DATA_LEN(self_in, bytes, n_bytes);
    (void)n_bytes; // len unused
    return (const char*)bytes;
}

// Returns a pointer to the bytes of a str object (qstr or heap str) and
// stores the number of bytes in *len.
// Raises TypeError (via nlr_jump) when self_in is not a str.
const byte *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
    if (MP_OBJ_IS_STR(self_in)) {
        GET_STR_DATA_LEN(self_in, s, l);
        *len = l;
        return s;
    } else {
        nlr_jump(mp_obj_new_exception_msg_varg(MP_QSTR_TypeError, "Can't convert '%s' object to str implicitly",
                 mp_obj_get_type_str(self_in)));
    }
}
xyb's avatar
xyb committed
485
486
487
488
489
490

/******************************************************************************/
/* str iterator                                                               */

typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;
491
    mp_obj_t str;
xyb's avatar
xyb committed
492
493
494
495
496
    machine_uint_t cur;
} mp_obj_str_it_t;

// iternext handler of the str iterator: yields the byte at the current
// position as a one-byte str (interned as a qstr) and advances, or returns
// mp_const_stop_iteration once the end of the string has been reached.
mp_obj_t str_it_iternext(mp_obj_t self_in) {
    mp_obj_str_it_t *self = self_in;
    GET_STR_DATA_LEN(self->str, str, len);
    if (self->cur >= len) {
        return mp_const_stop_iteration;
    }
    mp_obj_t o_out = mp_obj_new_str(str + self->cur, 1, true);
    self->cur += 1;
    return o_out;
}

static const mp_obj_type_t str_it_type = {
    { &mp_const_type },
    "str_iterator",
510
    .iternext = str_it_iternext,
xyb's avatar
xyb committed
511
512
};

513
// Create a str iterator over `str`, starting at byte index `cur`.
// This function is forward-declared `static` near the top of the file, so it
// has internal linkage even though `static` is not repeated here.
// NOTE: `cur` is stored in an unsigned field; callers are expected to pass a
// non-negative index (the only caller in this file passes 0).
mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, int cur) {
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &str_it_type;
    o->str = str;
    o->cur = cur;
    return o;
}