objstr.c 13.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
#include <stdlib.h>
#include <stdint.h>
#include <stdarg.h>
#include <string.h>
#include <assert.h>

#include "nlr.h"
#include "misc.h"
#include "mpconfig.h"
10
#include "qstr.h"
11
12
13
14
15
16
17
18
19
#include "obj.h"
#include "runtime0.h"
#include "runtime.h"

// String object: an object header plus the qstr that holds the string's value.
typedef struct _mp_obj_str_t {
    mp_obj_base_t base; // object header; base.type is set to &str_type by mp_obj_new_str
    qstr qstr;          // qstr handle for the contents (read via qstr_str / qstr_data)
} mp_obj_str_t;

xyb's avatar
xyb committed
20
21
22
23
24
// Forward declaration (defined in the "str iterator" section below):
// creates an iterator object over str whose position starts at cur.
static mp_obj_t mp_obj_new_str_iterator(mp_obj_str_t *str, int cur);

/******************************************************************************/
/* str                                                                        */

25
// Print the string identified by q through the supplied printf-like callback.
// PRINT_STR emits the bare text; every other kind wraps it in single quotes.
void mp_obj_str_print_qstr(void (*print)(void *env, const char *fmt, ...), void *env, qstr q, mp_print_kind_t kind) {
    const char *text = qstr_str(q);

    if (kind != PRINT_STR) {
        // TODO need to escape chars etc
        print(env, "'%s'", text);
        return;
    }

    print(env, "%s", text);
}

34
35
36
37
38
// print slot of str_type: hands the object's qstr to mp_obj_str_print_qstr.
void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
    qstr value = ((mp_obj_str_t*)self_in)->qstr;
    mp_obj_str_print_qstr(print, env, value, kind);
}

39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
// like strstr but with specified length and allows \0 bytes
// Returns a pointer to the first occurrence of needle (nlen bytes) inside
// haystack (hlen bytes), or NULL if there is none.  An empty needle matches
// at the start of haystack.
// TODO replace with something more efficient/standard
static const byte *find_subbytes(const byte *haystack, uint hlen, const byte *needle, uint nlen) {
    if (nlen > hlen) {
        // needle cannot fit anywhere
        return NULL;
    }

    const uint last_start = hlen - nlen;
    for (uint pos = 0; pos <= last_start; pos++) {
        // count how many leading bytes of needle match at this position
        uint matched = 0;
        while (matched < nlen && haystack[pos + matched] == needle[matched]) {
            matched++;
        }
        if (matched == nlen) {
            return haystack + pos;
        }
    }
    return NULL;
}

59
60
// Binary operator handler for str (installed as the binary_op slot of str_type).
//   op      - RT_BINARY_OP_* / RT_COMPARE_OP_* operator code
//   lhs_in  - the str object on the left-hand side
//   rhs_in  - right-hand operand; the type it must have depends on op
// Returns the result object, or MP_OBJ_NULL when the operation is not
// supported for the given operand.  A subscript that is neither a small int
// nor (when MICROPY_ENABLE_SLICE is set) a slice raises TypeError.
mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
    mp_obj_str_t *lhs = lhs_in;
    uint lhs_len;
    const byte *lhs_data = qstr_data(lhs->qstr, &lhs_len);

    switch (op) {
        case RT_BINARY_OP_SUBSCR:
            // TODO: need predicate to check for int-like type (bools are such for example)
            // ["no", "yes"][1 == 2] is common idiom
            if (MP_OBJ_IS_SMALL_INT(rhs_in)) {
                uint index = mp_get_index(lhs->base.type, lhs_len, rhs_in);
                return mp_obj_new_str(qstr_from_strn((const char*)lhs_data + index, 1));
#if MICROPY_ENABLE_SLICE
            } else if (MP_OBJ_IS_TYPE(rhs_in, &slice_type)) {
                machine_int_t start, stop, step;
                mp_obj_slice_get(rhs_in, &start, &stop, &step);
                assert(step == 1);
                // do the clamping in signed arithmetic so negative indices behave
                const machine_int_t len = lhs_len;
                if (start < 0) {
                    start = len + start;
                    if (start < 0) {
                        start = 0;
                    }
                } else if (start > len) {
                    start = len;
                }
                if (stop <= 0) {
                    stop = len + stop;
                    // CPython returns empty string in such case
                    if (stop < 0) {
                        stop = start;
                    }
                } else if (stop > len) {
                    stop = len;
                }
                // a slice that ends before it starts is empty; never pass a
                // negative length on to qstr_from_strn
                if (stop < start) {
                    stop = start;
                }
                return mp_obj_new_str(qstr_from_strn((const char*)lhs_data + start, stop - start));
#endif
            } else {
                // Message doesn't match CPython, but we don't have so much bytes as they
                // to spend them on verbose wording
                nlr_jump(mp_obj_new_exception_msg(MP_QSTR_TypeError, "index must be int"));
            }

        case RT_BINARY_OP_ADD:
        case RT_BINARY_OP_INPLACE_ADD:
            if (MP_OBJ_IS_TYPE(rhs_in, &str_type)) {
                // add 2 strings
                uint rhs_len;
                const byte *rhs_data = qstr_data(((mp_obj_str_t*)rhs_in)->qstr, &rhs_len);
                int alloc_len = lhs_len + rhs_len;
                byte *q_ptr;
                byte *val = qstr_build_start(alloc_len, &q_ptr);
                memcpy(val, lhs_data, lhs_len);
                memcpy(val + lhs_len, rhs_data, rhs_len);
                return mp_obj_new_str(qstr_build_end(q_ptr));
            }
            break;

        case RT_COMPARE_OP_IN:
        case RT_COMPARE_OP_NOT_IN:
            /* NOTE `a in b` is `b.__contains__(a)` */
            if (MP_OBJ_IS_TYPE(rhs_in, &str_type)) {
                uint rhs_len;
                const byte *rhs_data = qstr_data(((mp_obj_str_t*)rhs_in)->qstr, &rhs_len);
                // true for IN when found, and for NOT_IN when not found
                return MP_BOOL((op == RT_COMPARE_OP_IN) ^ (find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len) == NULL));
            }
            break;

        case RT_BINARY_OP_MULTIPLY:
        {
            if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
                return MP_OBJ_NULL;
            }
            int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
            if (n <= 0) {
                // str * non-positive count is the empty string; also keeps a
                // negative count from turning into a huge unsigned size
                return MP_OBJ_NEW_QSTR(MP_QSTR_);
            }
            uint total_len = lhs_len * n;
            char *s = m_new(char, total_len);
            mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, s);
            return MP_OBJ_NEW_QSTR(qstr_from_strn_take(s, total_len, total_len));
        }
    }

    return MP_OBJ_NULL; // op not supported
}

xyb's avatar
xyb committed
139
140
141
142
// getiter slot of str_type: builds a new iterator that starts at position 0.
static mp_obj_t str_getiter(mp_obj_t o_in) {
    mp_obj_str_t *self = o_in;
    return mp_obj_new_str_iterator(self, 0);
}

143
144
145
146
// str.join(arg): concatenate the str items of a tuple or list, placing this
// string between consecutive items.
//   self_in - the separator (must be a str object)
//   arg     - a tuple or list whose items are all str objects
// Returns a new str object.  Raises TypeError when arg is not a tuple/list or
// contains a non-str item.
mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
    assert(MP_OBJ_IS_TYPE(self_in, &str_type));
    mp_obj_str_t *self = self_in;

    // get separation string; use the stored byte length (not strlen) so the
    // size computed below always agrees with what is copied
    uint sep_len;
    const byte *sep_str = qstr_data(self->qstr, &sep_len);

    // process args
    uint seq_len;
    mp_obj_t *seq_items;
    if (MP_OBJ_IS_TYPE(arg, &tuple_type)) {
        mp_obj_tuple_get(arg, &seq_len, &seq_items);
    } else if (MP_OBJ_IS_TYPE(arg, &list_type)) {
        mp_obj_list_get(arg, &seq_len, &seq_items);
    } else {
        goto bad_arg;
    }

    // count required length
    uint required_len = 0;
    for (uint i = 0; i < seq_len; i++) {
        if (!MP_OBJ_IS_TYPE(seq_items[i], &str_type)) {
            goto bad_arg;
        }
        if (i > 0) {
            required_len += sep_len;
        }
        uint item_len;
        qstr_data(mp_obj_str_get(seq_items[i]), &item_len);
        required_len += item_len;
    }

    // make joined string
    byte *q_ptr;
    byte *s_dest = qstr_build_start(required_len, &q_ptr);
    for (uint i = 0; i < seq_len; i++) {
        if (i > 0) {
            memcpy(s_dest, sep_str, sep_len);
            s_dest += sep_len;
        }
        uint s2_len;
        const byte *s2 = qstr_data(mp_obj_str_get(seq_items[i]), &s2_len);
        memcpy(s_dest, s2, s2_len);
        s_dest += s2_len;
    }
    qstr q = qstr_build_end(q_ptr);

    // return joined string
    return mp_obj_new_str(q);

bad_arg:
    nlr_jump(mp_obj_new_exception_msg(MP_QSTR_TypeError, "?str.join expecting a list of str's"));
}

Paul Sokolovsky's avatar
Paul Sokolovsky committed
196
197
198
199
200
201
202
203
204
205
206
207
// ASCII whitespace recognised by str.split(); the same character set that
// str.strip() removes by default.  Note: evaluates its argument several times.
#define is_ws(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r' || (c) == '\v' || (c) == '\f')

// str.split([sep[, maxsplit]]): split on runs of whitespace.
//   args[0] - the string to split
//   args[1] - separator; only None is supported (checked with assert)
//   args[2] - maximum number of splits as a small int; absent or negative
//             means no limit
// Returns a new list of the pieces.  Leading whitespace is skipped; once the
// split limit is reached the remainder of the string becomes the last item.
static mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
    int splits = -1;
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
            splits = MP_OBJ_SMALL_INT_VALUE(args[2]);
        }
    }
    assert(sep == mp_const_none);
    (void)sep; // unused; to hush compiler warning
    mp_obj_t res = mp_obj_new_list(0, NULL);
    const char *s = qstr_str(mp_obj_str_get(args[0]));
    const char *start;

    // Initial whitespace is not counted as split, so we pre-do it
    while (is_ws(*s)) {
        s++;
    }
    while (*s && splits != 0) {
        // collect one word
        start = s;
        while (*s != 0 && !is_ws(*s)) {
            s++;
        }
        rt_list_append(res, MP_OBJ_NEW_QSTR(qstr_from_strn(start, s - start)));
        if (*s == 0) {
            break;
        }
        // skip the separator run; it counts as one split
        while (is_ws(*s)) {
            s++;
        }
        if (splits > 0) {
            splits--;
        }
    }

    if (*s != 0) {
        // split limit reached: the rest of the string is the final item
        rt_list_append(res, MP_OBJ_NEW_QSTR(qstr_from_str(s)));
    }

    return res;
}

xbe's avatar
xbe committed
235
236
237
238
239
240
241
242
243
// Return true when c occurs among the first str_len bytes of str.
static bool chr_in_str(const char* const str, const size_t str_len, const char c) {
    const char *cursor = str;
    size_t remaining = str_len;

    while (remaining > 0) {
        if (*cursor == c) {
            return true;
        }
        cursor++;
        remaining--;
    }
    return false;
}

244
// str.find(sub[, start[, end]]): position of the first occurrence of sub that
// lies entirely within [start, end), or -1 if there is none.
//   args[0] - the string to search
//   args[1] - the substring to look for
//   args[2] - optional start index (None means 0)
//   args[3] - optional end index (None means the string length)
// start/end go through mp_get_index, so an out-of-range value raises rather
// than being clamped (see TODO below).
static mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
    assert(2 <= n_args && n_args <= 4);

    // work on byte data with explicit lengths so embedded \0 bytes are handled
    uint haystack_len;
    const byte *haystack = qstr_data(mp_obj_str_get(args[0]), &haystack_len);
    uint needle_len;
    const byte *needle = qstr_data(mp_obj_str_get(args[1]), &needle_len);

    uint start = 0;
    uint end = haystack_len;
    /* TODO use a non-exception-throwing mp_get_index */
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = mp_get_index(&str_type, haystack_len, args[2]);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = mp_get_index(&str_type, haystack_len, args[3]);
    }

    if (end < start) {
        // empty search window; also avoids an unsigned wrap in end - start
        return MP_OBJ_NEW_SMALL_INT(-1);
    }

    const byte *p = find_subbytes(haystack + start, end - start, needle, needle_len);
    if (p == NULL) {
        // not found
        return MP_OBJ_NEW_SMALL_INT(-1);
    }

    // found
    machine_int_t pos = p - haystack;
    return MP_OBJ_NEW_SMALL_INT(pos);
}

276
// str.strip([chars]): remove leading and trailing characters.
//   args[0] - the string to strip (must be a str object)
//   args[1] - optional set of characters to remove; defaults to ASCII
//             whitespace (" \t\n\r\v\f")
// Returns a new str object; the empty string when nothing is left.
mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
    assert(1 <= n_args && n_args <= 2);
    assert(MP_OBJ_IS_TYPE(args[0], &str_type));
    const char *chars_to_del;
    static const char whitespace[] = " \t\n\r\v\f";

    if (n_args == 1) {
        chars_to_del = whitespace;
    } else {
        chars_to_del = qstr_str(mp_obj_str_get(args[1]));
    }

    const size_t chars_to_del_len = strlen(chars_to_del);
    const char *orig_str = qstr_str(mp_obj_str_get(args[0]));
    const size_t orig_str_len = strlen(orig_str);

    // find the first and last characters that are to be kept
    size_t first_good_char_pos = 0;
    bool first_good_char_pos_set = false;
    size_t last_good_char_pos = 0;
    for (size_t i = 0; i < orig_str_len; i++) {
        if (!chr_in_str(chars_to_del, chars_to_del_len, orig_str[i])) {
            last_good_char_pos = i;
            if (!first_good_char_pos_set) {
                first_good_char_pos = i;
                first_good_char_pos_set = true;
            }
        }
    }

    // Test the flag, not the positions: first == last == 0 is also what a
    // string whose only kept character sits at index 0 (e.g. "a  ") produces.
    if (!first_good_char_pos_set) {
        //string is empty or consists only of characters to delete, return ''
        return mp_obj_new_str(MP_QSTR_);
    }

    assert(last_good_char_pos >= first_good_char_pos);
    //+1 to accommodate the last character
    size_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
    return mp_obj_new_str(qstr_from_strn(orig_str + first_good_char_pos, stripped_len));
}

316
// str.format(*args): minimal positional formatting.
//   args[0]  - the format string (must be a str object)
//   args[1:] - values substituted, in order, for each "{...}" field
// "{{" produces a literal '{'.  Whatever is written between '{' and '}' is
// ignored; each field simply consumes the next argument.
// Raises IndexError when there are more fields than arguments and ValueError
// when a '{' is never closed.
mp_obj_t str_format(uint n_args, const mp_obj_t *args) {
    assert(MP_OBJ_IS_TYPE(args[0], &str_type));
    mp_obj_str_t *self = args[0];

    const char *str = qstr_str(self->qstr);
    uint arg_i = 1;
    vstr_t *vstr = vstr_new();
    for (; *str; str++) {
        if (*str == '{') {
            str++;
            if (*str == '{') {
                vstr_add_char(vstr, '{');
            } else {
                // skip to the closing brace, but never past the terminator
                while (*str != '}' && *str != '\0') {
                    str++;
                }
                if (*str == '\0') {
                    nlr_jump(mp_obj_new_exception_msg(MP_QSTR_ValueError, "unmatched '{' in format"));
                }
                if (arg_i >= n_args) {
                    nlr_jump(mp_obj_new_exception_msg(MP_QSTR_IndexError, "tuple index out of range"));
                }
                // TODO: may be PRINT_REPR depending on formatting code
                mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[arg_i], PRINT_STR);
                arg_i++;
            }
        } else {
            vstr_add_char(vstr, *str);
        }
    }

    // hand the accumulated buffer over to the new qstr
    return mp_obj_new_str(qstr_from_strn_take(vstr->buf, vstr->alloc, vstr->len));
}

345
// Function objects wrapping the str methods; the numbers are the argument
// counts passed to the MP_DEFINE_CONST_FUN_OBJ_* macros.
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
static MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
static MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, str_format);

ian-v's avatar
ian-v committed
351
static const mp_method_t str_type_methods[] = {
352
    { "find", &str_find_obj },
ian-v's avatar
ian-v committed
353
    { "join", &str_join_obj },
Paul Sokolovsky's avatar
Paul Sokolovsky committed
354
    { "split", &str_split_obj },
xbe's avatar
xbe committed
355
    { "strip", &str_strip_obj },
ian-v's avatar
ian-v committed
356
357
358
    { "format", &str_format_obj },
    { NULL, NULL }, // end-of-list sentinel
};
359

360
361
362
const mp_obj_type_t str_type = {
    { &mp_const_type },
    "str",
363
364
365
    .print = str_print,
    .binary_op = str_binary_op,
    .getiter = str_getiter,
ian-v's avatar
ian-v committed
366
    .methods = str_type_methods,
367
368
369
370
371
372
373
374
375
376
};

// Allocate a new str object whose value is the given qstr.
mp_obj_t mp_obj_new_str(qstr qstr) {
    mp_obj_str_t *str_obj = m_new_obj(mp_obj_str_t);
    str_obj->qstr = qstr;
    str_obj->base.type = &str_type;
    return str_obj;
}

// Extract the qstr from a string-like object.
// Accepts either an object for which MP_OBJ_IS_QSTR holds or a str_type
// instance; anything else raises TypeError.
qstr mp_obj_str_get(mp_obj_t self_in) {
    if (MP_OBJ_IS_QSTR(self_in)) {
        return MP_OBJ_QSTR_VALUE(self_in);
    } else if (MP_OBJ_IS_TYPE(self_in, &str_type)) {
        return ((mp_obj_str_t*)self_in)->qstr;
    } else {
        nlr_jump(mp_obj_new_exception_msg_varg(MP_QSTR_TypeError, "Can't convert '%s' object to str implicitly",
                 mp_obj_get_type_str(self_in)));
    }
}
xyb's avatar
xyb committed
387
388
389
390
391
392
393
394
395
396
397
398
399
400

/******************************************************************************/
/* str iterator                                                               */

// Iterator state for stepping through a str object one character at a time.
typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;     // object header; base.type is set to &str_it_type on creation
    mp_obj_str_t *str;      // the string being iterated over
    machine_uint_t cur;     // offset of the next character to return
} mp_obj_str_it_t;

// iternext slot of the str iterator.
// Returns a new one-character str for the current position and advances the
// iterator, or mp_const_stop_iteration once the end has been reached.
mp_obj_t str_it_iternext(mp_obj_t self_in) {
    mp_obj_str_it_t *self = self_in;

    // use the stored length instead of strlen(): no rescan of the string on
    // every step, and embedded \0 bytes do not end the iteration early
    uint len;
    const byte *data = qstr_data(self->str->qstr, &len);

    if (self->cur < len) {
        mp_obj_t o_out = mp_obj_new_str(qstr_from_strn((const char*)data + self->cur, 1));
        self->cur += 1;
        return o_out;
    } else {
        return mp_const_stop_iteration;
    }
}

static const mp_obj_type_t str_it_type = {
    { &mp_const_type },
    "str_iterator",
412
    .iternext = str_it_iternext,
xyb's avatar
xyb committed
413
414
415
416
417
418
419
420
421
};

// Create an iterator over str that will yield characters starting at offset cur.
// Declared static to match the forward declaration near the top of the file
// (the function already had internal linkage because of that declaration).
static mp_obj_t mp_obj_new_str_iterator(mp_obj_str_t *str, int cur) {
    mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
    o->base.type = &str_it_type;
    o->str = str;
    o->cur = cur;
    return o;
}