objstr.c 73.4 KB
Newer Older
1
2
3
4
5
6
/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 * Copyright (c) 2014 Paul Sokolovsky
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

28
29
30
#include <string.h>
#include <assert.h>

31
32
33
34
35
36
#include "py/nlr.h"
#include "py/unicode.h"
#include "py/objstr.h"
#include "py/objlist.h"
#include "py/runtime0.h"
#include "py/runtime.h"
37
#include "py/stackctrl.h"
38

39
STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, mp_uint_t n_args, const mp_obj_t *args, mp_obj_t dict);
40

41
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
42
STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in);
43

xyb's avatar
xyb committed
44
45
46
/******************************************************************************/
/* str                                                                        */

47
void mp_str_print_quoted(const mp_print_t *print, const byte *str_data, mp_uint_t str_len, bool is_bytes) {
    // Prints str_data[0..str_len) surrounded by quotes, escaping as needed.
    // Output is emitted piecewise (one print call per character), so this is
    // slow for long strings.
    const byte *const end = str_data + str_len;

    // Scan for quote characters; the scan stops at the first double quote
    // because at that point the choice of delimiter is already decided.
    bool seen_single = false;
    bool seen_double = false;
    for (const byte *p = str_data; p < end && !seen_double; p++) {
        switch (*p) {
            case '\'':
                seen_single = true;
                break;
            case '"':
                seen_double = true;
                break;
            default:
                break;
        }
    }

    // Use double quotes only when the data has a single quote and no double quote.
    int quote_char = (seen_single && !seen_double) ? '"' : '\'';

    mp_printf(print, "%c", quote_char);
    for (const byte *p = str_data; p < end; p++) {
        const byte ch = *p;
        if (ch == quote_char) {
            mp_printf(print, "\\%c", quote_char);
            continue;
        }
        if (ch == '\\') {
            mp_print_str(print, "\\\\");
            continue;
        }
        // In strings, anything that is not an ASCII control character is
        // printed as is, including bytes in the range 0x80-0xff (which can be
        // non-Latin letters, etc.).  For bytes objects only 0x20-0x7e pass.
        const bool is_control = ch < 0x20 || ch == 0x7f;
        const bool is_high_in_bytes = is_bytes && ch >= 0x80;
        if (!is_control && !is_high_in_bytes) {
            mp_printf(print, "%c", ch);
            continue;
        }
        switch (ch) {
            case '\n':
                mp_print_str(print, "\\n");
                break;
            case '\r':
                mp_print_str(print, "\\r");
                break;
            case '\t':
                mp_print_str(print, "\\t");
                break;
            default:
                // everything else becomes a two-digit hex escape
                mp_printf(print, "\\x%02x", ch);
                break;
        }
    }
    mp_printf(print, "%c", quote_char);
}

86
#if MICROPY_PY_UJSON
87
void mp_str_print_json(const mp_print_t *print, const byte *str_data, size_t str_len) {
    // Prints the data as a double-quoted JSON string.
    // For the JSON spec, see http://www.ietf.org/rfc/rfc4627.txt
    // Given a valid utf8-encoded string the output is JSON-conforming.
    const byte *const end = str_data + str_len;

    mp_print_str(print, "\"");
    for (const byte *p = str_data; p < end; p++) {
        const byte ch = *p;
        if (ch == '"' || ch == '\\') {
            // the two characters that must be backslash-escaped
            mp_printf(print, "\\%c", ch);
            continue;
        }
        if (ch >= 32) {
            // normal characters and the bytes of utf-8 encoded characters
            mp_printf(print, "%c", ch);
            continue;
        }
        switch (ch) {
            case '\n':
                mp_print_str(print, "\\n");
                break;
            case '\r':
                mp_print_str(print, "\\r");
                break;
            case '\t':
                mp_print_str(print, "\\t");
                break;
            default:
                // remaining control characters use the \uXXXX form
                mp_printf(print, "\\u%04x", ch);
                break;
        }
    }
    mp_print_str(print, "\"");
}
#endif

112
// Print method shared by str and bytes objects.
//  - PRINT_JSON (when MICROPY_PY_UJSON is enabled): JSON-quoted output.
//  - PRINT_RAW, or PRINT_STR on a non-bytes object in the non-unicode build:
//    the raw data is written without quotes.
//  - otherwise: quoted/escaped output, prefixed with 'b' for bytes.
STATIC void str_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
    GET_STR_DATA_LEN(self_in, str_data, str_len);
    #if MICROPY_PY_UJSON
    if (kind == PRINT_JSON) {
        mp_str_print_json(print, str_data, str_len);
        return;
    }
    #endif
    #if !MICROPY_PY_BUILTINS_STR_UNICODE
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &mp_type_bytes);
    #else
    // with unicode enabled this function is treated as handling bytes only
    bool is_bytes = true;
    #endif
    if (kind == PRINT_RAW || (!MICROPY_PY_BUILTINS_STR_UNICODE && kind == PRINT_STR && !is_bytes)) {
        // The '*' precision of a printf-style format consumes an int from the
        // variadic arguments, so the length must be narrowed explicitly;
        // passing a wider size type here is undefined behaviour.
        mp_printf(print, "%.*s", (int)str_len, str_data);
    } else {
        if (is_bytes) {
            mp_print_str(print, "b");
        }
        mp_str_print_quoted(print, str_data, str_len, is_bytes);
    }
}

135
mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
136
137
138
139
140
141
#if MICROPY_CPYTHON_COMPAT
    if (n_kw != 0) {
        mp_arg_error_unimpl_kw();
    }
#endif

142
143
    mp_arg_check_num(n_args, n_kw, 0, 3, false);

144
145
146
147
    switch (n_args) {
        case 0:
            return MP_OBJ_NEW_QSTR(MP_QSTR_);

148
        case 1: {
149
            vstr_t vstr;
150
151
152
            mp_print_t print;
            vstr_init_print(&vstr, 16, &print);
            mp_obj_print_helper(&print, args[0], PRINT_STR);
153
            return mp_obj_new_str_from_vstr(type, &vstr);
154
155
        }

156
        default: // 2 or 3 args
157
            // TODO: validate 2nd/3rd args
158
159
160
            if (MP_OBJ_IS_TYPE(args[0], &mp_type_bytes)) {
                GET_STR_DATA_LEN(args[0], str_data, str_len);
                GET_STR_HASH(args[0], str_hash);
161
                mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(type, NULL, str_len));
162
163
                o->data = str_data;
                o->hash = str_hash;
164
                return MP_OBJ_FROM_PTR(o);
165
166
167
168
            } else {
                mp_buffer_info_t bufinfo;
                mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
                return mp_obj_new_str(bufinfo.buf, bufinfo.len, false);
169
170
171
172
            }
    }
}

173
// Constructor for bytes objects.
//  - no args: the empty bytes object
//  - str plus 1 or 2 more args: bytes pointing at the string's data
//  - small int n: n zero bytes (ValueError if n is negative)
//  - object with the buffer protocol: a copy of its buffer
//  - any other iterable: one byte per item
// Raises TypeError for an unsupported number of arguments.
STATIC mp_obj_t bytes_make_new(const mp_obj_type_t *type_in, size_t n_args, size_t n_kw, const mp_obj_t *args) {
    (void)type_in;

    #if MICROPY_CPYTHON_COMPAT
    if (n_kw != 0) {
        mp_arg_error_unimpl_kw();
    }
    #else
    (void)n_kw;
    #endif

    if (n_args == 0) {
        return mp_const_empty_bytes;
    }

    if (MP_OBJ_IS_STR(args[0])) {
        if (n_args < 2 || n_args > 3) {
            goto wrong_args;
        }
        // the new bytes object is pointed at the string's data and hash
        GET_STR_DATA_LEN(args[0], str_data, str_len);
        GET_STR_HASH(args[0], str_hash);
        mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(&mp_type_bytes, NULL, str_len));
        o->data = str_data;
        o->hash = str_hash;
        return MP_OBJ_FROM_PTR(o);
    }

    if (n_args > 1) {
        goto wrong_args;
    }

    if (MP_OBJ_IS_SMALL_INT(args[0])) {
        // Keep the count signed so that a negative value is rejected rather
        // than being converted to a huge unsigned allocation size.
        mp_int_t len = MP_OBJ_SMALL_INT_VALUE(args[0]);
        if (len < 0) {
            mp_raise_ValueError("negative count");
        }
        vstr_t vstr;
        vstr_init_len(&vstr, len);
        memset(vstr.buf, 0, len);
        return mp_obj_new_str_from_vstr(&mp_type_bytes, &vstr);
    }

    // check if argument has the buffer protocol
    mp_buffer_info_t bufinfo;
    if (mp_get_buffer(args[0], &bufinfo, MP_BUFFER_READ)) {
        return mp_obj_new_str_of_type(&mp_type_bytes, bufinfo.buf, bufinfo.len);
    }

    vstr_t vstr;
    // Try to create array of exact len if initializer len is known
    mp_obj_t len_in = mp_obj_len_maybe(args[0]);
    if (len_in == MP_OBJ_NULL) {
        vstr_init(&vstr, 16);
    } else {
        mp_int_t len = MP_OBJ_SMALL_INT_VALUE(len_in);
        vstr_init(&vstr, len);
    }

    mp_obj_t iterable = mp_getiter(args[0]);
    mp_obj_t item;
    while ((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
        mp_int_t val = mp_obj_get_int(item);
        #if MICROPY_CPYTHON_COMPAT
        if (val < 0 || val > 255) {
            mp_raise_ValueError("bytes value out of range");
        }
        #endif
        vstr_add_byte(&vstr, val);
    }

    return mp_obj_new_str_from_vstr(&mp_type_bytes, &vstr);

wrong_args:
    mp_raise_TypeError("wrong number of arguments");
}

246
247
// like strstr but with specified length and allows \0 bytes
// TODO replace with something more efficient/standard
248
// Returns a pointer to the first occurrence of needle found when scanning
// from the start of haystack (direction > 0) or from its end (otherwise),
// or NULL if needle does not occur or is longer than haystack.
const byte *find_subbytes(const byte *haystack, mp_uint_t hlen, const byte *needle, mp_uint_t nlen, mp_int_t direction) {
    if (hlen < nlen) {
        // needle cannot fit
        return NULL;
    }

    // largest offset at which needle still fits inside haystack
    const mp_uint_t last_offset = hlen - nlen;
    const bool forward = direction > 0;
    mp_uint_t pos = forward ? 0 : last_offset;
    const mp_uint_t stop = forward ? last_offset : 0;

    while (memcmp(haystack + pos, needle, nlen) != 0) {
        if (pos == stop) {
            // every candidate offset has been tried
            return NULL;
        }
        pos += direction;
    }
    return haystack + pos;
}

273
274
275
// Note: this function is used to check if an object is a str or bytes, which
// works because both those types use it as their binary_op method.  Revisit
// MP_OBJ_IS_STR_OR_BYTES if this fact changes.
276
// Binary operator handler for str and bytes.  Returns MP_OBJ_NULL when the
// operation (or the combination of operand types) is not supported.
mp_obj_t mp_obj_str_binary_op(mp_uint_t op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
    // check for modulo (printf-style formatting)
    if (op == MP_BINARY_OP_MODULO) {
        // default: the rhs is the single positional argument
        mp_obj_t *fmt_args = &rhs_in;
        mp_uint_t n_fmt_args = 1;
        mp_obj_t fmt_dict = MP_OBJ_NULL;
        if (MP_OBJ_IS_TYPE(rhs_in, &mp_type_tuple)) {
            // TODO: Support tuple subclasses?
            mp_obj_tuple_get(rhs_in, &n_fmt_args, &fmt_args);
        } else if (MP_OBJ_IS_TYPE(rhs_in, &mp_type_dict)) {
            fmt_args = NULL;
            n_fmt_args = 0;
            fmt_dict = rhs_in;
        }
        return str_modulo_format(lhs_in, n_fmt_args, fmt_args, fmt_dict);
    }

    // from now on we need lhs type and data, so extract them
    mp_obj_type_t *lhs_type = mp_obj_get_type(lhs_in);
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);

    // check for multiply (sequence repetition)
    if (op == MP_BINARY_OP_MULTIPLY) {
        mp_int_t repeat;
        if (!mp_obj_get_int_maybe(rhs_in, &repeat)) {
            return MP_OBJ_NULL; // op not supported
        }
        if (repeat > 0) {
            vstr_t vstr;
            vstr_init_len(&vstr, lhs_len * repeat);
            mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, repeat, vstr.buf);
            return mp_obj_new_str_from_vstr(lhs_type, &vstr);
        }
        // zero or negative count gives the empty object of the lhs kind
        if (lhs_type == &mp_type_str) {
            return MP_OBJ_NEW_QSTR(MP_QSTR_); // empty str
        } else {
            return mp_const_empty_bytes;
        }
    }

    // From now on all operations allow:
    //    - str with str
    //    - bytes with bytes
    //    - bytes with bytearray
    //    - bytes with array.array
    // To do this efficiently we use the buffer protocol to extract the raw
    // data for the rhs, but only if the lhs is a bytes object.
    //
    // NOTE: CPython does not allow comparison between bytes and array.array
    // (even if the array is of type 'b'), even though it allows addition of
    // such types.  We are not compatible with this (we do allow comparison
    // of bytes with anything that has the buffer protocol).  It would be
    // easy to "fix" this with a bit of extra logic below, but it costs code
    // size and execution time so we don't.

    const byte *rhs_data;
    mp_uint_t rhs_len;
    if (lhs_type == mp_obj_get_type(rhs_in)) {
        GET_STR_DATA_LEN(rhs_in, same_type_data, same_type_len);
        rhs_data = same_type_data;
        rhs_len = same_type_len;
    } else if (lhs_type != &mp_type_bytes) {
        // incompatible types
        return MP_OBJ_NULL; // op not supported
    } else {
        mp_buffer_info_t bufinfo;
        if (!mp_get_buffer(rhs_in, &bufinfo, MP_BUFFER_READ)) {
            return MP_OBJ_NULL; // op not supported
        }
        rhs_data = bufinfo.buf;
        rhs_len = bufinfo.len;
    }

    if (op == MP_BINARY_OP_ADD || op == MP_BINARY_OP_INPLACE_ADD) {
        // concatenation
        vstr_t vstr;
        vstr_init_len(&vstr, lhs_len + rhs_len);
        memcpy(vstr.buf, lhs_data, lhs_len);
        memcpy(vstr.buf + lhs_len, rhs_data, rhs_len);
        return mp_obj_new_str_from_vstr(lhs_type, &vstr);
    }

    if (op == MP_BINARY_OP_IN) {
        /* NOTE `a in b` is `b.__contains__(a)` */
        return mp_obj_new_bool(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len, 1) != NULL);
    }

    // MP_BINARY_OP_NOT_EQUAL is never passed here.
    // MP_BINARY_OP_EQUAL will be passed only for bytes, str is dealt with in mp_obj_equal()
    if (op == MP_BINARY_OP_EQUAL
        || op == MP_BINARY_OP_LESS
        || op == MP_BINARY_OP_LESS_EQUAL
        || op == MP_BINARY_OP_MORE
        || op == MP_BINARY_OP_MORE_EQUAL) {
        return mp_obj_new_bool(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));
    }

    return MP_OBJ_NULL; // op not supported
}

378
379
#if !MICROPY_PY_BUILTINS_STR_UNICODE
// objstrunicode defines own version
380
const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len,
381
                             mp_obj_t index, bool is_slice) {
382
    mp_uint_t index_val = mp_get_index(type, self_len, index, is_slice);
383
384
    return self_data + index_val;
}
385
#endif
386

387
388
// This is used for both bytes and 8-bit strings. This is not used for unicode strings.
// Subscript handler.  Only loads (value == MP_OBJ_SENTINEL) are handled;
// anything else is reported as unsupported by returning MP_OBJ_NULL.
STATIC mp_obj_t bytes_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
    mp_obj_type_t *type = mp_obj_get_type(self_in);
    GET_STR_DATA_LEN(self_in, self_data, self_len);

    if (value != MP_OBJ_SENTINEL) {
        return MP_OBJ_NULL; // op not supported
    }

    // load
#if MICROPY_PY_BUILTINS_SLICE
    if (MP_OBJ_IS_TYPE(index, &mp_type_slice)) {
        mp_bound_slice_t bounds;
        if (!mp_seq_get_fast_slice_indexes(self_len, index, &bounds)) {
            mp_not_implemented("only slices with step=1 (aka None) are supported");
        }
        // a slice yields a new object of the same type as self
        return mp_obj_new_str_of_type(type, self_data + bounds.start, bounds.stop - bounds.start);
    }
#endif
    mp_uint_t pos = mp_get_index(type, self_len, index, false);
    // If we have unicode enabled the type will always be bytes, so the
    // single-character string result is only reachable in the 8-bit build.
    if (!MICROPY_PY_BUILTINS_STR_UNICODE && type != &mp_type_bytes) {
        return mp_obj_new_str((char*)&self_data[pos], 1, true);
    }
    // indexing bytes yields the integer value of the byte
    return MP_OBJ_NEW_SMALL_INT(self_data[pos]);
}

414
// Implements join(): concatenates the items of arg, inserting self between
// consecutive items.  Every item must have exactly the same type as self,
// otherwise TypeError is raised.
STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
    mp_check_self(MP_OBJ_IS_STR_OR_BYTES(self_in));
    const mp_obj_type_t *self_type = mp_obj_get_type(self_in);

    // the separator is self
    GET_STR_DATA_LEN(self_in, sep_data, sep_len);

    // obtain the items as a plain array
    mp_uint_t n_items;
    mp_obj_t *items;
    if (MP_OBJ_IS_TYPE(arg, &mp_type_tuple)) {
        mp_obj_tuple_get(arg, &n_items, &items);
    } else {
        if (!MP_OBJ_IS_TYPE(arg, &mp_type_list)) {
            // not a list either, so build a list out of it first
            // TODO: Try to optimize?
            arg = mp_type_list.make_new(&mp_type_list, 1, 0, &arg);
        }
        mp_obj_list_get(arg, &n_items, &items);
    }

    // first pass: validate item types and add up the item lengths
    mp_uint_t total_len = 0;
    for (mp_uint_t i = 0; i < n_items; i++) {
        if (mp_obj_get_type(items[i]) != self_type) {
            mp_raise_msg(&mp_type_TypeError,
                "join expects a list of str/bytes objects consistent with self object");
        }
        GET_STR_LEN(items[i], item_len);
        total_len += item_len;
    }
    // one separator between each pair of neighbouring items
    if (n_items > 1) {
        total_len += sep_len * (n_items - 1);
    }

    // second pass: copy everything into a buffer of exactly that size
    vstr_t vstr;
    vstr_init_len(&vstr, total_len);
    byte *out = (byte*)vstr.buf;
    for (mp_uint_t i = 0; i < n_items; i++) {
        if (i != 0) {
            memcpy(out, sep_data, sep_len);
            out += sep_len;
        }
        GET_STR_DATA_LEN(items[i], item_data, item_len);
        memcpy(out, item_data, item_len);
        out += item_len;
    }

    return mp_obj_new_str_from_vstr(self_type, &vstr);
}

467
mp_obj_t mp_obj_str_split(size_t n_args, const mp_obj_t *args) {
468
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
469
    mp_int_t splits = -1;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
470
471
472
473
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
474
            splits = mp_obj_get_int(args[2]);
Paul Sokolovsky's avatar
Paul Sokolovsky committed
475
476
        }
    }
477

Paul Sokolovsky's avatar
Paul Sokolovsky committed
478
    mp_obj_t res = mp_obj_new_list(0, NULL);
479
480
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;
481
482
483
484
485

    if (sep == mp_const_none) {
        // sep not given, so separate on whitespace

        // Initial whitespace is not counted as split, so we pre-do it
486
        while (s < top && unichar_isspace(*s)) s++;
487
488
        while (s < top && splits != 0) {
            const byte *start = s;
489
            while (s < top && !unichar_isspace(*s)) s++;
490
            mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start));
491
492
493
            if (s >= top) {
                break;
            }
494
            while (s < top && unichar_isspace(*s)) s++;
495
496
497
            if (splits > 0) {
                splits--;
            }
Paul Sokolovsky's avatar
Paul Sokolovsky committed
498
499
        }

500
        if (s < top) {
501
            mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, s, top - s));
502
503
504
505
        }

    } else {
        // sep given
506
        if (mp_obj_get_type(sep) != self_type) {
507
            bad_implicit_conversion(sep);
508
        }
509

510
        mp_uint_t sep_len;
511
512
513
        const char *sep_str = mp_obj_str_get_data(sep, &sep_len);

        if (sep_len == 0) {
514
            mp_raise_ValueError("empty separator");
515
516
517
518
519
520
521
522
523
524
525
526
527
        }

        for (;;) {
            const byte *start = s;
            for (;;) {
                if (splits == 0 || s + sep_len > top) {
                    s = top;
                    break;
                } else if (memcmp(s, sep_str, sep_len) == 0) {
                    break;
                }
                s++;
            }
528
            mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start));
529
530
531
532
533
534
535
536
            if (s >= top) {
                break;
            }
            s += sep_len;
            if (splits > 0) {
                splits--;
            }
        }
Paul Sokolovsky's avatar
Paul Sokolovsky committed
537
538
539
540
541
    }

    return res;
}

542
#if MICROPY_PY_BUILTINS_STR_SPLITLINES
543
// Implements splitlines(keepends=False).
// Splits on "\n", "\r" and "\r\n".  When keepends is true the line ending is
// kept as part of each returned piece.  Returns a list of pieces having the
// same type as pos_args[0].
STATIC mp_obj_t str_splitlines(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
    enum { ARG_keepends };
    static const mp_arg_t allowed_args[] = {
        { MP_QSTR_keepends, MP_ARG_BOOL, {.u_bool = false} },
    };

    // parse args
    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
    mp_arg_parse_all(n_args - 1, pos_args + 1, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);

    const mp_obj_type_t *self_type = mp_obj_get_type(pos_args[0]);
    mp_obj_t res = mp_obj_new_list(0, NULL);

    GET_STR_DATA_LEN(pos_args[0], s, len);
    const byte *top = s + len;

    while (s < top) {
        const byte *start = s;
        // number of bytes in the line ending that terminates this line
        // (0 if the data ends without one)
        size_t match = 0;
        while (s < top) {
            if (*s == '\n') {
                match = 1;
                break;
            } else if (*s == '\r') {
                // Only look at the following byte if it lies within the
                // data, rather than relying on whatever follows the buffer.
                if (s + 1 < top && s[1] == '\n') {
                    match = 2;
                } else {
                    match = 1;
                }
                break;
            }
            s++;
        }
        size_t sub_len = s - start;
        if (args[ARG_keepends].u_bool) {
            sub_len += match;
        }
        mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, sub_len));
        s += match;
    }

    return res;
}
#endif

588
// Implements rsplit([sep[, maxsplit]]).
// Without a split limit, or with a negative one (meaning "no limit"), the
// result equals that of split(), so the call is delegated.  Otherwise the
// data is scanned from the end and at most maxsplit splits are made.
// rsplit(None, n) is not implemented; an empty separator raises ValueError.
STATIC mp_obj_t str_rsplit(size_t n_args, const mp_obj_t *args) {
    if (n_args < 3) {
        // If we don't have split limit, it doesn't matter from which side
        // we split.
        return mp_obj_str_split(n_args, args);
    }

    mp_int_t splits = mp_obj_get_int(args[2]);
    if (splits < 0) {
        // Negative limit means no limit, so delegate to split().  This must
        // be done before the list below is sized from the limit: a negative
        // value would give an undersized list and a negative fill index.
        return mp_obj_str_split(n_args, args);
    }

    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
    mp_obj_t sep = args[1];
    GET_STR_DATA_LEN(args[0], s, len);

    mp_int_t org_splits = splits;
    // Preallocate list to the max expected # of elements, as we
    // will fill it from the end.
    mp_obj_list_t *res = MP_OBJ_TO_PTR(mp_obj_new_list(splits + 1, NULL));
    mp_int_t idx = splits;

    if (sep == mp_const_none) {
        mp_not_implemented("rsplit(None,n)");
    } else {
        mp_uint_t sep_len;
        const char *sep_str = mp_obj_str_get_data(sep, &sep_len);

        if (sep_len == 0) {
            mp_raise_ValueError("empty separator");
        }

        const byte *beg = s;
        const byte *last = s + len;
        for (;;) {
            // search backwards from the end of the unprocessed region
            s = last - sep_len;
            for (;;) {
                if (splits == 0 || s < beg) {
                    break;
                } else if (memcmp(s, sep_str, sep_len) == 0) {
                    break;
                }
                s--;
            }
            if (s < beg || splits == 0) {
                // no more separators (or limit reached): the remaining head
                // of the data is the first element of the result
                res->items[idx] = mp_obj_new_str_of_type(self_type, beg, last - beg);
                break;
            }
            res->items[idx--] = mp_obj_new_str_of_type(self_type, s + sep_len, last - s - sep_len);
            last = s;
            if (splits > 0) {
                splits--;
            }
        }
        if (idx != 0) {
            // We split less parts than split limit, now go cleanup surplus
            mp_int_t used = org_splits + 1 - idx;
            memmove(res->items, &res->items[idx], used * sizeof(mp_obj_t));
            mp_seq_clear(res->items, used, res->alloc, sizeof(*res->items));
            res->len = used;
        }
    }

    return MP_OBJ_FROM_PTR(res);
}

649
// Shared implementation of find/rfind/index/rindex.
//  args[0]   object to search in
//  args[1]   object to search for; must have the same type as args[0]
//  args[2]   optional start index (None means the beginning)
//  args[3]   optional end index (None means the end)
//  direction > 0 searches from the start, otherwise from the end
//  is_index  if true a failed search raises ValueError, otherwise the
//            result of a failed search is -1
STATIC mp_obj_t str_finder(mp_uint_t n_args, const mp_obj_t *args, mp_int_t direction, bool is_index) {
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
    mp_check_self(MP_OBJ_IS_STR_OR_BYTES(args[0]));

    // check argument type
    if (mp_obj_get_type(args[1]) != self_type) {
        bad_implicit_conversion(args[1]);
    }

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

    const byte *start = haystack;
    const byte *end = haystack + haystack_len;
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true);
    }

    // An empty range (start beyond end) can never contain a match.  It must
    // not be handed to find_subbytes(), because "end - start" would become a
    // huge unsigned length and the search would run past the data.
    const byte *p = NULL;
    if (start <= end) {
        p = find_subbytes(start, end - start, needle, needle_len, direction);
    }

    if (p == NULL) {
        // not found
        if (is_index) {
            mp_raise_ValueError("substring not found");
        } else {
            return MP_OBJ_NEW_SMALL_INT(-1);
        }
    } else {
        // found
        #if MICROPY_PY_BUILTINS_STR_UNICODE
        if (self_type == &mp_type_str) {
            // for unicode strings the result is a character index
            return MP_OBJ_NEW_SMALL_INT(utf8_ptr_to_index(haystack, p));
        }
        #endif
        return MP_OBJ_NEW_SMALL_INT(p - haystack);
    }
}

689
// find(): forward search, returns -1 when the substring is absent.
STATIC mp_obj_t str_find(size_t n_args, const mp_obj_t *args) {
    const mp_int_t search_direction = 1;
    const bool raise_if_missing = false;
    return str_finder(n_args, args, search_direction, raise_if_missing);
}

693
// rfind(): backward search, returns -1 when the substring is absent.
STATIC mp_obj_t str_rfind(size_t n_args, const mp_obj_t *args) {
    const mp_int_t search_direction = -1;
    const bool raise_if_missing = false;
    return str_finder(n_args, args, search_direction, raise_if_missing);
}

697
// index(): forward search, raises ValueError when the substring is absent.
STATIC mp_obj_t str_index(size_t n_args, const mp_obj_t *args) {
    const mp_int_t search_direction = 1;
    const bool raise_if_missing = true;
    return str_finder(n_args, args, search_direction, raise_if_missing);
}

701
// rindex(): backward search, raises ValueError when the substring is absent.
STATIC mp_obj_t str_rindex(size_t n_args, const mp_obj_t *args) {
    const mp_int_t search_direction = -1;
    const bool raise_if_missing = true;
    return str_finder(n_args, args, search_direction, raise_if_missing);
}

705
// TODO: (Much) more variety in args
706
// Implements startswith(prefix[, start]).
// Returns True if the data of args[0], beginning at the optional start
// index, begins with the data of args[1].
STATIC mp_obj_t str_startswith(size_t n_args, const mp_obj_t *args) {
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
    GET_STR_DATA_LEN(args[0], str, str_len);
    // Fetch the prefix through the checked accessor instead of blindly
    // treating args[1] as a string object (it may be an int, None, ...).
    // NOTE(review): mp_obj_str_get_data() is expected to raise for objects
    // that are not str/bytes - confirm against its definition.
    mp_uint_t prefix_len;
    const char *prefix = mp_obj_str_get_data(args[1], &prefix_len);
    const byte *start = str;
    if (n_args > 2) {
        start = str_index_to_ptr(self_type, str, str_len, args[2], true);
    }
    // the prefix must fit between start and the end of the data
    if (prefix_len + (start - str) > str_len) {
        return mp_const_false;
    }
    return mp_obj_new_bool(memcmp(start, prefix, prefix_len) == 0);
}

720
// Implements endswith(suffix).
// Returns True if the data of args[0] ends with the data of args[1].
// Start/end indices are not implemented.
STATIC mp_obj_t str_endswith(size_t n_args, const mp_obj_t *args) {
    GET_STR_DATA_LEN(args[0], str, str_len);
    // Fetch the suffix through the checked accessor instead of blindly
    // treating args[1] as a string object (it may be an int, None, ...).
    // NOTE(review): mp_obj_str_get_data() is expected to raise for objects
    // that are not str/bytes - confirm against its definition.
    mp_uint_t suffix_len;
    const char *suffix = mp_obj_str_get_data(args[1], &suffix_len);
    if (n_args > 2) {
        mp_not_implemented("start/end indices");
    }

    if (suffix_len > str_len) {
        return mp_const_false;
    }
    return mp_obj_new_bool(memcmp(str + (str_len - suffix_len), suffix, suffix_len) == 0);
}

733
734
enum { LSTRIP, RSTRIP, STRIP };

735
// Shared implementation of strip/lstrip/rstrip.
//  type     one of LSTRIP, RSTRIP, STRIP
//  args[0]  object to strip
//  args[1]  optional set of characters to remove (same type as args[0]);
//           when absent, ASCII whitespace is removed
// Returns args[0] itself if nothing was stripped, otherwise a new object of
// the same type.
STATIC mp_obj_t str_uni_strip(int type, mp_uint_t n_args, const mp_obj_t *args) {
    mp_check_self(MP_OBJ_IS_STR_OR_BYTES(args[0]));
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";

    if (n_args == 1) {
        chars_to_del = whitespace;
        // Exclude the terminating NUL of the literal, otherwise '\0' bytes
        // would be treated as whitespace and stripped as well.
        chars_to_del_len = sizeof(whitespace) - 1;
    } else {
        if (mp_obj_get_type(args[1]) != self_type) {
            bad_implicit_conversion(args[1]);
        }
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
    }

    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);

    mp_uint_t first_good_char_pos = 0;
    bool first_good_char_pos_set = false;
    mp_uint_t last_good_char_pos = 0;
    mp_uint_t i = 0;
    mp_int_t delta = 1;
    if (type == RSTRIP) {
        // scan backwards from the last character
        i = orig_str_len - 1;
        delta = -1;
    }
    for (mp_uint_t len = orig_str_len; len > 0; len--) {
        // a "good" character is one that is not in the set to delete
        if (find_subbytes(chars_to_del, chars_to_del_len, &orig_str[i], 1, 1) == NULL) {
            if (!first_good_char_pos_set) {
                first_good_char_pos_set = true;
                first_good_char_pos = i;
                if (type == LSTRIP) {
                    // keep everything up to the end
                    last_good_char_pos = orig_str_len - 1;
                    break;
                } else if (type == RSTRIP) {
                    // keep everything from the beginning
                    first_good_char_pos = 0;
                    last_good_char_pos = i;
                    break;
                }
            }
            last_good_char_pos = i;
        }
        i += delta;
    }

    if (!first_good_char_pos_set) {
        // every character was stripped, return the empty str/bytes
        if (self_type == &mp_type_str) {
            return MP_OBJ_NEW_QSTR(MP_QSTR_);
        } else {
            return mp_const_empty_bytes;
        }
    }

    assert(last_good_char_pos >= first_good_char_pos);
    // +1 to accommodate the last character
    mp_uint_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
    if (stripped_len == orig_str_len) {
        // If nothing was stripped, don't bother to dup original string
        // TODO: watch out for this case when we'll get to bytearray.strip()
        assert(first_good_char_pos == 0);
        return args[0];
    }
    return mp_obj_new_str_of_type(self_type, orig_str + first_good_char_pos, stripped_len);
}

806
// strip(): remove matching characters from both ends.
STATIC mp_obj_t str_strip(size_t n_args, const mp_obj_t *args) {
    const int strip_mode = STRIP;
    return str_uni_strip(strip_mode, n_args, args);
}

810
// lstrip(): remove matching characters from the start only.
STATIC mp_obj_t str_lstrip(size_t n_args, const mp_obj_t *args) {
    const int strip_mode = LSTRIP;
    return str_uni_strip(strip_mode, n_args, args);
}

814
// rstrip(): remove matching characters from the end only.
STATIC mp_obj_t str_rstrip(size_t n_args, const mp_obj_t *args) {
    const int strip_mode = RSTRIP;
    return str_uni_strip(strip_mode, n_args, args);
}

818
819
820
#if MICROPY_PY_BUILTINS_STR_CENTER
// Implements center(width): returns the data centred in a field of the given
// width, padded with spaces.  If the data is already at least that wide (or
// width is negative) the original object is returned unchanged.
STATIC mp_obj_t str_center(mp_obj_t str_in, mp_obj_t width_in) {
    GET_STR_DATA_LEN(str_in, str, str_len);
    // Read the width as a signed value: converting a negative width straight
    // to an unsigned type would turn it into an enormous allocation request.
    mp_int_t width_val = mp_obj_get_int(width_in);
    if (width_val < 0 || str_len >= (mp_uint_t)width_val) {
        return str_in;
    }
    mp_uint_t width = (mp_uint_t)width_val;

    vstr_t vstr;
    vstr_init_len(&vstr, width);
    memset(vstr.buf, ' ', width);
    // when the padding is odd, the extra space goes on the right
    mp_uint_t left = (width - str_len) / 2;
    memcpy(vstr.buf + left, str, str_len);
    return mp_obj_new_str_from_vstr(mp_obj_get_type(str_in), &vstr);
}
#endif

Dave Hylands's avatar
Dave Hylands committed
835
836
// Takes an int arg, but only parses unsigned numbers, and only changes
// *num if at least one digit was parsed.
837
838
// Parses a run of decimal digits from [str, top).  Returns a pointer to the
// first character that was not consumed.  *num is written only when at least
// one digit was found.
STATIC const char *str_to_int(const char *str, const char *top, int *num) {
    const char *p = str;
    if (p >= top || *p < '0' || *p > '9') {
        // no digit at the start: leave *num untouched
        return p;
    }

    int value = 0;
    for (; p < top && *p >= '0' && *p <= '9'; p++) {
        value = value * 10 + (*p - '0');
    }
    *num = value;
    return p;
}

849
// True if ch is one of the alignment characters '<', '>', '=' or '^'.
STATIC bool isalignment(char ch) {
    switch (ch) {
        case '<':
        case '>':
        case '=':
        case '^':
            return true;
        default:
            return false;
    }
}

853
// True if ch is one of the presentation-type characters "bcdeEfFgGnosxX%".
STATIC bool istype(char ch) {
    if (ch == '\0') {
        // strchr() would match the terminator of the set, so exclude it
        return false;
    }
    const char *hit = strchr("bcdeEfFgGnosxX%", ch);
    return hit != NULL;
}

857
// True if arg is a bool or an int object.
STATIC bool arg_looks_integer(mp_obj_t arg) {
    if (MP_OBJ_IS_TYPE(arg, &mp_type_bool)) {
        return true;
    }
    return MP_OBJ_IS_INT(arg);
}

861
// True if arg looks like an integer (see arg_looks_integer) or, when float
// support is compiled in, is a float object.
STATIC bool arg_looks_numeric(mp_obj_t arg) {
    if (arg_looks_integer(arg)) {
        return true;
    }
#if MICROPY_PY_BUILTINS_FLOAT
    if (mp_obj_is_float(arg)) {
        return true;
    }
#endif
    return false;
}

869
// Returns arg converted to an int object if it is a float (only when float
// support is compiled in); any other object is returned unchanged.
STATIC mp_obj_t arg_as_int(mp_obj_t arg) {
    mp_obj_t result = arg;
#if MICROPY_PY_BUILTINS_FLOAT
    if (mp_obj_is_float(arg)) {
        result = mp_obj_new_int_from_float(mp_obj_float_get(arg));
    }
#endif
    return result;
}

878
// Raises the generic ValueError used for malformed format strings when terse
// error reporting is selected.  Does not return.
STATIC NORETURN void terse_str_format_value_error(void) {
    mp_raise_ValueError("bad format string");
}

882
STATIC vstr_t mp_obj_str_format_helper(const char *str, const char *top, int *arg_i, mp_uint_t n_args, const mp_obj_t *args, mp_map_t *kwargs) {
883
    vstr_t vstr;
884
885
    mp_print_t print;
    vstr_init_print(&vstr, 16, &print);
Dave Hylands's avatar
Dave Hylands committed
886

887
    for (; str < top; str++) {
Dave Hylands's avatar
Dave Hylands committed
888
889
890
        if (*str == '}') {
            str++;
            if (str < top && *str == '}') {
891
                vstr_add_byte(&vstr, '}');
Dave Hylands's avatar
Dave Hylands committed
892
893
                continue;
            }
894
895
896
            if (MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE) {
                terse_str_format_value_error();
            } else {
897
898
                mp_raise_msg(&mp_type_ValueError,
                    "single '}' encountered in format string");
899
            }
Dave Hylands's avatar
Dave Hylands committed
900
901
        }
        if (*str != '{') {
902
            vstr_add_byte(&vstr, *str);
Dave Hylands's avatar
Dave Hylands committed
903
904
905
906
907
            continue;
        }

        str++;
        if (str < top && *str == '{') {
908
            vstr_add_byte(&vstr, '{');
Dave Hylands's avatar
Dave Hylands committed
909
910
911
912
913
            continue;
        }

        // replacement_field ::=  "{" [field_name] ["!" conversion] [":" format_spec] "}"

914
915
        const char *field_name = NULL;
        const char *field_name_top = NULL;
Dave Hylands's avatar
Dave Hylands committed
916
        char conversion = '\0';