objstr.c 73.8 KB
Newer Older
1
2
3
4
5
6
/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 * Copyright (c) 2014 Paul Sokolovsky
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

28
29
30
#include <string.h>
#include <assert.h>

31
32
33
34
35
36
#include "py/nlr.h"
#include "py/unicode.h"
#include "py/objstr.h"
#include "py/objlist.h"
#include "py/runtime0.h"
#include "py/runtime.h"
37
#include "py/stackctrl.h"
38

39
STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, size_t n_args, const mp_obj_t *args, mp_obj_t dict);
40

41
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
42
STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in);
43

xyb's avatar
xyb committed
44
45
46
/******************************************************************************/
/* str                                                                        */

47
// Print the bytes str_data[0 .. str_len) to `print` as a quoted literal.
// The surrounding quote is a single quote unless the data contains a single
// quote and no double quote, in which case a double quote is used.
// The quote character, backslash, \n, \r and \t are written as backslash
// escapes; any other byte that is not printed verbatim is written as \xNN.
// When is_bytes is true, bytes >= 0x80 are also written as \xNN.
void mp_str_print_quoted(const mp_print_t *print, const byte *str_data, mp_uint_t str_len, bool is_bytes) {
    // this escapes characters, but it will be very slow to print (calling print many times)
    const byte *const end = str_data + str_len;

    // Look for quote characters; a double quote settles the choice, so the
    // scan stops as soon as one is found.
    bool seen_single = false;
    bool seen_double = false;
    const byte *p = str_data;
    while (!seen_double && p < end) {
        switch (*p) {
            case '\'':
                seen_single = true;
                break;
            case '"':
                seen_double = true;
                break;
            default:
                break;
        }
        p++;
    }

    int quote_char = (seen_single && !seen_double) ? '"' : '\'';

    mp_printf(print, "%c", quote_char);
    for (p = str_data; p < end; p++) {
        const byte ch = *p;
        if (ch == quote_char) {
            mp_printf(print, "\\%c", quote_char);
        } else if (ch == '\\') {
            mp_print_str(print, "\\\\");
        } else if (ch >= 0x20 && ch != 0x7f && (!is_bytes || ch < 0x80)) {
            // In strings, anything which is not an ascii control character
            // is printed as is; this includes the range 0x80-0xff
            // (which can be non-Latin letters, etc.)
            mp_printf(print, "%c", ch);
        } else {
            switch (ch) {
                case '\n':
                    mp_print_str(print, "\\n");
                    break;
                case '\r':
                    mp_print_str(print, "\\r");
                    break;
                case '\t':
                    mp_print_str(print, "\\t");
                    break;
                default:
                    mp_printf(print, "\\x%02x", ch);
                    break;
            }
        }
    }
    mp_printf(print, "%c", quote_char);
}

86
#if MICROPY_PY_UJSON
87
// Print the bytes str_data[0 .. str_len) to `print` as a double-quoted JSON
// string. '"' and '\' are backslash-escaped, bytes >= 32 are written
// verbatim, \n, \r and \t use their short escapes, and every other byte is
// written as \u00NN.
void mp_str_print_json(const mp_print_t *print, const byte *str_data, size_t str_len) {
    // for JSON spec, see http://www.ietf.org/rfc/rfc4627.txt
    // if we are given a valid utf8-encoded string, we will print it in a JSON-conforming way
    const byte *const end = str_data + str_len;
    const byte *p = str_data;

    mp_print_str(print, "\"");
    while (p < end) {
        const byte ch = *p++;
        if (ch == '"' || ch == '\\') {
            mp_printf(print, "\\%c", ch);
            continue;
        }
        if (ch >= 32) {
            // this will handle normal and utf-8 encoded chars
            mp_printf(print, "%c", ch);
            continue;
        }
        switch (ch) {
            case '\n':
                mp_print_str(print, "\\n");
                break;
            case '\r':
                mp_print_str(print, "\\r");
                break;
            case '\t':
                mp_print_str(print, "\\t");
                break;
            default:
                // this will handle the remaining control chars
                mp_printf(print, "\\u%04x", ch);
                break;
        }
    }
    mp_print_str(print, "\"");
}
#endif

112
// Print a str/bytes object according to `kind`:
//  - PRINT_JSON (only when MICROPY_PY_UJSON is enabled): JSON-quoted output.
//  - PRINT_RAW, or PRINT_STR of a non-bytes object in a non-unicode build:
//    the raw data, unquoted.
//  - anything else: a quoted literal, prefixed with "b" when is_bytes is set.
STATIC void str_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
    GET_STR_DATA_LEN(self_in, str_data, str_len);
    #if MICROPY_PY_UJSON
    if (kind == PRINT_JSON) {
        mp_str_print_json(print, str_data, str_len);
        return;
    }
    #endif
    #if !MICROPY_PY_BUILTINS_STR_UNICODE
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &mp_type_bytes);
    #else
    // NOTE(review): with unicode enabled this printer is presumably only
    // installed for bytes objects -- confirm against the type definitions.
    bool is_bytes = true;
    #endif
    if (kind == PRINT_RAW || (!MICROPY_PY_BUILTINS_STR_UNICODE && kind == PRINT_STR && !is_bytes)) {
        // The "*" precision consumes an int from the variadic arguments, so
        // the length is converted explicitly instead of being passed as a
        // wider integer type.
        mp_printf(print, "%.*s", (int)str_len, str_data);
    } else {
        if (is_bytes) {
            mp_print_str(print, "b");
        }
        mp_str_print_quoted(print, str_data, str_len, is_bytes);
    }
}

135
mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
136
137
138
139
140
141
#if MICROPY_CPYTHON_COMPAT
    if (n_kw != 0) {
        mp_arg_error_unimpl_kw();
    }
#endif

142
143
    mp_arg_check_num(n_args, n_kw, 0, 3, false);

144
145
146
147
    switch (n_args) {
        case 0:
            return MP_OBJ_NEW_QSTR(MP_QSTR_);

148
        case 1: {
149
            vstr_t vstr;
150
151
152
            mp_print_t print;
            vstr_init_print(&vstr, 16, &print);
            mp_obj_print_helper(&print, args[0], PRINT_STR);
153
            return mp_obj_new_str_from_vstr(type, &vstr);
154
155
        }

156
        default: // 2 or 3 args
157
            // TODO: validate 2nd/3rd args
158
159
160
            if (MP_OBJ_IS_TYPE(args[0], &mp_type_bytes)) {
                GET_STR_DATA_LEN(args[0], str_data, str_len);
                GET_STR_HASH(args[0], str_hash);
161
162
163
                if (str_hash == 0) {
                    str_hash = qstr_compute_hash(str_data, str_len);
                }
164
                mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(type, NULL, str_len));
165
166
                o->data = str_data;
                o->hash = str_hash;
167
                return MP_OBJ_FROM_PTR(o);
168
169
170
171
            } else {
                mp_buffer_info_t bufinfo;
                mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
                return mp_obj_new_str(bufinfo.buf, bufinfo.len, false);
172
173
174
175
            }
    }
}

176
// Constructor for bytes objects.
//  - no argument: returns the empty bytes object.
//  - str plus 1 or 2 further arguments: returns a bytes object that uses the
//    data of the str (the extra arguments are not inspected).
//  - small int n: returns n zero bytes; a negative n raises ValueError.
//  - object with the buffer protocol: returns a copy of its buffer.
//  - any other iterable: each item is converted to an int and appended as a
//    byte (with MICROPY_CPYTHON_COMPAT, values outside 0..255 raise ValueError).
// Any other argument count raises TypeError. With MICROPY_CPYTHON_COMPAT
// enabled, keyword arguments are rejected.
STATIC mp_obj_t bytes_make_new(const mp_obj_type_t *type_in, size_t n_args, size_t n_kw, const mp_obj_t *args) {
    (void)type_in;

    #if MICROPY_CPYTHON_COMPAT
    if (n_kw != 0) {
        mp_arg_error_unimpl_kw();
    }
    #else
    (void)n_kw;
    #endif

    if (n_args == 0) {
        return mp_const_empty_bytes;
    }

    if (MP_OBJ_IS_STR(args[0])) {
        if (n_args < 2 || n_args > 3) {
            goto wrong_args;
        }
        GET_STR_DATA_LEN(args[0], str_data, str_len);
        GET_STR_HASH(args[0], str_hash);
        if (str_hash == 0) {
            str_hash = qstr_compute_hash(str_data, str_len);
        }
        // The new object takes over the data pointer and hash of the str.
        mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(&mp_type_bytes, NULL, str_len));
        o->data = str_data;
        o->hash = str_hash;
        return MP_OBJ_FROM_PTR(o);
    }

    if (n_args > 1) {
        goto wrong_args;
    }

    if (MP_OBJ_IS_SMALL_INT(args[0])) {
        // Keep the count signed so a negative value is detected instead of
        // being converted to a huge unsigned allocation size.
        mp_int_t len = MP_OBJ_SMALL_INT_VALUE(args[0]);
        if (len < 0) {
            mp_raise_ValueError("negative count");
        }
        vstr_t vstr;
        vstr_init_len(&vstr, len);
        memset(vstr.buf, 0, len);
        return mp_obj_new_str_from_vstr(&mp_type_bytes, &vstr);
    }

    // check if argument has the buffer protocol
    mp_buffer_info_t bufinfo;
    if (mp_get_buffer(args[0], &bufinfo, MP_BUFFER_READ)) {
        return mp_obj_new_str_of_type(&mp_type_bytes, bufinfo.buf, bufinfo.len);
    }

    vstr_t vstr;
    // Try to create array of exact len if initializer len is known
    mp_obj_t len_in = mp_obj_len_maybe(args[0]);
    if (len_in == MP_OBJ_NULL) {
        vstr_init(&vstr, 16);
    } else {
        mp_int_t len = MP_OBJ_SMALL_INT_VALUE(len_in);
        vstr_init(&vstr, len);
    }

    mp_obj_t iterable = mp_getiter(args[0]);
    mp_obj_t item;
    while ((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
        mp_int_t val = mp_obj_get_int(item);
        #if MICROPY_CPYTHON_COMPAT
        if (val < 0 || val > 255) {
            mp_raise_ValueError("bytes value out of range");
        }
        #endif
        vstr_add_byte(&vstr, val);
    }

    return mp_obj_new_str_from_vstr(&mp_type_bytes, &vstr);

wrong_args:
    mp_raise_TypeError("wrong number of arguments");
}

252
253
// like strstr but with specified length and allows \0 bytes
// TODO replace with something more efficient/standard
254
// Search haystack[0 .. hlen) for the nlen bytes at needle.
// With direction > 0 the candidate positions are tried from offset 0 upwards;
// otherwise from offset hlen - nlen, stepping by `direction` each time.
// Returns a pointer to the first match encountered, or NULL if there is none
// (including when the needle is longer than the haystack).
const byte *find_subbytes(const byte *haystack, mp_uint_t hlen, const byte *needle, mp_uint_t nlen, mp_int_t direction) {
    if (hlen < nlen) {
        return NULL;
    }

    // Offsets of the first and last candidate positions for this direction.
    const mp_uint_t last_offset = hlen - nlen;
    mp_uint_t pos = (direction > 0) ? 0 : last_offset;
    const mp_uint_t stop = (direction > 0) ? last_offset : 0;

    for (;;) {
        if (memcmp(&haystack[pos], needle, nlen) == 0) {
            // found
            return haystack + pos;
        }
        if (pos == stop) {
            // not found
            return NULL;
        }
        pos += direction;
    }
}

279
280
281
// Note: this function is used to check if an object is a str or bytes, which
// works because both those types use it as their binary_op method.  Revisit
// MP_OBJ_IS_STR_OR_BYTES if this fact changes.
282
// Binary operator handler shared by str and bytes.
// Handles %, *, +, +=, `in`, and the comparisons ==, <, <=, >, >=.
// Returns MP_OBJ_NULL when the operation or operand combination is not
// supported.
mp_obj_t mp_obj_str_binary_op(mp_uint_t op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
    // check for modulo
    if (op == MP_BINARY_OP_MODULO) {
        // By default the rhs is the single format argument; a tuple supplies
        // several arguments and a dict supplies the mapping.
        mp_obj_t *fmt_args = &rhs_in;
        mp_uint_t n_fmt_args = 1;
        mp_obj_t fmt_dict = MP_OBJ_NULL;
        if (MP_OBJ_IS_TYPE(rhs_in, &mp_type_tuple)) {
            // TODO: Support tuple subclasses?
            mp_obj_tuple_get(rhs_in, &n_fmt_args, &fmt_args);
        } else if (MP_OBJ_IS_TYPE(rhs_in, &mp_type_dict)) {
            fmt_dict = rhs_in;
        }
        return str_modulo_format(lhs_in, n_fmt_args, fmt_args, fmt_dict);
    }

    // from now on we need lhs type and data, so extract them
    mp_obj_type_t *lhs_type = mp_obj_get_type(lhs_in);
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);

    // check for multiply
    if (op == MP_BINARY_OP_MULTIPLY) {
        mp_int_t count;
        if (!mp_obj_get_int_maybe(rhs_in, &count)) {
            return MP_OBJ_NULL; // op not supported
        }
        if (count <= 0) {
            // empty str or empty bytes, matching the lhs type
            return (lhs_type == &mp_type_str) ? MP_OBJ_NEW_QSTR(MP_QSTR_) : mp_const_empty_bytes;
        }
        vstr_t repeated;
        vstr_init_len(&repeated, lhs_len * count);
        mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, count, repeated.buf);
        return mp_obj_new_str_from_vstr(lhs_type, &repeated);
    }

    // From now on all operations allow:
    //    - str with str
    //    - bytes with bytes
    //    - bytes with bytearray
    //    - bytes with array.array
    // To do this efficiently we use the buffer protocol to extract the raw
    // data for the rhs, but only if the lhs is a bytes object.
    //
    // NOTE: CPython does not allow comparison between bytes and array.array
    // (even if the array is of type 'b'), even though it allows addition of
    // such types.  We are not compatible with this (we do allow comparison
    // of bytes with anything that has the buffer protocol).  It would be
    // easy to "fix" this with a bit of extra logic below, but it costs code
    // size and execution time so we don't.

    const byte *rhs_data;
    mp_uint_t rhs_len;
    if (lhs_type == mp_obj_get_type(rhs_in)) {
        GET_STR_DATA_LEN(rhs_in, same_type_data, same_type_len);
        rhs_data = same_type_data;
        rhs_len = same_type_len;
    } else {
        if (lhs_type != &mp_type_bytes) {
            // incompatible types
            return MP_OBJ_NULL; // op not supported
        }
        mp_buffer_info_t bufinfo;
        if (!mp_get_buffer(rhs_in, &bufinfo, MP_BUFFER_READ)) {
            return MP_OBJ_NULL; // op not supported
        }
        rhs_data = bufinfo.buf;
        rhs_len = bufinfo.len;
    }

    switch (op) {
        case MP_BINARY_OP_ADD:
        case MP_BINARY_OP_INPLACE_ADD: {
            // Adding an empty operand returns the other operand unchanged.
            if (lhs_len == 0) {
                return rhs_in;
            }
            if (rhs_len == 0) {
                return lhs_in;
            }

            vstr_t joined;
            vstr_init_len(&joined, lhs_len + rhs_len);
            memcpy(joined.buf, lhs_data, lhs_len);
            memcpy(joined.buf + lhs_len, rhs_data, rhs_len);
            return mp_obj_new_str_from_vstr(lhs_type, &joined);
        }

        case MP_BINARY_OP_IN: {
            /* NOTE `a in b` is `b.__contains__(a)` */
            const byte *hit = find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len, 1);
            return mp_obj_new_bool(hit != NULL);
        }

        //case MP_BINARY_OP_NOT_EQUAL: // This is never passed here
        case MP_BINARY_OP_EQUAL: // This will be passed only for bytes, str is dealt with in mp_obj_equal()
        case MP_BINARY_OP_LESS:
        case MP_BINARY_OP_LESS_EQUAL:
        case MP_BINARY_OP_MORE:
        case MP_BINARY_OP_MORE_EQUAL:
            return mp_obj_new_bool(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));

        default:
            return MP_OBJ_NULL; // op not supported
    }
}

386
387
#if !MICROPY_PY_BUILTINS_STR_UNICODE
// objstrunicode defines own version
388
const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len,
389
                             mp_obj_t index, bool is_slice) {
390
    mp_uint_t index_val = mp_get_index(type, self_len, index, is_slice);
391
392
    return self_data + index_val;
}
393
#endif
394

395
396
// This is used for both bytes and 8-bit strings. This is not used for unicode strings.
// Subscript handler. Only loads are supported (value == MP_OBJ_SENTINEL);
// for anything else MP_OBJ_NULL is returned.
//  - slice index (when MICROPY_PY_BUILTINS_SLICE is enabled): returns a new
//    object of the same type holding the selected range; only step 1 slices
//    are implemented.
//  - integer index: returns the byte value as a small int for bytes (and
//    always when unicode is enabled), otherwise a one-character str.
STATIC mp_obj_t bytes_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
    mp_obj_type_t *type = mp_obj_get_type(self_in);
    GET_STR_DATA_LEN(self_in, self_data, self_len);

    if (value != MP_OBJ_SENTINEL) {
        // not a load
        return MP_OBJ_NULL; // op not supported
    }

    // load
#if MICROPY_PY_BUILTINS_SLICE
    if (MP_OBJ_IS_TYPE(index, &mp_type_slice)) {
        mp_bound_slice_t bounds;
        if (!mp_seq_get_fast_slice_indexes(self_len, index, &bounds)) {
            mp_not_implemented("only slices with step=1 (aka None) are supported");
        }
        const size_t slice_len = bounds.stop - bounds.start;
        return mp_obj_new_str_of_type(type, self_data + bounds.start, slice_len);
    }
#endif

    mp_uint_t pos = mp_get_index(type, self_len, index, false);
    // If we have unicode enabled the type will always be bytes, so the
    // str branch is only reachable in non-unicode builds.
    if (!MICROPY_PY_BUILTINS_STR_UNICODE && type != &mp_type_bytes) {
        return mp_obj_new_str((char*)&self_data[pos], 1, true);
    }
    return MP_OBJ_NEW_SMALL_INT(self_data[pos]);
}

422
// Implements sep.join(iterable) for str and bytes.
// `arg` may be a tuple or a list; anything else is first converted to a list.
// Every item must have exactly the same type as self, otherwise TypeError is
// raised. Returns a new object of self's type.
STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
    mp_check_self(MP_OBJ_IS_STR_OR_BYTES(self_in));
    const mp_obj_type_t *self_type = mp_obj_get_type(self_in);

    // get separation string
    GET_STR_DATA_LEN(self_in, sep_str, sep_len);

    // get the items to join as a flat array
    mp_uint_t n_items;
    mp_obj_t *items;
    if (MP_OBJ_IS_TYPE(arg, &mp_type_tuple)) {
        mp_obj_tuple_get(arg, &n_items, &items);
    } else if (MP_OBJ_IS_TYPE(arg, &mp_type_list)) {
        mp_obj_list_get(arg, &n_items, &items);
    } else {
        // arg is not a list, try to convert it to one
        // TODO: Try to optimize?
        arg = mp_type_list.make_new(&mp_type_list, 1, 0, &arg);
        mp_obj_list_get(arg, &n_items, &items);
    }

    // first pass: type-check the items and count the required length
    mp_uint_t total_len = 0;
    for (mp_uint_t idx = 0; idx < n_items; idx++) {
        if (mp_obj_get_type(items[idx]) != self_type) {
            mp_raise_TypeError(
                "join expects a list of str/bytes objects consistent with self object");
        }
        if (idx != 0) {
            total_len += sep_len;
        }
        GET_STR_LEN(items[idx], item_len);
        total_len += item_len;
    }

    // second pass: copy separators and items into the output buffer
    vstr_t vstr;
    vstr_init_len(&vstr, total_len);
    byte *out = (byte*)vstr.buf;
    for (mp_uint_t idx = 0; idx < n_items; idx++) {
        if (idx != 0) {
            memcpy(out, sep_str, sep_len);
            out += sep_len;
        }
        GET_STR_DATA_LEN(items[idx], item_data, item_len);
        memcpy(out, item_data, item_len);
        out += item_len;
    }

    // return joined string
    return mp_obj_new_str_from_vstr(self_type, &vstr);
}

475
mp_obj_t mp_obj_str_split(size_t n_args, const mp_obj_t *args) {
476
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
477
    mp_int_t splits = -1;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
478
479
480
481
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
482
            splits = mp_obj_get_int(args[2]);
Paul Sokolovsky's avatar
Paul Sokolovsky committed
483
484
        }
    }
485

Paul Sokolovsky's avatar
Paul Sokolovsky committed
486
    mp_obj_t res = mp_obj_new_list(0, NULL);
487
488
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;
489
490
491
492
493

    if (sep == mp_const_none) {
        // sep not given, so separate on whitespace

        // Initial whitespace is not counted as split, so we pre-do it
494
        while (s < top && unichar_isspace(*s)) s++;
495
496
        while (s < top && splits != 0) {
            const byte *start = s;
497
            while (s < top && !unichar_isspace(*s)) s++;
498
            mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start));
499
500
501
            if (s >= top) {
                break;
            }
502
            while (s < top && unichar_isspace(*s)) s++;
503
504
505
            if (splits > 0) {
                splits--;
            }
Paul Sokolovsky's avatar
Paul Sokolovsky committed
506
507
        }

508
        if (s < top) {
509
            mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, s, top - s));
510
511
512
513
        }

    } else {
        // sep given
514
        if (mp_obj_get_type(sep) != self_type) {
515
            bad_implicit_conversion(sep);
516
        }
517

518
        mp_uint_t sep_len;
519
520
521
        const char *sep_str = mp_obj_str_get_data(sep, &sep_len);

        if (sep_len == 0) {
522
            mp_raise_ValueError("empty separator");
523
524
525
526
527
528
529
530
531
532
533
534
535
        }

        for (;;) {
            const byte *start = s;
            for (;;) {
                if (splits == 0 || s + sep_len > top) {
                    s = top;
                    break;
                } else if (memcmp(s, sep_str, sep_len) == 0) {
                    break;
                }
                s++;
            }
536
            mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start));
537
538
539
540
541
542
543
544
            if (s >= top) {
                break;
            }
            s += sep_len;
            if (splits > 0) {
                splits--;
            }
        }
Paul Sokolovsky's avatar
Paul Sokolovsky committed
545
546
547
548
549
    }

    return res;
}

550
#if MICROPY_PY_BUILTINS_STR_SPLITLINES
551
// Implements splitlines(keepends=False) for str and bytes.
// Lines are terminated by "\n", "\r" or "\r\n". Returns a list of objects of
// self's type; the terminators are included in the items only when keepends
// is true.
STATIC mp_obj_t str_splitlines(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
    enum { ARG_keepends };
    static const mp_arg_t allowed_args[] = {
        { MP_QSTR_keepends, MP_ARG_BOOL, {.u_bool = false} },
    };

    // parse args
    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
    mp_arg_parse_all(n_args - 1, pos_args + 1, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);

    const mp_obj_type_t *self_type = mp_obj_get_type(pos_args[0]);
    mp_obj_t res = mp_obj_new_list(0, NULL);

    GET_STR_DATA_LEN(pos_args[0], s, len);
    const byte *top = s + len;

    while (s < top) {
        const byte *start = s;
        // length of the line terminator found (0 if the data ended first)
        size_t match = 0;
        while (s < top) {
            if (*s == '\n') {
                match = 1;
                break;
            } else if (*s == '\r') {
                // Only look at the following byte if it is still inside the
                // data, so a trailing '\r' never reads past `top`.
                if (s + 1 < top && s[1] == '\n') {
                    match = 2;
                } else {
                    match = 1;
                }
                break;
            }
            s++;
        }
        size_t sub_len = s - start;
        if (args[ARG_keepends].u_bool) {
            sub_len += match;
        }
        mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, sub_len));
        s += match;
    }

    return res;
}
#endif

596
// Implements rsplit([sep[, maxsplit]]) for str and bytes.
// args[0] is self, args[1] the separator, args[2] the split limit.
// Without a limit, or with a negative limit, the result equals that of
// split(), so the call is delegated to mp_obj_str_split(). Otherwise the
// data is split from the right at most `maxsplit` times.
// rsplit(None, n) with n >= 0 is not implemented; an empty separator raises
// ValueError.
STATIC mp_obj_t str_rsplit(size_t n_args, const mp_obj_t *args) {
    if (n_args < 3) {
        // If we don't have split limit, it doesn't matter from which side
        // we split.
        return mp_obj_str_split(n_args, args);
    }
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
    mp_obj_t sep = args[1];
    GET_STR_DATA_LEN(args[0], s, len);

    mp_int_t splits = mp_obj_get_int(args[2]);
    if (splits < 0) {
        // A negative limit means "no limit". The code below sizes and indexes
        // the result list from the limit, so it must never see a negative
        // value; without a limit the split direction does not matter.
        return mp_obj_str_split(n_args, args);
    }
    mp_int_t org_splits = splits;
    // Preallocate list to the max expected # of elements, as we
    // will fill it from the end.
    mp_obj_list_t *res = MP_OBJ_TO_PTR(mp_obj_new_list(splits + 1, NULL));
    mp_int_t idx = splits;

    if (sep == mp_const_none) {
        mp_not_implemented("rsplit(None,n)");
    } else {
        mp_uint_t sep_len;
        const char *sep_str = mp_obj_str_get_data(sep, &sep_len);

        if (sep_len == 0) {
            mp_raise_ValueError("empty separator");
        }

        const byte *beg = s;
        const byte *last = s + len;
        for (;;) {
            // scan backwards from `last` for the next separator
            s = last - sep_len;
            for (;;) {
                if (splits == 0 || s < beg) {
                    break;
                } else if (memcmp(s, sep_str, sep_len) == 0) {
                    break;
                }
                s--;
            }
            if (s < beg || splits == 0) {
                // no separator left to use: the remaining head is the first item
                res->items[idx] = mp_obj_new_str_of_type(self_type, beg, last - beg);
                break;
            }
            res->items[idx--] = mp_obj_new_str_of_type(self_type, s + sep_len, last - s - sep_len);
            last = s;
            if (splits > 0) {
                splits--;
            }
        }
        if (idx != 0) {
            // We split less parts than split limit, now go cleanup surplus
            mp_int_t used = org_splits + 1 - idx;
            memmove(res->items, &res->items[idx], used * sizeof(mp_obj_t));
            mp_seq_clear(res->items, used, res->alloc, sizeof(*res->items));
            res->len = used;
        }
    }

    return MP_OBJ_FROM_PTR(res);
}

657
// Shared implementation of find/rfind/index/rindex.
// args[0] is self, args[1] the substring, args[2]/args[3] the optional start
// and end indices (None means "not given").
// `direction` is passed on to find_subbytes (> 0 searches forwards, otherwise
// backwards). When the substring is not found, ValueError is raised if
// is_index is true and -1 is returned otherwise. On success the position of
// the match is returned as a small int (a character index for unicode str, a
// byte offset otherwise).
STATIC mp_obj_t str_finder(size_t n_args, const mp_obj_t *args, mp_int_t direction, bool is_index) {
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
    mp_check_self(MP_OBJ_IS_STR_OR_BYTES(args[0]));

    // check argument type
    if (mp_obj_get_type(args[1]) != self_type) {
        bad_implicit_conversion(args[1]);
    }

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

    const byte *start = haystack;
    const byte *end = haystack + haystack_len;
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true);
    }

    // An end before the start describes an empty search range. Searching it
    // would pass a negative length converted to a huge unsigned value to
    // find_subbytes, so treat this case as "not found" directly.
    const byte *p = NULL;
    if (start <= end) {
        p = find_subbytes(start, end - start, needle, needle_len, direction);
    }

    if (p == NULL) {
        // not found
        if (is_index) {
            mp_raise_ValueError("substring not found");
        } else {
            return MP_OBJ_NEW_SMALL_INT(-1);
        }
    } else {
        // found
        #if MICROPY_PY_BUILTINS_STR_UNICODE
        if (self_type == &mp_type_str) {
            return MP_OBJ_NEW_SMALL_INT(utf8_ptr_to_index(haystack, p));
        }
        #endif
        return MP_OBJ_NEW_SMALL_INT(p - haystack);
    }
}

697
STATIC mp_obj_t str_find(size_t n_args, const mp_obj_t *args) {
698
    return str_finder(n_args, args, 1, false);
699
700
}

701
STATIC mp_obj_t str_rfind(size_t n_args, const mp_obj_t *args) {
702
703
704
    return str_finder(n_args, args, -1, false);
}

705
STATIC mp_obj_t str_index(size_t n_args, const mp_obj_t *args) {
706
707
708
    return str_finder(n_args, args, 1, true);
}

709
STATIC mp_obj_t str_rindex(size_t n_args, const mp_obj_t *args) {
710
    return str_finder(n_args, args, -1, true);
711
712
}

713
// TODO: (Much) more variety in args
714
// Implements startswith(prefix[, start]).
// args[0] is self, args[1] the prefix, args[2] the optional start index.
// Returns False if the prefix does not fit between the start position and
// the end of the data, otherwise whether the bytes there equal the prefix.
STATIC mp_obj_t str_startswith(size_t n_args, const mp_obj_t *args) {
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
    GET_STR_DATA_LEN(args[0], str, str_len);
    GET_STR_DATA_LEN(args[1], prefix, prefix_len);

    // position at which the comparison starts
    const byte *start = (n_args > 2)
        ? str_index_to_ptr(self_type, str, str_len, args[2], true)
        : str;

    if (prefix_len + (start - str) > str_len) {
        return mp_const_false;
    }

    const bool matches = memcmp(start, prefix, prefix_len) == 0;
    return mp_obj_new_bool(matches);
}

728
// Implements endswith(suffix).
// args[0] is self, args[1] the suffix. Start/end indices are not implemented.
// Returns False if the suffix is longer than the data, otherwise whether the
// last suffix_len bytes equal the suffix.
STATIC mp_obj_t str_endswith(size_t n_args, const mp_obj_t *args) {
    GET_STR_DATA_LEN(args[0], str, str_len);
    GET_STR_DATA_LEN(args[1], suffix, suffix_len);

    if (n_args > 2) {
        mp_not_implemented("start/end indices");
    }

    if (suffix_len > str_len) {
        return mp_const_false;
    }

    const byte *tail = str + (str_len - suffix_len);
    const bool matches = memcmp(tail, suffix, suffix_len) == 0;
    return mp_obj_new_bool(matches);
}

741
742
enum { LSTRIP, RSTRIP, STRIP };

743
STATIC mp_obj_t str_uni_strip(int type, size_t n_args, const mp_obj_t *args) {
744
    mp_check_self(MP_OBJ_IS_STR_OR_BYTES(args[0]));
745
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
746
747
748
749

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";
xbe's avatar
xbe committed
750
751
752

    if (n_args == 1) {
        chars_to_del = whitespace;
753
        chars_to_del_len = sizeof(whitespace);
xbe's avatar
xbe committed
754
    } else {
755
        if (mp_obj_get_type(args[1]) != self_type) {
756
            bad_implicit_conversion(args[1]);
757
        }
758
759
760
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
xbe's avatar
xbe committed
761
762
    }

763
    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);
xbe's avatar
xbe committed
764

765
    mp_uint_t first_good_char_pos = 0;
xbe's avatar
xbe committed
766
    bool first_good_char_pos_set = false;
767
768
769
    mp_uint_t last_good_char_pos = 0;
    mp_uint_t i = 0;
    mp_int_t delta = 1;
770
771
772
773
    if (type == RSTRIP) {
        i = orig_str_len - 1;
        delta = -1;
    }
774
    for (mp_uint_t len = orig_str_len; len > 0; len--) {
775
        if (find_subbytes(chars_to_del, chars_to_del_len, &orig_str[i], 1, 1) == NULL) {
xbe's avatar
xbe committed
776
            if (!first_good_char_pos_set) {
777
                first_good_char_pos_set = true;
xbe's avatar
xbe committed
778
                first_good_char_pos = i;
779
780
781
                if (type == LSTRIP) {
                    last_good_char_pos = orig_str_len - 1;
                    break;
782
783
784
785
                } else if (type == RSTRIP) {
                    first_good_char_pos = 0;
                    last_good_char_pos = i;
                    break;
786
                }
xbe's avatar
xbe committed
787
            }
788
            last_good_char_pos = i;
xbe's avatar
xbe committed
789
        }
790
        i += delta;
xbe's avatar
xbe committed
791
792
    }

793
    if (!first_good_char_pos_set) {
794
        // string is all whitespace, return ''
795
796
797
798
799
        if (self_type == &mp_type_str) {
            return MP_OBJ_NEW_QSTR(MP_QSTR_);
        } else {
            return mp_const_empty_bytes;
        }
xbe's avatar
xbe committed
800
801
802
803
    }

    assert(last_good_char_pos >= first_good_char_pos);
    //+1 to accomodate the last character
804
    mp_uint_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
805
806
807
808
809
810
    if (stripped_len == orig_str_len) {
        // If nothing was stripped, don't bother to dup original string
        // TODO: watch out for this case when we'll get to bytearray.strip()
        assert(first_good_char_pos == 0);
        return args[0];
    }
811
    return mp_obj_new_str_of_type(self_type, orig_str + first_good_char_pos, stripped_len);
xbe's avatar
xbe committed
812
813
}

814
STATIC mp_obj_t str_strip(size_t n_args, const mp_obj_t *args) {
815
816
817
    return str_uni_strip(STRIP, n_args, args);
}

818
STATIC mp_obj_t str_lstrip(size_t n_args, const mp_obj_t *args) {
819
820
821
    return str_uni_strip(LSTRIP, n_args, args);
}

822
STATIC mp_obj_t str_rstrip(size_t n_args, const mp_obj_t *args) {
823
824
825
    return str_uni_strip(RSTRIP, n_args, args);
}

826
827
828
#if MICROPY_PY_BUILTINS_STR_CENTER
// Implements center(width): returns the data centred in a field of `width`
// bytes padded with spaces. If the data is already at least `width` bytes
// long (which includes every zero or negative width) the original object is
// returned unchanged. When the padding cannot be split evenly, the extra
// space goes to the right.
STATIC mp_obj_t str_center(mp_obj_t str_in, mp_obj_t width_in) {
    GET_STR_DATA_LEN(str_in, str, str_len);
    // Keep the width signed so that a negative value is recognised rather
    // than being converted to a huge unsigned field size.
    mp_int_t width = mp_obj_get_int(width_in);
    if (width < 0 || str_len >= (mp_uint_t)width) {
        return str_in;
    }

    vstr_t vstr;
    vstr_init_len(&vstr, width);
    memset(vstr.buf, ' ', width);
    mp_uint_t left = ((mp_uint_t)width - str_len) / 2;
    memcpy(vstr.buf + left, str, str_len);
    return mp_obj_new_str_from_vstr(mp_obj_get_type(str_in), &vstr);
}
#endif

Dave Hylands's avatar
Dave Hylands committed
843
844
// Takes an int arg, but only parses unsigned numbers, and only changes
// *num if at least one digit was parsed.
845
846
// Parse a run of decimal digits starting at `str` and not reaching `top`.
// If at least one digit is present, its value is stored in *num; otherwise
// *num is left untouched. Returns a pointer to the first character that was
// not consumed.
STATIC const char *str_to_int(const char *str, const char *top, int *num) {
    const char *p = str;
    int value = 0;

    while (p < top && '0' <= *p && *p <= '9') {
        value = value * 10 + (*p - '0');
        p++;
    }

    if (p != str) {
        // at least one digit was parsed
        *num = value;
    }
    return p;
}

857
// Return true if ch is one of the characters '<', '>', '=' or '^'.
STATIC bool isalignment(char ch) {
    if (ch == '\0') {
        // strchr would match the string terminator, so reject NUL explicitly
        return false;
    }
    return strchr("<>=^", ch) != NULL;
}

861
// Return true if ch is one of the characters in "bcdeEfFgGnosxX%".
STATIC bool istype(char ch) {
    if (ch == '\0') {
        // strchr would match the string terminator, so reject NUL explicitly
        return false;
    }
    return strchr("bcdeEfFgGnosxX%", ch) != NULL;
}

865
// Return true if arg is a bool object or an int object.
STATIC bool arg_looks_integer(mp_obj_t arg) {
    if (MP_OBJ_IS_TYPE(arg, &mp_type_bool)) {
        return true;
    }
    return MP_OBJ_IS_INT(arg);
}

869
// Return true if arg looks like an integer (see arg_looks_integer) or, when
// float support is compiled in, is a float object.
STATIC bool arg_looks_numeric(mp_obj_t arg) {
    if (arg_looks_integer(arg)) {
        return true;
    }
#if MICROPY_PY_BUILTINS_FLOAT
    if (mp_obj_is_float(arg)) {
        return true;
    }
#endif
    return false;
}

877
// Return arg converted to an int object if it is a float (only when float
// support is compiled in); any other object is returned unchanged.
STATIC mp_obj_t arg_as_int(mp_obj_t arg) {
#if MICROPY_PY_BUILTINS_FLOAT
    if (!mp_obj_is_float(arg)) {
        return arg;
    }
    return mp_obj_new_int_from_float(mp_obj_float_get(arg));
#else
    return arg;
#endif
}

886
#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
887
STATIC NORETURN void terse_str_format_value_error(void) {
888
    mp_raise_ValueError("bad format string");
889
}
890
891
892
893
#else
// define to nothing to improve coverage
#define terse_str_format_value_error()
#endif
894

895
STATIC vstr_t mp_obj_str_format_helper(const char *str, const char *top, int *arg_i, size_t n_args, const mp_obj_t *args, mp_map_t *kwargs) {
896
    vstr_t vstr;
897
898
    mp_print_t print;
    vstr_init_print(&vstr, 16, &print);
Dave Hylands's avatar
Dave Hylands committed
899

900
    for (; str < top; str++) {
Dave Hylands's avatar
Dave Hylands committed
901
902
903
        if (*str == '}') {
            str++;
            if (str < top && *str == '}') {
904
                vstr_add_byte(&vstr, '}');
Dave Hylands's avatar
Dave Hylands committed
905
906
                continue;
            }
907
908
909
            if (MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE) {
                terse_str_format_value_error();
            } else {
910
                mp_raise_ValueError("single '}' encountered in format string");