/*
 * This file is part of the MicroPython project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 * Copyright (c) 2014 Paul Sokolovsky
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

28 29 30
#include <string.h>
#include <assert.h>

31 32 33 34
#include "py/unicode.h"
#include "py/objstr.h"
#include "py/objlist.h"
#include "py/runtime.h"
35
#include "py/stackctrl.h"
36

37
STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, size_t n_args, const mp_obj_t *args, mp_obj_t dict);
38

39
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf);
40
STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in);
41

xyb's avatar
xyb committed
42 43 44
/******************************************************************************/
/* str                                                                        */

45
// Print str_data[0..str_len) surrounded by quotes, escaping the quote
// character, backslashes and non-printable bytes.  When is_bytes is true,
// bytes >= 0x80 are also emitted as \xNN escapes; otherwise they are passed
// through unchanged.  Note: output is produced one character at a time, so
// this is slow on printers with a high per-call cost.
void mp_str_print_quoted(const mp_print_t *print, const byte *str_data, size_t str_len, bool is_bytes) {
    const byte *const end = str_data + str_len;

    // Pick the delimiter: a single quote unless the data contains single
    // quotes and no double quote.  The scan stops at the first double quote
    // because from then on the delimiter is a single quote regardless.
    bool seen_single = false;
    bool seen_double = false;
    for (const byte *p = str_data; p < end && !seen_double; p++) {
        switch (*p) {
            case '\'':
                seen_single = true;
                break;
            case '"':
                seen_double = true;
                break;
            default:
                break;
        }
    }
    const int quote_char = (seen_single && !seen_double) ? '"' : '\'';

    mp_printf(print, "%c", quote_char);
    for (const byte *p = str_data; p < end; p++) {
        const byte ch = *p;

        if (ch == quote_char) {
            mp_printf(print, "\\%c", quote_char);
            continue;
        }
        if (ch == '\\') {
            mp_print_str(print, "\\\\");
            continue;
        }

        // In strings, anything which is not an ASCII control character is
        // printed as is; this includes the range 0x80-0xff (which can be
        // part of non-Latin letters, etc.).
        const bool printable = ch >= 0x20 && ch != 0x7f && (!is_bytes || ch < 0x80);
        if (printable) {
            mp_printf(print, "%c", ch);
            continue;
        }

        switch (ch) {
            case '\n':
                mp_print_str(print, "\\n");
                break;
            case '\r':
                mp_print_str(print, "\\r");
                break;
            case '\t':
                mp_print_str(print, "\\t");
                break;
            default:
                mp_printf(print, "\\x%02x", ch);
                break;
        }
    }
    mp_printf(print, "%c", quote_char);
}

84
#if MICROPY_PY_UJSON
85
// Print str_data[0..str_len) as a double-quoted JSON string.
// For the JSON spec, see http://www.ietf.org/rfc/rfc4627.txt
// If we are given a valid utf8-encoded string, the output is JSON-conforming.
void mp_str_print_json(const mp_print_t *print, const byte *str_data, size_t str_len) {
    mp_print_str(print, "\"");
    for (size_t i = 0; i < str_len; i++) {
        const byte ch = str_data[i];
        switch (ch) {
            case '"':
            case '\\':
                // characters that must be backslash-escaped
                mp_printf(print, "\\%c", ch);
                break;
            case '\n':
                mp_print_str(print, "\\n");
                break;
            case '\r':
                mp_print_str(print, "\\r");
                break;
            case '\t':
                mp_print_str(print, "\\t");
                break;
            default:
                if (ch >= 32) {
                    // normal and utf-8 encoded chars go through unchanged
                    mp_printf(print, "%c", ch);
                } else {
                    // remaining control chars
                    mp_printf(print, "\\u%04x", ch);
                }
                break;
        }
    }
    mp_print_str(print, "\"");
}
#endif

110
// Print the string data of self_in according to `kind`:
//  - PRINT_JSON (only with MICROPY_PY_UJSON): JSON-quoted output;
//  - PRINT_RAW, or PRINT_STR of a non-bytes object when unicode support is
//    disabled: the raw data with no quoting;
//  - otherwise: quoted/escaped output, prefixed with "b" for bytes.
// With MICROPY_PY_BUILTINS_STR_UNICODE enabled is_bytes is hard-wired to true
// (presumably because str objects then use a different print function --
// TODO confirm against objstrunicode).
STATIC void str_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
    GET_STR_DATA_LEN(self_in, str_data, str_len);
    #if MICROPY_PY_UJSON
    if (kind == PRINT_JSON) {
        mp_str_print_json(print, str_data, str_len);
        return;
    }
    #endif
    #if !MICROPY_PY_BUILTINS_STR_UNICODE
    bool is_bytes = MP_OBJ_IS_TYPE(self_in, &mp_type_bytes);
    #else
    bool is_bytes = true;
    #endif
    if (kind == PRINT_RAW || (!MICROPY_PY_BUILTINS_STR_UNICODE && kind == PRINT_STR && !is_bytes)) {
        // A printf-style '*' precision is fetched from the varargs as an int,
        // so the size_t length has to be converted explicitly; passing a
        // size_t directly is a type mismatch where size_t is wider than int.
        mp_printf(print, "%.*s", (int)str_len, str_data);
    } else {
        if (is_bytes) {
            mp_print_str(print, "b");
        }
        mp_str_print_quoted(print, str_data, str_len, is_bytes);
    }
}

133
mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
134 135 136 137 138 139
#if MICROPY_CPYTHON_COMPAT
    if (n_kw != 0) {
        mp_arg_error_unimpl_kw();
    }
#endif

140 141
    mp_arg_check_num(n_args, n_kw, 0, 3, false);

142 143 144 145
    switch (n_args) {
        case 0:
            return MP_OBJ_NEW_QSTR(MP_QSTR_);

146
        case 1: {
147
            vstr_t vstr;
148 149 150
            mp_print_t print;
            vstr_init_print(&vstr, 16, &print);
            mp_obj_print_helper(&print, args[0], PRINT_STR);
151
            return mp_obj_new_str_from_vstr(type, &vstr);
152 153
        }

154
        default: // 2 or 3 args
155
            // TODO: validate 2nd/3rd args
156 157 158
            if (MP_OBJ_IS_TYPE(args[0], &mp_type_bytes)) {
                GET_STR_DATA_LEN(args[0], str_data, str_len);
                GET_STR_HASH(args[0], str_hash);
159 160 161
                if (str_hash == 0) {
                    str_hash = qstr_compute_hash(str_data, str_len);
                }
162 163 164 165 166
                #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
                if (!utf8_check(str_data, str_len)) {
                    mp_raise_msg(&mp_type_UnicodeError, NULL);
                }
                #endif
167 168 169 170 171 172 173

                // Check if a qstr with this data already exists
                qstr q = qstr_find_strn((const char*)str_data, str_len);
                if (q != MP_QSTR_NULL) {
                    return MP_OBJ_NEW_QSTR(q);
                }

174
                mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_copy(type, NULL, str_len));
175 176
                o->data = str_data;
                o->hash = str_hash;
177
                return MP_OBJ_FROM_PTR(o);
178 179 180
            } else {
                mp_buffer_info_t bufinfo;
                mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
181 182 183 184 185
                #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
                if (!utf8_check(bufinfo.buf, bufinfo.len)) {
                    mp_raise_msg(&mp_type_UnicodeError, NULL);
                }
                #endif
186
                return mp_obj_new_str(bufinfo.buf, bufinfo.len);
187 188 189 190
            }
    }
}

191
// Constructor for bytes objects.  Accepted forms, checked in this order:
//  - no argument: the empty bytes object;
//  - a str plus 1 or 2 further arguments: bytes object using the str's data;
//  - a small int n >= 0: n zero bytes;
//  - an object with the buffer protocol: a copy of its data;
//  - any other iterable: one byte per item.
// Any other argument count raises TypeError.
STATIC mp_obj_t bytes_make_new(const mp_obj_type_t *type_in, size_t n_args, size_t n_kw, const mp_obj_t *args) {
    (void)type_in;

    #if MICROPY_CPYTHON_COMPAT
    if (n_kw != 0) {
        mp_arg_error_unimpl_kw();
    }
    #else
    (void)n_kw;
    #endif

    if (n_args == 0) {
        return mp_const_empty_bytes;
    }

    if (MP_OBJ_IS_STR(args[0])) {
        // a str source needs exactly 1 or 2 extra arguments
        if (!(n_args == 2 || n_args == 3)) {
            goto wrong_args;
        }
        GET_STR_DATA_LEN(args[0], str_data, str_len);
        GET_STR_HASH(args[0], str_hash);
        if (str_hash == 0) {
            str_hash = qstr_compute_hash(str_data, str_len);
        }
        // Create the object with no source data, then point it at the data
        // of the str object.
        mp_obj_str_t *bytes_obj = MP_OBJ_TO_PTR(mp_obj_new_str_copy(&mp_type_bytes, NULL, str_len));
        bytes_obj->data = str_data;
        bytes_obj->hash = str_hash;
        return MP_OBJ_FROM_PTR(bytes_obj);
    }

    // every remaining form takes exactly one argument
    if (n_args > 1) {
        goto wrong_args;
    }

    if (MP_OBJ_IS_SMALL_INT(args[0])) {
        // bytes(n): n zero bytes
        mp_int_t count = MP_OBJ_SMALL_INT_VALUE(args[0]);
        if (count < 0) {
            mp_raise_ValueError(NULL);
        }
        vstr_t zeros;
        vstr_init_len(&zeros, count);
        memset(zeros.buf, 0, count);
        return mp_obj_new_str_from_vstr(&mp_type_bytes, &zeros);
    }

    // check if argument has the buffer protocol
    mp_buffer_info_t bufinfo;
    if (mp_get_buffer(args[0], &bufinfo, MP_BUFFER_READ)) {
        return mp_obj_new_bytes(bufinfo.buf, bufinfo.len);
    }

    // Generic iterable: try to size the buffer exactly when the length of
    // the initializer is known, otherwise start with a small allocation.
    vstr_t vstr;
    mp_obj_t len_in = mp_obj_len_maybe(args[0]);
    if (len_in != MP_OBJ_NULL) {
        mp_int_t known_len = MP_OBJ_SMALL_INT_VALUE(len_in);
        vstr_init(&vstr, known_len);
    } else {
        vstr_init(&vstr, 16);
    }

    mp_obj_iter_buf_t iter_buf;
    mp_obj_t iterable = mp_getiter(args[0], &iter_buf);
    for (mp_obj_t item = mp_iternext(iterable);
         item != MP_OBJ_STOP_ITERATION;
         item = mp_iternext(iterable)) {
        mp_int_t val = mp_obj_get_int(item);
        #if MICROPY_FULL_CHECKS
        if (val < 0 || val > 255) {
            mp_raise_ValueError("bytes value out of range");
        }
        #endif
        vstr_add_byte(&vstr, val);
    }

    return mp_obj_new_str_from_vstr(&mp_type_bytes, &vstr);

wrong_args:
    mp_raise_TypeError("wrong number of arguments");
}

271 272
// like strstr but with specified length and allows \0 bytes
// TODO replace with something more efficient/standard
273
const byte *find_subbytes(const byte *haystack, size_t hlen, const byte *needle, size_t nlen, int direction) {
274
    if (hlen >= nlen) {
275
        size_t str_index, str_index_end;
276 277 278 279 280 281 282 283 284 285 286
        if (direction > 0) {
            str_index = 0;
            str_index_end = hlen - nlen;
        } else {
            str_index = hlen - nlen;
            str_index_end = 0;
        }
        for (;;) {
            if (memcmp(&haystack[str_index], needle, nlen) == 0) {
                //found
                return haystack + str_index;
287
            }
288 289 290
            if (str_index == str_index_end) {
                //not found
                break;
291
            }
292
            str_index += direction;
293 294 295 296 297
        }
    }
    return NULL;
}

298 299 300
// Note: this function is used to check if an object is a str or bytes, which
// works because both those types use it as their binary_op method.  Revisit
// MP_OBJ_IS_STR_OR_BYTES if this fact changes.
301
mp_obj_t mp_obj_str_binary_op(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
302 303
    // check for modulo
    if (op == MP_BINARY_OP_MODULO) {
304
        mp_obj_t *args = &rhs_in;
305
        size_t n_args = 1;
306 307 308 309 310 311 312 313 314 315 316
        mp_obj_t dict = MP_OBJ_NULL;
        if (MP_OBJ_IS_TYPE(rhs_in, &mp_type_tuple)) {
            // TODO: Support tuple subclasses?
            mp_obj_tuple_get(rhs_in, &n_args, &args);
        } else if (MP_OBJ_IS_TYPE(rhs_in, &mp_type_dict)) {
            dict = rhs_in;
        }
        return str_modulo_format(lhs_in, n_args, args, dict);
    }

    // from now on we need lhs type and data, so extract them
317
    mp_obj_type_t *lhs_type = mp_obj_get_type(lhs_in);
318
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
319

320 321 322 323 324 325 326 327 328 329 330
    // check for multiply
    if (op == MP_BINARY_OP_MULTIPLY) {
        mp_int_t n;
        if (!mp_obj_get_int_maybe(rhs_in, &n)) {
            return MP_OBJ_NULL; // op not supported
        }
        if (n <= 0) {
            if (lhs_type == &mp_type_str) {
                return MP_OBJ_NEW_QSTR(MP_QSTR_); // empty str
            } else {
                return mp_const_empty_bytes;
331
            }
332
        }
333 334 335 336
        vstr_t vstr;
        vstr_init_len(&vstr, lhs_len * n);
        mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, vstr.buf);
        return mp_obj_new_str_from_vstr(lhs_type, &vstr);
337
    }
338

339 340 341 342 343 344 345 346 347 348 349 350 351 352 353
    // From now on all operations allow:
    //    - str with str
    //    - bytes with bytes
    //    - bytes with bytearray
    //    - bytes with array.array
    // To do this efficiently we use the buffer protocol to extract the raw
    // data for the rhs, but only if the lhs is a bytes object.
    //
    // NOTE: CPython does not allow comparison between bytes ard array.array
    // (even if the array is of type 'b'), even though it allows addition of
    // such types.  We are not compatible with this (we do allow comparison
    // of bytes with anything that has the buffer protocol).  It would be
    // easy to "fix" this with a bit of extra logic below, but it costs code
    // size and execution time so we don't.

354 355
    const byte *rhs_data = NULL;
    size_t rhs_len = 0;
356 357 358 359 360 361 362
    if (lhs_type == mp_obj_get_type(rhs_in)) {
        GET_STR_DATA_LEN(rhs_in, rhs_data_, rhs_len_);
        rhs_data = rhs_data_;
        rhs_len = rhs_len_;
    } else if (lhs_type == &mp_type_bytes) {
        mp_buffer_info_t bufinfo;
        if (!mp_get_buffer(rhs_in, &bufinfo, MP_BUFFER_READ)) {
363
            return MP_OBJ_NULL; // op not supported
364 365 366 367
        }
        rhs_data = bufinfo.buf;
        rhs_len = bufinfo.len;
    } else {
368 369 370
        // LHS is str and RHS has an incompatible type
        // (except if operation is EQUAL, but that's handled by mp_obj_equal)
        bad_implicit_conversion(rhs_in);
371 372 373 374 375
    }

    switch (op) {
        case MP_BINARY_OP_ADD:
        case MP_BINARY_OP_INPLACE_ADD: {
376
            if (lhs_len == 0 && mp_obj_get_type(rhs_in) == lhs_type) {
377 378 379 380 381 382
                return rhs_in;
            }
            if (rhs_len == 0) {
                return lhs_in;
            }

383 384 385 386 387
            vstr_t vstr;
            vstr_init_len(&vstr, lhs_len + rhs_len);
            memcpy(vstr.buf, lhs_data, lhs_len);
            memcpy(vstr.buf + lhs_len, rhs_data, rhs_len);
            return mp_obj_new_str_from_vstr(lhs_type, &vstr);
388
        }
389

390
        case MP_BINARY_OP_CONTAINS:
391
            return mp_obj_new_bool(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len, 1) != NULL);
392

393 394
        //case MP_BINARY_OP_NOT_EQUAL: // This is never passed here
        case MP_BINARY_OP_EQUAL: // This will be passed only for bytes, str is dealt with in mp_obj_equal()
Damien George's avatar
Damien George committed
395 396 397 398
        case MP_BINARY_OP_LESS:
        case MP_BINARY_OP_LESS_EQUAL:
        case MP_BINARY_OP_MORE:
        case MP_BINARY_OP_MORE_EQUAL:
399
            return mp_obj_new_bool(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));
400

401 402 403
        default:
            return MP_OBJ_NULL; // op not supported
    }
404 405
}

406 407
#if !MICROPY_PY_BUILTINS_STR_UNICODE
// objstrunicode defines own version
408
const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len,
409
                             mp_obj_t index, bool is_slice) {
410
    size_t index_val = mp_get_index(type, self_len, index, is_slice);
411 412
    return self_data + index_val;
}
413
#endif
414

415 416
// Subscript handler used for both bytes and 8-bit strings; it is not used for
// unicode strings.  Only loads are supported (value == MP_OBJ_SENTINEL); any
// other request returns MP_OBJ_NULL.
STATIC mp_obj_t bytes_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
    mp_obj_type_t *type = mp_obj_get_type(self_in);
    GET_STR_DATA_LEN(self_in, self_data, self_len);

    if (value != MP_OBJ_SENTINEL) {
        return MP_OBJ_NULL; // op not supported
    }

    // load
#if MICROPY_PY_BUILTINS_SLICE
    if (MP_OBJ_IS_TYPE(index, &mp_type_slice)) {
        mp_bound_slice_t bounds;
        if (!mp_seq_get_fast_slice_indexes(self_len, index, &bounds)) {
            mp_raise_NotImplementedError("only slices with step=1 (aka None) are supported");
        }
        const byte *slice_start = self_data + bounds.start;
        return mp_obj_new_str_of_type(type, slice_start, bounds.stop - bounds.start);
    }
#endif

    size_t offset = mp_get_index(type, self_len, index, false);
    // If we have unicode enabled the type will always be bytes, so take the short cut.
    #if MICROPY_PY_BUILTINS_STR_UNICODE
    return MP_OBJ_NEW_SMALL_INT(self_data[offset]);
    #else
    if (type != &mp_type_bytes) {
        // 8-bit str: the element is a one-character string
        return mp_obj_new_str_via_qstr((char*)&self_data[offset], 1);
    }
    // bytes: the element is an integer
    return MP_OBJ_NEW_SMALL_INT(self_data[offset]);
    #endif
}

446
// sep.join(iterable): concatenate the items of `arg`, placing the data of
// self_in between consecutive items.  Every item must have exactly the same
// type as self_in, otherwise TypeError is raised.
STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
    mp_check_self(MP_OBJ_IS_STR_OR_BYTES(self_in));
    const mp_obj_type_t *self_type = mp_obj_get_type(self_in);

    // the separator is the object the method was called on
    GET_STR_DATA_LEN(self_in, sep_data, sep_len);

    // Get the items as an array; anything that is not already a list or a
    // tuple is first converted to a list.
    // TODO: Try to optimize?
    bool is_array = MP_OBJ_IS_TYPE(arg, &mp_type_list) || MP_OBJ_IS_TYPE(arg, &mp_type_tuple);
    if (!is_array) {
        arg = mp_type_list.make_new(&mp_type_list, 1, 0, &arg);
    }
    size_t n_items;
    mp_obj_t *items;
    mp_obj_get_array(arg, &n_items, &items);

    // first pass: type-check every item and add up the output size
    size_t total_len = 0;
    for (size_t i = 0; i < n_items; i++) {
        if (mp_obj_get_type(items[i]) != self_type) {
            mp_raise_TypeError(
                "join expects a list of str/bytes objects consistent with self object");
        }
        GET_STR_LEN(items[i], item_len);
        total_len += (i != 0) ? sep_len + item_len : item_len;
    }

    // second pass: copy separators and items into the output buffer
    vstr_t vstr;
    vstr_init_len(&vstr, total_len);
    byte *out = (byte*)vstr.buf;
    for (size_t i = 0; i < n_items; i++) {
        if (i != 0) {
            memcpy(out, sep_data, sep_len);
            out += sep_len;
        }
        GET_STR_DATA_LEN(items[i], item_data, item_len);
        memcpy(out, item_data, item_len);
        out += item_len;
    }

    // the result has the same type as the separator
    return mp_obj_new_str_from_vstr(self_type, &vstr);
}
MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
496

497
mp_obj_t mp_obj_str_split(size_t n_args, const mp_obj_t *args) {
498
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
499
    mp_int_t splits = -1;
Paul Sokolovsky's avatar
Paul Sokolovsky committed
500 501 502 503
    mp_obj_t sep = mp_const_none;
    if (n_args > 1) {
        sep = args[1];
        if (n_args > 2) {
504
            splits = mp_obj_get_int(args[2]);
Paul Sokolovsky's avatar
Paul Sokolovsky committed
505 506
        }
    }
507

Paul Sokolovsky's avatar
Paul Sokolovsky committed
508
    mp_obj_t res = mp_obj_new_list(0, NULL);
509 510
    GET_STR_DATA_LEN(args[0], s, len);
    const byte *top = s + len;
511 512 513 514 515

    if (sep == mp_const_none) {
        // sep not given, so separate on whitespace

        // Initial whitespace is not counted as split, so we pre-do it
516
        while (s < top && unichar_isspace(*s)) s++;
517 518
        while (s < top && splits != 0) {
            const byte *start = s;
519
            while (s < top && !unichar_isspace(*s)) s++;
520
            mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start));
521 522 523
            if (s >= top) {
                break;
            }
524
            while (s < top && unichar_isspace(*s)) s++;
525 526 527
            if (splits > 0) {
                splits--;
            }
Paul Sokolovsky's avatar
Paul Sokolovsky committed
528 529
        }

530
        if (s < top) {
531
            mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, s, top - s));
532 533 534 535
        }

    } else {
        // sep given
536
        if (mp_obj_get_type(sep) != self_type) {
537
            bad_implicit_conversion(sep);
538
        }
539

540
        size_t sep_len;
541 542 543
        const char *sep_str = mp_obj_str_get_data(sep, &sep_len);

        if (sep_len == 0) {
544
            mp_raise_ValueError("empty separator");
545 546 547 548 549 550 551 552 553 554 555 556 557
        }

        for (;;) {
            const byte *start = s;
            for (;;) {
                if (splits == 0 || s + sep_len > top) {
                    s = top;
                    break;
                } else if (memcmp(s, sep_str, sep_len) == 0) {
                    break;
                }
                s++;
            }
558
            mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start));
559 560 561 562 563 564 565 566
            if (s >= top) {
                break;
            }
            s += sep_len;
            if (splits > 0) {
                splits--;
            }
        }
Paul Sokolovsky's avatar
Paul Sokolovsky committed
567 568 569 570
    }

    return res;
}
571
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, mp_obj_str_split);
Paul Sokolovsky's avatar
Paul Sokolovsky committed
572

573
#if MICROPY_PY_BUILTINS_STR_SPLITLINES
574
STATIC mp_obj_t str_splitlines(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
575
    enum { ARG_keepends };
576 577 578 579 580
    static const mp_arg_t allowed_args[] = {
        { MP_QSTR_keepends, MP_ARG_BOOL, {.u_bool = false} },
    };

    // parse args
581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615
    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
    mp_arg_parse_all(n_args - 1, pos_args + 1, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);

    const mp_obj_type_t *self_type = mp_obj_get_type(pos_args[0]);
    mp_obj_t res = mp_obj_new_list(0, NULL);

    GET_STR_DATA_LEN(pos_args[0], s, len);
    const byte *top = s + len;

    while (s < top) {
        const byte *start = s;
        size_t match = 0;
        while (s < top) {
            if (*s == '\n') {
                match = 1;
                break;
            } else if (*s == '\r') {
                if (s[1] == '\n') {
                    match = 2;
                } else {
                    match = 1;
                }
                break;
            }
            s++;
        }
        size_t sub_len = s - start;
        if (args[ARG_keepends].u_bool) {
            sub_len += match;
        }
        mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, sub_len));
        s += match;
    }

    return res;
616
}
617
MP_DEFINE_CONST_FUN_OBJ_KW(str_splitlines_obj, 1, str_splitlines);
618 619
#endif

620
STATIC mp_obj_t str_rsplit(size_t n_args, const mp_obj_t *args) {
621 622 623
    if (n_args < 3) {
        // If we don't have split limit, it doesn't matter from which side
        // we split.
624
        return mp_obj_str_split(n_args, args);
625 626 627 628 629
    }
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
    mp_obj_t sep = args[1];
    GET_STR_DATA_LEN(args[0], s, len);

630
    mp_int_t splits = mp_obj_get_int(args[2]);
631 632 633 634 635
    if (splits < 0) {
        // Negative limit means no limit, so delegate to split().
        return mp_obj_str_split(n_args, args);
    }

636
    mp_int_t org_splits = splits;
637 638
    // Preallocate list to the max expected # of elements, as we
    // will fill it from the end.
639
    mp_obj_list_t *res = MP_OBJ_TO_PTR(mp_obj_new_list(splits + 1, NULL));
640
    mp_int_t idx = splits;
641 642

    if (sep == mp_const_none) {
643
        mp_raise_NotImplementedError("rsplit(None,n)");
644
    } else {
645
        size_t sep_len;
646 647 648
        const char *sep_str = mp_obj_str_get_data(sep, &sep_len);

        if (sep_len == 0) {
649
            mp_raise_ValueError("empty separator");
650 651 652 653 654 655 656 657 658 659 660 661 662 663 664
        }

        const byte *beg = s;
        const byte *last = s + len;
        for (;;) {
            s = last - sep_len;
            for (;;) {
                if (splits == 0 || s < beg) {
                    break;
                } else if (memcmp(s, sep_str, sep_len) == 0) {
                    break;
                }
                s--;
            }
            if (s < beg || splits == 0) {
665
                res->items[idx] = mp_obj_new_str_of_type(self_type, beg, last - beg);
666 667
                break;
            }
668
            res->items[idx--] = mp_obj_new_str_of_type(self_type, s + sep_len, last - s - sep_len);
669
            last = s;
670
            splits--;
671 672 673
        }
        if (idx != 0) {
            // We split less parts than split limit, now go cleanup surplus
674
            size_t used = org_splits + 1 - idx;
675
            memmove(res->items, &res->items[idx], used * sizeof(mp_obj_t));
676 677 678 679 680
            mp_seq_clear(res->items, used, res->alloc, sizeof(*res->items));
            res->len = used;
        }
    }

681
    return MP_OBJ_FROM_PTR(res);
682
}
683
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rsplit_obj, 1, 3, str_rsplit);
684

685
// Common implementation of find/rfind/index/rindex.
//   args[0]   the str/bytes object to search in
//   args[1]   the substring to look for; must have the same type as args[0]
//   args[2]   optional start index (None means the beginning)
//   args[3]   optional end index (None means the end)
//   direction > 0 searches forwards, otherwise backwards
//   is_index  if true a miss raises ValueError, otherwise -1 is returned
// Returns the index of the match as a small int.
STATIC mp_obj_t str_finder(size_t n_args, const mp_obj_t *args, int direction, bool is_index) {
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
    mp_check_self(MP_OBJ_IS_STR_OR_BYTES(args[0]));

    // check argument type
    if (mp_obj_get_type(args[1]) != self_type) {
        bad_implicit_conversion(args[1]);
    }

    GET_STR_DATA_LEN(args[0], haystack, haystack_len);
    GET_STR_DATA_LEN(args[1], needle, needle_len);

    const byte *start = haystack;
    const byte *end = haystack + haystack_len;
    if (n_args >= 3 && args[2] != mp_const_none) {
        start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true);
    }
    if (n_args >= 4 && args[3] != mp_const_none) {
        end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true);
    }

    // An end index before the start index selects an empty range in which
    // nothing can be found.  It must not reach find_subbytes(): end - start
    // would be negative and turn into a huge unsigned length, making the
    // search read outside of the string data.
    const byte *p = NULL;
    if (end >= start) {
        p = find_subbytes(start, end - start, needle, needle_len, direction);
    }

    if (p == NULL) {
        // not found
        if (is_index) {
            mp_raise_ValueError("substring not found");
        } else {
            return MP_OBJ_NEW_SMALL_INT(-1);
        }
    } else {
        // found
        #if MICROPY_PY_BUILTINS_STR_UNICODE
        if (self_type == &mp_type_str) {
            // for unicode str the result is a character index, not a byte offset
            return MP_OBJ_NEW_SMALL_INT(utf8_ptr_to_index(haystack, p));
        }
        #endif
        return MP_OBJ_NEW_SMALL_INT(p - haystack);
    }
}

725
STATIC mp_obj_t str_find(size_t n_args, const mp_obj_t *args) {
726
    return str_finder(n_args, args, 1, false);
727
}
728
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
729

730
STATIC mp_obj_t str_rfind(size_t n_args, const mp_obj_t *args) {
731 732
    return str_finder(n_args, args, -1, false);
}
733
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rfind_obj, 2, 4, str_rfind);
734

735
STATIC mp_obj_t str_index(size_t n_args, const mp_obj_t *args) {
736 737
    return str_finder(n_args, args, 1, true);
}
738
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_index_obj, 2, 4, str_index);
739

740
STATIC mp_obj_t str_rindex(size_t n_args, const mp_obj_t *args) {
741
    return str_finder(n_args, args, -1, true);
742
}
743
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rindex_obj, 2, 4, str_rindex);
744

745
// TODO: (Much) more variety in args
746
STATIC mp_obj_t str_startswith(size_t n_args, const mp_obj_t *args) {
747
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
748
    GET_STR_DATA_LEN(args[0], str, str_len);
749 750
    size_t prefix_len;
    const char *prefix = mp_obj_str_get_data(args[1], &prefix_len);
751
    const byte *start = str;
752
    if (n_args > 2) {
753
        start = str_index_to_ptr(self_type, str, str_len, args[2], true);
754
    }
755
    if (prefix_len + (start - str) > str_len) {
756 757
        return mp_const_false;
    }
758
    return mp_obj_new_bool(memcmp(start, prefix, prefix_len) == 0);
759
}
760
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_startswith_obj, 2, 3, str_startswith);
761

762
STATIC mp_obj_t str_endswith(size_t n_args, const mp_obj_t *args) {
763
    GET_STR_DATA_LEN(args[0], str, str_len);
764 765
    size_t suffix_len;
    const char *suffix = mp_obj_str_get_data(args[1], &suffix_len);
766
    if (n_args > 2) {
767
        mp_raise_NotImplementedError("start/end indices");
768
    }
769 770 771 772

    if (suffix_len > str_len) {
        return mp_const_false;
    }
773
    return mp_obj_new_bool(memcmp(str + (str_len - suffix_len), suffix, suffix_len) == 0);
774
}
775
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_endswith_obj, 2, 3, str_endswith);
776

777 778
// Modes understood by str_uni_strip(): which end(s) of the data get trimmed.
enum {
    LSTRIP, // trim the leading end only
    RSTRIP, // trim the trailing end only
    STRIP,  // trim both ends
};

779
// Shared implementation of strip(), lstrip() and rstrip() for str and bytes.
//
// type   - LSTRIP, RSTRIP or STRIP, selecting which end(s) are trimmed
// n_args - number of entries in args
// args   - args[0] is the str/bytes object to trim.  With a single argument
//          the characters " \t\n\r\v\f" are removed; otherwise args[1] gives
//          the set of characters to remove and must have the same type as
//          args[0], else bad_implicit_conversion() is invoked.
//
// Returns args[0] itself when nothing was removed, the empty str / empty bytes
// constant when every character was removed, and otherwise a new object of
// the same type as args[0] holding the remaining span.
STATIC mp_obj_t str_uni_strip(int type, size_t n_args, const mp_obj_t *args) {
    mp_check_self(MP_OBJ_IS_STR_OR_BYTES(args[0]));
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);

    const byte *chars_to_del;
    // size_t, like every other length/index in this function, so the length
    // obtained from GET_STR_DATA_LEN below is never narrowed.
    size_t chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";

    if (n_args == 1) {
        chars_to_del = whitespace;
        chars_to_del_len = sizeof(whitespace) - 1; // exclude the terminating NUL
    } else {
        if (mp_obj_get_type(args[1]) != self_type) {
            bad_implicit_conversion(args[1]);
        }
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
    }

    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);

    size_t first_good_char_pos = 0;
    bool first_good_char_pos_set = false;
    size_t last_good_char_pos = 0;
    size_t i = 0;
    int delta = 1;
    if (type == RSTRIP) {
        // Scan from the last byte towards the first.  For an empty string the
        // index wraps around, but the loop below then runs zero times.
        i = orig_str_len - 1;
        delta = -1;
    }
    for (size_t len = orig_str_len; len > 0; len--) {
        // A "good" char is one that is not in the set of chars to delete.
        if (find_subbytes(chars_to_del, chars_to_del_len, &orig_str[i], 1, 1) == NULL) {
            if (!first_good_char_pos_set) {
                first_good_char_pos_set = true;
                first_good_char_pos = i;
                if (type == LSTRIP) {
                    // Keep everything from here to the end.
                    last_good_char_pos = orig_str_len - 1;
                    break;
                } else if (type == RSTRIP) {
                    // Keep everything from the start up to here.
                    first_good_char_pos = 0;
                    last_good_char_pos = i;
                    break;
                }
            }
            // STRIP: keep scanning so this ends up at the final good char.
            last_good_char_pos = i;
        }
        i += delta;
    }

    if (!first_good_char_pos_set) {
        // string is all whitespace, return ''
        if (self_type == &mp_type_str) {
            return MP_OBJ_NEW_QSTR(MP_QSTR_);
        } else {
            return mp_const_empty_bytes;
        }
    }

    assert(last_good_char_pos >= first_good_char_pos);
    //+1 to accommodate the last character
    size_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
    if (stripped_len == orig_str_len) {
        // If nothing was stripped, don't bother to dup original string
        // TODO: watch out for this case when we'll get to bytearray.strip()
        assert(first_good_char_pos == 0);
        return args[0];
    }
    return mp_obj_new_str_of_type(self_type, orig_str + first_good_char_pos, stripped_len);
}

850
STATIC mp_obj_t str_strip(size_t n_args, const mp_obj_t *args) {
851 852
    return str_uni_strip(STRIP, n_args, args);
}
853
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
854

855
STATIC mp_obj_t str_lstrip(size_t n_args, const mp_obj_t *args) {
856 857
    return str_uni_strip(LSTRIP, n_args, args);
}
858
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_lstrip_obj, 1, 2, str_lstrip);
859

860
STATIC mp_obj_t str_rstrip(size_t n_args, const mp_obj_t *args) {
861 862
    return str_uni_strip(RSTRIP, n_args, args);
}
863
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rstrip_obj, 1, 2, str_rstrip);
864

865 866 867
#if MICROPY_PY_BUILTINS_STR_CENTER
// center(width): returns the data of str_in centred in a space-padded field
// of `width` units, as an object of the same type as str_in.
//
// str_in itself is returned when width is zero, negative, or not larger than
// the length reported by GET_STR_DATA_LEN (presumably a byte count -- confirm
// for non-ASCII str).  When the padding cannot be split evenly the extra
// space goes on the right, because the left padding is rounded down.
STATIC mp_obj_t str_center(mp_obj_t str_in, mp_obj_t width_in) {
    GET_STR_DATA_LEN(str_in, str, str_len);
    // Keep the requested width signed until it has been range-checked: a
    // negative value converted straight to an unsigned type would look like
    // an enormous width and lead to a huge allocation/memset below.
    mp_int_t requested = mp_obj_get_int(width_in);
    if (requested <= 0 || str_len >= (size_t)requested) {
        return str_in;
    }
    size_t width = (size_t)requested;

    vstr_t vstr;
    vstr_init_len(&vstr, width);
    memset(vstr.buf, ' ', width);
    size_t left = (width - str_len) / 2;
    memcpy(vstr.buf + left, str, str_len);
    return mp_obj_new_str_from_vstr(mp_obj_get_type(str_in), &vstr);
}
MP_DEFINE_CONST_FUN_OBJ_2(str_center_obj, str_center);
#endif

Dave Hylands's avatar
Dave Hylands committed
883 884
// Parses a run of decimal digits starting at str and not reading at or
// beyond top.
//
// Takes an int arg, but only parses unsigned numbers, and only changes
// *num if at least one digit was parsed.
//
// Returns a pointer to the first character that was not consumed (equal to
// str when there was no leading digit).
STATIC const char *str_to_int(const char *str, const char *top, int *num) {
    if (str < top && '0' <= *str && *str <= '9') {
        // Accumulate in unsigned arithmetic: an over-long digit run then wraps
        // around (well defined) rather than overflowing a signed int, which
        // is undefined behaviour.
        unsigned int value = 0;
        do {
            value = value * 10u + (unsigned int)(*str - '0');
            str++;
        } while (str < top && '0' <= *str && *str <= '9');
        // Values that fit in an int are stored exactly; the conversion of
        // larger ones is implementation-defined.
        *num = (int)value;
    }
    return str;
}

897
// Returns true when ch is one of the characters '<', '>', '=' or '^'.
// The explicit ch check is needed because strchr() would otherwise match the
// string's terminating NUL for ch == '\0'.
STATIC bool isalignment(char ch) {
    return ch && strchr("<>=^", ch) != NULL;
}

901
// Returns true when ch is one of the characters in "bcdeEfFgGnosxX%".
// The explicit ch check is needed because strchr() would otherwise match the
// string's terminating NUL for ch == '\0'.
STATIC bool istype(char ch) {
    return ch && strchr("bcdeEfFgGnosxX%", ch) != NULL;
}

905
// Returns true when arg is of type mp_type_bool or satisfies MP_OBJ_IS_INT.
STATIC bool arg_looks_integer(mp_obj_t arg) {
    return MP_OBJ_IS_TYPE(arg, &mp_type_bool) || MP_OBJ_IS_INT(arg);
}

909
// Returns true when arg passes arg_looks_integer() or, in builds with
// MICROPY_PY_BUILTINS_FLOAT enabled, when mp_obj_is_float() accepts it.
STATIC bool arg_looks_numeric(mp_obj_t arg) {
    return arg_looks_integer(arg)
#if MICROPY_PY_BUILTINS_FLOAT
        || mp_obj_is_float(arg)
#endif
    ;
}

917
STATIC mp_obj_t arg_as_int(mp_obj_t arg) {
918
#if MICROPY_PY_BUILTINS_FLOAT
919 920
    if (mp_obj_is_float(arg)) {
        return mp_obj_new_int_from_float(mp_obj_float_get(arg));
921 922
    }
#endif
923
    return arg;
924