lexer.c 26.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

Damien's avatar
Damien committed
27
28
29
/* lexer.c -- simple tokeniser for Python implementation
 */

xbe's avatar
xbe committed
30
#include <stdbool.h>
Damien's avatar
Damien committed
31
32
33
34
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

35
#include "mpconfig.h"
36
#include "misc.h"
37
#include "qstr.h"
Damien's avatar
Damien committed
38
39
40
41
#include "lexer.h"

#define TAB_SIZE (8)

42
43
44
// TODO seems that CPython allows NULL byte in the input stream
// don't know if that's intentional or not, but we don't allow it

45
struct _mp_lexer_t {
46
    qstr source_name;           // name of source
47
    void *stream_data;          // data for stream
48
49
    mp_lexer_stream_next_char_t stream_next_char;   // stream callback to get next char
    mp_lexer_stream_close_t stream_close;           // stream callback to free
Damien's avatar
Damien committed
50

51
    unichar chr0, chr1, chr2;   // current cached characters from source
Damien's avatar
Damien committed
52
53
54
55

    uint line;                  // source line
    uint column;                // source column

56
57
    int emit_dent;              // non-zero when there are INDENT/DEDENT tokens to emit
    int nested_bracket_level;   // >0 when there are nested brackets over multiple lines
Damien's avatar
Damien committed
58
59
60
61
62

    uint alloc_indent_level;
    uint num_indent_level;
    uint16_t *indent_level;

63
    vstr_t vstr;
64
    mp_token_t tok_cur;
Damien's avatar
Damien committed
65
66
};

67
// Consulted when the lexer meets the name __debug__: zero makes it lex as the
// True keyword, any other value makes it lex as the False keyword.
uint mp_optimise_value;
68

69
// TODO replace with a call to a standard function
70
// Compare a NUL-terminated string against a length-delimited one.
//
//   str  - NUL-terminated string
//   strn - character buffer that need not be NUL-terminated
//   len  - number of characters of strn to compare
//
// Returns true iff str is exactly len characters long and those characters
// equal the first len characters of strn.  A negative len never matches.
bool str_strn_equal(const char *str, const char *strn, int len) {
    if (len < 0) {
        // previously the index was unsigned, so a negative len was converted
        // to a huge bound and the scan could run off the end of both buffers
        return false;
    }

    for (int i = 0; i < len; ++i) {
        // stop at str's terminator so we never read past it, even when strn
        // happens to hold a NUL byte at the same position
        if (str[i] == '\0' || str[i] != strn[i]) {
            return false;
        }
    }

    // all len characters matched; str must end exactly here
    return str[len] == '\0';
}

82
#ifdef MICROPY_DEBUG_PRINTERS
// Debug helper: print a token's position, kind, text pointer and length on
// one line, followed by its text with non-printable characters shown as '?'.
void mp_token_show(const mp_token_t *tok) {
    printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
    if (tok->str != NULL && tok->len > 0) {
        const byte *end = (const byte *)tok->str + tok->len;
        printf(" ");
        // walk the text one UTF-8 encoded character at a time
        for (const byte *p = (const byte *)tok->str; p < end; ) {
            unichar c = utf8_get_char(p);
            p = utf8_next_char(p);
            if (!unichar_isprint(c)) {
                printf("?");
            } else {
                printf("%c", c);
            }
        }
    }
    printf("\n");
}
#endif
Damien's avatar
Damien committed
102

103
104
// The character the lexer is currently positioned on (first slot of the look-ahead window).
#define CUR_CHAR(lex) ((lex)->chr0)

105
// True once the current character is the end-of-stream marker.
STATIC bool is_end(mp_lexer_t *lex) {
    if (lex->chr0 != MP_LEXER_CHAR_EOF) {
        return false;
    }
    return true;
}

109
// True when the current character is LF or CR.
STATIC bool is_physical_newline(mp_lexer_t *lex) {
    switch (lex->chr0) {
        case '\n':
        case '\r':
            return true;
        default:
            return false;
    }
}

113
// True when the current character equals c.
STATIC bool is_char(mp_lexer_t *lex, char c) {
    if (lex->chr0 == c) {
        return true;
    }
    return false;
}

117
// True when the current character equals c1 or c2.
STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
    if (lex->chr0 == c1) {
        return true;
    }
    return lex->chr0 == c2;
}

121
// True when the current character equals any of c1, c2 or c3.
STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
    if (lex->chr0 == c1) {
        return true;
    }
    if (lex->chr0 == c2) {
        return true;
    }
    return lex->chr0 == c3;
}

/*
126
STATIC bool is_char_following(mp_lexer_t *lex, char c) {
Damien's avatar
Damien committed
127
128
129
130
    return lex->chr1 == c;
}
*/

131
// True when the character after the current one equals c1 or c2.
STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
    if (lex->chr1 == c1) {
        return true;
    }
    return lex->chr1 == c2;
}

135
// True when the character two positions after the current one equals c1 or c2.
STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
    if (lex->chr2 == c1) {
        return true;
    }
    return lex->chr2 == c2;
}

139
// True when the current character is c1 and the one after it is c2.
STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
    if (lex->chr0 != c1) {
        return false;
    }
    return lex->chr1 == c2;
}

143
// True when unichar_isspace() accepts the current character.
STATIC bool is_whitespace(mp_lexer_t *lex) {
    const bool space = unichar_isspace(lex->chr0);
    return space;
}

147
// True when unichar_isalpha() accepts the current character.
STATIC bool is_letter(mp_lexer_t *lex) {
    const bool alpha = unichar_isalpha(lex->chr0);
    return alpha;
}

151
// True when unichar_isdigit() accepts the current character.
STATIC bool is_digit(mp_lexer_t *lex) {
    const bool digit = unichar_isdigit(lex->chr0);
    return digit;
}

155
// True when unichar_isdigit() accepts the character after the current one.
STATIC bool is_following_digit(mp_lexer_t *lex) {
    const bool digit = unichar_isdigit(lex->chr1);
    return digit;
}

159
// True when the character after the current one is an octal digit ('0'..'7').
STATIC bool is_following_odigit(mp_lexer_t *lex) {
    if (lex->chr1 < '0') {
        return false;
    }
    return lex->chr1 <= '7';
}

Damien's avatar
Damien committed
163
// TODO UNICODE include unicode characters in definition of identifiers
164
// True when the current character may start an identifier: a letter or '_'.
STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
    if (is_letter(lex)) {
        return true;
    }
    return lex->chr0 == '_';
}

// TODO UNICODE include unicode characters in definition of identifiers
169
// True when the current character may continue an identifier: anything that
// may start one, or a digit.
STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
    if (is_head_of_identifier(lex)) {
        return true;
    }
    return is_digit(lex);
}

173
// Advance the look-ahead window by one logical character, updating line and
// column for the character being left behind.  Does nothing at end of stream.
// A CR LF pair is consumed as a single newline.  When the stream runs out
// without a trailing newline, a '\n' is placed in the window before the EOF.
STATIC void next_char(mp_lexer_t *lex) {
    if (lex->chr0 == MP_LEXER_CHAR_EOF) {
        return;
    }

    // number of raw characters to shift out of the window
    int shift = 1;

    switch (lex->chr0) {
        case '\n':
            // LF starts a new line
            lex->line += 1;
            lex->column = 1;
            break;

        case '\r':
            // CR starts a new line; CR LF counts as one newline
            lex->line += 1;
            lex->column = 1;
            if (lex->chr1 == '\n') {
                shift = 2;
            }
            break;

        case '\t':
            // a tab moves to the next multiple of TAB_SIZE (columns are 1-based)
            lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
            break;

        default:
            // any other character occupies one column
            lex->column += 1;
            break;
    }

    while (shift > 0) {
        lex->chr0 = lex->chr1;
        lex->chr1 = lex->chr2;
        lex->chr2 = lex->stream_next_char(lex->stream_data);
        if (lex->chr2 == MP_LEXER_CHAR_EOF
            && lex->chr1 != MP_LEXER_CHAR_EOF
            && lex->chr1 != '\n'
            && lex->chr1 != '\r') {
            // stream ended without a newline: insert one at end of file
            lex->chr2 = '\n';
        }
        shift -= 1;
    }
}

213
// Push an indentation column onto the lexer's indent stack, growing the stack
// by MICROPY_ALLOC_LEXEL_INDENT_INC entries when it is full.
void indent_push(mp_lexer_t *lex, uint indent) {
    if (lex->num_indent_level >= lex->alloc_indent_level) {
        // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
        uint grown = lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC;
        lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, grown);
        lex->alloc_indent_level = grown;
    }
    lex->indent_level[lex->num_indent_level] = indent;
    lex->num_indent_level += 1;
}

222
// Return the indentation column on top of the indent stack (without popping).
uint indent_top(mp_lexer_t *lex) {
    uint top = lex->num_indent_level - 1;
    return lex->indent_level[top];
}

226
// Discard the top entry of the indent stack.
void indent_pop(mp_lexer_t *lex) {
    --lex->num_indent_level;
}

// some tricky operator encoding:
//     <op>  = begin with <op>, if this opchar matches then begin here
//     e<op> = end with <op>, if this opchar matches then end
//     E<op> = mandatory end with <op>, this opchar must match, then end
//     c<op> = continue with <op>, if this opchar matches then continue matching
// this means if the start of two ops are the same then they are equal til the last char

237
STATIC const char *tok_enc =
Damien's avatar
Damien committed
238
239
240
241
242
243
244
245
246
247
248
249
    "()[]{},:;@~" // singles
    "<e=c<e="     // < <= << <<=
    ">e=c>e="     // > >= >> >>=
    "*e=c*e="     // * *= ** **=
    "+e="         // + +=
    "-e=e>"       // - -= ->
    "&e="         // & &=
    "|e="         // | |=
    "/e=c/e="     // / /= // //=
    "%e="         // % %=
    "^e="         // ^ ^=
    "=e="         // = ==
250
    "!E=";        // !=
Damien's avatar
Damien committed
251
252

// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
253
STATIC const uint8_t tok_enc_kind[] = {
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
    MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
    MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
    MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
    MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,

    MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
    MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
    MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
    MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
    MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
    MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
    MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
    MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
    MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
    MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
    MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
    MP_TOKEN_OP_NOT_EQUAL,
Damien's avatar
Damien committed
271
272
273
};

// must have the same order as enum in lexer.h
274
// Keyword spellings.  The index of a keyword is added to MP_TOKEN_KW_FALSE to
// obtain its token kind, so the order here is significant.  The final entry,
// "__debug__", is special-cased by the lexer and must stay last.
STATIC const char *tok_kw[] = {
    "False", "None", "True",
    "and", "as", "assert",
    "break",
    "class", "continue",
    "def", "del",
    "elif", "else", "except",
    "finally", "for", "from",
    "global",
    "if", "import", "in", "is",
    "lambda",
    "nonlocal", "not",
    "or",
    "pass",
    "raise", "return",
    "try",
    "while", "with",
    "yield",
    "__debug__",
};

311
// Convert a hexadecimal digit character to its numeric value.
// The caller must ensure c really is a hex digit.
STATIC int hex_digit(unichar c) {
    int value = c - '0';
    if (value <= 9) {
        // '0'..'9'
        return value;
    }
    // letter: clear the lower-case bit, then close the gap between '9' and 'A'
    value &= ~('a' - 'A');
    return value - ('A' - ('9' + 1));
}

// This is called with CUR_CHAR() before first hex digit, and should return with
// it pointing to last hex digit
323
// Read num_digits hexadecimal digits from the stream and combine them,
// most significant first, into *result.
// On entry the current character is the one before the first digit; on
// success it is left on the last digit.  Returns false, leaving *result
// untouched, as soon as a non-hex character is met (which is then current).
STATIC bool get_hex(mp_lexer_t *lex, int num_digits, uint *result) {
    uint value = 0;
    for (int remaining = num_digits; remaining != 0; --remaining) {
        next_char(lex);
        unichar digit = CUR_CHAR(lex);
        if (!unichar_isxdigit(digit)) {
            return false;
        }
        value = (value << 4) + hex_digit(digit);
    }
    *result = value;
    return true;
}

337
// Scan the next token from the stream and store it in *tok.
//
//   lex         - lexer state; its look-ahead window, indent stack, bracket
//                 level and token text buffer are all updated
//   tok         - receives source position, kind, and a pointer/length pair
//                 referring to lex->vstr (so the text is only valid until the
//                 next token is scanned)
//   first_token - true only for the very first token of the source, which
//                 enables the "first token must be in column 1" check
//
// Malformed input is reported through the token kind, never by aborting:
// MP_TOKEN_BAD_LINE_CONTINUATION, MP_TOKEN_DEDENT_MISMATCH,
// MP_TOKEN_LONELY_STRING_OPEN and MP_TOKEN_INVALID.  MP_TOKEN_INVALID is also
// used for string escapes that cannot be decoded (bad \x, \u, \U digits, the
// unsupported \N form, or a value out of range); these were previously only
// guarded by assert(), which aborted debug builds and silently produced
// garbage text when asserts were compiled out.
// NOTE(review): presumably the parser turns MP_TOKEN_INVALID into a syntax
// error -- confirm against the parser.
STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
    // skip white space and comments
    bool had_physical_newline = false;
    while (!is_end(lex)) {
        if (is_physical_newline(lex)) {
            had_physical_newline = true;
            next_char(lex);
        } else if (is_whitespace(lex)) {
            next_char(lex);
        } else if (is_char(lex, '#')) {
            next_char(lex);
            while (!is_end(lex) && !is_physical_newline(lex)) {
                next_char(lex);
            }
            // had_physical_newline will be set on next loop
        } else if (is_char(lex, '\\')) {
            // backslash (outside string literals) must appear just before a physical newline
            next_char(lex);
            if (!is_physical_newline(lex)) {
                // SyntaxError: unexpected character after line continuation character
                tok->src_line = lex->line;
                tok->src_column = lex->column;
                tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
                vstr_reset(&lex->vstr);
                tok->str = vstr_str(&lex->vstr);
                tok->len = 0;
                return;
            } else {
                next_char(lex);
            }
        } else {
            break;
        }
    }

    // set token source information
    tok->src_line = lex->line;
    tok->src_column = lex->column;

    // start new token text
    vstr_reset(&lex->vstr);

    if (first_token && lex->line == 1 && lex->column != 1) {
        // check that the first token is in the first column
        // if first token is not on first line, we get a physical newline and
        // this check is done as part of normal indent/dedent checking below
        // (done to get equivalence with CPython)
        tok->kind = MP_TOKEN_INDENT;

    } else if (lex->emit_dent < 0) {
        // pending DEDENT tokens are emitted one per call
        tok->kind = MP_TOKEN_DEDENT;
        lex->emit_dent += 1;

    } else if (lex->emit_dent > 0) {
        // pending INDENT tokens are emitted one per call
        tok->kind = MP_TOKEN_INDENT;
        lex->emit_dent -= 1;

    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
        tok->kind = MP_TOKEN_NEWLINE;

        // compare the new line's indentation with the indent stack and
        // schedule the INDENT/DEDENT tokens that must follow this NEWLINE
        uint num_spaces = lex->column - 1;
        lex->emit_dent = 0;
        if (num_spaces == indent_top(lex)) {
            // same indentation: nothing to emit
        } else if (num_spaces > indent_top(lex)) {
            indent_push(lex, num_spaces);
            lex->emit_dent += 1;
        } else {
            while (num_spaces < indent_top(lex)) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
            if (num_spaces != indent_top(lex)) {
                // dedented to a column that was never an indentation level
                tok->kind = MP_TOKEN_DEDENT_MISMATCH;
            }
        }

    } else if (is_end(lex)) {
        if (indent_top(lex) > 0) {
            // close all open indentation levels before reporting the end
            tok->kind = MP_TOKEN_NEWLINE;
            lex->emit_dent = 0;
            while (indent_top(lex) > 0) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
        } else {
            tok->kind = MP_TOKEN_END;
        }

    } else if (is_char_or(lex, '\'', '\"')
               || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
               || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
        // a string or bytes literal

        // parse type codes
        bool is_raw = false;
        bool is_bytes = false;
        if (is_char(lex, 'u')) {
            next_char(lex);
        } else if (is_char(lex, 'b')) {
            is_bytes = true;
            next_char(lex);
            if (is_char(lex, 'r')) {
                is_raw = true;
                next_char(lex);
            }
        } else if (is_char(lex, 'r')) {
            is_raw = true;
            next_char(lex);
            if (is_char(lex, 'b')) {
                is_bytes = true;
                next_char(lex);
            }
        }

        // set token kind
        if (is_bytes) {
            tok->kind = MP_TOKEN_BYTES;
        } else {
            tok->kind = MP_TOKEN_STRING;
        }

        // get first quoting character
        char quote_char = '\'';
        if (is_char(lex, '\"')) {
            quote_char = '\"';
        }
        next_char(lex);

        // work out if it's a single or triple quoted literal
        int num_quotes;
        if (is_char_and(lex, quote_char, quote_char)) {
            // triple quotes
            next_char(lex);
            next_char(lex);
            num_quotes = 3;
        } else {
            // single quotes
            num_quotes = 1;
        }

        // parse the literal
        int n_closing = 0;
        while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
            if (is_char(lex, quote_char)) {
                // possibly part of the closing quotes; keep it in the text for
                // now, the closing run is cut off again after the loop
                n_closing += 1;
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
            } else {
                n_closing = 0;
                if (is_char(lex, '\\')) {
                    next_char(lex);
                    unichar c = CUR_CHAR(lex);
                    if (is_raw) {
                        // raw strings allow escaping of quotes, but the backslash is also emitted
                        vstr_add_char(&lex->vstr, '\\');
                    } else {
                        switch (c) {
                            case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
                            case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it)
                            case '\\': break;
                            case '\'': break;
                            case '"': break;
                            case 'a': c = 0x07; break;
                            case 'b': c = 0x08; break;
                            case 't': c = 0x09; break;
                            case 'n': c = 0x0a; break;
                            case 'v': c = 0x0b; break;
                            case 'f': c = 0x0c; break;
                            case 'r': c = 0x0d; break;
                            case 'u':
                            case 'U':
                                if (is_bytes) {
                                    // b'\u1234' == b'\\u1234'
                                    vstr_add_char(&lex->vstr, '\\');
                                    break;
                                }
                                // Otherwise fall through.
                            case 'x':
                            {
                                uint num = 0;
                                if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
                                    // not enough hex digits for the escape sequence
                                    tok->kind = MP_TOKEN_INVALID;
                                }
                                c = num;
                                break;
                            }
                            case 'N':
                                // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
                                // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
                                // 3MB of text; even gzip-compressed and with minimal structure, it'll take
                                // roughly half a meg of storage. This form of Unicode escape may be added
                                // later on, but it's definitely not a priority right now. -- CJA 20140607
                                // Until then the literal is reported as an invalid token.
                                tok->kind = MP_TOKEN_INVALID;
                                break;
                            default:
                                if (c >= '0' && c <= '7') {
                                    // Octal sequence, 1-3 chars
                                    int digits = 3;
                                    int num = c - '0';
                                    while (is_following_odigit(lex) && --digits != 0) {
                                        next_char(lex);
                                        num = num * 8 + (CUR_CHAR(lex) - '0');
                                    }
                                    c = num;
                                } else {
                                    // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
                                    vstr_add_char(&lex->vstr, '\\');
                                }
                                break;
                        }
                    }
                    if (c != MP_LEXER_CHAR_EOF) {
                        if (c < 0x110000 && !is_bytes) {
                            vstr_add_char(&lex->vstr, c);
                        } else if (c < 0x100 && is_bytes) {
                            vstr_add_byte(&lex->vstr, c);
                        } else {
                            // escape value does not fit this kind of literal
                            tok->kind = MP_TOKEN_INVALID;
                        }
                    }
                } else {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                }
            }
            next_char(lex);
        }

        // check we got the required end quotes
        if (n_closing < num_quotes) {
            tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
        }

        // cut off the end quotes from the token text
        vstr_cut_tail_bytes(&lex->vstr, n_closing);

    } else if (is_head_of_identifier(lex)) {
        tok->kind = MP_TOKEN_NAME;

        // get first char
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex) && is_tail_of_identifier(lex)) {
            vstr_add_char(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
        }

    } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
        tok->kind = MP_TOKEN_NUMBER;

        // get first char
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex)) {
            if (is_char_or(lex, 'e', 'E')) {
                // exponent marker is normalised to lower case and may carry a sign
                vstr_add_char(&lex->vstr, 'e');
                next_char(lex);
                if (is_char(lex, '+') || is_char(lex, '-')) {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                    next_char(lex);
                }
            } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                next_char(lex);
            } else {
                break;
            }
        }

    } else if (is_char(lex, '.')) {
        // special handling for . and ... operators, because .. is not a valid operator

        // get first char
        vstr_add_char(&lex->vstr, '.');
        next_char(lex);

        if (is_char_and(lex, '.', '.')) {
            vstr_add_char(&lex->vstr, '.');
            vstr_add_char(&lex->vstr, '.');
            next_char(lex);
            next_char(lex);
            tok->kind = MP_TOKEN_ELLIPSIS;
        } else {
            tok->kind = MP_TOKEN_DEL_PERIOD;
        }

    } else {
        // search for encoded delimiter or operator

        // walk tok_enc looking for an entry that begins with the current
        // character, counting how many tokens were skipped on the way
        const char *t = tok_enc;
        uint tok_enc_index = 0;
        for (; *t != 0 && !is_char(lex, *t); t += 1) {
            if (*t == 'e' || *t == 'c') {
                t += 1;
            } else if (*t == 'E') {
                tok_enc_index -= 1;
                t += 1;
            }
            tok_enc_index += 1;
        }

        next_char(lex);

        if (*t == 0) {
            // didn't match any delimiter or operator characters
            tok->kind = MP_TOKEN_INVALID;

        } else {
            // matched a delimiter or operator character

            // get the maximum characters for a valid token
            t += 1;
            uint t_index = tok_enc_index;
            for (;;) {
                for (; *t == 'e'; t += 1) {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        break;
                    }
                }

                if (*t == 'E') {
                    t += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                    } else {
                        tok->kind = MP_TOKEN_INVALID;
                        goto tok_enc_no_match;
                    }
                    break;
                }

                if (*t == 'c') {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        t += 1;
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }

            // set token kind
            tok->kind = tok_enc_kind[tok_enc_index];

            tok_enc_no_match:

            // compute bracket level for implicit line joining
            if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
                lex->nested_bracket_level += 1;
            } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
                lex->nested_bracket_level -= 1;
            }
        }
    }

    // point token text to vstr buffer
    tok->str = vstr_str(&lex->vstr);
    tok->len = vstr_len(&lex->vstr);

    // check for keywords
    if (tok->kind == MP_TOKEN_NAME) {
        // We check for __debug__ here and convert it to its value.  This is so
        // the parser gives a syntax error on, eg, x.__debug__.  Otherwise, we
        // need to check for this special token in many places in the compiler.
        // TODO improve speed of these string comparisons
        // (unsigned index: avoids a signed/unsigned comparison with the array size)
        for (uint i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
            if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
                if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
                    // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
                    tok->kind = (mp_optimise_value == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
                } else {
                    tok->kind = MP_TOKEN_KW_FALSE + i;
                }
                break;
            }
        }
    }
}

730
// Create a lexer reading from the given stream and scan its first token.
//
//   src_name         - name of the source, stored for later retrieval
//   stream_data      - opaque pointer handed to both callbacks
//   stream_next_char - callback returning the next character or MP_LEXER_CHAR_EOF
//   stream_close     - callback releasing the stream; may be NULL
//
// Returns the new lexer, or NULL when memory could not be allocated.  In the
// failure case the stream has already been closed (if stream_close was given).
mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
    mp_lexer_t *lex = m_new_maybe(mp_lexer_t, 1);
    if (lex == NULL) {
        // no lexer to own the stream, so release it here
        if (stream_close) {
            stream_close(stream_data);
        }
        return NULL;
    }

    // stream description
    lex->source_name = src_name;
    lex->stream_data = stream_data;
    lex->stream_next_char = stream_next_char;
    lex->stream_close = stream_close;

    // position and layout state
    lex->line = 1;
    lex->column = 1;
    lex->emit_dent = 0;
    lex->nested_bracket_level = 0;

    // indent stack and token text buffer
    lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
    lex->num_indent_level = 1;
    lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
    vstr_init(&lex->vstr, 32);

    if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
        // allocation failed; mp_lexer_free also closes the stream
        mp_lexer_free(lex);
        return NULL;
    }

    // the bottom of the indent stack is a sentinel for column 0
    lex->indent_level[0] = 0;

    // fill the three-character look-ahead window
    lex->chr0 = stream_next_char(stream_data);
    lex->chr1 = stream_next_char(stream_data);
    lex->chr2 = stream_next_char(stream_data);

    // a stream of 0, 1 or 2 characters that does not end in a newline gets
    // one appended inside the window
    if (lex->chr0 == MP_LEXER_CHAR_EOF) {
        lex->chr0 = '\n';
    } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
        if (!(lex->chr0 == '\n' || lex->chr0 == '\r')) {
            lex->chr1 = '\n';
        }
    } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
        if (!(lex->chr1 == '\n' || lex->chr1 == '\r')) {
            lex->chr2 = '\n';
        }
    }

    // scan the first token so that a current token is always available
    mp_lexer_next_token_into(lex, &lex->tok_cur, true);

    return lex;
}

787
// Release a lexer: close its stream (when a close callback was supplied),
// then free the token text buffer, the indent stack and the lexer itself.
// Passing NULL is allowed and does nothing.
void mp_lexer_free(mp_lexer_t *lex) {
    if (!lex) {
        return;
    }

    if (lex->stream_close) {
        lex->stream_close(lex->stream_data);
    }
    vstr_clear(&lex->vstr);
    m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
    m_del_obj(mp_lexer_t, lex);
}

798
799
800
801
// Return the source name the lexer was created with.
qstr mp_lexer_source_name(mp_lexer_t *lex) {
    qstr name = lex->source_name;
    return name;
}

802
803
// Advance the lexer: scan the next token and make it the current one.
void mp_lexer_to_next(mp_lexer_t *lex) {
    const bool is_first_token = false;
    mp_lexer_next_token_into(lex, &lex->tok_cur, is_first_token);
}

806
const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien's avatar
Damien committed
807
808
809
    return &lex->tok_cur;
}

810
// True when the lexer's current token has the given kind.
bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
    if (lex->tok_cur.kind != kind) {
        return false;
    }
    return true;
}