Commit 534b7c36 authored by Damien George
Browse files

py: Do adjacent str/bytes literal concatenation in lexer, not compiler.

It's much more efficient in RAM and code size to do implicit literal string
concatenation in the lexer, as opposed to the compiler.

RAM usage is reduced because the concatenation can be done right away in the
tokeniser by just accumulating the string/bytes literals into the lexer's
vstr.  Prior to this patch adjacent strings/bytes would create a parse tree
(one node per string/bytes) and then in the compiler a whole new chunk of
memory was allocated to store the concatenated string, which used more than
double the memory compared to just accumulating in the lexer.

This patch also significantly reduces code size:

bare-arm: -204
minimal:  -204
unix x64: -328
stmhal:   -208
esp8266:  -284
cc3200:   -224
parent 773278ec
......@@ -2301,65 +2301,6 @@ STATIC void compile_atom_expr_trailers(compiler_t *comp, mp_parse_node_struct_t
}
}
STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
    // pns holds a sequence of adjacent string/bytes literals that are joined
    // here into one constant object.

    // First sweep: work out the common literal kind (str or bytes) and the
    // combined length in bytes of all the pieces.
    int num_literals = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
    size_t total_len = 0;
    int first_kind = MP_PARSE_NODE_NULL;
    for (int idx = 0; idx < num_literals; idx++) {
        mp_parse_node_t node = pns->nodes[idx];
        int kind;
        if (!MP_PARSE_NODE_IS_LEAF(node)) {
            // struct node: nodes[0] is the data pointer, nodes[1] the length
            assert(MP_PARSE_NODE_IS_STRUCT(node));
            mp_parse_node_struct_t *literal = (mp_parse_node_struct_t*)node;
            if (MP_PARSE_NODE_STRUCT_KIND(literal) != PN_string) {
                assert(MP_PARSE_NODE_STRUCT_KIND(literal) == PN_bytes);
                kind = MP_PARSE_NODE_BYTES;
            } else {
                kind = MP_PARSE_NODE_STRING;
            }
            total_len += literal->nodes[1];
        } else {
            // leaf node: the literal is stored as a qstr
            kind = MP_PARSE_NODE_LEAF_KIND(node);
            assert(kind == MP_PARSE_NODE_STRING || kind == MP_PARSE_NODE_BYTES);
            total_len += qstr_len(MP_PARSE_NODE_LEAF_ARG(node));
        }

        // every piece after the first must be of the same kind as the first
        if (idx != 0 && kind != first_kind) {
            compile_syntax_error(comp, (mp_parse_node_t)pns, "cannot mix bytes and nonbytes literals");
            return;
        }
        first_kind = kind;
    }

    // Only the emit pass needs the real object; earlier passes get a placeholder.
    if (comp->pass != MP_PASS_EMIT) {
        EMIT_ARG(load_const_obj, mp_const_none);
        return;
    }

    // Second sweep: copy every piece, in order, into one buffer.
    vstr_t vstr;
    vstr_init_len(&vstr, total_len);
    size_t offset = 0;
    for (int idx = 0; idx < num_literals; idx++) {
        mp_parse_node_t node = pns->nodes[idx];
        if (MP_PARSE_NODE_IS_LEAF(node)) {
            size_t piece_len;
            const byte *piece = qstr_data(MP_PARSE_NODE_LEAF_ARG(node), &piece_len);
            memcpy(vstr.buf + offset, piece, piece_len);
            offset += piece_len;
        } else {
            mp_parse_node_struct_t *literal = (mp_parse_node_struct_t*)node;
            memcpy(vstr.buf + offset, (const char*)literal->nodes[0], literal->nodes[1]);
            offset += literal->nodes[1];
        }
    }

    // Build the str/bytes object from the buffer and load it as a constant.
    EMIT_ARG(load_const_obj, mp_obj_new_str_from_vstr(first_kind == MP_PARSE_NODE_STRING ? &mp_type_str : &mp_type_bytes, &vstr));
}
// pns needs to have 2 nodes, first is lhs of comprehension, second is PN_comp_for node
STATIC void compile_comprehension(compiler_t *comp, mp_parse_node_struct_t *pns, scope_kind_t kind) {
assert(MP_PARSE_NODE_STRUCT_NUM_NODES(pns) == 2);
......
......@@ -268,8 +268,7 @@ DEF_RULE_NC(power_dbl_star, and_ident(2), tok(OP_DBL_STAR), rule(factor))
// testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
// trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
DEF_RULE_NC(atom, or(11), tok(NAME), tok(INTEGER), tok(FLOAT_OR_IMAG), rule(atom_string), tok(ELLIPSIS), tok(KW_NONE), tok(KW_TRUE), tok(KW_FALSE), rule(atom_paren), rule(atom_bracket), rule(atom_brace))
DEF_RULE(atom_string, c(atom_string), one_or_more, rule(string_or_bytes))
DEF_RULE_NC(atom, or(12), tok(NAME), tok(INTEGER), tok(FLOAT_OR_IMAG), tok(STRING), tok(BYTES), tok(ELLIPSIS), tok(KW_NONE), tok(KW_TRUE), tok(KW_FALSE), rule(atom_paren), rule(atom_bracket), rule(atom_brace))
DEF_RULE_NC(string_or_bytes, or(2), tok(STRING), tok(BYTES))
DEF_RULE(atom_paren, c(atom_paren), and(3), tok(DEL_PAREN_OPEN), opt_rule(atom_2b), tok(DEL_PAREN_CLOSE))
DEF_RULE_NC(atom_2b, or(2), rule(yield_expr), rule(testlist_comp))
......
......@@ -63,11 +63,9 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}
/*
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
return lex->chr1 == c;
}
*/
STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
return lex->chr1 == c1 || lex->chr1 == c2;
......@@ -106,6 +104,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
return lex->chr1 >= '0' && lex->chr1 <= '7';
}
// Returns true when the lexer is positioned at the start of a string or bytes
// literal: a bare quote, a one-letter prefix (r, u, b) followed by a quote, or
// a two-letter prefix (rb, br) followed by a quote.
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
    // un-prefixed literal: starts directly with a quote character
    if (is_char_or(lex, '\'', '\"')) {
        return true;
    }

    // single type-code character followed by a quote
    if (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"')) {
        return true;
    }

    // two type-code characters (in either order) followed by a quote
    if (!is_char_and(lex, 'r', 'b') && !is_char_and(lex, 'b', 'r')) {
        return false;
    }
    return is_char_following_following_or(lex, '\'', '\"');
}
// to easily parse utf-8 identifiers we allow any raw byte with high bit set
STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
......@@ -272,14 +277,144 @@ STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
return true;
}
void mp_lexer_to_next(mp_lexer_t *lex) {
// start new token text
vstr_reset(&lex->vstr);
// Parse the body of a single string/bytes literal and append its decoded
// contents to lex->vstr.
//
// The lexer is expected to be positioned on the opening quote character (any
// type-code prefix has already been consumed by the caller).
//
// is_raw: when true, backslash sequences are not decoded; the backslash and
//         the character after it are both emitted.
//
// lex->tok_kind is read to tell bytes from str (it affects \u/\U handling and
// the allowed value range), and is overwritten with MP_TOKEN_INVALID on a bad
// escape or out-of-range character, or with MP_TOKEN_LONELY_STRING_OPEN when
// the closing quote(s) are missing.
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
// get first quoting character
// (anything that is not a double quote is treated as a single quote)
char quote_char = '\'';
if (is_char(lex, '\"')) {
quote_char = '\"';
}
next_char(lex);
// work out if it's a single or triple quoted literal
size_t num_quotes;
if (is_char_and(lex, quote_char, quote_char)) {
// triple quotes
next_char(lex);
next_char(lex);
num_quotes = 3;
} else {
// single quotes
num_quotes = 1;
}
// n_closing counts consecutive quote characters seen so far; the literal ends
// once it reaches num_quotes.  A newline ends the loop only for single-quoted
// literals.
size_t n_closing = 0;
while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
if (is_char(lex, quote_char)) {
// a quote is added to the text for now, since it may turn out to be part of
// the contents (e.g. inside a triple-quoted literal); the closing run is
// cut off after the loop
n_closing += 1;
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
} else {
n_closing = 0;
if (is_char(lex, '\\')) {
// step past the backslash; c is the character being escaped
next_char(lex);
unichar c = CUR_CHAR(lex);
if (is_raw) {
// raw strings allow escaping of quotes, but the backslash is also emitted
vstr_add_char(&lex->vstr, '\\');
} else {
switch (c) {
// note: "c" can never be MP_LEXER_EOF because next_char
// always inserts a newline at the end of the input stream
case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
case '\\': break;
case '\'': break;
case '"': break;
case 'a': c = 0x07; break;
case 'b': c = 0x08; break;
case 't': c = 0x09; break;
case 'n': c = 0x0a; break;
case 'v': c = 0x0b; break;
case 'f': c = 0x0c; break;
case 'r': c = 0x0d; break;
case 'u':
case 'U':
if (lex->tok_kind == MP_TOKEN_BYTES) {
// b'\u1234' == b'\\u1234'
vstr_add_char(&lex->vstr, '\\');
break;
}
// Otherwise fall through.
case 'x':
{
// \x takes 2 hex digits, \u takes 4 and \U takes 8
mp_uint_t num = 0;
if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
// not enough hex chars for escape sequence
lex->tok_kind = MP_TOKEN_INVALID;
}
// note: c is replaced by num even when get_hex reported failure
c = num;
break;
}
case 'N':
// Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
// entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
// 3MB of text; even gzip-compressed and with minimal structure, it'll take
// roughly half a meg of storage. This form of Unicode escape may be added
// later on, but it's definitely not a priority right now. -- CJA 20140607
mp_not_implemented("unicode name escapes");
break;
default:
if (c >= '0' && c <= '7') {
// Octal sequence, 1-3 chars
// (the first digit is already in c; the loop takes at most two more)
mp_uint_t digits = 3;
mp_uint_t num = c - '0';
while (is_following_odigit(lex) && --digits != 0) {
next_char(lex);
num = num * 8 + (CUR_CHAR(lex) - '0');
}
c = num;
} else {
// unrecognised escape character; CPython lets this through verbatim as '\' and then the character
vstr_add_char(&lex->vstr, '\\');
}
break;
}
}
// MP_LEXER_EOF is used above as the marker for "emit nothing"
if (c != MP_LEXER_EOF) {
if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
vstr_add_char(&lex->vstr, c);
} else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
vstr_add_byte(&lex->vstr, c);
} else {
// unicode character out of range
// this raises a generic SyntaxError; could provide more info
lex->tok_kind = MP_TOKEN_INVALID;
}
} else {
// without unicode everything is just added as an 8-bit byte
if (c < 0x100) {
vstr_add_byte(&lex->vstr, c);
} else {
// 8-bit character out of range
// this raises a generic SyntaxError; could provide more info
lex->tok_kind = MP_TOKEN_INVALID;
}
}
}
} else {
// Add the "character" as a byte so that we remain 8-bit clean.
// This way, strings are parsed correctly whether or not they contain utf-8 chars.
vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
}
}
next_char(lex);
}
// check we got the required end quotes
if (n_closing < num_quotes) {
lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
}
// cut off the end quotes from the token text
// (n_closing is the number of trailing quote characters added by the loop)
vstr_cut_tail_bytes(&lex->vstr, n_closing);
}
STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
bool had_physical_newline = false;
while (!is_end(lex)) {
if (is_physical_newline(lex)) {
if (stop_at_newline && lex->nested_bracket_level == 0) {
break;
}
had_physical_newline = true;
next_char(lex);
} else if (is_whitespace(lex)) {
......@@ -298,6 +433,15 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
break;
}
}
return had_physical_newline;
}
void mp_lexer_to_next(mp_lexer_t *lex) {
// start new token text
vstr_reset(&lex->vstr);
// skip white space and comments
bool had_physical_newline = skip_whitespace(lex, false);
// set token source information
lex->tok_line = lex->line;
......@@ -332,168 +476,65 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
} else if (is_end(lex)) {
lex->tok_kind = MP_TOKEN_END;
} else if (is_char_or(lex, '\'', '\"')
|| (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
|| ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
} else if (is_string_or_bytes(lex)) {
// a string or bytes literal
// parse type codes
bool is_raw = false;
bool is_bytes = false;
if (is_char(lex, 'u')) {
next_char(lex);
} else if (is_char(lex, 'b')) {
is_bytes = true;
next_char(lex);
if (is_char(lex, 'r')) {
is_raw = true;
next_char(lex);
}
} else if (is_char(lex, 'r')) {
is_raw = true;
next_char(lex);
if (is_char(lex, 'b')) {
is_bytes = true;
next_char(lex);
}
}
// Python requires adjacent string/bytes literals to be automatically
// concatenated. We do it here in the tokeniser to make efficient use of RAM,
// because then the lexer's vstr can be used to accumulate the string literal,
// in contrast to creating a parse tree of strings and then joining them later
// in the compiler. It's also more compact in code size to do it here.
// set token kind
if (is_bytes) {
lex->tok_kind = MP_TOKEN_BYTES;
} else {
lex->tok_kind = MP_TOKEN_STRING;
}
// MP_TOKEN_END is used to indicate that this is the first string token
lex->tok_kind = MP_TOKEN_END;
// get first quoting character
char quote_char = '\'';
if (is_char(lex, '\"')) {
quote_char = '\"';
}
next_char(lex);
// Loop to accumulate string/bytes literals
do {
// parse type codes
bool is_raw = false;
mp_token_kind_t kind = MP_TOKEN_STRING;
int n_char = 0;
if (is_char(lex, 'u')) {
n_char = 1;
} else if (is_char(lex, 'b')) {
kind = MP_TOKEN_BYTES;
n_char = 1;
if (is_char_following(lex, 'r')) {
is_raw = true;
n_char = 2;
}
} else if (is_char(lex, 'r')) {
is_raw = true;
n_char = 1;
if (is_char_following(lex, 'b')) {
kind = MP_TOKEN_BYTES;
n_char = 2;
}
}
// work out if it's a single or triple quoted literal
mp_uint_t num_quotes;
if (is_char_and(lex, quote_char, quote_char)) {
// triple quotes
next_char(lex);
next_char(lex);
num_quotes = 3;
} else {
// single quotes
num_quotes = 1;
}
// Set or check token kind
if (lex->tok_kind == MP_TOKEN_END) {
lex->tok_kind = kind;
} else if (lex->tok_kind != kind) {
// Can't concatenate string with bytes
break;
}
// parse the literal
mp_uint_t n_closing = 0;
while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
if (is_char(lex, quote_char)) {
n_closing += 1;
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
} else {
n_closing = 0;
if (is_char(lex, '\\')) {
// Skip any type code characters
if (n_char != 0) {
next_char(lex);
if (n_char == 2) {
next_char(lex);
unichar c = CUR_CHAR(lex);
if (is_raw) {
// raw strings allow escaping of quotes, but the backslash is also emitted
vstr_add_char(&lex->vstr, '\\');
} else {
switch (c) {
// note: "c" can never be MP_LEXER_EOF because next_char
// always inserts a newline at the end of the input stream
case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
case '\\': break;
case '\'': break;
case '"': break;
case 'a': c = 0x07; break;
case 'b': c = 0x08; break;
case 't': c = 0x09; break;
case 'n': c = 0x0a; break;
case 'v': c = 0x0b; break;
case 'f': c = 0x0c; break;
case 'r': c = 0x0d; break;
case 'u':
case 'U':
if (is_bytes) {
// b'\u1234' == b'\\u1234'
vstr_add_char(&lex->vstr, '\\');
break;
}
// Otherwise fall through.
case 'x':
{
mp_uint_t num = 0;
if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
// not enough hex chars for escape sequence
lex->tok_kind = MP_TOKEN_INVALID;
}
c = num;
break;
}
case 'N':
// Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
// entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
// 3MB of text; even gzip-compressed and with minimal structure, it'll take
// roughly half a meg of storage. This form of Unicode escape may be added
// later on, but it's definitely not a priority right now. -- CJA 20140607
mp_not_implemented("unicode name escapes");
break;
default:
if (c >= '0' && c <= '7') {
// Octal sequence, 1-3 chars
mp_uint_t digits = 3;
mp_uint_t num = c - '0';
while (is_following_odigit(lex) && --digits != 0) {
next_char(lex);
num = num * 8 + (CUR_CHAR(lex) - '0');
}
c = num;
} else {
// unrecognised escape character; CPython lets this through verbatim as '\' and then the character
vstr_add_char(&lex->vstr, '\\');
}
break;
}
}
if (c != MP_LEXER_EOF) {
if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
if (c < 0x110000 && !is_bytes) {
vstr_add_char(&lex->vstr, c);
} else if (c < 0x100 && is_bytes) {
vstr_add_byte(&lex->vstr, c);
} else {
// unicode character out of range
// this raises a generic SyntaxError; could provide more info
lex->tok_kind = MP_TOKEN_INVALID;
}
} else {
// without unicode everything is just added as an 8-bit byte
if (c < 0x100) {
vstr_add_byte(&lex->vstr, c);
} else {
// 8-bit character out of range
// this raises a generic SyntaxError; could provide more info
lex->tok_kind = MP_TOKEN_INVALID;
}
}
}
} else {
// Add the "character" as a byte so that we remain 8-bit clean.
// This way, strings are parsed correctly whether or not they contain utf-8 chars.
vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
}
}
next_char(lex);
}
// check we got the required end quotes
if (n_closing < num_quotes) {
lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
}
// Parse the literal
parse_string_literal(lex, is_raw);
// Skip whitespace so we can check if there's another string following
skip_whitespace(lex, true);
// cut off the end quotes from the token text
vstr_cut_tail_bytes(&lex->vstr, n_closing);
} while (is_string_or_bytes(lex));
} else if (is_head_of_identifier(lex)) {
lex->tok_kind = MP_TOKEN_NAME;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment