Commit 4c81ba80 authored by Damien George's avatar Damien George
Browse files

py: Never intern data of large string/bytes object; add relevant tests.

Previously to this patch all constant string/bytes objects were
interned by the compiler, and this lead to crashes when the qstr was too
long (noticeable now that qstr length storage defaults to 1 byte).

With this patch, long string/bytes objects are never interned, and are
referenced directly as constant objects within generated code using
load_const_obj.
parent dab13851
......@@ -46,6 +46,7 @@ typedef enum {
#undef DEF_RULE
PN_maximum_number_of,
PN_string, // special node for non-interned string
PN_bytes, // special node for non-interned bytes
} pn_kind_t;
#define EMIT(fun) (comp->emit_method_table->fun(comp->emit))
......@@ -172,6 +173,7 @@ STATIC mp_parse_node_t fold_constants(compiler_t *comp, mp_parse_node_t pn, mp_m
break;
#endif
case PN_string:
case PN_bytes:
return pn;
}
......@@ -427,6 +429,9 @@ STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) {
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
return true;
}
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes)) {
return true;
}
if (!MP_PARSE_NODE_IS_LEAF(pn)) {
return false;
}
......@@ -475,9 +480,9 @@ STATIC void cpython_c_print_quoted_str(vstr_t *vstr, const char *str, uint len,
}
STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vstr_t *vstr) {
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string) || MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes)) {
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], false);
cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes));
return;
}
......@@ -2151,7 +2156,8 @@ STATIC void compile_expr_stmt(compiler_t *comp, mp_parse_node_struct_t *pns) {
} else {
// for non-REPL, evaluate then discard the expression
if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0]))
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) {
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_bytes)) {
// do nothing with a lonely constant
} else {
compile_node(comp, pns->nodes[0]); // just an expression
......@@ -2595,8 +2601,12 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
} else {
assert(MP_PARSE_NODE_IS_STRUCT(pns->nodes[i]));
mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i];
assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string);
pn_kind = MP_PARSE_NODE_STRING;
if (MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string) {
pn_kind = MP_PARSE_NODE_STRING;
} else {
assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_bytes);
pn_kind = MP_PARSE_NODE_BYTES;
}
n_bytes += (mp_uint_t)pns_string->nodes[1];
}
if (i == 0) {
......@@ -2608,8 +2618,8 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
}
// concatenate string/bytes
byte *q_ptr;
byte *s_dest = qstr_build_start(n_bytes, &q_ptr);
byte *s_dest;
mp_obj_t obj = mp_obj_str_builder_start(string_kind == MP_PARSE_NODE_STRING ? &mp_type_str : &mp_type_bytes, n_bytes, &s_dest);
for (int i = 0; i < n; i++) {
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) {
mp_uint_t s_len;
......@@ -2622,9 +2632,7 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
s_dest += (mp_uint_t)pns_string->nodes[1];
}
}
qstr q = qstr_build_end(q_ptr);
EMIT_ARG(load_const_str, q, string_kind == MP_PARSE_NODE_BYTES);
EMIT_ARG(load_const_obj, mp_obj_str_builder_end(obj));
}
// pns needs to have 2 nodes, first is lhs of comprehension, second is PN_comp_for node
......@@ -2959,7 +2967,9 @@ STATIC void compile_node(compiler_t *comp, mp_parse_node_t pn) {
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
EMIT_ARG(set_line_number, pns->source_line);
if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_string) {
EMIT_ARG(load_const_str, qstr_from_strn((const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1]), false);
EMIT_ARG(load_const_obj, mp_obj_new_str((const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], false));
} else if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_bytes) {
EMIT_ARG(load_const_obj, mp_obj_new_bytes((const byte*)pns->nodes[0], (mp_uint_t)pns->nodes[1]));
} else {
compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)];
if (f == NULL) {
......
......@@ -70,6 +70,7 @@ enum {
#undef DEF_RULE
RULE_maximum_number_of,
RULE_string, // special node for non-interned string
RULE_bytes, // special node for non-interned bytes
};
#define ident (RULE_ACT_ALLOW_IDENT)
......@@ -176,7 +177,7 @@ void mp_parse_node_free(mp_parse_node_t pn) {
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
mp_uint_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
mp_uint_t rule_id = MP_PARSE_NODE_STRUCT_KIND(pns);
if (rule_id == RULE_string) {
if (rule_id == RULE_string || rule_id == RULE_bytes) {
m_del(char, (char*)pns->nodes[0], (mp_uint_t)pns->nodes[1]);
} else {
bool adjust = ADD_BLANK_NODE(rules[rule_id]);
......@@ -225,6 +226,8 @@ void mp_parse_node_print(mp_parse_node_t pn, mp_uint_t indent) {
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_string) {
printf("literal str(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
} else if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_bytes) {
printf("literal bytes(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
} else {
mp_uint_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
#ifdef USE_RULE_NAME
......@@ -281,14 +284,14 @@ STATIC void push_result_node(parser_t *parser, mp_parse_node_t pn) {
parser->result_stack[parser->result_stack_top++] = pn;
}
STATIC void push_result_string(parser_t *parser, mp_uint_t src_line, const char *str, mp_uint_t len) {
STATIC void push_result_string_bytes(parser_t *parser, mp_uint_t src_line, mp_uint_t rule_kind, const char *str, mp_uint_t len) {
mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, 2);
if (pn == NULL) {
memory_error(parser);
return;
}
pn->source_line = src_line;
pn->kind_num_nodes = RULE_string | (2 << 8);
pn->kind_num_nodes = rule_kind | (2 << 8);
char *p = m_new(char, len);
memcpy(p, str, len);
pn->nodes[0] = (mp_int_t)p;
......@@ -340,8 +343,8 @@ STATIC void push_result_token(parser_t *parser) {
} else {
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_INTEGER, qstr_from_strn(str, len));
}
} else if (lex->tok_kind == MP_TOKEN_STRING) {
// Don't automatically intern all strings. doc strings (which are usually large)
} else if (lex->tok_kind == MP_TOKEN_STRING || lex->tok_kind == MP_TOKEN_BYTES) {
// Don't automatically intern all strings/bytes. doc strings (which are usually large)
// will be discarded by the compiler, and so we shouldn't intern them.
qstr qst = MP_QSTR_NULL;
if (lex->vstr.len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) {
......@@ -353,14 +356,12 @@ STATIC void push_result_token(parser_t *parser) {
}
if (qst != MP_QSTR_NULL) {
// qstr exists, make a leaf node
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qst);
pn = mp_parse_node_new_leaf(lex->tok_kind == MP_TOKEN_STRING ? MP_PARSE_NODE_STRING : MP_PARSE_NODE_BYTES, qst);
} else {
// not interned, make a node holding a pointer to the string data
push_result_string(parser, lex->tok_line, lex->vstr.buf, lex->vstr.len);
// not interned, make a node holding a pointer to the string/bytes data
push_result_string_bytes(parser, lex->tok_line, lex->tok_kind == MP_TOKEN_STRING ? RULE_string : RULE_bytes, lex->vstr.buf, lex->vstr.len);
return;
}
} else if (lex->tok_kind == MP_TOKEN_BYTES) {
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_BYTES, qstr_from_strn(lex->vstr.buf, lex->vstr.len));
} else {
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_TOKEN, lex->tok_kind);
}
......
......@@ -148,6 +148,7 @@ qstr qstr_from_str(const char *str) {
}
qstr qstr_from_strn(const char *str, mp_uint_t len) {
assert(len < (1 << (8 * MICROPY_QSTR_BYTES_IN_LEN)));
qstr q = qstr_find_strn(str, len);
if (q == 0) {
mp_uint_t hash = qstr_compute_hash((const byte*)str, len);
......
b1 = b"long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes"
b2 = b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes"
s1 = "long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string"
s2 = "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string"
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment