Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
TASTE
uPython-mirror
Commits
b1b84055
Commit
b1b84055
authored
Jun 28, 2014
by
Damien George
Browse files
Merge branch 'unicode'
parents
8993fb6c
635b60e2
Changes
22
Hide whitespace changes
Inline
Side-by-side
py/builtin.c
View file @
b1b84055
...
...
@@ -172,13 +172,40 @@ STATIC mp_obj_t mp_builtin_callable(mp_obj_t o_in) {
MP_DEFINE_CONST_FUN_OBJ_1
(
mp_builtin_callable_obj
,
mp_builtin_callable
);
// chr(i): build a one-character str from the integer in o_in.
//
// With MICROPY_PY_BUILTINS_STR_UNICODE enabled the value is treated as a code
// point and encoded as 1 to 4 UTF-8 bytes.  Without it, a single char is
// stored.  In both configurations a value outside 0..0x10ffff raises
// ValueError.
STATIC mp_obj_t mp_builtin_chr(mp_obj_t o_in) {
#if MICROPY_PY_BUILTINS_STR_UNICODE
    machine_int_t c = mp_obj_get_int(o_in);

    // Reject out-of-range values up front.  Negative values must be caught
    // explicitly: as a signed quantity they would otherwise satisfy the
    // "c < 0x80" test below and be emitted as a bogus single byte.
    if (c < 0 || c >= 0x110000) {
        nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "chr() arg not in range(0x110000)"));
    }

    char str[4];
    int len = 0;
    if (c < 0x80) {
        // 1 byte: 0xxxxxxx
        str[0] = c;
        len = 1;
    } else if (c < 0x800) {
        // 2 bytes: 110xxxxx 10xxxxxx
        str[0] = (c >> 6) | 0xC0;
        str[1] = (c & 0x3F) | 0x80;
        len = 2;
    } else if (c < 0x10000) {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        str[0] = (c >> 12) | 0xE0;
        str[1] = ((c >> 6) & 0x3F) | 0x80;
        str[2] = (c & 0x3F) | 0x80;
        len = 3;
    } else {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        str[0] = (c >> 18) | 0xF0;
        str[1] = ((c >> 12) & 0x3F) | 0x80;
        str[2] = ((c >> 6) & 0x3F) | 0x80;
        str[3] = (c & 0x3F) | 0x80;
        len = 4;
    }
    return mp_obj_new_str(str, len, true);
#else
    machine_int_t ord = mp_obj_get_int(o_in);
    if (0 <= ord && ord <= 0x10ffff) {
        // NOTE(review): the accepted range is wider than what one char can
        // hold, so values above 0xff are narrowed when stored here - confirm
        // whether that is intended for non-unicode builds.
        char str[1] = {ord};
        return mp_obj_new_str(str, 1, true);
    } else {
        nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "chr() arg not in range(0x110000)"));
    }
#endif
}
MP_DEFINE_CONST_FUN_OBJ_1
(
mp_builtin_chr_obj
,
mp_builtin_chr
);
...
...
@@ -344,13 +371,32 @@ MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_oct_obj, mp_builtin_oct);
// ord(c): return the integer value of a one-item str or bytes object.
//
// With MICROPY_PY_BUILTINS_STR_UNICODE enabled, a str argument is measured in
// UTF-8 characters and a multi-byte character is decoded to its code point.
// Any other argument is measured in bytes and its single byte is returned
// without sign extension.  Raises TypeError when the argument does not
// contain exactly one item.
STATIC mp_obj_t mp_builtin_ord(mp_obj_t o_in) {
    uint len;
    const char *str = mp_obj_str_get_data(o_in, &len);
#if MICROPY_PY_BUILTINS_STR_UNICODE
    // Only str objects are counted in UTF-8 characters.  For anything else the
    // length is the byte count, so a multi-byte bytes object is never
    // mistaken for a single character.
    uint charlen = MP_OBJ_IS_STR(o_in) ? unichar_charlen(str, len) : len;
    if (charlen == 1) {
        if (MP_OBJ_IS_STR(o_in) && UTF8_IS_NONASCII(*str)) {
            // Lead byte: drop the top bit, then clear the remaining run of
            // leading 1 bits so that only the payload bits are left.
            machine_int_t ord = *str++ & 0x7F;
            for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) {
                ord &= ~mask;
            }
            // Each continuation byte contributes its low 6 bits.
            while (UTF8_IS_CONT(*str)) {
                ord = (ord << 6) | (*str++ & 0x3F);
            }
            return mp_obj_new_int(ord);
        } else {
            // don't sign extend when converting to ord
            return mp_obj_new_int(((const byte*)str)[0]);
        }
    } else {
        nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "ord() expected a character, but string of length %d found", charlen));
    }
#else
    if (len == 1) {
        // don't sign extend when converting to ord
        // TODO unicode
        return mp_obj_new_int(((const byte*)str)[0]);
    } else {
        nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "ord() expected a character, but string of length %d found", len));
    }
#endif
}
MP_DEFINE_CONST_FUN_OBJ_1
(
mp_builtin_ord_obj
,
mp_builtin_ord
);
...
...
py/lexer.c
View file @
b1b84055
...
...
@@ -502,19 +502,32 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
case
'v'
:
c
=
0x0b
;
break
;
case
'f'
:
c
=
0x0c
;
break
;
case
'r'
:
c
=
0x0d
;
break
;
case
'u'
:
case
'U'
:
if
(
is_bytes
)
{
// b'\u1234' == b'\\u1234'
vstr_add_char
(
&
lex
->
vstr
,
'\\'
);
break
;
}
// Otherwise fall through.
case
'x'
:
{
uint
num
=
0
;
if
(
!
get_hex
(
lex
,
2
,
&
num
))
{
if
(
!
get_hex
(
lex
,
(
c
==
'x'
?
2
:
c
==
'u'
?
4
:
8
)
,
&
num
))
{
// TODO error message
assert
(
0
);
}
c
=
num
;
break
;
}
case
'N'
:
break
;
// TODO \N{name} only in strings
case
'u'
:
break
;
// TODO \uxxxx only in strings
case
'U'
:
break
;
// TODO \Uxxxxxxxx only in strings
case
'N'
:
// Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
// entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
// 3MB of text; even gzip-compressed and with minimal structure, it'll take
// roughly half a meg of storage. This form of Unicode escape may be added
// later on, but it's definitely not a priority right now. -- CJA 20140607
assert
(
!
"Unicode name escapes not supported"
);
break
;
default:
if
(
c
>=
'0'
&&
c
<=
'7'
)
{
// Octal sequence, 1-3 chars
...
...
@@ -533,7 +546,13 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
}
}
if
(
c
!=
MP_LEXER_CHAR_EOF
)
{
vstr_add_char
(
&
lex
->
vstr
,
c
);
if
(
c
<
0x110000
&&
!
is_bytes
)
{
vstr_add_char
(
&
lex
->
vstr
,
c
);
}
else
if
(
c
<
0x100
&&
is_bytes
)
{
vstr_add_byte
(
&
lex
->
vstr
,
c
);
}
else
{
assert
(
!
"TODO: Throw an error, invalid escape code probably"
);
}
}
}
else
{
vstr_add_char
(
&
lex
->
vstr
,
CUR_CHAR
(
lex
));
...
...
py/misc.h
View file @
b1b84055
...
...
@@ -100,7 +100,9 @@ bool unichar_isupper(unichar c);
bool
unichar_islower
(
unichar
c
);
unichar
unichar_tolower
(
unichar
c
);
unichar
unichar_toupper
(
unichar
c
);
#define unichar_charlen(s, bytelen) (bytelen)
uint
unichar_charlen
(
const
char
*
str
,
uint
len
);
// TODO this should return machine_uint_t
#define UTF8_IS_NONASCII(ch) ((ch) & 0x80)
#define UTF8_IS_CONT(ch) (((ch) & 0xC0) == 0x80)
/** variable string *********************************************/
...
...
@@ -164,4 +166,18 @@ int DEBUG_printf(const char *fmt, ...);
extern
uint
mp_verbose_flag
;
// This is useful for unicode handling. Some CPU archs have
// special instructions for efficient implementation of this
// function (e.g. CLZ on ARM).
// NOTE: this function is unused at the moment
#ifndef count_lead_ones
// Return how many consecutive 1 bits val has, counting from its most
// significant bit (0 for 0x7f, 8 for 0xff).
static inline uint count_lead_ones(byte val) {
    uint ones = 0;
    // Slide a single-bit probe down from bit 7; stop at the first clear bit,
    // or once the probe has been shifted out to zero.
    byte probe = 0x80;
    while (val & probe) {
        ones++;
        probe >>= 1;
    }
    return ones;
}
#endif
#endif // _INCLUDED_MINILIB_H
py/mpconfig.h
View file @
b1b84055
...
...
@@ -249,6 +249,11 @@ typedef double mp_float_t;
/*****************************************************************************/
/* Fine control over Python builtins, classes, modules, etc */
// Whether str object is proper unicode
#ifndef MICROPY_PY_BUILTINS_STR_UNICODE
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
#endif
// Whether to support bytearray object
#ifndef MICROPY_PY_BUILTINS_BYTEARRAY
#define MICROPY_PY_BUILTINS_BYTEARRAY (1)
...
...
py/obj.c
View file @
b1b84055
...
...
@@ -357,7 +357,12 @@ uint mp_get_index(const mp_obj_type_t *type, machine_uint_t len, mp_obj_t index,
// may return MP_OBJ_NULL
mp_obj_t
mp_obj_len_maybe
(
mp_obj_t
o_in
)
{
if
(
MP_OBJ_IS_STR
(
o_in
)
||
MP_OBJ_IS_TYPE
(
o_in
,
&
mp_type_bytes
))
{
if
(
#if !MICROPY_PY_BUILTINS_STR_UNICODE
// It's simple - unicode is slow, non-unicode is fast
MP_OBJ_IS_STR
(
o_in
)
||
#endif
MP_OBJ_IS_TYPE
(
o_in
,
&
mp_type_bytes
))
{
return
MP_OBJ_NEW_SMALL_INT
((
machine_int_t
)
mp_obj_str_get_len
(
o_in
));
}
else
{
mp_obj_type_t
*
type
=
mp_obj_get_type
(
o_in
);
...
...
py/objstr.c
View file @
b1b84055
...
...
@@ -32,6 +32,7 @@
#include
"mpconfig.h"
#include
"nlr.h"
#include
"misc.h"
#include
"unicode.h"
#include
"qstr.h"
#include
"obj.h"
#include
"runtime0.h"
...
...
@@ -43,16 +44,7 @@
STATIC
mp_obj_t
str_modulo_format
(
mp_obj_t
pattern
,
uint
n_args
,
const
mp_obj_t
*
args
,
mp_obj_t
dict
);
const
mp_obj_t
mp_const_empty_bytes
;
// use this macro to extract the string hash
#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; }
// use this macro to extract the string length
#define GET_STR_LEN(str_obj_in, str_len) uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; }
// use this macro to extract the string data and length
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; }
STATIC
mp_obj_t
mp_obj_new_str_iterator
(
mp_obj_t
str
);
mp_obj_t
mp_obj_new_str_iterator
(
mp_obj_t
str
);
STATIC
mp_obj_t
mp_obj_new_bytes_iterator
(
mp_obj_t
str
);
STATIC
NORETURN
void
bad_implicit_conversion
(
mp_obj_t
self_in
);
STATIC
NORETURN
void
arg_type_mixup
();
...
...
@@ -259,7 +251,7 @@ STATIC const byte *find_subbytes(const byte *haystack, machine_uint_t hlen, cons
return
NULL
;
}
STATIC
mp_obj_t
str_binary_op
(
int
op
,
mp_obj_t
lhs_in
,
mp_obj_t
rhs_in
)
{
mp_obj_t
mp_obj_
str_binary_op
(
int
op
,
mp_obj_t
lhs_in
,
mp_obj_t
rhs_in
)
{
GET_STR_DATA_LEN
(
lhs_in
,
lhs_data
,
lhs_len
);
mp_obj_type_t
*
lhs_type
=
mp_obj_get_type
(
lhs_in
);
mp_obj_type_t
*
rhs_type
=
mp_obj_get_type
(
rhs_in
);
...
...
@@ -352,11 +344,14 @@ uncomparable:
return
MP_OBJ_NULL
;
// op not supported
}
#if !MICROPY_PY_BUILTINS_STR_UNICODE
// objstrunicode defines own version
// Non-unicode variant: resolve `index` through mp_get_index() against a
// sequence of self_len items and return a pointer that many bytes into
// self_data.  The type and is_slice arguments are passed through to
// mp_get_index() unchanged.
const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, uint self_len,
                             mp_obj_t index, bool is_slice) {
    return self_data + mp_get_index(type, self_len, index, is_slice);
}
#endif
STATIC
mp_obj_t
str_subscr
(
mp_obj_t
self_in
,
mp_obj_t
index
,
mp_obj_t
value
)
{
mp_obj_type_t
*
type
=
mp_obj_get_type
(
self_in
);
...
...
@@ -571,7 +566,6 @@ STATIC mp_obj_t str_rsplit(uint n_args, const mp_obj_t *args) {
return
res
;
}
STATIC
mp_obj_t
str_finder
(
uint
n_args
,
const
mp_obj_t
*
args
,
machine_int_t
direction
,
bool
is_index
)
{
const
mp_obj_type_t
*
self_type
=
mp_obj_get_type
(
args
[
0
]);
assert
(
2
<=
n_args
&&
n_args
<=
4
);
...
...
@@ -600,6 +594,11 @@ STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t dire
}
}
else
{
// found
#if MICROPY_PY_BUILTINS_STR_UNICODE
if
(
self_type
==
&
mp_type_str
)
{
return
MP_OBJ_NEW_SMALL_INT
(
utf8_ptr_to_index
(
haystack
,
p
));
}
#endif
return
MP_OBJ_NEW_SMALL_INT
(
p
-
haystack
);
}
}
...
...
@@ -1449,7 +1448,7 @@ STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
// if needle_len is zero then we count each gap between characters as an occurrence
if
(
needle_len
==
0
)
{
return
MP_OBJ_NEW_SMALL_INT
(
unichar_charlen
((
const
char
*
)
start
,
end
-
start
)
+
1
);
return
MP_OBJ_NEW_SMALL_INT
(
(
machine_uint_t
)
unichar_charlen
((
const
char
*
)
start
,
end
-
start
)
+
1
);
}
// count the occurrences
...
...
@@ -1610,7 +1609,7 @@ STATIC mp_obj_t str_encode(uint n_args, const mp_obj_t *args) {
}
#endif
STATIC
machine_int_t
str_get_buffer
(
mp_obj_t
self_in
,
mp_buffer_info_t
*
bufinfo
,
int
flags
)
{
machine_int_t
mp_obj_
str_get_buffer
(
mp_obj_t
self_in
,
mp_buffer_info_t
*
bufinfo
,
int
flags
)
{
if
(
flags
==
MP_BUFFER_READ
)
{
GET_STR_DATA_LEN
(
self_in
,
str_data
,
str_len
);
bufinfo
->
buf
=
(
void
*
)
str_data
;
...
...
@@ -1627,38 +1626,45 @@ STATIC machine_int_t str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo,
}
#if MICROPY_CPYTHON_COMPAT
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
bytes_decode_obj
,
1
,
3
,
bytes_decode
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_encode_obj
,
1
,
3
,
str_encode
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
bytes_decode_obj
,
1
,
3
,
bytes_decode
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_encode_obj
,
1
,
3
,
str_encode
);
#endif
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_find_obj
,
2
,
4
,
str_find
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_rfind_obj
,
2
,
4
,
str_rfind
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_index_obj
,
2
,
4
,
str_index
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_rindex_obj
,
2
,
4
,
str_rindex
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_2
(
str_join_obj
,
str_join
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_split_obj
,
1
,
3
,
str_split
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_rsplit_obj
,
1
,
3
,
str_rsplit
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_startswith_obj
,
2
,
3
,
str_startswith
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_endswith_obj
,
2
,
3
,
str_endswith
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_strip_obj
,
1
,
2
,
str_strip
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_lstrip_obj
,
1
,
2
,
str_lstrip
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_rstrip_obj
,
1
,
2
,
str_rstrip
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR
(
str_format_obj
,
1
,
mp_obj_str_format
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_replace_obj
,
3
,
4
,
str_replace
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_count_obj
,
2
,
4
,
str_count
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_2
(
str_partition_obj
,
str_partition
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_2
(
str_rpartition_obj
,
str_rpartition
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_1
(
str_lower_obj
,
str_lower
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_1
(
str_upper_obj
,
str_upper
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_1
(
str_isspace_obj
,
str_isspace
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_1
(
str_isalpha_obj
,
str_isalpha
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_1
(
str_isdigit_obj
,
str_isdigit
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_1
(
str_isupper_obj
,
str_isupper
);
STATIC
MP_DEFINE_CONST_FUN_OBJ_1
(
str_islower_obj
,
str_islower
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_find_obj
,
2
,
4
,
str_find
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_rfind_obj
,
2
,
4
,
str_rfind
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_index_obj
,
2
,
4
,
str_index
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_rindex_obj
,
2
,
4
,
str_rindex
);
MP_DEFINE_CONST_FUN_OBJ_2
(
str_join_obj
,
str_join
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_split_obj
,
1
,
3
,
str_split
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_rsplit_obj
,
1
,
3
,
str_rsplit
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_startswith_obj
,
2
,
3
,
str_startswith
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_endswith_obj
,
2
,
3
,
str_endswith
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_strip_obj
,
1
,
2
,
str_strip
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_lstrip_obj
,
1
,
2
,
str_lstrip
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_rstrip_obj
,
1
,
2
,
str_rstrip
);
MP_DEFINE_CONST_FUN_OBJ_VAR
(
str_format_obj
,
1
,
mp_obj_str_format
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_replace_obj
,
3
,
4
,
str_replace
);
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN
(
str_count_obj
,
2
,
4
,
str_count
);
MP_DEFINE_CONST_FUN_OBJ_2
(
str_partition_obj
,
str_partition
);
MP_DEFINE_CONST_FUN_OBJ_2
(
str_rpartition_obj
,
str_rpartition
);
MP_DEFINE_CONST_FUN_OBJ_1
(
str_lower_obj
,
str_lower
);
MP_DEFINE_CONST_FUN_OBJ_1
(
str_upper_obj
,
str_upper
);
MP_DEFINE_CONST_FUN_OBJ_1
(
str_isspace_obj
,
str_isspace
);
MP_DEFINE_CONST_FUN_OBJ_1
(
str_isalpha_obj
,
str_isalpha
);
MP_DEFINE_CONST_FUN_OBJ_1
(
str_isdigit_obj
,
str_isdigit
);
MP_DEFINE_CONST_FUN_OBJ_1
(
str_isupper_obj
,
str_isupper
);
MP_DEFINE_CONST_FUN_OBJ_1
(
str_islower_obj
,
str_islower
);
STATIC
const
mp_map_elem_t
str_locals_dict_table
[]
=
{
#if MICROPY_CPYTHON_COMPAT
{
MP_OBJ_NEW_QSTR
(
MP_QSTR_decode
),
(
mp_obj_t
)
&
bytes_decode_obj
},
#if !MICROPY_PY_BUILTINS_STR_UNICODE
// If we have separate unicode type, then here we have methods only
// for bytes type, and it should not have encode() methods. Otherwise,
// we have non-compliant-but-practical bytestring type, which shares
// method table with bytes, so they both have encode() and decode()
// methods (which should do type checking at runtime).
{
MP_OBJ_NEW_QSTR
(
MP_QSTR_encode
),
(
mp_obj_t
)
&
str_encode_obj
},
#endif
#endif
{
MP_OBJ_NEW_QSTR
(
MP_QSTR_find
),
(
mp_obj_t
)
&
str_find_obj
},
{
MP_OBJ_NEW_QSTR
(
MP_QSTR_rfind
),
(
mp_obj_t
)
&
str_rfind_obj
},
...
...
@@ -1688,17 +1694,19 @@ STATIC const mp_map_elem_t str_locals_dict_table[] = {
STATIC
MP_DEFINE_CONST_DICT
(
str_locals_dict
,
str_locals_dict_table
);
#if !MICROPY_PY_BUILTINS_STR_UNICODE
const
mp_obj_type_t
mp_type_str
=
{
{
&
mp_type_type
},
.
name
=
MP_QSTR_str
,
.
print
=
str_print
,
.
make_new
=
str_make_new
,
.
binary_op
=
str_binary_op
,
.
binary_op
=
mp_obj_
str_binary_op
,
.
subscr
=
str_subscr
,
.
getiter
=
mp_obj_new_str_iterator
,
.
buffer_p
=
{
.
get_buffer
=
str_get_buffer
},
.
buffer_p
=
{
.
get_buffer
=
mp_obj_
str_get_buffer
},
.
locals_dict
=
(
mp_obj_t
)
&
str_locals_dict
,
};
#endif
// Reuses most of methods from str
const
mp_obj_type_t
mp_type_bytes
=
{
...
...
@@ -1706,10 +1714,10 @@ const mp_obj_type_t mp_type_bytes = {
.
name
=
MP_QSTR_bytes
,
.
print
=
str_print
,
.
make_new
=
bytes_make_new
,
.
binary_op
=
str_binary_op
,
.
binary_op
=
mp_obj_
str_binary_op
,
.
subscr
=
str_subscr
,
.
getiter
=
mp_obj_new_bytes_iterator
,
.
buffer_p
=
{
.
get_buffer
=
str_get_buffer
},
.
buffer_p
=
{
.
get_buffer
=
mp_obj_
str_get_buffer
},
.
locals_dict
=
(
mp_obj_t
)
&
str_locals_dict
,
};
...
...
@@ -1866,6 +1874,7 @@ typedef struct _mp_obj_str_it_t {
machine_uint_t
cur
;
}
mp_obj_str_it_t
;
#if !MICROPY_PY_BUILTINS_STR_UNICODE
STATIC
mp_obj_t
str_it_iternext
(
mp_obj_t
self_in
)
{
mp_obj_str_it_t
*
self
=
self_in
;
GET_STR_DATA_LEN
(
self
->
str
,
str
,
len
);
...
...
@@ -1885,6 +1894,15 @@ STATIC const mp_obj_type_t mp_type_str_it = {
.
iternext
=
str_it_iternext
,
};
mp_obj_t
mp_obj_new_str_iterator
(
mp_obj_t
str
)
{
mp_obj_str_it_t
*
o
=
m_new_obj
(
mp_obj_str_it_t
);
o
->
base
.
type
=
&
mp_type_str_it
;
o
->
str
=
str
;
o
->
cur
=
0
;
return
o
;
}
#endif
STATIC
mp_obj_t
bytes_it_iternext
(
mp_obj_t
self_in
)
{
mp_obj_str_it_t
*
self
=
self_in
;
GET_STR_DATA_LEN
(
self
->
str
,
str
,
len
);
...
...
@@ -1904,14 +1922,6 @@ STATIC const mp_obj_type_t mp_type_bytes_it = {
.
iternext
=
bytes_it_iternext
,
};
mp_obj_t
mp_obj_new_str_iterator
(
mp_obj_t
str
)
{
mp_obj_str_it_t
*
o
=
m_new_obj
(
mp_obj_str_it_t
);
o
->
base
.
type
=
&
mp_type_str_it
;
o
->
str
=
str
;
o
->
cur
=
0
;
return
o
;
}
mp_obj_t
mp_obj_new_bytes_iterator
(
mp_obj_t
str
)
{
mp_obj_str_it_t
*
o
=
m_new_obj
(
mp_obj_str_it_t
);
o
->
base
.
type
=
&
mp_type_bytes_it
;
...
...
py/objstr.h
View file @
b1b84055
...
...
@@ -35,5 +35,53 @@ typedef struct _mp_obj_str_t {
#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, (const byte*)str};
// use this macro to extract the string hash
#define GET_STR_HASH(str_obj_in, str_hash) \
uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) \
{ str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; }
// use this macro to extract the string length
#define GET_STR_LEN(str_obj_in, str_len) \
uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) \
{ str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; }
// use this macro to extract the string data and length
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) \
const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) \
{ str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } \
else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; }
mp_obj_t
mp_obj_str_format
(
uint
n_args
,
const
mp_obj_t
*
args
);
mp_obj_t
mp_obj_new_str_of_type
(
const
mp_obj_type_t
*
type
,
const
byte
*
data
,
uint
len
);
mp_obj_t
mp_obj_str_binary_op
(
int
op
,
mp_obj_t
lhs_in
,
mp_obj_t
rhs_in
);
machine_int_t
mp_obj_str_get_buffer
(
mp_obj_t
self_in
,
mp_buffer_info_t
*
bufinfo
,
int
flags
);
const
byte
*
str_index_to_ptr
(
const
mp_obj_type_t
*
type
,
const
byte
*
self_data
,
uint
self_len
,
mp_obj_t
index
,
bool
is_slice
);
MP_DECLARE_CONST_FUN_OBJ
(
str_encode_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_find_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_rfind_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_index_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_rindex_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_join_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_split_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_rsplit_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_startswith_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_endswith_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_strip_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_lstrip_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_rstrip_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_format_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_replace_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_count_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_partition_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_rpartition_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_lower_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_upper_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_isspace_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_isalpha_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_isdigit_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_isupper_obj
);
MP_DECLARE_CONST_FUN_OBJ
(
str_islower_obj
);
py/objstrunicode.c
0 → 100644
View file @
b1b84055
/*
* This file is part of the Micro Python project, http://micropython.org/
*
* The MIT License (MIT)
*
* Copyright (c) 2013, 2014 Damien P. George
* Copyright (c) 2014 Paul Sokolovsky
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include
<stdbool.h>
#include
<string.h>
#include
<assert.h>
#include
"mpconfig.h"
#include
"nlr.h"
#include
"misc.h"
#include
"qstr.h"
#include
"obj.h"
#include
"runtime0.h"
#include
"runtime.h"
#include
"pfenv.h"
#include
"objstr.h"
#include
"objlist.h"
#if MICROPY_PY_BUILTINS_STR_UNICODE
STATIC
mp_obj_t
mp_obj_new_str_iterator
(
mp_obj_t
str
);
/******************************************************************************/
/* str */
// Emit the str_len bytes at str_data through print() as a quoted, escaped
// string literal.
//
// Single quotes are used as the delimiter unless the data contains a single
// quote and no double quote, in which case double quotes are used.  The
// delimiter and backslash are backslash-escaped, characters 32..126 are
// printed as-is, newline / carriage return / tab use their short escapes, and
// everything else is written as \xNN, \uNNNN or \UNNNNNNNN depending on its
// magnitude.  print() is called once per character, so this is slow.
STATIC void uni_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
    const byte *const end = str_data + str_len;

    // Decide on the delimiter; scanning can stop at the first double quote
    // because from then on single quotes are always chosen.
    bool saw_single = false;
    bool saw_double = false;
    for (const byte *p = str_data; p < end && !saw_double; p++) {
        switch (*p) {
            case '\'':
                saw_single = true;
                break;
            case '"':
                saw_double = true;
                break;
            default:
                break;
        }
    }
    int quote_char = (saw_single && !saw_double) ? '"' : '\'';

    print(env, "%c", quote_char);
    for (const byte *p = str_data; p < end;) {
        unichar ch = utf8_get_char(p);
        p = utf8_next_char(p);

        if (ch == quote_char) {
            print(env, "\\%c", quote_char);
            continue;
        }
        switch (ch) {
            case '\\':
                print(env, "\\\\");
                break;
            case '\n':
                print(env, "\\n");
                break;
            case '\r':
                print(env, "\\r");
                break;
            case '\t':
                print(env, "\\t");
                break;
            default:
                if (32 <= ch && ch <= 126) {
                    print(env, "%c", ch);
                } else if (ch < 0x100) {
                    print(env, "\\x%02x", ch);
                } else if (ch < 0x10000) {
                    print(env, "\\u%04x", ch);
                } else {
                    print(env, "\\U%08x", ch);
                }
                break;
        }
    }
    print(env, "%c", quote_char);
}
// Print handler for str objects: for PRINT_STR the raw bytes are written
// unmodified, for every other print kind the quoted/escaped form is produced
// by uni_print_quoted().
STATIC void uni_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
    GET_STR_DATA_LEN(self_in, str_data, str_len);
    if (kind != PRINT_STR) {
        uni_print_quoted(print, env, str_data, str_len);
        return;
    }
    print(env, "%.*s", str_len, str_data);
}
// Unary operator handler for str objects.
//   MP_UNARY_OP_BOOL: true when the string holds at least one byte.
//   MP_UNARY_OP_LEN:  length as reported by unichar_charlen(), as a small int.
// Any other operator yields MP_OBJ_NULL (op not supported).
STATIC mp_obj_t uni_unary_op(int op, mp_obj_t self_in) {
    GET_STR_DATA_LEN(self_in, str_data, str_len);
    if (op == MP_UNARY_OP_BOOL) {
        return MP_BOOL(str_len != 0);
    }
    if (op == MP_UNARY_OP_LEN) {
        return MP_OBJ_NEW_SMALL_INT((machine_int_t)unichar_charlen((const char *)str_data, str_len));
    }
    return MP_OBJ_NULL; // op not supported
}
STATIC
mp_obj_t
str_make_new
(
mp_obj_t
type_in
,
uint
n_args
,
uint
n_kw
,
const
mp_obj_t
*
args
)
{
#if MICROPY_CPYTHON_COMPAT
if
(
n_kw
!=
0
)
{
mp_arg_error_unimpl_kw
();
}
#endif
switch
(
n_args
)
{
case
0
:
return
MP_OBJ_NEW_QSTR
(
MP_QSTR_
);
case
1
:
{
vstr_t
*
vstr
=
vstr_new
();
mp_obj_print_helper
((
void
(
*
)(
void
*
,
const
char
*
,
...))
vstr_printf
,
vstr
,
args
[
0
],
PRINT_STR
);
mp_obj_t
s
=
mp_obj_new_str
(
vstr
->
buf
,
vstr
->
len
,
false
);
vstr_free
(
vstr
);
return
s
;
}
case
2
:
case
3
:
{
// TODO: validate 2nd/3rd args
if
(
!
MP_OBJ_IS_TYPE
(
args
[
0
],
&
mp_type_bytes
))
{
nlr_raise
(
mp_obj_new_exception_msg
(
&
mp_type_TypeError
,
"bytes expected"
));
}
GET_STR_DATA_LEN
(
args
[
0
],
str_data
,
str_len
);
GET_STR_HASH
(
args
[
0
],
str_hash
);
mp_obj_str_t
*
o
=
mp_obj_new_str_of_type
(
&
mp_type_str
,
NULL
,
str_len
);