Message ID | 20211102054537.1137247-1-maskray@google.com |
---|---|
State | New |
Headers | show |
Series | [v2] regex: Unnest nested functions in regcomp.c | expand |
On 02/11/2021 02:45, Fangrui Song via Libc-alpha wrote: > This refactor moves four functions out of a nested scope and converts > them into static always_inline functions. collseqwc, table_size, > symb_table, extra are now initialized to zero because they are passed as > function arguments. > > On x86-64, .text is 16 byte larger likely due to the 4 stores. > This is nothing compared to the amount of work that regcomp has to do > looking up the collation weights, or other functions. > > If the non-buildable `sysdeps/generic/dl-machine.h` doesn't count, > this patch removes the last `auto inline` usage from glibc. LGTM, thanks. Carlos might want to ack as well. Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org> > --- > v1 -> v2: > * Update commit message > --- > posix/regcomp.c | 514 +++++++++++++++++++++++++----------------------- > 1 file changed, 266 insertions(+), 248 deletions(-) > > diff --git a/posix/regcomp.c b/posix/regcomp.c > index 887e5b5068..e2b5c2807a 100644 > --- a/posix/regcomp.c > +++ b/posix/regcomp.c > @@ -2831,6 +2831,263 @@ build_collating_symbol (bitset_t sbcset, const unsigned char *name) > } > #endif /* not _LIBC */ > > +#ifdef _LIBC > +/* Local function for parse_bracket_exp used in _LIBC environment. > + Seek the collating symbol entry corresponding to NAME. > + Return the index of the symbol in the SYMB_TABLE, > + or -1 if not found. */ > + > +static inline int32_t > +__attribute__ ((always_inline)) > +seek_collating_symbol_entry (const unsigned char *name, size_t name_len, > + const int32_t *symb_table, int32_t table_size, > + const unsigned char *extra) > +{ > + int32_t elem; > + > + for (elem = 0; elem < table_size; elem++) > + if (symb_table[2 * elem] != 0) > + { > + int32_t idx = symb_table[2 * elem + 1]; > + /* Skip the name of collating element name. */ > + idx += 1 + extra[idx]; > + if (/* Compare the length of the name. */ > + name_len == extra[idx] > + /* Compare the name. */ > + && memcmp (name, &extra[idx + 1], name_len) == 0) > + /* Yep, this is the entry. */ > + return elem; > + } > + return -1; > +} > + Ok (I would let the compiler optimize it, but __attribute__ ((always_inline)) at least tries to mimic what nested function does). > +/* Local function for parse_bracket_exp used in _LIBC environment. > + Look up the collation sequence value of BR_ELEM. > + Return the value if succeeded, UINT_MAX otherwise. */ > + > +static inline unsigned int > +__attribute__ ((always_inline)) > +lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules, > + const unsigned char *collseqmb, > + const char *collseqwc, int32_t table_size, > + const int32_t *symb_table, > + const unsigned char *extra) > +{ > + if (br_elem->type == SB_CHAR) > + { > + /* if (MB_CUR_MAX == 1) */ > + if (nrules == 0) > + return collseqmb[br_elem->opr.ch]; > + else > + { > + wint_t wc = __btowc (br_elem->opr.ch); > + return __collseq_table_lookup (collseqwc, wc); > + } > + } > + else if (br_elem->type == MB_CHAR) > + { > + if (nrules != 0) > + return __collseq_table_lookup (collseqwc, br_elem->opr.wch); > + } > + else if (br_elem->type == COLL_SYM) > + { > + size_t sym_name_len = strlen ((char *) br_elem->opr.name); > + if (nrules != 0) > + { > + int32_t elem, idx; > + elem = seek_collating_symbol_entry (br_elem->opr.name, > + sym_name_len, > + symb_table, table_size, > + extra); > + if (elem != -1) > + { > + /* We found the entry. */ > + idx = symb_table[2 * elem + 1]; > + /* Skip the name of collating element name. */ > + idx += 1 + extra[idx]; > + /* Skip the byte sequence of the collating element. */ > + idx += 1 + extra[idx]; > + /* Adjust for the alignment. */ > + idx = (idx + 3) & ~3; > + /* Skip the multibyte collation sequence value. */ > + idx += sizeof (unsigned int); > + /* Skip the wide char sequence of the collating element. */ > + idx += sizeof (unsigned int) * > + (1 + *(unsigned int *) (extra + idx)); > + /* Return the collation sequence value. */ > + return *(unsigned int *) (extra + idx); > + } > + else if (sym_name_len == 1) > + { > + /* No valid character. Match it as a single byte > + character. */ > + return collseqmb[br_elem->opr.name[0]]; > + } > + } > + else if (sym_name_len == 1) > + return collseqmb[br_elem->opr.name[0]]; > + } > + return UINT_MAX; > +} > + > +/* Local function for parse_bracket_exp used in _LIBC environment. > + Build the range expression which starts from START_ELEM, and ends > + at END_ELEM. The result are written to MBCSET and SBCSET. > + RANGE_ALLOC is the allocated size of mbcset->range_starts, and > + mbcset->range_ends, is a pointer argument since we may > + update it. */ > + > +static inline reg_errcode_t > +__attribute__ ((always_inline)) > +build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, > + bracket_elem_t *start_elem, bracket_elem_t *end_elem, > + re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules, > + const unsigned char *collseqmb, const char *collseqwc, > + int32_t table_size, const int32_t *symb_table, > + const unsigned char *extra) > +{ > + unsigned int ch; > + uint32_t start_collseq; > + uint32_t end_collseq; > + > + /* Equivalence Classes and Character Classes can't be a range > + start/end. */ > + if (__glibc_unlikely (start_elem->type == EQUIV_CLASS > + || start_elem->type == CHAR_CLASS > + || end_elem->type == EQUIV_CLASS > + || end_elem->type == CHAR_CLASS)) > + return REG_ERANGE; > + > + /* FIXME: Implement rational ranges here, too. */ > + start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc, > + table_size, symb_table, extra); > + end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc, > + table_size, symb_table, extra); > + /* Check start/end collation sequence values. */ > + if (__glibc_unlikely (start_collseq == UINT_MAX > + || end_collseq == UINT_MAX)) > + return REG_ECOLLATE; > + if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) > + && start_collseq > end_collseq)) > + return REG_ERANGE; > + > + /* Got valid collation sequence values, add them as a new entry. > + However, if we have no collation elements, and the character set > + is single byte, the single byte character set that we > + build below suffices. */ > + if (nrules > 0 || dfa->mb_cur_max > 1) > + { > + /* Check the space of the arrays. */ > + if (__glibc_unlikely (*range_alloc == mbcset->nranges)) > + { > + /* There is not enough space, need realloc. */ > + uint32_t *new_array_start; > + uint32_t *new_array_end; > + int new_nranges; > + > + /* +1 in case of mbcset->nranges is 0. */ > + new_nranges = 2 * mbcset->nranges + 1; > + new_array_start = re_realloc (mbcset->range_starts, uint32_t, > + new_nranges); > + new_array_end = re_realloc (mbcset->range_ends, uint32_t, > + new_nranges); > + > + if (__glibc_unlikely (new_array_start == NULL > + || new_array_end == NULL)) > + return REG_ESPACE; > + > + mbcset->range_starts = new_array_start; > + mbcset->range_ends = new_array_end; > + *range_alloc = new_nranges; > + } > + > + mbcset->range_starts[mbcset->nranges] = start_collseq; > + mbcset->range_ends[mbcset->nranges++] = end_collseq; > + } > + > + /* Build the table for single byte characters. */ > + for (ch = 0; ch < SBC_MAX; ch++) > + { > + uint32_t ch_collseq; > + /* if (MB_CUR_MAX == 1) */ > + if (nrules == 0) > + ch_collseq = collseqmb[ch]; > + else > + ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); > + if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) > + bitset_set (sbcset, ch); > + } > + return REG_NOERROR; > +} > + > +/* Local function for parse_bracket_exp used in _LIBC environment. > + Build the collating element which is represented by NAME. > + The result are written to MBCSET and SBCSET. > + COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a > + pointer argument since we may update it. */ > + > +static inline reg_errcode_t > +__attribute__ ((always_inline)) > +build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, > + int *coll_sym_alloc, const unsigned char *name, > + uint32_t nrules, int32_t table_size, > + const int32_t *symb_table, const unsigned char *extra) > +{ > + int32_t elem, idx; > + size_t name_len = strlen ((const char *) name); > + if (nrules != 0) > + { > + elem = seek_collating_symbol_entry (name, name_len, symb_table, > + table_size, extra); > + if (elem != -1) > + { > + /* We found the entry. */ > + idx = symb_table[2 * elem + 1]; > + /* Skip the name of collating element name. */ > + idx += 1 + extra[idx]; > + } > + else if (name_len == 1) > + { > + /* No valid character, treat it as a normal > + character. */ > + bitset_set (sbcset, name[0]); > + return REG_NOERROR; > + } > + else > + return REG_ECOLLATE; > + > + /* Got valid collation sequence, add it as a new entry. */ > + /* Check the space of the arrays. */ > + if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) > + { > + /* Not enough, realloc it. */ > + /* +1 in case of mbcset->ncoll_syms is 0. */ > + int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; > + /* Use realloc since mbcset->coll_syms is NULL > + if *alloc == 0. */ > + int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, > + new_coll_sym_alloc); > + if (__glibc_unlikely (new_coll_syms == NULL)) > + return REG_ESPACE; > + mbcset->coll_syms = new_coll_syms; > + *coll_sym_alloc = new_coll_sym_alloc; > + } > + mbcset->coll_syms[mbcset->ncoll_syms++] = idx; > + return REG_NOERROR; > + } > + else > + { > + if (__glibc_unlikely (name_len != 1)) > + return REG_ECOLLATE; > + else > + { > + bitset_set (sbcset, name[0]); > + return REG_NOERROR; > + } > + } > +} > +#endif /* _LIBC */ > + > /* This function parse bracket expression like "[abc]", "[a-c]", > "[[.a-a.]]" etc. */ > > @@ -2840,253 +3097,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, > { > #ifdef _LIBC > const unsigned char *collseqmb; > - const char *collseqwc; > + const char *collseqwc = NULL; > uint32_t nrules; > - int32_t table_size; > - const int32_t *symb_table; > - const unsigned char *extra; > - > - /* Local function for parse_bracket_exp used in _LIBC environment. > - Seek the collating symbol entry corresponding to NAME. > - Return the index of the symbol in the SYMB_TABLE, > - or -1 if not found. */ > - > - auto inline int32_t > - __attribute__ ((always_inline)) > - seek_collating_symbol_entry (const unsigned char *name, size_t name_len) > - { > - int32_t elem; > - > - for (elem = 0; elem < table_size; elem++) > - if (symb_table[2 * elem] != 0) > - { > - int32_t idx = symb_table[2 * elem + 1]; > - /* Skip the name of collating element name. */ > - idx += 1 + extra[idx]; > - if (/* Compare the length of the name. */ > - name_len == extra[idx] > - /* Compare the name. */ > - && memcmp (name, &extra[idx + 1], name_len) == 0) > - /* Yep, this is the entry. */ > - return elem; > - } > - return -1; > - } > - > - /* Local function for parse_bracket_exp used in _LIBC environment. > - Look up the collation sequence value of BR_ELEM. > - Return the value if succeeded, UINT_MAX otherwise. */ > - > - auto inline unsigned int > - __attribute__ ((always_inline)) > - lookup_collation_sequence_value (bracket_elem_t *br_elem) > - { > - if (br_elem->type == SB_CHAR) > - { > - /* > - if (MB_CUR_MAX == 1) > - */ > - if (nrules == 0) > - return collseqmb[br_elem->opr.ch]; > - else > - { > - wint_t wc = __btowc (br_elem->opr.ch); > - return __collseq_table_lookup (collseqwc, wc); > - } > - } > - else if (br_elem->type == MB_CHAR) > - { > - if (nrules != 0) > - return __collseq_table_lookup (collseqwc, br_elem->opr.wch); > - } > - else if (br_elem->type == COLL_SYM) > - { > - size_t sym_name_len = strlen ((char *) br_elem->opr.name); > - if (nrules != 0) > - { > - int32_t elem, idx; > - elem = seek_collating_symbol_entry (br_elem->opr.name, > - sym_name_len); > - if (elem != -1) > - { > - /* We found the entry. */ > - idx = symb_table[2 * elem + 1]; > - /* Skip the name of collating element name. */ > - idx += 1 + extra[idx]; > - /* Skip the byte sequence of the collating element. */ > - idx += 1 + extra[idx]; > - /* Adjust for the alignment. */ > - idx = (idx + 3) & ~3; > - /* Skip the multibyte collation sequence value. */ > - idx += sizeof (unsigned int); > - /* Skip the wide char sequence of the collating element. */ > - idx += sizeof (unsigned int) * > - (1 + *(unsigned int *) (extra + idx)); > - /* Return the collation sequence value. */ > - return *(unsigned int *) (extra + idx); > - } > - else if (sym_name_len == 1) > - { > - /* No valid character. Match it as a single byte > - character. */ > - return collseqmb[br_elem->opr.name[0]]; > - } > - } > - else if (sym_name_len == 1) > - return collseqmb[br_elem->opr.name[0]]; > - } > - return UINT_MAX; > - } > - > - /* Local function for parse_bracket_exp used in _LIBC environment. > - Build the range expression which starts from START_ELEM, and ends > - at END_ELEM. The result are written to MBCSET and SBCSET. > - RANGE_ALLOC is the allocated size of mbcset->range_starts, and > - mbcset->range_ends, is a pointer argument since we may > - update it. */ > - > - auto inline reg_errcode_t > - __attribute__ ((always_inline)) > - build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, > - bracket_elem_t *start_elem, bracket_elem_t *end_elem) > - { > - unsigned int ch; > - uint32_t start_collseq; > - uint32_t end_collseq; > - > - /* Equivalence Classes and Character Classes can't be a range > - start/end. */ > - if (__glibc_unlikely (start_elem->type == EQUIV_CLASS > - || start_elem->type == CHAR_CLASS > - || end_elem->type == EQUIV_CLASS > - || end_elem->type == CHAR_CLASS)) > - return REG_ERANGE; > - > - /* FIXME: Implement rational ranges here, too. */ > - start_collseq = lookup_collation_sequence_value (start_elem); > - end_collseq = lookup_collation_sequence_value (end_elem); > - /* Check start/end collation sequence values. */ > - if (__glibc_unlikely (start_collseq == UINT_MAX > - || end_collseq == UINT_MAX)) > - return REG_ECOLLATE; > - if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) > - && start_collseq > end_collseq)) > - return REG_ERANGE; > - > - /* Got valid collation sequence values, add them as a new entry. > - However, if we have no collation elements, and the character set > - is single byte, the single byte character set that we > - build below suffices. */ > - if (nrules > 0 || dfa->mb_cur_max > 1) > - { > - /* Check the space of the arrays. */ > - if (__glibc_unlikely (*range_alloc == mbcset->nranges)) > - { > - /* There is not enough space, need realloc. */ > - uint32_t *new_array_start; > - uint32_t *new_array_end; > - Idx new_nranges; > - > - /* +1 in case of mbcset->nranges is 0. */ > - new_nranges = 2 * mbcset->nranges + 1; > - new_array_start = re_realloc (mbcset->range_starts, uint32_t, > - new_nranges); > - new_array_end = re_realloc (mbcset->range_ends, uint32_t, > - new_nranges); > - > - if (__glibc_unlikely (new_array_start == NULL > - || new_array_end == NULL)) > - return REG_ESPACE; > - > - mbcset->range_starts = new_array_start; > - mbcset->range_ends = new_array_end; > - *range_alloc = new_nranges; > - } > - > - mbcset->range_starts[mbcset->nranges] = start_collseq; > - mbcset->range_ends[mbcset->nranges++] = end_collseq; > - } > - > - /* Build the table for single byte characters. */ > - for (ch = 0; ch < SBC_MAX; ch++) > - { > - uint32_t ch_collseq; > - /* > - if (MB_CUR_MAX == 1) > - */ > - if (nrules == 0) > - ch_collseq = collseqmb[ch]; > - else > - ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); > - if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) > - bitset_set (sbcset, ch); > - } > - return REG_NOERROR; > - } > - > - /* Local function for parse_bracket_exp used in _LIBC environment. > - Build the collating element which is represented by NAME. > - The result are written to MBCSET and SBCSET. > - COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a > - pointer argument since we may update it. */ > - > - auto inline reg_errcode_t > - __attribute__ ((always_inline)) > - build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, > - Idx *coll_sym_alloc, const unsigned char *name) > - { > - int32_t elem, idx; > - size_t name_len = strlen ((const char *) name); > - if (nrules != 0) > - { > - elem = seek_collating_symbol_entry (name, name_len); > - if (elem != -1) > - { > - /* We found the entry. */ > - idx = symb_table[2 * elem + 1]; > - /* Skip the name of collating element name. */ > - idx += 1 + extra[idx]; > - } > - else if (name_len == 1) > - { > - /* No valid character, treat it as a normal > - character. */ > - bitset_set (sbcset, name[0]); > - return REG_NOERROR; > - } > - else > - return REG_ECOLLATE; > - > - /* Got valid collation sequence, add it as a new entry. */ > - /* Check the space of the arrays. */ > - if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) > - { > - /* Not enough, realloc it. */ > - /* +1 in case of mbcset->ncoll_syms is 0. */ > - Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; > - /* Use realloc since mbcset->coll_syms is NULL > - if *alloc == 0. */ > - int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, > - new_coll_sym_alloc); > - if (__glibc_unlikely (new_coll_syms == NULL)) > - return REG_ESPACE; > - mbcset->coll_syms = new_coll_syms; > - *coll_sym_alloc = new_coll_sym_alloc; > - } > - mbcset->coll_syms[mbcset->ncoll_syms++] = idx; > - return REG_NOERROR; > - } > - else > - { > - if (__glibc_unlikely (name_len != 1)) > - return REG_ECOLLATE; > - else > - { > - bitset_set (sbcset, name[0]); > - return REG_NOERROR; > - } > - } > - } > + int32_t table_size = 0; > + const int32_t *symb_table = NULL; > + const unsigned char *extra = NULL; > #endif > > re_token_t br_token; > @@ -3230,7 +3245,9 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, > > #ifdef _LIBC > *err = build_range_exp (sbcset, mbcset, &range_alloc, > - &start_elem, &end_elem); > + &start_elem, &end_elem, > + dfa, syntax, nrules, collseqmb, collseqwc, > + table_size, symb_table, extra); > #else > # ifdef RE_ENABLE_I18N > *err = build_range_exp (syntax, sbcset, > @@ -3283,7 +3300,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, > #ifdef RE_ENABLE_I18N > mbcset, &coll_sym_alloc, > #endif /* RE_ENABLE_I18N */ > - start_elem.opr.name); > + start_elem.opr.name, > + nrules, table_size, symb_table, extra); > if (__glibc_unlikely (*err != REG_NOERROR)) > goto parse_bracket_exp_free_return; > break; >
On 11/2/21 08:15, Adhemerval Zanella wrote: > > > On 02/11/2021 02:45, Fangrui Song via Libc-alpha wrote: >> This refactor moves four functions out of a nested scope and converts >> them into static always_inline functions. collseqwc, table_size, >> symb_table, extra are now initialized to zero because they are passed as >> function arguments. >> >> On x86-64, .text is 16 byte larger likely due to the 4 stores. >> This is nothing compared to the amount of work that regcomp has to do >> looking up the collation weights, or other functions. >> >> If the non-buildable `sysdeps/generic/dl-machine.h` doesn't count, >> this patch removes the last `auto inline` usage from glibc. > > LGTM, thanks. Carlos might want to ack as well. > > Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org> LGTM. Thanks! I think that keeping always_inline is the right thing to do in the context of a refactor. Reviewed-by: Carlos O'Donell <carlos@redhat.com> >> --- >> v1 -> v2: >> * Update commit message >> --- >> posix/regcomp.c | 514 +++++++++++++++++++++++++----------------------- >> 1 file changed, 266 insertions(+), 248 deletions(-) >> >> diff --git a/posix/regcomp.c b/posix/regcomp.c >> index 887e5b5068..e2b5c2807a 100644 >> --- a/posix/regcomp.c >> +++ b/posix/regcomp.c >> @@ -2831,6 +2831,263 @@ build_collating_symbol (bitset_t sbcset, const unsigned char *name) >> } >> #endif /* not _LIBC */ >> >> +#ifdef _LIBC >> +/* Local function for parse_bracket_exp used in _LIBC environment. >> + Seek the collating symbol entry corresponding to NAME. >> + Return the index of the symbol in the SYMB_TABLE, >> + or -1 if not found. */ >> + >> +static inline int32_t >> +__attribute__ ((always_inline)) >> +seek_collating_symbol_entry (const unsigned char *name, size_t name_len, >> + const int32_t *symb_table, int32_t table_size, >> + const unsigned char *extra) >> +{ >> + int32_t elem; >> + >> + for (elem = 0; elem < table_size; elem++) >> + if (symb_table[2 * elem] != 0) >> + { >> + int32_t idx = symb_table[2 * elem + 1]; >> + /* Skip the name of collating element name. */ >> + idx += 1 + extra[idx]; >> + if (/* Compare the length of the name. */ >> + name_len == extra[idx] >> + /* Compare the name. */ >> + && memcmp (name, &extra[idx + 1], name_len) == 0) >> + /* Yep, this is the entry. */ >> + return elem; >> + } >> + return -1; >> +} >> + > > Ok (I would let the compiler optimize it, but __attribute__ ((always_inline)) > at least tries to mimic what nested function does). > >> +/* Local function for parse_bracket_exp used in _LIBC environment. >> + Look up the collation sequence value of BR_ELEM. >> + Return the value if succeeded, UINT_MAX otherwise. */ >> + >> +static inline unsigned int >> +__attribute__ ((always_inline)) >> +lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules, >> + const unsigned char *collseqmb, >> + const char *collseqwc, int32_t table_size, >> + const int32_t *symb_table, >> + const unsigned char *extra) >> +{ >> + if (br_elem->type == SB_CHAR) >> + { >> + /* if (MB_CUR_MAX == 1) */ >> + if (nrules == 0) >> + return collseqmb[br_elem->opr.ch]; >> + else >> + { >> + wint_t wc = __btowc (br_elem->opr.ch); >> + return __collseq_table_lookup (collseqwc, wc); >> + } >> + } >> + else if (br_elem->type == MB_CHAR) >> + { >> + if (nrules != 0) >> + return __collseq_table_lookup (collseqwc, br_elem->opr.wch); >> + } >> + else if (br_elem->type == COLL_SYM) >> + { >> + size_t sym_name_len = strlen ((char *) br_elem->opr.name); >> + if (nrules != 0) >> + { >> + int32_t elem, idx; >> + elem = seek_collating_symbol_entry (br_elem->opr.name, >> + sym_name_len, >> + symb_table, table_size, >> + extra); >> + if (elem != -1) >> + { >> + /* We found the entry. */ >> + idx = symb_table[2 * elem + 1]; >> + /* Skip the name of collating element name. */ >> + idx += 1 + extra[idx]; >> + /* Skip the byte sequence of the collating element. */ >> + idx += 1 + extra[idx]; >> + /* Adjust for the alignment. */ >> + idx = (idx + 3) & ~3; >> + /* Skip the multibyte collation sequence value. */ >> + idx += sizeof (unsigned int); >> + /* Skip the wide char sequence of the collating element. */ >> + idx += sizeof (unsigned int) * >> + (1 + *(unsigned int *) (extra + idx)); >> + /* Return the collation sequence value. */ >> + return *(unsigned int *) (extra + idx); >> + } >> + else if (sym_name_len == 1) >> + { >> + /* No valid character. Match it as a single byte >> + character. */ >> + return collseqmb[br_elem->opr.name[0]]; >> + } >> + } >> + else if (sym_name_len == 1) >> + return collseqmb[br_elem->opr.name[0]]; >> + } >> + return UINT_MAX; >> +} >> + >> +/* Local function for parse_bracket_exp used in _LIBC environment. >> + Build the range expression which starts from START_ELEM, and ends >> + at END_ELEM. The result are written to MBCSET and SBCSET. >> + RANGE_ALLOC is the allocated size of mbcset->range_starts, and >> + mbcset->range_ends, is a pointer argument since we may >> + update it. */ >> + >> +static inline reg_errcode_t >> +__attribute__ ((always_inline)) >> +build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, >> + bracket_elem_t *start_elem, bracket_elem_t *end_elem, >> + re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules, >> + const unsigned char *collseqmb, const char *collseqwc, >> + int32_t table_size, const int32_t *symb_table, >> + const unsigned char *extra) >> +{ >> + unsigned int ch; >> + uint32_t start_collseq; >> + uint32_t end_collseq; >> + >> + /* Equivalence Classes and Character Classes can't be a range >> + start/end. */ >> + if (__glibc_unlikely (start_elem->type == EQUIV_CLASS >> + || start_elem->type == CHAR_CLASS >> + || end_elem->type == EQUIV_CLASS >> + || end_elem->type == CHAR_CLASS)) >> + return REG_ERANGE; >> + >> + /* FIXME: Implement rational ranges here, too. */ >> + start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc, >> + table_size, symb_table, extra); >> + end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc, >> + table_size, symb_table, extra); >> + /* Check start/end collation sequence values. */ >> + if (__glibc_unlikely (start_collseq == UINT_MAX >> + || end_collseq == UINT_MAX)) >> + return REG_ECOLLATE; >> + if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) >> + && start_collseq > end_collseq)) >> + return REG_ERANGE; >> + >> + /* Got valid collation sequence values, add them as a new entry. >> + However, if we have no collation elements, and the character set >> + is single byte, the single byte character set that we >> + build below suffices. */ >> + if (nrules > 0 || dfa->mb_cur_max > 1) >> + { >> + /* Check the space of the arrays. */ >> + if (__glibc_unlikely (*range_alloc == mbcset->nranges)) >> + { >> + /* There is not enough space, need realloc. */ >> + uint32_t *new_array_start; >> + uint32_t *new_array_end; >> + int new_nranges; >> + >> + /* +1 in case of mbcset->nranges is 0. */ >> + new_nranges = 2 * mbcset->nranges + 1; >> + new_array_start = re_realloc (mbcset->range_starts, uint32_t, >> + new_nranges); >> + new_array_end = re_realloc (mbcset->range_ends, uint32_t, >> + new_nranges); >> + >> + if (__glibc_unlikely (new_array_start == NULL >> + || new_array_end == NULL)) >> + return REG_ESPACE; >> + >> + mbcset->range_starts = new_array_start; >> + mbcset->range_ends = new_array_end; >> + *range_alloc = new_nranges; >> + } >> + >> + mbcset->range_starts[mbcset->nranges] = start_collseq; >> + mbcset->range_ends[mbcset->nranges++] = end_collseq; >> + } >> + >> + /* Build the table for single byte characters. */ >> + for (ch = 0; ch < SBC_MAX; ch++) >> + { >> + uint32_t ch_collseq; >> + /* if (MB_CUR_MAX == 1) */ >> + if (nrules == 0) >> + ch_collseq = collseqmb[ch]; >> + else >> + ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); >> + if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) >> + bitset_set (sbcset, ch); >> + } >> + return REG_NOERROR; >> +} >> + >> +/* Local function for parse_bracket_exp used in _LIBC environment. >> + Build the collating element which is represented by NAME. >> + The result are written to MBCSET and SBCSET. >> + COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a >> + pointer argument since we may update it. */ >> + >> +static inline reg_errcode_t >> +__attribute__ ((always_inline)) >> +build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, >> + int *coll_sym_alloc, const unsigned char *name, >> + uint32_t nrules, int32_t table_size, >> + const int32_t *symb_table, const unsigned char *extra) >> +{ >> + int32_t elem, idx; >> + size_t name_len = strlen ((const char *) name); >> + if (nrules != 0) >> + { >> + elem = seek_collating_symbol_entry (name, name_len, symb_table, >> + table_size, extra); >> + if (elem != -1) >> + { >> + /* We found the entry. */ >> + idx = symb_table[2 * elem + 1]; >> + /* Skip the name of collating element name. */ >> + idx += 1 + extra[idx]; >> + } >> + else if (name_len == 1) >> + { >> + /* No valid character, treat it as a normal >> + character. */ >> + bitset_set (sbcset, name[0]); >> + return REG_NOERROR; >> + } >> + else >> + return REG_ECOLLATE; >> + >> + /* Got valid collation sequence, add it as a new entry. */ >> + /* Check the space of the arrays. */ >> + if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) >> + { >> + /* Not enough, realloc it. */ >> + /* +1 in case of mbcset->ncoll_syms is 0. */ >> + int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; >> + /* Use realloc since mbcset->coll_syms is NULL >> + if *alloc == 0. */ >> + int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, >> + new_coll_sym_alloc); >> + if (__glibc_unlikely (new_coll_syms == NULL)) >> + return REG_ESPACE; >> + mbcset->coll_syms = new_coll_syms; >> + *coll_sym_alloc = new_coll_sym_alloc; >> + } >> + mbcset->coll_syms[mbcset->ncoll_syms++] = idx; >> + return REG_NOERROR; >> + } >> + else >> + { >> + if (__glibc_unlikely (name_len != 1)) >> + return REG_ECOLLATE; >> + else >> + { >> + bitset_set (sbcset, name[0]); >> + return REG_NOERROR; >> + } >> + } >> +} >> +#endif /* _LIBC */ >> + >> /* This function parse bracket expression like "[abc]", "[a-c]", >> "[[.a-a.]]" etc. */ >> >> @@ -2840,253 +3097,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, >> { >> #ifdef _LIBC >> const unsigned char *collseqmb; >> - const char *collseqwc; >> + const char *collseqwc = NULL; >> uint32_t nrules; >> - int32_t table_size; >> - const int32_t *symb_table; >> - const unsigned char *extra; >> - >> - /* Local function for parse_bracket_exp used in _LIBC environment. >> - Seek the collating symbol entry corresponding to NAME. >> - Return the index of the symbol in the SYMB_TABLE, >> - or -1 if not found. */ >> - >> - auto inline int32_t >> - __attribute__ ((always_inline)) >> - seek_collating_symbol_entry (const unsigned char *name, size_t name_len) >> - { >> - int32_t elem; >> - >> - for (elem = 0; elem < table_size; elem++) >> - if (symb_table[2 * elem] != 0) >> - { >> - int32_t idx = symb_table[2 * elem + 1]; >> - /* Skip the name of collating element name. */ >> - idx += 1 + extra[idx]; >> - if (/* Compare the length of the name. */ >> - name_len == extra[idx] >> - /* Compare the name. */ >> - && memcmp (name, &extra[idx + 1], name_len) == 0) >> - /* Yep, this is the entry. */ >> - return elem; >> - } >> - return -1; >> - } >> - >> - /* Local function for parse_bracket_exp used in _LIBC environment. >> - Look up the collation sequence value of BR_ELEM. >> - Return the value if succeeded, UINT_MAX otherwise. */ >> - >> - auto inline unsigned int >> - __attribute__ ((always_inline)) >> - lookup_collation_sequence_value (bracket_elem_t *br_elem) >> - { >> - if (br_elem->type == SB_CHAR) >> - { >> - /* >> - if (MB_CUR_MAX == 1) >> - */ >> - if (nrules == 0) >> - return collseqmb[br_elem->opr.ch]; >> - else >> - { >> - wint_t wc = __btowc (br_elem->opr.ch); >> - return __collseq_table_lookup (collseqwc, wc); >> - } >> - } >> - else if (br_elem->type == MB_CHAR) >> - { >> - if (nrules != 0) >> - return __collseq_table_lookup (collseqwc, br_elem->opr.wch); >> - } >> - else if (br_elem->type == COLL_SYM) >> - { >> - size_t sym_name_len = strlen ((char *) br_elem->opr.name); >> - if (nrules != 0) >> - { >> - int32_t elem, idx; >> - elem = seek_collating_symbol_entry (br_elem->opr.name, >> - sym_name_len); >> - if (elem != -1) >> - { >> - /* We found the entry. */ >> - idx = symb_table[2 * elem + 1]; >> - /* Skip the name of collating element name. */ >> - idx += 1 + extra[idx]; >> - /* Skip the byte sequence of the collating element. */ >> - idx += 1 + extra[idx]; >> - /* Adjust for the alignment. */ >> - idx = (idx + 3) & ~3; >> - /* Skip the multibyte collation sequence value. */ >> - idx += sizeof (unsigned int); >> - /* Skip the wide char sequence of the collating element. */ >> - idx += sizeof (unsigned int) * >> - (1 + *(unsigned int *) (extra + idx)); >> - /* Return the collation sequence value. */ >> - return *(unsigned int *) (extra + idx); >> - } >> - else if (sym_name_len == 1) >> - { >> - /* No valid character. Match it as a single byte >> - character. */ >> - return collseqmb[br_elem->opr.name[0]]; >> - } >> - } >> - else if (sym_name_len == 1) >> - return collseqmb[br_elem->opr.name[0]]; >> - } >> - return UINT_MAX; >> - } >> - >> - /* Local function for parse_bracket_exp used in _LIBC environment. >> - Build the range expression which starts from START_ELEM, and ends >> - at END_ELEM. The result are written to MBCSET and SBCSET. >> - RANGE_ALLOC is the allocated size of mbcset->range_starts, and >> - mbcset->range_ends, is a pointer argument since we may >> - update it. */ >> - >> - auto inline reg_errcode_t >> - __attribute__ ((always_inline)) >> - build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, >> - bracket_elem_t *start_elem, bracket_elem_t *end_elem) >> - { >> - unsigned int ch; >> - uint32_t start_collseq; >> - uint32_t end_collseq; >> - >> - /* Equivalence Classes and Character Classes can't be a range >> - start/end. */ >> - if (__glibc_unlikely (start_elem->type == EQUIV_CLASS >> - || start_elem->type == CHAR_CLASS >> - || end_elem->type == EQUIV_CLASS >> - || end_elem->type == CHAR_CLASS)) >> - return REG_ERANGE; >> - >> - /* FIXME: Implement rational ranges here, too. */ >> - start_collseq = lookup_collation_sequence_value (start_elem); >> - end_collseq = lookup_collation_sequence_value (end_elem); >> - /* Check start/end collation sequence values. */ >> - if (__glibc_unlikely (start_collseq == UINT_MAX >> - || end_collseq == UINT_MAX)) >> - return REG_ECOLLATE; >> - if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) >> - && start_collseq > end_collseq)) >> - return REG_ERANGE; >> - >> - /* Got valid collation sequence values, add them as a new entry. >> - However, if we have no collation elements, and the character set >> - is single byte, the single byte character set that we >> - build below suffices. */ >> - if (nrules > 0 || dfa->mb_cur_max > 1) >> - { >> - /* Check the space of the arrays. */ >> - if (__glibc_unlikely (*range_alloc == mbcset->nranges)) >> - { >> - /* There is not enough space, need realloc. */ >> - uint32_t *new_array_start; >> - uint32_t *new_array_end; >> - Idx new_nranges; >> - >> - /* +1 in case of mbcset->nranges is 0. */ >> - new_nranges = 2 * mbcset->nranges + 1; >> - new_array_start = re_realloc (mbcset->range_starts, uint32_t, >> - new_nranges); >> - new_array_end = re_realloc (mbcset->range_ends, uint32_t, >> - new_nranges); >> - >> - if (__glibc_unlikely (new_array_start == NULL >> - || new_array_end == NULL)) >> - return REG_ESPACE; >> - >> - mbcset->range_starts = new_array_start; >> - mbcset->range_ends = new_array_end; >> - *range_alloc = new_nranges; >> - } >> - >> - mbcset->range_starts[mbcset->nranges] = start_collseq; >> - mbcset->range_ends[mbcset->nranges++] = end_collseq; >> - } >> - >> - /* Build the table for single byte characters. */ >> - for (ch = 0; ch < SBC_MAX; ch++) >> - { >> - uint32_t ch_collseq; >> - /* >> - if (MB_CUR_MAX == 1) >> - */ >> - if (nrules == 0) >> - ch_collseq = collseqmb[ch]; >> - else >> - ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); >> - if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) >> - bitset_set (sbcset, ch); >> - } >> - return REG_NOERROR; >> - } >> - >> - /* Local function for parse_bracket_exp used in _LIBC environment. >> - Build the collating element which is represented by NAME. >> - The result are written to MBCSET and SBCSET. >> - COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a >> - pointer argument since we may update it. */ >> - >> - auto inline reg_errcode_t >> - __attribute__ ((always_inline)) >> - build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, >> - Idx *coll_sym_alloc, const unsigned char *name) >> - { >> - int32_t elem, idx; >> - size_t name_len = strlen ((const char *) name); >> - if (nrules != 0) >> - { >> - elem = seek_collating_symbol_entry (name, name_len); >> - if (elem != -1) >> - { >> - /* We found the entry. */ >> - idx = symb_table[2 * elem + 1]; >> - /* Skip the name of collating element name. */ >> - idx += 1 + extra[idx]; >> - } >> - else if (name_len == 1) >> - { >> - /* No valid character, treat it as a normal >> - character. */ >> - bitset_set (sbcset, name[0]); >> - return REG_NOERROR; >> - } >> - else >> - return REG_ECOLLATE; >> - >> - /* Got valid collation sequence, add it as a new entry. */ >> - /* Check the space of the arrays. */ >> - if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) >> - { >> - /* Not enough, realloc it. */ >> - /* +1 in case of mbcset->ncoll_syms is 0. */ >> - Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; >> - /* Use realloc since mbcset->coll_syms is NULL >> - if *alloc == 0. */ >> - int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, >> - new_coll_sym_alloc); >> - if (__glibc_unlikely (new_coll_syms == NULL)) >> - return REG_ESPACE; >> - mbcset->coll_syms = new_coll_syms; >> - *coll_sym_alloc = new_coll_sym_alloc; >> - } >> - mbcset->coll_syms[mbcset->ncoll_syms++] = idx; >> - return REG_NOERROR; >> - } >> - else >> - { >> - if (__glibc_unlikely (name_len != 1)) >> - return REG_ECOLLATE; >> - else >> - { >> - bitset_set (sbcset, name[0]); >> - return REG_NOERROR; >> - } >> - } >> - } >> + int32_t table_size = 0; >> + const int32_t *symb_table = NULL; >> + const unsigned char *extra = NULL; >> #endif >> >> re_token_t br_token; >> @@ -3230,7 +3245,9 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, >> >> #ifdef _LIBC >> *err = build_range_exp (sbcset, mbcset, &range_alloc, >> - &start_elem, &end_elem); >> + &start_elem, &end_elem, >> + dfa, syntax, nrules, collseqmb, collseqwc, >> + table_size, symb_table, extra); >> #else >> # ifdef RE_ENABLE_I18N >> *err = build_range_exp (syntax, sbcset, >> @@ -3283,7 +3300,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, >> #ifdef RE_ENABLE_I18N >> mbcset, &coll_sym_alloc, >> #endif /* RE_ENABLE_I18N */ >> - start_elem.opr.name); >> + start_elem.opr.name, >> + nrules, table_size, symb_table, extra); >> if (__glibc_unlikely (*err != REG_NOERROR)) >> goto parse_bracket_exp_free_return; >> break; >> >
diff --git a/posix/regcomp.c b/posix/regcomp.c index 887e5b5068..e2b5c2807a 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -2831,6 +2831,263 @@ build_collating_symbol (bitset_t sbcset, const unsigned char *name) } #endif /* not _LIBC */ +#ifdef _LIBC +/* Local function for parse_bracket_exp used in _LIBC environment. + Seek the collating symbol entry corresponding to NAME. + Return the index of the symbol in the SYMB_TABLE, + or -1 if not found. */ + +static inline int32_t +__attribute__ ((always_inline)) +seek_collating_symbol_entry (const unsigned char *name, size_t name_len, + const int32_t *symb_table, int32_t table_size, + const unsigned char *extra) +{ + int32_t elem; + + for (elem = 0; elem < table_size; elem++) + if (symb_table[2 * elem] != 0) + { + int32_t idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + if (/* Compare the length of the name. */ + name_len == extra[idx] + /* Compare the name. */ + && memcmp (name, &extra[idx + 1], name_len) == 0) + /* Yep, this is the entry. */ + return elem; + } + return -1; +} + +/* Local function for parse_bracket_exp used in _LIBC environment. + Look up the collation sequence value of BR_ELEM. + Return the value if succeeded, UINT_MAX otherwise. */ + +static inline unsigned int +__attribute__ ((always_inline)) +lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules, + const unsigned char *collseqmb, + const char *collseqwc, int32_t table_size, + const int32_t *symb_table, + const unsigned char *extra) +{ + if (br_elem->type == SB_CHAR) + { + /* if (MB_CUR_MAX == 1) */ + if (nrules == 0) + return collseqmb[br_elem->opr.ch]; + else + { + wint_t wc = __btowc (br_elem->opr.ch); + return __collseq_table_lookup (collseqwc, wc); + } + } + else if (br_elem->type == MB_CHAR) + { + if (nrules != 0) + return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + } + else if (br_elem->type == COLL_SYM) + { + size_t sym_name_len = strlen ((char *) br_elem->opr.name); + if (nrules != 0) + { + int32_t elem, idx; + elem = seek_collating_symbol_entry (br_elem->opr.name, + sym_name_len, + symb_table, table_size, + extra); + if (elem != -1) + { + /* We found the entry. */ + idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + /* Skip the byte sequence of the collating element. */ + idx += 1 + extra[idx]; + /* Adjust for the alignment. */ + idx = (idx + 3) & ~3; + /* Skip the multibyte collation sequence value. */ + idx += sizeof (unsigned int); + /* Skip the wide char sequence of the collating element. */ + idx += sizeof (unsigned int) * + (1 + *(unsigned int *) (extra + idx)); + /* Return the collation sequence value. */ + return *(unsigned int *) (extra + idx); + } + else if (sym_name_len == 1) + { + /* No valid character. Match it as a single byte + character. */ + return collseqmb[br_elem->opr.name[0]]; + } + } + else if (sym_name_len == 1) + return collseqmb[br_elem->opr.name[0]]; + } + return UINT_MAX; +} + +/* Local function for parse_bracket_exp used in _LIBC environment. + Build the range expression which starts from START_ELEM, and ends + at END_ELEM. The result are written to MBCSET and SBCSET. + RANGE_ALLOC is the allocated size of mbcset->range_starts, and + mbcset->range_ends, is a pointer argument since we may + update it. */ + +static inline reg_errcode_t +__attribute__ ((always_inline)) +build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, + bracket_elem_t *start_elem, bracket_elem_t *end_elem, + re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules, + const unsigned char *collseqmb, const char *collseqwc, + int32_t table_size, const int32_t *symb_table, + const unsigned char *extra) +{ + unsigned int ch; + uint32_t start_collseq; + uint32_t end_collseq; + + /* Equivalence Classes and Character Classes can't be a range + start/end. */ + if (__glibc_unlikely (start_elem->type == EQUIV_CLASS + || start_elem->type == CHAR_CLASS + || end_elem->type == EQUIV_CLASS + || end_elem->type == CHAR_CLASS)) + return REG_ERANGE; + + /* FIXME: Implement rational ranges here, too. */ + start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc, + table_size, symb_table, extra); + end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc, + table_size, symb_table, extra); + /* Check start/end collation sequence values. */ + if (__glibc_unlikely (start_collseq == UINT_MAX + || end_collseq == UINT_MAX)) + return REG_ECOLLATE; + if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) + && start_collseq > end_collseq)) + return REG_ERANGE; + + /* Got valid collation sequence values, add them as a new entry. + However, if we have no collation elements, and the character set + is single byte, the single byte character set that we + build below suffices. */ + if (nrules > 0 || dfa->mb_cur_max > 1) + { + /* Check the space of the arrays. */ + if (__glibc_unlikely (*range_alloc == mbcset->nranges)) + { + /* There is not enough space, need realloc. */ + uint32_t *new_array_start; + uint32_t *new_array_end; + int new_nranges; + + /* +1 in case of mbcset->nranges is 0. */ + new_nranges = 2 * mbcset->nranges + 1; + new_array_start = re_realloc (mbcset->range_starts, uint32_t, + new_nranges); + new_array_end = re_realloc (mbcset->range_ends, uint32_t, + new_nranges); + + if (__glibc_unlikely (new_array_start == NULL + || new_array_end == NULL)) + return REG_ESPACE; + + mbcset->range_starts = new_array_start; + mbcset->range_ends = new_array_end; + *range_alloc = new_nranges; + } + + mbcset->range_starts[mbcset->nranges] = start_collseq; + mbcset->range_ends[mbcset->nranges++] = end_collseq; + } + + /* Build the table for single byte characters. */ + for (ch = 0; ch < SBC_MAX; ch++) + { + uint32_t ch_collseq; + /* if (MB_CUR_MAX == 1) */ + if (nrules == 0) + ch_collseq = collseqmb[ch]; + else + ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); + if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) + bitset_set (sbcset, ch); + } + return REG_NOERROR; +} + +/* Local function for parse_bracket_exp used in _LIBC environment. + Build the collating element which is represented by NAME. + The result are written to MBCSET and SBCSET. + COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a + pointer argument since we may update it. */ + +static inline reg_errcode_t +__attribute__ ((always_inline)) +build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, + int *coll_sym_alloc, const unsigned char *name, + uint32_t nrules, int32_t table_size, + const int32_t *symb_table, const unsigned char *extra) +{ + int32_t elem, idx; + size_t name_len = strlen ((const char *) name); + if (nrules != 0) + { + elem = seek_collating_symbol_entry (name, name_len, symb_table, + table_size, extra); + if (elem != -1) + { + /* We found the entry. */ + idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + } + else if (name_len == 1) + { + /* No valid character, treat it as a normal + character. */ + bitset_set (sbcset, name[0]); + return REG_NOERROR; + } + else + return REG_ECOLLATE; + + /* Got valid collation sequence, add it as a new entry. */ + /* Check the space of the arrays. */ + if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) + { + /* Not enough, realloc it. */ + /* +1 in case of mbcset->ncoll_syms is 0. */ + int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; + /* Use realloc since mbcset->coll_syms is NULL + if *alloc == 0. */ + int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, + new_coll_sym_alloc); + if (__glibc_unlikely (new_coll_syms == NULL)) + return REG_ESPACE; + mbcset->coll_syms = new_coll_syms; + *coll_sym_alloc = new_coll_sym_alloc; + } + mbcset->coll_syms[mbcset->ncoll_syms++] = idx; + return REG_NOERROR; + } + else + { + if (__glibc_unlikely (name_len != 1)) + return REG_ECOLLATE; + else + { + bitset_set (sbcset, name[0]); + return REG_NOERROR; + } + } +} +#endif /* _LIBC */ + /* This function parse bracket expression like "[abc]", "[a-c]", "[[.a-a.]]" etc. */ @@ -2840,253 +3097,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, { #ifdef _LIBC const unsigned char *collseqmb; - const char *collseqwc; + const char *collseqwc = NULL; uint32_t nrules; - int32_t table_size; - const int32_t *symb_table; - const unsigned char *extra; - - /* Local function for parse_bracket_exp used in _LIBC environment. - Seek the collating symbol entry corresponding to NAME. - Return the index of the symbol in the SYMB_TABLE, - or -1 if not found. */ - - auto inline int32_t - __attribute__ ((always_inline)) - seek_collating_symbol_entry (const unsigned char *name, size_t name_len) - { - int32_t elem; - - for (elem = 0; elem < table_size; elem++) - if (symb_table[2 * elem] != 0) - { - int32_t idx = symb_table[2 * elem + 1]; - /* Skip the name of collating element name. */ - idx += 1 + extra[idx]; - if (/* Compare the length of the name. */ - name_len == extra[idx] - /* Compare the name. */ - && memcmp (name, &extra[idx + 1], name_len) == 0) - /* Yep, this is the entry. */ - return elem; - } - return -1; - } - - /* Local function for parse_bracket_exp used in _LIBC environment. - Look up the collation sequence value of BR_ELEM. - Return the value if succeeded, UINT_MAX otherwise. */ - - auto inline unsigned int - __attribute__ ((always_inline)) - lookup_collation_sequence_value (bracket_elem_t *br_elem) - { - if (br_elem->type == SB_CHAR) - { - /* - if (MB_CUR_MAX == 1) - */ - if (nrules == 0) - return collseqmb[br_elem->opr.ch]; - else - { - wint_t wc = __btowc (br_elem->opr.ch); - return __collseq_table_lookup (collseqwc, wc); - } - } - else if (br_elem->type == MB_CHAR) - { - if (nrules != 0) - return __collseq_table_lookup (collseqwc, br_elem->opr.wch); - } - else if (br_elem->type == COLL_SYM) - { - size_t sym_name_len = strlen ((char *) br_elem->opr.name); - if (nrules != 0) - { - int32_t elem, idx; - elem = seek_collating_symbol_entry (br_elem->opr.name, - sym_name_len); - if (elem != -1) - { - /* We found the entry. */ - idx = symb_table[2 * elem + 1]; - /* Skip the name of collating element name. */ - idx += 1 + extra[idx]; - /* Skip the byte sequence of the collating element. */ - idx += 1 + extra[idx]; - /* Adjust for the alignment. */ - idx = (idx + 3) & ~3; - /* Skip the multibyte collation sequence value. */ - idx += sizeof (unsigned int); - /* Skip the wide char sequence of the collating element. */ - idx += sizeof (unsigned int) * - (1 + *(unsigned int *) (extra + idx)); - /* Return the collation sequence value. */ - return *(unsigned int *) (extra + idx); - } - else if (sym_name_len == 1) - { - /* No valid character. Match it as a single byte - character. */ - return collseqmb[br_elem->opr.name[0]]; - } - } - else if (sym_name_len == 1) - return collseqmb[br_elem->opr.name[0]]; - } - return UINT_MAX; - } - - /* Local function for parse_bracket_exp used in _LIBC environment. - Build the range expression which starts from START_ELEM, and ends - at END_ELEM. The result are written to MBCSET and SBCSET. - RANGE_ALLOC is the allocated size of mbcset->range_starts, and - mbcset->range_ends, is a pointer argument since we may - update it. */ - - auto inline reg_errcode_t - __attribute__ ((always_inline)) - build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, - bracket_elem_t *start_elem, bracket_elem_t *end_elem) - { - unsigned int ch; - uint32_t start_collseq; - uint32_t end_collseq; - - /* Equivalence Classes and Character Classes can't be a range - start/end. */ - if (__glibc_unlikely (start_elem->type == EQUIV_CLASS - || start_elem->type == CHAR_CLASS - || end_elem->type == EQUIV_CLASS - || end_elem->type == CHAR_CLASS)) - return REG_ERANGE; - - /* FIXME: Implement rational ranges here, too. */ - start_collseq = lookup_collation_sequence_value (start_elem); - end_collseq = lookup_collation_sequence_value (end_elem); - /* Check start/end collation sequence values. */ - if (__glibc_unlikely (start_collseq == UINT_MAX - || end_collseq == UINT_MAX)) - return REG_ECOLLATE; - if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) - && start_collseq > end_collseq)) - return REG_ERANGE; - - /* Got valid collation sequence values, add them as a new entry. - However, if we have no collation elements, and the character set - is single byte, the single byte character set that we - build below suffices. */ - if (nrules > 0 || dfa->mb_cur_max > 1) - { - /* Check the space of the arrays. */ - if (__glibc_unlikely (*range_alloc == mbcset->nranges)) - { - /* There is not enough space, need realloc. */ - uint32_t *new_array_start; - uint32_t *new_array_end; - Idx new_nranges; - - /* +1 in case of mbcset->nranges is 0. */ - new_nranges = 2 * mbcset->nranges + 1; - new_array_start = re_realloc (mbcset->range_starts, uint32_t, - new_nranges); - new_array_end = re_realloc (mbcset->range_ends, uint32_t, - new_nranges); - - if (__glibc_unlikely (new_array_start == NULL - || new_array_end == NULL)) - return REG_ESPACE; - - mbcset->range_starts = new_array_start; - mbcset->range_ends = new_array_end; - *range_alloc = new_nranges; - } - - mbcset->range_starts[mbcset->nranges] = start_collseq; - mbcset->range_ends[mbcset->nranges++] = end_collseq; - } - - /* Build the table for single byte characters. */ - for (ch = 0; ch < SBC_MAX; ch++) - { - uint32_t ch_collseq; - /* - if (MB_CUR_MAX == 1) - */ - if (nrules == 0) - ch_collseq = collseqmb[ch]; - else - ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); - if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) - bitset_set (sbcset, ch); - } - return REG_NOERROR; - } - - /* Local function for parse_bracket_exp used in _LIBC environment. - Build the collating element which is represented by NAME. - The result are written to MBCSET and SBCSET. - COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a - pointer argument since we may update it. */ - - auto inline reg_errcode_t - __attribute__ ((always_inline)) - build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, - Idx *coll_sym_alloc, const unsigned char *name) - { - int32_t elem, idx; - size_t name_len = strlen ((const char *) name); - if (nrules != 0) - { - elem = seek_collating_symbol_entry (name, name_len); - if (elem != -1) - { - /* We found the entry. */ - idx = symb_table[2 * elem + 1]; - /* Skip the name of collating element name. */ - idx += 1 + extra[idx]; - } - else if (name_len == 1) - { - /* No valid character, treat it as a normal - character. */ - bitset_set (sbcset, name[0]); - return REG_NOERROR; - } - else - return REG_ECOLLATE; - - /* Got valid collation sequence, add it as a new entry. */ - /* Check the space of the arrays. */ - if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) - { - /* Not enough, realloc it. */ - /* +1 in case of mbcset->ncoll_syms is 0. */ - Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; - /* Use realloc since mbcset->coll_syms is NULL - if *alloc == 0. */ - int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, - new_coll_sym_alloc); - if (__glibc_unlikely (new_coll_syms == NULL)) - return REG_ESPACE; - mbcset->coll_syms = new_coll_syms; - *coll_sym_alloc = new_coll_sym_alloc; - } - mbcset->coll_syms[mbcset->ncoll_syms++] = idx; - return REG_NOERROR; - } - else - { - if (__glibc_unlikely (name_len != 1)) - return REG_ECOLLATE; - else - { - bitset_set (sbcset, name[0]); - return REG_NOERROR; - } - } - } + int32_t table_size = 0; + const int32_t *symb_table = NULL; + const unsigned char *extra = NULL; #endif re_token_t br_token; @@ -3230,7 +3245,9 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, #ifdef _LIBC *err = build_range_exp (sbcset, mbcset, &range_alloc, - &start_elem, &end_elem); + &start_elem, &end_elem, + dfa, syntax, nrules, collseqmb, collseqwc, + table_size, symb_table, extra); #else # ifdef RE_ENABLE_I18N *err = build_range_exp (syntax, sbcset, @@ -3283,7 +3300,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, #ifdef RE_ENABLE_I18N mbcset, &coll_sym_alloc, #endif /* RE_ENABLE_I18N */ - start_elem.opr.name); + start_elem.opr.name, + nrules, table_size, symb_table, extra); if (__glibc_unlikely (*err != REG_NOERROR)) goto parse_bracket_exp_free_return; break;