posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           int length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  27 static reg_errcode_t init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 static reg_errcode_t analyze (re_dfa_t *dfa);
  34 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  35 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  36 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  37 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  38 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  39                                              int top_clone_node, int root_node,
  40                                              unsigned int constraint);
  41 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  42                                      unsigned int constraint);
  43 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  44                                    unsigned int constraint);
  45 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  46 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  47                                          int node, int root);
  48 static void calc_inveclosure (re_dfa_t *dfa);
  49 static int fetch_number (re_string_t *input, re_token_t *token,
  50                          reg_syntax_t syntax);
  51 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
  52 static int peek_token (re_token_t *token, re_string_t *input,
  53                         reg_syntax_t syntax);
  54 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  55                                reg_syntax_t syntax);
  56 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  57                           reg_syntax_t syntax, reg_errcode_t *err);
  58 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  59                                   re_token_t *token, reg_syntax_t syntax,
  60                                   int nest, reg_errcode_t *err);
  61 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  62                                  re_token_t *token, reg_syntax_t syntax,
  63                                  int nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  65                                      re_token_t *token, reg_syntax_t syntax,
  66                                      int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  68                                   re_token_t *token, reg_syntax_t syntax,
  69                                   int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  71                                  re_dfa_t *dfa, re_token_t *token,
  72                                  reg_syntax_t syntax, reg_errcode_t *err);
  73 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  74                                       re_token_t *token, reg_syntax_t syntax,
  75                                       reg_errcode_t *err);
  76 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  77                                             re_string_t *regexp,
  78                                             re_token_t *token, int token_len,
  79                                             re_dfa_t *dfa,
  80                                             reg_syntax_t syntax);
  81 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  82                                           re_string_t *regexp,
  83                                           re_token_t *token);
  84 #ifndef _LIBC
  85 # ifdef RE_ENABLE_I18N
  86 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
  87                                       re_charset_t *mbcset, int *range_alloc,
  88                                       bracket_elem_t *start_elem,
  89                                       bracket_elem_t *end_elem);
  90 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
  91                                              re_charset_t *mbcset,
  92                                              int *coll_sym_alloc,
  93                                              const unsigned char *name);
  94 # else /* not RE_ENABLE_I18N */
  95 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
  96                                       bracket_elem_t *start_elem,
  97                                       bracket_elem_t *end_elem);
  98 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
  99                                              const unsigned char *name);
 100 # endif /* not RE_ENABLE_I18N */
 101 #endif /* not _LIBC */
 102 #ifdef RE_ENABLE_I18N
 103 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 104                                         re_charset_t *mbcset,
 105                                         int *equiv_class_alloc,
 106                                         const unsigned char *name);
 107 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 108                                       re_bitset_ptr_t sbcset,
 109                                       re_charset_t *mbcset,
 110                                       int *char_class_alloc,
 111                                       const unsigned char *class_name,
 112                                       reg_syntax_t syntax);
 113 #else  /* not RE_ENABLE_I18N */
 114 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 115                                         const unsigned char *name);
 116 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 117                                       re_bitset_ptr_t sbcset,
 118                                       const unsigned char *class_name,
 119                                       reg_syntax_t syntax);
 120 #endif /* not RE_ENABLE_I18N */
 121 static bin_tree_t *build_word_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
 122                                   int not, reg_errcode_t *err);
 123 static void free_bin_tree (bin_tree_t *tree);
 124 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
 125                                 re_token_type_t type, int index);
 126 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 127 \f
 128 /* This table gives an error message for each of the error codes listed
 129    in regex.h.  Obviously the order here has to be same as there.
 130    POSIX doesn't require that we do anything for REG_NOERROR,
 131    but why not be nice?  */
 132
 133 const char __re_error_msgid[] attribute_hidden =
 134   {
 135 #define REG_NOERROR_IDX 0
 136     gettext_noop ("Success")    /* REG_NOERROR */
 137     "\0"
 138 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 139     gettext_noop ("No match")   /* REG_NOMATCH */
 140     "\0"
 141 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 142     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 143     "\0"
 144 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 145     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 146     "\0"
 147 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 148     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 149     "\0"
 150 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 151     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 152     "\0"
 153 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 154     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 155     "\0"
 156 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 157     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 158     "\0"
 159 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 160     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 161     "\0"
 162 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 163     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 164     "\0"
 165 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 166     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 167     "\0"
 168 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 169     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 170     "\0"
 171 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 172     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 173     "\0"
 174 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 175     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 176     "\0"
 177 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 178     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 179     "\0"
 180 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 181     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 182     "\0"
 183 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 184     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 185   };
 186
 187 const size_t __re_error_msgid_idx[] attribute_hidden =
 188   {
 189     REG_NOERROR_IDX,
 190     REG_NOMATCH_IDX,
 191     REG_BADPAT_IDX,
 192     REG_ECOLLATE_IDX,
 193     REG_ECTYPE_IDX,
 194     REG_EESCAPE_IDX,
 195     REG_ESUBREG_IDX,
 196     REG_EBRACK_IDX,
 197     REG_EPAREN_IDX,
 198     REG_EBRACE_IDX,
 199     REG_BADBR_IDX,
 200     REG_ERANGE_IDX,
 201     REG_ESPACE_IDX,
 202     REG_BADRPT_IDX,
 203     REG_EEND_IDX,
 204     REG_ESIZE_IDX,
 205     REG_ERPAREN_IDX
 206   };
 207 \f
 208 /* Entry points for GNU code.  */
 209
 210 /* re_compile_pattern is the GNU regular expression compiler: it
 211    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 212    Returns 0 if the pattern was valid, otherwise an error string.
 213
 214    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 215    are set in BUFP on entry.  */
 216
 217 const char *
 218 re_compile_pattern (pattern, length, bufp)
 219     const char *pattern;
 220     size_t length;
 221     struct re_pattern_buffer *bufp;
 222 {
 223   reg_errcode_t ret;
 224
 225   /* And GNU code determines whether or not to get register information
 226      by passing null for the REGS argument to re_match, etc., not by
 227      setting no_sub.  */
 228   bufp->no_sub = 0;
 229
 230   /* Match anchors at newline.  */
 231   bufp->newline_anchor = 1;
 232
 233   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 234
 235   if (!ret)
 236     return NULL;
 237   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 238 }
 239 #ifdef _LIBC
 240 weak_alias (__re_compile_pattern, re_compile_pattern)
 241 #endif
 242
 243 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 244    also be assigned to arbitrarily: each pattern buffer stores its own
 245    syntax, so it can be changed between regex compilations.  */
 246 /* This has no initializer because initialized variables in Emacs
 247    become read-only after dumping.  */
 248 reg_syntax_t re_syntax_options;
 249
 250
 251 /* Specify the precise syntax of regexps for compilation.  This provides
 252    for compatibility for various utilities which historically have
 253    different, incompatible syntaxes.
 254
 255    The argument SYNTAX is a bit mask comprised of the various bits
 256    defined in regex.h.  We return the old syntax.  */
 257
 258 reg_syntax_t
 259 re_set_syntax (syntax)
 260     reg_syntax_t syntax;
 261 {
 262   reg_syntax_t ret = re_syntax_options;
 263
 264   re_syntax_options = syntax;
 265   return ret;
 266 }
 267 #ifdef _LIBC
 268 weak_alias (__re_set_syntax, re_set_syntax)
 269 #endif
 270
 271 int
 272 re_compile_fastmap (bufp)
 273     struct re_pattern_buffer *bufp;
 274 {
 275   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 276   char *fastmap = bufp->fastmap;
 277
 278   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 279   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 280   if (dfa->init_state != dfa->init_state_word)
 281     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 282   if (dfa->init_state != dfa->init_state_nl)
 283     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 284   if (dfa->init_state != dfa->init_state_begbuf)
 285     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 286   bufp->fastmap_accurate = 1;
 287   return 0;
 288 }
 289 #ifdef _LIBC
 290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 291 #endif
 292
 293 static inline void
 294 __attribute ((always_inline))
 295 re_set_fastmap (char *fastmap, int icase, int ch)
 296 {
 297   fastmap[ch] = 1;
 298   if (icase)
 299     fastmap[tolower (ch)] = 1;
 300 }
 301
 302 /* Helper function for re_compile_fastmap.
 303    Compile fastmap for the initial_state INIT_STATE.  */
 304
 305 static void
 306 re_compile_fastmap_iter (bufp, init_state, fastmap)
 307      regex_t *bufp;
 308      const re_dfastate_t *init_state;
 309      char *fastmap;
 310 {
 311   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 312   int node_cnt;
 313   int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
 314   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 315     {
 316       int node = init_state->nodes.elems[node_cnt];
 317       re_token_type_t type = dfa->nodes[node].type;
 318
 319       if (type == CHARACTER)
 320         re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 321       else if (type == SIMPLE_BRACKET)
 322         {
 323           int i, j, ch;
 324           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 325             for (j = 0; j < UINT_BITS; ++j, ++ch)
 326               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 327                 re_set_fastmap (fastmap, icase, ch);
 328         }
 329 #ifdef RE_ENABLE_I18N
 330       else if (type == COMPLEX_BRACKET)
 331         {
 332           int i;
 333           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 334           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 335               || cset->nranges || cset->nchar_classes)
 336             {
 337 # ifdef _LIBC
 338               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 339                 {
 340                   /* In this case we want to catch the bytes which are
 341                      the first byte of any collation elements.
 342                      e.g. In da_DK, we want to catch 'a' since "aa"
 343                           is a valid collation element, and don't catch
 344                           'b' since 'b' is the only collation element
 345                           which starts from 'b'.  */
 346                   int j, ch;
 347                   const int32_t *table = (const int32_t *)
 348                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 349                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 350                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 351                       if (table[ch] < 0)
 352                         re_set_fastmap (fastmap, icase, ch);
 353                 }
 354 # else
 355               if (MB_CUR_MAX > 1)
 356                 for (i = 0; i < SBC_MAX; ++i)
 357                   if (__btowc (i) == WEOF)
 358                     re_set_fastmap (fastmap, icase, i);
 359 # endif /* not _LIBC */
 360             }
 361           for (i = 0; i < cset->nmbchars; ++i)
 362             {
 363               char buf[256];
 364               mbstate_t state;
 365               memset (&state, '\0', sizeof (state));
 366               __wcrtomb (buf, cset->mbchars[i], &state);
 367               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 368             }
 369         }
 370 #endif /* RE_ENABLE_I18N */
 371       else if (type == END_OF_RE || type == OP_PERIOD)
 372         {
 373           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 374           if (type == END_OF_RE)
 375             bufp->can_be_null = 1;
 376           return;
 377         }
 378     }
 379 }
 380 \f
 381 /* Entry point for POSIX code.  */
 382 /* regcomp takes a regular expression as a string and compiles it.
 383
 384    PREG is a regex_t *.  We do not expect any fields to be initialized,
 385    since POSIX says we shouldn't.  Thus, we set
 386
 387      `buffer' to the compiled pattern;
 388      `used' to the length of the compiled pattern;
 389      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 390        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 391        RE_SYNTAX_POSIX_BASIC;
 392      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 393      `fastmap' to an allocated space for the fastmap;
 394      `fastmap_accurate' to zero;
 395      `re_nsub' to the number of subexpressions in PATTERN.
 396
 397    PATTERN is the address of the pattern string.
 398
 399    CFLAGS is a series of bits which affect compilation.
 400
 401      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 402      use POSIX basic syntax.
 403
 404      If REG_NEWLINE is set, then . and [^...] don't match newline.
 405      Also, regexec will try a match beginning after every newline.
 406
 407      If REG_ICASE is set, then we considers upper- and lowercase
 408      versions of letters to be equivalent when matching.
 409
 410      If REG_NOSUB is set, then when PREG is passed to regexec, that
 411      routine will report only success or failure, and nothing about the
 412      registers.
 413
 414    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 415    the return codes and their meanings.)  */
 416
 417 int
 418 regcomp (preg, pattern, cflags)
 419     regex_t *__restrict preg;
 420     const char *__restrict pattern;
 421     int cflags;
 422 {
 423   reg_errcode_t ret;
 424   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 425                          : RE_SYNTAX_POSIX_BASIC);
 426
 427   preg->buffer = NULL;
 428   preg->allocated = 0;
 429   preg->used = 0;
 430
 431   /* Try to allocate space for the fastmap.  */
 432   preg->fastmap = re_malloc (char, SBC_MAX);
 433   if (BE (preg->fastmap == NULL, 0))
 434     return REG_ESPACE;
 435
 436   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 437
 438   /* If REG_NEWLINE is set, newlines are treated differently.  */
 439   if (cflags & REG_NEWLINE)
 440     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 441       syntax &= ~RE_DOT_NEWLINE;
 442       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 443       /* It also changes the matching behavior.  */
 444       preg->newline_anchor = 1;
 445     }
 446   else
 447     preg->newline_anchor = 0;
 448   preg->no_sub = !!(cflags & REG_NOSUB);
 449   preg->translate = NULL;
 450
 451   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 452
 453   /* POSIX doesn't distinguish between an unmatched open-group and an
 454      unmatched close-group: both are REG_EPAREN.  */
 455   if (ret == REG_ERPAREN)
 456     ret = REG_EPAREN;
 457
 458   /* We have already checked preg->fastmap != NULL.  */
 459   if (BE (ret == REG_NOERROR, 1))
 460     /* Compute the fastmap now, since regexec cannot modify the pattern
 461        buffer.  This function nevers fails in this implementation.  */
 462     (void) re_compile_fastmap (preg);
 463   else
 464     {
 465       /* Some error occurred while compiling the expression.  */
 466       re_free (preg->fastmap);
 467       preg->fastmap = NULL;
 468     }
 469
 470   return (int) ret;
 471 }
 472 #ifdef _LIBC
 473 weak_alias (__regcomp, regcomp)
 474 #endif
 475
 476 /* Returns a message corresponding to an error code, ERRCODE, returned
 477    from either regcomp or regexec.   We don't use PREG here.  */
 478
 479 size_t
 480 regerror (errcode, preg, errbuf, errbuf_size)
 481     int errcode;
 482     const regex_t *preg;
 483     char *errbuf;
 484     size_t errbuf_size;
 485 {
 486   const char *msg;
 487   size_t msg_size;
 488
 489   if (BE (errcode < 0
 490           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 491                                / sizeof (__re_error_msgid_idx[0])), 0))
 492     /* Only error codes returned by the rest of the code should be passed
 493        to this routine.  If we are given anything else, or if other regex
 494        code generates an invalid error code, then the program has a bug.
 495        Dump core so we can fix it.  */
 496     abort ();
 497
 498   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 499
 500   msg_size = strlen (msg) + 1; /* Includes the null.  */
 501
 502   if (BE (errbuf_size != 0, 1))
 503     {
 504       if (BE (msg_size > errbuf_size, 0))
 505         {
 506 #if defined HAVE_MEMPCPY || defined _LIBC
 507           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 508 #else
 509           memcpy (errbuf, msg, errbuf_size - 1);
 510           errbuf[errbuf_size - 1] = 0;
 511 #endif
 512         }
 513       else
 514         memcpy (errbuf, msg, msg_size);
 515     }
 516
 517   return msg_size;
 518 }
 519 #ifdef _LIBC
 520 weak_alias (__regerror, regerror)
 521 #endif
 522
 523
 524 static void
 525 free_dfa_content (re_dfa_t *dfa)
 526 {
 527   int i, j;
 528
 529   re_free (dfa->subexps);
 530
 531   for (i = 0; i < dfa->nodes_len; ++i)
 532     {
 533       re_token_t *node = dfa->nodes + i;
 534 #ifdef RE_ENABLE_I18N
 535       if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 536         free_charset (node->opr.mbcset);
 537       else
 538 #endif /* RE_ENABLE_I18N */
 539         if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 540           re_free (node->opr.sbcset);
 541     }
 542   re_free (dfa->nexts);
 543   for (i = 0; i < dfa->nodes_len; ++i)
 544     {
 545       if (dfa->eclosures != NULL)
 546         re_node_set_free (dfa->eclosures + i);
 547       if (dfa->inveclosures != NULL)
 548         re_node_set_free (dfa->inveclosures + i);
 549       if (dfa->edests != NULL)
 550         re_node_set_free (dfa->edests + i);
 551     }
 552   re_free (dfa->edests);
 553   re_free (dfa->eclosures);
 554   re_free (dfa->inveclosures);
 555   re_free (dfa->nodes);
 556
 557   for (i = 0; i <= dfa->state_hash_mask; ++i)
 558     {
 559       struct re_state_table_entry *entry = dfa->state_table + i;
 560       for (j = 0; j < entry->num; ++j)
 561         {
 562           re_dfastate_t *state = entry->array[j];
 563           free_state (state);
 564         }
 565       re_free (entry->array);
 566     }
 567   re_free (dfa->state_table);
 568
 569   if (dfa->word_char != NULL)
 570     re_free (dfa->word_char);
 571 #ifdef DEBUG
 572   re_free (dfa->re_str);
 573 #endif
 574
 575   re_free (dfa);
 576 }
 577
 578
 579 /* Free dynamically allocated space used by PREG.  */
 580
 581 void
 582 regfree (preg)
 583     regex_t *preg;
 584 {
 585   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 586   if (BE (dfa != NULL, 1))
 587     free_dfa_content (dfa);
 588
 589   re_free (preg->fastmap);
 590 }
 591 #ifdef _LIBC
 592 weak_alias (__regfree, regfree)
 593 #endif
 594 \f
 595 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 596    them unless specifically requested.  */
 597
 598 #if defined _REGEX_RE_COMP || defined _LIBC
 599
 600 /* BSD has one and only one pattern buffer.  */
 601 static struct re_pattern_buffer re_comp_buf;
 602
 603 char *
 604 # ifdef _LIBC
 605 /* Make these definitions weak in libc, so POSIX programs can redefine
 606    these names if they don't use our functions, and still use
 607    regcomp/regexec above without link errors.  */
 608 weak_function
 609 # endif
 610 re_comp (s)
 611      const char *s;
 612 {
 613   reg_errcode_t ret;
 614   char *fastmap;
 615
 616   if (!s)
 617     {
 618       if (!re_comp_buf.buffer)
 619         return gettext ("No previous regular expression");
 620       return 0;
 621     }
 622
 623   if (re_comp_buf.buffer)
 624     {
 625       fastmap = re_comp_buf.fastmap;
 626       re_comp_buf.fastmap = NULL;
 627       __regfree (&re_comp_buf);
 628       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 629       re_comp_buf.fastmap = fastmap;
 630     }
 631
 632   if (re_comp_buf.fastmap == NULL)
 633     {
 634       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 635       if (re_comp_buf.fastmap == NULL)
 636         return (char *) gettext (__re_error_msgid
 637                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 638     }
 639
 640   /* Since `re_exec' always passes NULL for the `regs' argument, we
 641      don't need to initialize the pattern buffer fields which affect it.  */
 642
 643   /* Match anchors at newlines.  */
 644   re_comp_buf.newline_anchor = 1;
 645
 646   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 647
 648   if (!ret)
 649     return NULL;
 650
 651   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 652   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 653 }
 654
 655 #ifdef _LIBC
 656 libc_freeres_fn (free_mem)
 657 {
 658   __regfree (&re_comp_buf);
 659 }
 660 #endif
 661
 662 #endif /* _REGEX_RE_COMP */
 663 \f
 664 /* Internal entry point.
 665    Compile the regular expression PATTERN, whose length is LENGTH.
 666    SYNTAX indicate regular expression's syntax.  */
 667
 668 static reg_errcode_t
 669 re_compile_internal (preg, pattern, length, syntax)
 670      regex_t *preg;
 671      const char * pattern;
 672      int length;
 673      reg_syntax_t syntax;
 674 {
 675   reg_errcode_t err = REG_NOERROR;
 676   re_dfa_t *dfa;
 677   re_string_t regexp;
 678
 679   /* Initialize the pattern buffer.  */
 680   preg->fastmap_accurate = 0;
 681   preg->syntax = syntax;
 682   preg->not_bol = preg->not_eol = 0;
 683   preg->used = 0;
 684   preg->re_nsub = 0;
 685   preg->can_be_null = 0;
 686   preg->regs_allocated = REGS_UNALLOCATED;
 687
 688   /* Initialize the dfa.  */
 689   dfa = (re_dfa_t *) preg->buffer;
 690   if (preg->allocated < sizeof (re_dfa_t))
 691     {
 692       /* If zero allocated, but buffer is non-null, try to realloc
 693          enough space.  This loses if buffer's address is bogus, but
 694          that is the user's responsibility.  If ->buffer is NULL this
 695          is a simple allocation.  */
 696       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 697       if (dfa == NULL)
 698         return REG_ESPACE;
 699       preg->allocated = sizeof (re_dfa_t);
 700     }
 701   preg->buffer = (unsigned char *) dfa;
 702   preg->used = sizeof (re_dfa_t);
 703
 704   err = init_dfa (dfa, length);
 705   if (BE (err != REG_NOERROR, 0))
 706     {
 707       re_free (dfa);
 708       preg->buffer = NULL;
 709       preg->allocated = 0;
 710       return err;
 711     }
 712 #ifdef DEBUG
 713   dfa->re_str = re_malloc (char, length + 1);
 714   strncpy (dfa->re_str, pattern, length + 1);
 715 #endif
 716
 717   err = re_string_construct (&regexp, pattern, length, preg->translate,
 718                              syntax & RE_ICASE);
 719   if (BE (err != REG_NOERROR, 0))
 720     {
 721       re_free (dfa);
 722       preg->buffer = NULL;
 723       preg->allocated = 0;
 724       return err;
 725     }
 726
 727   /* Parse the regular expression, and build a structure tree.  */
 728   preg->re_nsub = 0;
 729   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 730   if (BE (dfa->str_tree == NULL, 0))
 731     goto re_compile_internal_free_return;
 732
 733   /* Analyze the tree and collect information which is necessary to
 734      create the dfa.  */
 735   err = analyze (dfa);
 736   if (BE (err != REG_NOERROR, 0))
 737     goto re_compile_internal_free_return;
 738
 739   /* Then create the initial state of the dfa.  */
 740   err = create_initial_state (dfa);
 741
 742   /* Release work areas.  */
 743   free_workarea_compile (preg);
 744   re_string_destruct (&regexp);
 745
 746   if (BE (err != REG_NOERROR, 0))
 747     {
 748     re_compile_internal_free_return:
 749       free_dfa_content (dfa);
 750       preg->buffer = NULL;
 751       preg->allocated = 0;
 752     }
 753
 754   return err;
 755 }
 756
 757 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 758    as the initial length of some arrays.  */
 759
 760 static reg_errcode_t
 761 init_dfa (dfa, pat_len)
 762      re_dfa_t *dfa;
 763      int pat_len;
 764 {
 765   int table_size;
 766
 767   memset (dfa, '\0', sizeof (re_dfa_t));
 768
 769   dfa->nodes_alloc = pat_len + 1;
 770   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 771
 772   dfa->states_alloc = pat_len + 1;
 773
 774   /*  table_size = 2 ^ ceil(log pat_len) */
 775   for (table_size = 1; table_size > 0; table_size <<= 1)
 776     if (table_size > pat_len)
 777       break;
 778
 779   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 780   dfa->state_hash_mask = table_size - 1;
 781
 782   dfa->subexps_alloc = 1;
 783   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 784   dfa->word_char = NULL;
 785
 786   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
 787           || dfa->subexps == NULL, 0))
 788     {
 789       /* We don't bother to free anything which was allocated.  Very
 790          soon the process will go down anyway.  */
 791       dfa->subexps = NULL;
 792       dfa->state_table = NULL;
 793       dfa->nodes = NULL;
 794       return REG_ESPACE;
 795     }
 796   return REG_NOERROR;
 797 }
 798
 799 /* Initialize WORD_CHAR table, which indicate which character is
 800    "word".  In this case "word" means that it is the word construction
 801    character used by some operators like "\<", "\>", etc.  */
 802
 803 static reg_errcode_t
 804 init_word_char (dfa)
 805      re_dfa_t *dfa;
 806 {
 807   int i, j, ch;
 808   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 809   if (BE (dfa->word_char == NULL, 0))
 810     return REG_ESPACE;
 811   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 812     for (j = 0; j < UINT_BITS; ++j, ++ch)
 813       if (isalnum (ch) || ch == '_')
 814         dfa->word_char[i] |= 1 << j;
 815   return REG_NOERROR;
 816 }
 817
 818 /* Free the work area which are only used while compiling.  */
 819
 820 static void
 821 free_workarea_compile (preg)
 822      regex_t *preg;
 823 {
 824   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 825   free_bin_tree (dfa->str_tree);
 826   dfa->str_tree = NULL;
 827   re_free (dfa->org_indices);
 828   dfa->org_indices = NULL;
 829 }
 830
 831 /* Create initial states for all contexts.  */
 832
 833 static reg_errcode_t
 834 create_initial_state (dfa)
 835      re_dfa_t *dfa;
 836 {
 837   int first, i;
 838   reg_errcode_t err;
 839   re_node_set init_nodes;
 840
 841   /* Initial states have the epsilon closure of the node which is
 842      the first node of the regular expression.  */
 843   first = dfa->str_tree->first;
 844   dfa->init_node = first;
 845   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 846   if (BE (err != REG_NOERROR, 0))
 847     return err;
 848
 849   /* The back-references which are in initial states can epsilon transit,
 850      since in this case all of the subexpressions can be null.
 851      Then we add epsilon closures of the nodes which are the next nodes of
 852      the back-references.  */
 853   if (dfa->nbackref > 0)
 854     for (i = 0; i < init_nodes.nelem; ++i)
 855       {
 856         int node_idx = init_nodes.elems[i];
 857         re_token_type_t type = dfa->nodes[node_idx].type;
 858
 859         int clexp_idx;
 860         if (type != OP_BACK_REF)
 861           continue;
 862         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 863           {
 864             re_token_t *clexp_node;
 865             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 866             if (clexp_node->type == OP_CLOSE_SUBEXP
 867                 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
 868               break;
 869           }
 870         if (clexp_idx == init_nodes.nelem)
 871           continue;
 872
 873         if (type == OP_BACK_REF)
 874           {
 875             int dest_idx = dfa->edests[node_idx].elems[0];
 876             if (!re_node_set_contains (&init_nodes, dest_idx))
 877               {
 878                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 879                 i = 0;
 880               }
 881           }
 882       }
 883
 884   /* It must be the first time to invoke acquire_state.  */
 885   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 886   /* We don't check ERR here, since the initial state must not be NULL.  */
 887   if (BE (dfa->init_state == NULL, 0))
 888     return err;
 889   if (dfa->init_state->has_constraint)
 890     {
 891       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 892                                                        CONTEXT_WORD);
 893       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 894                                                      CONTEXT_NEWLINE);
 895       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 896                                                          &init_nodes,
 897                                                          CONTEXT_NEWLINE
 898                                                          | CONTEXT_BEGBUF);
 899       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 900               || dfa->init_state_begbuf == NULL, 0))
 901         return err;
 902     }
 903   else
 904     dfa->init_state_word = dfa->init_state_nl
 905       = dfa->init_state_begbuf = dfa->init_state;
 906
 907   re_node_set_free (&init_nodes);
 908   return REG_NOERROR;
 909 }
 910 \f
 911 /* Analyze the structure tree, and calculate "first", "next", "edest",
 912    "eclosure", and "inveclosure".  */
 913
 914 static reg_errcode_t
 915 analyze (dfa)
 916      re_dfa_t *dfa;
 917 {
 918   int i;
 919   reg_errcode_t ret;
 920
 921   /* Allocate arrays.  */
 922   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
 923   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
 924   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
 925   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 926   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 927   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
 928           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
 929     return REG_ESPACE;
 930   /* Initialize them.  */
 931   for (i = 0; i < dfa->nodes_len; ++i)
 932     {
 933       dfa->nexts[i] = -1;
 934       re_node_set_init_empty (dfa->edests + i);
 935       re_node_set_init_empty (dfa->eclosures + i);
 936       re_node_set_init_empty (dfa->inveclosures + i);
 937     }
 938
 939   ret = analyze_tree (dfa, dfa->str_tree);
 940   if (BE (ret == REG_NOERROR, 1))
 941     {
 942       ret = calc_eclosure (dfa);
 943       if (ret == REG_NOERROR)
 944         calc_inveclosure (dfa);
 945     }
 946   return ret;
 947 }
 948
 949 /* Helper functions for analyze.
 950    This function calculate "first", "next", and "edest" for the subtree
 951    whose root is NODE.  */
 952
 953 static reg_errcode_t
 954 analyze_tree (dfa, node)
 955      re_dfa_t *dfa;
 956      bin_tree_t *node;
 957 {
 958   reg_errcode_t ret;
 959   if (node->first == -1)
 960     calc_first (dfa, node);
 961   if (node->next == -1)
 962     calc_next (dfa, node);
 963   if (node->eclosure.nelem == 0)
 964     calc_epsdest (dfa, node);
 965   /* Calculate "first" etc. for the left child.  */
 966   if (node->left != NULL)
 967     {
 968       ret = analyze_tree (dfa, node->left);
 969       if (BE (ret != REG_NOERROR, 0))
 970         return ret;
 971     }
 972   /* Calculate "first" etc. for the right child.  */
 973   if (node->right != NULL)
 974     {
 975       ret = analyze_tree (dfa, node->right);
 976       if (BE (ret != REG_NOERROR, 0))
 977         return ret;
 978     }
 979   return REG_NOERROR;
 980 }
 981
 982 /* Calculate "first" for the node NODE.  */
 983 static void
 984 calc_first (dfa, node)
 985      re_dfa_t *dfa;
 986      bin_tree_t *node;
 987 {
 988   int idx, type;
 989   idx = node->node_idx;
 990   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
 991
 992   switch (type)
 993     {
 994 #ifdef DEBUG
 995     case OP_OPEN_BRACKET:
 996     case OP_CLOSE_BRACKET:
 997     case OP_OPEN_DUP_NUM:
 998     case OP_CLOSE_DUP_NUM:
 999     case OP_NON_MATCH_LIST:
1000     case OP_OPEN_COLL_ELEM:
1001     case OP_CLOSE_COLL_ELEM:
1002     case OP_OPEN_EQUIV_CLASS:
1003     case OP_CLOSE_EQUIV_CLASS:
1004     case OP_OPEN_CHAR_CLASS:
1005     case OP_CLOSE_CHAR_CLASS:
1006       /* These must not be appeared here.  */
1007       assert (0);
1008 #endif
1009     case END_OF_RE:
1010     case CHARACTER:
1011     case OP_PERIOD:
1012     case OP_DUP_ASTERISK:
1013     case OP_DUP_QUESTION:
1014 #ifdef RE_ENABLE_I18N
1015     case COMPLEX_BRACKET:
1016 #endif /* RE_ENABLE_I18N */
1017     case SIMPLE_BRACKET:
1018     case OP_BACK_REF:
1019     case ANCHOR:
1020     case OP_OPEN_SUBEXP:
1021     case OP_CLOSE_SUBEXP:
1022       node->first = idx;
1023       break;
1024     case OP_DUP_PLUS:
1025 #ifdef DEBUG
1026       assert (node->left != NULL);
1027 #endif
1028       if (node->left->first == -1)
1029         calc_first (dfa, node->left);
1030       node->first = node->left->first;
1031       break;
1032     case OP_ALT:
1033       node->first = idx;
1034       break;
1035       /* else fall through */
1036     default:
1037 #ifdef DEBUG
1038       assert (node->left != NULL);
1039 #endif
1040       if (node->left->first == -1)
1041         calc_first (dfa, node->left);
1042       node->first = node->left->first;
1043       break;
1044     }
1045 }
1046
1047 /* Calculate "next" for the node NODE.  */
1048
1049 static void
1050 calc_next (dfa, node)
1051      re_dfa_t *dfa;
1052      bin_tree_t *node;
1053 {
1054   int idx, type;
1055   bin_tree_t *parent = node->parent;
1056   if (parent == NULL)
1057     {
1058       node->next = -1;
1059       idx = node->node_idx;
1060       if (node->type == 0)
1061         dfa->nexts[idx] = node->next;
1062       return;
1063     }
1064
1065   idx = parent->node_idx;
1066   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1067
1068   switch (type)
1069     {
1070     case OP_DUP_ASTERISK:
1071     case OP_DUP_PLUS:
1072       node->next = idx;
1073       break;
1074     case CONCAT:
1075       if (parent->left == node)
1076         {
1077           if (parent->right->first == -1)
1078             calc_first (dfa, parent->right);
1079           node->next = parent->right->first;
1080           break;
1081         }
1082       /* else fall through */
1083     default:
1084       if (parent->next == -1)
1085         calc_next (dfa, parent);
1086       node->next = parent->next;
1087       break;
1088     }
1089   idx = node->node_idx;
1090   if (node->type == 0)
1091     dfa->nexts[idx] = node->next;
1092 }
1093
1094 /* Calculate "edest" for the node NODE.  */
1095
1096 static void
1097 calc_epsdest (dfa, node)
1098      re_dfa_t *dfa;
1099      bin_tree_t *node;
1100 {
1101   int idx;
1102   idx = node->node_idx;
1103   if (node->type == 0)
1104     {
1105       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1106           || dfa->nodes[idx].type == OP_DUP_PLUS
1107           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1108         {
1109           if (node->left->first == -1)
1110             calc_first (dfa, node->left);
1111           if (node->next == -1)
1112             calc_next (dfa, node);
1113           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1114                               node->next);
1115         }
1116       else if (dfa->nodes[idx].type == OP_ALT)
1117         {
1118           int left, right;
1119           if (node->left != NULL)
1120             {
1121               if (node->left->first == -1)
1122                 calc_first (dfa, node->left);
1123               left = node->left->first;
1124             }
1125           else
1126             {
1127               if (node->next == -1)
1128                 calc_next (dfa, node);
1129               left = node->next;
1130             }
1131           if (node->right != NULL)
1132             {
1133               if (node->right->first == -1)
1134                 calc_first (dfa, node->right);
1135               right = node->right->first;
1136             }
1137           else
1138             {
1139               if (node->next == -1)
1140                 calc_next (dfa, node);
1141               right = node->next;
1142             }
1143           re_node_set_init_2 (dfa->edests + idx, left, right);
1144         }
1145       else if (dfa->nodes[idx].type == ANCHOR
1146                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1147                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1148                || dfa->nodes[idx].type == OP_BACK_REF)
1149         re_node_set_init_1 (dfa->edests + idx, node->next);
1150     }
1151 }
1152
1153 /* Duplicate the epsilon closure of the node ROOT_NODE.
1154    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1155    to their own constraint.  */
1156
1157 static reg_errcode_t
1158 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1159                         init_constraint)
1160      re_dfa_t *dfa;
1161      int top_org_node, top_clone_node, root_node;
1162      unsigned int init_constraint;
1163 {
1164   reg_errcode_t err;
1165   int org_node, clone_node, ret;
1166   unsigned int constraint = init_constraint;
1167   for (org_node = top_org_node, clone_node = top_clone_node;;)
1168     {
1169       int org_dest, clone_dest;
1170       if (dfa->nodes[org_node].type == OP_BACK_REF)
1171         {
1172           /* If the back reference epsilon-transit, its destination must
1173              also have the constraint.  Then duplicate the epsilon closure
1174              of the destination of the back reference, and store it in
1175              edests of the back reference.  */
1176           org_dest = dfa->nexts[org_node];
1177           re_node_set_empty (dfa->edests + clone_node);
1178           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1179           if (BE (err != REG_NOERROR, 0))
1180             return err;
1181           dfa->nexts[clone_node] = dfa->nexts[org_node];
1182           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1183           if (BE (ret < 0, 0))
1184             return REG_ESPACE;
1185         }
1186       else if (dfa->edests[org_node].nelem == 0)
1187         {
1188           /* In case of the node can't epsilon-transit, don't duplicate the
1189              destination and store the original destination as the
1190              destination of the node.  */
1191           dfa->nexts[clone_node] = dfa->nexts[org_node];
1192           break;
1193         }
1194       else if (dfa->edests[org_node].nelem == 1)
1195         {
1196           /* In case of the node can epsilon-transit, and it has only one
1197              destination.  */
1198           org_dest = dfa->edests[org_node].elems[0];
1199           re_node_set_empty (dfa->edests + clone_node);
1200           if (dfa->nodes[org_node].type == ANCHOR)
1201             {
1202               /* In case of the node has another constraint, append it.  */
1203               if (org_node == root_node && clone_node != org_node)
1204                 {
1205                   /* ...but if the node is root_node itself, it means the
1206                      epsilon closure have a loop, then tie it to the
1207                      destination of the root_node.  */
1208                   ret = re_node_set_insert (dfa->edests + clone_node,
1209                                             org_dest);
1210                   if (BE (ret < 0, 0))
1211                     return REG_ESPACE;
1212                   break;
1213                 }
1214               constraint |= dfa->nodes[org_node].opr.ctx_type;
1215             }
1216           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1217           if (BE (err != REG_NOERROR, 0))
1218             return err;
1219           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1220           if (BE (ret < 0, 0))
1221             return REG_ESPACE;
1222         }
1223       else /* dfa->edests[org_node].nelem == 2 */
1224         {
1225           /* In case of the node can epsilon-transit, and it has two
1226              destinations. E.g. '|', '*', '+', '?'.   */
1227           org_dest = dfa->edests[org_node].elems[0];
1228           re_node_set_empty (dfa->edests + clone_node);
1229           /* Search for a duplicated node which satisfies the constraint.  */
1230           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1231           if (clone_dest == -1)
1232             {
1233               /* There are no such a duplicated node, create a new one.  */
1234               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1235               if (BE (err != REG_NOERROR, 0))
1236                 return err;
1237               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1238               if (BE (ret < 0, 0))
1239                 return REG_ESPACE;
1240               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1241                                             root_node, constraint);
1242               if (BE (err != REG_NOERROR, 0))
1243                 return err;
1244             }
1245           else
1246             {
1247               /* There are a duplicated node which satisfy the constraint,
1248                  use it to avoid infinite loop.  */
1249               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1250               if (BE (ret < 0, 0))
1251                 return REG_ESPACE;
1252             }
1253
1254           org_dest = dfa->edests[org_node].elems[1];
1255           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1256           if (BE (err != REG_NOERROR, 0))
1257             return err;
1258           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1259           if (BE (ret < 0, 0))
1260             return REG_ESPACE;
1261         }
1262       org_node = org_dest;
1263       clone_node = clone_dest;
1264     }
1265   return REG_NOERROR;
1266 }
1267
1268 /* Search for a node which is duplicated from the node ORG_NODE, and
1269    satisfies the constraint CONSTRAINT.  */
1270
1271 static int
1272 search_duplicated_node (dfa, org_node, constraint)
1273      re_dfa_t *dfa;
1274      int org_node;
1275      unsigned int constraint;
1276 {
1277   int idx;
1278   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1279     {
1280       if (org_node == dfa->org_indices[idx]
1281           && constraint == dfa->nodes[idx].constraint)
1282         return idx; /* Found.  */
1283     }
1284   return -1; /* Not found.  */
1285 }
1286
1287 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1288    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1289    otherwise return the error code.  */
1290
1291 static reg_errcode_t
1292 duplicate_node (new_idx, dfa, org_idx, constraint)
1293      re_dfa_t *dfa;
1294      int *new_idx, org_idx;
1295      unsigned int constraint;
1296 {
1297   re_token_t dup;
1298   int dup_idx;
1299
1300   dup = dfa->nodes[org_idx];
1301   dup_idx = re_dfa_add_node (dfa, dup, 1);
1302   if (BE (dup_idx == -1, 0))
1303     return REG_ESPACE;
1304   dfa->nodes[dup_idx].constraint = constraint;
1305   if (dfa->nodes[org_idx].type == ANCHOR)
1306     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1307   dfa->nodes[dup_idx].duplicated = 1;
1308   re_node_set_init_empty (dfa->edests + dup_idx);
1309   re_node_set_init_empty (dfa->eclosures + dup_idx);
1310   re_node_set_init_empty (dfa->inveclosures + dup_idx);
1311
1312   /* Store the index of the original node.  */
1313   dfa->org_indices[dup_idx] = org_idx;
1314   *new_idx = dup_idx;
1315   return REG_NOERROR;
1316 }
1317
1318 static void
1319 calc_inveclosure (dfa)
1320      re_dfa_t *dfa;
1321 {
1322   int src, idx, dest;
1323   for (src = 0; src < dfa->nodes_len; ++src)
1324     {
1325       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1326         {
1327           dest = dfa->eclosures[src].elems[idx];
1328           re_node_set_insert (dfa->inveclosures + dest, src);
1329         }
1330     }
1331 }
1332
1333 /* Calculate "eclosure" for all the node in DFA.  */
1334
1335 static reg_errcode_t
1336 calc_eclosure (dfa)
1337      re_dfa_t *dfa;
1338 {
1339   int node_idx, incomplete;
1340 #ifdef DEBUG
1341   assert (dfa->nodes_len > 0);
1342 #endif
1343   incomplete = 0;
1344   /* For each nodes, calculate epsilon closure.  */
1345   for (node_idx = 0; ; ++node_idx)
1346     {
1347       reg_errcode_t err;
1348       re_node_set eclosure_elem;
1349       if (node_idx == dfa->nodes_len)
1350         {
1351           if (!incomplete)
1352             break;
1353           incomplete = 0;
1354           node_idx = 0;
1355         }
1356
1357 #ifdef DEBUG
1358       assert (dfa->eclosures[node_idx].nelem != -1);
1359 #endif
1360       /* If we have already calculated, skip it.  */
1361       if (dfa->eclosures[node_idx].nelem != 0)
1362         continue;
1363       /* Calculate epsilon closure of `node_idx'.  */
1364       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1365       if (BE (err != REG_NOERROR, 0))
1366         return err;
1367
1368       if (dfa->eclosures[node_idx].nelem == 0)
1369         {
1370           incomplete = 1;
1371           re_node_set_free (&eclosure_elem);
1372         }
1373     }
1374   return REG_NOERROR;
1375 }
1376
1377 /* Calculate epsilon closure of NODE.  */
1378
1379 static reg_errcode_t
1380 calc_eclosure_iter (new_set, dfa, node, root)
1381      re_node_set *new_set;
1382      re_dfa_t *dfa;
1383      int node, root;
1384 {
1385   reg_errcode_t err;
1386   unsigned int constraint;
1387   int i, incomplete;
1388   re_node_set eclosure;
1389   incomplete = 0;
1390   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1391   if (BE (err != REG_NOERROR, 0))
1392     return err;
1393
1394   /* This indicates that we are calculating this node now.
1395      We reference this value to avoid infinite loop.  */
1396   dfa->eclosures[node].nelem = -1;
1397
1398   constraint = ((dfa->nodes[node].type == ANCHOR)
1399                 ? dfa->nodes[node].opr.ctx_type : 0);
1400   /* If the current node has constraints, duplicate all nodes.
1401      Since they must inherit the constraints.  */
1402   if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1403     {
1404       int org_node, cur_node;
1405       org_node = cur_node = node;
1406       err = duplicate_node_closure (dfa, node, node, node, constraint);
1407       if (BE (err != REG_NOERROR, 0))
1408         return err;
1409     }
1410
1411   /* Expand each epsilon destination nodes.  */
1412   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1413     for (i = 0; i < dfa->edests[node].nelem; ++i)
1414       {
1415         re_node_set eclosure_elem;
1416         int edest = dfa->edests[node].elems[i];
1417         /* If calculating the epsilon closure of `edest' is in progress,
1418            return intermediate result.  */
1419         if (dfa->eclosures[edest].nelem == -1)
1420           {
1421             incomplete = 1;
1422             continue;
1423           }
1424         /* If we haven't calculated the epsilon closure of `edest' yet,
1425            calculate now. Otherwise use calculated epsilon closure.  */
1426         if (dfa->eclosures[edest].nelem == 0)
1427           {
1428             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1429             if (BE (err != REG_NOERROR, 0))
1430               return err;
1431           }
1432         else
1433           eclosure_elem = dfa->eclosures[edest];
1434         /* Merge the epsilon closure of `edest'.  */
1435         re_node_set_merge (&eclosure, &eclosure_elem);
1436         /* If the epsilon closure of `edest' is incomplete,
1437            the epsilon closure of this node is also incomplete.  */
1438         if (dfa->eclosures[edest].nelem == 0)
1439           {
1440             incomplete = 1;
1441             re_node_set_free (&eclosure_elem);
1442           }
1443       }
1444
1445   /* Epsilon closures include itself.  */
1446   re_node_set_insert (&eclosure, node);
1447   if (incomplete && !root)
1448     dfa->eclosures[node].nelem = 0;
1449   else
1450     dfa->eclosures[node] = eclosure;
1451   *new_set = eclosure;
1452   return REG_NOERROR;
1453 }
1454 \f
1455 /* Functions for token which are used in the parser.  */
1456
1457 /* Fetch a token from INPUT.
1458    We must not use this function inside bracket expressions.  */
1459
1460 static re_token_t
1461 fetch_token (input, syntax)
1462      re_string_t *input;
1463      reg_syntax_t syntax;
1464 {
1465   re_token_t token;
1466   int consumed_byte;
1467   consumed_byte = peek_token (&token, input, syntax);
1468   re_string_skip_bytes (input, consumed_byte);
1469   return token;
1470 }
1471
1472 /* Peek a token from INPUT, and return the length of the token.
1473    We must not use this function inside bracket expressions.  */
1474
1475 static int
1476 peek_token (token, input, syntax)
1477      re_token_t *token;
1478      re_string_t *input;
1479      reg_syntax_t syntax;
1480 {
1481   unsigned char c;
1482
1483   if (re_string_eoi (input))
1484     {
1485       token->type = END_OF_RE;
1486       return 0;
1487     }
1488
1489   c = re_string_peek_byte (input, 0);
1490   token->opr.c = c;
1491
1492 #ifdef RE_ENABLE_I18N
1493   token->mb_partial = 0;
1494   if (MB_CUR_MAX > 1 &&
1495       !re_string_first_byte (input, re_string_cur_idx (input)))
1496     {
1497       token->type = CHARACTER;
1498       token->mb_partial = 1;
1499       return 1;
1500     }
1501 #endif
1502   if (c == '\\')
1503     {
1504       unsigned char c2;
1505       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1506         {
1507           token->type = BACK_SLASH;
1508           return 1;
1509         }
1510
1511       c2 = re_string_peek_byte_case (input, 1);
1512       token->opr.c = c2;
1513       token->type = CHARACTER;
1514       switch (c2)
1515         {
1516         case '|':
1517           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1518             token->type = OP_ALT;
1519           break;
1520         case '1': case '2': case '3': case '4': case '5':
1521         case '6': case '7': case '8': case '9':
1522           if (!(syntax & RE_NO_BK_REFS))
1523             {
1524               token->type = OP_BACK_REF;
1525               token->opr.idx = c2 - '0';
1526             }
1527           break;
1528         case '<':
1529           if (!(syntax & RE_NO_GNU_OPS))
1530             {
1531               token->type = ANCHOR;
1532               token->opr.idx = WORD_FIRST;
1533             }
1534           break;
1535         case '>':
1536           if (!(syntax & RE_NO_GNU_OPS))
1537             {
1538               token->type = ANCHOR;
1539               token->opr.idx = WORD_LAST;
1540             }
1541           break;
1542         case 'b':
1543           if (!(syntax & RE_NO_GNU_OPS))
1544             {
1545               token->type = ANCHOR;
1546               token->opr.idx = WORD_DELIM;
1547             }
1548           break;
1549         case 'B':
1550           if (!(syntax & RE_NO_GNU_OPS))
1551             {
1552               token->type = ANCHOR;
1553               token->opr.idx = INSIDE_WORD;
1554             }
1555           break;
1556         case 'w':
1557           if (!(syntax & RE_NO_GNU_OPS))
1558             token->type = OP_WORD;
1559           break;
1560         case 'W':
1561           if (!(syntax & RE_NO_GNU_OPS))
1562             token->type = OP_NOTWORD;
1563           break;
1564         case '`':
1565           if (!(syntax & RE_NO_GNU_OPS))
1566             {
1567               token->type = ANCHOR;
1568               token->opr.idx = BUF_FIRST;
1569             }
1570           break;
1571         case '\'':
1572           if (!(syntax & RE_NO_GNU_OPS))
1573             {
1574               token->type = ANCHOR;
1575               token->opr.idx = BUF_LAST;
1576             }
1577           break;
1578         case '(':
1579           if (!(syntax & RE_NO_BK_PARENS))
1580             token->type = OP_OPEN_SUBEXP;
1581           break;
1582         case ')':
1583           if (!(syntax & RE_NO_BK_PARENS))
1584             token->type = OP_CLOSE_SUBEXP;
1585           break;
1586         case '+':
1587           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1588             token->type = OP_DUP_PLUS;
1589           break;
1590         case '?':
1591           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1592             token->type = OP_DUP_QUESTION;
1593           break;
1594         case '{':
1595           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1596             token->type = OP_OPEN_DUP_NUM;
1597           break;
1598         case '}':
1599           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1600             token->type = OP_CLOSE_DUP_NUM;
1601           break;
1602         default:
1603           break;
1604         }
1605       return 2;
1606     }
1607
1608   token->type = CHARACTER;
1609   switch (c)
1610     {
1611     case '\n':
1612       if (syntax & RE_NEWLINE_ALT)
1613         token->type = OP_ALT;
1614       break;
1615     case '|':
1616       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1617         token->type = OP_ALT;
1618       break;
1619     case '*':
1620       token->type = OP_DUP_ASTERISK;
1621       break;
1622     case '+':
1623       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1624         token->type = OP_DUP_PLUS;
1625       break;
1626     case '?':
1627       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1628         token->type = OP_DUP_QUESTION;
1629       break;
1630     case '{':
1631       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1632         token->type = OP_OPEN_DUP_NUM;
1633       break;
1634     case '}':
1635       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1636         token->type = OP_CLOSE_DUP_NUM;
1637       break;
1638     case '(':
1639       if (syntax & RE_NO_BK_PARENS)
1640         token->type = OP_OPEN_SUBEXP;
1641       break;
1642     case ')':
1643       if (syntax & RE_NO_BK_PARENS)
1644         token->type = OP_CLOSE_SUBEXP;
1645       break;
1646     case '[':
1647       token->type = OP_OPEN_BRACKET;
1648       break;
1649     case '.':
1650       token->type = OP_PERIOD;
1651       break;
1652     case '^':
1653       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1654           re_string_cur_idx (input) != 0)
1655         {
1656           char prev = re_string_peek_byte (input, -1);
1657           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1658             break;
1659         }
1660       token->type = ANCHOR;
1661       token->opr.idx = LINE_FIRST;
1662       break;
1663     case '$':
1664       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1665           re_string_cur_idx (input) + 1 != re_string_length (input))
1666         {
1667           re_token_t next;
1668           re_string_skip_bytes (input, 1);
1669           peek_token (&next, input, syntax);
1670           re_string_skip_bytes (input, -1);
1671           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1672             break;
1673         }
1674       token->type = ANCHOR;
1675       token->opr.idx = LINE_LAST;
1676       break;
1677     default:
1678       break;
1679     }
1680   return 1;
1681 }
1682
1683 /* Peek a token from INPUT, and return the length of the token.
1684    We must not use this function out of bracket expressions.  */
1685
1686 static int
1687 peek_token_bracket (token, input, syntax)
1688      re_token_t *token;
1689      re_string_t *input;
1690      reg_syntax_t syntax;
1691 {
1692   unsigned char c;
1693   if (re_string_eoi (input))
1694     {
1695       token->type = END_OF_RE;
1696       return 0;
1697     }
1698   c = re_string_peek_byte (input, 0);
1699   token->opr.c = c;
1700
1701 #ifdef RE_ENABLE_I18N
1702   if (MB_CUR_MAX > 1 &&
1703       !re_string_first_byte (input, re_string_cur_idx (input)))
1704     {
1705       token->type = CHARACTER;
1706       return 1;
1707     }
1708 #endif /* RE_ENABLE_I18N */
1709
1710   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1711     {
1712       /* In this case, '\' escape a character.  */
1713       unsigned char c2;
1714       re_string_skip_bytes (input, 1);
1715       c2 = re_string_peek_byte (input, 0);
1716       token->opr.c = c2;
1717       token->type = CHARACTER;
1718       return 1;
1719     }
1720   if (c == '[') /* '[' is a special char in a bracket exps.  */
1721     {
1722       unsigned char c2;
1723       int token_len;
1724       c2 = re_string_peek_byte (input, 1);
1725       token->opr.c = c2;
1726       token_len = 2;
1727       switch (c2)
1728         {
1729         case '.':
1730           token->type = OP_OPEN_COLL_ELEM;
1731           break;
1732         case '=':
1733           token->type = OP_OPEN_EQUIV_CLASS;
1734           break;
1735         case ':':
1736           if (syntax & RE_CHAR_CLASSES)
1737             {
1738               token->type = OP_OPEN_CHAR_CLASS;
1739               break;
1740             }
1741           /* else fall through.  */
1742         default:
1743           token->type = CHARACTER;
1744           token->opr.c = c;
1745           token_len = 1;
1746           break;
1747         }
1748       return token_len;
1749     }
1750   switch (c)
1751     {
1752     case '-':
1753       token->type = OP_CHARSET_RANGE;
1754       break;
1755     case ']':
1756       token->type = OP_CLOSE_BRACKET;
1757       break;
1758     case '^':
1759       token->type = OP_NON_MATCH_LIST;
1760       break;
1761     default:
1762       token->type = CHARACTER;
1763     }
1764   return 1;
1765 }
1766 \f
1767 /* Functions for parser.  */
1768
1769 /* Entry point of the parser.
1770    Parse the regular expression REGEXP and return the structure tree.
1771    If an error is occured, ERR is set by error code, and return NULL.
1772    This function build the following tree, from regular expression <reg_exp>:
1773            CAT
1774            / \
1775           /   \
1776    <reg_exp>  EOR
1777
1778    CAT means concatenation.
1779    EOR means end of regular expression.  */
1780
1781 static bin_tree_t *
1782 parse (regexp, preg, syntax, err)
1783      re_string_t *regexp;
1784      regex_t *preg;
1785      reg_syntax_t syntax;
1786      reg_errcode_t *err;
1787 {
1788   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1789   bin_tree_t *tree, *eor, *root;
1790   re_token_t current_token;
1791   int new_idx;
1792   current_token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
1793   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1794   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1795     return NULL;
1796   new_idx = re_dfa_add_node (dfa, current_token, 0);
1797   eor = create_tree (NULL, NULL, 0, new_idx);
1798   if (tree != NULL)
1799     root = create_tree (tree, eor, CONCAT, 0);
1800   else
1801     root = eor;
1802   if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1803     {
1804       *err = REG_ESPACE;
1805       return NULL;
1806     }
1807   return root;
1808 }
1809
1810 /* This function build the following tree, from regular expression
1811    <branch1>|<branch2>:
1812            ALT
1813            / \
1814           /   \
1815    <branch1> <branch2>
1816
1817    ALT means alternative, which represents the operator `|'.  */
1818
1819 static bin_tree_t *
1820 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1821      re_string_t *regexp;
1822      regex_t *preg;
1823      re_token_t *token;
1824      reg_syntax_t syntax;
1825      int nest;
1826      reg_errcode_t *err;
1827 {
1828   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1829   bin_tree_t *tree, *branch = NULL;
1830   int new_idx;
1831   tree = parse_branch (regexp, preg, token, syntax, nest, err);
1832   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1833     return NULL;
1834
1835   while (token->type == OP_ALT)
1836     {
1837       re_token_t alt_token = *token;
1838       new_idx = re_dfa_add_node (dfa, alt_token, 0);
1839       *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
1840       if (token->type != OP_ALT && token->type != END_OF_RE
1841           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1842         {
1843           branch = parse_branch (regexp, preg, token, syntax, nest, err);
1844           if (BE (*err != REG_NOERROR && branch == NULL, 0))
1845             {
1846               free_bin_tree (tree);
1847               return NULL;
1848             }
1849         }
1850       else
1851         branch = NULL;
1852       tree = create_tree (tree, branch, 0, new_idx);
1853       if (BE (new_idx == -1 || tree == NULL, 0))
1854         {
1855           *err = REG_ESPACE;
1856           return NULL;
1857         }
1858       dfa->has_plural_match = 1;
1859     }
1860   return tree;
1861 }
1862
1863 /* This function build the following tree, from regular expression
1864    <exp1><exp2>:
1865         CAT
1866         / \
1867        /   \
1868    <exp1> <exp2>
1869
1870    CAT means concatenation.  */
1871
1872 static bin_tree_t *
1873 parse_branch (regexp, preg, token, syntax, nest, err)
1874      re_string_t *regexp;
1875      regex_t *preg;
1876      re_token_t *token;
1877      reg_syntax_t syntax;
1878      int nest;
1879      reg_errcode_t *err;
1880 {
1881   bin_tree_t *tree, *exp;
1882   tree = parse_expression (regexp, preg, token, syntax, nest, err);
1883   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1884     return NULL;
1885
1886   while (token->type != OP_ALT && token->type != END_OF_RE
1887          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1888     {
1889       exp = parse_expression (regexp, preg, token, syntax, nest, err);
1890       if (BE (*err != REG_NOERROR && exp == NULL, 0))
1891         {
1892           free_bin_tree (tree);
1893           return NULL;
1894         }
1895       if (tree != NULL && exp != NULL)
1896         {
1897           tree = create_tree (tree, exp, CONCAT, 0);
1898           if (tree == NULL)
1899             {
1900               *err = REG_ESPACE;
1901               return NULL;
1902             }
1903         }
1904       else if (tree == NULL)
1905         tree = exp;
1906       /* Otherwise exp == NULL, we don't need to create new tree.  */
1907     }
1908   return tree;
1909 }
1910
1911 /* This function build the following tree, from regular expression a*:
1912          *
1913          |
1914          a
1915 */
1916
1917 static bin_tree_t *
1918 parse_expression (regexp, preg, token, syntax, nest, err)
1919      re_string_t *regexp;
1920      regex_t *preg;
1921      re_token_t *token;
1922      reg_syntax_t syntax;
1923      int nest;
1924      reg_errcode_t *err;
1925 {
1926   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1927   bin_tree_t *tree;
1928   int new_idx;
1929   switch (token->type)
1930     {
1931     case CHARACTER:
1932       new_idx = re_dfa_add_node (dfa, *token, 0);
1933       tree = create_tree (NULL, NULL, 0, new_idx);
1934       if (BE (new_idx == -1 || tree == NULL, 0))
1935         {
1936           *err = REG_ESPACE;
1937           return NULL;
1938         }
1939 #ifdef RE_ENABLE_I18N
1940       if (MB_CUR_MAX > 1)
1941         {
1942           while (!re_string_eoi (regexp)
1943                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1944             {
1945               bin_tree_t *mbc_remain;
1946               *token = fetch_token (regexp, syntax);
1947               new_idx = re_dfa_add_node (dfa, *token, 0);
1948               mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1949               tree = create_tree (tree, mbc_remain, CONCAT, 0);
1950               if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1951                 {
1952                   *err = REG_ESPACE;
1953                   return NULL;
1954                 }
1955             }
1956         }
1957 #endif
1958       break;
1959     case OP_OPEN_SUBEXP:
1960       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1961       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1962         return NULL;
1963       break;
1964     case OP_OPEN_BRACKET:
1965       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1966       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1967         return NULL;
1968       break;
1969     case OP_BACK_REF:
1970       if (BE (preg->re_nsub < token->opr.idx
1971               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1972         {
1973           *err = REG_ESUBREG;
1974           return NULL;
1975         }
1976       dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
1977       new_idx = re_dfa_add_node (dfa, *token, 0);
1978       tree = create_tree (NULL, NULL, 0, new_idx);
1979       if (BE (new_idx == -1 || tree == NULL, 0))
1980         {
1981           *err = REG_ESPACE;
1982           return NULL;
1983         }
1984       ++dfa->nbackref;
1985       dfa->has_mb_node = 1;
1986       break;
1987     case OP_DUP_ASTERISK:
1988     case OP_DUP_PLUS:
1989     case OP_DUP_QUESTION:
1990     case OP_OPEN_DUP_NUM:
1991       if (syntax & RE_CONTEXT_INVALID_OPS)
1992         {
1993           *err = REG_BADRPT;
1994           return NULL;
1995         }
1996       else if (syntax & RE_CONTEXT_INDEP_OPS)
1997         {
1998           *token = fetch_token (regexp, syntax);
1999           return parse_expression (regexp, preg, token, syntax, nest, err);
2000         }
2001       /* else fall through  */
2002     case OP_CLOSE_SUBEXP:
2003       if ((token->type == OP_CLOSE_SUBEXP) &&
2004           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2005         {
2006           *err = REG_ERPAREN;
2007           return NULL;
2008         }
2009       /* else fall through  */
2010     case OP_CLOSE_DUP_NUM:
2011       /* We treat it as a normal character.  */
2012
2013       /* Then we can these characters as normal characters.  */
2014       token->type = CHARACTER;
2015       new_idx = re_dfa_add_node (dfa, *token, 0);
2016       tree = create_tree (NULL, NULL, 0, new_idx);
2017       if (BE (new_idx == -1 || tree == NULL, 0))
2018         {
2019           *err = REG_ESPACE;
2020           return NULL;
2021         }
2022       break;
2023     case ANCHOR:
2024       if (dfa->word_char == NULL)
2025         {
2026           *err = init_word_char (dfa);
2027           if (BE (*err != REG_NOERROR, 0))
2028             return NULL;
2029         }
2030       if (token->opr.ctx_type == WORD_DELIM)
2031         {
2032           bin_tree_t *tree_first, *tree_last;
2033           int idx_first, idx_last;
2034           token->opr.ctx_type = WORD_FIRST;
2035           idx_first = re_dfa_add_node (dfa, *token, 0);
2036           tree_first = create_tree (NULL, NULL, 0, idx_first);
2037           token->opr.ctx_type = WORD_LAST;
2038           idx_last = re_dfa_add_node (dfa, *token, 0);
2039           tree_last = create_tree (NULL, NULL, 0, idx_last);
2040           token->type = OP_ALT;
2041           new_idx = re_dfa_add_node (dfa, *token, 0);
2042           tree = create_tree (tree_first, tree_last, 0, new_idx);
2043           if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2044                   || tree_first == NULL || tree_last == NULL
2045                   || tree == NULL, 0))
2046             {
2047               *err = REG_ESPACE;
2048               return NULL;
2049             }
2050         }
2051       else
2052         {
2053           new_idx = re_dfa_add_node (dfa, *token, 0);
2054           tree = create_tree (NULL, NULL, 0, new_idx);
2055           if (BE (new_idx == -1 || tree == NULL, 0))
2056             {
2057               *err = REG_ESPACE;
2058               return NULL;
2059             }
2060         }
2061       /* We must return here, since ANCHORs can't be followed
2062          by repetition operators.
2063          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2064              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2065       *token = fetch_token (regexp, syntax);
2066       return tree;
2067     case OP_PERIOD:
2068       new_idx = re_dfa_add_node (dfa, *token, 0);
2069       tree = create_tree (NULL, NULL, 0, new_idx);
2070       if (BE (new_idx == -1 || tree == NULL, 0))
2071         {
2072           *err = REG_ESPACE;
2073           return NULL;
2074         }
2075       if (MB_CUR_MAX > 1)
2076         dfa->has_mb_node = 1;
2077       break;
2078     case OP_WORD:
2079       tree = build_word_op (dfa, regexp->trans, 0, err);
2080       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2081         return NULL;
2082       break;
2083     case OP_NOTWORD:
2084       tree = build_word_op (dfa, regexp->trans, 1, err);
2085       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2086         return NULL;
2087       break;
2088     case OP_ALT:
2089     case END_OF_RE:
2090       return NULL;
2091     case BACK_SLASH:
2092       *err = REG_EESCAPE;
2093       return NULL;
2094     default:
2095       /* Must not happen?  */
2096 #ifdef DEBUG
2097       assert (0);
2098 #endif
2099       return NULL;
2100     }
2101   *token = fetch_token (regexp, syntax);
2102
2103   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2104          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2105     {
2106       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2107       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2108         return NULL;
2109       dfa->has_plural_match = 1;
2110     }
2111
2112   return tree;
2113 }
2114
2115 /* This function build the following tree, from regular expression
2116    (<reg_exp>):
2117          SUBEXP
2118             |
2119         <reg_exp>
2120 */
2121
2122 static bin_tree_t *
2123 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2124      re_string_t *regexp;
2125      regex_t *preg;
2126      re_token_t *token;
2127      reg_syntax_t syntax;
2128      int nest;
2129      reg_errcode_t *err;
2130 {
2131   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2132   bin_tree_t *tree, *left_par, *right_par;
2133   size_t cur_nsub;
2134   int new_idx;
2135   cur_nsub = preg->re_nsub++;
2136   if (dfa->subexps_alloc < preg->re_nsub)
2137     {
2138       re_subexp_t *new_array;
2139       dfa->subexps_alloc *= 2;
2140       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2141       if (BE (new_array == NULL, 0))
2142         {
2143           dfa->subexps_alloc /= 2;
2144           *err = REG_ESPACE;
2145           return NULL;
2146         }
2147       dfa->subexps = new_array;
2148     }
2149   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2150   dfa->subexps[cur_nsub].end = -1;
2151
2152   new_idx = re_dfa_add_node (dfa, *token, 0);
2153   left_par = create_tree (NULL, NULL, 0, new_idx);
2154   if (BE (new_idx == -1 || left_par == NULL, 0))
2155     {
2156       *err = REG_ESPACE;
2157       return NULL;
2158     }
2159   dfa->nodes[new_idx].opr.idx = cur_nsub;
2160   *token = fetch_token (regexp, syntax);
2161
2162   /* The subexpression may be a null string.  */
2163   if (token->type == OP_CLOSE_SUBEXP)
2164     tree = NULL;
2165   else
2166     {
2167       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2168       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2169         return NULL;
2170     }
2171   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2172     {
2173       free_bin_tree (tree);
2174       *err = REG_BADPAT;
2175       return NULL;
2176     }
2177   new_idx = re_dfa_add_node (dfa, *token, 0);
2178   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2179   right_par = create_tree (NULL, NULL, 0, new_idx);
2180   tree = ((tree == NULL) ? right_par
2181           : create_tree (tree, right_par, CONCAT, 0));
2182   tree = create_tree (left_par, tree, CONCAT, 0);
2183   if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2184     {
2185       *err = REG_ESPACE;
2186       return NULL;
2187     }
2188   dfa->nodes[new_idx].opr.idx = cur_nsub;
2189
2190   return tree;
2191 }
2192
2193 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2194
2195 static bin_tree_t *
2196 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2197      bin_tree_t *dup_elem;
2198      re_string_t *regexp;
2199      re_dfa_t *dfa;
2200      re_token_t *token;
2201      reg_syntax_t syntax;
2202      reg_errcode_t *err;
2203 {
2204   re_token_t dup_token;
2205   bin_tree_t *tree = dup_elem, *work_tree;
2206   int new_idx, start_idx = re_string_cur_idx (regexp);
2207   re_token_t start_token = *token;
2208   if (token->type == OP_OPEN_DUP_NUM)
2209     {
2210       int i;
2211       int end = 0;
2212       int start = fetch_number (regexp, token, syntax);
2213       bin_tree_t *elem;
2214       if (start == -1)
2215         {
2216           if (token->type == CHARACTER && token->opr.c == ',')
2217             start = 0; /* We treat "{,m}" as "{0,m}".  */
2218           else
2219             {
2220               *err = REG_BADBR; /* <re>{} is invalid.  */
2221               return NULL;
2222             }
2223         }
2224       if (BE (start != -2, 1))
2225         {
2226           /* We treat "{n}" as "{n,n}".  */
2227           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2228                  : ((token->type == CHARACTER && token->opr.c == ',')
2229                     ? fetch_number (regexp, token, syntax) : -2));
2230         }
2231       if (BE (start == -2 || end == -2, 0))
2232         {
2233           /* Invalid sequence.  */
2234           if (token->type == OP_CLOSE_DUP_NUM)
2235             goto parse_dup_op_invalid_interval;
2236           else
2237             goto parse_dup_op_ebrace;
2238         }
2239       if (BE (start == 0 && end == 0, 0))
2240         {
2241           /* We treat "<re>{0}" and "<re>{0,0}" as null string.  */
2242           *token = fetch_token (regexp, syntax);
2243           free_bin_tree (dup_elem);
2244           return NULL;
2245         }
2246
2247       /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2248       elem = tree;
2249       for (i = 0; i < start; ++i)
2250         if (i != 0)
2251           {
2252             work_tree = duplicate_tree (elem, dfa);
2253             tree = create_tree (tree, work_tree, CONCAT, 0);
2254             if (BE (work_tree == NULL || tree == NULL, 0))
2255               goto parse_dup_op_espace;
2256           }
2257
2258       if (end == -1)
2259         {
2260           /* We treat "<re>{0,}" as "<re>*".  */
2261           dup_token.type = OP_DUP_ASTERISK;
2262           if (start > 0)
2263             {
2264               elem = duplicate_tree (elem, dfa);
2265               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2266               work_tree = create_tree (elem, NULL, 0, new_idx);
2267               tree = create_tree (tree, work_tree, CONCAT, 0);
2268               if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2269                       || tree == NULL, 0))
2270                 goto parse_dup_op_espace;
2271             }
2272           else
2273             {
2274               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2275               tree = create_tree (elem, NULL, 0, new_idx);
2276               if (BE (new_idx == -1 || tree == NULL, 0))
2277                 goto parse_dup_op_espace;
2278             }
2279         }
2280       else if (end - start > 0)
2281         {
2282           /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?".  */
2283           dup_token.type = OP_DUP_QUESTION;
2284           if (start > 0)
2285             {
2286               elem = duplicate_tree (elem, dfa);
2287               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2288               elem = create_tree (elem, NULL, 0, new_idx);
2289               tree = create_tree (tree, elem, CONCAT, 0);
2290               if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2291                 goto parse_dup_op_espace;
2292             }
2293           else
2294             {
2295               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2296               tree = elem = create_tree (elem, NULL, 0, new_idx);
2297               if (BE (new_idx == -1 || tree == NULL, 0))
2298                 goto parse_dup_op_espace;
2299             }
2300           for (i = 1; i < end - start; ++i)
2301             {
2302               work_tree = duplicate_tree (elem, dfa);
2303               tree = create_tree (tree, work_tree, CONCAT, 0);
2304               if (BE (work_tree == NULL || tree == NULL, 0))
2305                 {
2306                   *err = REG_ESPACE;
2307                   return NULL;
2308                 }
2309             }
2310         }
2311     }
2312   else
2313     {
2314       new_idx = re_dfa_add_node (dfa, *token, 0);
2315       tree = create_tree (tree, NULL, 0, new_idx);
2316       if (BE (new_idx == -1 || tree == NULL, 0))
2317         {
2318           *err = REG_ESPACE;
2319           return NULL;
2320         }
2321     }
2322   *token = fetch_token (regexp, syntax);
2323   return tree;
2324
2325  parse_dup_op_espace:
2326   free_bin_tree (tree);
2327   *err = REG_ESPACE;
2328   return NULL;
2329
2330  parse_dup_op_ebrace:
2331   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2332     {
2333       *err = REG_EBRACE;
2334       return NULL;
2335     }
2336   goto parse_dup_op_rollback;
2337  parse_dup_op_invalid_interval:
2338   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2339     {
2340       *err = REG_BADBR;
2341       return NULL;
2342     }
2343  parse_dup_op_rollback:
2344   re_string_set_index (regexp, start_idx);
2345   *token = start_token;
2346   token->type = CHARACTER;
2347   return dup_elem;
2348 }
2349
2350 /* Size of the names for collating symbol/equivalence_class/character_class.
2351    I'm not sure, but maybe enough.  */
2352 #define BRACKET_NAME_BUF_SIZE 32
2353
2354 #ifndef _LIBC
2355   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2356      Build the range expression which starts from START_ELEM, and ends
2357      at END_ELEM.  The result are written to MBCSET and SBCSET.
2358      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2359      mbcset->range_ends, is a pointer argument sinse we may
2360      update it.  */
2361
2362 static reg_errcode_t
2363 # ifdef RE_ENABLE_I18N
2364 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2365      re_charset_t *mbcset;
2366      int *range_alloc;
2367 # else /* not RE_ENABLE_I18N */
2368 build_range_exp (sbcset, start_elem, end_elem)
2369 # endif /* not RE_ENABLE_I18N */
2370      re_bitset_ptr_t sbcset;
2371      bracket_elem_t *start_elem, *end_elem;
2372 {
2373   unsigned int start_ch, end_ch;
2374   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2375   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2376           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2377           0))
2378     return REG_ERANGE;
2379
2380   /* We can handle no multi character collating elements without libc
2381      support.  */
2382   if (BE ((start_elem->type == COLL_SYM
2383            && strlen ((char *) start_elem->opr.name) > 1)
2384           || (end_elem->type == COLL_SYM
2385               && strlen ((char *) end_elem->opr.name) > 1), 0))
2386     return REG_ECOLLATE;
2387
2388 # ifdef RE_ENABLE_I18N
2389   {
2390     wchar_t wc, start_wc, end_wc;
2391     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2392
2393     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2394                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2395                    : 0));
2396     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2397               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2398                  : 0));
2399     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2400                 ? __btowc (start_ch) : start_elem->opr.wch);
2401     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2402               ? __btowc (end_ch) : end_elem->opr.wch);
2403     cmp_buf[0] = start_wc;
2404     cmp_buf[4] = end_wc;
2405     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2406       return REG_ERANGE;
2407
2408     /* Check the space of the arrays.  */
2409     if (*range_alloc == mbcset->nranges)
2410       {
2411         /* There are not enough space, need realloc.  */
2412         wchar_t *new_array_start, *new_array_end;
2413         int new_nranges;
2414
2415         /* +1 in case of mbcset->nranges is 0.  */
2416         new_nranges = 2 * mbcset->nranges + 1;
2417         /* Use realloc since mbcset->range_starts and mbcset->range_ends
2418            are NULL if *range_alloc == 0.  */
2419         new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2420                                       new_nranges);
2421         new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2422                                     new_nranges);
2423
2424         if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2425           return REG_ESPACE;
2426
2427         mbcset->range_starts = new_array_start;
2428         mbcset->range_ends = new_array_end;
2429         *range_alloc = new_nranges;
2430       }
2431
2432     mbcset->range_starts[mbcset->nranges] = start_wc;
2433     mbcset->range_ends[mbcset->nranges++] = end_wc;
2434
2435     /* Build the table for single byte characters.  */
2436     for (wc = 0; wc <= SBC_MAX; ++wc)
2437       {
2438         cmp_buf[2] = wc;
2439         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2440             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2441           bitset_set (sbcset, wc);
2442       }
2443   }
2444 # else /* not RE_ENABLE_I18N */
2445   {
2446     unsigned int ch;
2447     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2448                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2449                    : 0));
2450     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2451               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2452                  : 0));
2453     if (start_ch > end_ch)
2454       return REG_ERANGE;
2455     /* Build the table for single byte characters.  */
2456     for (ch = 0; ch <= SBC_MAX; ++ch)
2457       if (start_ch <= ch  && ch <= end_ch)
2458         bitset_set (sbcset, ch);
2459   }
2460 # endif /* not RE_ENABLE_I18N */
2461   return REG_NOERROR;
2462 }
2463 #endif /* not _LIBC */
2464
2465 #ifndef _LIBC
2466 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2467    Build the collating element which is represented by NAME.
2468    The result are written to MBCSET and SBCSET.
2469    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2470    pointer argument since we may update it.  */
2471
2472 static reg_errcode_t
2473 # ifdef RE_ENABLE_I18N
2474 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2475      re_charset_t *mbcset;
2476      int *coll_sym_alloc;
2477 # else /* not RE_ENABLE_I18N */
2478 build_collating_symbol (sbcset, name)
2479 # endif /* not RE_ENABLE_I18N */
2480      re_bitset_ptr_t sbcset;
2481      const unsigned char *name;
2482 {
2483   size_t name_len = strlen ((const char *) name);
2484   if (BE (name_len != 1, 0))
2485     return REG_ECOLLATE;
2486   else
2487     {
2488       bitset_set (sbcset, name[0]);
2489       return REG_NOERROR;
2490     }
2491 }
2492 #endif /* not _LIBC */
2493
2494 /* This function parse bracket expression like "[abc]", "[a-c]",
2495    "[[.a-a.]]" etc.  */
2496
2497 static bin_tree_t *
2498 parse_bracket_exp (regexp, dfa, token, syntax, err)
2499      re_string_t *regexp;
2500      re_dfa_t *dfa;
2501      re_token_t *token;
2502      reg_syntax_t syntax;
2503      reg_errcode_t *err;
2504 {
2505 #ifdef _LIBC
2506   const unsigned char *collseqmb;
2507   const char *collseqwc;
2508   uint32_t nrules;
2509   int32_t table_size;
2510   const int32_t *symb_table;
2511   const unsigned char *extra;
2512
2513   /* Local function for parse_bracket_exp used in _LIBC environement.
2514      Seek the collating symbol entry correspondings to NAME.
2515      Return the index of the symbol in the SYMB_TABLE.  */
2516
2517   static inline int32_t
2518   __attribute ((always_inline))
2519   seek_collating_symbol_entry (name, name_len)
2520          const unsigned char *name;
2521          size_t name_len;
2522     {
2523       int32_t hash = elem_hash ((const char *) name, name_len);
2524       int32_t elem = hash % table_size;
2525       int32_t second = hash % (table_size - 2);
2526       while (symb_table[2 * elem] != 0)
2527         {
2528           /* First compare the hashing value.  */
2529           if (symb_table[2 * elem] == hash
2530               /* Compare the length of the name.  */
2531               && name_len == extra[symb_table[2 * elem + 1]]
2532               /* Compare the name.  */
2533               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2534                          name_len) == 0)
2535             {
2536               /* Yep, this is the entry.  */
2537               break;
2538             }
2539
2540           /* Next entry.  */
2541           elem += second;
2542         }
2543       return elem;
2544     }
2545
2546   /* Local function for parse_bracket_exp used in _LIBC environement.
2547      Look up the collation sequence value of BR_ELEM.
2548      Return the value if succeeded, UINT_MAX otherwise.  */
2549
2550   static inline unsigned int
2551   __attribute ((always_inline))
2552   lookup_collation_sequence_value (br_elem)
2553          bracket_elem_t *br_elem;
2554     {
2555       if (br_elem->type == SB_CHAR)
2556         {
2557           /*
2558           if (MB_CUR_MAX == 1)
2559           */
2560           if (nrules == 0)
2561             return collseqmb[br_elem->opr.ch];
2562           else
2563             {
2564               wint_t wc = __btowc (br_elem->opr.ch);
2565               return __collseq_table_lookup (collseqwc, wc);
2566             }
2567         }
2568       else if (br_elem->type == MB_CHAR)
2569         {
2570           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2571         }
2572       else if (br_elem->type == COLL_SYM)
2573         {
2574           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2575           if (nrules != 0)
2576             {
2577               int32_t elem, idx;
2578               elem = seek_collating_symbol_entry (br_elem->opr.name,
2579                                                   sym_name_len);
2580               if (symb_table[2 * elem] != 0)
2581                 {
2582                   /* We found the entry.  */
2583                   idx = symb_table[2 * elem + 1];
2584                   /* Skip the name of collating element name.  */
2585                   idx += 1 + extra[idx];
2586                   /* Skip the byte sequence of the collating element.  */
2587                   idx += 1 + extra[idx];
2588                   /* Adjust for the alignment.  */
2589                   idx = (idx + 3) & ~3;
2590                   /* Skip the multibyte collation sequence value.  */
2591                   idx += sizeof (unsigned int);
2592                   /* Skip the wide char sequence of the collating element.  */
2593                   idx += sizeof (unsigned int) *
2594                     (1 + *(unsigned int *) (extra + idx));
2595                   /* Return the collation sequence value.  */
2596                   return *(unsigned int *) (extra + idx);
2597                 }
2598               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2599                 {
2600                   /* No valid character.  Match it as a single byte
2601                      character.  */
2602                   return collseqmb[br_elem->opr.name[0]];
2603                 }
2604             }
2605           else if (sym_name_len == 1)
2606             return collseqmb[br_elem->opr.name[0]];
2607         }
2608       return UINT_MAX;
2609     }
2610
2611   /* Local function for parse_bracket_exp used in _LIBC environement.
2612      Build the range expression which starts from START_ELEM, and ends
2613      at END_ELEM.  The result are written to MBCSET and SBCSET.
2614      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2615      mbcset->range_ends, is a pointer argument sinse we may
2616      update it.  */
2617
2618   static inline reg_errcode_t
2619   __attribute ((always_inline))
2620 # ifdef RE_ENABLE_I18N
2621   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2622          re_charset_t *mbcset;
2623          int *range_alloc;
2624 # else /* not RE_ENABLE_I18N */
2625   build_range_exp (sbcset, start_elem, end_elem)
2626 # endif /* not RE_ENABLE_I18N */
2627          re_bitset_ptr_t sbcset;
2628          bracket_elem_t *start_elem, *end_elem;
2629     {
2630       unsigned int ch;
2631       uint32_t start_collseq;
2632       uint32_t end_collseq;
2633
2634 # ifdef RE_ENABLE_I18N
2635       /* Check the space of the arrays.  */
2636       if (*range_alloc == mbcset->nranges)
2637         {
2638           /* There are not enough space, need realloc.  */
2639           uint32_t *new_array_start;
2640           uint32_t *new_array_end;
2641           int new_nranges;
2642
2643           /* +1 in case of mbcset->nranges is 0.  */
2644           new_nranges = 2 * mbcset->nranges + 1;
2645           /* Use realloc since mbcset->range_starts and mbcset->range_ends
2646              are NULL if *range_alloc == 0.  */
2647           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2648                                         new_nranges);
2649           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2650                                       new_nranges);
2651
2652           if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2653             return REG_ESPACE;
2654
2655           mbcset->range_starts = new_array_start;
2656           mbcset->range_ends = new_array_end;
2657           *range_alloc = new_nranges;
2658         }
2659 # endif /* RE_ENABLE_I18N */
2660
2661       /* Equivalence Classes and Character Classes can't be a range
2662          start/end.  */
2663       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2664               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2665               0))
2666         return REG_ERANGE;
2667
2668       start_collseq = lookup_collation_sequence_value (start_elem);
2669       end_collseq = lookup_collation_sequence_value (end_elem);
2670       /* Check start/end collation sequence values.  */
2671       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2672         return REG_ECOLLATE;
2673       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2674         return REG_ERANGE;
2675
2676 # ifdef RE_ENABLE_I18N
2677       /* Got valid collation sequence values, add them as a new entry.  */
2678       mbcset->range_starts[mbcset->nranges] = start_collseq;
2679       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2680 # endif /* RE_ENABLE_I18N */
2681
2682       /* Build the table for single byte characters.  */
2683       for (ch = 0; ch <= SBC_MAX; ch++)
2684         {
2685           uint32_t ch_collseq;
2686           /*
2687           if (MB_CUR_MAX == 1)
2688           */
2689           if (nrules == 0)
2690             ch_collseq = collseqmb[ch];
2691           else
2692             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2693           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2694             bitset_set (sbcset, ch);
2695         }
2696       return REG_NOERROR;
2697     }
2698
2699   /* Local function for parse_bracket_exp used in _LIBC environement.
2700      Build the collating element which is represented by NAME.
2701      The result are written to MBCSET and SBCSET.
2702      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2703      pointer argument sinse we may update it.  */
2704
2705   static inline reg_errcode_t
2706   __attribute ((always_inline))
2707 # ifdef RE_ENABLE_I18N
2708   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2709          re_charset_t *mbcset;
2710          int *coll_sym_alloc;
2711 # else /* not RE_ENABLE_I18N */
2712   build_collating_symbol (sbcset, name)
2713 # endif /* not RE_ENABLE_I18N */
2714          re_bitset_ptr_t sbcset;
2715          const unsigned char *name;
2716     {
2717       int32_t elem, idx;
2718       size_t name_len = strlen ((const char *) name);
2719       if (nrules != 0)
2720         {
2721           elem = seek_collating_symbol_entry (name, name_len);
2722           if (symb_table[2 * elem] != 0)
2723             {
2724               /* We found the entry.  */
2725               idx = symb_table[2 * elem + 1];
2726               /* Skip the name of collating element name.  */
2727               idx += 1 + extra[idx];
2728             }
2729           else if (symb_table[2 * elem] == 0 && name_len == 1)
2730             {
2731               /* No valid character, treat it as a normal
2732                  character.  */
2733               bitset_set (sbcset, name[0]);
2734               return REG_NOERROR;
2735             }
2736           else
2737             return REG_ECOLLATE;
2738
2739 # ifdef RE_ENABLE_I18N
2740           /* Got valid collation sequence, add it as a new entry.  */
2741           /* Check the space of the arrays.  */
2742           if (*coll_sym_alloc == mbcset->ncoll_syms)
2743             {
2744               /* Not enough, realloc it.  */
2745               /* +1 in case of mbcset->ncoll_syms is 0.  */
2746               *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2747               /* Use realloc since mbcset->coll_syms is NULL
2748                  if *alloc == 0.  */
2749               mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2750                                               *coll_sym_alloc);
2751               if (BE (mbcset->coll_syms == NULL, 0))
2752                 return REG_ESPACE;
2753             }
2754           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2755 # endif /* RE_ENABLE_I18N */
2756           return REG_NOERROR;
2757         }
2758       else
2759         {
2760           if (BE (name_len != 1, 0))
2761             return REG_ECOLLATE;
2762           else
2763             {
2764               bitset_set (sbcset, name[0]);
2765               return REG_NOERROR;
2766             }
2767         }
2768     }
2769 #endif
2770
2771   re_token_t br_token;
2772   re_bitset_ptr_t sbcset;
2773 #ifdef RE_ENABLE_I18N
2774   re_charset_t *mbcset;
2775   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2776   int equiv_class_alloc = 0, char_class_alloc = 0;
2777 #else /* not RE_ENABLE_I18N */
2778   int non_match = 0;
2779 #endif /* not RE_ENABLE_I18N */
2780   bin_tree_t *work_tree;
2781   int token_len, new_idx;
2782 #ifdef _LIBC
2783   collseqmb = (const unsigned char *)
2784     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2785   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2786   if (nrules)
2787     {
2788       /*
2789       if (MB_CUR_MAX > 1)
2790       */
2791         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2792       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2793       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2794                                                   _NL_COLLATE_SYMB_TABLEMB);
2795       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2796                                                    _NL_COLLATE_SYMB_EXTRAMB);
2797     }
2798 #endif
2799   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2800 #ifdef RE_ENABLE_I18N
2801   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2802 #endif /* RE_ENABLE_I18N */
2803 #ifdef RE_ENABLE_I18N
2804   if (BE (sbcset == NULL || mbcset == NULL, 0))
2805 #else
2806   if (BE (sbcset == NULL, 0))
2807 #endif /* RE_ENABLE_I18N */
2808     {
2809       *err = REG_ESPACE;
2810       return NULL;
2811     }
2812
2813   token_len = peek_token_bracket (token, regexp, syntax);
2814   if (BE (token->type == END_OF_RE, 0))
2815     {
2816       *err = REG_BADPAT;
2817       goto parse_bracket_exp_free_return;
2818     }
2819   if (token->type == OP_NON_MATCH_LIST)
2820     {
2821 #ifdef RE_ENABLE_I18N
2822       int i;
2823       mbcset->non_match = 1;
2824 #else /* not RE_ENABLE_I18N */
2825       non_match = 1;
2826 #endif /* not RE_ENABLE_I18N */
2827       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2828         bitset_set (sbcset, '\0');
2829       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2830       token_len = peek_token_bracket (token, regexp, syntax);
2831       if (BE (token->type == END_OF_RE, 0))
2832         {
2833           *err = REG_BADPAT;
2834           goto parse_bracket_exp_free_return;
2835         }
2836 #ifdef RE_ENABLE_I18N
2837       if (MB_CUR_MAX > 1)
2838         for (i = 0; i < SBC_MAX; ++i)
2839           if (__btowc (i) == WEOF)
2840             bitset_set (sbcset, i);
2841 #endif /* RE_ENABLE_I18N */
2842     }
2843
2844   /* We treat the first ']' as a normal character.  */
2845   if (token->type == OP_CLOSE_BRACKET)
2846     token->type = CHARACTER;
2847
2848   while (1)
2849     {
2850       bracket_elem_t start_elem, end_elem;
2851       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2852       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2853       reg_errcode_t ret;
2854       int token_len2 = 0, is_range_exp = 0;
2855       re_token_t token2;
2856
2857       start_elem.opr.name = start_name_buf;
2858       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2859                                    syntax);
2860       if (BE (ret != REG_NOERROR, 0))
2861         {
2862           *err = ret;
2863           goto parse_bracket_exp_free_return;
2864         }
2865
2866       token_len = peek_token_bracket (token, regexp, syntax);
2867       if (BE (token->type == END_OF_RE, 0))
2868         {
2869           *err = REG_BADPAT;
2870           goto parse_bracket_exp_free_return;
2871         }
2872       if (token->type == OP_CHARSET_RANGE)
2873         {
2874           re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
2875           token_len2 = peek_token_bracket (&token2, regexp, syntax);
2876           if (BE (token->type == END_OF_RE, 0))
2877             {
2878               *err = REG_BADPAT;
2879               goto parse_bracket_exp_free_return;
2880             }
2881           if (token2.type == OP_CLOSE_BRACKET)
2882             {
2883               /* We treat the last '-' as a normal character.  */
2884               re_string_skip_bytes (regexp, -token_len);
2885               token->type = CHARACTER;
2886             }
2887           else
2888             is_range_exp = 1;
2889         }
2890
2891       if (is_range_exp == 1)
2892         {
2893           end_elem.opr.name = end_name_buf;
2894           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2895                                        dfa, syntax);
2896           if (BE (ret != REG_NOERROR, 0))
2897             {
2898               *err = ret;
2899               goto parse_bracket_exp_free_return;
2900             }
2901
2902           token_len = peek_token_bracket (token, regexp, syntax);
2903           if (BE (token->type == END_OF_RE, 0))
2904             {
2905               *err = REG_BADPAT;
2906               goto parse_bracket_exp_free_return;
2907             }
2908           *err = build_range_exp (sbcset,
2909 #ifdef RE_ENABLE_I18N
2910                                   mbcset, &range_alloc,
2911 #endif /* RE_ENABLE_I18N */
2912                                   &start_elem, &end_elem);
2913           if (BE (*err != REG_NOERROR, 0))
2914             goto parse_bracket_exp_free_return;
2915         }
2916       else
2917         {
2918           switch (start_elem.type)
2919             {
2920             case SB_CHAR:
2921               bitset_set (sbcset, start_elem.opr.ch);
2922               break;
2923 #ifdef RE_ENABLE_I18N
2924             case MB_CHAR:
2925               /* Check whether the array has enough space.  */
2926               if (mbchar_alloc == mbcset->nmbchars)
2927                 {
2928                   /* Not enough, realloc it.  */
2929                   /* +1 in case of mbcset->nmbchars is 0.  */
2930                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
2931                   /* Use realloc since array is NULL if *alloc == 0.  */
2932                   mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2933                                                 mbchar_alloc);
2934                   if (BE (mbcset->mbchars == NULL, 0))
2935                     goto parse_bracket_exp_espace;
2936                 }
2937               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2938               break;
2939 #endif /* RE_ENABLE_I18N */
2940             case EQUIV_CLASS:
2941               *err = build_equiv_class (sbcset,
2942 #ifdef RE_ENABLE_I18N
2943                                         mbcset, &equiv_class_alloc,
2944 #endif /* RE_ENABLE_I18N */
2945                                         start_elem.opr.name);
2946               if (BE (*err != REG_NOERROR, 0))
2947                 goto parse_bracket_exp_free_return;
2948               break;
2949             case COLL_SYM:
2950               *err = build_collating_symbol (sbcset,
2951 #ifdef RE_ENABLE_I18N
2952                                              mbcset, &coll_sym_alloc,
2953 #endif /* RE_ENABLE_I18N */
2954                                              start_elem.opr.name);
2955               if (BE (*err != REG_NOERROR, 0))
2956                 goto parse_bracket_exp_free_return;
2957               break;
2958             case CHAR_CLASS:
2959               *err = build_charclass (regexp->trans, sbcset,
2960 #ifdef RE_ENABLE_I18N
2961                                       mbcset, &char_class_alloc,
2962 #endif /* RE_ENABLE_I18N */
2963                                       start_elem.opr.name, syntax);
2964               if (BE (*err != REG_NOERROR, 0))
2965                goto parse_bracket_exp_free_return;
2966               break;
2967             default:
2968               assert (0);
2969               break;
2970             }
2971         }
2972       if (token->type == OP_CLOSE_BRACKET)
2973         break;
2974     }
2975
2976   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2977
2978   /* If it is non-matching list.  */
2979 #ifdef RE_ENABLE_I18N
2980   if (mbcset->non_match)
2981 #else /* not RE_ENABLE_I18N */
2982   if (non_match)
2983 #endif /* not RE_ENABLE_I18N */
2984     bitset_not (sbcset);
2985
2986   /* Build a tree for simple bracket.  */
2987   br_token.type = SIMPLE_BRACKET;
2988   br_token.opr.sbcset = sbcset;
2989   new_idx = re_dfa_add_node (dfa, br_token, 0);
2990   work_tree = create_tree (NULL, NULL, 0, new_idx);
2991   if (BE (new_idx == -1 || work_tree == NULL, 0))
2992     goto parse_bracket_exp_espace;
2993
2994 #ifdef RE_ENABLE_I18N
2995   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2996       || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2997                                                 || mbcset->non_match)))
2998     {
2999       re_token_t alt_token;
3000       bin_tree_t *mbc_tree;
3001       /* Build a tree for complex bracket.  */
3002       br_token.type = COMPLEX_BRACKET;
3003       br_token.opr.mbcset = mbcset;
3004       dfa->has_mb_node = 1;
3005       new_idx = re_dfa_add_node (dfa, br_token, 0);
3006       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3007       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3008         goto parse_bracket_exp_espace;
3009       /* Then join them by ALT node.  */
3010       dfa->has_plural_match = 1;
3011       alt_token.type = OP_ALT;
3012       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3013       work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3014       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3015         return work_tree;
3016     }
3017   else
3018     {
3019       free_charset (mbcset);
3020       return work_tree;
3021     }
3022 #else /* not RE_ENABLE_I18N */
3023   return work_tree;
3024 #endif /* not RE_ENABLE_I18N */
3025
3026  parse_bracket_exp_espace:
3027   *err = REG_ESPACE;
3028  parse_bracket_exp_free_return:
3029   re_free (sbcset);
3030 #ifdef RE_ENABLE_I18N
3031   free_charset (mbcset);
3032 #endif /* RE_ENABLE_I18N */
3033   return NULL;
3034 }
3035
3036 /* Parse an element in the bracket expression.  */
3037
3038 static reg_errcode_t
3039 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3040      bracket_elem_t *elem;
3041      re_string_t *regexp;
3042      re_token_t *token;
3043      int token_len;
3044      re_dfa_t *dfa;
3045      reg_syntax_t syntax;
3046 {
3047 #ifdef RE_ENABLE_I18N
3048   int cur_char_size;
3049   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3050   if (cur_char_size > 1)
3051     {
3052       elem->type = MB_CHAR;
3053       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3054       re_string_skip_bytes (regexp, cur_char_size);
3055       return REG_NOERROR;
3056     }
3057 #endif /* RE_ENABLE_I18N */
3058   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3059   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3060       || token->type == OP_OPEN_EQUIV_CLASS)
3061     return parse_bracket_symbol (elem, regexp, token);
3062   elem->type = SB_CHAR;
3063   elem->opr.ch = token->opr.c;
3064   return REG_NOERROR;
3065 }
3066
3067 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3068    such as [:<character_class>:], [.<collating_element>.], and
3069    [=<equivalent_class>=].  */
3070
3071 static reg_errcode_t
3072 parse_bracket_symbol (elem, regexp, token)
3073      bracket_elem_t *elem;
3074      re_string_t *regexp;
3075      re_token_t *token;
3076 {
3077   unsigned char ch, delim = token->opr.c;
3078   int i = 0;
3079   for (;; ++i)
3080     {
3081       if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3082         return REG_EBRACK;
3083       if (token->type == OP_OPEN_CHAR_CLASS)
3084         ch = re_string_fetch_byte_case (regexp);
3085       else
3086         ch = re_string_fetch_byte (regexp);
3087       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3088         break;
3089       elem->opr.name[i] = ch;
3090     }
3091   re_string_skip_bytes (regexp, 1);
3092   elem->opr.name[i] = '\0';
3093   switch (token->type)
3094     {
3095     case OP_OPEN_COLL_ELEM:
3096       elem->type = COLL_SYM;
3097       break;
3098     case OP_OPEN_EQUIV_CLASS:
3099       elem->type = EQUIV_CLASS;
3100       break;
3101     case OP_OPEN_CHAR_CLASS:
3102       elem->type = CHAR_CLASS;
3103       break;
3104     default:
3105       break;
3106     }
3107   return REG_NOERROR;
3108 }
3109
3110   /* Helper function for parse_bracket_exp.
3111      Build the equivalence class which is represented by NAME.
3112      The result are written to MBCSET and SBCSET.
3113      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3114      is a pointer argument sinse we may update it.  */
3115
3116 static reg_errcode_t
3117 #ifdef RE_ENABLE_I18N
3118 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3119      re_charset_t *mbcset;
3120      int *equiv_class_alloc;
3121 #else /* not RE_ENABLE_I18N */
3122 build_equiv_class (sbcset, name)
3123 #endif /* not RE_ENABLE_I18N */
3124      re_bitset_ptr_t sbcset;
3125      const unsigned char *name;
3126 {
3127 #if defined _LIBC && defined RE_ENABLE_I18N
3128   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3129   if (nrules != 0)
3130     {
3131       const int32_t *table, *indirect;
3132       const unsigned char *weights, *extra, *cp;
3133       unsigned char char_buf[2];
3134       int32_t idx1, idx2;
3135       unsigned int ch;
3136       size_t len;
3137       /* This #include defines a local function!  */
3138 # include <locale/weight.h>
3139       /* Calculate the index for equivalence class.  */
3140       cp = name;
3141       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3142       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3143                                                _NL_COLLATE_WEIGHTMB);
3144       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3145                                                    _NL_COLLATE_EXTRAMB);
3146       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3147                                                 _NL_COLLATE_INDIRECTMB);
3148       idx1 = findidx (&cp);
3149       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3150         /* This isn't a valid character.  */
3151         return REG_ECOLLATE;
3152
3153       /* Build single byte matcing table for this equivalence class.  */
3154       char_buf[1] = (unsigned char) '\0';
3155       len = weights[idx1];
3156       for (ch = 0; ch < SBC_MAX; ++ch)
3157         {
3158           char_buf[0] = ch;
3159           cp = char_buf;
3160           idx2 = findidx (&cp);
3161 /*
3162           idx2 = table[ch];
3163 */
3164           if (idx2 == 0)
3165             /* This isn't a valid character.  */
3166             continue;
3167           if (len == weights[idx2])
3168             {
3169               int cnt = 0;
3170               while (cnt <= len &&
3171                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3172                 ++cnt;
3173
3174               if (cnt > len)
3175                 bitset_set (sbcset, ch);
3176             }
3177         }
3178       /* Check whether the array has enough space.  */
3179       if (*equiv_class_alloc == mbcset->nequiv_classes)
3180         {
3181           /* Not enough, realloc it.  */
3182           /* +1 in case of mbcset->nequiv_classes is 0.  */
3183           *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3184           /* Use realloc since the array is NULL if *alloc == 0.  */
3185           mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3186                                               *equiv_class_alloc);
3187           if (BE (mbcset->equiv_classes == NULL, 0))
3188             return REG_ESPACE;
3189         }
3190       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3191     }
3192   else
3193 #endif /* _LIBC && RE_ENABLE_I18N */
3194     {
3195       if (BE (strlen ((const char *) name) != 1, 0))
3196         return REG_ECOLLATE;
3197       bitset_set (sbcset, *name);
3198     }
3199   return REG_NOERROR;
3200 }
3201
3202   /* Helper function for parse_bracket_exp.
3203      Build the character class which is represented by NAME.
3204      The result are written to MBCSET and SBCSET.
3205      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3206      is a pointer argument sinse we may update it.  */
3207
3208 static reg_errcode_t
3209 #ifdef RE_ENABLE_I18N
3210 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3211      re_charset_t *mbcset;
3212      int *char_class_alloc;
3213 #else /* not RE_ENABLE_I18N */
3214 build_charclass (trans, sbcset, class_name, syntax)
3215 #endif /* not RE_ENABLE_I18N */
3216      RE_TRANSLATE_TYPE trans;
3217      re_bitset_ptr_t sbcset;
3218      const unsigned char *class_name;
3219      reg_syntax_t syntax;
3220 {
3221   int i;
3222   const char *name = (const char *) class_name;
3223
3224   /* In case of REG_ICASE "upper" and "lower" match the both of
3225      upper and lower cases.  */
3226   if ((syntax & RE_ICASE)
3227       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3228     name = "alpha";
3229
3230 #ifdef RE_ENABLE_I18N
3231   /* Check the space of the arrays.  */
3232   if (*char_class_alloc == mbcset->nchar_classes)
3233     {
3234       /* Not enough, realloc it.  */
3235       /* +1 in case of mbcset->nchar_classes is 0.  */
3236       *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3237       /* Use realloc since array is NULL if *alloc == 0.  */
3238       mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3239                                          *char_class_alloc);
3240       if (BE (mbcset->char_classes == NULL, 0))
3241         return REG_ESPACE;
3242     }
3243   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3244 #endif /* RE_ENABLE_I18N */
3245
3246 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3247     for (i = 0; i < SBC_MAX; ++i)               \
3248       {                                         \
3249         if (ctype_func (i))                     \
3250           {                                     \
3251             int ch = trans ? trans[i] : i;      \
3252             bitset_set (sbcset, ch);            \
3253           }                                     \
3254       }
3255
3256   if (strcmp (name, "alnum") == 0)
3257     BUILD_CHARCLASS_LOOP (isalnum)
3258   else if (strcmp (name, "cntrl") == 0)
3259     BUILD_CHARCLASS_LOOP (iscntrl)
3260   else if (strcmp (name, "lower") == 0)
3261     BUILD_CHARCLASS_LOOP (islower)
3262   else if (strcmp (name, "space") == 0)
3263     BUILD_CHARCLASS_LOOP (isspace)
3264   else if (strcmp (name, "alpha") == 0)
3265     BUILD_CHARCLASS_LOOP (isalpha)
3266   else if (strcmp (name, "digit") == 0)
3267     BUILD_CHARCLASS_LOOP (isdigit)
3268   else if (strcmp (name, "print") == 0)
3269     BUILD_CHARCLASS_LOOP (isprint)
3270   else if (strcmp (name, "upper") == 0)
3271     BUILD_CHARCLASS_LOOP (isupper)
3272   else if (strcmp (name, "blank") == 0)
3273     BUILD_CHARCLASS_LOOP (isblank)
3274   else if (strcmp (name, "graph") == 0)
3275     BUILD_CHARCLASS_LOOP (isgraph)
3276   else if (strcmp (name, "punct") == 0)
3277     BUILD_CHARCLASS_LOOP (ispunct)
3278   else if (strcmp (name, "xdigit") == 0)
3279     BUILD_CHARCLASS_LOOP (isxdigit)
3280   else
3281     return REG_ECTYPE;
3282
3283   return REG_NOERROR;
3284 }
3285
3286 static bin_tree_t *
3287 build_word_op (dfa, trans, not, err)
3288      re_dfa_t *dfa;
3289      RE_TRANSLATE_TYPE trans;
3290      int not;
3291      reg_errcode_t *err;
3292 {
3293   re_bitset_ptr_t sbcset;
3294 #ifdef RE_ENABLE_I18N
3295   re_charset_t *mbcset;
3296   int alloc = 0;
3297 #else /* not RE_ENABLE_I18N */
3298   int non_match = 0;
3299 #endif /* not RE_ENABLE_I18N */
3300   reg_errcode_t ret;
3301   re_token_t br_token;
3302   bin_tree_t *tree;
3303   int new_idx;
3304
3305   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3306 #ifdef RE_ENABLE_I18N
3307   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3308 #endif /* RE_ENABLE_I18N */
3309
3310 #ifdef RE_ENABLE_I18N
3311   if (BE (sbcset == NULL || mbcset == NULL, 0))
3312 #else /* not RE_ENABLE_I18N */
3313   if (BE (sbcset == NULL, 0))
3314 #endif /* not RE_ENABLE_I18N */
3315     {
3316       *err = REG_ESPACE;
3317       return NULL;
3318     }
3319
3320   if (not)
3321     {
3322 #ifdef RE_ENABLE_I18N
3323       int i;
3324       /*
3325       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3326         bitset_set(cset->sbcset, '\0');
3327       */
3328       mbcset->non_match = 1;
3329       if (MB_CUR_MAX > 1)
3330         for (i = 0; i < SBC_MAX; ++i)
3331           if (__btowc (i) == WEOF)
3332             bitset_set (sbcset, i);
3333 #else /* not RE_ENABLE_I18N */
3334       non_match = 1;
3335 #endif /* not RE_ENABLE_I18N */
3336     }
3337
3338   /* We don't care the syntax in this case.  */
3339   ret = build_charclass (trans, sbcset,
3340 #ifdef RE_ENABLE_I18N
3341                          mbcset, &alloc,
3342 #endif /* RE_ENABLE_I18N */
3343                          (const unsigned char *) "alnum", 0);
3344
3345   if (BE (ret != REG_NOERROR, 0))
3346     {
3347       re_free (sbcset);
3348 #ifdef RE_ENABLE_I18N
3349       free_charset (mbcset);
3350 #endif /* RE_ENABLE_I18N */
3351       *err = ret;
3352       return NULL;
3353     }
3354   /* \w match '_' also.  */
3355   bitset_set (sbcset, '_');
3356
3357   /* If it is non-matching list.  */
3358 #ifdef RE_ENABLE_I18N
3359   if (mbcset->non_match)
3360 #else /* not RE_ENABLE_I18N */
3361   if (non_match)
3362 #endif /* not RE_ENABLE_I18N */
3363     bitset_not (sbcset);
3364
3365   /* Build a tree for simple bracket.  */
3366   br_token.type = SIMPLE_BRACKET;
3367   br_token.opr.sbcset = sbcset;
3368   new_idx = re_dfa_add_node (dfa, br_token, 0);
3369   tree = create_tree (NULL, NULL, 0, new_idx);
3370   if (BE (new_idx == -1 || tree == NULL, 0))
3371     goto build_word_op_espace;
3372
3373 #ifdef RE_ENABLE_I18N
3374   if (MB_CUR_MAX > 1)
3375     {
3376       re_token_t alt_token;
3377       bin_tree_t *mbc_tree;
3378       /* Build a tree for complex bracket.  */
3379       br_token.type = COMPLEX_BRACKET;
3380       br_token.opr.mbcset = mbcset;
3381       dfa->has_mb_node = 1;
3382       new_idx = re_dfa_add_node (dfa, br_token, 0);
3383       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3384       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3385         goto build_word_op_espace;
3386       /* Then join them by ALT node.  */
3387       alt_token.type = OP_ALT;
3388       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3389       tree = create_tree (tree, mbc_tree, 0, new_idx);
3390       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3391         return tree;
3392     }
3393   else
3394     {
3395       free_charset (mbcset);
3396       return tree;
3397     }
3398 #else /* not RE_ENABLE_I18N */
3399   return tree;
3400 #endif /* not RE_ENABLE_I18N */
3401
3402  build_word_op_espace:
3403   re_free (sbcset);
3404 #ifdef RE_ENABLE_I18N
3405   free_charset (mbcset);
3406 #endif /* RE_ENABLE_I18N */
3407   *err = REG_ESPACE;
3408   return NULL;
3409 }
3410
3411 /* This is intended for the expressions like "a{1,3}".
3412    Fetch a number from `input', and return the number.
3413    Return -1, if the number field is empty like "{,1}".
3414    Return -2, If an error is occured.  */
3415
3416 static int
3417 fetch_number (input, token, syntax)
3418      re_string_t *input;
3419      re_token_t *token;
3420      reg_syntax_t syntax;
3421 {
3422   int num = -1;
3423   unsigned char c;
3424   while (1)
3425     {
3426       *token = fetch_token (input, syntax);
3427       c = token->opr.c;
3428       if (BE (token->type == END_OF_RE, 0))
3429         return -2;
3430       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3431         break;
3432       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3433              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3434       num = (num > RE_DUP_MAX) ? -2 : num;
3435     }
3436   return num;
3437 }
3438 \f
3439 #ifdef RE_ENABLE_I18N
3440 static void
3441 free_charset (re_charset_t *cset)
3442 {
3443   re_free (cset->mbchars);
3444 # ifdef _LIBC
3445   re_free (cset->coll_syms);
3446   re_free (cset->equiv_classes);
3447   re_free (cset->range_starts);
3448   re_free (cset->range_ends);
3449 # endif
3450   re_free (cset->char_classes);
3451   re_free (cset);
3452 }
3453 #endif /* RE_ENABLE_I18N */
3454 \f
3455 /* Functions for binary tree operation.  */
3456
3457 /* Create a node of tree.
3458    Note: This function automatically free left and right if malloc fails.  */
3459
3460 static bin_tree_t *
3461 create_tree (left, right, type, index)
3462      bin_tree_t *left;
3463      bin_tree_t *right;
3464      re_token_type_t type;
3465      int index;
3466 {
3467   bin_tree_t *tree;
3468   tree = re_malloc (bin_tree_t, 1);
3469   if (BE (tree == NULL, 0))
3470     {
3471       free_bin_tree (left);
3472       free_bin_tree (right);
3473       return NULL;
3474     }
3475   tree->parent = NULL;
3476   tree->left = left;
3477   tree->right = right;
3478   tree->type = type;
3479   tree->node_idx = index;
3480   tree->first = -1;
3481   tree->next = -1;
3482   re_node_set_init_empty (&tree->eclosure);
3483
3484   if (left != NULL)
3485     left->parent = tree;
3486   if (right != NULL)
3487     right->parent = tree;
3488   return tree;
3489 }
3490
3491 /* Free the sub tree pointed by TREE.  */
3492
3493 static void
3494 free_bin_tree (tree)
3495      bin_tree_t *tree;
3496 {
3497   if (tree == NULL)
3498     return;
3499   /*re_node_set_free (&tree->eclosure);*/
3500   free_bin_tree (tree->left);
3501   free_bin_tree (tree->right);
3502   re_free (tree);
3503 }
3504
3505 /* Duplicate the node SRC, and return new node.  */
3506
3507 static bin_tree_t *
3508 duplicate_tree (src, dfa)
3509      const bin_tree_t *src;
3510      re_dfa_t *dfa;
3511 {
3512   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3513   int new_node_idx;
3514   /* Since node indies must be according to Post-order of the tree,
3515      we must duplicate the left at first.  */
3516   if (src->left != NULL)
3517     {
3518       left = duplicate_tree (src->left, dfa);
3519       if (left == NULL)
3520         return NULL;
3521     }
3522
3523   /* Secondaly, duplicate the right.  */
3524   if (src->right != NULL)
3525     {
3526       right = duplicate_tree (src->right, dfa);
3527       if (right == NULL)
3528         {
3529           free_bin_tree (left);
3530           return NULL;
3531         }
3532     }
3533
3534   /* At last, duplicate itself.  */
3535   if (src->type == NON_TYPE)
3536     {
3537       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3538       dfa->nodes[new_node_idx].duplicated = 1;
3539       if (BE (new_node_idx == -1, 0))
3540         {
3541           free_bin_tree (left);
3542           free_bin_tree (right);
3543           return NULL;
3544         }
3545     }
3546   else
3547     new_node_idx = src->type;
3548
3549   new_tree = create_tree (left, right, src->type, new_node_idx);
3550   if (BE (new_tree == NULL, 0))
3551     {
3552       free_bin_tree (left);
3553       free_bin_tree (right);
3554     }
3555   return new_tree;
3556 }