lib/Support/regcomp.c

   1 /*-
   2  * This code is derived from OpenBSD's libc/regex, original license follows:
   3  *
   4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   5  * Copyright (c) 1992, 1993, 1994
   6  *      The Regents of the University of California.  All rights reserved.
   7  *
   8  * This code is derived from software contributed to Berkeley by
   9  * Henry Spencer.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  * 3. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  *
  35  *      @(#)regcomp.c   8.5 (Berkeley) 3/20/94
  36  */
  37
  38 #include <sys/types.h>
  39 #include <stdio.h>
  40 #include <string.h>
  41 #include <ctype.h>
  42 #include <limits.h>
  43 #include <stdlib.h>
  44 #include "regex_impl.h"
  45
  46 #include "regutils.h"
  47 #include "regex2.h"
  48
  49 #include "regcclass.h"
  50 #include "regcname.h"
  51
  52 /*
  53  * parse structure, passed up and down to avoid global variables and
  54  * other clumsinesses
  55  */
  56 struct parse {
  57         char *next;             /* next character in RE */
  58         char *end;              /* end of string (-> NUL normally) */
  59         int error;              /* has an error been seen? */
  60         sop *strip;             /* malloced strip */
  61         sopno ssize;            /* malloced strip size (allocated) */
  62         sopno slen;             /* malloced strip length (used) */
  63         int ncsalloc;           /* number of csets allocated */
  64         struct re_guts *g;
  65 #       define  NPAREN  10      /* we need to remember () 1-9 for back refs */
  66         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
  67         sopno pend[NPAREN];     /* -> ) ([0] unused) */
  68 };
  69
  70 static void p_ere(struct parse *, int);
  71 static void p_ere_exp(struct parse *);
  72 static void p_str(struct parse *);
  73 static void p_bre(struct parse *, int, int);
  74 static int p_simp_re(struct parse *, int);
  75 static int p_count(struct parse *);
  76 static void p_bracket(struct parse *);
  77 static void p_b_term(struct parse *, cset *);
  78 static void p_b_cclass(struct parse *, cset *);
  79 static void p_b_eclass(struct parse *, cset *);
  80 static char p_b_symbol(struct parse *);
  81 static char p_b_coll_elem(struct parse *, int);
  82 static char othercase(int);
  83 static void bothcases(struct parse *, int);
  84 static void ordinary(struct parse *, int);
  85 static void nonnewline(struct parse *);
  86 static void repeat(struct parse *, sopno, int, int);
  87 static int seterr(struct parse *, int);
  88 static cset *allocset(struct parse *);
  89 static void freeset(struct parse *, cset *);
  90 static int freezeset(struct parse *, cset *);
  91 static int firstch(struct parse *, cset *);
  92 static int nch(struct parse *, cset *);
  93 static void mcadd(struct parse *, cset *, const char *);
  94 static void mcinvert(struct parse *, cset *);
  95 static void mccase(struct parse *, cset *);
  96 static int isinsets(struct re_guts *, int);
  97 static int samesets(struct re_guts *, int, int);
  98 static void categorize(struct parse *, struct re_guts *);
  99 static sopno dupl(struct parse *, sopno, sopno);
 100 static void doemit(struct parse *, sop, size_t);
 101 static void doinsert(struct parse *, sop, size_t, sopno);
 102 static void dofwd(struct parse *, sopno, sop);
 103 static void enlarge(struct parse *, sopno);
 104 static void stripsnug(struct parse *, struct re_guts *);
 105 static void findmust(struct parse *, struct re_guts *);
 106 static sopno pluscount(struct parse *, struct re_guts *);
 107
 108 static char nuls[10];           /* place to point scanner in event of error */
 109
 110 /*
 111  * macros for use with parse structure
 112  * BEWARE:  these know that the parse structure is named `p' !!!
 113  */
 114 #define PEEK()  (*p->next)
 115 #define PEEK2() (*(p->next+1))
 116 #define MORE()  (p->next < p->end)
 117 #define MORE2() (p->next+1 < p->end)
 118 #define SEE(c)  (MORE() && PEEK() == (c))
 119 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
 120 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
 121 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 122 #define NEXT()  (p->next++)
 123 #define NEXT2() (p->next += 2)
 124 #define NEXTn(n)        (p->next += (n))
 125 #define GETNEXT()       (*p->next++)
 126 #define SETERROR(e)     seterr(p, (e))
 127 #define REQUIRE(co, e)  (void)((co) || SETERROR(e))
 128 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))
 129 #define MUSTEAT(c, e)   (REQUIRE(MORE() && GETNEXT() == (c), e))
 130 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))
 131 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
 132 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
 133 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))
 134 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)
 135 #define HERE()          (p->slen)
 136 #define THERE()         (p->slen - 1)
 137 #define THERETHERE()    (p->slen - 2)
 138 #define DROP(n) (p->slen -= (n))
 139
 140 #ifdef  _POSIX2_RE_DUP_MAX
 141 #define DUPMAX  _POSIX2_RE_DUP_MAX
 142 #else
 143 #define DUPMAX  255
 144 #endif
 145
 146 #ifndef NDEBUG
 147 static int never = 0;           /* for use in asserts; shuts lint up */
 148 #else
 149 #define never   0               /* some <assert.h>s have bugs too */
 150 #endif
 151
 152 /*
 153  - llvm_regcomp - interface for parser and compilation
 154  */
 155 int                             /* 0 success, otherwise REG_something */
 156 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
 157 {
 158         struct parse pa;
 159         struct re_guts *g;
 160         struct parse *p = &pa;
 161         int i;
 162         size_t len;
 163 #ifdef REDEBUG
 164 #       define  GOODFLAGS(f)    (f)
 165 #else
 166 #       define  GOODFLAGS(f)    ((f)&~REG_DUMP)
 167 #endif
 168
 169         cflags = GOODFLAGS(cflags);
 170         if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
 171                 return(REG_INVARG);
 172
 173         if (cflags&REG_PEND) {
 174                 if (preg->re_endp < pattern)
 175                         return(REG_INVARG);
 176                 len = preg->re_endp - pattern;
 177         } else
 178                 len = strlen((const char *)pattern);
 179
 180         /* do the mallocs early so failure handling is easy */
 181         g = (struct re_guts *)malloc(sizeof(struct re_guts) +
 182                                                         (NC-1)*sizeof(cat_t));
 183         if (g == NULL)
 184                 return(REG_ESPACE);
 185         p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
 186         p->strip = (sop *)calloc(p->ssize, sizeof(sop));
 187         p->slen = 0;
 188         if (p->strip == NULL) {
 189                 free((char *)g);
 190                 return(REG_ESPACE);
 191         }
 192
 193         /* set things up */
 194         p->g = g;
 195         p->next = (char *)pattern;      /* convenience; we do not modify it */
 196         p->end = p->next + len;
 197         p->error = 0;
 198         p->ncsalloc = 0;
 199         for (i = 0; i < NPAREN; i++) {
 200                 p->pbegin[i] = 0;
 201                 p->pend[i] = 0;
 202         }
 203         g->csetsize = NC;
 204         g->sets = NULL;
 205         g->setbits = NULL;
 206         g->ncsets = 0;
 207         g->cflags = cflags;
 208         g->iflags = 0;
 209         g->nbol = 0;
 210         g->neol = 0;
 211         g->must = NULL;
 212         g->mlen = 0;
 213         g->nsub = 0;
 214         g->ncategories = 1;     /* category 0 is "everything else" */
 215         g->categories = &g->catspace[-(CHAR_MIN)];
 216         (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
 217         g->backrefs = 0;
 218
 219         /* do it */
 220         EMIT(OEND, 0);
 221         g->firststate = THERE();
 222         if (cflags&REG_EXTENDED)
 223                 p_ere(p, OUT);
 224         else if (cflags&REG_NOSPEC)
 225                 p_str(p);
 226         else
 227                 p_bre(p, OUT, OUT);
 228         EMIT(OEND, 0);
 229         g->laststate = THERE();
 230
 231         /* tidy up loose ends and fill things in */
 232         categorize(p, g);
 233         stripsnug(p, g);
 234         findmust(p, g);
 235         g->nplus = pluscount(p, g);
 236         g->magic = MAGIC2;
 237         preg->re_nsub = g->nsub;
 238         preg->re_g = g;
 239         preg->re_magic = MAGIC1;
 240 #ifndef REDEBUG
 241         /* not debugging, so can't rely on the assert() in llvm_regexec() */
 242         if (g->iflags&REGEX_BAD)
 243                 SETERROR(REG_ASSERT);
 244 #endif
 245
 246         /* win or lose, we're done */
 247         if (p->error != 0)      /* lose */
 248                 llvm_regfree(preg);
 249         return(p->error);
 250 }
 251
 252 /*
 253  - p_ere - ERE parser top level, concatenation and alternation
 254  */
 255 static void
 256 p_ere(struct parse *p, int stop)        /* character this ERE should end at */
 257 {
 258         char c;
 259         sopno prevback;
 260         sopno prevfwd;
 261         sopno conc;
 262         int first = 1;          /* is this the first alternative? */
 263
 264         for (;;) {
 265                 /* do a bunch of concatenated expressions */
 266                 conc = HERE();
 267                 while (MORE() && (c = PEEK()) != '|' && c != stop)
 268                         p_ere_exp(p);
 269                 REQUIRE(HERE() != conc, REG_EMPTY);     /* require nonempty */
 270
 271                 if (!EAT('|'))
 272                         break;          /* NOTE BREAK OUT */
 273
 274                 if (first) {
 275                         INSERT(OCH_, conc);     /* offset is wrong */
 276                         prevfwd = conc;
 277                         prevback = conc;
 278                         first = 0;
 279                 }
 280                 ASTERN(OOR1, prevback);
 281                 prevback = THERE();
 282                 AHEAD(prevfwd);                 /* fix previous offset */
 283                 prevfwd = HERE();
 284                 EMIT(OOR2, 0);                  /* offset is very wrong */
 285         }
 286
 287         if (!first) {           /* tail-end fixups */
 288                 AHEAD(prevfwd);
 289                 ASTERN(O_CH, prevback);
 290         }
 291
 292         assert(!MORE() || SEE(stop));
 293 }
 294
 295 /*
 296  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
 297  */
 298 static void
 299 p_ere_exp(struct parse *p)
 300 {
 301         char c;
 302         sopno pos;
 303         int count;
 304         int count2;
 305         sopno subno;
 306         int wascaret = 0;
 307
 308         assert(MORE());         /* caller should have ensured this */
 309         c = GETNEXT();
 310
 311         pos = HERE();
 312         switch (c) {
 313         case '(':
 314                 REQUIRE(MORE(), REG_EPAREN);
 315                 p->g->nsub++;
 316                 subno = p->g->nsub;
 317                 if (subno < NPAREN)
 318                         p->pbegin[subno] = HERE();
 319                 EMIT(OLPAREN, subno);
 320                 if (!SEE(')'))
 321                         p_ere(p, ')');
 322                 if (subno < NPAREN) {
 323                         p->pend[subno] = HERE();
 324                         assert(p->pend[subno] != 0);
 325                 }
 326                 EMIT(ORPAREN, subno);
 327                 MUSTEAT(')', REG_EPAREN);
 328                 break;
 329 #ifndef POSIX_MISTAKE
 330         case ')':               /* happens only if no current unmatched ( */
 331                 /*
 332                  * You may ask, why the ifndef?  Because I didn't notice
 333                  * this until slightly too late for 1003.2, and none of the
 334                  * other 1003.2 regular-expression reviewers noticed it at
 335                  * all.  So an unmatched ) is legal POSIX, at least until
 336                  * we can get it fixed.
 337                  */
 338                 SETERROR(REG_EPAREN);
 339                 break;
 340 #endif
 341         case '^':
 342                 EMIT(OBOL, 0);
 343                 p->g->iflags |= USEBOL;
 344                 p->g->nbol++;
 345                 wascaret = 1;
 346                 break;
 347         case '$':
 348                 EMIT(OEOL, 0);
 349                 p->g->iflags |= USEEOL;
 350                 p->g->neol++;
 351                 break;
 352         case '|':
 353                 SETERROR(REG_EMPTY);
 354                 break;
 355         case '*':
 356         case '+':
 357         case '?':
 358                 SETERROR(REG_BADRPT);
 359                 break;
 360         case '.':
 361                 if (p->g->cflags&REG_NEWLINE)
 362                         nonnewline(p);
 363                 else
 364                         EMIT(OANY, 0);
 365                 break;
 366         case '[':
 367                 p_bracket(p);
 368                 break;
 369         case '\\':
 370                 REQUIRE(MORE(), REG_EESCAPE);
 371                 c = GETNEXT();
 372                 ordinary(p, c);
 373                 break;
 374         case '{':               /* okay as ordinary except if digit follows */
 375                 REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
 376                 /* FALLTHROUGH */
 377         default:
 378                 ordinary(p, c);
 379                 break;
 380         }
 381
 382         if (!MORE())
 383                 return;
 384         c = PEEK();
 385         /* we call { a repetition if followed by a digit */
 386         if (!( c == '*' || c == '+' || c == '?' ||
 387                                 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
 388                 return;         /* no repetition, we're done */
 389         NEXT();
 390
 391         REQUIRE(!wascaret, REG_BADRPT);
 392         switch (c) {
 393         case '*':       /* implemented as +? */
 394                 /* this case does not require the (y|) trick, noKLUDGE */
 395                 INSERT(OPLUS_, pos);
 396                 ASTERN(O_PLUS, pos);
 397                 INSERT(OQUEST_, pos);
 398                 ASTERN(O_QUEST, pos);
 399                 break;
 400         case '+':
 401                 INSERT(OPLUS_, pos);
 402                 ASTERN(O_PLUS, pos);
 403                 break;
 404         case '?':
 405                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 406                 INSERT(OCH_, pos);              /* offset slightly wrong */
 407                 ASTERN(OOR1, pos);              /* this one's right */
 408                 AHEAD(pos);                     /* fix the OCH_ */
 409                 EMIT(OOR2, 0);                  /* offset very wrong... */
 410                 AHEAD(THERE());                 /* ...so fix it */
 411                 ASTERN(O_CH, THERETHERE());
 412                 break;
 413         case '{':
 414                 count = p_count(p);
 415                 if (EAT(',')) {
 416                         if (isdigit((uch)PEEK())) {
 417                                 count2 = p_count(p);
 418                                 REQUIRE(count <= count2, REG_BADBR);
 419                         } else          /* single number with comma */
 420                                 count2 = INFINITY;
 421                 } else          /* just a single number */
 422                         count2 = count;
 423                 repeat(p, pos, count, count2);
 424                 if (!EAT('}')) {        /* error heuristics */
 425                         while (MORE() && PEEK() != '}')
 426                                 NEXT();
 427                         REQUIRE(MORE(), REG_EBRACE);
 428                         SETERROR(REG_BADBR);
 429                 }
 430                 break;
 431         }
 432
 433         if (!MORE())
 434                 return;
 435         c = PEEK();
 436         if (!( c == '*' || c == '+' || c == '?' ||
 437                                 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
 438                 return;
 439         SETERROR(REG_BADRPT);
 440 }
 441
 442 /*
 443  - p_str - string (no metacharacters) "parser"
 444  */
 445 static void
 446 p_str(struct parse *p)
 447 {
 448         REQUIRE(MORE(), REG_EMPTY);
 449         while (MORE())
 450                 ordinary(p, GETNEXT());
 451 }
 452
 453 /*
 454  - p_bre - BRE parser top level, anchoring and concatenation
 455  * Giving end1 as OUT essentially eliminates the end1/end2 check.
 456  *
 457  * This implementation is a bit of a kludge, in that a trailing $ is first
 458  * taken as an ordinary character and then revised to be an anchor.  The
 459  * only undesirable side effect is that '$' gets included as a character
 460  * category in such cases.  This is fairly harmless; not worth fixing.
 461  * The amount of lookahead needed to avoid this kludge is excessive.
 462  */
 463 static void
 464 p_bre(struct parse *p,
 465     int end1,           /* first terminating character */
 466     int end2)           /* second terminating character */
 467 {
 468         sopno start = HERE();
 469         int first = 1;                  /* first subexpression? */
 470         int wasdollar = 0;
 471
 472         if (EAT('^')) {
 473                 EMIT(OBOL, 0);
 474                 p->g->iflags |= USEBOL;
 475                 p->g->nbol++;
 476         }
 477         while (MORE() && !SEETWO(end1, end2)) {
 478                 wasdollar = p_simp_re(p, first);
 479                 first = 0;
 480         }
 481         if (wasdollar) {        /* oops, that was a trailing anchor */
 482                 DROP(1);
 483                 EMIT(OEOL, 0);
 484                 p->g->iflags |= USEEOL;
 485                 p->g->neol++;
 486         }
 487
 488         REQUIRE(HERE() != start, REG_EMPTY);    /* require nonempty */
 489 }
 490
 491 /*
 492  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 493  */
 494 static int                      /* was the simple RE an unbackslashed $? */
 495 p_simp_re(struct parse *p,
 496     int starordinary)           /* is a leading * an ordinary character? */
 497 {
 498         int c;
 499         int count;
 500         int count2;
 501         sopno pos;
 502         int i;
 503         sopno subno;
 504 #       define  BACKSL  (1<<CHAR_BIT)
 505
 506         pos = HERE();           /* repetion op, if any, covers from here */
 507
 508         assert(MORE());         /* caller should have ensured this */
 509         c = GETNEXT();
 510         if (c == '\\') {
 511                 REQUIRE(MORE(), REG_EESCAPE);
 512                 c = BACKSL | GETNEXT();
 513         }
 514         switch (c) {
 515         case '.':
 516                 if (p->g->cflags&REG_NEWLINE)
 517                         nonnewline(p);
 518                 else
 519                         EMIT(OANY, 0);
 520                 break;
 521         case '[':
 522                 p_bracket(p);
 523                 break;
 524         case BACKSL|'{':
 525                 SETERROR(REG_BADRPT);
 526                 break;
 527         case BACKSL|'(':
 528                 p->g->nsub++;
 529                 subno = p->g->nsub;
 530                 if (subno < NPAREN)
 531                         p->pbegin[subno] = HERE();
 532                 EMIT(OLPAREN, subno);
 533                 /* the MORE here is an error heuristic */
 534                 if (MORE() && !SEETWO('\\', ')'))
 535                         p_bre(p, '\\', ')');
 536                 if (subno < NPAREN) {
 537                         p->pend[subno] = HERE();
 538                         assert(p->pend[subno] != 0);
 539                 }
 540                 EMIT(ORPAREN, subno);
 541                 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 542                 break;
 543         case BACKSL|')':        /* should not get here -- must be user */
 544         case BACKSL|'}':
 545                 SETERROR(REG_EPAREN);
 546                 break;
 547         case BACKSL|'1':
 548         case BACKSL|'2':
 549         case BACKSL|'3':
 550         case BACKSL|'4':
 551         case BACKSL|'5':
 552         case BACKSL|'6':
 553         case BACKSL|'7':
 554         case BACKSL|'8':
 555         case BACKSL|'9':
 556                 i = (c&~BACKSL) - '0';
 557                 assert(i < NPAREN);
 558                 if (p->pend[i] != 0) {
 559                         assert(i <= p->g->nsub);
 560                         EMIT(OBACK_, i);
 561                         assert(p->pbegin[i] != 0);
 562                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 563                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 564                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 565                         EMIT(O_BACK, i);
 566                 } else
 567                         SETERROR(REG_ESUBREG);
 568                 p->g->backrefs = 1;
 569                 break;
 570         case '*':
 571                 REQUIRE(starordinary, REG_BADRPT);
 572                 /* FALLTHROUGH */
 573         default:
 574                 ordinary(p, (char)c);
 575                 break;
 576         }
 577
 578         if (EAT('*')) {         /* implemented as +? */
 579                 /* this case does not require the (y|) trick, noKLUDGE */
 580                 INSERT(OPLUS_, pos);
 581                 ASTERN(O_PLUS, pos);
 582                 INSERT(OQUEST_, pos);
 583                 ASTERN(O_QUEST, pos);
 584         } else if (EATTWO('\\', '{')) {
 585                 count = p_count(p);
 586                 if (EAT(',')) {
 587                         if (MORE() && isdigit((uch)PEEK())) {
 588                                 count2 = p_count(p);
 589                                 REQUIRE(count <= count2, REG_BADBR);
 590                         } else          /* single number with comma */
 591                                 count2 = INFINITY;
 592                 } else          /* just a single number */
 593                         count2 = count;
 594                 repeat(p, pos, count, count2);
 595                 if (!EATTWO('\\', '}')) {       /* error heuristics */
 596                         while (MORE() && !SEETWO('\\', '}'))
 597                                 NEXT();
 598                         REQUIRE(MORE(), REG_EBRACE);
 599                         SETERROR(REG_BADBR);
 600                 }
 601         } else if (c == '$')    /* $ (but not \$) ends it */
 602                 return(1);
 603
 604         return(0);
 605 }
 606
 607 /*
 608  - p_count - parse a repetition count
 609  */
 610 static int                      /* the value */
 611 p_count(struct parse *p)
 612 {
 613         int count = 0;
 614         int ndigits = 0;
 615
 616         while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
 617                 count = count*10 + (GETNEXT() - '0');
 618                 ndigits++;
 619         }
 620
 621         REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
 622         return(count);
 623 }
 624
 625 /*
 626  - p_bracket - parse a bracketed character list
 627  *
 628  * Note a significant property of this code:  if the allocset() did SETERROR,
 629  * no set operations are done.
 630  */
 631 static void
 632 p_bracket(struct parse *p)
 633 {
 634         cset *cs;
 635         int invert = 0;
 636
 637         /* Dept of Truly Sickening Special-Case Kludges */
 638         if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
 639                 EMIT(OBOW, 0);
 640                 NEXTn(6);
 641                 return;
 642         }
 643         if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
 644                 EMIT(OEOW, 0);
 645                 NEXTn(6);
 646                 return;
 647         }
 648
 649         if ((cs = allocset(p)) == NULL) {
 650                 /* allocset did set error status in p */
 651                 return;
 652         }
 653
 654         if (EAT('^'))
 655                 invert++;       /* make note to invert set at end */
 656         if (EAT(']'))
 657                 CHadd(cs, ']');
 658         else if (EAT('-'))
 659                 CHadd(cs, '-');
 660         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
 661                 p_b_term(p, cs);
 662         if (EAT('-'))
 663                 CHadd(cs, '-');
 664         MUSTEAT(']', REG_EBRACK);
 665
 666         if (p->error != 0) {    /* don't mess things up further */
 667                 freeset(p, cs);
 668                 return;
 669         }
 670
 671         if (p->g->cflags&REG_ICASE) {
 672                 int i;
 673                 int ci;
 674
 675                 for (i = p->g->csetsize - 1; i >= 0; i--)
 676                         if (CHIN(cs, i) && isalpha(i)) {
 677                                 ci = othercase(i);
 678                                 if (ci != i)
 679                                         CHadd(cs, ci);
 680                         }
 681                 if (cs->multis != NULL)
 682                         mccase(p, cs);
 683         }
 684         if (invert) {
 685                 int i;
 686
 687                 for (i = p->g->csetsize - 1; i >= 0; i--)
 688                         if (CHIN(cs, i))
 689                                 CHsub(cs, i);
 690                         else
 691                                 CHadd(cs, i);
 692                 if (p->g->cflags&REG_NEWLINE)
 693                         CHsub(cs, '\n');
 694                 if (cs->multis != NULL)
 695                         mcinvert(p, cs);
 696         }
 697
 698         assert(cs->multis == NULL);             /* xxx */
 699
 700         if (nch(p, cs) == 1) {          /* optimize singleton sets */
 701                 ordinary(p, firstch(p, cs));
 702                 freeset(p, cs);
 703         } else
 704                 EMIT(OANYOF, freezeset(p, cs));
 705 }
 706
 707 /*
 708  - p_b_term - parse one term of a bracketed character list
 709  */
 710 static void
 711 p_b_term(struct parse *p, cset *cs)
 712 {
 713         char c;
 714         char start, finish;
 715         int i;
 716
 717         /* classify what we've got */
 718         switch ((MORE()) ? PEEK() : '\0') {
 719         case '[':
 720                 c = (MORE2()) ? PEEK2() : '\0';
 721                 break;
 722         case '-':
 723                 SETERROR(REG_ERANGE);
 724                 return;                 /* NOTE RETURN */
 725                 break;
 726         default:
 727                 c = '\0';
 728                 break;
 729         }
 730
 731         switch (c) {
 732         case ':':               /* character class */
 733                 NEXT2();
 734                 REQUIRE(MORE(), REG_EBRACK);
 735                 c = PEEK();
 736                 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
 737                 p_b_cclass(p, cs);
 738                 REQUIRE(MORE(), REG_EBRACK);
 739                 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
 740                 break;
 741         case '=':               /* equivalence class */
 742                 NEXT2();
 743                 REQUIRE(MORE(), REG_EBRACK);
 744                 c = PEEK();
 745                 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 746                 p_b_eclass(p, cs);
 747                 REQUIRE(MORE(), REG_EBRACK);
 748                 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 749                 break;
 750         default:                /* symbol, ordinary character, or range */
 751 /* xxx revision needed for multichar stuff */
 752                 start = p_b_symbol(p);
 753                 if (SEE('-') && MORE2() && PEEK2() != ']') {
 754                         /* range */
 755                         NEXT();
 756                         if (EAT('-'))
 757                                 finish = '-';
 758                         else
 759                                 finish = p_b_symbol(p);
 760                 } else
 761                         finish = start;
 762 /* xxx what about signed chars here... */
 763                 REQUIRE(start <= finish, REG_ERANGE);
 764                 for (i = start; i <= finish; i++)
 765                         CHadd(cs, i);
 766                 break;
 767         }
 768 }
 769
 770 /*
 771  - p_b_cclass - parse a character-class name and deal with it
 772  */
 773 static void
 774 p_b_cclass(struct parse *p, cset *cs)
 775 {
 776         char *sp = p->next;
 777         struct cclass *cp;
 778         size_t len;
 779         const char *u;
 780         char c;
 781
 782         while (MORE() && isalpha(PEEK()))
 783                 NEXT();
 784         len = p->next - sp;
 785         for (cp = cclasses; cp->name != NULL; cp++)
 786                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 787                         break;
 788         if (cp->name == NULL) {
 789                 /* oops, didn't find it */
 790                 SETERROR(REG_ECTYPE);
 791                 return;
 792         }
 793
 794         u = cp->chars;
 795         while ((c = *u++) != '\0')
 796                 CHadd(cs, c);
 797         for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
 798                 MCadd(p, cs, u);
 799 }
 800
 801 /*
 802  - p_b_eclass - parse an equivalence-class name and deal with it
 803  *
 804  * This implementation is incomplete. xxx
 805  */
 806 static void
 807 p_b_eclass(struct parse *p, cset *cs)
 808 {
 809         char c;
 810
 811         c = p_b_coll_elem(p, '=');
 812         CHadd(cs, c);
 813 }
 814
 815 /*
 816  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 817  */
 818 static char                     /* value of symbol */
 819 p_b_symbol(struct parse *p)
 820 {
 821         char value;
 822
 823         REQUIRE(MORE(), REG_EBRACK);
 824         if (!EATTWO('[', '.'))
 825                 return(GETNEXT());
 826
 827         /* collating symbol */
 828         value = p_b_coll_elem(p, '.');
 829         REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
 830         return(value);
 831 }
 832
 833 /*
 834  - p_b_coll_elem - parse a collating-element name and look it up
 835  */
 836 static char                     /* value of collating element */
 837 p_b_coll_elem(struct parse *p,
 838     int endc)                   /* name ended by endc,']' */
 839 {
 840         char *sp = p->next;
 841         struct cname *cp;
 842         int len;
 843
 844         while (MORE() && !SEETWO(endc, ']'))
 845                 NEXT();
 846         if (!MORE()) {
 847                 SETERROR(REG_EBRACK);
 848                 return(0);
 849         }
 850         len = p->next - sp;
 851         for (cp = cnames; cp->name != NULL; cp++)
 852                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 853                         return(cp->code);       /* known name */
 854         if (len == 1)
 855                 return(*sp);    /* single character */
 856         SETERROR(REG_ECOLLATE);                 /* neither */
 857         return(0);
 858 }
 859
 860 /*
 861  - othercase - return the case counterpart of an alphabetic
 862  */
 863 static char                     /* if no counterpart, return ch */
 864 othercase(int ch)
 865 {
 866         ch = (uch)ch;
 867         assert(isalpha(ch));
 868         if (isupper(ch))
 869                 return ((uch)tolower(ch));
 870         else if (islower(ch))
 871                 return ((uch)toupper(ch));
 872         else                    /* peculiar, but could happen */
 873                 return(ch);
 874 }
 875
 876 /*
 877  - bothcases - emit a dualcase version of a two-case character
 878  *
 879  * Boy, is this implementation ever a kludge...
 880  */
 881 static void
 882 bothcases(struct parse *p, int ch)
 883 {
 884         char *oldnext = p->next;
 885         char *oldend = p->end;
 886         char bracket[3];
 887
 888         ch = (uch)ch;
 889         assert(othercase(ch) != ch);    /* p_bracket() would recurse */
 890         p->next = bracket;
 891         p->end = bracket+2;
 892         bracket[0] = ch;
 893         bracket[1] = ']';
 894         bracket[2] = '\0';
 895         p_bracket(p);
 896         assert(p->next == bracket+2);
 897         p->next = oldnext;
 898         p->end = oldend;
 899 }
 900
 901 /*
 902  - ordinary - emit an ordinary character
 903  */
 904 static void
 905 ordinary(struct parse *p, int ch)
 906 {
 907         cat_t *cap = p->g->categories;
 908
 909         if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
 910                 bothcases(p, ch);
 911         else {
 912                 EMIT(OCHAR, (uch)ch);
 913                 if (cap[ch] == 0)
 914                         cap[ch] = p->g->ncategories++;
 915         }
 916 }
 917
 918 /*
 919  - nonnewline - emit REG_NEWLINE version of OANY
 920  *
 921  * Boy, is this implementation ever a kludge...
 922  */
 923 static void
 924 nonnewline(struct parse *p)
 925 {
 926         char *oldnext = p->next;
 927         char *oldend = p->end;
 928         char bracket[4];
 929
 930         p->next = bracket;
 931         p->end = bracket+3;
 932         bracket[0] = '^';
 933         bracket[1] = '\n';
 934         bracket[2] = ']';
 935         bracket[3] = '\0';
 936         p_bracket(p);
 937         assert(p->next == bracket+3);
 938         p->next = oldnext;
 939         p->end = oldend;
 940 }
 941
 942 /*
 943  - repeat - generate code for a bounded repetition, recursively if needed
 944  */
 945 static void
 946 repeat(struct parse *p,
 947     sopno start,                /* operand from here to end of strip */
 948     int from,                   /* repeated from this number */
 949     int to)                     /* to this number of times (maybe INFINITY) */
 950 {
 951         sopno finish = HERE();
 952 #       define  N       2
 953 #       define  INF     3
 954 #       define  REP(f, t)       ((f)*8 + (t))
 955 #       define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
 956         sopno copy;
 957
 958         if (p->error != 0)      /* head off possible runaway recursion */
 959                 return;
 960
 961         assert(from <= to);
 962
 963         switch (REP(MAP(from), MAP(to))) {
 964         case REP(0, 0):                 /* must be user doing this */
 965                 DROP(finish-start);     /* drop the operand */
 966                 break;
 967         case REP(0, 1):                 /* as x{1,1}? */
 968         case REP(0, N):                 /* as x{1,n}? */
 969         case REP(0, INF):               /* as x{1,}? */
 970                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 971                 INSERT(OCH_, start);            /* offset is wrong... */
 972                 repeat(p, start+1, 1, to);
 973                 ASTERN(OOR1, start);
 974                 AHEAD(start);                   /* ... fix it */
 975                 EMIT(OOR2, 0);
 976                 AHEAD(THERE());
 977                 ASTERN(O_CH, THERETHERE());
 978                 break;
 979         case REP(1, 1):                 /* trivial case */
 980                 /* done */
 981                 break;
 982         case REP(1, N):                 /* as x?x{1,n-1} */
 983                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 984                 INSERT(OCH_, start);
 985                 ASTERN(OOR1, start);
 986                 AHEAD(start);
 987                 EMIT(OOR2, 0);                  /* offset very wrong... */
 988                 AHEAD(THERE());                 /* ...so fix it */
 989                 ASTERN(O_CH, THERETHERE());
 990                 copy = dupl(p, start+1, finish+1);
 991                 assert(copy == finish+4);
 992                 repeat(p, copy, 1, to-1);
 993                 break;
 994         case REP(1, INF):               /* as x+ */
 995                 INSERT(OPLUS_, start);
 996                 ASTERN(O_PLUS, start);
 997                 break;
 998         case REP(N, N):                 /* as xx{m-1,n-1} */
 999                 copy = dupl(p, start, finish);
1000                 repeat(p, copy, from-1, to-1);
1001                 break;
1002         case REP(N, INF):               /* as xx{n-1,INF} */
1003                 copy = dupl(p, start, finish);
1004                 repeat(p, copy, from-1, to);
1005                 break;
1006         default:                        /* "can't happen" */
1007                 SETERROR(REG_ASSERT);   /* just in case */
1008                 break;
1009         }
1010 }
1011
1012 /*
1013  - seterr - set an error condition
1014  */
1015 static int                      /* useless but makes type checking happy */
1016 seterr(struct parse *p, int e)
1017 {
1018         if (p->error == 0)      /* keep earliest error condition */
1019                 p->error = e;
1020         p->next = nuls;         /* try to bring things to a halt */
1021         p->end = nuls;
1022         return(0);              /* make the return value well-defined */
1023 }
1024
1025 /*
1026  - allocset - allocate a set of characters for []
1027  */
1028 static cset *
1029 allocset(struct parse *p)
1030 {
1031         int no = p->g->ncsets++;
1032         size_t nc;
1033         size_t nbytes;
1034         cset *cs;
1035         size_t css = (size_t)p->g->csetsize;
1036         int i;
1037
1038         if (no >= p->ncsalloc) {        /* need another column of space */
1039                 void *ptr;
1040
1041                 p->ncsalloc += CHAR_BIT;
1042                 nc = p->ncsalloc;
1043                 assert(nc % CHAR_BIT == 0);
1044                 nbytes = nc / CHAR_BIT * css;
1045
1046                 ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1047                 if (ptr == NULL)
1048                         goto nomem;
1049                 p->g->sets = ptr;
1050
1051                 ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
1052                 if (ptr == NULL)
1053                         goto nomem;
1054                 p->g->setbits = ptr;
1055
1056                 for (i = 0; i < no; i++)
1057                         p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1058
1059                 (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1060         }
1061         /* XXX should not happen */
1062         if (p->g->sets == NULL || p->g->setbits == NULL)
1063                 goto nomem;
1064
1065         cs = &p->g->sets[no];
1066         cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1067         cs->mask = 1 << ((no) % CHAR_BIT);
1068         cs->hash = 0;
1069         cs->smultis = 0;
1070         cs->multis = NULL;
1071
1072         return(cs);
1073 nomem:
1074         free(p->g->sets);
1075         p->g->sets = NULL;
1076         free(p->g->setbits);
1077         p->g->setbits = NULL;
1078
1079         SETERROR(REG_ESPACE);
1080         /* caller's responsibility not to do set ops */
1081         return(NULL);
1082 }
1083
1084 /*
1085  - freeset - free a now-unused set
1086  */
1087 static void
1088 freeset(struct parse *p, cset *cs)
1089 {
1090         size_t i;
1091         cset *top = &p->g->sets[p->g->ncsets];
1092         size_t css = (size_t)p->g->csetsize;
1093
1094         for (i = 0; i < css; i++)
1095                 CHsub(cs, i);
1096         if (cs == top-1)        /* recover only the easy case */
1097                 p->g->ncsets--;
1098 }
1099
1100 /*
1101  - freezeset - final processing on a set of characters
1102  *
1103  * The main task here is merging identical sets.  This is usually a waste
1104  * of time (although the hash code minimizes the overhead), but can win
1105  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1106  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1107  * the same value!
1108  */
1109 static int                      /* set number */
1110 freezeset(struct parse *p, cset *cs)
1111 {
1112         uch h = cs->hash;
1113         size_t i;
1114         cset *top = &p->g->sets[p->g->ncsets];
1115         cset *cs2;
1116         size_t css = (size_t)p->g->csetsize;
1117
1118         /* look for an earlier one which is the same */
1119         for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1120                 if (cs2->hash == h && cs2 != cs) {
1121                         /* maybe */
1122                         for (i = 0; i < css; i++)
1123                                 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1124                                         break;          /* no */
1125                         if (i == css)
1126                                 break;                  /* yes */
1127                 }
1128
1129         if (cs2 < top) {        /* found one */
1130                 freeset(p, cs);
1131                 cs = cs2;
1132         }
1133
1134         return((int)(cs - p->g->sets));
1135 }
1136
1137 /*
1138  - firstch - return first character in a set (which must have at least one)
1139  */
1140 static int                      /* character; there is no "none" value */
1141 firstch(struct parse *p, cset *cs)
1142 {
1143         size_t i;
1144         size_t css = (size_t)p->g->csetsize;
1145
1146         for (i = 0; i < css; i++)
1147                 if (CHIN(cs, i))
1148                         return((char)i);
1149         assert(never);
1150         return(0);              /* arbitrary */
1151 }
1152
1153 /*
1154  - nch - number of characters in a set
1155  */
1156 static int
1157 nch(struct parse *p, cset *cs)
1158 {
1159         size_t i;
1160         size_t css = (size_t)p->g->csetsize;
1161         int n = 0;
1162
1163         for (i = 0; i < css; i++)
1164                 if (CHIN(cs, i))
1165                         n++;
1166         return(n);
1167 }
1168
1169 /*
1170  - mcadd - add a collating element to a cset
1171  */
1172 static void
1173 mcadd( struct parse *p, cset *cs, const char *cp)
1174 {
1175         size_t oldend = cs->smultis;
1176         void *np;
1177
1178         cs->smultis += strlen(cp) + 1;
1179         np = realloc(cs->multis, cs->smultis);
1180         if (np == NULL) {
1181                 if (cs->multis)
1182                         free(cs->multis);
1183                 cs->multis = NULL;
1184                 SETERROR(REG_ESPACE);
1185                 return;
1186         }
1187         cs->multis = np;
1188
1189         llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1190 }
1191
1192 /*
1193  - mcinvert - invert the list of collating elements in a cset
1194  *
1195  * This would have to know the set of possibilities.  Implementation
1196  * is deferred.
1197  */
1198 /* ARGSUSED */
1199 static void
1200 mcinvert(struct parse *p, cset *cs)
1201 {
1202         assert(cs->multis == NULL);     /* xxx */
1203 }
1204
1205 /*
1206  - mccase - add case counterparts of the list of collating elements in a cset
1207  *
1208  * This would have to know the set of possibilities.  Implementation
1209  * is deferred.
1210  */
1211 /* ARGSUSED */
1212 static void
1213 mccase(struct parse *p, cset *cs)
1214 {
1215         assert(cs->multis == NULL);     /* xxx */
1216 }
1217
1218 /*
1219  - isinsets - is this character in any sets?
1220  */
1221 static int                      /* predicate */
1222 isinsets(struct re_guts *g, int c)
1223 {
1224         uch *col;
1225         int i;
1226         int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1227         unsigned uc = (uch)c;
1228
1229         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1230                 if (col[uc] != 0)
1231                         return(1);
1232         return(0);
1233 }
1234
1235 /*
1236  - samesets - are these two characters in exactly the same sets?
1237  */
1238 static int                      /* predicate */
1239 samesets(struct re_guts *g, int c1, int c2)
1240 {
1241         uch *col;
1242         int i;
1243         int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1244         unsigned uc1 = (uch)c1;
1245         unsigned uc2 = (uch)c2;
1246
1247         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1248                 if (col[uc1] != col[uc2])
1249                         return(0);
1250         return(1);
1251 }
1252
1253 /*
1254  - categorize - sort out character categories
1255  */
1256 static void
1257 categorize(struct parse *p, struct re_guts *g)
1258 {
1259         cat_t *cats = g->categories;
1260         int c;
1261         int c2;
1262         cat_t cat;
1263
1264         /* avoid making error situations worse */
1265         if (p->error != 0)
1266                 return;
1267
1268         for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1269                 if (cats[c] == 0 && isinsets(g, c)) {
1270                         cat = g->ncategories++;
1271                         cats[c] = cat;
1272                         for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1273                                 if (cats[c2] == 0 && samesets(g, c, c2))
1274                                         cats[c2] = cat;
1275                 }
1276 }
1277
1278 /*
1279  - dupl - emit a duplicate of a bunch of sops
1280  */
1281 static sopno                    /* start of duplicate */
1282 dupl(struct parse *p,
1283     sopno start,                /* from here */
1284     sopno finish)               /* to this less one */
1285 {
1286         sopno ret = HERE();
1287         sopno len = finish - start;
1288
1289         assert(finish >= start);
1290         if (len == 0)
1291                 return(ret);
1292         enlarge(p, p->ssize + len);     /* this many unexpected additions */
1293         assert(p->ssize >= p->slen + len);
1294         (void) memmove((char *)(p->strip + p->slen),
1295                 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1296         p->slen += len;
1297         return(ret);
1298 }
1299
1300 /*
1301  - doemit - emit a strip operator
1302  *
1303  * It might seem better to implement this as a macro with a function as
1304  * hard-case backup, but it's just too big and messy unless there are
1305  * some changes to the data structures.  Maybe later.
1306  */
1307 static void
1308 doemit(struct parse *p, sop op, size_t opnd)
1309 {
1310         /* avoid making error situations worse */
1311         if (p->error != 0)
1312                 return;
1313
1314         /* deal with oversize operands ("can't happen", more or less) */
1315         assert(opnd < 1<<OPSHIFT);
1316
1317         /* deal with undersized strip */
1318         if (p->slen >= p->ssize)
1319                 enlarge(p, (p->ssize+1) / 2 * 3);       /* +50% */
1320         assert(p->slen < p->ssize);
1321
1322         /* finally, it's all reduced to the easy case */
1323         p->strip[p->slen++] = SOP(op, opnd);
1324 }
1325
1326 /*
1327  - doinsert - insert a sop into the strip
1328  */
1329 static void
1330 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1331 {
1332         sopno sn;
1333         sop s;
1334         int i;
1335
1336         /* avoid making error situations worse */
1337         if (p->error != 0)
1338                 return;
1339
1340         sn = HERE();
1341         EMIT(op, opnd);         /* do checks, ensure space */
1342         assert(HERE() == sn+1);
1343         s = p->strip[sn];
1344
1345         /* adjust paren pointers */
1346         assert(pos > 0);
1347         for (i = 1; i < NPAREN; i++) {
1348                 if (p->pbegin[i] >= pos) {
1349                         p->pbegin[i]++;
1350                 }
1351                 if (p->pend[i] >= pos) {
1352                         p->pend[i]++;
1353                 }
1354         }
1355
1356         memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1357                                                 (HERE()-pos-1)*sizeof(sop));
1358         p->strip[pos] = s;
1359 }
1360
1361 /*
1362  - dofwd - complete a forward reference
1363  */
1364 static void
1365 dofwd(struct parse *p, sopno pos, sop value)
1366 {
1367         /* avoid making error situations worse */
1368         if (p->error != 0)
1369                 return;
1370
1371         assert(value < 1<<OPSHIFT);
1372         p->strip[pos] = OP(p->strip[pos]) | value;
1373 }
1374
1375 /*
1376  - enlarge - enlarge the strip
1377  */
1378 static void
1379 enlarge(struct parse *p, sopno size)
1380 {
1381         sop *sp;
1382
1383         if (p->ssize >= size)
1384                 return;
1385
1386         sp = (sop *)realloc(p->strip, size*sizeof(sop));
1387         if (sp == NULL) {
1388                 SETERROR(REG_ESPACE);
1389                 return;
1390         }
1391         p->strip = sp;
1392         p->ssize = size;
1393 }
1394
1395 /*
1396  - stripsnug - compact the strip
1397  */
1398 static void
1399 stripsnug(struct parse *p, struct re_guts *g)
1400 {
1401         g->nstates = p->slen;
1402         g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1403         if (g->strip == NULL) {
1404                 SETERROR(REG_ESPACE);
1405                 g->strip = p->strip;
1406         }
1407 }
1408
1409 /*
1410  - findmust - fill in must and mlen with longest mandatory literal string
1411  *
1412  * This algorithm could do fancy things like analyzing the operands of |
1413  * for common subsequences.  Someday.  This code is simple and finds most
1414  * of the interesting cases.
1415  *
1416  * Note that must and mlen got initialized during setup.
1417  */
1418 static void
1419 findmust(struct parse *p, struct re_guts *g)
1420 {
1421         sop *scan;
1422         sop *start = 0; /* start initialized in the default case, after that */
1423         sop *newstart = 0; /* newstart was initialized in the OCHAR case */
1424         sopno newlen;
1425         sop s;
1426         char *cp;
1427         sopno i;
1428
1429         /* avoid making error situations worse */
1430         if (p->error != 0)
1431                 return;
1432
1433         /* find the longest OCHAR sequence in strip */
1434         newlen = 0;
1435         scan = g->strip + 1;
1436         do {
1437                 s = *scan++;
1438                 switch (OP(s)) {
1439                 case OCHAR:             /* sequence member */
1440                         if (newlen == 0)                /* new sequence */
1441                                 newstart = scan - 1;
1442                         newlen++;
1443                         break;
1444                 case OPLUS_:            /* things that don't break one */
1445                 case OLPAREN:
1446                 case ORPAREN:
1447                         break;
1448                 case OQUEST_:           /* things that must be skipped */
1449                 case OCH_:
1450                         scan--;
1451                         do {
1452                                 scan += OPND(s);
1453                                 s = *scan;
1454                                 /* assert() interferes w debug printouts */
1455                                 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1456                                                         OP(s) != OOR2) {
1457                                         g->iflags |= REGEX_BAD;
1458                                         return;
1459                                 }
1460                         } while (OP(s) != O_QUEST && OP(s) != O_CH);
1461                         /* fallthrough */
1462                 default:                /* things that break a sequence */
1463                         if (newlen > g->mlen) {         /* ends one */
1464                                 start = newstart;
1465                                 g->mlen = newlen;
1466                         }
1467                         newlen = 0;
1468                         break;
1469                 }
1470         } while (OP(s) != OEND);
1471
1472         if (g->mlen == 0)               /* there isn't one */
1473                 return;
1474
1475         /* turn it into a character string */
1476         g->must = malloc((size_t)g->mlen + 1);
1477         if (g->must == NULL) {          /* argh; just forget it */
1478                 g->mlen = 0;
1479                 return;
1480         }
1481         cp = g->must;
1482         scan = start;
1483         for (i = g->mlen; i > 0; i--) {
1484                 while (OP(s = *scan++) != OCHAR)
1485                         continue;
1486                 assert(cp < g->must + g->mlen);
1487                 *cp++ = (char)OPND(s);
1488         }
1489         assert(cp == g->must + g->mlen);
1490         *cp++ = '\0';           /* just on general principles */
1491 }
1492
1493 /*
1494  - pluscount - count + nesting
1495  */
1496 static sopno                    /* nesting depth */
1497 pluscount(struct parse *p, struct re_guts *g)
1498 {
1499         sop *scan;
1500         sop s;
1501         sopno plusnest = 0;
1502         sopno maxnest = 0;
1503
1504         if (p->error != 0)
1505                 return(0);      /* there may not be an OEND */
1506
1507         scan = g->strip + 1;
1508         do {
1509                 s = *scan++;
1510                 switch (OP(s)) {
1511                 case OPLUS_:
1512                         plusnest++;
1513                         break;
1514                 case O_PLUS:
1515                         if (plusnest > maxnest)
1516                                 maxnest = plusnest;
1517                         plusnest--;
1518                         break;
1519                 }
1520         } while (OP(s) != OEND);
1521         if (plusnest != 0)
1522                 g->iflags |= REGEX_BAD;
1523         return(maxnest);
1524 }