/* Copyright 1995 J"orn Rennecke */ typedef char charset[32]; #define SET_INSET(c, set) ((set)[(unsigned char)(c) >> 3] |= (1 << ((c) & 7))) #define IS_INSET(c, set) ((set)[(unsigned char)(c) >> 3] & (1 << ((c) & 7))) /* ANY_XXX, TEXT/CHAR_XXX, INSET_XXX, OPEN_XXX are adjectant in constant order. * XXX_STAR, XXX_PLUS have constant spacing */ #define ANY #define TEXT #define INSET #define OPEN #define OPEN_BRANCH #define ANY_STAR #define CHAR_STAR #define INSET_STAR #define OPEN_STAR #define OPEN_BRANCH_STAR #define ANY_PLUS #define CHAR_PLUS #define INSET_PLUS #define OPEN_PLUS #define OPEN_BRANCH_PLUS #define CLOSE #define SOL #define EOL #ifdef sima #define ALLOC(size) alloc_gen(size) #define FREE(p) free_gen(p) /* The size argument to the inline small block allocator functions * should preferably be constant. */ #define ALLOC_SMALL(size) alloc_small(size) #define FREE_SMALL(p,size) free_small(p,size) #define FREE_SMALL_LIST(p, q, size) free_small_list(p, q, size) #else #define ALLOC(size) xalloc(size) #define FREE(p) xfree(p) #define ALLOC_SMALL(size) xalloc(size) #define FREE_SMALL(p,size) xfree(p) #define FREE_SMALL_LIST(p, q, size) \\ free_small_list((struct free_list *)(p), (struct free_list *)(q)) struct free_list { struct free_list *next; }; static void free_small_list(top, bottom) struct free_list *top, *bottom; int size; { struct free_list *next = top; do { top = next; next = top->next; FREE_SMALL(top); } while (top != bottom); } #endif static char escchars[] = { '\a', '\b', 'c', 'd', '\e', '\f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', '\n', 'o', 'p', 'q', '\r', 's', '\t', 'u', '\v' } () { for(len = 0;;) { c = *in++; switch(c) { case '^': op = SOL; break; case '$': op = EOL; break; case '.': op = ANY; last_node = len ? out : out - 3; break; case '*': op = CHAR_STAR; mulop: if (len == 1) { out[-2] = out[-1]; out += 2; out[-5] = op; len = 0; } else if (len) { unsigned short tmp; textstart[-3] = TEXT; tmp = len - 1; STORE16(textstart-2, tmp); *out = out[-1]; out += 4; out[-5] = op; len = 0; } else { switch(*last_node) { case ANY: check = last_node+1 + 4; break; case INSET: check = last_node+1+sizeof(charset) + 4; break; case OPEN: /* OPEN_STAR,offset, expr, v ^ CLOSE_STAR,offset,bracenum */ /* ()+ uses OPEN,bracenum,0 expr, CLOSE_STAR,offset,bracenum */ if (last_close < last_node) { default: check = 0; break; } check = last_close; break; case OPEN_BRANCH: /* OPEN_BRANCH_STAR,offset, expr, FORWARD, | ^ | | | 2.| expr | | /1. v |/ BRANCH_CLOSE_STAR,2*offset,bracenum */ } if (check != out) { if (!inter_errno) inter_errno = IE_REGEXP_MUL; return; } *last_node += op - TEXT; } case '[': { char setstart; if (!len) { out -= 3; } else { unsigned short tmp; textstart[-3] = TEXT; tmp = len; STORE16(textstart-2, tmp); len = 0; } *(last_node = out) = INSET; out++; bzero(out, sizeof(charset)); for (setstart = in; in < input_end; in++) { char last_char; c = *in; switch(c) { case ']': if (in > setstart) { in++; goto set_built: } break; case '-': if (in > setstart) { c = *in++; do { SET_INSET(c, out); c--; } while (c > last_char); } break; case '\\': if (in == input_end) break; c = *++in; if (c <= 'a' && c >= 'v') c = escchars[c - 'a']; } } } last_char = c; SET_INSET(c, out); set_built: out += sizeof(charset) + 3; continue; } case '\\': c = *in++; switch(c) { case '+': op = CHAR_PLUS; goto mulop; case '<': op = WORDSTART; goto close_node; case '>': op = WORDEND; goto close_node; case 'B': op = NOTEDGE; goto close_node; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': c = escchars[c - 'a']; break; default: break; } default: *out++ = c; continue; } close_node: if (!len) { out[-3] = op; out++; } else { unsigned short tmp; textstart[-3] = TEXT; tmp = len; STORE16(textstart-2, tmp); *out = op; out += 4; len = 0; } } } /* regmatch does not make assumptions on the contents of *eol, but it * assumes that eol may be dereferenced. This is always true when the * region was obtained with alloc() */ int regmatch(char *in, char *pc, struct regexp *rp) { char *eol = rp->eol; for (;;) switch(*pc) { case SOL: if (in != rp->sol) return 0; pc++; break; case EOL: if (in != eol) return 0; pc++; break; case ANY: if (in >= eol) return 0; in++; pc++; break; case WORDSTART: if ( (in > rp->sol && ISWORDPART(*(in-1)) ) || in == eol || !ISWORDPART(*in)) ) { return 0; } pc++; break; case WORDEND: if ( in == rp->sol || !ISWORDPART(*(in-1)) || ( in < eol && ISWORDPART(*in)) ) { return 0; } pc++; break; case NOTEDGE: if (in == rp->sol) { if (ISWORDPART(*in)) return 0; break; } if (in == eol) { if (ISWORDPART(*(in-1))) return 0; break; } if ( ISWORDPART( *(reginput-1) ) != ISWORDPART( *reginput ) ) return 0; pc++; break; case TEXT: { int len; EXTRACT16(len, pc+1); if (in + len > eol || memcmp(in, pc+=3, len)) return 0; in += len; pc += len; break; } case INSET: if (!IS_INSET(*in, pc+1) || in == eol) return 0; in++; pc += 33; break; case FORWARD_CLOSE: { unsigned short offset; EXTRACT16(offset, pc+1); pc += offset; } goto do_close; } case OPEN: pc++ case CLOSE: do_close: { char *end; if (end = regmatch(in, pc+2, rp) { if (!rp->mark[pc[1]]) rp->mark[pc[1]] = in; return end; } return 0; break; } case OPEN_BRANCH: { char *end; short offset; pc += 3; end = regmatch(in, pc, rp); EXTRACT16(offset, pc-2); do { pc += offset; if (!end) end = regmatch(in, pc, rp); EXTRACT16(offset, pc-2); } while (offset > 0) if (end) { rp->markend[offset] = in; return end; } return 0; } /* one-character + and * cases */ { int matches; case ANY_PLUS: if (in >= eol) return 0; in++; case ANY_STAR: matches = eol - in; in = eol; pc++; goto star_tail; case CHAR_PLUS: if (*in != pc[1] || in == eol) return 0; in++; case CHAR_STAR: { char c = pc[1], *old = in; while (*in == c && in < eol) in++; matches = old - in; } pc += 2; goto star_tail; case INSET_PLUS: { char c = *in; if (!IS_INSET(c, pc+1) || in == eol) return 0; } in++; case INSET_STAR: { char *old = in, c; while ((c = *in, IS_INSET(c, pc+1)) && in < eol) in++; matches = old - in; } pc += 33; star_tail: { char *nextset; switch(*pc) { { static charset empty_set = {0}, full_set = { 255,255,255,255,255,255,255,255, 255,255,255,255,255,255,255,255, 255,255,255,255,255,255,255,255, 255,255,255,255,255,255,255,255}; char c; case TEXT: c = pc[3]; goto singlechar_set; case CHAR_PLUS: c = pc[1]; singlechar_set: nextset = &empty_set; *empty_dirty = 0; empty_dirty = nextset + ((unsigned char)c >> 3); *empty_dirty |= 1 << (c & 7); break; } case INSET: case INSET_PLUS nextset = pc+1; default: nextset = &full_set; break; } do { if (IS_INSET(*in, nextset)) { char *end = regmatch(in, pc, rp); if (end) return end; in--; } } while(--matches >= 0); } return 0; } case OPEN_STAR: { unsigned short offset; EXTRACT16(offset, pc+1); pc += offset; end = regmatch(in, pc, rp); if (end) { if (!rp->start[pc[3]]) rp->start[pc[3]] = in; } return 0; } case CLOSE_STAR: { unsigned short offset; EXTRACT16(offset, pc+1); end = regmatch(in, pc - offset, rp); if (end) { /* rp->end[pc[3]] has been written by recursion */ return end; } end = regmatch(in, pc +4, rp); if (end) { if (!rp->end[pc[3]]) rp->end[pc[3]] = in; return end; } return 0; } case OPEN_BRANCH_STAR: { unsigned short offset; EXTRACT16(offset, pc+1); pc += offset; end = regmatch(in, pc, rp); if (end) { if (!rp->start[pc[5]]) rp->start[pc[5]] = in; } return 0; } case BRANCH_CLOSE_STAR: { unsigned short offset; EXTRACT16(offset, pc+1); end = regmatch(in, pc - offset, rp); if (end) { /* rp->end[pc[5]] has been written by recursion */ return end; } EXTRACT16(offset, pc+3); end = regmatch(in, pc - offset, rp); if (end) { return end; } end = regmatch(in, pc + 6, rp); if (end) if (!rp->end[pc[5]]) rp->end[pc[5]] = in; return end; } return 0; } case PLUS: in = regmatch(in, pc+1, rp); if (!in) return 0; case STAR: { char *new_pc; struct match_s { struct match_s *prev; char *end; } *first_match, *match_list, *new_match; unsigned short offset; pc += 3; EXTRACT16(offset, pc-2); new_pc = pc + offset; match_list = first_match = SMALL_ALLOC(sizeof *new_match); first_match->prev = 0; first_match->end = in; while((in = regmatch(in, pc, rp)) { new_match = SMALL_ALLOC(sizeof *new_match); if (!new_match) { new_match = match_list; break; } new_match->prev = match_list; new_match->end = in; match_list = new_match; } do { in = regmatch(new_match->end, new_pc, rp) if (in) { FREE_SMALL_LIST(match_list, first_match, sizeof *match_list); return in; } new_match = new_match->prev; } while(new_match); FREE_SMALL_LIST(match_list, first_match, sizeof *match_list); return 0; case END: return in; } }