5 Author: Tatu Ylonen <ylo@ngs.fi>
7 Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
9 Permission to use, copy, modify, distribute, and sell this software
10 and its documentation is hereby granted without fee, provided that the
11 above copyright notice appears in all source code copies, the name of
12 Tatu Ylonen is not used to advertise products containing this software
13 or a derivation thereof, and all modified versions are clearly marked
16 This software is provided "as is" without express or implied warranty.
18 Created: Thu Sep 26 17:14:05 1991 ylo
19 Last modified: Sun Mar 29 16:47:31 1992 ylo
21 This code draws many ideas from the regular expression packages by
22 Henry Spencer of the University of Toronto and Richard Stallman of the
23 Free Software Foundation.
25 Emacs-specific code and syntax table code is almost directly borrowed
38 #include <sys/types.h>
46 #define MACRO_BEGIN do {
47 #define MACRO_END } while (0)
49 enum regexp_compiled_ops /* opcodes for compiled regexp */
51 Cend, /* end of pattern reached */
52 Cbol, /* beginning of line */
53 Ceol, /* end of line */
54 Cset, /* character set. Followed by 32 bytes of set. */
55 Cexact, /* followed by a byte to match */
56 Canychar, /* matches any character except newline */
57 Cstart_memory, /* set register start addr (followed by reg number) */
58 Cend_memory, /* set register end addr (followed by reg number) */
59 Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
60 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
61 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
62 Cfailure_jump, /* jump to addr on failure */
63 Cupdate_failure_jump, /* update topmost failure point and jump */
64 Cdummy_failure_jump, /* push a dummy failure point and jump */
65 Cbegbuf, /* match at beginning of buffer */
66 Cendbuf, /* match at end of buffer */
67 Cwordbeg, /* match at beginning of word */
68 Cwordend, /* match at end of word */
69 Cwordbound, /* match if at word boundary */
70 Cnotwordbound, /* match if not at word boundary */
72 Cemacs_at_dot, /* emacs only: matches at dot */
74 Csyntaxspec, /* matches syntax code (1 byte follows) */
75 Cnotsyntaxspec /* matches if syntax code does not match (1 byte foll)*/
78 enum regexp_syntax_op /* syntax codes for plain and quoted characters */
80 Rend, /* special code for end of regexp */
81 Rnormal, /* normal character */
82 Ranychar, /* any character except newline */
83 Rquote, /* the quote character */
84 Rbol, /* match beginning of line */
85 Reol, /* match end of line */
86 Roptional, /* match preceding expression optionally */
87 Rstar, /* match preceding expr zero or more times */
88 Rplus, /* match preceding expr one or more times */
89 Ror, /* match either of alternatives */
90 Ropenpar, /* opening parenthesis */
91 Rclosepar, /* closing parenthesis */
92 Rmemory, /* match memory register */
93 Rextended_memory, /* \vnn to match registers 10-99 */
94 Ropenset, /* open set. Internal syntax hard-coded below. */
95 /* the following are gnu extensions to "normal" regexp syntax */
96 Rbegbuf, /* beginning of buffer */
97 Rendbuf, /* end of buffer */
98 Rwordchar, /* word character */
99 Rnotwordchar, /* not word character */
100 Rwordbeg, /* beginning of word */
101 Rwordend, /* end of word */
102 Rwordbound, /* word bound */
103 Rnotwordbound, /* not word bound */
105 Remacs_at_dot, /* emacs: at dot */
106 Remacs_syntaxspec, /* syntaxspec */
107 Remacs_notsyntaxspec, /* notsyntaxspec */
112 static int re_compile_initialized = 0;
113 static int regexp_syntax = 0;
114 static unsigned char regexp_plain_ops[256];
115 static unsigned char regexp_quoted_ops[256];
116 static unsigned char regexp_precedences[Rnum_ops];
117 static int regexp_context_indep_ops;
118 static int regexp_ansi_sequences;
120 #define NUM_LEVELS 5 /* number of precedence levels in use */
121 #define MAX_NESTING 100 /* max nesting level of operators */
125 /* This code is for emacs compatibility only. */
132 /* emacs defines NULL in some strange way? */
138 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
142 char *re_syntax_table;
144 static char re_syntax_table[256];
145 #endif /* SYNTAX_TABLE */
149 static void re_compile_initialize()
153 #if !defined(emacs) && !defined(SYNTAX_TABLE)
154 static int syntax_table_inited = 0;
156 if (!syntax_table_inited)
158 syntax_table_inited = 1;
159 memset(re_syntax_table, 0, 256);
160 for (a = 'a'; a <= 'z'; a++)
161 re_syntax_table[a] = Sword;
162 for (a = 'A'; a <= 'Z'; a++)
163 re_syntax_table[a] = Sword;
164 for (a = '0'; a <= '9'; a++)
165 re_syntax_table[a] = Sword;
167 #endif /* !emacs && !SYNTAX_TABLE */
168 re_compile_initialized = 1;
169 for (a = 0; a < 256; a++)
171 regexp_plain_ops[a] = Rnormal;
172 regexp_quoted_ops[a] = Rnormal;
174 for (a = '0'; a <= '9'; a++)
175 regexp_quoted_ops[a] = Rmemory;
176 regexp_plain_ops['\134'] = Rquote;
177 if (regexp_syntax & RE_NO_BK_PARENS)
179 regexp_plain_ops['('] = Ropenpar;
180 regexp_plain_ops[')'] = Rclosepar;
184 regexp_quoted_ops['('] = Ropenpar;
185 regexp_quoted_ops[')'] = Rclosepar;
187 if (regexp_syntax & RE_NO_BK_VBAR)
188 regexp_plain_ops['\174'] = Ror;
190 regexp_quoted_ops['\174'] = Ror;
191 regexp_plain_ops['*'] = Rstar;
192 if (regexp_syntax & RE_BK_PLUS_QM)
194 regexp_quoted_ops['+'] = Rplus;
195 regexp_quoted_ops['?'] = Roptional;
199 regexp_plain_ops['+'] = Rplus;
200 regexp_plain_ops['?'] = Roptional;
202 if (regexp_syntax & RE_NEWLINE_OR)
203 regexp_plain_ops['\n'] = Ror;
204 regexp_plain_ops['\133'] = Ropenset;
205 regexp_plain_ops['\136'] = Rbol;
206 regexp_plain_ops['$'] = Reol;
207 regexp_plain_ops['.'] = Ranychar;
208 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
211 regexp_quoted_ops['='] = Remacs_at_dot;
212 regexp_quoted_ops['s'] = Remacs_syntaxspec;
213 regexp_quoted_ops['S'] = Remacs_notsyntaxspec;
215 regexp_quoted_ops['w'] = Rwordchar;
216 regexp_quoted_ops['W'] = Rnotwordchar;
217 regexp_quoted_ops['<'] = Rwordbeg;
218 regexp_quoted_ops['>'] = Rwordend;
219 regexp_quoted_ops['b'] = Rwordbound;
220 regexp_quoted_ops['B'] = Rnotwordbound;
221 regexp_quoted_ops['`'] = Rbegbuf;
222 regexp_quoted_ops['\''] = Rendbuf;
224 if (regexp_syntax & RE_ANSI_HEX)
225 regexp_quoted_ops['v'] = Rextended_memory;
226 for (a = 0; a < Rnum_ops; a++)
227 regexp_precedences[a] = 4;
228 if (regexp_syntax & RE_TIGHT_VBAR)
230 regexp_precedences[Ror] = 3;
231 regexp_precedences[Rbol] = 2;
232 regexp_precedences[Reol] = 2;
236 regexp_precedences[Ror] = 2;
237 regexp_precedences[Rbol] = 3;
238 regexp_precedences[Reol] = 3;
240 regexp_precedences[Rclosepar] = 1;
241 regexp_precedences[Rend] = 0;
242 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
243 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
246 int re_set_syntax(syntax)
252 regexp_syntax = syntax;
253 re_compile_initialize();
257 static int hex_char_to_decimal(ch)
260 if (ch >= '0' && ch <= '9')
262 if (ch >= 'a' && ch <= 'f')
263 return ch - 'a' + 10;
264 if (ch >= 'A' && ch <= 'F')
265 return ch - 'A' + 10;
269 char *re_compile_pattern(regex, size, bufp)
274 int a, pos, op, current_level, level, opcode;
275 int pattern_offset = 0, alloc;
276 int starts[NUM_LEVELS * MAX_NESTING], starts_base;
277 int future_jumps[MAX_NESTING], num_jumps;
278 unsigned char ch = 0;
279 char *pattern, *translate;
280 int next_register, paren_depth, num_open_registers, open_registers[RE_NREGS];
281 int beginning_context;
283 #define NEXTCHAR(var) \
286 goto ends_prematurely; \
287 (var) = regex[pos]; \
291 #define ALLOC(amount) \
293 if (pattern_offset+(amount) > alloc) \
295 alloc += 256 + (amount); \
296 pattern = realloc(pattern, alloc); \
298 goto out_of_memory; \
302 #define STORE(ch) pattern[pattern_offset++] = (ch)
304 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
306 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
308 #define PUSH_LEVEL_STARTS if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
309 starts_base += NUM_LEVELS; \
313 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
315 #define PUT_ADDR(offset,addr) \
317 int disp = (addr) - (offset) - 2; \
318 pattern[(offset)] = disp & 0xff; \
319 pattern[(offset)+1] = (disp>>8) & 0xff; \
322 #define INSERT_JUMP(pos,type,addr) \
324 int a, p = (pos), t = (type), ad = (addr); \
325 for (a = pattern_offset - 1; a >= p; a--) \
326 pattern[a + 3] = pattern[a]; \
329 pattern_offset += 3; \
332 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
336 bufp->allocated = alloc; \
337 bufp->buffer = pattern; \
338 bufp->used = pattern_offset; \
341 #define GETHEX(var) \
343 char gethex_ch, gethex_value; \
344 NEXTCHAR(gethex_ch); \
345 gethex_value = hex_char_to_decimal(gethex_ch); \
346 if (gethex_value == 16) \
348 NEXTCHAR(gethex_ch); \
349 gethex_ch = hex_char_to_decimal(gethex_ch); \
350 if (gethex_ch == 16) \
352 (var) = gethex_value * 16 + gethex_ch; \
355 #define ANSI_TRANSLATE(ch) \
361 ch = 7; /* audible bell */ \
365 ch = 8; /* backspace */ \
369 ch = 12; /* form feed */ \
373 ch = 10; /* line feed */ \
377 ch = 13; /* carriage return */ \
385 ch = 11; /* vertical tab */ \
387 case 'x': /* hex code */ \
392 /* other characters passed through */ \
394 ch = translate[(unsigned char)ch]; \
399 if (!re_compile_initialized)
400 re_compile_initialize();
402 bufp->fastmap_accurate = 0;
403 bufp->uses_registers = 0;
404 translate = bufp->translate;
405 pattern = bufp->buffer;
406 alloc = bufp->allocated;
407 if (alloc == 0 || pattern == NULL)
410 pattern = malloc(alloc);
419 num_open_registers = 0;
422 beginning_context = 1;
424 /* we use Rend dummy to ensure that pending jumps are updated (due to
425 low priority of Rend) before exiting the loop. */
435 ch = translate[(unsigned char)ch];
436 op = regexp_plain_ops[(unsigned char)ch];
440 op = regexp_quoted_ops[(unsigned char)ch];
441 if (op == Rnormal && regexp_ansi_sequences)
445 level = regexp_precedences[op];
446 /* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
447 ch, op, level, current_level, CURRENT_LEVEL_START); */
448 if (level > current_level)
450 for (current_level++; current_level < level; current_level++)
455 if (level < current_level)
457 current_level = level;
458 for (;num_jumps > 0 &&
459 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
461 PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
470 store_opcode_and_arg: /* opcode & ch must be set */
487 if (!beginning_context) {
488 if (regexp_context_indep_ops)
496 if (!((pos >= size) ||
497 ((regexp_syntax & RE_NO_BK_VBAR) ?
498 (regex[pos] == '\174') :
499 (pos+1 < size && regex[pos] == '\134' &&
500 regex[pos+1] == '\174')) ||
501 ((regexp_syntax & RE_NO_BK_PARENS)?
503 (pos+1 < size && regex[pos] == '\134' &&
504 regex[pos+1] == ')')))) {
505 if (regexp_context_indep_ops)
514 if (beginning_context) {
515 if (regexp_context_indep_ops)
520 if (CURRENT_LEVEL_START == pattern_offset)
521 break; /* ignore empty patterns for ? */
523 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
528 if (beginning_context) {
529 if (regexp_context_indep_ops)
534 if (CURRENT_LEVEL_START == pattern_offset)
535 break; /* ignore empty patterns for + and * */
537 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
539 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
540 if (op == Rplus) /* jump over initial failure_jump */
541 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
542 CURRENT_LEVEL_START + 6);
546 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
548 if (num_jumps >= MAX_NESTING)
551 future_jumps[num_jumps++] = pattern_offset;
558 if (next_register < RE_NREGS)
560 bufp->uses_registers = 1;
562 STORE(Cstart_memory);
563 STORE(next_register);
564 open_registers[num_open_registers++] = next_register;
573 if (paren_depth <= 0)
574 goto parenthesis_error;
576 current_level = regexp_precedences[Ropenpar];
578 if (paren_depth < num_open_registers)
580 bufp->uses_registers = 1;
583 num_open_registers--;
584 STORE(open_registers[num_open_registers]);
589 goto bad_match_register;
590 assert(ch >= '0' && ch <= '9');
591 bufp->uses_registers = 1;
592 opcode = Cmatch_memory;
594 goto store_opcode_and_arg;
595 case Rextended_memory:
597 if (ch < '0' || ch > '9')
598 goto bad_match_register;
600 if (a < '0' || a > '9')
601 goto bad_match_register;
602 ch = 10 * (a - '0') + ch - '0';
603 if (ch <= 0 || ch >= RE_NREGS)
604 goto bad_match_register;
605 bufp->uses_registers = 1;
606 opcode = Cmatch_memory;
607 goto store_opcode_and_arg;
610 int complement,prev,offset,range,firstchar;
615 offset = pattern_offset;
616 for (a = 0; a < 256/8; a++)
620 ch = translate[(unsigned char)ch];
626 ch = translate[(unsigned char)ch];
633 while (ch != '\135' || firstchar)
636 if (regexp_ansi_sequences && ch == '\134')
643 for (a = prev; a <= ch; a++)
644 SETBIT(pattern, offset, a);
649 if (prev != -1 && ch == '-')
653 SETBIT(pattern, offset, ch);
658 ch = translate[(unsigned char)ch];
661 SETBIT(pattern, offset, '-');
664 for (a = 0; a < 256/8; a++)
665 pattern[offset+a] ^= 0xff;
676 opcode = Csyntaxspec;
678 goto store_opcode_and_arg;
680 opcode = Cnotsyntaxspec;
682 goto store_opcode_and_arg;
693 opcode = Cnotwordbound;
697 opcode = Cemacs_at_dot;
699 case Remacs_syntaxspec:
702 ch = translate[(unsigned char)ch];
703 opcode = Csyntaxspec;
704 ch = syntax_spec_code[(unsigned char)ch];
705 goto store_opcode_and_arg;
706 case Remacs_notsyntaxspec:
709 ch = translate[(unsigned char)ch];
710 opcode = Cnotsyntaxspec;
711 ch = syntax_spec_code[(unsigned char)ch];
712 goto store_opcode_and_arg;
717 beginning_context = (op == Ropenpar || op == Ror);
719 if (starts_base != 0)
720 goto parenthesis_error;
721 assert(num_jumps == 0);
729 return "Badly placed special character";
733 return "Bad match register number";
737 return "Bad hexadecimal number";
741 return "Badly placed parenthesis";
745 return "Out of memory";
749 return "Regular expression ends prematurely";
753 return "Regular expression too complex";
760 #undef CURRENT_LEVEL_START
761 #undef SET_LEVEL_START
762 #undef PUSH_LEVEL_STARTS
763 #undef POP_LEVEL_STARTS
769 static void re_compile_fastmap_aux(code, pos, visited, can_be_null, fastmap)
770 char *code, *visited, *can_be_null, *fastmap;
773 int a, b, syntaxcode;
776 return; /* we have already been here */
796 syntaxcode = code[pos++];
797 for (a = 0; a < 256; a++)
798 if (SYNTAX(a) == syntaxcode)
802 syntaxcode = code[pos++];
803 for (a = 0; a < 256; a++)
804 if (SYNTAX(a) != syntaxcode)
809 if (*can_be_null == 0)
810 *can_be_null = 2; /* can match null, but only at end of buffer*/
813 for (a = 0; a < 256/8; a++)
814 if (code[pos + a] != 0)
815 for (b = 0; b < 8; b++)
816 if (code[pos + a] & (1 << b))
817 fastmap[(a << 3) + b] = 1;
821 fastmap[(unsigned char)code[pos]] = 1;
824 for (a = 0; a < 256; a++)
833 for (a = 0; a < 256; a++)
838 case Cdummy_failure_jump:
839 case Cupdate_failure_jump:
841 a = (unsigned char)code[pos++];
842 a |= (unsigned char)code[pos++] << 8;
843 pos += (int)(short)a;
846 /* argh... the regexp contains empty loops. This is not
847 good, as this may cause a failure stack overflow when
848 matching. Oh well. */
849 /* this path leads nowhere; pursue other paths. */
855 a = (unsigned char)code[pos++];
856 a |= (unsigned char)code[pos++] << 8;
857 a = pos + (int)(short)a;
858 re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
861 abort(); /* probably some opcode is missing from this switch */
866 static int re_do_compile_fastmap(buffer, used, pos, can_be_null, fastmap)
867 char *buffer, *fastmap, *can_be_null;
870 char small_visited[512], *visited;
872 if (used <= sizeof(small_visited))
873 visited = small_visited;
876 visited = malloc(used);
881 memset(fastmap, 0, 256);
882 memset(visited, 0, used);
883 re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
884 if (visited != small_visited)
889 void re_compile_fastmap(bufp)
892 if (!bufp->fastmap || bufp->fastmap_accurate)
894 assert(bufp->used > 0);
895 if (!re_do_compile_fastmap(bufp->buffer, bufp->used, 0, &bufp->can_be_null,
898 if (bufp->buffer[0] == Cbol)
899 bufp->anchor = 1; /* begline */
901 if (bufp->buffer[0] == Cbegbuf)
902 bufp->anchor = 2; /* begbuf */
904 bufp->anchor = 0; /* none */
905 bufp->fastmap_accurate = 1;
908 #define INITIAL_FAILURES 128 /* initial # failure points to allocate */
909 #define MAX_FAILURES 4100 /* max # of failure points before failing */
911 int re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop)
913 char *string1, *string2;
914 int size1, size2, pos, mstop;
915 regexp_registers_t regs;
917 struct failure_point { char *text, *partend, *code; }
918 *failure_stack_start, *failure_sp, *failure_stack_end,
919 initial_failure_stack[INITIAL_FAILURES];
920 char *code, *translate, *text, *textend, *partend, *part_2_end;
921 char *regstart_text[RE_NREGS], *regstart_partend[RE_NREGS];
922 char *regend_text[RE_NREGS], *regend_partend[RE_NREGS];
923 int a, b, ch, reg, regch, match_end;
924 char *regtext, *regpartend, *regtextend;
928 if (text == partend) \
930 if (text == textend) \
933 partend = part_2_end; \
937 #define NEXTCHAR(var) \
940 (var) = (unsigned char)*text++; \
942 (var) = (unsigned char)translate[(var)]; \
945 assert(pos >= 0 && size1 >= 0 && size2 >= 0 && mstop >= 0);
946 assert(mstop <= size1 + size2);
947 assert(pos <= mstop);
951 text = string1 + pos;
954 partend = string1 + mstop;
959 partend = string1 + size1;
960 textend = string2 + mstop - size1;
962 part_2_end = string2 + mstop - size1;
966 text = string2 + pos - size1;
967 partend = string2 + mstop - size1;
969 part_2_end = partend;
972 if (bufp->uses_registers && regs != NULL)
973 for (a = 0; a < RE_NREGS; a++)
974 regend_text[a] = NULL;
977 translate = bufp->translate;
978 failure_stack_start = failure_sp = initial_failure_stack;
979 failure_stack_end = initial_failure_stack + INITIAL_FAILURES;
982 /* re_search_2 has already done this, and otherwise we get little benefit
983 from this. So I'll leave this out. */
984 if (bufp->fastmap_accurate && !bufp->can_be_null &&
986 !bufp->fastmap[translate ?
987 (unsigned char)translate[(unsigned char)*text] :
988 (unsigned char)*text])
989 return -1; /* it can't possibly match */
998 if (partend != part_2_end)
999 match_end = text - string1;
1001 match_end = text - string2 + size1;
1004 regs->start[0] = pos;
1005 regs->end[0] = match_end;
1006 if (!bufp->uses_registers)
1008 for (a = 1; a < RE_NREGS; a++)
1010 regs->start[a] = -1;
1016 for (a = 1; a < RE_NREGS; a++)
1018 if (regend_text[a] == NULL)
1020 regs->start[a] = -1;
1024 if (regstart_partend[a] != part_2_end)
1025 regs->start[a] = regstart_text[a] - string1;
1027 regs->start[a] = regstart_text[a] - string2 + size1;
1028 if (regend_partend[a] != part_2_end)
1029 regs->end[a] = regend_text[a] - string1;
1031 regs->end[a] = regend_text[a] - string2 + size1;
1035 if (failure_stack_start != initial_failure_stack)
1036 free((char *)failure_stack_start);
1037 return match_end - pos;
1039 if (text == string1 || text[-1] == '\n') /* text[-1] always valid */
1043 if (text == string2 + size2 ||
1044 (text == string1 + size1 ?
1045 (size2 == 0 || *string2 == '\n') :
1051 if (code[ch/8] & (1<<(ch & 7)))
1059 if (ch != (unsigned char)*code++)
1069 regstart_text[reg] = text;
1070 regstart_partend[reg] = partend;
1074 regend_text[reg] = text;
1075 regend_partend[reg] = partend;
1079 if (regend_text[reg] == NULL)
1080 goto fail; /* or should we just match nothing? */
1081 regtext = regstart_text[reg];
1082 regtextend = regend_text[reg];
1083 if (regstart_partend[reg] == regend_partend[reg])
1084 regpartend = regtextend;
1086 regpartend = string1 + size1;
1088 for (;regtext != regtextend;)
1091 if (regtext == regpartend)
1093 regch = (unsigned char)*regtext++;
1095 regch = (unsigned char)translate[regch];
1101 /* star is coded as:
1103 ... code for operand of star
1105 2: ... code after star
1106 We change the star_jump to update_failure_jump if we can determine
1107 that it is safe to do so; otherwise we change it to an ordinary
1112 2: ... code for operand of plus
1114 3: ... code after plus
1115 For star_jump considerations this is processed identically
1117 a = (unsigned char)*code++;
1118 a |= (unsigned char)*code++ << 8;
1121 char map[256], can_be_null;
1124 p1 = code + a + 3; /* skip the failure_jump */
1125 assert(p1[-3] == Cfailure_jump);
1127 /* p1 points inside loop, p2 points to after loop */
1128 if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
1129 p2 - bufp->buffer, &can_be_null, map))
1130 goto make_normal_jump;
1131 /* If we might introduce a new update point inside the loop,
1132 we can't optimize because then update_jump would update a
1133 wrong failure point. Thus we have to be quite careful here. */
1135 /* loop until we find something that consumes a character */
1155 ch = (unsigned char)*p1++;
1157 goto make_normal_jump;
1160 for (b = 0; b < 256; b++)
1161 if (b != '\n' && map[b])
1162 goto make_normal_jump;
1165 for (b = 0; b < 256; b++)
1166 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
1167 goto make_normal_jump;
1171 goto make_normal_jump;
1173 /* now we know that we can't backtrack. */
1174 while (p1 != p2 - 3)
1179 abort(); /* we certainly shouldn't get this inside loop */
1202 case Cnotsyntaxspec:
1208 case Cupdate_failure_jump:
1209 case Cdummy_failure_jump:
1210 goto make_normal_jump;
1212 printf("regexpr.c: processing star_jump: unknown op %d\n", p1[-1]);
1216 goto make_update_jump;
1219 /* printf("changing to normal jump\n"); */
1224 /* printf("changing to update jump\n"); */
1226 a += 3; /* jump to after the Cfailure_jump */
1227 code[-1] = Cupdate_failure_jump;
1230 /* fall to next case */
1231 case Cupdate_failure_jump:
1232 failure_sp[-1].text = text;
1233 failure_sp[-1].partend = partend;
1234 /* fall to next case */
1236 a = (unsigned char)*code++;
1237 a |= (unsigned char)*code++ << 8;
1238 code += (int)(short)a;
1240 case Cdummy_failure_jump:
1242 if (failure_sp == failure_stack_end)
1244 if (failure_stack_start != initial_failure_stack)
1246 failure_stack_start = (struct failure_point *)
1247 malloc(MAX_FAILURES * sizeof(*failure_stack_start));
1248 failure_stack_end = failure_stack_start + MAX_FAILURES;
1249 memcpy((char *)failure_stack_start, (char *)initial_failure_stack,
1250 INITIAL_FAILURES * sizeof(*failure_stack_start));
1251 failure_sp = failure_stack_start + INITIAL_FAILURES;
1253 a = (unsigned char)*code++;
1254 a |= (unsigned char)*code++ << 8;
1256 if (code[-3] == Cdummy_failure_jump)
1257 { /* this is only used in plus */
1258 assert(*code == Cfailure_jump);
1259 b = (unsigned char)code[1];
1260 b |= (unsigned char)code[2] << 8;
1261 failure_sp->code = code + (int)(short)b + 3;
1262 failure_sp->text = NULL;
1267 failure_sp->code = code + a;
1268 failure_sp->text = text;
1269 failure_sp->partend = partend;
1274 if (text == string1)
1278 if (size2 == 0 ? text == string1 + size1 : text == string2 + size2)
1282 if (text == string2 + size2)
1284 if (size2 == 0 && text == string1 + size1)
1286 if (SYNTAX(text == string1 + size1 ? *string1 : *text) != Sword)
1288 if (text == string1)
1290 if (SYNTAX(text[-1]) != Sword)
1294 if (text == string1)
1296 if (SYNTAX(text[-1]) != Sword)
1298 if (text == string2 + size2)
1300 if (size2 == 0 && text == string1 + size1)
1302 if (SYNTAX(*text) == Sword)
1306 /* Note: as in gnu regexp, this also matches at the beginning
1307 and end of buffer. */
1308 if (text == string1 || text == string2 + size2 ||
1309 (size2 == 0 && text == string1 + size1))
1311 if ((SYNTAX(text[-1]) == Sword) ^
1312 (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword))
1316 /* Note: as in gnu regexp, this never matches at the beginning
1317 and end of buffer. */
1318 if (text == string1 || text == string2 + size2 ||
1319 (size2 == 0 && text == string1 + size1))
1321 if (!((SYNTAX(text[-1]) == Sword) ^
1322 (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword)))
1327 if (SYNTAX(ch) != (unsigned char)*code++)
1330 case Cnotsyntaxspec:
1332 if (SYNTAX(ch) != (unsigned char)*code++)
1337 if (PTR_CHAR_POS((unsigned char *)text) + 1 != point)
1350 if (failure_sp != failure_stack_start)
1353 text = failure_sp->text;
1356 partend = failure_sp->partend;
1357 code = failure_sp->code;
1358 goto continue_matching;
1360 if (failure_stack_start != initial_failure_stack)
1361 free((char *)failure_stack_start);
1365 if (failure_stack_start != initial_failure_stack)
1366 free((char *)failure_stack_start);
1374 int re_match(bufp, string, size, pos, regs)
1378 regexp_registers_t regs;
1380 return re_match_2(bufp, string, size, (char *)NULL, 0, pos, regs, size);
1383 int re_search_2(bufp, string1, size1, string2, size2, pos, range, regs,
1386 char *string1, *string2;
1387 int size1, size2, pos, range, mstop;
1388 regexp_registers_t regs;
1390 char *fastmap, *translate, *text, *partstart, *partend;
1394 assert(size1 >= 0 && size2 >= 0 && pos >= 0 && mstop >= 0);
1395 assert(pos + range >= 0 && pos + range <= size1 + size2);
1396 assert(pos <= mstop);
1398 fastmap = bufp->fastmap;
1399 translate = bufp->translate;
1400 if (fastmap && !bufp->fastmap_accurate)
1401 re_compile_fastmap(bufp);
1402 anchor = bufp->anchor;
1403 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1418 for (; range >= 0; range--, pos += dir)
1423 { /* searching forwards */
1426 text = string1 + pos;
1427 if (pos + range > size1)
1428 partend = string1 + size1;
1430 partend = string1 + pos + range;
1434 text = string2 + pos - size1;
1435 partend = string2 + pos + range - size1;
1439 while (text != partend &&
1440 !fastmap[(unsigned char)
1441 translate[(unsigned char)*text]])
1444 while (text != partend && !fastmap[(unsigned char)*text])
1446 pos += text - partstart;
1447 range -= text - partstart;
1448 if (pos == size1 + size2 && bufp->can_be_null == 0)
1452 { /* searching backwards */
1455 text = string1 + pos;
1456 partstart = string1 + pos - range;
1460 text = string2 + pos - size1;
1461 if (range < pos - size1)
1462 partstart = string2 + pos - size1 - range;
1464 partstart = string2;
1468 while (text != partstart &&
1469 !fastmap[(unsigned char)
1470 translate[(unsigned char)*text]])
1473 while (text != partstart &&
1474 !fastmap[(unsigned char)*text])
1476 pos -= partend - text;
1477 range -= partend - text;
1481 { /* anchored to begline */
1483 (pos <= size1 ? string1[pos - 1] :
1484 string2[pos - size1 - 1]) != '\n')
1487 assert(pos >= 0 && pos <= size1 + size2);
1488 ret = re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop);
1497 int re_search(bufp, string, size, startpos, range, regs)
1500 int size, startpos, range;
1501 regexp_registers_t regs;
1503 return re_search_2(bufp, string, size, (char *)NULL, 0,
1504 startpos, range, regs, size);
1507 static struct re_pattern_buffer re_comp_buf;
1514 if (!re_comp_buf.buffer)
1515 return "Out of memory";
1518 if (!re_comp_buf.buffer)
1520 /* the buffer will be allocated automatically */
1521 re_comp_buf.fastmap = malloc(256);
1522 re_comp_buf.translate = NULL;
1524 return re_compile_pattern(s, strlen(s), &re_comp_buf);
1530 int len = strlen(s);
1532 return re_search(&re_comp_buf, s, len, 0, len, (regexp_registers_t)NULL) >= 0;
1535 /* POSIX Compatibility */
1537 int regcomp(regex_t *preg, const char *regex, int cflags)
1540 memset(preg, 0, sizeof(*preg));
1541 if (cflags & REG_EXTENDED)
1542 syntax |= (RE_CONTEXT_INDEP_OPS | RE_NO_BK_PARENS | RE_NO_BK_VBAR);
1543 re_set_syntax(syntax);
1544 if (re_compile_pattern((char *)regex, strlen(regex), preg) == NULL)
1549 int regexec(const regex_t *preg, const char *string, size_t nmatch,
1550 regmatch_t pmatch[], int eflags)
1552 int len = strlen(string);
1555 ret = re_search((regex_t *)preg, (char *)string, len, 0, len, (regexp_registers_t)NULL);
1562 size_t regerror(int errcode, const regex_t *preg, char *errbuf,
1568 void regfree(regex_t *preg)
1579 struct re_pattern_buffer exp;
1580 struct re_registers regs;
1586 exp.translate = NULL;
1587 exp.fastmap = fastmap;
1589 /* re_set_syntax(RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_ANSI_HEX); */
1593 printf("Enter regexp:\n");
1595 cp=re_compile_pattern(buf, strlen(buf), &exp);
1598 printf("Error: %s\n", cp);
1601 re_compile_fastmap(&exp);
1603 for (pos = 0; pos < exp.used;)
1605 printf("%d: ", pos);
1606 switch (exp.buffer[pos++])
1618 strcpy(buf, "set ");
1619 for (a = 0; a < 256/8; a++)
1620 sprintf(buf+strlen(buf)," %02x",
1621 (unsigned char)exp.buffer[pos++]);
1624 sprintf(buf, "exact '%c' 0x%x", exp.buffer[pos],
1625 (unsigned char)exp.buffer[pos]);
1629 strcpy(buf, "anychar");
1632 sprintf(buf, "start_memory %d", exp.buffer[pos++]);
1635 sprintf(buf, "end_memory %d", exp.buffer[pos++]);
1638 sprintf(buf, "match_memory %d", exp.buffer[pos++]);
1641 case Cdummy_failure_jump:
1644 case Cupdate_failure_jump:
1645 a = (unsigned char)exp.buffer[pos++];
1646 a += (unsigned char)exp.buffer[pos++] << 8;
1648 switch (exp.buffer[pos-3])
1657 cp = "failure_jump";
1659 case Cupdate_failure_jump:
1660 cp = "update_failure_jump";
1662 case Cdummy_failure_jump:
1663 cp = "dummy_failure_jump";
1666 cp = "unknown jump";
1669 sprintf(buf, "%s %d", cp, a + pos);
1672 strcpy(buf,"begbuf");
1675 strcpy(buf,"endbuf");
1678 strcpy(buf,"wordbeg");
1681 strcpy(buf,"wordend");
1684 strcpy(buf,"wordbound");
1687 strcpy(buf,"notwordbound");
1690 sprintf(buf, "unknown code %d",
1691 (unsigned char)exp.buffer[pos - 1]);
1694 printf("%s\n", buf);
1696 printf("can_be_null = %d uses_registers = %d anchor = %d\n",
1697 exp.can_be_null, exp.uses_registers, exp.anchor);
1700 for (a = 0; a < 256; a++)
1704 printf("Enter strings. An empty line terminates.\n");
1705 while (fgets(buf, sizeof(buf), stdin))
1709 a = re_search(&exp, buf, strlen(buf), 0, strlen(buf), ®s);
1710 printf("search returns %d\n", a);
1713 for (a = 0; a < RE_NREGS; a++)
1715 printf("buf %d: %d to %d\n", a, regs.start[a], regs.end[a]);
1722 #endif /* TEST_REGEXP */