5 Author: Tatu Ylonen <ylo@ngs.fi>
7 Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
9 Permission to use, copy, modify, distribute, and sell this software
10 and its documentation is hereby granted without fee, provided that the
11 above copyright notice appears in all source code copies, the name of
12 Tatu Ylonen is not used to advertise products containing this software
13 or a derivation thereof, and all modified versions are clearly marked
16 This software is provided "as is" without express or implied warranty.
18 Created: Thu Sep 26 17:14:05 1991 ylo
19 Last modified: Sun Mar 29 16:47:31 1992 ylo
21 This code draws many ideas from the regular expression packages by
22 Henry Spencer of the University of Toronto and Richard Stallman of the
23 Free Software Foundation.
25 Emacs-specific code and syntax table code is almost directly borrowed
28 The SILC Regex API and modifications by Pekka Riikonen, under the same
29 license as the original code.
35 /* Modified for use in SILC Runtime Toolkit. I think we have disabled many
36 features we could use, for the sake of simple API, which we may want to
37 extend later. But, we've added RE_NOTBOL and RE_NOTEOL. */
39 #define RE_NREGS 128 /* number of registers available */
41 /* bit definitions for syntax */
42 #define RE_NO_BK_PARENS 1 /* no quoting for parentheses */
43 #define RE_NO_BK_VBAR 2 /* no quoting for vertical bar */
44 #define RE_BK_PLUS_QM 4 /* quoting needed for + and ? */
45 #define RE_TIGHT_VBAR 8 /* | binds tighter than ^ and $ */
46 #define RE_NEWLINE_OR 16 /* treat newline as or */
47 #define RE_CONTEXT_INDEP_OPS 32 /* ^$?*+ are special in all contexts */
48 #define RE_ANSI_HEX 64 /* ansi sequences (\n etc) and \xhh */
49 #define RE_NO_GNU_EXTENSIONS 128 /* no gnu extensions */
50 #define RE_NOTBOL 256 /* bol fails to match */
51 #define RE_NOTEOL 512 /* eol fails to match */
53 /* definitions for some common regexp styles */
54 #define RE_SYNTAX_AWK (RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_CONTEXT_INDEP_OPS)
55 #define RE_SYNTAX_EGREP (RE_SYNTAX_AWK|RE_NEWLINE_OR)
56 #define RE_SYNTAX_GREP (RE_BK_PLUS_QM|RE_NEWLINE_OR)
57 #define RE_SYNTAX_EMACS 0
60 typedef struct re_registers {
61 int start[RE_NREGS]; /* start offset of region */
62 int end[RE_NREGS]; /* end offset of region */
63 } *regexp_registers_t;
65 int re_set_syntax(int syntax);
66 /* This sets the syntax to use and returns the previous syntax. The
67 syntax is specified by a bit mask of the above defined bits. */
69 SilcResult re_compile_pattern(char *regex, int regex_size, SilcRegex compiled);
70 /* This compiles the regexp (given in regex and length in regex_size).
71 This returns NULL if the regexp compiled successfully, and an error
72 message if an error was encountered. The buffer field must be
73 initialized to a memory area allocated by malloc (or to NULL) before
74 use, and the allocated field must be set to its length (or 0 if buffer is
75 NULL). Also, the translate field must be set to point to a valid
76 translation table, or NULL if it is not used. */
78 int re_match(SilcRegex compiled, char *string, int size, int pos,
79 regexp_registers_t regs, unsigned int flags);
80 /* This tries to match the regexp against the string. This returns the
81 length of the matched portion, or -1 if the pattern could not be
82 matched and -2 if an error (such as failure stack overflow) is
85 int re_match_2(SilcRegex compiled, char *string1, int size1,
86 char *string2, int size2, int pos, regexp_registers_t regs,
87 int mstop, unsigned int flags);
88 /* This tries to match the regexp to the concatenation of string1 and
89 string2. This returns the length of the matched portion, or -1 if the
90 pattern could not be matched and -2 if an error (such as failure stack
91 overflow) is encountered. */
93 int re_search(SilcRegex compiled, char *string, int size, int startpos,
94 int range, regexp_registers_t regs, unsigned int flags);
95 /* This rearches for a substring matching the regexp. This returns the first
96 index at which a match is found. range specifies at how many positions to
97 try matching; positive values indicate searching forwards, and negative
98 values indicate searching backwards. mstop specifies the offset beyond
99 which a match must not go. This returns -1 if no match is found, and
100 -2 if an error (such as failure stack overflow) is encountered. */
102 int re_search_2(SilcRegex compiled, char *string1, int size1,
103 char *string2, int size2, int startpos, int range,
104 regexp_registers_t regs, int mstop, unsigned int flags);
105 /* This is like re_search, but search from the concatenation of string1 and
108 void re_compile_fastmap(SilcRegex compiled);
109 /* This computes the fastmap for the regexp. For this to have any effect,
110 the calling program must have initialized the fastmap field to point
111 to an array of 256 characters. */
113 #define MACRO_BEGIN do {
114 #define MACRO_END } while (0)
116 enum regexp_compiled_ops /* opcodes for compiled regexp */
118 Cend, /* end of pattern reached */
119 Cbol, /* beginning of line */
120 Ceol, /* end of line */
121 Cset, /* character set. Followed by 32 bytes of set. */
122 Cexact, /* followed by a byte to match */
123 Canychar, /* matches any character except newline */
124 Cstart_memory, /* set register start addr (followed by reg number) */
125 Cend_memory, /* set register end addr (followed by reg number) */
126 Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
127 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
128 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
129 Cfailure_jump, /* jump to addr on failure */
130 Cupdate_failure_jump, /* update topmost failure point and jump */
131 Cdummy_failure_jump, /* push a dummy failure point and jump */
132 Cbegbuf, /* match at beginning of buffer */
133 Cendbuf, /* match at end of buffer */
134 Cwordbeg, /* match at beginning of word */
135 Cwordend, /* match at end of word */
136 Cwordbound, /* match if at word boundary */
137 Cnotwordbound, /* match if not at word boundary */
139 Cemacs_at_dot, /* emacs only: matches at dot */
141 Csyntaxspec, /* matches syntax code (1 byte follows) */
142 Cnotsyntaxspec /* matches if syntax code does not match (1 byte foll)*/
145 enum regexp_syntax_op /* syntax codes for plain and quoted characters */
147 Rend, /* special code for end of regexp */
148 Rnormal, /* normal character */
149 Ranychar, /* any character except newline */
150 Rquote, /* the quote character */
151 Rbol, /* match beginning of line */
152 Reol, /* match end of line */
153 Roptional, /* match preceding expression optionally */
154 Rstar, /* match preceding expr zero or more times */
155 Rplus, /* match preceding expr one or more times */
156 Ror, /* match either of alternatives */
157 Ropenpar, /* opening parenthesis */
158 Rclosepar, /* closing parenthesis */
159 Rmemory, /* match memory register */
160 Rextended_memory, /* \vnn to match registers 10-99 */
161 Ropenset, /* open set. Internal syntax hard-coded below. */
162 /* the following are gnu extensions to "normal" regexp syntax */
163 Rbegbuf, /* beginning of buffer */
164 Rendbuf, /* end of buffer */
165 Rwordchar, /* word character */
166 Rnotwordchar, /* not word character */
167 Rwordbeg, /* beginning of word */
168 Rwordend, /* end of word */
169 Rwordbound, /* word bound */
170 Rnotwordbound, /* not word bound */
172 Remacs_at_dot, /* emacs: at dot */
173 Remacs_syntaxspec, /* syntaxspec */
174 Remacs_notsyntaxspec, /* notsyntaxspec */
179 static int re_compile_initialized = 0;
180 static int regexp_syntax = 0;
181 static unsigned char regexp_plain_ops[256];
182 static unsigned char regexp_quoted_ops[256];
183 static unsigned char regexp_precedences[Rnum_ops];
184 static int regexp_context_indep_ops;
185 static int regexp_ansi_sequences;
187 #define NUM_LEVELS 5 /* number of precedence levels in use */
188 #define MAX_NESTING 100 /* max nesting level of operators */
192 /* This code is for emacs compatibility only. */
199 /* emacs defines NULL in some strange way? */
205 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
209 char *re_syntax_table;
211 static char re_syntax_table[256];
212 #endif /* SYNTAX_TABLE */
216 static void re_compile_initialize()
220 #if !defined(emacs) && !defined(SYNTAX_TABLE)
221 static int syntax_table_inited = 0;
223 if (!syntax_table_inited)
225 syntax_table_inited = 1;
226 memset(re_syntax_table, 0, 256);
227 for (a = 'a'; a <= 'z'; a++)
228 re_syntax_table[a] = Sword;
229 for (a = 'A'; a <= 'Z'; a++)
230 re_syntax_table[a] = Sword;
231 for (a = '0'; a <= '9'; a++)
232 re_syntax_table[a] = Sword;
234 #endif /* !emacs && !SYNTAX_TABLE */
235 re_compile_initialized = 1;
236 for (a = 0; a < 256; a++)
238 regexp_plain_ops[a] = Rnormal;
239 regexp_quoted_ops[a] = Rnormal;
241 for (a = '0'; a <= '9'; a++)
242 regexp_quoted_ops[a] = Rmemory;
243 regexp_plain_ops['\134'] = Rquote;
244 if (regexp_syntax & RE_NO_BK_PARENS)
246 regexp_plain_ops['('] = Ropenpar;
247 regexp_plain_ops[')'] = Rclosepar;
251 regexp_quoted_ops['('] = Ropenpar;
252 regexp_quoted_ops[')'] = Rclosepar;
254 if (regexp_syntax & RE_NO_BK_VBAR)
255 regexp_plain_ops['\174'] = Ror;
257 regexp_quoted_ops['\174'] = Ror;
258 regexp_plain_ops['*'] = Rstar;
259 if (regexp_syntax & RE_BK_PLUS_QM)
261 regexp_quoted_ops['+'] = Rplus;
262 regexp_quoted_ops['?'] = Roptional;
266 regexp_plain_ops['+'] = Rplus;
267 regexp_plain_ops['?'] = Roptional;
269 if (regexp_syntax & RE_NEWLINE_OR)
270 regexp_plain_ops['\n'] = Ror;
271 regexp_plain_ops['\133'] = Ropenset;
272 regexp_plain_ops['\136'] = Rbol;
273 regexp_plain_ops['$'] = Reol;
274 regexp_plain_ops['.'] = Ranychar;
275 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
278 regexp_quoted_ops['='] = Remacs_at_dot;
279 regexp_quoted_ops['s'] = Remacs_syntaxspec;
280 regexp_quoted_ops['S'] = Remacs_notsyntaxspec;
282 regexp_quoted_ops['w'] = Rwordchar;
283 regexp_quoted_ops['W'] = Rnotwordchar;
284 regexp_quoted_ops['<'] = Rwordbeg;
285 regexp_quoted_ops['>'] = Rwordend;
286 regexp_quoted_ops['b'] = Rwordbound;
287 regexp_quoted_ops['B'] = Rnotwordbound;
288 regexp_quoted_ops['`'] = Rbegbuf;
289 regexp_quoted_ops['\''] = Rendbuf;
291 if (regexp_syntax & RE_ANSI_HEX)
292 regexp_quoted_ops['v'] = Rextended_memory;
293 for (a = 0; a < Rnum_ops; a++)
294 regexp_precedences[a] = 4;
295 if (regexp_syntax & RE_TIGHT_VBAR)
297 regexp_precedences[Ror] = 3;
298 regexp_precedences[Rbol] = 2;
299 regexp_precedences[Reol] = 2;
303 regexp_precedences[Ror] = 2;
304 regexp_precedences[Rbol] = 3;
305 regexp_precedences[Reol] = 3;
307 regexp_precedences[Rclosepar] = 1;
308 regexp_precedences[Rend] = 0;
309 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
310 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
313 int re_set_syntax(syntax)
319 regexp_syntax = syntax;
320 re_compile_initialize();
324 static int hex_char_to_decimal(ch)
327 if (ch >= '0' && ch <= '9')
329 if (ch >= 'a' && ch <= 'f')
330 return ch - 'a' + 10;
331 if (ch >= 'A' && ch <= 'F')
332 return ch - 'A' + 10;
336 SilcResult re_compile_pattern(regex, size, bufp)
341 int a, pos, op, current_level, level, opcode;
342 int pattern_offset = 0, alloc;
343 int starts[NUM_LEVELS * MAX_NESTING], starts_base;
344 int future_jumps[MAX_NESTING], num_jumps;
345 unsigned char ch = 0;
346 char *pattern, *translate;
347 int next_register, paren_depth, num_open_registers, open_registers[RE_NREGS];
348 int beginning_context;
350 #define NEXTCHAR(var) \
353 goto ends_prematurely; \
354 (var) = regex[pos]; \
358 #define ALLOC(amount) \
360 if (pattern_offset+(amount) > alloc) \
362 alloc += 256 + (amount); \
363 pattern = silc_realloc(pattern, alloc); \
365 goto out_of_memory; \
369 #define STORE(ch) pattern[pattern_offset++] = (ch)
371 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
373 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
375 #define PUSH_LEVEL_STARTS if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
376 starts_base += NUM_LEVELS; \
380 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
382 #define PUT_ADDR(offset,addr) \
384 int disp = (addr) - (offset) - 2; \
385 pattern[(offset)] = disp & 0xff; \
386 pattern[(offset)+1] = (disp>>8) & 0xff; \
389 #define INSERT_JUMP(pos,type,addr) \
391 int a, p = (pos), t = (type), ad = (addr); \
392 for (a = pattern_offset - 1; a >= p; a--) \
393 pattern[a + 3] = pattern[a]; \
396 pattern_offset += 3; \
399 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
403 bufp->allocated = alloc; \
404 bufp->buffer = pattern; \
405 bufp->used = pattern_offset; \
408 #define GETHEX(var) \
410 char gethex_ch, gethex_value; \
411 NEXTCHAR(gethex_ch); \
412 gethex_value = hex_char_to_decimal(gethex_ch); \
413 if (gethex_value == 16) \
415 NEXTCHAR(gethex_ch); \
416 gethex_ch = hex_char_to_decimal(gethex_ch); \
417 if (gethex_ch == 16) \
419 (var) = gethex_value * 16 + gethex_ch; \
422 #define ANSI_TRANSLATE(ch) \
428 ch = 7; /* audible bell */ \
432 ch = 8; /* backspace */ \
436 ch = 12; /* form feed */ \
440 ch = 10; /* line feed */ \
444 ch = 13; /* carriage return */ \
452 ch = 11; /* vertical tab */ \
454 case 'x': /* hex code */ \
459 /* other characters passed through */ \
461 ch = translate[(unsigned char)ch]; \
466 if (!re_compile_initialized)
467 re_compile_initialize();
469 bufp->fastmap_accurate = 0;
470 bufp->uses_registers = 0;
471 translate = bufp->translate;
472 pattern = bufp->buffer;
473 alloc = bufp->allocated;
474 if (alloc == 0 || pattern == NULL)
477 pattern = silc_malloc(alloc);
486 num_open_registers = 0;
489 beginning_context = 1;
491 /* we use Rend dummy to ensure that pending jumps are updated (due to
492 low priority of Rend) before exiting the loop. */
502 ch = translate[(unsigned char)ch];
503 op = regexp_plain_ops[(unsigned char)ch];
507 op = regexp_quoted_ops[(unsigned char)ch];
508 if (op == Rnormal && regexp_ansi_sequences)
512 level = regexp_precedences[op];
513 /* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
514 ch, op, level, current_level, CURRENT_LEVEL_START); */
515 if (level > current_level)
517 for (current_level++; current_level < level; current_level++)
522 if (level < current_level)
524 current_level = level;
525 for (;num_jumps > 0 &&
526 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
528 PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
537 store_opcode_and_arg: /* opcode & ch must be set */
554 if (!beginning_context) {
555 if (regexp_context_indep_ops)
563 if (!((pos >= size) ||
564 ((regexp_syntax & RE_NO_BK_VBAR) ?
565 (regex[pos] == '\174') :
566 (pos+1 < size && regex[pos] == '\134' &&
567 regex[pos+1] == '\174')) ||
568 ((regexp_syntax & RE_NO_BK_PARENS)?
570 (pos+1 < size && regex[pos] == '\134' &&
571 regex[pos+1] == ')')))) {
572 if (regexp_context_indep_ops)
581 if (beginning_context) {
582 if (regexp_context_indep_ops)
587 if (CURRENT_LEVEL_START == pattern_offset)
588 break; /* ignore empty patterns for ? */
590 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
595 if (beginning_context) {
596 if (regexp_context_indep_ops)
601 if (CURRENT_LEVEL_START == pattern_offset)
602 break; /* ignore empty patterns for + and * */
604 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
606 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
607 if (op == Rplus) /* jump over initial failure_jump */
608 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
609 CURRENT_LEVEL_START + 6);
613 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
615 if (num_jumps >= MAX_NESTING)
618 future_jumps[num_jumps++] = pattern_offset;
625 if (next_register < RE_NREGS)
627 bufp->uses_registers = 1;
629 STORE(Cstart_memory);
630 STORE(next_register);
631 open_registers[num_open_registers++] = next_register;
640 if (paren_depth <= 0)
641 goto parenthesis_error;
643 current_level = regexp_precedences[Ropenpar];
645 if (paren_depth < num_open_registers)
647 bufp->uses_registers = 1;
650 num_open_registers--;
651 STORE(open_registers[num_open_registers]);
656 goto bad_match_register;
657 assert(ch >= '0' && ch <= '9');
658 bufp->uses_registers = 1;
659 opcode = Cmatch_memory;
661 goto store_opcode_and_arg;
662 case Rextended_memory:
664 if (ch < '0' || ch > '9')
665 goto bad_match_register;
667 if (a < '0' || a > '9')
668 goto bad_match_register;
669 ch = 10 * (a - '0') + ch - '0';
670 if (ch <= 0 || ch >= RE_NREGS)
671 goto bad_match_register;
672 bufp->uses_registers = 1;
673 opcode = Cmatch_memory;
674 goto store_opcode_and_arg;
677 int complement,prev,offset,range,firstchar;
682 offset = pattern_offset;
683 for (a = 0; a < 256/8; a++)
687 ch = translate[(unsigned char)ch];
693 ch = translate[(unsigned char)ch];
700 while (ch != '\135' || firstchar)
703 if (regexp_ansi_sequences && ch == '\134')
710 for (a = prev; a <= ch; a++)
711 SETBIT(pattern, offset, a);
716 if (prev != -1 && ch == '-')
720 SETBIT(pattern, offset, ch);
725 ch = translate[(unsigned char)ch];
728 SETBIT(pattern, offset, '-');
731 for (a = 0; a < 256/8; a++)
732 pattern[offset+a] ^= 0xff;
743 opcode = Csyntaxspec;
745 goto store_opcode_and_arg;
747 opcode = Cnotsyntaxspec;
749 goto store_opcode_and_arg;
760 opcode = Cnotwordbound;
764 opcode = Cemacs_at_dot;
766 case Remacs_syntaxspec:
769 ch = translate[(unsigned char)ch];
770 opcode = Csyntaxspec;
771 ch = syntax_spec_code[(unsigned char)ch];
772 goto store_opcode_and_arg;
773 case Remacs_notsyntaxspec:
776 ch = translate[(unsigned char)ch];
777 opcode = Cnotsyntaxspec;
778 ch = syntax_spec_code[(unsigned char)ch];
779 goto store_opcode_and_arg;
784 beginning_context = (op == Ropenpar || op == Ror);
786 if (starts_base != 0)
787 goto parenthesis_error;
788 assert(num_jumps == 0);
796 return SILC_ERR_REGEX_SPECIAL;
800 return SILC_ERR_REGEX_REG;
804 return SILC_ERR_REGEX_HEX;
808 return SILC_ERR_REGEX_PAREN;
812 return SILC_ERR_OUT_OF_MEMORY;
816 return SILC_ERR_OVERFLOW;
820 return SILC_ERR_REGEX_TOO_COMPLEX;
827 #undef CURRENT_LEVEL_START
828 #undef SET_LEVEL_START
829 #undef PUSH_LEVEL_STARTS
830 #undef POP_LEVEL_STARTS
836 static void re_compile_fastmap_aux(code, pos, visited, can_be_null, fastmap)
837 char *code, *visited, *can_be_null, *fastmap;
840 int a, b, syntaxcode;
843 return; /* we have already been here */
863 syntaxcode = code[pos++];
864 for (a = 0; a < 256; a++)
865 if (SYNTAX(a) == syntaxcode)
869 syntaxcode = code[pos++];
870 for (a = 0; a < 256; a++)
871 if (SYNTAX(a) != syntaxcode)
876 if (*can_be_null == 0)
877 *can_be_null = 2; /* can match null, but only at end of buffer*/
880 for (a = 0; a < 256/8; a++)
881 if (code[pos + a] != 0)
882 for (b = 0; b < 8; b++)
883 if (code[pos + a] & (1 << b))
884 fastmap[(a << 3) + b] = 1;
888 fastmap[(unsigned char)code[pos]] = 1;
891 for (a = 0; a < 256; a++)
900 for (a = 0; a < 256; a++)
905 case Cdummy_failure_jump:
906 case Cupdate_failure_jump:
908 a = (unsigned char)code[pos++];
909 a |= (unsigned char)code[pos++] << 8;
910 pos += (int)(short)a;
913 /* argh... the regexp contains empty loops. This is not
914 good, as this may cause a failure stack overflow when
915 matching. Oh well. */
916 /* this path leads nowhere; pursue other paths. */
922 a = (unsigned char)code[pos++];
923 a |= (unsigned char)code[pos++] << 8;
924 a = pos + (int)(short)a;
925 re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
928 abort(); /* probably some opcode is missing from this switch */
933 static int re_do_compile_fastmap(buffer, used, pos, can_be_null, fastmap)
934 char *buffer, *fastmap, *can_be_null;
937 char small_visited[512], *visited;
939 if (used <= sizeof(small_visited))
940 visited = small_visited;
943 visited = silc_malloc(used);
948 memset(fastmap, 0, 256);
949 memset(visited, 0, used);
950 re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
951 if (visited != small_visited)
956 void re_compile_fastmap(bufp)
959 if (!bufp->fastmap || bufp->fastmap_accurate)
961 assert(bufp->used > 0);
962 if (!re_do_compile_fastmap(bufp->buffer, bufp->used, 0, &bufp->can_be_null,
965 if (bufp->buffer[0] == Cbol)
966 bufp->anchor = 1; /* begline */
968 if (bufp->buffer[0] == Cbegbuf)
969 bufp->anchor = 2; /* begbuf */
971 bufp->anchor = 0; /* none */
972 bufp->fastmap_accurate = 1;
975 #define INITIAL_FAILURES 128 /* initial # failure points to allocate */
976 #define MAX_FAILURES 4100 /* max # of failure points before failing */
978 int re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop, flags)
980 char *string1, *string2;
981 int size1, size2, pos, mstop;
982 regexp_registers_t regs;
985 struct failure_point { char *text, *partend, *code; }
986 *failure_stack_start, *failure_sp, *failure_stack_end,
987 initial_failure_stack[INITIAL_FAILURES];
988 char *code, *translate, *text, *textend, *partend, *part_2_end;
989 char *regstart_text[RE_NREGS], *regstart_partend[RE_NREGS];
990 char *regend_text[RE_NREGS], *regend_partend[RE_NREGS];
991 int a, b, ch, reg, regch, match_end;
992 char *regtext, *regpartend, *regtextend;
996 if (text == partend) \
998 if (text == textend) \
1001 partend = part_2_end; \
1005 #define NEXTCHAR(var) \
1008 (var) = (unsigned char)*text++; \
1010 (var) = (unsigned char)translate[(var)]; \
1013 assert(pos >= 0 && size1 >= 0 && size2 >= 0 && mstop >= 0);
1014 assert(mstop <= size1 + size2);
1015 assert(pos <= mstop);
1019 text = string1 + pos;
1022 partend = string1 + mstop;
1027 partend = string1 + size1;
1028 textend = string2 + mstop - size1;
1030 part_2_end = string2 + mstop - size1;
1034 text = string2 + pos - size1;
1035 partend = string2 + mstop - size1;
1037 part_2_end = partend;
1040 if (bufp->uses_registers && regs != NULL)
1041 for (a = 0; a < RE_NREGS; a++)
1042 regend_text[a] = NULL;
1044 code = bufp->buffer;
1045 translate = bufp->translate;
1046 failure_stack_start = failure_sp = initial_failure_stack;
1047 failure_stack_end = initial_failure_stack + INITIAL_FAILURES;
1050 /* re_search_2 has already done this, and otherwise we get little benefit
1051 from this. So I'll leave this out. */
1052 if (bufp->fastmap_accurate && !bufp->can_be_null &&
1054 !bufp->fastmap[translate ?
1055 (unsigned char)translate[(unsigned char)*text] :
1056 (unsigned char)*text])
1057 return -1; /* it can't possibly match */
1066 if (partend != part_2_end)
1067 match_end = text - string1;
1069 match_end = text - string2 + size1;
1072 regs->start[0] = pos;
1073 regs->end[0] = match_end;
1074 if (!bufp->uses_registers)
1076 for (a = 1; a < RE_NREGS; a++)
1078 regs->start[a] = -1;
1084 for (a = 1; a < RE_NREGS; a++)
1086 if (regend_text[a] == NULL)
1088 regs->start[a] = -1;
1092 if (regstart_partend[a] != part_2_end)
1093 regs->start[a] = regstart_text[a] - string1;
1095 regs->start[a] = regstart_text[a] - string2 + size1;
1096 if (regend_partend[a] != part_2_end)
1097 regs->end[a] = regend_text[a] - string1;
1099 regs->end[a] = regend_text[a] - string2 + size1;
1103 if (failure_stack_start != initial_failure_stack)
1104 silc_free((char *)failure_stack_start);
1105 return match_end - pos;
1107 if (text == string1 || text[-1] == '\n') { /* text[-1] always valid */
1108 if (flags & RE_NOTBOL)
1114 if (text == string2 + size2 ||
1115 (text == string1 + size1 ?
1116 (size2 == 0 || *string2 == '\n') :
1118 if (flags & RE_NOTEOL)
1125 if (code[ch/8] & (1<<(ch & 7)))
1133 if (ch != (unsigned char)*code++)
1143 regstart_text[reg] = text;
1144 regstart_partend[reg] = partend;
1148 regend_text[reg] = text;
1149 regend_partend[reg] = partend;
1153 if (regend_text[reg] == NULL)
1154 goto fail; /* or should we just match nothing? */
1155 regtext = regstart_text[reg];
1156 regtextend = regend_text[reg];
1157 if (regstart_partend[reg] == regend_partend[reg])
1158 regpartend = regtextend;
1160 regpartend = string1 + size1;
1162 for (;regtext != regtextend;)
1165 if (regtext == regpartend)
1167 regch = (unsigned char)*regtext++;
1169 regch = (unsigned char)translate[regch];
1175 /* star is coded as:
1177 ... code for operand of star
1179 2: ... code after star
1180 We change the star_jump to update_failure_jump if we can determine
1181 that it is safe to do so; otherwise we change it to an ordinary
1186 2: ... code for operand of plus
1188 3: ... code after plus
1189 For star_jump considerations this is processed identically
1191 a = (unsigned char)*code++;
1192 a |= (unsigned char)*code++ << 8;
1195 char map[256], can_be_null;
1198 p1 = code + a + 3; /* skip the failure_jump */
1199 assert(p1[-3] == Cfailure_jump);
1201 /* p1 points inside loop, p2 points to after loop */
1202 if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
1203 p2 - bufp->buffer, &can_be_null, map))
1204 goto make_normal_jump;
1205 /* If we might introduce a new update point inside the loop,
1206 we can't optimize because then update_jump would update a
1207 wrong failure point. Thus we have to be quite careful here. */
1209 /* loop until we find something that consumes a character */
1229 ch = (unsigned char)*p1++;
1231 goto make_normal_jump;
1234 for (b = 0; b < 256; b++)
1235 if (b != '\n' && map[b])
1236 goto make_normal_jump;
1239 for (b = 0; b < 256; b++)
1240 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
1241 goto make_normal_jump;
1245 goto make_normal_jump;
1247 /* now we know that we can't backtrack. */
1248 while (p1 != p2 - 3)
1253 abort(); /* we certainly shouldn't get this inside loop */
1276 case Cnotsyntaxspec:
1282 case Cupdate_failure_jump:
1283 case Cdummy_failure_jump:
1284 goto make_normal_jump;
1286 printf("regexpr.c: processing star_jump: unknown op %d\n", p1[-1]);
1290 goto make_update_jump;
1293 /* printf("changing to normal jump\n"); */
1298 /* printf("changing to update jump\n"); */
1300 a += 3; /* jump to after the Cfailure_jump */
1301 code[-1] = Cupdate_failure_jump;
1304 /* fall to next case */
1305 case Cupdate_failure_jump:
1306 failure_sp[-1].text = text;
1307 failure_sp[-1].partend = partend;
1308 /* fall to next case */
1310 a = (unsigned char)*code++;
1311 a |= (unsigned char)*code++ << 8;
1312 code += (int)(short)a;
1314 case Cdummy_failure_jump:
1316 if (failure_sp == failure_stack_end)
1318 if (failure_stack_start != initial_failure_stack)
1320 failure_stack_start = (struct failure_point *)
1321 silc_malloc(MAX_FAILURES * sizeof(*failure_stack_start));
1322 failure_stack_end = failure_stack_start + MAX_FAILURES;
1323 memcpy((char *)failure_stack_start, (char *)initial_failure_stack,
1324 INITIAL_FAILURES * sizeof(*failure_stack_start));
1325 failure_sp = failure_stack_start + INITIAL_FAILURES;
1327 a = (unsigned char)*code++;
1328 a |= (unsigned char)*code++ << 8;
1330 if (code[-3] == Cdummy_failure_jump)
1331 { /* this is only used in plus */
1332 assert(*code == Cfailure_jump);
1333 b = (unsigned char)code[1];
1334 b |= (unsigned char)code[2] << 8;
1335 failure_sp->code = code + (int)(short)b + 3;
1336 failure_sp->text = NULL;
1341 failure_sp->code = code + a;
1342 failure_sp->text = text;
1343 failure_sp->partend = partend;
1348 if (text == string1)
1352 if (size2 == 0 ? text == string1 + size1 : text == string2 + size2)
1356 if (text == string2 + size2)
1358 if (size2 == 0 && text == string1 + size1)
1360 if (SYNTAX(text == string1 + size1 ? *string1 : *text) != Sword)
1362 if (text == string1)
1364 if (SYNTAX(text[-1]) != Sword)
1368 if (text == string1)
1370 if (SYNTAX(text[-1]) != Sword)
1372 if (text == string2 + size2)
1374 if (size2 == 0 && text == string1 + size1)
1376 if (SYNTAX(*text) == Sword)
1380 /* Note: as in gnu regexp, this also matches at the beginning
1381 and end of buffer. */
1382 if (text == string1 || text == string2 + size2 ||
1383 (size2 == 0 && text == string1 + size1))
1385 if ((SYNTAX(text[-1]) == Sword) ^
1386 (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword))
1390 /* Note: as in gnu regexp, this never matches at the beginning
1391 and end of buffer. */
1392 if (text == string1 || text == string2 + size2 ||
1393 (size2 == 0 && text == string1 + size1))
1395 if (!((SYNTAX(text[-1]) == Sword) ^
1396 (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword)))
1401 if (SYNTAX(ch) != (unsigned char)*code++)
1404 case Cnotsyntaxspec:
1406 if (SYNTAX(ch) != (unsigned char)*code++)
1411 if (PTR_CHAR_POS((unsigned char *)text) + 1 != point)
1424 if (failure_sp != failure_stack_start)
1427 text = failure_sp->text;
1430 partend = failure_sp->partend;
1431 code = failure_sp->code;
1432 goto continue_matching;
1434 if (failure_stack_start != initial_failure_stack)
1435 silc_free((char *)failure_stack_start);
1439 if (failure_stack_start != initial_failure_stack)
1440 silc_free((char *)failure_stack_start);
1448 int re_match(bufp, string, size, pos, regs, flags)
1452 regexp_registers_t regs;
1455 return re_match_2(bufp, string, size, (char *)NULL, 0, pos, regs, size,
1459 int re_search_2(bufp, string1, size1, string2, size2, pos, range, regs,
1462 char *string1, *string2;
1463 int size1, size2, pos, range, mstop;
1464 regexp_registers_t regs;
1467 char *fastmap, *translate, *text, *partstart, *partend;
1471 assert(size1 >= 0 && size2 >= 0 && pos >= 0 && mstop >= 0);
1472 assert(pos + range >= 0 && pos + range <= size1 + size2);
1473 assert(pos <= mstop);
1475 fastmap = bufp->fastmap;
1476 translate = bufp->translate;
1477 if (fastmap && !bufp->fastmap_accurate)
1478 re_compile_fastmap(bufp);
1479 anchor = bufp->anchor;
1480 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1495 for (; range >= 0; range--, pos += dir)
1500 { /* searching forwards */
1503 text = string1 + pos;
1504 if (pos + range > size1)
1505 partend = string1 + size1;
1507 partend = string1 + pos + range;
1511 text = string2 + pos - size1;
1512 partend = string2 + pos + range - size1;
1516 while (text != partend &&
1517 !fastmap[(unsigned char)
1518 translate[(unsigned char)*text]])
1521 while (text != partend && !fastmap[(unsigned char)*text])
1523 pos += text - partstart;
1524 range -= text - partstart;
1525 if (pos == size1 + size2 && bufp->can_be_null == 0)
1529 { /* searching backwards */
1532 text = string1 + pos;
1533 partstart = string1 + pos - range;
1537 text = string2 + pos - size1;
1538 if (range < pos - size1)
1539 partstart = string2 + pos - size1 - range;
1541 partstart = string2;
1545 while (text != partstart &&
1546 !fastmap[(unsigned char)
1547 translate[(unsigned char)*text]])
1550 while (text != partstart &&
1551 !fastmap[(unsigned char)*text])
1553 pos -= partend - text;
1554 range -= partend - text;
1558 { /* anchored to begline */
1560 (pos <= size1 ? string1[pos - 1] :
1561 string2[pos - size1 - 1]) != '\n')
1564 assert(pos >= 0 && pos <= size1 + size2);
1565 ret = re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop,
1575 int re_search(bufp, string, size, startpos, range, regs, flags)
1578 int size, startpos, range;
1579 regexp_registers_t regs;
1582 return re_search_2(bufp, string, size, (char *)NULL, 0,
1583 startpos, range, regs, size, flags);
1586 /****************************** SILC Regex API ******************************/
1588 /* Compile regular expression */
1590 SilcBool silc_regex_compile(SilcRegex regexp, const char *regex,
1591 SilcRegexFlags flags)
1596 if (!regexp || !regex) {
1597 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
1601 memset(regexp, 0, sizeof(*regexp));
1604 syntax |= (RE_CONTEXT_INDEP_OPS | RE_NO_BK_PARENS | RE_NO_BK_VBAR);
1605 re_set_syntax(syntax);
1608 ret = re_compile_pattern((char *)regex, strlen(regex), regexp);
1610 silc_set_errno(ret);
1612 return ret == SILC_OK;
1615 /* Match compiled regular expression */
1617 SilcBool silc_regex_match(SilcRegex regexp, const char *string,
1618 SilcUInt32 string_len, SilcUInt32 num_match,
1619 SilcRegexMatch match, SilcRegexFlags flags)
1621 struct re_registers regs;
1625 if (!regexp || !string) {
1626 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
1630 if (num_match && !match) {
1631 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
1635 /* Internal limit for maximum number of registers */
1636 if (num_match > RE_NREGS)
1637 num_match = RE_NREGS;
1640 if (flags & SILC_REGEX_NOTBOL)
1642 if (flags & SILC_REGEX_NOTEOL)
1646 ret = re_search(regexp, (char *)string, string_len, 0, string_len,
1647 num_match ? ®s : NULL, f);
1650 silc_set_errno(SILC_ERR);
1652 silc_set_errno(SILC_ERR_NOT_FOUND);
1656 /* Return matches */
1657 for (i = 0; i < num_match; i++) {
1658 match[i].start = regs.start[i];
1659 match[i].end = regs.end[i];
1668 void silc_regex_free(SilcRegex regexp)
1670 silc_free(regexp->buffer);
1675 SilcBool silc_regex_va(const char *string, SilcUInt32 string_len,
1676 const char *regex, SilcBuffer match, va_list va)
1678 SilcRegexStruct reg;
1679 SilcRegexMatch m = NULL;
1680 SilcBuffer buf, *rets = NULL;
1684 if (!silc_regex_compile(®, regex, 0))
1687 /* Get match pointers */
1689 rets = silc_malloc(sizeof(*rets));
1694 while ((buf = va_arg(va, SilcBuffer))) {
1695 rets = silc_realloc(rets, (c + 1) * sizeof(*rets));
1701 m = silc_malloc(c * sizeof(*m));
1709 if (!silc_regex_match(®, string, string_len, c, m, 0)) {
1715 /* Return matches */
1716 for (i = 0; i < c; i++) {
1717 if (m[i].start == -1)
1719 silc_buffer_set(rets[i], (unsigned char *)string + m[i].start,
1720 m[i].end - m[i].start);
1731 SilcBool silc_regex(const char *string, const char *regex,
1732 SilcBuffer match, ...)
1737 if (!string || !regex) {
1738 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
1743 va_start(va, match);
1745 ret = silc_regex_va(string, strlen(string), regex, match, va);
1755 SilcBool silc_regex_buffer(SilcBuffer buffer, const char *regex,
1756 SilcBuffer match, ...)
1761 if (!buffer || !regex) {
1762 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
1767 va_start(va, match);
1769 ret = silc_regex_va((const char *)silc_buffer_data(buffer),
1770 silc_buffer_len(buffer), regex, match, va);