1 /* Extended regular expression matching and search library,
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993 Free Software Foundation, Inc.
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
22 /* AIX requires this to be the first thing in the file. */
23 #if defined (_AIX) && !defined (REGEX_MALLOC)
33 /* We need this for `regex.h', and perhaps for the Emacs include files. */
34 #include <sys/types.h>
45 /* The `emacs' switch turns on certain matching commands
46 that make sense only in Emacs. */
53 /* Emacs uses `NULL' as a predicate. */
58 /* We used to test for `BSTRING' here, but only GCC and Emacs define
59 `BSTRING', as far as I know, and neither of them use this code. */
60 #if HAVE_STRING_H || STDC_HEADERS
63 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
66 #define bcopy(s, d, n) memcpy ((d), (s), (n))
69 #define bzero(s, n) memset ((s), 0, (n))
77 /* Define the syntax stuff for \<, \>, etc. */
79 /* This must be nonzero for the wordchar and notwordchar pattern
80 commands in re_match_2. */
87 extern char *re_syntax_table;
89 #else /* not SYNTAX_TABLE */
91 /* How many characters in the character set. */
92 #define CHAR_SET_SIZE 256
94 static char re_syntax_table[CHAR_SET_SIZE];
105 bzero (re_syntax_table, sizeof re_syntax_table);
107 for (c = 'a'; c <= 'z'; c++)
108 re_syntax_table[c] = Sword;
110 for (c = 'A'; c <= 'Z'; c++)
111 re_syntax_table[c] = Sword;
113 for (c = '0'; c <= '9'; c++)
114 re_syntax_table[c] = Sword;
116 re_syntax_table['_'] = Sword;
121 #endif /* not SYNTAX_TABLE */
123 #define SYNTAX(c) re_syntax_table[c]
125 #endif /* not emacs */
127 /* Get the interface, including the syntax bits. */
130 /* isalpha etc. are used for the character classes. */
138 #define ISBLANK(c) (isascii (c) && isblank (c))
140 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
143 #define ISGRAPH(c) (isascii (c) && isgraph (c))
145 #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c))
148 #define ISPRINT(c) (isascii (c) && isprint (c))
149 #define ISDIGIT(c) (isascii (c) && isdigit (c))
150 #define ISALNUM(c) (isascii (c) && isalnum (c))
151 #define ISALPHA(c) (isascii (c) && isalpha (c))
152 #define ISCNTRL(c) (isascii (c) && iscntrl (c))
153 #define ISLOWER(c) (isascii (c) && islower (c))
154 #define ISPUNCT(c) (isascii (c) && ispunct (c))
155 #define ISSPACE(c) (isascii (c) && isspace (c))
156 #define ISUPPER(c) (isascii (c) && isupper (c))
157 #define ISXDIGIT(c) (isascii (c) && isxdigit (c))
163 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
164 since ours (we hope) works properly with all combinations of
165 machines, compilers, `char' and `unsigned char' argument types.
166 (Per Bothner suggested the basic approach.) */
167 #undef SIGN_EXTEND_CHAR
169 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
170 #else /* not __STDC__ */
171 /* As in Harbison and Steele. */
172 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
175 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
176 use `alloca' instead of `malloc'. This is because using malloc in
177 re_search* or re_match* could cause memory leaks when C-g is used in
178 Emacs; also, malloc is slower and causes storage fragmentation. On
179 the other hand, malloc is more portable, and easier to debug.
181 Because we sometimes use alloca, some routines have to be macros,
182 not functions -- `alloca'-allocated space disappears at the end of the
183 function it is called in. */
187 #define REGEX_ALLOCATE malloc
188 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
190 #else /* not REGEX_MALLOC */
192 /* Emacs already defines alloca, sometimes. */
195 /* Make alloca work the best possible way. */
197 #define alloca __builtin_alloca
198 #else /* not __GNUC__ */
201 #else /* not __GNUC__ or HAVE_ALLOCA_H */
202 #ifndef _AIX /* Already did AIX, up at the top. */
204 #endif /* not _AIX */
205 #endif /* not HAVE_ALLOCA_H */
206 #endif /* not __GNUC__ */
208 #endif /* not alloca */
210 #define REGEX_ALLOCATE alloca
212 /* Assumes a `char *destination' variable. */
213 #define REGEX_REALLOCATE(source, osize, nsize) \
214 (destination = (char *) alloca (nsize), \
215 bcopy (source, destination, osize), \
218 #endif /* not REGEX_MALLOC */
221 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
222 `string1' or just past its end. This works if PTR is NULL, which is
224 #define FIRST_STRING_P(ptr) \
225 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
227 /* (Re)Allocate N items of type T using malloc, or fail. */
228 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
229 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
230 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
232 #define BYTEWIDTH 8 /* In bits. */
234 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
236 #define MAX(a, b) ((a) > (b) ? (a) : (b))
237 #define MIN(a, b) ((a) < (b) ? (a) : (b))
239 typedef char boolean;
243 /* These are the command codes that appear in compiled regular
244 expressions. Some opcodes are followed by argument bytes. A
245 command code can specify any interpretation whatsoever for its
246 arguments. Zero bytes may appear in the compiled regular expression.
248 The value of `exactn' is needed in search.c (search_buffer) in Emacs.
249 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
250 `exactn' we use here must also be 1. */
256 /* Followed by one byte giving n, then by n literal bytes. */
259 /* Matches any (more or less) character. */
262 /* Matches any one char belonging to specified set. First
263 following byte is number of bitmap bytes. Then come bytes
264 for a bitmap saying which chars are in. Bits in each byte
265 are ordered low-bit-first. A character is in the set if its
266 bit is 1. A character too large to have a bit in the map is
267 automatically not in the set. */
270 /* Same parameters as charset, but match any character that is
271 not one of those specified. */
274 /* Start remembering the text that is matched, for storing in a
275 register. Followed by one byte with the register number, in
276 the range 0 to one less than the pattern buffer's re_nsub
277 field. Then followed by one byte with the number of groups
278 inner to this one. (This last has to be part of the
279 start_memory only because we need it in the on_failure_jump
283 /* Stop remembering the text that is matched and store it in a
284 memory register. Followed by one byte with the register
285 number, in the range 0 to one less than `re_nsub' in the
286 pattern buffer, and one byte with the number of inner groups,
287 just like `start_memory'. (We need the number of inner
288 groups here because we don't have any easy way of finding the
289 corresponding start_memory when we're at a stop_memory.) */
292 /* Match a duplicate of something remembered. Followed by one
293 byte containing the register number. */
296 /* Fail unless at beginning of line. */
299 /* Fail unless at end of line. */
302 /* Succeeds if at beginning of buffer (if emacs) or at beginning
303 of string to be matched (if not). */
306 /* Analogously, for end of buffer/string. */
309 /* Followed by two byte relative address to which to jump. */
312 /* Same as jump, but marks the end of an alternative. */
315 /* Followed by two-byte relative address of place to resume at
316 in case of failure. */
319 /* Like on_failure_jump, but pushes a placeholder instead of the
320 current string position when executed. */
321 on_failure_keep_string_jump,
323 /* Throw away latest failure point and then jump to following
324 two-byte relative address. */
327 /* Change to pop_failure_jump if know won't have to backtrack to
328 match; otherwise change to jump. This is used to jump
329 back to the beginning of a repeat. If what follows this jump
330 clearly won't match what the repeat does, such that we can be
331 sure that there is no use backtracking out of repetitions
332 already matched, then we change it to a pop_failure_jump.
333 Followed by two-byte address. */
336 /* Jump to following two-byte address, and push a dummy failure
337 point. This failure point will be thrown away if an attempt
338 is made to use it for a failure. A `+' construct makes this
339 before the first repeat. Also used as an intermediary kind
340 of jump when compiling an alternative. */
343 /* Push a dummy failure point and continue. Used at the end of
347 /* Followed by two-byte relative address and two-byte number n.
348 After matching N times, jump to the address upon failure. */
351 /* Followed by two-byte relative address, and two-byte number n.
352 Jump to the address N times, then fail. */
355 /* Set the following two-byte relative address to the
356 subsequent two-byte number. The address *includes* the two
360 wordchar, /* Matches any word-constituent character. */
361 notwordchar, /* Matches any char that is not a word-constituent. */
363 wordbeg, /* Succeeds if at word beginning. */
364 wordend, /* Succeeds if at word end. */
366 wordbound, /* Succeeds if at a word boundary. */
367 notwordbound /* Succeeds if not at a word boundary. */
370 ,before_dot, /* Succeeds if before point. */
371 at_dot, /* Succeeds if at point. */
372 after_dot, /* Succeeds if after point. */
374 /* Matches any character whose syntax is specified. Followed by
375 a byte which contains a syntax code, e.g., Sword. */
378 /* Matches any character whose syntax is not that specified. */
383 /* Common operations on the compiled pattern. */
385 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
387 #define STORE_NUMBER(destination, number) \
389 (destination)[0] = (number) & 0377; \
390 (destination)[1] = (number) >> 8; \
393 /* Same as STORE_NUMBER, except increment DESTINATION to
394 the byte after where the number is stored. Therefore, DESTINATION
395 must be an lvalue. */
397 #define STORE_NUMBER_AND_INCR(destination, number) \
399 STORE_NUMBER (destination, number); \
400 (destination) += 2; \
403 /* Put into DESTINATION a number stored in two contiguous bytes starting
406 #define EXTRACT_NUMBER(destination, source) \
408 (destination) = *(source) & 0377; \
409 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
414 extract_number (dest, source)
416 unsigned char *source;
418 int temp = SIGN_EXTEND_CHAR (*(source + 1));
419 *dest = *source & 0377;
423 #ifndef EXTRACT_MACROS /* To debug the macros. */
424 #undef EXTRACT_NUMBER
425 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
426 #endif /* not EXTRACT_MACROS */
430 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
431 SOURCE must be an lvalue. */
433 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
435 EXTRACT_NUMBER (destination, source); \
441 extract_number_and_incr (destination, source)
443 unsigned char **source;
445 extract_number (destination, *source);
449 #ifndef EXTRACT_MACROS
450 #undef EXTRACT_NUMBER_AND_INCR
451 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
452 extract_number_and_incr (&dest, &src)
453 #endif /* not EXTRACT_MACROS */
457 /* If DEBUG is defined, Regex prints many voluminous messages about what
458 it is doing (if the variable `debug' is nonzero). If linked with the
459 main program in `iregex.c', you can enter patterns and strings
460 interactively. And if linked with the main program in `main.c' and
461 the other test files, you can run the already-written tests. */
465 /* We use standard I/O for debugging. */
468 /* It is useful to test things that ``must'' be true when debugging. */
471 static int debug = 0;
473 #define DEBUG_STATEMENT(e) e
474 #define DEBUG_PRINT1(x) if (debug) printf (x)
475 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
476 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
477 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
478 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
479 if (debug) print_partial_compiled_pattern (s, e)
480 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
481 if (debug) print_double_string (w, s1, sz1, s2, sz2)
484 extern void printchar ();
486 /* Print the fastmap in human-readable form. */
489 print_fastmap (fastmap)
492 unsigned was_a_range = 0;
495 while (i < (1 << BYTEWIDTH))
501 while (i < (1 << BYTEWIDTH) && fastmap[i])
517 /* Print a compiled pattern string in human-readable form, starting at
518 the START pointer into it and ending just before the pointer END. */
521 print_partial_compiled_pattern (start, end)
522 unsigned char *start;
526 unsigned char *p = start;
527 unsigned char *pend = end;
535 /* Loop over pattern commands. */
538 switch ((re_opcode_t) *p++)
546 printf ("/exactn/%d", mcnt);
557 printf ("/start_memory/%d/%d", mcnt, *p++);
562 printf ("/stop_memory/%d/%d", mcnt, *p++);
566 printf ("/duplicate/%d", *p++);
578 printf ("/charset%s",
579 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
581 assert (p + *p < pend);
583 for (c = 0; c < *p; c++)
586 unsigned char map_byte = p[1 + c];
590 for (bit = 0; bit < BYTEWIDTH; bit++)
591 if (map_byte & (1 << bit))
592 printchar (c * BYTEWIDTH + bit);
606 case on_failure_jump:
607 extract_number_and_incr (&mcnt, &p);
608 printf ("/on_failure_jump/0/%d", mcnt);
611 case on_failure_keep_string_jump:
612 extract_number_and_incr (&mcnt, &p);
613 printf ("/on_failure_keep_string_jump/0/%d", mcnt);
616 case dummy_failure_jump:
617 extract_number_and_incr (&mcnt, &p);
618 printf ("/dummy_failure_jump/0/%d", mcnt);
621 case push_dummy_failure:
622 printf ("/push_dummy_failure");
626 extract_number_and_incr (&mcnt, &p);
627 printf ("/maybe_pop_jump/0/%d", mcnt);
630 case pop_failure_jump:
631 extract_number_and_incr (&mcnt, &p);
632 printf ("/pop_failure_jump/0/%d", mcnt);
636 extract_number_and_incr (&mcnt, &p);
637 printf ("/jump_past_alt/0/%d", mcnt);
641 extract_number_and_incr (&mcnt, &p);
642 printf ("/jump/0/%d", mcnt);
646 extract_number_and_incr (&mcnt, &p);
647 extract_number_and_incr (&mcnt2, &p);
648 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2);
652 extract_number_and_incr (&mcnt, &p);
653 extract_number_and_incr (&mcnt2, &p);
654 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2);
658 extract_number_and_incr (&mcnt, &p);
659 extract_number_and_incr (&mcnt2, &p);
660 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2);
664 printf ("/wordbound");
668 printf ("/notwordbound");
680 printf ("/before_dot");
688 printf ("/after_dot");
692 printf ("/syntaxspec");
694 printf ("/%d", mcnt);
698 printf ("/notsyntaxspec");
700 printf ("/%d", mcnt);
705 printf ("/wordchar");
709 printf ("/notwordchar");
721 printf ("?%d", *(p-1));
729 print_compiled_pattern (bufp)
730 struct re_pattern_buffer *bufp;
732 unsigned char *buffer = bufp->buffer;
734 print_partial_compiled_pattern (buffer, buffer + bufp->used);
735 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
737 if (bufp->fastmap_accurate && bufp->fastmap)
739 printf ("fastmap: ");
740 print_fastmap (bufp->fastmap);
743 printf ("re_nsub: %d\t", bufp->re_nsub);
744 printf ("regs_alloc: %d\t", bufp->regs_allocated);
745 printf ("can_be_null: %d\t", bufp->can_be_null);
746 printf ("newline_anchor: %d\n", bufp->newline_anchor);
747 printf ("no_sub: %d\t", bufp->no_sub);
748 printf ("not_bol: %d\t", bufp->not_bol);
749 printf ("not_eol: %d\t", bufp->not_eol);
750 printf ("syntax: %d\n", bufp->syntax);
751 /* Perhaps we should print the translate table? */
756 print_double_string (where, string1, size1, string2, size2)
769 if (FIRST_STRING_P (where))
771 for (this_char = where - string1; this_char < size1; this_char++)
772 printchar (string1[this_char]);
777 for (this_char = where - string2; this_char < size2; this_char++)
778 printchar (string2[this_char]);
782 #else /* not DEBUG */
787 #define DEBUG_STATEMENT(e)
788 #define DEBUG_PRINT1(x)
789 #define DEBUG_PRINT2(x1, x2)
790 #define DEBUG_PRINT3(x1, x2, x3)
791 #define DEBUG_PRINT4(x1, x2, x3, x4)
792 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
793 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
795 #endif /* not DEBUG */
797 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
798 also be assigned to arbitrarily: each pattern buffer stores its own
799 syntax, so it can be changed between regex compilations. */
800 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
803 /* Specify the precise syntax of regexps for compilation. This provides
804 for compatibility for various utilities which historically have
805 different, incompatible syntaxes.
807 The argument SYNTAX is a bit mask comprised of the various bits
808 defined in regex.h. We return the old syntax. */
811 re_set_syntax (syntax)
814 reg_syntax_t ret = re_syntax_options;
816 re_syntax_options = syntax;
820 /* This table gives an error message for each of the error codes listed
821 in regex.h. Obviously the order here has to be same as there. */
823 static const char *re_error_msg[] =
824 { NULL, /* REG_NOERROR */
825 "No match", /* REG_NOMATCH */
826 "Invalid regular expression", /* REG_BADPAT */
827 "Invalid collation character", /* REG_ECOLLATE */
828 "Invalid character class name", /* REG_ECTYPE */
829 "Trailing backslash", /* REG_EESCAPE */
830 "Invalid back reference", /* REG_ESUBREG */
831 "Unmatched [ or [^", /* REG_EBRACK */
832 "Unmatched ( or \\(", /* REG_EPAREN */
833 "Unmatched \\{", /* REG_EBRACE */
834 "Invalid content of \\{\\}", /* REG_BADBR */
835 "Invalid range end", /* REG_ERANGE */
836 "Memory exhausted", /* REG_ESPACE */
837 "Invalid preceding regular expression", /* REG_BADRPT */
838 "Premature end of regular expression", /* REG_EEND */
839 "Regular expression too big", /* REG_ESIZE */
840 "Unmatched ) or \\)", /* REG_ERPAREN */
843 /* Subroutine declarations and macros for regex_compile. */
845 static void store_op1 (), store_op2 ();
846 static void insert_op1 (), insert_op2 ();
847 static boolean at_begline_loc_p (), at_endline_loc_p ();
848 static boolean group_in_compile_stack ();
849 static reg_errcode_t compile_range ();
851 /* Fetch the next character in the uncompiled pattern---translating it
852 if necessary. Also cast from a signed character in the constant
853 string passed to us by the user to an unsigned char that we can use
854 as an array index (in, e.g., `translate'). */
855 #define PATFETCH(c) \
856 do {if (p == pend) return REG_EEND; \
857 c = (unsigned char) *p++; \
858 if (translate) c = translate[c]; \
861 /* Fetch the next character in the uncompiled pattern, with no
863 #define PATFETCH_RAW(c) \
864 do {if (p == pend) return REG_EEND; \
865 c = (unsigned char) *p++; \
868 /* Go backwards one character in the pattern. */
869 #define PATUNFETCH p--
872 /* If `translate' is non-null, return translate[D], else just D. We
873 cast the subscript to translate because some data is declared as
874 `char *', to avoid warnings when a string constant is passed. But
875 when we use a character as a subscript we must make it unsigned. */
876 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
879 /* Macros for outputting the compiled pattern into `buffer'. */
881 /* If the buffer isn't allocated when it comes in, use this. */
882 #define INIT_BUF_SIZE 32
884 /* Make sure we have at least N more bytes of space in buffer. */
885 #define GET_BUFFER_SPACE(n) \
886 while (b - bufp->buffer + (n) > bufp->allocated) \
889 /* Make sure we have one more byte of buffer space and then add C to it. */
890 #define BUF_PUSH(c) \
892 GET_BUFFER_SPACE (1); \
893 *b++ = (unsigned char) (c); \
897 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
898 #define BUF_PUSH_2(c1, c2) \
900 GET_BUFFER_SPACE (2); \
901 *b++ = (unsigned char) (c1); \
902 *b++ = (unsigned char) (c2); \
906 /* As with BUF_PUSH_2, except for three bytes. */
907 #define BUF_PUSH_3(c1, c2, c3) \
909 GET_BUFFER_SPACE (3); \
910 *b++ = (unsigned char) (c1); \
911 *b++ = (unsigned char) (c2); \
912 *b++ = (unsigned char) (c3); \
916 /* Store a jump with opcode OP at LOC to location TO. We store a
917 relative address offset by the three bytes the jump itself occupies. */
918 #define STORE_JUMP(op, loc, to) \
919 store_op1 (op, loc, (to) - (loc) - 3)
921 /* Likewise, for a two-argument jump. */
922 #define STORE_JUMP2(op, loc, to, arg) \
923 store_op2 (op, loc, (to) - (loc) - 3, arg)
925 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
926 #define INSERT_JUMP(op, loc, to) \
927 insert_op1 (op, loc, (to) - (loc) - 3, b)
929 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
930 #define INSERT_JUMP2(op, loc, to, arg) \
931 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
934 /* This is not an arbitrary limit: the arguments which represent offsets
935 into the pattern are two bytes long. So if 2^16 bytes turns out to
936 be too small, many things would have to change. */
937 #define MAX_BUF_SIZE (1L << 16)
940 /* Extend the buffer by twice its current size via realloc and
941 reset the pointers that pointed into the old block to point to the
942 correct places in the new one. If extending the buffer results in it
943 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
944 #define EXTEND_BUFFER() \
946 unsigned char *old_buffer = bufp->buffer; \
947 if (bufp->allocated == MAX_BUF_SIZE) \
949 bufp->allocated <<= 1; \
950 if (bufp->allocated > MAX_BUF_SIZE) \
951 bufp->allocated = MAX_BUF_SIZE; \
952 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
953 if (bufp->buffer == NULL) \
955 /* If the buffer moved, move all the pointers into it. */ \
956 if (old_buffer != bufp->buffer) \
958 b = (b - old_buffer) + bufp->buffer; \
959 begalt = (begalt - old_buffer) + bufp->buffer; \
960 if (fixup_alt_jump) \
961 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
963 laststart = (laststart - old_buffer) + bufp->buffer; \
965 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
970 /* Since we have one byte reserved for the register number argument to
971 {start,stop}_memory, the maximum number of groups we can report
972 things about is what fits in that byte. */
973 #define MAX_REGNUM 255
975 /* But patterns can have more than `MAX_REGNUM' registers. We just
976 ignore the excess. */
977 typedef unsigned regnum_t;
980 /* Macros for the compile stack. */
982 /* Since offsets can go either forwards or backwards, this type needs to
983 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
984 typedef int pattern_offset_t;
988 pattern_offset_t begalt_offset;
989 pattern_offset_t fixup_alt_jump;
990 pattern_offset_t inner_group_offset;
991 pattern_offset_t laststart_offset;
993 } compile_stack_elt_t;
998 compile_stack_elt_t *stack;
1000 unsigned avail; /* Offset of next open position. */
1001 } compile_stack_type;
1004 #define INIT_COMPILE_STACK_SIZE 32
1006 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1007 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1009 /* The next available element. */
1010 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1013 /* Set the bit for character C in a list. */
1014 #define SET_LIST_BIT(c) \
1015 (b[((unsigned char) (c)) / BYTEWIDTH] \
1016 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1019 /* Get the next unsigned number in the uncompiled pattern. */
1020 #define GET_UNSIGNED_NUMBER(num) \
1024 while (ISDIGIT (c)) \
1028 num = num * 10 + c - '0'; \
1036 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1038 #define IS_CHAR_CLASS(string) \
1039 (STREQ (string, "alpha") || STREQ (string, "upper") \
1040 || STREQ (string, "lower") || STREQ (string, "digit") \
1041 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1042 || STREQ (string, "space") || STREQ (string, "print") \
1043 || STREQ (string, "punct") || STREQ (string, "graph") \
1044 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1046 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1047 Returns one of error codes defined in `regex.h', or zero for success.
1049 Assumes the `allocated' (and perhaps `buffer') and `translate'
1050 fields are set in BUFP on entry.
1052 If it succeeds, results are put in BUFP (if it returns an error, the
1053 contents of BUFP are undefined):
1054 `buffer' is the compiled pattern;
1055 `syntax' is set to SYNTAX;
1056 `used' is set to the length of the compiled pattern;
1057 `fastmap_accurate' is zero;
1058 `re_nsub' is the number of subexpressions in PATTERN;
1059 `not_bol' and `not_eol' are zero;
1061 The `fastmap' and `newline_anchor' fields are neither
1062 examined nor set. */
1064 static reg_errcode_t
1065 regex_compile (pattern, size, syntax, bufp)
1066 const char *pattern;
1068 reg_syntax_t syntax;
1069 struct re_pattern_buffer *bufp;
1071 /* We fetch characters from PATTERN here. Even though PATTERN is
1072 `char *' (i.e., signed), we declare these variables as unsigned, so
1073 they can be reliably used as array indices. */
1074 register unsigned char c, c1;
1076 /* A random tempory spot in PATTERN. */
1079 /* Points to the end of the buffer, where we should append. */
1080 register unsigned char *b;
1082 /* Keeps track of unclosed groups. */
1083 compile_stack_type compile_stack;
1085 /* Points to the current (ending) position in the pattern. */
1086 const char *p = pattern;
1087 const char *pend = pattern + size;
1089 /* How to translate the characters in the pattern. */
1090 char *translate = bufp->translate;
1092 /* Address of the count-byte of the most recently inserted `exactn'
1093 command. This makes it possible to tell if a new exact-match
1094 character can be added to that command or if the character requires
1095 a new `exactn' command. */
1096 unsigned char *pending_exact = 0;
1098 /* Address of start of the most recently finished expression.
1099 This tells, e.g., postfix * where to find the start of its
1100 operand. Reset at the beginning of groups and alternatives. */
1101 unsigned char *laststart = 0;
1103 /* Address of beginning of regexp, or inside of last group. */
1104 unsigned char *begalt;
1106 /* Place in the uncompiled pattern (i.e., the {) to
1107 which to go back if the interval is invalid. */
1108 const char *beg_interval;
1110 /* Address of the place where a forward jump should go to the end of
1111 the containing expression. Each alternative of an `or' -- except the
1112 last -- ends with a forward jump of this sort. */
1113 unsigned char *fixup_alt_jump = 0;
1115 /* Counts open-groups as they are encountered. Remembered for the
1116 matching close-group on the compile stack, so the same register
1117 number is put in the stop_memory as the start_memory. */
1118 regnum_t regnum = 0;
1121 DEBUG_PRINT1 ("\nCompiling pattern: ");
1124 unsigned debug_count;
1126 for (debug_count = 0; debug_count < size; debug_count++)
1127 printchar (pattern[debug_count]);
1132 /* Initialize the compile stack. */
1133 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1134 if (compile_stack.stack == NULL)
1137 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1138 compile_stack.avail = 0;
1140 /* Initialize the pattern buffer. */
1141 bufp->syntax = syntax;
1142 bufp->fastmap_accurate = 0;
1143 bufp->not_bol = bufp->not_eol = 0;
1145 /* Set `used' to zero, so that if we return an error, the pattern
1146 printer (for debugging) will think there's no pattern. We reset it
1150 /* Always count groups, whether or not bufp->no_sub is set. */
1153 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1154 /* Initialize the syntax table. */
1155 init_syntax_once ();
1158 if (bufp->allocated == 0)
1161 { /* If zero allocated, but buffer is non-null, try to realloc
1162 enough space. This loses if buffer's address is bogus, but
1163 that is the user's responsibility. */
1164 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1167 { /* Caller did not allocate a buffer. Do it for them. */
1168 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1170 if (!bufp->buffer) return REG_ESPACE;
1172 bufp->allocated = INIT_BUF_SIZE;
1175 begalt = b = bufp->buffer;
1177 /* Loop through the uncompiled pattern until we're at the end. */
1186 if ( /* If at start of pattern, it's an operator. */
1188 /* If context independent, it's an operator. */
1189 || syntax & RE_CONTEXT_INDEP_ANCHORS
1190 /* Otherwise, depends on what's come before. */
1191 || at_begline_loc_p (pattern, p, syntax))
1201 if ( /* If at end of pattern, it's an operator. */
1203 /* If context independent, it's an operator. */
1204 || syntax & RE_CONTEXT_INDEP_ANCHORS
1205 /* Otherwise, depends on what's next. */
1206 || at_endline_loc_p (p, pend, syntax))
1216 if ((syntax & RE_BK_PLUS_QM)
1217 || (syntax & RE_LIMITED_OPS))
1221 /* If there is no previous pattern... */
1224 if (syntax & RE_CONTEXT_INVALID_OPS)
1226 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1231 /* Are we optimizing this jump? */
1232 boolean keep_string_p = false;
1234 /* 1 means zero (many) matches is allowed. */
1235 char zero_times_ok = 0, many_times_ok = 0;
1237 /* If there is a sequence of repetition chars, collapse it
1238 down to just one (the right one). We can't combine
1239 interval operators with these because of, e.g., `a{2}*',
1240 which should only match an even number of `a's. */
1244 zero_times_ok |= c != '+';
1245 many_times_ok |= c != '?';
1253 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1256 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1258 if (p == pend) return REG_EESCAPE;
1261 if (!(c1 == '+' || c1 == '?'))
1276 /* If we get here, we found another repeat character. */
1279 /* Star, etc. applied to an empty pattern is equivalent
1280 to an empty pattern. */
1284 /* Now we know whether or not zero matches is allowed
1285 and also whether or not two or more matches is allowed. */
1287 { /* More than one repetition is allowed, so put in at the
1288 end a backward relative jump from `b' to before the next
1289 jump we're going to put in below (which jumps from
1290 laststart to after this jump).
1292 But if we are at the `*' in the exact sequence `.*\n',
1293 insert an unconditional jump backwards to the .,
1294 instead of the beginning of the loop. This way we only
1295 push a failure point once, instead of every time
1296 through the loop. */
1297 assert (p - 1 > pattern);
1299 /* Allocate the space for the jump. */
1300 GET_BUFFER_SPACE (3);
1302 /* We know we are not at the first character of the pattern,
1303 because laststart was nonzero. And we've already
1304 incremented `p', by the way, to be the character after
1305 the `*'. Do we have to do something analogous here
1306 for null bytes, because of RE_DOT_NOT_NULL? */
1307 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1309 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1310 && !(syntax & RE_DOT_NEWLINE))
1311 { /* We have .*\n. */
1312 STORE_JUMP (jump, b, laststart);
1313 keep_string_p = true;
1316 /* Anything else. */
1317 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1319 /* We've added more stuff to the buffer. */
1323 /* On failure, jump from laststart to b + 3, which will be the
1324 end of the buffer after this jump is inserted. */
1325 GET_BUFFER_SPACE (3);
1326 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1334 /* At least one repetition is required, so insert a
1335 `dummy_failure_jump' before the initial
1336 `on_failure_jump' instruction of the loop. This
1337 effects a skip over that instruction the first time
1338 we hit that loop. */
1339 GET_BUFFER_SPACE (3);
1340 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1355 boolean had_char_class = false;
1357 if (p == pend) return REG_EBRACK;
1359 /* Ensure that we have enough space to push a charset: the
1360 opcode, the length count, and the bitset; 34 bytes in all. */
1361 GET_BUFFER_SPACE (34);
1365 /* We test `*p == '^' twice, instead of using an if
1366 statement, so we only need one BUF_PUSH. */
1367 BUF_PUSH (*p == '^' ? charset_not : charset);
1371 /* Remember the first position in the bracket expression. */
1374 /* Push the number of bytes in the bitmap. */
1375 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
1377 /* Clear the whole map. */
1378 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
1380 /* charset_not matches newline according to a syntax bit. */
1381 if ((re_opcode_t) b[-2] == charset_not
1382 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
1383 SET_LIST_BIT ('\n');
1385 /* Read in characters and ranges, setting map bits. */
1388 if (p == pend) return REG_EBRACK;
1392 /* \ might escape characters inside [...] and [^...]. */
1393 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
1395 if (p == pend) return REG_EESCAPE;
1402 /* Could be the end of the bracket expression. If it's
1403 not (i.e., when the bracket expression is `[]' so
1404 far), the ']' character bit gets set way below. */
1405 if (c == ']' && p != p1 + 1)
1408 /* Look ahead to see if it's a range when the last thing
1409 was a character class. */
1410 if (had_char_class && c == '-' && *p != ']')
1413 /* Look ahead to see if it's a range when the last thing
1414 was a character: if this is a hyphen not at the
1415 beginning or the end of a list, then it's the range
1418 && !(p - 2 >= pattern && p[-2] == '[')
1419 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
1423 = compile_range (&p, pend, translate, syntax, b);
1424 if (ret != REG_NOERROR) return ret;
1427 else if (p[0] == '-' && p[1] != ']')
1428 { /* This handles ranges made up of characters only. */
1431 /* Move past the `-'. */
1434 ret = compile_range (&p, pend, translate, syntax, b);
1435 if (ret != REG_NOERROR) return ret;
1438 /* See if we're at the beginning of a possible character
1441 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
1442 { /* Leave room for the null. */
1443 char str[CHAR_CLASS_MAX_LENGTH + 1];
1448 /* If pattern is `[[:'. */
1449 if (p == pend) return REG_EBRACK;
1454 if (c == ':' || c == ']' || p == pend
1455 || c1 == CHAR_CLASS_MAX_LENGTH)
1461 /* If isn't a word bracketed by `[:' and:`]':
1462 undo the ending character, the letters, and leave
1463 the leading `:' and `[' (but set bits for them). */
1464 if (c == ':' && *p == ']')
1467 boolean is_alnum = STREQ (str, "alnum");
1468 boolean is_alpha = STREQ (str, "alpha");
1469 boolean is_blank = STREQ (str, "blank");
1470 boolean is_cntrl = STREQ (str, "cntrl");
1471 boolean is_digit = STREQ (str, "digit");
1472 boolean is_graph = STREQ (str, "graph");
1473 boolean is_lower = STREQ (str, "lower");
1474 boolean is_print = STREQ (str, "print");
1475 boolean is_punct = STREQ (str, "punct");
1476 boolean is_space = STREQ (str, "space");
1477 boolean is_upper = STREQ (str, "upper");
1478 boolean is_xdigit = STREQ (str, "xdigit");
1480 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
1482 /* Throw away the ] at the end of the character
1486 if (p == pend) return REG_EBRACK;
1488 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
1490 if ( (is_alnum && ISALNUM (ch))
1491 || (is_alpha && ISALPHA (ch))
1492 || (is_blank && ISBLANK (ch))
1493 || (is_cntrl && ISCNTRL (ch))
1494 || (is_digit && ISDIGIT (ch))
1495 || (is_graph && ISGRAPH (ch))
1496 || (is_lower && ISLOWER (ch))
1497 || (is_print && ISPRINT (ch))
1498 || (is_punct && ISPUNCT (ch))
1499 || (is_space && ISSPACE (ch))
1500 || (is_upper && ISUPPER (ch))
1501 || (is_xdigit && ISXDIGIT (ch)))
1504 had_char_class = true;
1513 had_char_class = false;
1518 had_char_class = false;
1523 /* Discard any (non)matching list bytes that are all 0 at the
1524 end of the map. Decrease the map-length byte too. */
1525 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
1533 if (syntax & RE_NO_BK_PARENS)
1540 if (syntax & RE_NO_BK_PARENS)
1547 if (syntax & RE_NEWLINE_ALT)
1554 if (syntax & RE_NO_BK_VBAR)
1561 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
1562 goto handle_interval;
1568 if (p == pend) return REG_EESCAPE;
1570 /* Do not translate the character after the \, so that we can
1571 distinguish, e.g., \B from \b, even if we normally would
1572 translate, e.g., B to b. */
1578 if (syntax & RE_NO_BK_PARENS)
1579 goto normal_backslash;
1585 if (COMPILE_STACK_FULL)
1587 RETALLOC (compile_stack.stack, compile_stack.size << 1,
1588 compile_stack_elt_t);
1589 if (compile_stack.stack == NULL) return REG_ESPACE;
1591 compile_stack.size <<= 1;
1594 /* These are the values to restore when we hit end of this
1595 group. They are all relative offsets, so that if the
1596 whole pattern moves because of realloc, they will still
1598 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
1599 COMPILE_STACK_TOP.fixup_alt_jump
1600 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1601 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
1602 COMPILE_STACK_TOP.regnum = regnum;
1604 /* We will eventually replace the 0 with the number of
1605 groups inner to this one. But do not push a
1606 start_memory for groups beyond the last one we can
1607 represent in the compiled pattern. */
1608 if (regnum <= MAX_REGNUM)
1610 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
1611 BUF_PUSH_3 (start_memory, regnum, 0);
1614 compile_stack.avail++;
1619 /* If we've reached MAX_REGNUM groups, then this open
1620 won't actually generate any code, so we'll have to
1621 clear pending_exact explicitly. */
1627 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
1629 if (COMPILE_STACK_EMPTY)
1631 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1632 goto normal_backslash;
1639 { /* Push a dummy failure point at the end of the
1640 alternative for a possible future
1641 `pop_failure_jump' to pop. See comments at
1642 `push_dummy_failure' in `re_match_2'. */
1643 BUF_PUSH (push_dummy_failure);
1645 /* We allocated space for this jump when we assigned
1646 to `fixup_alt_jump', in the `handle_alt' case below. */
1647 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
1650 /* See similar code for backslashed left paren above. */
1651 if (COMPILE_STACK_EMPTY)
1653 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1659 /* Since we just checked for an empty stack above, this
1660 ``can't happen''. */
1661 assert (compile_stack.avail != 0);
1663 /* We don't just want to restore into `regnum', because
1664 later groups should continue to be numbered higher,
1665 as in `(ab)c(de)' -- the second group is #2. */
1666 regnum_t this_group_regnum;
1668 compile_stack.avail--;
1669 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
1671 = COMPILE_STACK_TOP.fixup_alt_jump
1672 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
1674 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
1675 this_group_regnum = COMPILE_STACK_TOP.regnum;
1676 /* If we've reached MAX_REGNUM groups, then this open
1677 won't actually generate any code, so we'll have to
1678 clear pending_exact explicitly. */
1681 /* We're at the end of the group, so now we know how many
1682 groups were inside this one. */
1683 if (this_group_regnum <= MAX_REGNUM)
1685 unsigned char *inner_group_loc
1686 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
1688 *inner_group_loc = regnum - this_group_regnum;
1689 BUF_PUSH_3 (stop_memory, this_group_regnum,
1690 regnum - this_group_regnum);
1696 case '|': /* `\|'. */
1697 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
1698 goto normal_backslash;
1700 if (syntax & RE_LIMITED_OPS)
1703 /* Insert before the previous alternative a jump which
1704 jumps to this alternative if the former fails. */
1705 GET_BUFFER_SPACE (3);
1706 INSERT_JUMP (on_failure_jump, begalt, b + 6);
1710 /* The alternative before this one has a jump after it
1711 which gets executed if it gets matched. Adjust that
1712 jump so it will jump to this alternative's analogous
1713 jump (put in below, which in turn will jump to the next
1714 (if any) alternative's such jump, etc.). The last such
1715 jump jumps to the correct final destination. A picture:
1721 If we are at `b', then fixup_alt_jump right now points to a
1722 three-byte space after `a'. We'll put in the jump, set
1723 fixup_alt_jump to right after `b', and leave behind three
1724 bytes which we'll fill in when we get to after `c'. */
1727 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
1729 /* Mark and leave space for a jump after this alternative,
1730 to be filled in later either by next alternative or
1731 when know we're at the end of a series of alternatives. */
1733 GET_BUFFER_SPACE (3);
1742 /* If \{ is a literal. */
1743 if (!(syntax & RE_INTERVALS)
1744 /* If we're at `\{' and it's not the open-interval
1746 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1747 || (p - 2 == pattern && p == pend))
1748 goto normal_backslash;
1752 /* If got here, then the syntax allows intervals. */
1754 /* At least (most) this many matches must be made. */
1755 int lower_bound = -1, upper_bound = -1;
1757 beg_interval = p - 1;
1761 if (syntax & RE_NO_BK_BRACES)
1762 goto unfetch_interval;
1767 GET_UNSIGNED_NUMBER (lower_bound);
1771 GET_UNSIGNED_NUMBER (upper_bound);
1772 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1775 /* Interval such as `{1}' => match exactly once. */
1776 upper_bound = lower_bound;
1778 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
1779 || lower_bound > upper_bound)
1781 if (syntax & RE_NO_BK_BRACES)
1782 goto unfetch_interval;
1787 if (!(syntax & RE_NO_BK_BRACES))
1789 if (c != '\\') return REG_EBRACE;
1796 if (syntax & RE_NO_BK_BRACES)
1797 goto unfetch_interval;
1802 /* We just parsed a valid interval. */
1804 /* If it's invalid to have no preceding re. */
1807 if (syntax & RE_CONTEXT_INVALID_OPS)
1809 else if (syntax & RE_CONTEXT_INDEP_OPS)
1812 goto unfetch_interval;
1815 /* If the upper bound is zero, don't want to succeed at
1816 all; jump from `laststart' to `b + 3', which will be
1817 the end of the buffer after we insert the jump. */
1818 if (upper_bound == 0)
1820 GET_BUFFER_SPACE (3);
1821 INSERT_JUMP (jump, laststart, b + 3);
1825 /* Otherwise, we have a nontrivial interval. When
1826 we're all done, the pattern will look like:
1827 set_number_at <jump count> <upper bound>
1828 set_number_at <succeed_n count> <lower bound>
1829 succeed_n <after jump addr> <succed_n count>
1831 jump_n <succeed_n addr> <jump count>
1832 (The upper bound and `jump_n' are omitted if
1833 `upper_bound' is 1, though.) */
1835 { /* If the upper bound is > 1, we need to insert
1836 more at the end of the loop. */
1837 unsigned nbytes = 10 + (upper_bound > 1) * 10;
1839 GET_BUFFER_SPACE (nbytes);
1841 /* Initialize lower bound of the `succeed_n', even
1842 though it will be set during matching by its
1843 attendant `set_number_at' (inserted next),
1844 because `re_compile_fastmap' needs to know.
1845 Jump to the `jump_n' we might insert below. */
1846 INSERT_JUMP2 (succeed_n, laststart,
1847 b + 5 + (upper_bound > 1) * 5,
1851 /* Code to initialize the lower bound. Insert
1852 before the `succeed_n'. The `5' is the last two
1853 bytes of this `set_number_at', plus 3 bytes of
1854 the following `succeed_n'. */
1855 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
1858 if (upper_bound > 1)
1859 { /* More than one repetition is allowed, so
1860 append a backward jump to the `succeed_n'
1861 that starts this interval.
1863 When we've reached this during matching,
1864 we'll have matched the interval once, so
1865 jump back only `upper_bound - 1' times. */
1866 STORE_JUMP2 (jump_n, b, laststart + 5,
1870 /* The location we want to set is the second
1871 parameter of the `jump_n'; that is `b-2' as
1872 an absolute address. `laststart' will be
1873 the `set_number_at' we're about to insert;
1874 `laststart+3' the number to set, the source
1875 for the relative address. But we are
1876 inserting into the middle of the pattern --
1877 so everything is getting moved up by 5.
1878 Conclusion: (b - 2) - (laststart + 3) + 5,
1879 i.e., b - laststart.
1881 We insert this at the beginning of the loop
1882 so that if we fail during matching, we'll
1883 reinitialize the bounds. */
1884 insert_op2 (set_number_at, laststart, b - laststart,
1885 upper_bound - 1, b);
1890 beg_interval = NULL;
1895 /* If an invalid interval, match the characters as literals. */
1896 assert (beg_interval);
1898 beg_interval = NULL;
1900 /* normal_char and normal_backslash need `c'. */
1903 if (!(syntax & RE_NO_BK_BRACES))
1905 if (p > pattern && p[-1] == '\\')
1906 goto normal_backslash;
1911 /* There is no way to specify the before_dot and after_dot
1912 operators. rms says this is ok. --karl */
1920 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
1926 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
1933 BUF_PUSH (wordchar);
1939 BUF_PUSH (notwordchar);
1952 BUF_PUSH (wordbound);
1956 BUF_PUSH (notwordbound);
1967 case '1': case '2': case '3': case '4': case '5':
1968 case '6': case '7': case '8': case '9':
1969 if (syntax & RE_NO_BK_REFS)
1977 /* Can't back reference to a subexpression if inside of it. */
1978 if (group_in_compile_stack (compile_stack, c1))
1982 BUF_PUSH_2 (duplicate, c1);
1988 if (syntax & RE_BK_PLUS_QM)
1991 goto normal_backslash;
1995 /* You might think it would be useful for \ to mean
1996 not to translate; but if we don't translate it
1997 it will never match anything. */
2005 /* Expects the character in `c'. */
2007 /* If no exactn currently being built. */
2010 /* If last exactn not at current position. */
2011 || pending_exact + *pending_exact + 1 != b
2013 /* We have only one byte following the exactn for the count. */
2014 || *pending_exact == (1 << BYTEWIDTH) - 1
2016 /* If followed by a repetition operator. */
2017 || *p == '*' || *p == '^'
2018 || ((syntax & RE_BK_PLUS_QM)
2019 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2020 : (*p == '+' || *p == '?'))
2021 || ((syntax & RE_INTERVALS)
2022 && ((syntax & RE_NO_BK_BRACES)
2024 : (p[0] == '\\' && p[1] == '{'))))
2026 /* Start building a new exactn. */
2030 BUF_PUSH_2 (exactn, 0);
2031 pending_exact = b - 1;
2038 } /* while p != pend */
2041 /* Through the pattern now. */
2044 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2046 if (!COMPILE_STACK_EMPTY)
2049 free (compile_stack.stack);
2051 /* We have succeeded; set the length of the buffer. */
2052 bufp->used = b - bufp->buffer;
2057 DEBUG_PRINT1 ("\nCompiled pattern: ");
2058 print_compiled_pattern (bufp);
2063 } /* regex_compile */
2065 /* Subroutines for `regex_compile'. */
2067 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2070 store_op1 (op, loc, arg)
2075 *loc = (unsigned char) op;
2076 STORE_NUMBER (loc + 1, arg);
2080 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2083 store_op2 (op, loc, arg1, arg2)
2088 *loc = (unsigned char) op;
2089 STORE_NUMBER (loc + 1, arg1);
2090 STORE_NUMBER (loc + 3, arg2);
2094 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2095 for OP followed by two-byte integer parameter ARG. */
2098 insert_op1 (op, loc, arg, end)
2104 register unsigned char *pfrom = end;
2105 register unsigned char *pto = end + 3;
2107 while (pfrom != loc)
2110 store_op1 (op, loc, arg);
2114 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2117 insert_op2 (op, loc, arg1, arg2, end)
2123 register unsigned char *pfrom = end;
2124 register unsigned char *pto = end + 5;
2126 while (pfrom != loc)
2129 store_op2 (op, loc, arg1, arg2);
2133 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2134 after an alternative or a begin-subexpression. We assume there is at
2135 least one character before the ^. */
2138 at_begline_loc_p (pattern, p, syntax)
2139 const char *pattern, *p;
2140 reg_syntax_t syntax;
2142 const char *prev = p - 2;
2143 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
2146 /* After a subexpression? */
2147 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
2148 /* After an alternative? */
2149 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
2153 /* The dual of at_begline_loc_p. This one is for $. We assume there is
2154 at least one character after the $, i.e., `P < PEND'. */
2157 at_endline_loc_p (p, pend, syntax)
2158 const char *p, *pend;
2161 const char *next = p;
2162 boolean next_backslash = *next == '\\';
2163 const char *next_next = p + 1 < pend ? p + 1 : NULL;
2166 /* Before a subexpression? */
2167 (syntax & RE_NO_BK_PARENS ? *next == ')'
2168 : next_backslash && next_next && *next_next == ')')
2169 /* Before an alternative? */
2170 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2171 : next_backslash && next_next && *next_next == '|');
2175 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2176 false if it's not. */
2179 group_in_compile_stack (compile_stack, regnum)
2180 compile_stack_type compile_stack;
2185 for (this_element = compile_stack.avail - 1;
2188 if (compile_stack.stack[this_element].regnum == regnum)
2195 /* Read the ending character of a range (in a bracket expression) from the
2196 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2197 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2198 Then we set the translation of all bits between the starting and
2199 ending characters (inclusive) in the compiled pattern B.
2201 Return an error code.
2203 We use these short variable names so we can use the same macros as
2204 `regex_compile' itself. */
2206 static reg_errcode_t
2207 compile_range (p_ptr, pend, translate, syntax, b)
2208 const char **p_ptr, *pend;
2210 reg_syntax_t syntax;
2215 const char *p = *p_ptr;
2216 int range_start, range_end;
2221 /* Even though the pattern is a signed `char *', we need to fetch
2222 with unsigned char *'s; if the high bit of the pattern character
2223 is set, the range endpoints will be negative if we fetch using a
2226 We also want to fetch the endpoints without translating them; the
2227 appropriate translation is done in the bit-setting loop below. */
2228 range_start = ((unsigned char *) p)[-2];
2229 range_end = ((unsigned char *) p)[0];
2231 /* Have to increment the pointer into the pattern string, so the
2232 caller isn't still at the ending character. */
2235 /* If the start is after the end, the range is empty. */
2236 if (range_start > range_end)
2237 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2239 /* Here we see why `this_char' has to be larger than an `unsigned
2240 char' -- the range is inclusive, so if `range_end' == 0xff
2241 (assuming 8-bit characters), we would otherwise go into an infinite
2242 loop, since all characters <= 0xff. */
2243 for (this_char = range_start; this_char <= range_end; this_char++)
2245 SET_LIST_BIT (TRANSLATE (this_char));
2251 /* Failure stack declarations and macros; both re_compile_fastmap and
2252 re_match_2 use a failure stack. These have to be macros because of
2256 /* Number of failure points for which to initially allocate space
2257 when matching. If this number is exceeded, we allocate more
2258 space, so it is not a hard limit. */
2259 #ifndef INIT_FAILURE_ALLOC
2260 #define INIT_FAILURE_ALLOC 5
2263 /* Roughly the maximum number of failure points on the stack. Would be
2264 exactly that if always used MAX_FAILURE_SPACE each time we failed.
2265 This is a variable only so users of regex can assign to it; we never
2266 change it ourselves. */
2267 int re_max_failures = 2000;
2269 typedef const unsigned char *fail_stack_elt_t;
2273 fail_stack_elt_t *stack;
2275 unsigned avail; /* Offset of next open position. */
2278 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
2279 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
2280 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
2281 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail])
2284 /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */
2286 #define INIT_FAIL_STACK() \
2288 fail_stack.stack = (fail_stack_elt_t *) \
2289 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
2291 if (fail_stack.stack == NULL) \
2294 fail_stack.size = INIT_FAILURE_ALLOC; \
2295 fail_stack.avail = 0; \
2299 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
2301 Return 1 if succeeds, and 0 if either ran out of memory
2302 allocating space for it or it was already too large.
2304 REGEX_REALLOCATE requires `destination' be declared. */
2306 #define DOUBLE_FAIL_STACK(fail_stack) \
2307 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
2309 : ((fail_stack).stack = (fail_stack_elt_t *) \
2310 REGEX_REALLOCATE ((fail_stack).stack, \
2311 (fail_stack).size * sizeof (fail_stack_elt_t), \
2312 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
2314 (fail_stack).stack == NULL \
2316 : ((fail_stack).size <<= 1, \
2320 /* Push PATTERN_OP on FAIL_STACK.
2322 Return 1 if was able to do so and 0 if ran out of memory allocating
2324 #define PUSH_PATTERN_OP(pattern_op, fail_stack) \
2325 ((FAIL_STACK_FULL () \
2326 && !DOUBLE_FAIL_STACK (fail_stack)) \
2328 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \
2331 /* This pushes an item onto the failure stack. Must be a four-byte
2332 value. Assumes the variable `fail_stack'. Probably should only
2333 be called from within `PUSH_FAILURE_POINT'. */
2334 #define PUSH_FAILURE_ITEM(item) \
2335 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item
2337 /* The complement operation. Assumes `fail_stack' is nonempty. */
2338 #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail]
2340 /* Used to omit pushing failure point id's when we're not debugging. */
2342 #define DEBUG_PUSH PUSH_FAILURE_ITEM
2343 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM ()
2345 #define DEBUG_PUSH(item)
2346 #define DEBUG_POP(item_addr)
2350 /* Push the information about the state we will need
2351 if we ever fail back to it.
2353 Requires variables fail_stack, regstart, regend, reg_info, and
2354 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
2357 Does `return FAILURE_CODE' if runs out of memory. */
2359 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
2361 char *destination; \
2362 /* Must be int, so when we don't save any registers, the arithmetic \
2363 of 0 + -1 isn't done as unsigned. */ \
2366 DEBUG_STATEMENT (failure_id++); \
2367 DEBUG_STATEMENT (nfailure_points_pushed++); \
2368 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
2369 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
2370 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
2372 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
2373 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
2375 /* Ensure we have enough space allocated for what we will push. */ \
2376 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
2378 if (!DOUBLE_FAIL_STACK (fail_stack)) \
2379 return failure_code; \
2381 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
2382 (fail_stack).size); \
2383 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
2386 /* Push the info, starting with the registers. */ \
2387 DEBUG_PRINT1 ("\n"); \
2389 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
2392 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
2393 DEBUG_STATEMENT (num_regs_pushed++); \
2395 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2396 PUSH_FAILURE_ITEM (regstart[this_reg]); \
2398 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2399 PUSH_FAILURE_ITEM (regend[this_reg]); \
2401 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
2402 DEBUG_PRINT2 (" match_null=%d", \
2403 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
2404 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
2405 DEBUG_PRINT2 (" matched_something=%d", \
2406 MATCHED_SOMETHING (reg_info[this_reg])); \
2407 DEBUG_PRINT2 (" ever_matched=%d", \
2408 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
2409 DEBUG_PRINT1 ("\n"); \
2410 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \
2413 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
2414 PUSH_FAILURE_ITEM (lowest_active_reg); \
2416 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
2417 PUSH_FAILURE_ITEM (highest_active_reg); \
2419 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
2420 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
2421 PUSH_FAILURE_ITEM (pattern_place); \
2423 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
2424 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
2426 DEBUG_PRINT1 ("'\n"); \
2427 PUSH_FAILURE_ITEM (string_place); \
2429 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
2430 DEBUG_PUSH (failure_id); \
2433 /* This is the number of items that are pushed and popped on the stack
2434 for each register. */
2435 #define NUM_REG_ITEMS 3
2437 /* Individual items aside from the registers. */
2439 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
2441 #define NUM_NONREG_ITEMS 4
2444 /* We push at most this many items on the stack. */
2445 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
2447 /* We actually push this many items. */
2448 #define NUM_FAILURE_ITEMS \
2449 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
2452 /* How many items can still be added to the stack without overflowing it. */
2453 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
2456 /* Pops what PUSH_FAIL_STACK pushes.
2458 We restore into the parameters, all of which should be lvalues:
2459 STR -- the saved data position.
2460 PAT -- the saved pattern position.
2461 LOW_REG, HIGH_REG -- the highest and lowest active registers.
2462 REGSTART, REGEND -- arrays of string positions.
2463 REG_INFO -- array of information about each subexpression.
2465 Also assumes the variables `fail_stack' and (if debugging), `bufp',
2466 `pend', `string1', `size1', `string2', and `size2'. */
2468 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
2470 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
2472 const unsigned char *string_temp; \
2474 assert (!FAIL_STACK_EMPTY ()); \
2476 /* Remove failure points and point to how many regs pushed. */ \
2477 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
2478 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
2479 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
2481 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
2483 DEBUG_POP (&failure_id); \
2484 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
2486 /* If the saved string location is NULL, it came from an \
2487 on_failure_keep_string_jump opcode, and we want to throw away the \
2488 saved NULL, thus retaining our current position in the string. */ \
2489 string_temp = POP_FAILURE_ITEM (); \
2490 if (string_temp != NULL) \
2491 str = (const char *) string_temp; \
2493 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
2494 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
2495 DEBUG_PRINT1 ("'\n"); \
2497 pat = (unsigned char *) POP_FAILURE_ITEM (); \
2498 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
2499 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
2501 /* Restore register info. */ \
2502 high_reg = (unsigned) POP_FAILURE_ITEM (); \
2503 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
2505 low_reg = (unsigned) POP_FAILURE_ITEM (); \
2506 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
2508 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
2510 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
2512 reg_info[this_reg].word = POP_FAILURE_ITEM (); \
2513 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
2515 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2516 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2518 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2519 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2522 DEBUG_STATEMENT (nfailure_points_popped++); \
2523 } /* POP_FAILURE_POINT */
2525 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2526 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2527 characters can start a string that matches the pattern. This fastmap
2528 is used by re_search to skip quickly over impossible starting points.
2530 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2531 area as BUFP->fastmap.
2533 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2536 Returns 0 if we succeed, -2 if an internal error. */
2539 re_compile_fastmap (bufp)
2540 struct re_pattern_buffer *bufp;
2543 fail_stack_type fail_stack;
2544 #ifndef REGEX_MALLOC
2547 /* We don't push any register information onto the failure stack. */
2548 unsigned num_regs = 0;
2550 register char *fastmap = bufp->fastmap;
2551 unsigned char *pattern = bufp->buffer;
2552 unsigned long size = bufp->used;
2553 const unsigned char *p = pattern;
2554 register unsigned char *pend = pattern + size;
2556 /* Assume that each path through the pattern can be null until
2557 proven otherwise. We set this false at the bottom of switch
2558 statement, to which we get only if a particular path doesn't
2559 match the empty string. */
2560 boolean path_can_be_null = true;
2562 /* We aren't doing a `succeed_n' to begin with. */
2563 boolean succeed_n_p = false;
2565 assert (fastmap != NULL && p != NULL);
2568 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
2569 bufp->fastmap_accurate = 1; /* It will be when we're done. */
2570 bufp->can_be_null = 0;
2572 while (p != pend || !FAIL_STACK_EMPTY ())
2576 bufp->can_be_null |= path_can_be_null;
2578 /* Reset for next path. */
2579 path_can_be_null = true;
2581 p = fail_stack.stack[--fail_stack.avail];
2584 /* We should never be about to go beyond the end of the pattern. */
2587 #ifdef SWITCH_ENUM_BUG
2588 switch ((int) ((re_opcode_t) *p++))
2590 switch ((re_opcode_t) *p++)
2594 /* I guess the idea here is to simply not bother with a fastmap
2595 if a backreference is used, since it's too hard to figure out
2596 the fastmap for the corresponding group. Setting
2597 `can_be_null' stops `re_search_2' from using the fastmap, so
2598 that is all we do. */
2600 bufp->can_be_null = 1;
2604 /* Following are the cases which match a character. These end
2613 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2614 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2620 /* Chars beyond end of map must be allowed. */
2621 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2624 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2625 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2631 for (j = 0; j < (1 << BYTEWIDTH); j++)
2632 if (SYNTAX (j) == Sword)
2638 for (j = 0; j < (1 << BYTEWIDTH); j++)
2639 if (SYNTAX (j) != Sword)
2645 /* `.' matches anything ... */
2646 for (j = 0; j < (1 << BYTEWIDTH); j++)
2649 /* ... except perhaps newline. */
2650 if (!(bufp->syntax & RE_DOT_NEWLINE))
2653 /* Return if we have already set `can_be_null'; if we have,
2654 then the fastmap is irrelevant. Something's wrong here. */
2655 else if (bufp->can_be_null)
2658 /* Otherwise, have to check alternative paths. */
2665 for (j = 0; j < (1 << BYTEWIDTH); j++)
2666 if (SYNTAX (j) == (enum syntaxcode) k)
2673 for (j = 0; j < (1 << BYTEWIDTH); j++)
2674 if (SYNTAX (j) != (enum syntaxcode) k)
2679 /* All cases after this match the empty string. These end with
2687 #endif /* not emacs */
2699 case push_dummy_failure:
2704 case pop_failure_jump:
2705 case maybe_pop_jump:
2708 case dummy_failure_jump:
2709 EXTRACT_NUMBER_AND_INCR (j, p);
2714 /* Jump backward implies we just went through the body of a
2715 loop and matched nothing. Opcode jumped to should be
2716 `on_failure_jump' or `succeed_n'. Just treat it like an
2717 ordinary jump. For a * loop, it has pushed its failure
2718 point already; if so, discard that as redundant. */
2719 if ((re_opcode_t) *p != on_failure_jump
2720 && (re_opcode_t) *p != succeed_n)
2724 EXTRACT_NUMBER_AND_INCR (j, p);
2727 /* If what's on the stack is where we are now, pop it. */
2728 if (!FAIL_STACK_EMPTY ()
2729 && fail_stack.stack[fail_stack.avail - 1] == p)
2735 case on_failure_jump:
2736 case on_failure_keep_string_jump:
2737 handle_on_failure_jump:
2738 EXTRACT_NUMBER_AND_INCR (j, p);
2740 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
2741 end of the pattern. We don't want to push such a point,
2742 since when we restore it above, entering the switch will
2743 increment `p' past the end of the pattern. We don't need
2744 to push such a point since we obviously won't find any more
2745 fastmap entries beyond `pend'. Such a pattern can match
2746 the null string, though. */
2749 if (!PUSH_PATTERN_OP (p + j, fail_stack))
2753 bufp->can_be_null = 1;
2757 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
2758 succeed_n_p = false;
2765 /* Get to the number of times to succeed. */
2768 /* Increment p past the n for when k != 0. */
2769 EXTRACT_NUMBER_AND_INCR (k, p);
2773 succeed_n_p = true; /* Spaghetti code alert. */
2774 goto handle_on_failure_jump;
2791 abort (); /* We have listed all the cases. */
2794 /* Getting here means we have found the possible starting
2795 characters for one path of the pattern -- and that the empty
2796 string does not match. We need not follow this path further.
2797 Instead, look at the next alternative (remembered on the
2798 stack), or quit if no more. The test at the top of the loop
2799 does these things. */
2800 path_can_be_null = false;
2804 /* Set `can_be_null' for the last path (also the first path, if the
2805 pattern is empty). */
2806 bufp->can_be_null |= path_can_be_null;
2808 } /* re_compile_fastmap */
2810 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
2811 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
2812 this memory for recording register information. STARTS and ENDS
2813 must be allocated using the malloc library routine, and must each
2814 be at least NUM_REGS * sizeof (regoff_t) bytes long.
2816 If NUM_REGS == 0, then subsequent matches should allocate their own
2819 Unless this function is called, the first search or match using
2820 PATTERN_BUFFER will allocate its own register data, without
2821 freeing the old data. */
2824 re_set_registers (bufp, regs, num_regs, starts, ends)
2825 struct re_pattern_buffer *bufp;
2826 struct re_registers *regs;
2828 regoff_t *starts, *ends;
2832 bufp->regs_allocated = REGS_REALLOCATE;
2833 regs->num_regs = num_regs;
2834 regs->start = starts;
2839 bufp->regs_allocated = REGS_UNALLOCATED;
2841 regs->start = regs->end = (regoff_t) 0;
2845 /* Searching routines. */
2847 /* Like re_search_2, below, but only one string is specified, and
2848 doesn't let you say where to stop matching. */
2851 re_search (bufp, string, size, startpos, range, regs)
2852 struct re_pattern_buffer *bufp;
2854 int size, startpos, range;
2855 struct re_registers *regs;
2857 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
2862 /* Using the compiled pattern in BUFP->buffer, first tries to match the
2863 virtual concatenation of STRING1 and STRING2, starting first at index
2864 STARTPOS, then at STARTPOS + 1, and so on.
2866 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
2868 RANGE is how far to scan while trying to match. RANGE = 0 means try
2869 only at STARTPOS; in general, the last start tried is STARTPOS +
2872 In REGS, return the indices of the virtual concatenation of STRING1
2873 and STRING2 that matched the entire BUFP->buffer and its contained
2876 Do not consider matching one past the index STOP in the virtual
2877 concatenation of STRING1 and STRING2.
2879 We return either the position in the strings at which the match was
2880 found, -1 if no match, or -2 if error (such as failure
2884 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
2885 struct re_pattern_buffer *bufp;
2886 const char *string1, *string2;
2890 struct re_registers *regs;
2894 register char *fastmap = bufp->fastmap;
2895 register char *translate = bufp->translate;
2896 int total_size = size1 + size2;
2897 int endpos = startpos + range;
2899 /* Check for out-of-range STARTPOS. */
2900 if (startpos < 0 || startpos > total_size)
2903 /* Fix up RANGE if it might eventually take us outside
2904 the virtual concatenation of STRING1 and STRING2. */
2906 range = -1 - startpos;
2907 else if (endpos > total_size)
2908 range = total_size - startpos;
2910 /* If the search isn't to be a backwards one, don't waste time in a
2911 search for a pattern that must be anchored. */
2912 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
2920 /* Update the fastmap now if not correct already. */
2921 if (fastmap && !bufp->fastmap_accurate)
2922 if (re_compile_fastmap (bufp) == -2)
2925 /* Loop through the string, looking for a place to start matching. */
2928 /* If a fastmap is supplied, skip quickly over characters that
2929 cannot be the start of a match. If the pattern can match the
2930 null string, however, we don't need to skip characters; we want
2931 the first null string. */
2932 if (fastmap && startpos < total_size && !bufp->can_be_null)
2934 if (range > 0) /* Searching forwards. */
2936 register const char *d;
2937 register int lim = 0;
2940 if (startpos < size1 && startpos + range >= size1)
2941 lim = range - (size1 - startpos);
2943 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
2945 /* Written out as an if-else to avoid testing `translate'
2949 && !fastmap[(unsigned char)
2950 translate[(unsigned char) *d++]])
2953 while (range > lim && !fastmap[(unsigned char) *d++])
2956 startpos += irange - range;
2958 else /* Searching backwards. */
2960 register char c = (size1 == 0 || startpos >= size1
2961 ? string2[startpos - size1]
2962 : string1[startpos]);
2964 if (!fastmap[(unsigned char) TRANSLATE (c)])
2969 /* If can't match the null string, and that's all we have left, fail. */
2970 if (range >= 0 && startpos == total_size && fastmap
2971 && !bufp->can_be_null)
2974 val = re_match_2 (bufp, string1, size1, string2, size2,
2975 startpos, regs, stop);
2999 /* Declarations and macros for re_match_2. */
3001 static int bcmp_translate ();
3002 static boolean alt_match_null_string_p (),
3003 common_op_match_null_string_p (),
3004 group_match_null_string_p ();
3006 /* Structure for per-register (a.k.a. per-group) information.
3007 This must not be longer than one word, because we push this value
3008 onto the failure stack. Other register information, such as the
3009 starting and ending positions (which are addresses), and the list of
3010 inner groups (which is a bits list) are maintained in separate
3013 We are making a (strictly speaking) nonportable assumption here: that
3014 the compiler will pack our bit fields into something that fits into
3015 the type of `word', i.e., is something that fits into one item on the
3019 fail_stack_elt_t word;
3022 /* This field is one if this group can match the empty string,
3023 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
3024 #define MATCH_NULL_UNSET_VALUE 3
3025 unsigned match_null_string_p : 2;
3026 unsigned is_active : 1;
3027 unsigned matched_something : 1;
3028 unsigned ever_matched_something : 1;
3030 } register_info_type;
3032 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
3033 #define IS_ACTIVE(R) ((R).bits.is_active)
3034 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
3035 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
3038 /* Call this when have matched a real character; it sets `matched' flags
3039 for the subexpressions which we are currently inside. Also records
3040 that those subexprs have matched. */
3041 #define SET_REGS_MATCHED() \
3045 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
3047 MATCHED_SOMETHING (reg_info[r]) \
3048 = EVER_MATCHED_SOMETHING (reg_info[r]) \
3055 /* This converts PTR, a pointer into one of the search strings `string1'
3056 and `string2' into an offset from the beginning of that string. */
3057 #define POINTER_TO_OFFSET(ptr) \
3058 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1)
3060 /* Registers are set to a sentinel when they haven't yet matched. */
3061 #define REG_UNSET_VALUE ((char *) -1)
3062 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
3065 /* Macros for dealing with the split strings in re_match_2. */
3067 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3069 /* Call before fetching a character with *d. This switches over to
3070 string2 if necessary. */
3071 #define PREFETCH() \
3074 /* End of string2 => fail. */ \
3075 if (dend == end_match_2) \
3077 /* End of string1 => advance to string2. */ \
3079 dend = end_match_2; \
3083 /* Test if at very beginning or at very end of the virtual concatenation
3084 of `string1' and `string2'. If only one string, it's `string2'. */
3085 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3086 #define AT_STRINGS_END(d) ((d) == end2)
3089 /* Test if D points to a character which is word-constituent. We have
3090 two special cases to check for: if past the end of string1, look at
3091 the first character in string2; and if before the beginning of
3092 string2, look at the last character in string1. */
3093 #define WORDCHAR_P(d) \
3094 (SYNTAX ((d) == end1 ? *string2 \
3095 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3098 /* Test if the character before D and the one at D differ with respect
3099 to being word-constituent. */
3100 #define AT_WORD_BOUNDARY(d) \
3101 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3102 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3105 /* Free everything we malloc. */
3107 #define FREE_VAR(var) if (var) free (var); var = NULL
3108 #define FREE_VARIABLES() \
3110 FREE_VAR (fail_stack.stack); \
3111 FREE_VAR (regstart); \
3112 FREE_VAR (regend); \
3113 FREE_VAR (old_regstart); \
3114 FREE_VAR (old_regend); \
3115 FREE_VAR (best_regstart); \
3116 FREE_VAR (best_regend); \
3117 FREE_VAR (reg_info); \
3118 FREE_VAR (reg_dummy); \
3119 FREE_VAR (reg_info_dummy); \
3121 #else /* not REGEX_MALLOC */
3122 /* Some MIPS systems (at least) want this to free alloca'd storage. */
3123 #define FREE_VARIABLES() alloca (0)
3124 #endif /* not REGEX_MALLOC */
3127 /* These values must meet several constraints. They must not be valid
3128 register values; since we have a limit of 255 registers (because
3129 we use only one byte in the pattern for the register number), we can
3130 use numbers larger than 255. They must differ by 1, because of
3131 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3132 be larger than the value for the highest register, so we do not try
3133 to actually save any registers when none are active. */
3134 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3135 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3137 /* Matching routines. */
3139 #ifndef emacs /* Emacs never uses this. */
3140 /* re_match is like re_match_2 except it takes only a single string. */
3143 re_match (bufp, string, size, pos, regs)
3144 struct re_pattern_buffer *bufp;
3147 struct re_registers *regs;
3149 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size);
3151 #endif /* not emacs */
3154 /* re_match_2 matches the compiled pattern in BUFP against the
3155 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3156 and SIZE2, respectively). We start matching at POS, and stop
3159 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3160 store offsets for the substring each group matched in REGS. See the
3161 documentation for exactly how many groups we fill.
3163 We return -1 if no match, -2 if an internal error (such as the
3164 failure stack overflowing). Otherwise, we return the length of the
3165 matched substring. */
3168 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3169 struct re_pattern_buffer *bufp;
3170 const char *string1, *string2;
3173 struct re_registers *regs;
3176 /* General temporaries. */
3180 /* Just past the end of the corresponding string. */
3181 const char *end1, *end2;
3183 /* Pointers into string1 and string2, just past the last characters in
3184 each to consider matching. */
3185 const char *end_match_1, *end_match_2;
3187 /* Where we are in the data, and the end of the current string. */
3188 const char *d, *dend;
3190 /* Where we are in the pattern, and the end of the pattern. */
3191 unsigned char *p = bufp->buffer;
3192 register unsigned char *pend = p + bufp->used;
3194 /* We use this to map every character in the string. */
3195 char *translate = bufp->translate;
3197 /* Failure point stack. Each place that can handle a failure further
3198 down the line pushes a failure point on this stack. It consists of
3199 restart, regend, and reg_info for all registers corresponding to
3200 the subexpressions we're currently inside, plus the number of such
3201 registers, and, finally, two char *'s. The first char * is where
3202 to resume scanning the pattern; the second one is where to resume
3203 scanning the strings. If the latter is zero, the failure point is
3204 a ``dummy''; if a failure happens and the failure point is a dummy,
3205 it gets discarded and the next next one is tried. */
3206 fail_stack_type fail_stack;
3208 static unsigned failure_id = 0;
3209 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3212 /* We fill all the registers internally, independent of what we
3213 return, for use in backreferences. The number here includes
3214 an element for register zero. */
3215 unsigned num_regs = bufp->re_nsub + 1;
3217 /* The currently active registers. */
3218 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3219 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3221 /* Information on the contents of registers. These are pointers into
3222 the input strings; they record just what was matched (on this
3223 attempt) by a subexpression part of the pattern, that is, the
3224 regnum-th regstart pointer points to where in the pattern we began
3225 matching and the regnum-th regend points to right after where we
3226 stopped matching the regnum-th subexpression. (The zeroth register
3227 keeps track of what the whole pattern matches.) */
3228 const char **regstart = NULL, **regend = NULL;
3230 /* If a group that's operated upon by a repetition operator fails to
3231 match anything, then the register for its start will need to be
3232 restored because it will have been set to wherever in the string we
3233 are when we last see its open-group operator. Similarly for a
3235 const char **old_regstart = NULL, **old_regend = NULL;
3237 /* The is_active field of reg_info helps us keep track of which (possibly
3238 nested) subexpressions we are currently in. The matched_something
3239 field of reg_info[reg_num] helps us tell whether or not we have
3240 matched any of the pattern so far this time through the reg_num-th
3241 subexpression. These two fields get reset each time through any
3242 loop their register is in. */
3243 register_info_type *reg_info = NULL;
3245 /* The following record the register info as found in the above
3246 variables when we find a match better than any we've seen before.
3247 This happens as we backtrack through the failure points, which in
3248 turn happens only if we have not yet matched the entire string. */
3249 unsigned best_regs_set = false;
3250 const char **best_regstart = NULL, **best_regend = NULL;
3252 /* Logically, this is `best_regend[0]'. But we don't want to have to
3253 allocate space for that if we're not allocating space for anything
3254 else (see below). Also, we never need info about register 0 for
3255 any of the other register vectors, and it seems rather a kludge to
3256 treat `best_regend' differently than the rest. So we keep track of
3257 the end of the best match so far in a separate variable. We
3258 initialize this to NULL so that when we backtrack the first time
3259 and need to test it, it's not garbage. */
3260 const char *match_end = NULL;
3262 /* Used when we pop values we don't care about. */
3263 const char **reg_dummy = NULL;
3264 register_info_type *reg_info_dummy = NULL;
3267 /* Counts the total number of registers pushed. */
3268 unsigned num_regs_pushed = 0;
3271 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3275 /* Do not bother to initialize all the register variables if there are
3276 no groups in the pattern, as it takes a fair amount of time. If
3277 there are groups, we include space for register 0 (the whole
3278 pattern), even though we never use it, since it simplifies the
3279 array indexing. We should fix this. */
3282 regstart = REGEX_TALLOC (num_regs, const char *);
3283 regend = REGEX_TALLOC (num_regs, const char *);
3284 old_regstart = REGEX_TALLOC (num_regs, const char *);
3285 old_regend = REGEX_TALLOC (num_regs, const char *);
3286 best_regstart = REGEX_TALLOC (num_regs, const char *);
3287 best_regend = REGEX_TALLOC (num_regs, const char *);
3288 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3289 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3290 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3292 if (!(regstart && regend && old_regstart && old_regend && reg_info
3293 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3302 /* We must initialize all our variables to NULL, so that
3303 `FREE_VARIABLES' doesn't try to free them. */
3304 regstart = regend = old_regstart = old_regend = best_regstart
3305 = best_regend = reg_dummy = NULL;
3306 reg_info = reg_info_dummy = (register_info_type *) NULL;
3308 #endif /* REGEX_MALLOC */
3310 /* The starting position is bogus. */
3311 if (pos < 0 || pos > size1 + size2)
3317 /* Initialize subexpression text positions to -1 to mark ones that no
3318 start_memory/stop_memory has been seen for. Also initialize the
3319 register information struct. */
3320 for (mcnt = 1; mcnt < num_regs; mcnt++)
3322 regstart[mcnt] = regend[mcnt]
3323 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3325 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3326 IS_ACTIVE (reg_info[mcnt]) = 0;
3327 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3328 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3331 /* We move `string1' into `string2' if the latter's empty -- but not if
3332 `string1' is null. */
3333 if (size2 == 0 && string1 != NULL)
3340 end1 = string1 + size1;
3341 end2 = string2 + size2;
3343 /* Compute where to stop matching, within the two strings. */
3346 end_match_1 = string1 + stop;
3347 end_match_2 = string2;
3352 end_match_2 = string2 + stop - size1;
3355 /* `p' scans through the pattern as `d' scans through the data.
3356 `dend' is the end of the input string that `d' points within. `d'
3357 is advanced into the following input string whenever necessary, but
3358 this happens before fetching; therefore, at the beginning of the
3359 loop, `d' can be pointing at the end of a string, but it cannot
3361 if (size1 > 0 && pos <= size1)
3368 d = string2 + pos - size1;
3372 DEBUG_PRINT1 ("The compiled pattern is: ");
3373 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3374 DEBUG_PRINT1 ("The string to match is: `");
3375 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3376 DEBUG_PRINT1 ("'\n");
3378 /* This loops over pattern commands. It exits by returning from the
3379 function if the match is complete, or it drops through if the match
3380 fails at this starting point in the input data. */
3383 DEBUG_PRINT2 ("\n0x%x: ", p);
3386 { /* End of pattern means we might have succeeded. */
3387 DEBUG_PRINT1 ("end of pattern ... ");
3389 /* If we haven't matched the entire string, and we want the
3390 longest match, try backtracking. */
3391 if (d != end_match_2)
3393 DEBUG_PRINT1 ("backtracking.\n");
3395 if (!FAIL_STACK_EMPTY ())
3396 { /* More failure points to try. */
3397 boolean same_str_p = (FIRST_STRING_P (match_end)
3398 == MATCHING_IN_FIRST_STRING);
3400 /* If exceeds best match so far, save it. */
3402 || (same_str_p && d > match_end)
3403 || (!same_str_p && !MATCHING_IN_FIRST_STRING))
3405 best_regs_set = true;
3408 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3410 for (mcnt = 1; mcnt < num_regs; mcnt++)
3412 best_regstart[mcnt] = regstart[mcnt];
3413 best_regend[mcnt] = regend[mcnt];
3419 /* If no failure points, don't restore garbage. */
3420 else if (best_regs_set)
3423 /* Restore best match. It may happen that `dend ==
3424 end_match_1' while the restored d is in string2.
3425 For example, the pattern `x.*y.*z' against the
3426 strings `x-' and `y-z-', if the two strings are
3427 not consecutive in memory. */
3428 DEBUG_PRINT1 ("Restoring best registers.\n");
3431 dend = ((d >= string1 && d <= end1)
3432 ? end_match_1 : end_match_2);
3434 for (mcnt = 1; mcnt < num_regs; mcnt++)
3436 regstart[mcnt] = best_regstart[mcnt];
3437 regend[mcnt] = best_regend[mcnt];
3440 } /* d != end_match_2 */
3442 DEBUG_PRINT1 ("Accepting match.\n");
3444 /* If caller wants register contents data back, do it. */
3445 if (regs && !bufp->no_sub)
3447 /* Have the register data arrays been allocated? */
3448 if (bufp->regs_allocated == REGS_UNALLOCATED)
3449 { /* No. So allocate them with malloc. We need one
3450 extra element beyond `num_regs' for the `-1' marker
3452 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3453 regs->start = TALLOC (regs->num_regs, regoff_t);
3454 regs->end = TALLOC (regs->num_regs, regoff_t);
3455 if (regs->start == NULL || regs->end == NULL)
3457 bufp->regs_allocated = REGS_REALLOCATE;
3459 else if (bufp->regs_allocated == REGS_REALLOCATE)
3460 { /* Yes. If we need more elements than were already
3461 allocated, reallocate them. If we need fewer, just
3463 if (regs->num_regs < num_regs + 1)
3465 regs->num_regs = num_regs + 1;
3466 RETALLOC (regs->start, regs->num_regs, regoff_t);
3467 RETALLOC (regs->end, regs->num_regs, regoff_t);
3468 if (regs->start == NULL || regs->end == NULL)
3473 assert (bufp->regs_allocated == REGS_FIXED);
3475 /* Convert the pointer data in `regstart' and `regend' to
3476 indices. Register zero has to be set differently,
3477 since we haven't kept track of any info for it. */
3478 if (regs->num_regs > 0)
3480 regs->start[0] = pos;
3481 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1
3482 : d - string2 + size1);
3485 /* Go through the first `min (num_regs, regs->num_regs)'
3486 registers, since that is all we initialized. */
3487 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
3489 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
3490 regs->start[mcnt] = regs->end[mcnt] = -1;
3493 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]);
3494 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]);
3498 /* If the regs structure we return has more elements than
3499 were in the pattern, set the extra elements to -1. If
3500 we (re)allocated the registers, this is the case,
3501 because we always allocate enough to have at least one
3503 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
3504 regs->start[mcnt] = regs->end[mcnt] = -1;
3505 } /* regs && !bufp->no_sub */
3508 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
3509 nfailure_points_pushed, nfailure_points_popped,
3510 nfailure_points_pushed - nfailure_points_popped);
3511 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
3513 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
3517 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
3522 /* Otherwise match next pattern command. */
3523 #ifdef SWITCH_ENUM_BUG
3524 switch ((int) ((re_opcode_t) *p++))
3526 switch ((re_opcode_t) *p++)
3529 /* Ignore these. Used to ignore the n of succeed_n's which
3530 currently have n == 0. */
3532 DEBUG_PRINT1 ("EXECUTING no_op.\n");
3536 /* Match the next n pattern characters exactly. The following
3537 byte in the pattern defines n, and the n bytes after that
3538 are the characters to match. */
3541 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
3543 /* This is written out as an if-else so we don't waste time
3544 testing `translate' inside the loop. */
3550 if (translate[(unsigned char) *d++] != (char) *p++)
3560 if (*d++ != (char) *p++) goto fail;
3564 SET_REGS_MATCHED ();
3568 /* Match any character except possibly a newline or a null. */
3570 DEBUG_PRINT1 ("EXECUTING anychar.\n");
3574 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
3575 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
3578 SET_REGS_MATCHED ();
3579 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
3587 register unsigned char c;
3588 boolean not = (re_opcode_t) *(p - 1) == charset_not;
3590 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
3593 c = TRANSLATE (*d); /* The character to match. */
3595 /* Cast to `unsigned' instead of `unsigned char' in case the
3596 bit list is a full 32 bytes long. */
3597 if (c < (unsigned) (*p * BYTEWIDTH)
3598 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3603 if (!not) goto fail;
3605 SET_REGS_MATCHED ();
3611 /* The beginning of a group is represented by start_memory.
3612 The arguments are the register number in the next byte, and the
3613 number of groups inner to this one in the next. The text
3614 matched within the group is recorded (in the internal
3615 registers data structure) under the register number. */
3617 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
3619 /* Find out if this group can match the empty string. */
3620 p1 = p; /* To send to group_match_null_string_p. */
3622 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
3623 REG_MATCH_NULL_STRING_P (reg_info[*p])
3624 = group_match_null_string_p (&p1, pend, reg_info);
3626 /* Save the position in the string where we were the last time
3627 we were at this open-group operator in case the group is
3628 operated upon by a repetition operator, e.g., with `(a*)*b'
3629 against `ab'; then we want to ignore where we are now in
3630 the string in case this attempt to match fails. */
3631 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3632 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
3634 DEBUG_PRINT2 (" old_regstart: %d\n",
3635 POINTER_TO_OFFSET (old_regstart[*p]));
3638 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
3640 IS_ACTIVE (reg_info[*p]) = 1;
3641 MATCHED_SOMETHING (reg_info[*p]) = 0;
3643 /* This is the new highest active register. */
3644 highest_active_reg = *p;
3646 /* If nothing was active before, this is the new lowest active
3648 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3649 lowest_active_reg = *p;
3651 /* Move past the register number and inner group count. */
3656 /* The stop_memory opcode represents the end of a group. Its
3657 arguments are the same as start_memory's: the register
3658 number, and the number of inner groups. */
3660 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
3662 /* We need to save the string position the last time we were at
3663 this close-group operator in case the group is operated
3664 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
3665 against `aba'; then we want to ignore where we are now in
3666 the string in case this attempt to match fails. */
3667 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3668 ? REG_UNSET (regend[*p]) ? d : regend[*p]
3670 DEBUG_PRINT2 (" old_regend: %d\n",
3671 POINTER_TO_OFFSET (old_regend[*p]));
3674 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
3676 /* This register isn't active anymore. */
3677 IS_ACTIVE (reg_info[*p]) = 0;
3679 /* If this was the only register active, nothing is active
3681 if (lowest_active_reg == highest_active_reg)
3683 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3684 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3687 { /* We must scan for the new highest active register, since
3688 it isn't necessarily one less than now: consider
3689 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
3690 new highest active register is 1. */
3691 unsigned char r = *p - 1;
3692 while (r > 0 && !IS_ACTIVE (reg_info[r]))
3695 /* If we end up at register zero, that means that we saved
3696 the registers as the result of an `on_failure_jump', not
3697 a `start_memory', and we jumped to past the innermost
3698 `stop_memory'. For example, in ((.)*) we save
3699 registers 1 and 2 as a result of the *, but when we pop
3700 back to the second ), we are at the stop_memory 1.
3701 Thus, nothing is active. */
3704 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3705 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3708 highest_active_reg = r;
3711 /* If just failed to match something this time around with a
3712 group that's operated on by a repetition operator, try to
3713 force exit from the ``loop'', and restore the register
3714 information for this group that we had before trying this
3716 if ((!MATCHED_SOMETHING (reg_info[*p])
3717 || (re_opcode_t) p[-3] == start_memory)
3720 boolean is_a_jump_n = false;
3724 switch ((re_opcode_t) *p1++)
3728 case pop_failure_jump:
3729 case maybe_pop_jump:
3731 case dummy_failure_jump:
3732 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3742 /* If the next operation is a jump backwards in the pattern
3743 to an on_failure_jump right before the start_memory
3744 corresponding to this stop_memory, exit from the loop
3745 by forcing a failure after pushing on the stack the
3746 on_failure_jump's jump in the pattern, and d. */
3747 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
3748 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
3750 /* If this group ever matched anything, then restore
3751 what its registers were before trying this last
3752 failed match, e.g., with `(a*)*b' against `ab' for
3753 regstart[1], and, e.g., with `((a*)*(b*)*)*'
3754 against `aba' for regend[3].
3756 Also restore the registers for inner groups for,
3757 e.g., `((a*)(b*))*' against `aba' (register 3 would
3758 otherwise get trashed). */
3760 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
3764 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
3766 /* Restore this and inner groups' (if any) registers. */
3767 for (r = *p; r < *p + *(p + 1); r++)
3769 regstart[r] = old_regstart[r];
3771 /* xx why this test? */
3772 if ((int) old_regend[r] >= (int) regstart[r])
3773 regend[r] = old_regend[r];
3777 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3778 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
3784 /* Move past the register number and the inner group count. */
3789 /* \<digit> has been turned into a `duplicate' command which is
3790 followed by the numeric value of <digit> as the register number. */
3793 register const char *d2, *dend2;
3794 int regno = *p++; /* Get which register to match against. */
3795 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
3797 /* Can't back reference a group which we've never matched. */
3798 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
3801 /* Where in input to try to start matching. */
3802 d2 = regstart[regno];
3804 /* Where to stop matching; if both the place to start and
3805 the place to stop matching are in the same string, then
3806 set to the place to stop, otherwise, for now have to use
3807 the end of the first string. */
3809 dend2 = ((FIRST_STRING_P (regstart[regno])
3810 == FIRST_STRING_P (regend[regno]))
3811 ? regend[regno] : end_match_1);
3814 /* If necessary, advance to next segment in register
3818 if (dend2 == end_match_2) break;
3819 if (dend2 == regend[regno]) break;
3821 /* End of string1 => advance to string2. */
3823 dend2 = regend[regno];
3825 /* At end of register contents => success */
3826 if (d2 == dend2) break;
3828 /* If necessary, advance to next segment in data. */
3831 /* How many characters left in this segment to match. */
3834 /* Want how many consecutive characters we can match in
3835 one shot, so, if necessary, adjust the count. */
3836 if (mcnt > dend2 - d2)
3839 /* Compare that many; failure if mismatch, else move
3842 ? bcmp_translate (d, d2, mcnt, translate)
3843 : bcmp (d, d2, mcnt))
3845 d += mcnt, d2 += mcnt;
3851 /* begline matches the empty string at the beginning of the string
3852 (unless `not_bol' is set in `bufp'), and, if
3853 `newline_anchor' is set, after newlines. */
3855 DEBUG_PRINT1 ("EXECUTING begline.\n");
3857 if (AT_STRINGS_BEG (d))
3859 if (!bufp->not_bol) break;
3861 else if (d[-1] == '\n' && bufp->newline_anchor)
3865 /* In all other cases, we fail. */
3869 /* endline is the dual of begline. */
3871 DEBUG_PRINT1 ("EXECUTING endline.\n");
3873 if (AT_STRINGS_END (d))
3875 if (!bufp->not_eol) break;
3878 /* We have to ``prefetch'' the next character. */
3879 else if ((d == end1 ? *string2 : *d) == '\n'
3880 && bufp->newline_anchor)
3887 /* Match at the very beginning of the data. */
3889 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
3890 if (AT_STRINGS_BEG (d))
3895 /* Match at the very end of the data. */
3897 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
3898 if (AT_STRINGS_END (d))
3903 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
3904 pushes NULL as the value for the string on the stack. Then
3905 `pop_failure_point' will keep the current value for the
3906 string, instead of restoring it. To see why, consider
3907 matching `foo\nbar' against `.*\n'. The .* matches the foo;
3908 then the . fails against the \n. But the next thing we want
3909 to do is match the \n against the \n; if we restored the
3910 string value, we would be back at the foo.
3912 Because this is used only in specific cases, we don't need to
3913 check all the things that `on_failure_jump' does, to make
3914 sure the right things get saved on the stack. Hence we don't
3915 share its code. The only reason to push anything on the
3916 stack at all is that otherwise we would have to change
3917 `anychar's code to do something besides goto fail in this
3918 case; that seems worse than this. */
3919 case on_failure_keep_string_jump:
3920 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
3922 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3923 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
3925 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
3929 /* Uses of on_failure_jump:
3931 Each alternative starts with an on_failure_jump that points
3932 to the beginning of the next alternative. Each alternative
3933 except the last ends with a jump that in effect jumps past
3934 the rest of the alternatives. (They really jump to the
3935 ending jump of the following alternative, because tensioning
3936 these jumps is a hassle.)
3938 Repeats start with an on_failure_jump that points past both
3939 the repetition text and either the following jump or
3940 pop_failure_jump back to this on_failure_jump. */
3941 case on_failure_jump:
3943 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
3945 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3946 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
3948 /* If this on_failure_jump comes right before a group (i.e.,
3949 the original * applied to a group), save the information
3950 for that group and all inner ones, so that if we fail back
3951 to this point, the group's information will be correct.
3952 For example, in \(a*\)*\1, we need the preceding group,
3953 and in \(\(a*\)b*\)\2, we need the inner group. */
3955 /* We can't use `p' to check ahead because we push
3956 a failure point to `p + mcnt' after we do this. */
3959 /* We need to skip no_op's before we look for the
3960 start_memory in case this on_failure_jump is happening as
3961 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
3963 while (p1 < pend && (re_opcode_t) *p1 == no_op)
3966 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
3968 /* We have a new highest active register now. This will
3969 get reset at the start_memory we are about to get to,
3970 but we will have saved all the registers relevant to
3971 this repetition op, as described above. */
3972 highest_active_reg = *(p1 + 1) + *(p1 + 2);
3973 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3974 lowest_active_reg = *(p1 + 1);
3977 DEBUG_PRINT1 (":\n");
3978 PUSH_FAILURE_POINT (p + mcnt, d, -2);
3982 /* A smart repeat ends with `maybe_pop_jump'.
3983 We change it to either `pop_failure_jump' or `jump'. */
3984 case maybe_pop_jump:
3985 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3986 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
3988 register unsigned char *p2 = p;
3990 /* Compare the beginning of the repeat with what in the
3991 pattern follows its end. If we can establish that there
3992 is nothing that they would both match, i.e., that we
3993 would have to backtrack because of (as in, e.g., `a*a')
3994 then we can change to pop_failure_jump, because we'll
3995 never have to backtrack.
3997 This is not true in the case of alternatives: in
3998 `(a|ab)*' we do need to backtrack to the `ab' alternative
3999 (e.g., if the string was `ab'). But instead of trying to
4000 detect that here, the alternative has put on a dummy
4001 failure point which is what we will end up popping. */
4003 /* Skip over open/close-group commands. */
4004 while (p2 + 2 < pend
4005 && ((re_opcode_t) *p2 == stop_memory
4006 || (re_opcode_t) *p2 == start_memory))
4007 p2 += 3; /* Skip over args, too. */
4009 /* If we're at the end of the pattern, we can change. */
4012 /* Consider what happens when matching ":\(.*\)"
4013 against ":/". I don't really understand this code
4015 p[-3] = (unsigned char) pop_failure_jump;
4017 (" End of pattern: change to `pop_failure_jump'.\n");
4020 else if ((re_opcode_t) *p2 == exactn
4021 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4023 register unsigned char c
4024 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4027 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4028 to the `maybe_finalize_jump' of this case. Examine what
4030 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4032 p[-3] = (unsigned char) pop_failure_jump;
4033 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4037 else if ((re_opcode_t) p1[3] == charset
4038 || (re_opcode_t) p1[3] == charset_not)
4040 int not = (re_opcode_t) p1[3] == charset_not;
4042 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4043 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4046 /* `not' is equal to 1 if c would match, which means
4047 that we can't change to pop_failure_jump. */
4050 p[-3] = (unsigned char) pop_failure_jump;
4051 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4056 p -= 2; /* Point at relative address again. */
4057 if ((re_opcode_t) p[-1] != pop_failure_jump)
4059 p[-1] = (unsigned char) jump;
4060 DEBUG_PRINT1 (" Match => jump.\n");
4061 goto unconditional_jump;
4063 /* Note fall through. */
4066 /* The end of a simple repeat has a pop_failure_jump back to
4067 its matching on_failure_jump, where the latter will push a
4068 failure point. The pop_failure_jump takes off failure
4069 points put on by this pop_failure_jump's matching
4070 on_failure_jump; we got through the pattern to here from the
4071 matching on_failure_jump, so didn't fail. */
4072 case pop_failure_jump:
4074 /* We need to pass separate storage for the lowest and
4075 highest registers, even though we don't care about the
4076 actual values. Otherwise, we will restore only one
4077 register from the stack, since lowest will == highest in
4078 `pop_failure_point'. */
4079 unsigned dummy_low_reg, dummy_high_reg;
4080 unsigned char *pdummy;
4083 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4084 POP_FAILURE_POINT (sdummy, pdummy,
4085 dummy_low_reg, dummy_high_reg,
4086 reg_dummy, reg_dummy, reg_info_dummy);
4088 /* Note fall through. */
4091 /* Unconditionally jump (without popping any failure points). */
4094 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4095 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4096 p += mcnt; /* Do the jump. */
4097 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4101 /* We need this opcode so we can detect where alternatives end
4102 in `group_match_null_string_p' et al. */
4104 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4105 goto unconditional_jump;
4108 /* Normally, the on_failure_jump pushes a failure point, which
4109 then gets popped at pop_failure_jump. We will end up at
4110 pop_failure_jump, also, and with a pattern of, say, `a+', we
4111 are skipping over the on_failure_jump, so we have to push
4112 something meaningless for pop_failure_jump to pop. */
4113 case dummy_failure_jump:
4114 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4115 /* It doesn't matter what we push for the string here. What
4116 the code at `fail' tests is the value for the pattern. */
4117 PUSH_FAILURE_POINT (0, 0, -2);
4118 goto unconditional_jump;
4121 /* At the end of an alternative, we need to push a dummy failure
4122 point in case we are followed by a `pop_failure_jump', because
4123 we don't want the failure point for the alternative to be
4124 popped. For example, matching `(a|ab)*' against `aab'
4125 requires that we match the `ab' alternative. */
4126 case push_dummy_failure:
4127 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4128 /* See comments just above at `dummy_failure_jump' about the
4130 PUSH_FAILURE_POINT (0, 0, -2);
4133 /* Have to succeed matching what follows at least n times.
4134 After that, handle like `on_failure_jump'. */
4136 EXTRACT_NUMBER (mcnt, p + 2);
4137 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4140 /* Originally, this is how many times we HAVE to succeed. */
4145 STORE_NUMBER_AND_INCR (p, mcnt);
4146 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
4150 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4151 p[2] = (unsigned char) no_op;
4152 p[3] = (unsigned char) no_op;
4158 EXTRACT_NUMBER (mcnt, p + 2);
4159 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4161 /* Originally, this is how many times we CAN jump. */
4165 STORE_NUMBER (p + 2, mcnt);
4166 goto unconditional_jump;
4168 /* If don't have to jump any more, skip over the rest of command. */
4175 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4177 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4179 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4180 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4181 STORE_NUMBER (p1, mcnt);
4186 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4187 if (AT_WORD_BOUNDARY (d))
4192 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4193 if (AT_WORD_BOUNDARY (d))
4198 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4199 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4204 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4205 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4206 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4213 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4214 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
4219 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4220 if (PTR_CHAR_POS ((unsigned char *) d) != point)
4225 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4226 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
4229 #else /* not emacs19 */
4231 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4232 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point)
4235 #endif /* not emacs19 */
4238 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4243 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4247 if (SYNTAX (*d++) != (enum syntaxcode) mcnt)
4249 SET_REGS_MATCHED ();
4253 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4255 goto matchnotsyntax;
4258 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4262 if (SYNTAX (*d++) == (enum syntaxcode) mcnt)
4264 SET_REGS_MATCHED ();
4267 #else /* not emacs */
4269 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4271 if (!WORDCHAR_P (d))
4273 SET_REGS_MATCHED ();
4278 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4282 SET_REGS_MATCHED ();
4285 #endif /* not emacs */
4290 continue; /* Successfully executed one pattern command; keep going. */
4293 /* We goto here if a matching operation fails. */
4295 if (!FAIL_STACK_EMPTY ())
4296 { /* A restart point is known. Restore to that state. */
4297 DEBUG_PRINT1 ("\nFAIL:\n");
4298 POP_FAILURE_POINT (d, p,
4299 lowest_active_reg, highest_active_reg,
4300 regstart, regend, reg_info);
4302 /* If this failure point is a dummy, try the next one. */
4306 /* If we failed to the end of the pattern, don't examine *p. */
4310 boolean is_a_jump_n = false;
4312 /* If failed to a backwards jump that's part of a repetition
4313 loop, need to pop this failure point and use the next one. */
4314 switch ((re_opcode_t) *p)
4318 case maybe_pop_jump:
4319 case pop_failure_jump:
4322 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4325 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4327 && (re_opcode_t) *p1 == on_failure_jump))
4335 if (d >= string1 && d <= end1)
4339 break; /* Matching at this starting point really fails. */
4343 goto restore_best_regs;
4347 return -1; /* Failure to match. */
4350 /* Subroutine definitions for re_match_2. */
4353 /* We are passed P pointing to a register number after a start_memory.
4355 Return true if the pattern up to the corresponding stop_memory can
4356 match the empty string, and false otherwise.
4358 If we find the matching stop_memory, sets P to point to one past its number.
4359 Otherwise, sets P to an undefined byte less than or equal to END.
4361 We don't handle duplicates properly (yet). */
4364 group_match_null_string_p (p, end, reg_info)
4365 unsigned char **p, *end;
4366 register_info_type *reg_info;
4369 /* Point to after the args to the start_memory. */
4370 unsigned char *p1 = *p + 2;
4374 /* Skip over opcodes that can match nothing, and return true or
4375 false, as appropriate, when we get to one that can't, or to the
4376 matching stop_memory. */
4378 switch ((re_opcode_t) *p1)
4380 /* Could be either a loop or a series of alternatives. */
4381 case on_failure_jump:
4383 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4385 /* If the next operation is not a jump backwards in the
4390 /* Go through the on_failure_jumps of the alternatives,
4391 seeing if any of the alternatives cannot match nothing.
4392 The last alternative starts with only a jump,
4393 whereas the rest start with on_failure_jump and end
4394 with a jump, e.g., here is the pattern for `a|b|c':
4396 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4397 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4400 So, we have to first go through the first (n-1)
4401 alternatives and then deal with the last one separately. */
4404 /* Deal with the first (n-1) alternatives, which start
4405 with an on_failure_jump (see above) that jumps to right
4406 past a jump_past_alt. */
4408 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
4410 /* `mcnt' holds how many bytes long the alternative
4411 is, including the ending `jump_past_alt' and
4414 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4418 /* Move to right after this alternative, including the
4422 /* Break if it's the beginning of an n-th alternative
4423 that doesn't begin with an on_failure_jump. */
4424 if ((re_opcode_t) *p1 != on_failure_jump)
4427 /* Still have to check that it's not an n-th
4428 alternative that starts with an on_failure_jump. */
4430 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4431 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
4433 /* Get to the beginning of the n-th alternative. */
4439 /* Deal with the last alternative: go back and get number
4440 of the `jump_past_alt' just before it. `mcnt' contains
4441 the length of the alternative. */
4442 EXTRACT_NUMBER (mcnt, p1 - 2);
4444 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4447 p1 += mcnt; /* Get past the n-th alternative. */
4453 assert (p1[1] == **p);
4459 if (!common_op_match_null_string_p (&p1, end, reg_info))
4462 } /* while p1 < end */
4465 } /* group_match_null_string_p */
4468 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4469 It expects P to be the first byte of a single alternative and END one
4470 byte past the last. The alternative can contain groups. */
4473 alt_match_null_string_p (p, end, reg_info)
4474 unsigned char *p, *end;
4475 register_info_type *reg_info;
4478 unsigned char *p1 = p;
4482 /* Skip over opcodes that can match nothing, and break when we get
4483 to one that can't. */
4485 switch ((re_opcode_t) *p1)
4488 case on_failure_jump:
4490 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4495 if (!common_op_match_null_string_p (&p1, end, reg_info))
4498 } /* while p1 < end */
4501 } /* alt_match_null_string_p */
4504 /* Deals with the ops common to group_match_null_string_p and
4505 alt_match_null_string_p.
4507 Sets P to one after the op and its arguments, if any. */
4510 common_op_match_null_string_p (p, end, reg_info)
4511 unsigned char **p, *end;
4512 register_info_type *reg_info;
4517 unsigned char *p1 = *p;
4519 switch ((re_opcode_t) *p1++)
4539 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
4540 ret = group_match_null_string_p (&p1, end, reg_info);
4542 /* Have to set this here in case we're checking a group which
4543 contains a group and a back reference to it. */
4545 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
4546 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
4552 /* If this is an optimized succeed_n for zero times, make the jump. */
4554 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4562 /* Get to the number of times to succeed. */
4564 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4569 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4577 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
4585 /* All other opcodes mean we cannot match the empty string. */
4591 } /* common_op_match_null_string_p */
4594 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
4595 bytes; nonzero otherwise. */
4598 bcmp_translate (s1, s2, len, translate)
4599 unsigned char *s1, *s2;
4603 register unsigned char *p1 = s1, *p2 = s2;
4606 if (translate[*p1++] != translate[*p2++]) return 1;
4612 /* Entry points for GNU code. */
4614 /* re_compile_pattern is the GNU regular expression compiler: it
4615 compiles PATTERN (of length SIZE) and puts the result in BUFP.
4616 Returns 0 if the pattern was valid, otherwise an error string.
4618 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
4619 are set in BUFP on entry.
4621 We call regex_compile to do the actual compilation. */
4624 re_compile_pattern (pattern, length, bufp)
4625 const char *pattern;
4627 struct re_pattern_buffer *bufp;
4631 /* GNU code is written to assume at least RE_NREGS registers will be set
4632 (and at least one extra will be -1). */
4633 bufp->regs_allocated = REGS_UNALLOCATED;
4635 /* And GNU code determines whether or not to get register information
4636 by passing null for the REGS argument to re_match, etc., not by
4640 /* Match anchors at newline. */
4641 bufp->newline_anchor = 1;
4643 ret = regex_compile (pattern, length, re_syntax_options, bufp);
4645 return re_error_msg[(int) ret];
4648 /* Entry points compatible with 4.2 BSD regex library. We don't define
4649 them if this is an Emacs or POSIX compilation. */
4651 #if !defined (emacs) && !defined (_POSIX_SOURCE)
4653 /* BSD has one and only one pattern buffer. */
4654 static struct re_pattern_buffer re_comp_buf;
4664 if (!re_comp_buf.buffer)
4665 return "No previous regular expression";
4669 if (!re_comp_buf.buffer)
4671 re_comp_buf.buffer = (unsigned char *) malloc (200);
4672 if (re_comp_buf.buffer == NULL)
4673 return "Memory exhausted";
4674 re_comp_buf.allocated = 200;
4676 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
4677 if (re_comp_buf.fastmap == NULL)
4678 return "Memory exhausted";
4681 /* Since `re_exec' always passes NULL for the `regs' argument, we
4682 don't need to initialize the pattern buffer fields which affect it. */
4684 /* Match anchors at newlines. */
4685 re_comp_buf.newline_anchor = 1;
4687 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
4689 /* Yes, we're discarding `const' here. */
4690 return (char *) re_error_msg[(int) ret];
4698 const int len = strlen (s);
4700 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
4702 #endif /* not emacs and not _POSIX_SOURCE */
4704 /* POSIX.2 functions. Don't define these for Emacs. */
4708 /* regcomp takes a regular expression as a string and compiles it.
4710 PREG is a regex_t *. We do not expect any fields to be initialized,
4711 since POSIX says we shouldn't. Thus, we set
4713 `buffer' to the compiled pattern;
4714 `used' to the length of the compiled pattern;
4715 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
4716 REG_EXTENDED bit in CFLAGS is set; otherwise, to
4717 RE_SYNTAX_POSIX_BASIC;
4718 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
4719 `fastmap' and `fastmap_accurate' to zero;
4720 `re_nsub' to the number of subexpressions in PATTERN.
4722 PATTERN is the address of the pattern string.
4724 CFLAGS is a series of bits which affect compilation.
4726 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
4727 use POSIX basic syntax.
4729 If REG_NEWLINE is set, then . and [^...] don't match newline.
4730 Also, regexec will try a match beginning after every newline.
4732 If REG_ICASE is set, then we considers upper- and lowercase
4733 versions of letters to be equivalent when matching.
4735 If REG_NOSUB is set, then when PREG is passed to regexec, that
4736 routine will report only success or failure, and nothing about the
4739 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
4740 the return codes and their meanings.) */
4743 regcomp (preg, pattern, cflags)
4745 const char *pattern;
4750 = (cflags & REG_EXTENDED) ?
4751 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
4753 /* regex_compile will allocate the space for the compiled pattern. */
4755 preg->allocated = 0;
4757 /* Don't bother to use a fastmap when searching. This simplifies the
4758 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
4759 characters after newlines into the fastmap. This way, we just try
4763 if (cflags & REG_ICASE)
4767 preg->translate = (char *) malloc (CHAR_SET_SIZE);
4768 if (preg->translate == NULL)
4769 return (int) REG_ESPACE;
4771 /* Map uppercase characters to corresponding lowercase ones. */
4772 for (i = 0; i < CHAR_SET_SIZE; i++)
4773 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
4776 preg->translate = NULL;
4778 /* If REG_NEWLINE is set, newlines are treated differently. */
4779 if (cflags & REG_NEWLINE)
4780 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
4781 syntax &= ~RE_DOT_NEWLINE;
4782 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
4783 /* It also changes the matching behavior. */
4784 preg->newline_anchor = 1;
4787 preg->newline_anchor = 0;
4789 preg->no_sub = !!(cflags & REG_NOSUB);
4791 /* POSIX says a null character in the pattern terminates it, so we
4792 can use strlen here in compiling the pattern. */
4793 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
4795 /* POSIX doesn't distinguish between an unmatched open-group and an
4796 unmatched close-group: both are REG_EPAREN. */
4797 if (ret == REG_ERPAREN) ret = REG_EPAREN;
4803 /* regexec searches for a given pattern, specified by PREG, in the
4806 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
4807 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
4808 least NMATCH elements, and we set them to the offsets of the
4809 corresponding matched substrings.
4811 EFLAGS specifies `execution flags' which affect matching: if
4812 REG_NOTBOL is set, then ^ does not match at the beginning of the
4813 string; if REG_NOTEOL is set, then $ does not match at the end.
4815 We return 0 if we find a match and REG_NOMATCH if not. */
4818 regexec (preg, string, nmatch, pmatch, eflags)
4819 const regex_t *preg;
4822 regmatch_t pmatch[];
4826 struct re_registers regs;
4827 regex_t private_preg;
4828 int len = strlen (string);
4829 boolean want_reg_info = !preg->no_sub && nmatch > 0;
4831 private_preg = *preg;
4833 private_preg.not_bol = !!(eflags & REG_NOTBOL);
4834 private_preg.not_eol = !!(eflags & REG_NOTEOL);
4836 /* The user has told us exactly how many registers to return
4837 information about, via `nmatch'. We have to pass that on to the
4838 matching routines. */
4839 private_preg.regs_allocated = REGS_FIXED;
4843 regs.num_regs = nmatch;
4844 regs.start = TALLOC (nmatch, regoff_t);
4845 regs.end = TALLOC (nmatch, regoff_t);
4846 if (regs.start == NULL || regs.end == NULL)
4847 return (int) REG_NOMATCH;
4850 /* Perform the searching operation. */
4851 ret = re_search (&private_preg, string, len,
4852 /* start: */ 0, /* range: */ len,
4853 want_reg_info ? ®s : (struct re_registers *) 0);
4855 /* Copy the register information to the POSIX structure. */
4862 for (r = 0; r < nmatch; r++)
4864 pmatch[r].rm_so = regs.start[r];
4865 pmatch[r].rm_eo = regs.end[r];
4869 /* If we needed the temporary register info, free the space now. */
4874 /* We want zero return to mean success, unlike `re_search'. */
4875 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
4879 /* Returns a message corresponding to an error code, ERRCODE, returned
4880 from either regcomp or regexec. We don't use PREG here. */
4883 regerror (errcode, preg, errbuf, errbuf_size)
4885 const regex_t *preg;
4893 || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0])))
4894 /* Only error codes returned by the rest of the code should be passed
4895 to this routine. If we are given anything else, or if other regex
4896 code generates an invalid error code, then the program has a bug.
4897 Dump core so we can fix it. */
4900 msg = re_error_msg[errcode];
4902 /* POSIX doesn't require that we do anything in this case, but why
4907 msg_size = strlen (msg) + 1; /* Includes the null. */
4909 if (errbuf_size != 0)
4911 if (msg_size > errbuf_size)
4913 strncpy (errbuf, msg, errbuf_size - 1);
4914 errbuf[errbuf_size - 1] = 0;
4917 strcpy (errbuf, msg);
4924 /* Free dynamically allocated space used by PREG. */
4930 if (preg->buffer != NULL)
4931 free (preg->buffer);
4932 preg->buffer = NULL;
4934 preg->allocated = 0;
4937 if (preg->fastmap != NULL)
4938 free (preg->fastmap);
4939 preg->fastmap = NULL;
4940 preg->fastmap_accurate = 0;
4942 if (preg->translate != NULL)
4943 free (preg->translate);
4944 preg->translate = NULL;
4947 #endif /* not emacs */
4951 make-backup-files: t
4953 trim-versions-without-asking: nil