5 Author: Pekka Riikonen <priikone@silcnet.org>
7 Copyright (C) 2007 - 2008 Pekka Riikonen
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; version 2 of the License.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
20 /****h* silcutil/SILC Regular Expression Interface
24 * SILC regular expression interface provides Unix and POSIX compliant
25 * regular expression compilation and matching.
27 * The interface also provides many convenience functions to make the use
28 * of regular expressions easier. In particular, silc_regex provides a very
29 * simple way to match strings against regular expressions and to get the
30 * exact match or matches in return.
32 * The regex syntax follows POSIX regex syntax:
35 * ^ Match start of line/string
36 * '^a' matches 'ab' but not 'ba'
37 * $ Match end of line/string
38 * 'a$' matches 'ba' but not 'ab'
39 * . Match any single character (except new line (\n))
40 * '.a' matches 'ba' but not 'a'
41 * + Preceding item is matched one or more times
42 * 'a+b' matches 'aaab' but not 'b'
43 * * Preceding item is matched zero or more times
44 * 'a*b' matches 'ab', 'aab' and 'b'
45 * ? Preceding item is matched zero or one time
46 * 'ca?b' matches 'cb' and 'cab' but not 'caab'
47 * | Joins two expressions and matches either of them (OR)
48 * 'foo|bar' matches 'foo' or 'bar'
49 * {n} Preceding item is matched exactly n times (n can be 0-255)
50 * 'a{2}' matches 'aa' but not 'aaa'
51 * {n,} Preceding item is matched n or more times
52 * 'a{2,}' matches 'aa' and 'aaaa' but not 'a'
53 * {n,m} Preceding item is matched at least n times and at most m times
54 * 'a{2,4}' matches 'aa', 'aaa' and 'aaaa' but not 'aaaaa'
55 * [ ] Match any single character in the character list inside [ ]
56 * '[0123]' matches only '0', '1', '2' or '3'
57 * [ - ] Match any single character in the specified range
58 * '[0-5]' matches digits 0-5.
59 * [^ ] Match any character not in the character list or range
60 * '[^09]' matches any other character except '0' and '9'
61 * ( ) Subexpression, grouping
63 * Escaping (C-language style, '\' is written as '\\'):
64 * \\ Considers following character literal ('\\{' is '{')
65 * \\\\ Matches literal \
66 * \a Matches bell (BEL)
67 * \t Matches horizontal tab (HT)
68 * \n Matches new line (LF)
69 * \v Matches vertical tab (VT)
70 * \f Matches form feed (FF)
71 * \r Matches carriage ret (CR)
72 * \\< Match null string at the start of a word
73 * \\> Match null string at the end of a word
74 * \\b Match null string at the edge of a word
75 * \\B Match null string when not at the edge of a word
79 * SilcRegexStruct reg;
81 * // Compile regular expression
82 * if (!silc_regex_compile(&reg, "foo[0-9]*", 0))
85 * // Match string against the compiled regex
86 * if (!silc_regex_match(&reg, "foo20", 5, 0, NULL, 0))
89 * // Free the compiled regular expression
90 * silc_regex_free(&reg);
97 /****s* silcutil/SilcRegexAPI/SilcRegex
101 * typedef struct { ... } *SilcRegex, SilcRegexStruct;
105 * The regular expression context. This context is given as argument
106 * to all silc_regex_* functions. It is usually statically allocated
107 * but can be dynamically allocated by silc_malloc.
/* Regular expression context.  Holds the compiled pattern together with
   the fastmap, translation table and flags that describe it.  This is the
   context given as argument to the silc_regex_* functions. */
110 typedef struct SilcRegexObject {
111 SilcStack rstack; /* Stack for fast allocations */
112 unsigned char *buffer; /* Compiled pattern */
113 char *fastmap; /* fastmap[ch] is true if ch can start the pattern */
114 char *translate; /* Translation to apply during compile/match */
115 int allocated; /* Allocated size of the compiled pattern */
116 int used; /* Actual length of the compiled pattern */
117 int num_registers; /* Number of registers used */
118 char fastmap_accurate; /* True if fastmap is valid */
119 char can_be_null; /* True if the pattern can match the empty string */
120 char uses_registers; /* Registers are used and need to be initialized */
121 char anchor; /* Anchor: 0=none 1=begline 2=begbuf (presumably beginning
                   of line / beginning of buffer -- confirm in the
                   implementation) */
122 } *SilcRegex, SilcRegexStruct;
124 /****s* silcutil/SilcRegexAPI/SilcRegexMatch
128 * typedef struct { ... } *SilcRegexMatch, SilcRegexMatchStruct;
132 * The regular expression match context that provides information on the
133 * found match. It provides the start offset and end offset of the
/* One matched region, expressed as offsets into the string that was given
   to silc_regex_match.  Unused entries in a match array have -1 as the
   offset values. */
138 typedef struct SilcRegexMatchObject {
139 int start; /* Start offset of region */
140 int end; /* End offset of region; `end - start' is the match length */
141 } *SilcRegexMatch, SilcRegexMatchStruct;
144 /****d* silcutil/SilcRegexAPI/SilcRegexFlags
148 * typedef enum { ... } SilcRegexFlags;
152 * Regular expression feature flags.
157 SILC_REGEX_DEFAULT = 0x00000000,
159 /* The following flags can be used with silc_regex_match */
161 /* The beginning-of-line (^) always fails to match. This can be useful
162 when beginning of a string should not be interpreted as the beginning
164 SILC_REGEX_NOTBOL = 0x00010000,
166 /* The end-of-line ($) always fails to match. */
167 SILC_REGEX_NOTEOL = 0x00020000,
171 /****f* silcutil/SilcRegexAPI/silc_regex_compile
175 * SilcBool silc_regex_compile(SilcRegex regexp, const char *regex,
176 * SilcRegexFlags flags);
180 * Compiles the regular expression string `regex'. The `regexp' is a
181 * pre-allocated regular expression context. The `flags' define
182 * various feature flags. This function must be called before the
183 * silc_regex_match can be used to find matches.
185 * Returns TRUE after the compilation is completed. Returns FALSE on
186 * error and sets silc_errno.
/* Compiles the regular expression string `regex' into the pre-allocated
   context `regexp'.  Returns TRUE when compilation is completed; on error
   returns FALSE and sets silc_errno. */
189 SilcBool silc_regex_compile(SilcRegex regexp, const char *regex,
190 SilcRegexFlags flags);
192 /****f* silcutil/SilcRegexAPI/silc_regex_match
196 * SilcBool silc_regex_match(SilcRegex regexp, const char *string,
197 * SilcUInt32 string_len, SilcUInt32 num_match,
198 * SilcRegexMatch match, SilcRegexFlags flags);
202 * Finds one or more matches from the `string' using the pre-compiled
203 * regular expression `regexp'. It must be compiled by calling the
204 * silc_regex_compile before calling this function. The `flags' defines
205 * various feature flags.
207 * If only one match is needed the `num_match' may be set to 0 and the
208 * `match' is set to NULL. If multiple matches (substrings) are needed the
209 * `num_match' defines the size of the `match' array, where each of the
210 * matches (with parenthesized regular expression) will be stored. The
211 * `match' provides information on where the match was found in `string',
212 * providing the start offset and end offset of the match. Unused entries
213 * in the array will have -1 as the offset values.
215 * Returns TRUE if the string matched the regular expression or FALSE
216 * if it did not match or error occurred. The silc_errno will indicate
217 * the error. The silc_errno is set to SILC_ERR_NOT_FOUND if the regular
218 * expression did not match.
222 * // Find first match (check if string matches)
223 * if (!silc_regex_match(&reg, "foo20", 5, 0, NULL, 0))
226 * // Find multiple matches, one by one
227 * SilcRegexMatchStruct match;
229 * while (silc_regex_match(&reg, string, len, 1, &match, 0)) {
230 * match_string = silc_memdup(string + match.start,
231 * match.end - match.start);
232 * string += match.end;
235 * // Parse URI into its components, available in the match[] array
236 * SilcRegexStruct reg;
237 * SilcRegexMatchStruct match[7];
239 * silc_regex_compile(&reg, "^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", 0);
240 * silc_regex_match(&reg, "http://example.com/page.html", len, 7, match, 0);
/* Matches `string' (of length `string_len') against the compiled `regexp'.
   `num_match' is the size of the `match' array that receives the match
   regions; unused entries get -1 offsets.  Pass 0 and NULL when only the
   TRUE/FALSE result is needed.  Returns FALSE when nothing matched
   (silc_errno is SILC_ERR_NOT_FOUND) or when an error occurred. */
243 SilcBool silc_regex_match(SilcRegex regexp, const char *string,
244 SilcUInt32 string_len, SilcUInt32 num_match,
245 SilcRegexMatch match, SilcRegexFlags flags);
247 /****f* silcutil/SilcRegexAPI/silc_regex_free
251 * void silc_regex_free(SilcRegex regexp);
255 * Frees the compiled regular expression context `regexp'. This must
256 * be called even if `regexp' is statically allocated. If the
257 * silc_regex_compile has been called this function must be called.
/* Frees the compiled regular expression context `regexp'.  Must be called
   whenever silc_regex_compile has been called, even if `regexp' itself is
   statically allocated. */
260 void silc_regex_free(SilcRegex regexp);
262 /****f* silcutil/SilcRegexAPI/silc_regex
266 * SilcBool silc_regex(const char *string, const char *regex,
267 * SilcBuffer match, ...);
271 * Matches the `string' to the regular expression `regex'. Returns TRUE
272 * if the `string' matches the regular expression or FALSE if it does not
273 * match. If it does not match, silc_errno is set to SILC_ERR_NOT_FOUND.
275 * The first (whole) match is returned to `match' buffer if it is non-NULL.
276 * The variable argument list are buffers where multiple matches are
277 * returned in case of group (parenthesized) regular expression. The caller
278 * needs to know how many pointers to provide, in order to get all matches.
279 * If `match' is non-NULL the variable argument list must be ended with
280 * NULL. The data in the `match' and in any other buffer is from `string'
281 * and must not be freed by the caller.
286 * if (!silc_regex("foobar", "foo.", NULL))
289 * // Get the pointer to the first match
290 * if (!silc_regex("foobar", ".bar", &match, NULL))
294 * SilcBufferStruct match, sub1, sub2;
296 * if (!silc_regex("Hello World", "(H..).(o..)", &match, &sub1, &sub2, NULL))
/* Matches `string' against the regular expression string `regex'.  If
   `match' is non-NULL it receives the first (whole) match, and the
   variable arguments are further SilcBuffer pointers that receive the
   group matches; the list must then end with NULL.  Returned buffers
   point into `string' and must not be freed by the caller. */
300 SilcBool silc_regex(const char *string, const char *regex,
301 SilcBuffer match, ...);
303 /****f* silcutil/SilcRegexAPI/silc_regex_buffer
307 * SilcBool silc_regex_buffer(SilcBuffer buffer, const char *regex,
308 * SilcBuffer match, ...);
312 * Same as silc_regex but the string to match is in `buffer'. Returns
313 * TRUE if the string matches and FALSE if it doesn't. See examples and
314 * other information in silc_regex. The `buffer' and `match' may be the
/* Same as silc_regex, but the string to match is taken from `buffer'.
   Returns TRUE if the string matches and FALSE if it does not. */
318 SilcBool silc_regex_buffer(SilcBuffer buffer, const char *regex,
319 SilcBuffer match, ...);
321 /* Backwards support: the old name takes (regex, string).  It expands to
   silc_regex with the two arguments swapped and a NULL `match', so only
   the TRUE/FALSE result is available through this macro. */
322 #define silc_string_regex_match(regex, string) silc_regex(string, regex, NULL)
324 #endif /* SILCREGEX_H */