diff options
Diffstat (limited to 'src/core/librcscripts/simple-regex.c')
-rw-r--r-- | src/core/librcscripts/simple-regex.c | 827 |
1 files changed, 827 insertions, 0 deletions
diff --git a/src/core/librcscripts/simple-regex.c b/src/core/librcscripts/simple-regex.c new file mode 100644 index 0000000..a5a9234 --- /dev/null +++ b/src/core/librcscripts/simple-regex.c @@ -0,0 +1,827 @@ +/* + * simple_regex.c + * + * Simle regex library. + * + * Copyright (C) 2004,2005 Martin Schlemmer <azarah@nosferatu.za.org> + * + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + * $Header$ + */ + +/* + * Some notes: + * + * - This is a very simple regex library (read: return a match if some string + * matches some regex). It is probably not POSIX (if there are a POSIX or + * other standard) compatible. + * + * - I primarily wrote it to _not_ use glibc type regex functions, in case we + * might want to use it in code that have to be linked agaist klibc, etc. + * + * - It really is not optimized in any way yet. + * + * - Supported operators are: + * + * '.', '?', '*', '+' - So called 'wildcards' + * '[a-z]', '[^a-z]' - Basic 'lists'. Note that 'a-z' just specify that + * it supports basic lists as well as sequences .. + * The '^' is for an inverted list of course. + * '^', '$' - The 'from start' and 'to end' operators. If these + * are not used at the start ('^') or end ('$') of the + * regex, they will be treated as normal characters + * (this of course exclude the use of '^' in a 'list'). + * + * - If an invalid argument was passed, the functions returns 0 with + * 'regex_data-match == 0' (no error with no match) rather than -1. It may + * not be consistant with other practices, but I personally do not feel it is + * a critical error for these types of functions, and there are debugging you + * can enable to verify that there are no such issues. + * + * - __somefunction() is usually a helper function for somefunction(). I guess + * recursion might be an alternative, but I try to avoid it. + * + * - In general if we are matching a 'wildcard' ('*', '+' or '?'), a 'word' + * (read: some part of the regex that do not contain a 'wildcard' or 'list') + * will have a greater 'weight' than the 'wildcard'. This means that we + * will only continue to evaluate the 'wildcard' until the following 'word' + * (if any) matches. Currently this do not hold true for a 'list' not + * followed by a 'wildcard' - I might fix this in future. + * + */ + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "debug.h" +#include "misc.h" +#include "simple-regex.h" + +/* Macro to check if a regex_data_t pointer is valid */ +#define CHECK_REGEX_DATA_P(_regex_data, _on_error) \ + do { \ + if ((NULL == _regex_data) \ + || (NULL == _regex_data->data) \ + /* We do not check for this, as it might still \ + * provide a match ('*' or '?' wildcard) */ \ + /* || (0 == strlen(_regex_data->data)) */ \ + || (NULL == _regex_data->regex) \ + || (0 == strlen(_regex_data->regex))) {\ + DBG_MSG("Invalid argument passed!\n"); \ + goto _on_error; \ + } \ + } while (0) + +size_t get_word(const char *regex, char **r_word); +int match_word(regex_data_t *regex_data); +size_t get_list_size(const char *regex); +size_t get_list(const char *regex, char **r_list); +int __match_list(regex_data_t *regex_data); +int match_list(regex_data_t *regex_data); +size_t get_wildcard(const char *regex, char *r_wildcard); +int __match_wildcard(regex_data_t *regex_data, +int (*match_func)(regex_data_t *regex_data), const char *regex); +int match_wildcard(regex_data_t *regex_data); +int __match(regex_data_t *regex_data); + +/* + * Return values for match_* functions + * + * 0 - There was no error. If there was a match, regex_data->match + * - will be > 0 (this is the definitive check - if not true, the + * - other values of the struct may be bogus), regex_data->count + * - will be the amount of data that was matched (might be 0 for + * - some wildcards), and regex_data->r_count will be > 0. + * + * -1 - An error occured. Check errno for more info. + * + */ + +size_t get_word(const char *regex, char **r_word) +{ + char *r_list; + char *tmp_p; + size_t count = 0; + size_t tmp_count; + + /* NULL string means we do not have a word */ + if ((NULL == regex) || (0 == strlen(regex))) { + DBG_MSG("Invalid argument passed!\n"); + return 0; + } + + *r_word = malloc(strlen(regex) + 1); + if (NULL == r_word) { + DBG_MSG("Failed to allocate buffer!\n"); + return 0; + } + tmp_p = *r_word; + + while (strlen(regex) > 0) { + switch (regex[0]) { + case '*': + case '+': + case '?': + /* If its a wildcard, backup one step */ + *(--tmp_p) = '\0'; + count--; + return count; + case '[': + tmp_count = get_list(regex, &r_list); + free(r_list); + /* In theory should not happen, but you never know + * what may happen in future ... */ + if (-1 == tmp_count) + goto error; + + /* Bail if we have a list */ + if (tmp_count > 0) { + tmp_p[0] = '\0'; + return count; + } + default: + *tmp_p++ = *regex++; + count++; + break; + } + } + + tmp_p[0] = '\0'; + + return count; + +error: + free(*r_word); + + return -1; +} + +int match_word(regex_data_t *regex_data) +{ + char *data_p = regex_data->data; + char *r_word = NULL, *r_word_p; + size_t count = 0; + + CHECK_REGEX_DATA_P(regex_data, exit); + + count = get_word(regex_data->regex, &r_word); + if (-1 == count) + goto error; + if (0 == count) + goto exit; + r_word_p = r_word; + + while ((strlen(data_p) > 0) && (strlen(r_word_p) > 0 )) { + /* If 'r_word' is not 100% part of 'string', we do not have + * a match. If its a '.', it matches no matter what. */ + if ((data_p[0] != r_word_p[0]) && ('.' != r_word_p[0])) { + count = 0; + goto exit; + } + + data_p++; + r_word_p++; + } + + /* If 'string' is shorter than 'r_word', we do not have a match */ + if ((0 == strlen(data_p)) && (0 < strlen(r_word_p))) { + count = 0; + goto exit; + } + +exit: + /* Fill in our structure */ + if (0 == count) + regex_data->match = REGEX_NO_MATCH; + else if (strlen(regex_data->data) == count) + regex_data->match = REGEX_FULL_MATCH; + else + regex_data->match = REGEX_PARTIAL_MATCH; + if (regex_data->match != REGEX_NO_MATCH) + regex_data->where = regex_data->data; + else + regex_data->where = NULL; + regex_data->count = count; + regex_data->r_count = count; + + free(r_word); + return 0; + +error: + regex_data->match = REGEX_NO_MATCH; + + free(r_word); + return -1; +} + +size_t get_list_size(const char *regex) +{ + size_t count = 0; + + /* NULL string means we do not have a list */ + if ((NULL == regex) + || (0 == strlen(regex)) + || ('[' != regex[0])) { + DBG_MSG("Invalid argument passed!\n"); + return 0; + } + + regex++; + + while ((strlen(regex) > 0) && (']' != regex[0])) { + /* We have a sequence (x-y) */ + if (('-' == regex[0]) + && (']' != regex[1]) + && (strlen(regex) >= 2) + && (regex[-1] < regex[1])) + { + /* Add current + diff in sequence */ + count += regex[1] - regex[-1]; + /* Take care of '-' and next char */ + regex += 2; + } else { + regex++; + count++; + } + } + + return count; +} + +size_t get_list(const char *regex, char **r_list) +{ + char *tmp_buf = NULL; + size_t count = 0; + size_t size; + + /* NULL string means we do not have a list */ + if ((NULL == regex) || (0 == strlen(regex))) { + DBG_MSG("Invalid argument passed!\n"); + return 0; + } + + /* Bail if we do not have a list. Do not add debugging, as + * it is very noisy (used a lot when we call match_list() in + * __match() and match() to test for list matching) */ + if ('[' != regex[0]) + return 0; + + size = get_list_size(regex); + if (0 == size) { + /* Should not be an issue, but just in case */ + DBG_MSG("0 returned by get_list_size.\n"); + return 0; + } + + *r_list = malloc(size + 1); + if (NULL == *r_list) { + DBG_MSG("Failed to allocate buffer!\n"); + return -1; + } + tmp_buf = *r_list; + + /* Take care of '[' */ + regex++; + count++; + + while ((strlen(regex) > 0) && (']' != regex[0])) { + /* We have a sequence (x-y) */ + if (('-' == regex[0]) + && (']' != regex[1]) + && (strlen(regex) >= 2) + && (regex[-1] < regex[1])) { + /* Fill in missing chars in sequence */ + while (tmp_buf[-1] < regex[1]) { + tmp_buf[0] = (char)(tmp_buf[-1] + 1); + tmp_buf++; + /* We do not increase count */ + } + /* Take care of '-' and next char */ + count += 2; + regex += 2; + } else { + *tmp_buf++ = *regex++; + count++; + } + } + + tmp_buf[0] = '\0'; + /* Take care of ']' */ + count++; + + /* We do not have a list as it does not end in ']' */ + if (']' != regex[0]) { + count = 0; + free(*r_list); + } + + return count; +} + +/* If the first is the '^' character, everything but the list is matched + * NOTE: We only evaluate _ONE_ data character at a time!! */ +int __match_list(regex_data_t *regex_data) +{ + regex_data_t tmp_data; + char *data_p = regex_data->data; + char *list_p = regex_data->regex; + char test_regex[2] = { '\0', '\0' }; + int invert = 0; + int match; + int retval; + + CHECK_REGEX_DATA_P(regex_data, failed); + + if ('^' == list_p[0]) { + /* We need to invert the match */ + invert = 1; + /* Make sure '^' is not part of our list */ + list_p++; + } + + if (invert) + /* All should be a match if not in the list */ + match = 1; + else + /* We only have a match if in the list */ + match = 0; + + while (strlen(list_p) > 0) { + test_regex[0] = list_p[0]; + + FILL_REGEX_DATA(tmp_data, data_p, test_regex); + retval = match_word(&tmp_data); + if (-1 == retval) + goto error; + + if (REGEX_MATCH(tmp_data)) { + if (invert) + /* If we exclude the list from + * characters we try to match, we + * have a match until one of the + * list is found. */ + match = 0; + else + /* If not, we have to keep looking + * until one from the list match + * before we have a match */ + match = 1; + break; + } + list_p++; + } + + /* Fill in our structure */ + if (match) { + regex_data->match = REGEX_PARTIAL_MATCH; + regex_data->where = regex_data->data; + regex_data->count = 1; + /* This one is more cosmetic, as match_list() will + * do the right thing */ + regex_data->r_count = 0; /* strlen(regex_data->regex); */ + } else { +failed: + regex_data->match = REGEX_NO_MATCH; + regex_data->where = NULL; + regex_data->count = 0; + regex_data->r_count = 0; + } + + return 0; + +error: + regex_data->match = REGEX_NO_MATCH; + + return -1; +} + +int match_list(regex_data_t *regex_data) +{ + regex_data_t tmp_data; + char *data_p = regex_data->data; + char *list_p = regex_data->regex; + char *r_list = NULL; + size_t r_count = 0; + int retval; + + CHECK_REGEX_DATA_P(regex_data, failed); + + r_count = get_list(list_p, &r_list); + if (-1 == r_count) + goto error; + if (0 == r_count) + goto failed; + + FILL_REGEX_DATA(tmp_data, data_p, &list_p[r_count-1]); + retval = __match_wildcard(&tmp_data, __match_list, r_list); + if (-1 == retval) + goto error; + if (REGEX_MATCH(tmp_data)) { + /* This should be 2 ('word' + 'wildcard'), so just remove + * the wildcard */ + tmp_data.r_count--; + goto exit; + } + + FILL_REGEX_DATA(tmp_data, data_p, r_list); + retval = __match_list(&tmp_data); + if (-1 == retval) + goto error; + if (REGEX_MATCH(tmp_data)) + goto exit; + +failed: + /* We will fill in regex_data below */ + tmp_data.match = REGEX_NO_MATCH; + tmp_data.where = NULL; + tmp_data.count = 0; + tmp_data.r_count = 0; + +exit: + /* Fill in our structure */ + regex_data->match = tmp_data.match; + regex_data->where = tmp_data.where; + regex_data->count = tmp_data.count; + if (regex_data->match != REGEX_NO_MATCH) + /* tmp_data.r_count for __match_wildcard will take care of the + * wildcard, and tmp_data.r_count for __match_list will be 0 */ + regex_data->r_count = r_count + tmp_data.r_count; + else + regex_data->r_count = 0; + + free(r_list); + return 0; + +error: + regex_data->match = REGEX_NO_MATCH; + + free(r_list); + return -1; +} + +size_t get_wildcard(const char *regex, char *r_wildcard) +{ + /* NULL regex means we do not have a wildcard */ + if ((NULL == regex) || (0 == strlen(regex))) { + DBG_MSG("Invalid argument passed!\n"); + return 0; + } + + r_wildcard[0] = regex[0]; + r_wildcard[2] = '\0'; + + switch (regex[1]) { + case '*': + case '+': + case '?': + r_wildcard[1] = regex[1]; + break; + default: + r_wildcard[0] = '\0'; + return 0; + } + + return strlen(r_wildcard); +} + +int __match_wildcard(regex_data_t *regex_data, int (*match_func)(regex_data_t *regex_data), const char *regex) +{ + regex_data_t tmp_data; + char *data_p = regex_data->data; + char *wildcard_p = regex_data->regex; + char r_wildcard[3]; + size_t count = 0; + size_t r_count = 0; + int is_match = 0; + int retval; + + CHECK_REGEX_DATA_P(regex_data, exit); + + if (NULL == match_func) { + DBG_MSG("NULL match_func was passed!\n"); + goto exit; + } + + r_count = get_wildcard(wildcard_p, r_wildcard); + if (0 == r_count) + goto exit; + + FILL_REGEX_DATA(tmp_data, data_p, (char *)regex); + retval = match_func(&tmp_data); + if (-1 == retval) + goto error; + + switch (r_wildcard[1]) { + case '*': + case '?': + /* '*' and '?' always matches */ + is_match = 1; + case '+': + /* We need to match all of them */ + do { + /* If we have at least one match for '+', or none + * for '*' or '?', check if we have a word or list match. + * We do this because a word weights more than a wildcard */ + if ((strlen(wildcard_p) > 2) + && ((count > 0) + || ('*' == r_wildcard[1]) + || ('?' == r_wildcard[1]))) { + regex_data_t tmp_data2; +#if 0 + printf("data_p = %s, wildcard_p = %s\n", data_p, wildcard_p); +#endif + + FILL_REGEX_DATA(tmp_data2, data_p, &wildcard_p[2]); + retval = match(&tmp_data2); + if (-1 == retval) + goto error; + + if (/* '.' might be a special case ... */ + /* ('.' != wildcard_p[2]) && */ + ((REGEX_MATCH(tmp_data2)) + && (REGEX_FULL_MATCH == tmp_data2.match))) { + goto exit; + } + } + + if (REGEX_MATCH(tmp_data)) { + data_p += tmp_data.count; + count += tmp_data.count; + is_match = 1; + + FILL_REGEX_DATA(tmp_data, data_p, (char *)regex); + retval = match_func(&tmp_data); + if (-1 == retval) + goto error; + } + /* Only once for '?' */ + } while ((REGEX_MATCH(tmp_data)) && ('?' != r_wildcard[1])); + + break; + default: + /* No wildcard */ + break; + } + +exit: + /* Fill in our structure */ + /* We can still have a match ('*' and '?'), although count == 0 */ + if ((0 == count) && (0 == is_match)) + regex_data->match = REGEX_NO_MATCH; + else if (strlen(regex_data->data) == count) + regex_data->match = REGEX_FULL_MATCH; + else + regex_data->match = REGEX_PARTIAL_MATCH; + if (regex_data->match != REGEX_NO_MATCH) + regex_data->where = regex_data->data; + else + regex_data->where = NULL; + regex_data->count = count; + regex_data->r_count = r_count; + + return 0; + +error: + regex_data->match = REGEX_NO_MATCH; + + return -1; +} + +int match_wildcard(regex_data_t *regex_data) +{ + regex_data_t tmp_data; + char *data_p = regex_data->data; + char *wildcard_p = regex_data->regex; + char r_wildcard[3]; + size_t r_count; + int retval; + + CHECK_REGEX_DATA_P(regex_data, failed); + + /* Invalid wildcard - we need a character + a regex operator */ + if (strlen(wildcard_p) < 2) + goto failed; + + r_count = get_wildcard(wildcard_p, r_wildcard); + if (0 == r_count) + goto failed; + + /* Needed so that match_word() will not bail if it sees the wildcard */ + r_wildcard[1] = '\0'; + + FILL_REGEX_DATA(tmp_data, data_p, wildcard_p); + retval = __match_wildcard(&tmp_data, match_word, r_wildcard); + if (-1 == retval) + goto error; + if (REGEX_MATCH(tmp_data)) + goto exit; + +failed: + /* We will fill in regex_data below */ + tmp_data.match = REGEX_NO_MATCH; + tmp_data.where = NULL; + tmp_data.count = 0; + tmp_data.r_count = 0; + +exit: + /* Fill in our structure */ + regex_data->match = tmp_data.match; + regex_data->where = tmp_data.where; + regex_data->count = tmp_data.count; + regex_data->r_count = tmp_data.r_count; + + return 0; + +error: + regex_data->match = REGEX_NO_MATCH; + + return -1; +} + +int __match(regex_data_t *regex_data) +{ + regex_data_t tmp_data; + char *data_p = regex_data->data; + char *regex_p = regex_data->regex; + size_t count = 0; + size_t r_count = 0; + int match = 0; + int retval; + + CHECK_REGEX_DATA_P(regex_data, failed); + + while (strlen(regex_p) > 0) { +#if 0 + printf("data_p = '%s', regex_p = '%s'\n", data_p, regex_p); +#endif + + FILL_REGEX_DATA(tmp_data, data_p, regex_p); + retval = match_list(&tmp_data); + if (-1 == retval) + goto error; + if (REGEX_MATCH(tmp_data)) + goto match; + + FILL_REGEX_DATA(tmp_data, data_p, regex_p); + retval = match_wildcard(&tmp_data); + if (-1 == retval) + goto error; + if (REGEX_MATCH(tmp_data)) + goto match; + + FILL_REGEX_DATA(tmp_data, data_p, regex_p); + retval = match_word(&tmp_data); + if (-1 == retval) + goto error; + if (REGEX_MATCH(tmp_data)) + goto match; + + break; + +match: + data_p += tmp_data.count; + count += tmp_data.count; + regex_p += tmp_data.r_count; + r_count += tmp_data.r_count; + match = 1; + + /* Check that we do not go out of bounds */ + if (((data_p - regex_data->data) > strlen(regex_data->data)) + || ((regex_p - regex_data->regex) > strlen(regex_data->regex))) + goto failed; + } + + /* We could not match the whole regex (data too short?) */ + if (0 != strlen(regex_p)) + goto failed; + + goto exit; + +failed: + /* We will fill in regex_data below */ + count = 0; + r_count = 0; + match = 0; + +exit: + /* Fill in our structure */ + /* We can still have a match ('*' and '?'), although count == 0 */ + if ((0 == count) && (0 == match)) + regex_data->match = REGEX_NO_MATCH; + else if (strlen(regex_data->data) == count) + regex_data->match = REGEX_FULL_MATCH; + else + regex_data->match = REGEX_PARTIAL_MATCH; + if (regex_data->match != REGEX_NO_MATCH) + regex_data->where = regex_data->data; + else + regex_data->where = NULL; + regex_data->count = count; + regex_data->r_count = r_count; + + return 0; + +error: + regex_data->match = REGEX_NO_MATCH; + + return -1; +} + +int match(regex_data_t *regex_data) +{ + regex_data_t tmp_data; + char *data_p = regex_data->data; + char *regex_p; + char *tmp_buf = NULL; + int from_start = 0; + int to_end = 0; + int retval; + + CHECK_REGEX_DATA_P(regex_data, failed); + + /* We might be modifying regex_p, so make a copy */ + tmp_buf = strndup(regex_data->regex, strlen(regex_data->regex)); + if (NULL == tmp_buf) { + DBG_MSG("Failed to allocate temporary buffer!\n"); + goto error; + } + regex_p = tmp_buf; + + /* Should we only match from the start? */ + if ('^' == regex_p[0]) { + regex_p++; + from_start = 1; + } + + /* Should we match up to the end? */ + if ('$' == regex_p[strlen(regex_p) - 1]) { + regex_p[strlen(regex_p) - 1] = '\0'; + to_end = 1; + } + + do { + FILL_REGEX_DATA(tmp_data, data_p, regex_p); + retval = __match(&tmp_data); + if (-1 == retval) + goto error; + } while ((strlen(data_p++) > 0) + && (!REGEX_MATCH(tmp_data)) + && (0 == from_start)); + + /* Compensate for above extra inc */ + data_p--; + + /* Fill in our structure */ + if (REGEX_MATCH(tmp_data)) { + /* Check if we had an '$' at the end of the regex, and + * verify that we still have a match */ + if ((1 == to_end) && (tmp_data.count != strlen(data_p))) { + goto failed; + } + + if ((data_p == regex_data->data) + && (tmp_data.match == REGEX_FULL_MATCH)) + regex_data->match = REGEX_FULL_MATCH; + else + regex_data->match = REGEX_PARTIAL_MATCH; + regex_data->where = data_p; + regex_data->count = tmp_data.count; + regex_data->r_count = tmp_data.r_count; + if (1 == from_start) + regex_data->r_count++; + if (1 == to_end) + regex_data->r_count++; + } else { +failed: + regex_data->match = REGEX_NO_MATCH; + regex_data->where = NULL; + regex_data->count = 0; + regex_data->r_count = 0; + } + + free(tmp_buf); + + return 0; + +error: + regex_data->match = REGEX_NO_MATCH; + free(tmp_buf); + + return -1; +} + |