| 1 | /*************************************************************************/ |
| 2 | /* */ |
| 3 | /* Language Technologies Institute */ |
| 4 | /* Carnegie Mellon University */ |
| 5 | /* Copyright (c) 1999 */ |
| 6 | /* All Rights Reserved. */ |
| 7 | /* */ |
| 8 | /* Permission is hereby granted, free of charge, to use and distribute */ |
| 9 | /* this software and its documentation without restriction, including */ |
| 10 | /* without limitation the rights to use, copy, modify, merge, publish, */ |
| 11 | /* distribute, sublicense, and/or sell copies of this work, and to */ |
| 12 | /* permit persons to whom this work is furnished to do so, subject to */ |
| 13 | /* the following conditions: */ |
| 14 | /* 1. The code must retain the above copyright notice, this list of */ |
| 15 | /* conditions and the following disclaimer. */ |
| 16 | /* 2. Any modifications must be clearly marked as such. */ |
| 17 | /* 3. Original authors' names are not deleted. */ |
| 18 | /* 4. The authors' names are not used to endorse or promote products */ |
| 19 | /* derived from this software without specific prior written */ |
| 20 | /* permission. */ |
| 21 | /* */ |
| 22 | /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */ |
| 23 | /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ |
| 24 | /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ |
| 25 | /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */ |
| 26 | /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ |
| 27 | /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ |
| 28 | /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ |
| 29 | /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ |
| 30 | /* THIS SOFTWARE. */ |
| 31 | /* */ |
| 32 | /*************************************************************************/ |
| 33 | /* Author: Alan W Black (awb@cs.cmu.edu) */ |
| 34 | /* Date: January 2000 */ |
| 35 | /*************************************************************************/ |
| 36 | /* */ |
| 37 | /* cst front-end to Henry Spencer's regex code */ |
| 38 | /* */ |
| 39 | /*************************************************************************/ |
| 40 | |
/* Includes portions of regexp.h, copyright follows: */
| 42 | /* |
| 43 | * Copyright (c) 1986 by University of Toronto. |
| 44 | * Copyright (c) 1989, 1993 |
| 45 | * The Regents of the University of California. All rights reserved. |
| 46 | * |
| 47 | * This code is derived from software contributed to Berkeley |
| 48 | * by Henry Spencer. |
| 49 | * |
| 50 | * Redistribution and use in source and binary forms, with or without |
| 51 | * modification, are permitted provided that the following conditions |
| 52 | * are met: |
| 53 | * 1. Redistributions of source code must retain the above copyright |
| 54 | * notice, this list of conditions and the following disclaimer. |
| 55 | * 2. Redistributions in binary form must reproduce the above copyright |
| 56 | * notice, this list of conditions and the following disclaimer in the |
| 57 | * documentation and/or other materials provided with the distribution. |
| 58 | * 3. All advertising materials mentioning features or use of this software |
| 59 | * must display the following acknowledgement: |
| 60 | * This product includes software developed by the University of |
| 61 | * California, Berkeley and its contributors. |
| 62 | * 4. Neither the name of the University nor the names of its contributors |
| 63 | * may be used to endorse or promote products derived from this software |
| 64 | * without specific prior written permission. |
| 65 | * |
| 66 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 67 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 68 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 69 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 70 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 71 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 72 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 73 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 74 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 75 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 76 | * SUCH DAMAGE. |
| 77 | * |
| 78 | * @(#)regexp.h 8.1 (Berkeley) 6/2/93 |
| 79 | */ |
| 80 | |
| 81 | #ifndef _CST_REGEX_H__ |
| 82 | #define _CST_REGEX_H__ |
| 83 | |
| 84 | #include "cst_file.h" |
| 85 | #include "cst_string.h" |
| 86 | |
/*
 * The first byte of the regexp internal "program" is actually this magic
 * number; the start node begins in the second byte.
 *
 * Note the leading zero: this is an octal literal (decimal 156, hex 0x9C).
 */
#define CST_REGMAGIC 0234
| 92 | |
/*
 * A regular expression in its internal ("program") form, as handed out by
 * new_cst_regex() / hs_regcomp() and consumed by the match functions.
 *
 * NOTE(review): the per-field hints below follow the field names used in
 * Henry Spencer's regexp.h, from which this header is derived; they are
 * not established by this header itself -- confirm against the
 * implementation before relying on them.
 */
typedef struct cst_regex_struct {
    char regstart;  /* Internal use only. Presumably the char a match must start with -- TODO confirm */
    char reganch;   /* Internal use only. Presumably a flag: match is anchored -- TODO confirm */
    char *regmust;  /* Internal use only. Presumably a substring every match must contain -- TODO confirm */
    int regmlen;    /* Internal use only. Presumably the length of regmust -- TODO confirm */
    int regsize;    /* Presumably the size of program in bytes -- TODO confirm */
    char *program;  /* Internal program; per the CST_REGMAGIC comment its first
                       byte is the magic number and the start node follows. */
} cst_regex;
| 101 | |
/* Number of slots in the startp[] / endp[] arrays of cst_regstate. */
#define CST_NSUBEXP 10

/*
 * Match state: the type returned by cst_regex_match_return() and
 * hs_regexec(), and read by cst_regsub().
 *
 * NOTE(review): field meanings below are inferred from the names used in
 * Henry Spencer's regexp code -- confirm against the implementation.
 */
typedef struct cst_regstate_struct {
    const char *startp[CST_NSUBEXP]; /* presumably start of each (sub)match -- TODO confirm */
    const char *endp[CST_NSUBEXP];   /* presumably end of each (sub)match -- TODO confirm */
    const char *input;               /* presumably the matcher's current input position -- TODO confirm */
    const char *bol;                 /* presumably the beginning of the input string -- TODO confirm */
} cst_regstate;
| 109 | |
/* Build a cst_regex from the pattern string str; the result is what the
   match functions in this header take as their first argument (see the
   usage example in the cst_regsub comment).
   NOTE(review): behavior on an invalid pattern is not visible from this
   header -- confirm in the implementation. */
cst_regex *new_cst_regex(const char *str);
/* Dispose of the regex r.
   NOTE(review): presumably the counterpart of new_cst_regex() -- confirm. */
void delete_cst_regex(cst_regex *r);
| 112 | |
/* Test the string str against the regex r.
   NOTE(review): presumably returns non-zero on a match and 0 otherwise --
   confirm in the implementation. */
int cst_regex_match(const cst_regex *r, const char *str);
/* Match str against r and hand back a cst_regstate for the match.  The
   usage example in the cst_regsub comment treats a NULL return as "no
   match" and passes a non-NULL result on to cst_regsub().
   NOTE(review): ownership of the returned state is not stated in this
   header -- confirm who must free it. */
cst_regstate *cst_regex_match_return(const cst_regex *r, const char *str);
| 115 | |
| 116 | /* Internal functions from original HS code */ |
| 117 | cst_regex *hs_regcomp(const char *); |
| 118 | cst_regstate *hs_regexec(const cst_regex *, const char *); |
| 119 | void hs_regdelete(cst_regex *); |
| 120 | |
/* Works similarly to snprintf(3), in that at most max characters are
   written to out, including the trailing NUL, and the return value is
   the number of characters written, *excluding* the trailing NUL.
   Also works similarly to wcstombs(3) in that passing NULL as out
   will count the number of characters that would be written without
   doing any actual conversion, and ignoring max.  So, you could use
   it like this:

   rx = new_cst_regex("\\(.*\\)_\\(.*\\)");
   if ((rs = cst_regex_match_return(rx, "foo_bar")) != NULL) {
       size_t n;

       n = cst_regsub(rs, "\\1_\\2_quux", NULL, 0) + 1;
       out = cst_alloc(char, n);
       cst_regsub(rs, "\\1_\\2_quux", out, n);
   }

   Parameters:
     r    match state, e.g. the non-NULL result of cst_regex_match_return()
     in   substitution template (the example uses "\\1" and "\\2" in it;
          presumably these stand for the parenthesized sub-matches --
          confirm in the implementation)
     out  destination buffer, or NULL to only count
     max  capacity of out, including the trailing NUL; ignored when out
          is NULL

   Returns the number of characters written (or that would be written
   when out is NULL), excluding the trailing NUL. */
size_t cst_regsub(const cst_regstate *r, const char *in, char *out, size_t max);
| 138 | |
/* Initialize the regex engine and global regex constants.
   Takes no arguments and returns nothing; the explicit (void) parameter
   list makes this a real prototype, so a call that passes arguments is
   rejected at compile time. */
void cst_regex_init(void);
| 141 | |
/* Regexps used in text processing (these are latin-alphabet specific
   and to some extent US English-specific).
   Each is declared here as a const pointer to a const cst_regex; the
   definitions live outside this header.
   NOTE(review): the pattern behind each name is not visible here -- see
   the defining source file for the exact expressions. */
extern const cst_regex * const cst_rx_white;
extern const cst_regex * const cst_rx_alpha;
extern const cst_regex * const cst_rx_uppercase;
extern const cst_regex * const cst_rx_lowercase;
extern const cst_regex * const cst_rx_alphanum;
extern const cst_regex * const cst_rx_identifier;
extern const cst_regex * const cst_rx_int;
extern const cst_regex * const cst_rx_double;
extern const cst_regex * const cst_rx_commaint;
extern const cst_regex * const cst_rx_digits;
extern const cst_regex * const cst_rx_dotted_abbrev;
| 155 | |
/* Table of regexps used in CART trees (only one so far).
   Declared with unspecified length; the definition lives outside this
   header. */
extern const cst_regex * const cst_regex_table[];
/* Index into cst_regex_table.
   NOTE(review): presumably the slot holding cst_rx_dotted_abbrev --
   confirm against the table definition. */
#define CST_RX_dotted_abbrev_NUM 0
| 159 | |
| 160 | #endif |
| 161 | |