1 | /*************************************************************************/ |
2 | /* */ |
3 | /* Language Technologies Institute */ |
4 | /* Carnegie Mellon University */ |
5 | /* Copyright (c) 1999 */ |
6 | /* All Rights Reserved. */ |
7 | /* */ |
8 | /* Permission is hereby granted, free of charge, to use and distribute */ |
9 | /* this software and its documentation without restriction, including */ |
10 | /* without limitation the rights to use, copy, modify, merge, publish, */ |
11 | /* distribute, sublicense, and/or sell copies of this work, and to */ |
12 | /* permit persons to whom this work is furnished to do so, subject to */ |
13 | /* the following conditions: */ |
14 | /* 1. The code must retain the above copyright notice, this list of */ |
15 | /* conditions and the following disclaimer. */ |
16 | /* 2. Any modifications must be clearly marked as such. */ |
17 | /* 3. Original authors' names are not deleted. */ |
18 | /* 4. The authors' names are not used to endorse or promote products */ |
19 | /* derived from this software without specific prior written */ |
20 | /* permission. */ |
21 | /* */ |
22 | /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */ |
23 | /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ |
24 | /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ |
25 | /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */ |
26 | /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ |
27 | /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ |
28 | /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ |
29 | /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ |
30 | /* THIS SOFTWARE. */ |
31 | /* */ |
32 | /*************************************************************************/ |
33 | /* Author: Alan W Black (awb@cs.cmu.edu) */ |
34 | /* Date: January 2000 */ |
35 | /*************************************************************************/ |
36 | /* */ |
37 | /* cst front-end to Henry Spencer's regex code */ |
38 | /* */ |
39 | /*************************************************************************/ |
40 | |
41 | /* Includes portions of regexp.h, copyright follows: */ |
42 | /* |
43 | * Copyright (c) 1986 by University of Toronto. |
44 | * Copyright (c) 1989, 1993 |
45 | * The Regents of the University of California. All rights reserved. |
46 | * |
47 | * This code is derived from software contributed to Berkeley |
48 | * by Henry Spencer. |
49 | * |
50 | * Redistribution and use in source and binary forms, with or without |
51 | * modification, are permitted provided that the following conditions |
52 | * are met: |
53 | * 1. Redistributions of source code must retain the above copyright |
54 | * notice, this list of conditions and the following disclaimer. |
55 | * 2. Redistributions in binary form must reproduce the above copyright |
56 | * notice, this list of conditions and the following disclaimer in the |
57 | * documentation and/or other materials provided with the distribution. |
58 | * 3. All advertising materials mentioning features or use of this software |
59 | * must display the following acknowledgement: |
60 | * This product includes software developed by the University of |
61 | * California, Berkeley and its contributors. |
62 | * 4. Neither the name of the University nor the names of its contributors |
63 | * may be used to endorse or promote products derived from this software |
64 | * without specific prior written permission. |
65 | * |
66 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
67 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
68 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
69 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
70 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
71 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
72 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
73 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
74 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
75 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
76 | * SUCH DAMAGE. |
77 | * |
78 | * @(#)regexp.h 8.1 (Berkeley) 6/2/93 |
79 | */ |
80 | |
81 | #ifndef _CST_REGEX_H__ |
82 | #define _CST_REGEX_H__ |
83 | |
84 | #include "cst_file.h" |
85 | #include "cst_string.h" |
86 | |
/*
 * The first byte of the regexp internal "program" is actually this magic
 * number; the start node begins in the second byte.
 *
 * Note the leading zero: the literal is octal (0234 == 156 == 0x9C).
 */
#define CST_REGMAGIC 0234
92 | |
/*
 * A compiled regular expression, as returned by new_cst_regex() and
 * hs_regcomp().
 *
 * The first four members keep their original "Internal use only" marking
 * and are not meant to be touched by callers.  Their names follow the
 * regexp.h this header is derived from, so they presumably hold that
 * implementation's matcher hints -- TODO confirm against the regex code.
 */
typedef struct cst_regex_struct {
    char regstart;      /* Internal use only. */
    char reganch;       /* Internal use only. */
    char *regmust;      /* Internal use only. */
    int regmlen;        /* Internal use only. */
    int regsize;        /* presumably the size of program -- TODO confirm */
    char *program;      /* internal "program"; see the note on CST_REGMAGIC
                           about its first byte */
} cst_regex;
101 | |
/* Number of slots in the startp[] and endp[] arrays of a cst_regstate. */
#define CST_NSUBEXP 10

/*
 * Match state: the type returned by cst_regex_match_return() and
 * hs_regexec(), and the type cst_regsub() reads its input from.
 */
typedef struct cst_regstate_struct {
    /* NOTE(review): startp[i]/endp[i] look like the bounds of captured
       span i within the matched string -- confirm in the matcher. */
    const char *startp[CST_NSUBEXP];
    const char *endp[CST_NSUBEXP];
    const char *input;  /* NOTE(review): presumably the matcher's current
                           input position -- confirm */
    const char *bol;    /* NOTE(review): presumably the beginning of the
                           string being matched -- confirm */
} cst_regstate;
109 | |
/* Build a cst_regex from the pattern text in str.  The usage example
   given with cst_regsub() in this header writes groups as \( ... \).
   NOTE(review): failure behaviour is not visible from this header. */
cst_regex *new_cst_regex(const char *str);
/* Counterpart of new_cst_regex(): takes the regex r to dispose of. */
void delete_cst_regex(cst_regex *r);
112 | |
/* Test str against the compiled regex r.
   NOTE(review): presumably returns non-zero on a match and zero
   otherwise -- confirm in the implementation. */
int cst_regex_match(const cst_regex *r, const char *str);
/* Match str against r and hand back the match state.  The usage example
   given with cst_regsub() treats a NULL result as "no match".
   NOTE(review): how the returned state is released is not visible from
   this header -- confirm in the implementation. */
cst_regstate *cst_regex_match_return(const cst_regex *r, const char *str);
115 | |
116 | /* Internal functions from original HS code */ |
117 | cst_regex *hs_regcomp(const char *); |
118 | cst_regstate *hs_regexec(const cst_regex *, const char *); |
119 | void hs_regdelete(cst_regex *); |
120 | |
/* Expand the template in using the match state r and store the result
   in out.

   Works similarly to snprintf(3), in that at most max characters are
   written to out, including the trailing NUL, and the return value is
   the number of characters written, *excluding* the trailing NUL.

   Also works similarly to wcstombs(3) in that passing NULL as out
   will count the number of characters that would be written without
   doing any actual conversion, and ignoring max.  So, you could use
   it like this (a first call to size the buffer, a second to fill it):

   rx = new_cst_regex("\\(.*\\)_\\(.*\\)");
   if ((rs = cst_regex_match_return(rx, "foo_bar")) != NULL) {
       size_t n;

       n = cst_regsub(rs, "\\1_\\2_quux", NULL, 0) + 1;
       out = cst_alloc(char, n);
       cst_regsub(rs, "\\1_\\2_quux", out, n);
   } */
size_t cst_regsub(const cst_regstate *r, const char *in, char *out, size_t max);
138 | |
/* Initialize the regex engine and global regex constants.
   Takes no arguments and returns nothing.  The parameter list is spelled
   (void) so that this is a true prototype: before C23, empty parentheses
   leave the parameters unspecified, and a call passing stray arguments
   would not be diagnosed. */
void cst_regex_init(void);
141 | |
/* Regexps used in text processing (these are latin-alphabet specific
   and to some extent US English-specific).
   Each is a constant pointer to a constant cst_regex; only declared
   here, the definitions live in another translation unit. */
extern const cst_regex * const cst_rx_white;
extern const cst_regex * const cst_rx_alpha;
extern const cst_regex * const cst_rx_uppercase;
extern const cst_regex * const cst_rx_lowercase;
extern const cst_regex * const cst_rx_alphanum;
extern const cst_regex * const cst_rx_identifier;
extern const cst_regex * const cst_rx_int;
extern const cst_regex * const cst_rx_double;
extern const cst_regex * const cst_rx_commaint;
extern const cst_regex * const cst_rx_digits;
extern const cst_regex * const cst_rx_dotted_abbrev;
155 | |
/* Table of regexps used in CART trees (only one so far).
   The array length is not given here; it is fixed by the definition
   elsewhere. */
extern const cst_regex * const cst_regex_table[];
/* NOTE(review): presumably the index of cst_rx_dotted_abbrev within
   cst_regex_table -- confirm against the table's definition. */
#define CST_RX_dotted_abbrev_NUM 0
159 | |
160 | #endif |
161 | |