| 1 | /*************************************************************************/ | 
| 2 | /*                                                                       */ | 
| 3 | /*                  Language Technologies Institute                      */ | 
| 4 | /*                     Carnegie Mellon University                        */ | 
| 5 | /*                        Copyright (c) 1999                             */ | 
| 6 | /*                        All Rights Reserved.                           */ | 
| 7 | /*                                                                       */ | 
| 8 | /*  Permission is hereby granted, free of charge, to use and distribute  */ | 
| 9 | /*  this software and its documentation without restriction, including   */ | 
| 10 | /*  without limitation the rights to use, copy, modify, merge, publish,  */ | 
| 11 | /*  distribute, sublicense, and/or sell copies of this work, and to      */ | 
| 12 | /*  permit persons to whom this work is furnished to do so, subject to   */ | 
| 13 | /*  the following conditions:                                            */ | 
| 14 | /*   1. The code must retain the above copyright notice, this list of    */ | 
| 15 | /*      conditions and the following disclaimer.                         */ | 
| 16 | /*   2. Any modifications must be clearly marked as such.                */ | 
| 17 | /*   3. Original authors' names are not deleted.                         */ | 
| 18 | /*   4. The authors' names are not used to endorse or promote products   */ | 
| 19 | /*      derived from this software without specific prior written        */ | 
| 20 | /*      permission.                                                      */ | 
| 21 | /*                                                                       */ | 
| 22 | /*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */ | 
| 23 | /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */ | 
| 24 | /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */ | 
| 25 | /*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */ | 
| 26 | /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */ | 
| 27 | /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */ | 
| 28 | /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */ | 
| 29 | /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */ | 
| 30 | /*  THIS SOFTWARE.                                                       */ | 
| 31 | /*                                                                       */ | 
| 32 | /*************************************************************************/ | 
| 33 | /*             Author:  Alan W Black (awb@cs.cmu.edu)                    */ | 
| 34 | /*               Date:  January 2000                                     */ | 
| 35 | /*************************************************************************/ | 
| 36 | /*                                                                       */ | 
| 37 | /*  cst front-end to Henry Spencer's regex code                          */ | 
| 38 | /*                                                                       */ | 
| 39 | /*************************************************************************/ | 
| 40 |  | 
/* Includes portions of regexp.h, copyright follows: */
| 42 | /* | 
| 43 |  * Copyright (c) 1986 by University of Toronto. | 
| 44 |  * Copyright (c) 1989, 1993 | 
| 45 |  *	The Regents of the University of California.  All rights reserved. | 
| 46 |  * | 
| 47 |  * This code is derived from software contributed to Berkeley | 
| 48 |  * by Henry Spencer. | 
| 49 |  * | 
| 50 |  * Redistribution and use in source and binary forms, with or without | 
| 51 |  * modification, are permitted provided that the following conditions | 
| 52 |  * are met: | 
| 53 |  * 1. Redistributions of source code must retain the above copyright | 
| 54 |  *    notice, this list of conditions and the following disclaimer. | 
| 55 |  * 2. Redistributions in binary form must reproduce the above copyright | 
| 56 |  *    notice, this list of conditions and the following disclaimer in the | 
| 57 |  *    documentation and/or other materials provided with the distribution. | 
| 58 |  * 3. All advertising materials mentioning features or use of this software | 
| 59 |  *    must display the following acknowledgement: | 
| 60 |  *	This product includes software developed by the University of | 
| 61 |  *	California, Berkeley and its contributors. | 
| 62 |  * 4. Neither the name of the University nor the names of its contributors | 
| 63 |  *    may be used to endorse or promote products derived from this software | 
| 64 |  *    without specific prior written permission. | 
| 65 |  * | 
| 66 |  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | 
| 67 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 68 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
| 69 |  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | 
| 70 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | 
| 71 |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | 
| 72 |  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | 
| 73 |  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | 
| 74 |  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | 
| 75 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | 
| 76 |  * SUCH DAMAGE. | 
| 77 |  * | 
| 78 |  *	@(#)regexp.h	8.1 (Berkeley) 6/2/93 | 
| 79 |  */ | 
| 80 |  | 
| 81 | #ifndef _CST_REGEX_H__ | 
| 82 | #define _CST_REGEX_H__ | 
| 83 |  | 
| 84 | #include "cst_file.h" | 
| 85 | #include "cst_string.h" | 
| 86 |  | 
/*
 * The first byte of the regexp internal "program" is actually this magic
 * number; the start node begins in the second byte.
 *
 * Note that the literal has a leading zero and is therefore octal:
 * 0234 == 156 decimal == 0x9C.
 */
#define	CST_REGMAGIC	0234
| 92 |  | 
/*
 * Regular expression object, as handed out by new_cst_regex() and
 * hs_regcomp().  Members flagged "Internal use only." belong to the
 * regex engine and are not meant to be touched by callers.
 */
struct cst_regex_struct {
    char regstart;      /* Internal use only. */
    char reganch;       /* Internal use only. */
    char *regmust;      /* Internal use only. */
    int regmlen;        /* Internal use only. */
    int regsize;        /* presumably the size of program -- TODO confirm */
    char *program;      /* regexp internal "program"; per the CST_REGMAGIC
                           note its first byte is the magic number */
};

typedef struct cst_regex_struct cst_regex;
| 101 |  | 
/* Number of slots in the startp[] and endp[] arrays of a cst_regstate. */
#define CST_NSUBEXP  10

/*
 * State object returned by cst_regex_match_return() and hs_regexec().
 * All members point into constant character data.
 * NOTE(review): startp[i]/endp[i] presumably delimit the text matched by
 * sub-expression i, with input and bol being engine bookkeeping -- confirm
 * against the regex implementation.
 */
struct cst_regstate_struct {
	const char *startp[CST_NSUBEXP];
	const char *endp[CST_NSUBEXP];
	const char *input;
	const char *bol;
};

typedef struct cst_regstate_struct cst_regstate;
| 109 |  | 
/* Create a regex object from the pattern string str.  The returned pointer
   is what the matching functions in this header take as their r argument.
   NOTE(review): what is returned for an invalid pattern is not visible in
   this header -- confirm against the implementation. */
cst_regex *new_cst_regex(const char *str);
/* Dispose of the regex object r.
   NOTE(review): presumably the counterpart of new_cst_regex(); confirm
   whether passing NULL is allowed. */
void delete_cst_regex(cst_regex *r);
| 112 |  | 
/* Test the string str against the regex r.
   NOTE(review): the int result is presumably non-zero on a match and zero
   otherwise -- confirm against the implementation. */
int cst_regex_match(const cst_regex *r, const char *str);
/* Match str against r and hand back a cst_regstate pointer.  The usage
   example given for cst_regsub() treats a non-NULL result as a successful
   match and passes that result on to cst_regsub().
   NOTE(review): who releases the returned state is not visible in this
   header -- confirm. */
cst_regstate *cst_regex_match_return(const cst_regex *r, const char *str);
| 115 |  | 
| 116 | /* Internal functions from original HS code */ | 
| 117 | cst_regex *hs_regcomp(const char *); | 
| 118 | cst_regstate *hs_regexec(const cst_regex *, const char *); | 
| 119 | void hs_regdelete(cst_regex *); | 
| 120 |  | 
/* Works similarly to snprintf(3): at most max characters are written to
   out, including the trailing NUL, and the return value is the number of
   characters written, *excluding* the trailing NUL.

   Also works similarly to wcstombs(3): passing NULL as out counts the
   number of characters that would be written, without doing any actual
   conversion and ignoring max.

   The two behaviours combine into a "measure, allocate, fill" sequence,
   so you could use it like this:

   rx = new_cst_regex("\\(.*\\)_\\(.*\\)");
   if ((rs = cst_regex_match_return(rx, "foo_bar")) != NULL) {
       size_t n;

       n = cst_regsub(rs, "\\1_\\2_quux", NULL, 0) + 1;
       out = cst_alloc(char, n);
       cst_regsub(rs, "\\1_\\2_quux", out, n);
   }

   NOTE(review): judging by the example, in is a template whose \1, \2
   style references are presumably replaced by the corresponding
   sub-matches recorded in r -- confirm against the implementation. */
size_t cst_regsub(const cst_regstate *r, const char *in, char *out, size_t max);
| 138 |  | 
/* Initialize the regex engine and global regex constants.

   Takes no arguments and returns nothing.  The parameter list is spelled
   (void) so that this is a true prototype: before C23 an empty "()" leaves
   the parameters unspecified, which lets calls with stray arguments compile
   without any diagnostic. */
void cst_regex_init(void);
| 141 |  | 
/* Regexps used in text processing (these are latin-alphabet specific
   and to some extent US English-specific).

   Each name is declared as a const pointer to a const cst_regex, so
   neither the pointer nor the regex it refers to may be modified through
   these names.  They are only declared here; the definitions live outside
   this header.
   NOTE(review): presumably these are the "global regex constants" set up
   by cst_regex_init() -- confirm before using them ahead of that call. */
extern const cst_regex * const cst_rx_white;
extern const cst_regex * const cst_rx_alpha;
extern const cst_regex * const cst_rx_uppercase;
extern const cst_regex * const cst_rx_lowercase;
extern const cst_regex * const cst_rx_alphanum;
extern const cst_regex * const cst_rx_identifier;
extern const cst_regex * const cst_rx_int;
extern const cst_regex * const cst_rx_double;
extern const cst_regex * const cst_rx_commaint;
extern const cst_regex * const cst_rx_digits;
extern const cst_regex * const cst_rx_dotted_abbrev;
/* Table of regexps used in CART trees (only one so far).  The array is
   declared with unspecified length; its definition lives outside this
   header.
   NOTE(review): CST_RX_dotted_abbrev_NUM is presumably the index of
   cst_rx_dotted_abbrev within cst_regex_table -- confirm against the
   table's definition. */
extern const cst_regex * const cst_regex_table[];
#define CST_RX_dotted_abbrev_NUM 0
| 159 |  | 
| 160 | #endif | 
| 161 |  |