1/*************************************************************************/
2/* */
3/* Language Technologies Institute */
4/* Carnegie Mellon University */
5/* Copyright (c) 1999 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author: Alan W Black (awb@cs.cmu.edu) */
34/* Date: January 2000 */
35/*************************************************************************/
36/* */
37/* cst front-end to Henry Spencer's regex code */
38/* */
39/*************************************************************************/
40
/* Includes portions of regexp.h, copyright follows: */
42/*
43 * Copyright (c) 1986 by University of Toronto.
44 * Copyright (c) 1989, 1993
45 * The Regents of the University of California. All rights reserved.
46 *
47 * This code is derived from software contributed to Berkeley
48 * by Henry Spencer.
49 *
50 * Redistribution and use in source and binary forms, with or without
51 * modification, are permitted provided that the following conditions
52 * are met:
53 * 1. Redistributions of source code must retain the above copyright
54 * notice, this list of conditions and the following disclaimer.
55 * 2. Redistributions in binary form must reproduce the above copyright
56 * notice, this list of conditions and the following disclaimer in the
57 * documentation and/or other materials provided with the distribution.
58 * 3. All advertising materials mentioning features or use of this software
59 * must display the following acknowledgement:
60 * This product includes software developed by the University of
61 * California, Berkeley and its contributors.
62 * 4. Neither the name of the University nor the names of its contributors
63 * may be used to endorse or promote products derived from this software
64 * without specific prior written permission.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 *
78 * @(#)regexp.h 8.1 (Berkeley) 6/2/93
79 */
80
81#ifndef _CST_REGEX_H__
82#define _CST_REGEX_H__
83
84#include "cst_file.h"
85#include "cst_string.h"
86
/*
 * Magic number kept in the very first byte of the regexp internal
 * "program"; the start node itself begins at the second byte.
 * The value is written in octal (0234 == 156 decimal).
 */
#define CST_REGMAGIC 0234
92
/*
 * A compiled regular expression.
 * Fields flagged "Internal use only" belong to the regex engine and are
 * not meant to be read or written by callers.
 */
typedef struct cst_regex_struct {
    char  regstart;   /* Internal use only. */
    char  reganch;    /* Internal use only. */
    char *regmust;    /* Internal use only. */
    int   regmlen;    /* Internal use only. */
    int   regsize;    /* NOTE(review): presumably the size of program -- confirm */
    char *program;    /* NOTE(review): presumably the internal "program" whose
                         first byte is CST_REGMAGIC -- confirm */
} cst_regex;
101
/* Number of slots in each of the startp/endp arrays of a match state. */
#define CST_NSUBEXP 10

/*
 * State produced by a regex match.
 * NOTE(review): startp[i]/endp[i] presumably delimit the text matched by
 * the i-th subexpression, with input and bol being engine bookkeeping
 * pointers -- confirm against the matcher implementation.
 */
typedef struct cst_regstate_struct {
    const char *startp[CST_NSUBEXP];
    const char *endp[CST_NSUBEXP];
    const char *input;
    const char *bol;
} cst_regstate;
109
110cst_regex *new_cst_regex(const char *str);
111void delete_cst_regex(cst_regex *r);
112
113int cst_regex_match(const cst_regex *r, const char *str);
114cst_regstate *cst_regex_match_return(const cst_regex *r, const char *str);
115
116/* Internal functions from original HS code */
117cst_regex *hs_regcomp(const char *);
118cst_regstate *hs_regexec(const cst_regex *, const char *);
119void hs_regdelete(cst_regex *);
120
121/* Works similarly to snprintf(3), in that at most max characters are
122 written to out, including the trailing NUL, and the return value is
123 the number of characters written, *excluding* the trailing NUL.
124 Also works similarly to wcstombs(3) in that passing NULL as out
125 will count the number of characters that would be written without
126 doing any actual conversion, and ignoring max. So, you could use
127 it like this:
128
129 rx = new_cst_regex("\\(.*\\)_\\(.*\\)");
130 if ((rs = cst_regex_match_return(rx, "foo_bar")) != NULL) {
131 size_t n;
132
133 n = cst_regsub(rs, "\\1_\\2_quux", NULL, 0) + 1;
134 out = cst_alloc(char, n);
135 cst_regsub(rs, "\\1_\\2_quux", out, n);
136 } */
137size_t cst_regsub(const cst_regstate *r, const char *in, char *out, size_t max);
138
/* Initialize the regex engine and global regex constants.
   Takes no arguments; declared with (void) so that this is a real
   prototype and accidental arguments are rejected at compile time
   (an empty parameter list leaves the parameters unspecified before C23). */
void cst_regex_init(void);
141
142/* Regexps used in text processing (these are latin-alphabet specific
143 and to some extent US English-specific) */
144extern const cst_regex * const cst_rx_white;
145extern const cst_regex * const cst_rx_alpha;
146extern const cst_regex * const cst_rx_uppercase;
147extern const cst_regex * const cst_rx_lowercase;
148extern const cst_regex * const cst_rx_alphanum;
149extern const cst_regex * const cst_rx_identifier;
150extern const cst_regex * const cst_rx_int;
151extern const cst_regex * const cst_rx_double;
152extern const cst_regex * const cst_rx_commaint;
153extern const cst_regex * const cst_rx_digits;
154extern const cst_regex * const cst_rx_dotted_abbrev;
155
156/* Table of regexps used in CART trees (only one so far) */
157extern const cst_regex * const cst_regex_table[];
158#define CST_RX_dotted_abbrev_NUM 0
159
160#endif
161

source code of include/flite/cst_regex.h