/* cst_tokenstream.h source code [include/flite/cst_tokenstream.h] */

/*************************************************************************/
/*                                                                       */
/*                    Language Technologies Institute                    */
/*                      Carnegie Mellon University                       */
/*                          Copyright (c) 1999                           */
/*                         All Rights Reserved.                          */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*  Author: Alan W Black (awb@cs.cmu.edu)                                */
/*  Date: July 1999                                                      */
/*************************************************************************/
/*                                                                       */
/*  Tokenizer for strings and files                                      */
/*                                                                       */
/*************************************************************************/
#ifndef _CST_TOKENSTREAM_H__
#define _CST_TOKENSTREAM_H__

#include "cst_alloc.h"
#include "cst_string.h"
#include "cst_file.h"
#include "cst_features.h"
47
48	typedef struct cst_tokenstream_struct {
49	cst_file fd;
50	int file_pos;
51	int line_number;
52	int eof_flag;
53	cst_string *string_buffer;
54
55	int current_char;
56
57	int token_pos;
58	int ws_max;
59	cst_string *whitespace;
60	int prep_max;
61	cst_string *prepunctuation;
62	int token_max;
63	cst_string *token;
64	int postp_max;
65	cst_string *postpunctuation;
66
67	cst_features tags; /* e.g xml tags /
68	/ if set will find token boundaries at every utf8 character /
69	int utf8_explode_mode;
70
71	void *streamtype_data;
72
73	/ Should only be set through set_charclasses as charclass table needs /
74	/ to be updated when you reset these /
75	const cst_string *p_whitespacesymbols;
76	const cst_string *p_singlecharsymbols;
77	const cst_string *p_prepunctuationsymbols;
78	const cst_string *p_postpunctuationsymbols;
79
80	cst_string charclass[`256`];
81
82	/ To allow externally specified reading functions e.g. epub/xml /
83	int (open)(struct* cst_tokenstream_struct ts, const* char *filename);
84	void (close)(struct* cst_tokenstream_struct *ts);
85	int (eof)(struct* cst_tokenstream_struct *ts);
86	int (seek)(struct* cst_tokenstream_struct ts, int* pos);
87	int (tell)(struct* cst_tokenstream_struct *ts);
88	int (size)(struct* cst_tokenstream_struct *ts);
89	int (getc)(struct* cst_tokenstream_struct *ts);
90	} cst_tokenstream;
91
/* Character class bits stored in the charclass[] table of a tokenstream. */
/* Each non-zero class is a distinct bit so classes can be OR-ed together. */
#define TS_CHARCLASS_NONE        0
#define TS_CHARCLASS_WHITESPACE  2
#define TS_CHARCLASS_SINGLECHAR  4
#define TS_CHARCLASS_PREPUNCT    8
#define TS_CHARCLASS_POSTPUNCT  16
#define TS_CHARCLASS_QUOTE      32

/* Non-zero when character C has any of the bits in CLASS set in TS's     */
/* charclass table.  C is reduced to unsigned char before indexing so a   */
/* negative char cannot index below the table.  C and CLASS are           */
/* parenthesised so expression arguments (e.g. an OR of several classes)  */
/* are taken as a whole.                                                   */
#define ts_charclass(C,CLASS,TS) ((TS)->charclass[(unsigned char)(C)] & (CLASS))
100
101	extern const cst_string * const cst_ts_default_whitespacesymbols;
102	extern const cst_string * const cst_ts_default_prepunctuationsymbols;
103	extern const cst_string * const cst_ts_default_postpunctuationsymbols;
104	extern const cst_string * const cst_ts_default_singlecharsymbols;
105
106	/ Public functions for tokenstream manipulation /
107	cst_tokenstream ts_open(const* char *filename,
108	const cst_string *whitespacesymbols,
109	const cst_string *singlecharsymbols,
110	const cst_string *prepunctsymbols,
111	const cst_string *postpunctsymbols);
112	cst_tokenstream ts_open_string(const* cst_string *string,
113	const cst_string *whitespacesymbols,
114	const cst_string *singlecharsymbols,
115	const cst_string *prepunctsymbols,
116	const cst_string *postpunctsymbols);
117	cst_tokenstream ts_open_generic(const* char *filename,
118	const cst_string *whitespacesymbols,
119	const cst_string *singlecharsymbols,
120	const cst_string *prepunctsymbols,
121	const cst_string *postpunctsymbols,
122	void *streamtype_data,
123	int (open)(cst_tokenstream ts,
124	const char *filename),
125	void (close)(cst_tokenstream ts),
126	int (eof)(cst_tokenstream ts),
127	int (seek)(cst_tokenstream ts, int pos),
128	int (tell)(cst_tokenstream ts),
129	int (size)(cst_tokenstream ts),
130	int (getc)(cst_tokenstream ts));
131	void ts_close(cst_tokenstream *ts);
132
/* Get the expected length of UTF8 sequence given its most significant    */
/* byte.  Only declared here (as __inline on _WIN32); the definition is   */
/* not in this header.  Reference formula kept from the original header:  */
/*     return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1;          */
#ifdef _WIN32
__inline int ts_utf8_sequence_length(char c0);
#else
int ts_utf8_sequence_length(char c0);
#endif
143
144	int ts_eof(cst_tokenstream *ts);
145	const cst_string ts_get(cst_tokenstream ts);
146
147	const cst_string ts_get_quoted_token(cst_tokenstream ts,
148	char quote,
149	char escape);
150	/ Externally specified ts interfaces may need this /
151	cst_string private_ts_getc(cst_tokenstream *ts);
152
153
154	void set_charclasses(cst_tokenstream *ts,
155	const cst_string *whitespace,
156	const cst_string *singlecharsymbols,
157	const cst_string *prepunctuation,
158	const cst_string *postpunctuation);
159
160	int ts_read(void buff, int* size, int num, cst_tokenstream *ts);
161
162	int ts_set_stream_pos(cst_tokenstream ts,int* pos);
163	int ts_get_stream_pos(cst_tokenstream *ts);
164	int ts_get_stream_size(cst_tokenstream *ts);
165
#endif

/* source code of include/flite/cst_tokenstream.h */