1 | /*************************************************************************/ |
2 | /* */ |
3 | /* Language Technologies Institute */ |
4 | /* Carnegie Mellon University */ |
5 | /* Copyright (c) 1999 */ |
6 | /* All Rights Reserved. */ |
7 | /* */ |
8 | /* Permission is hereby granted, free of charge, to use and distribute */ |
9 | /* this software and its documentation without restriction, including */ |
10 | /* without limitation the rights to use, copy, modify, merge, publish, */ |
11 | /* distribute, sublicense, and/or sell copies of this work, and to */ |
12 | /* permit persons to whom this work is furnished to do so, subject to */ |
13 | /* the following conditions: */ |
14 | /* 1. The code must retain the above copyright notice, this list of */ |
15 | /* conditions and the following disclaimer. */ |
16 | /* 2. Any modifications must be clearly marked as such. */ |
17 | /* 3. Original authors' names are not deleted. */ |
18 | /* 4. The authors' names are not used to endorse or promote products */ |
19 | /* derived from this software without specific prior written */ |
20 | /* permission. */ |
21 | /* */ |
22 | /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */ |
23 | /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ |
24 | /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ |
25 | /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */ |
26 | /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ |
27 | /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ |
28 | /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ |
29 | /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ |
30 | /* THIS SOFTWARE. */ |
31 | /* */ |
32 | /*************************************************************************/ |
33 | /* Author: Alan W Black (awb@cs.cmu.edu) */ |
34 | /* Date: July 1999 */ |
35 | /*************************************************************************/ |
36 | /* */ |
37 | /* Tokenizer for strings and files */ |
38 | /* */ |
39 | /*************************************************************************/ |
40 | #ifndef _CST_TOKENSTREAM_H__ |
41 | #define _CST_TOKENSTREAM_H__ |
42 | |
43 | #include "cst_alloc.h" |
44 | #include "cst_string.h" |
45 | #include "cst_file.h" |
46 | #include "cst_features.h" |
47 | |
/*
 * State of one open token stream.  A stream is created by one of the
 * ts_open* functions declared in this header and is passed as the `ts`
 * argument to every other ts_* function.
 */
typedef struct cst_tokenstream_struct {
    /* Input source.  NOTE(review): fd presumably serves file-backed     */
    /* streams and string_buffer string-backed ones -- confirm in the .c */
    cst_file fd;
    int file_pos;      /* presumably current read offset -- confirm       */
    int line_number;   /* presumably current input line -- confirm        */
    int eof_flag;      /* presumably non-zero at end of input -- confirm  */
    cst_string *string_buffer;

    /* Held as int rather than cst_string.  NOTE(review): looks like the */
    /* most recently read character -- confirm                           */
    int current_char;

    /* Parts of a token, each a cst_string buffer paired with an int.    */
    /* NOTE(review): the *_max ints look like buffer capacities and      */
    /* token_pos like the position of the current token -- confirm       */
    int token_pos;
    int ws_max;
    cst_string *whitespace;
    int prep_max;
    cst_string *prepunctuation;
    int token_max;
    cst_string *token;
    int postp_max;
    cst_string *postpunctuation;

    cst_features *tags; /* e.g. xml tags */
    /* if set, token boundaries are found at every utf8 character */
    int utf8_explode_mode;

    /* Opaque pointer; ts_open_generic() takes an argument of the same   */
    /* name.  NOTE(review): presumably private state for the callbacks   */
    /* below -- confirm                                                  */
    void *streamtype_data;

    /* Should only be set through set_charclasses as charclass table needs */
    /* to be updated when you reset these */
    const cst_string *p_whitespacesymbols;
    const cst_string *p_singlecharsymbols;
    const cst_string *p_prepunctuationsymbols;
    const cst_string *p_postpunctuationsymbols;

    /* One entry per unsigned char value; the ts_charclass() macro masks */
    /* an entry with TS_CHARCLASS_* bits                                 */
    cst_string charclass[256];

    /* To allow externally specified reading functions e.g. epub/xml */
    int (*open)(struct cst_tokenstream_struct *ts, const char *filename);
    void (*close)(struct cst_tokenstream_struct *ts);
    int (*eof)(struct cst_tokenstream_struct *ts);
    int (*seek)(struct cst_tokenstream_struct *ts, int pos);
    int (*tell)(struct cst_tokenstream_struct *ts);
    int (*size)(struct cst_tokenstream_struct *ts);
    int (*getc)(struct cst_tokenstream_struct *ts);
} cst_tokenstream;
91 | |
/* Character-class bit masks, combined with a charclass[] entry by the */
/* ts_charclass() macro.  Written in hex to show that each class is a  */
/* single bit; bit 0x01 is not assigned to any class here.             */
#define TS_CHARCLASS_NONE        0x00
#define TS_CHARCLASS_WHITESPACE  0x02
#define TS_CHARCLASS_SINGLECHAR  0x04
#define TS_CHARCLASS_PREPUNCT    0x08
#define TS_CHARCLASS_POSTPUNCT   0x10
#define TS_CHARCLASS_QUOTE       0x20
98 | |
/*
 * ts_charclass(C, CLASS, TS): non-zero when the charclass[] entry of
 * token stream TS for character C has any of the bits in CLASS set.
 *
 * C is converted to unsigned char before indexing so that a negative
 * plain-char value cannot index below the table.  Every argument is
 * parenthesized so that expression arguments behave as written: the
 * cast covers the whole of C (e.g. `c + 1`), and an OR-ed mask such as
 * `A | B` is masked as a unit rather than parsed as `(x & A) | B`.
 */
#define ts_charclass(C,CLASS,TS) ((TS)->charclass[(unsigned char)(C)] & (CLASS))
100 | |
/* Default symbol sets, one per character class; declared here and      */
/* defined in another translation unit.  Both the pointers and the      */
/* strings they point to are const.  NOTE(review): presumably meant to  */
/* be passed as the symbol arguments of ts_open*() / set_charclasses()  */
/* -- confirm.                                                          */
extern const cst_string * const cst_ts_default_whitespacesymbols;
extern const cst_string * const cst_ts_default_prepunctuationsymbols;
extern const cst_string * const cst_ts_default_postpunctuationsymbols;
extern const cst_string * const cst_ts_default_singlecharsymbols;
105 | |
106 | /* Public functions for tokenstream manipulation */ |
/*
 * Open a token stream on the file named `filename`.
 * The four symbol-set arguments name the characters of each class, in
 * the same order as the arguments of set_charclasses().
 * Returns a pointer to the stream.  NOTE(review): the failure result
 * (presumably NULL) is not visible from this header -- confirm.
 */
cst_tokenstream *ts_open(const char *filename,
                         const cst_string *whitespacesymbols,
                         const cst_string *singlecharsymbols,
                         const cst_string *prepunctsymbols,
                         const cst_string *postpunctsymbols);
/*
 * Open a token stream that tokenizes the in-memory text `string`
 * instead of a file.  The symbol-set arguments are as for ts_open().
 * NOTE(review): whether `string` is copied or must outlive the stream
 * is not visible from this header -- confirm.
 */
cst_tokenstream *ts_open_string(const cst_string *string,
                                const cst_string *whitespacesymbols,
                                const cst_string *singlecharsymbols,
                                const cst_string *prepunctsymbols,
                                const cst_string *postpunctsymbols);
/*
 * Open a token stream whose input is read through caller-supplied
 * functions (the struct comment mentions e.g. epub/xml readers).
 * `streamtype_data` and the seven callbacks have the same names and
 * signatures as the corresponding members of cst_tokenstream.
 * NOTE(review): presumably each is stored in that member and `open` is
 * invoked with `filename` -- confirm.
 * The symbol-set arguments are as for ts_open().
 */
cst_tokenstream *ts_open_generic(const char *filename,
                                 const cst_string *whitespacesymbols,
                                 const cst_string *singlecharsymbols,
                                 const cst_string *prepunctsymbols,
                                 const cst_string *postpunctsymbols,
                                 void *streamtype_data,
                                 int (*open)(cst_tokenstream *ts,
                                             const char *filename),
                                 void (*close)(cst_tokenstream *ts),
                                 int (*eof)(cst_tokenstream *ts),
                                 int (*seek)(cst_tokenstream *ts, int pos),
                                 int (*tell)(cst_tokenstream *ts),
                                 int (*size)(cst_tokenstream *ts),
                                 int (*getc)(cst_tokenstream *ts));
/* Close a token stream obtained from one of the ts_open* functions.   */
/* NOTE(review): presumably also releases `ts` itself, so it must not  */
/* be used afterwards -- confirm.                                      */
void ts_close(cst_tokenstream *ts);
132 | |
/*
 * Get the expected length of a UTF-8 sequence given its most
 * significant (first) byte c0.
 *
 * On _WIN32 builds the declaration carries the compiler-specific
 * __inline keyword; elsewhere it is an ordinary external function.
 *
 * A body used to sit in this header and is retained for reference only
 * (NOTE(review): presumably the live definition is in the .c file --
 * confirm that it still matches):
 *
 *     return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1;
 */
#ifdef _WIN32
__inline int ts_utf8_sequence_length(char c0);
#else
int ts_utf8_sequence_length(char c0);
#endif
143 | |
/* End-of-input test for `ts`.  NOTE(review): presumably non-zero once */
/* the input is exhausted -- confirm.                                  */
int ts_eof(cst_tokenstream *ts);
/* Read the next token from `ts`.  The result is const and must not be */
/* modified by the caller.  NOTE(review): presumably it points into    */
/* ts->token and is only valid until the next read -- confirm.         */
const cst_string *ts_get(cst_tokenstream *ts);
146 | |
/* Read a token delimited by the `quote` character, with `escape` as   */
/* the escape character.  NOTE(review): whether the quotes are kept in */
/* the returned text, and its lifetime, are not visible here --        */
/* confirm in the implementation.                                      */
const cst_string *ts_get_quoted_token(cst_tokenstream *ts,
                                      char quote,
                                      char escape);
/* Externally specified ts interfaces may need this.                   */
/* Reads one character from `ts` and returns it as a cst_string value  */
/* (not an int).  NOTE(review): how end of input is reported through   */
/* that return type is not visible here -- confirm.                    */
cst_string private_ts_getc(cst_tokenstream *ts);
152 | |
153 | |
/*
 * Set the four symbol sets of `ts`.  Per the note inside
 * cst_tokenstream, the p_*symbols members should only be changed
 * through this function because the charclass table has to be updated
 * whenever they change.
 * NOTE(review): handling of NULL arguments (presumably falling back to
 * the cst_ts_default_* sets) is not visible here -- confirm.
 */
void set_charclasses(cst_tokenstream *ts,
                     const cst_string *whitespace,
                     const cst_string *singlecharsymbols,
                     const cst_string *prepunctuation,
                     const cst_string *postpunctuation);
159 | |
/* Raw read from `ts` into `buff`.  NOTE(review): the argument order   */
/* resembles fread(), so presumably `num` items of `size` bytes are    */
/* read and the count read is returned -- confirm.                     */
int ts_read(void *buff, int size, int num, cst_tokenstream *ts);
161 | |
/* Move the read position of `ts` to `pos`.  NOTE(review): presumably  */
/* routed through the stream's `seek` callback when one is set; the    */
/* meaning of the return value is not visible here -- confirm.         */
int ts_set_stream_pos(cst_tokenstream *ts,int pos);
/* Report the current read position of `ts`.  NOTE(review): presumably */
/* routed through the stream's `tell` callback when one is set --      */
/* confirm.                                                            */
int ts_get_stream_pos(cst_tokenstream *ts);
/* Report the total size of the input behind `ts`.  NOTE(review):      */
/* presumably routed through the stream's `size` callback when one is  */
/* set -- confirm.                                                     */
int ts_get_stream_size(cst_tokenstream *ts);
165 | |
166 | #endif |
167 | |