/*************************************************************************/
/* */
/* Language Technologies Institute */
/* Carnegie Mellon University */
/* Copyright (c) 1999 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and distribute */
/* this software and its documentation without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of this work, and to */
/* permit persons to whom this work is furnished to do so, subject to */
/* the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* 4. The authors' names are not used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* Author: Alan W Black (awb@cs.cmu.edu) */
/* Date: July 1999 */
/*************************************************************************/
/* */
/* Tokenizer for strings and files */
/* */
/*************************************************************************/
#ifndef _CST_TOKENSTREAM_H__
#define _CST_TOKENSTREAM_H__

#include "cst_alloc.h"
#include "cst_string.h"
#include "cst_file.h"
#include "cst_features.h"

48typedef struct cst_tokenstream_struct {
49 cst_file fd;
50 int file_pos;
51 int line_number;
52 int eof_flag;
53 cst_string *string_buffer;
54
55 int current_char;
56
57 int token_pos;
58 int ws_max;
59 cst_string *whitespace;
60 int prep_max;
61 cst_string *prepunctuation;
62 int token_max;
63 cst_string *token;
64 int postp_max;
65 cst_string *postpunctuation;
66
67 cst_features *tags; /* e.g xml tags */
68 /* if set will find token boundaries at every utf8 character */
69 int utf8_explode_mode;
70
71 void *streamtype_data;
72
73 /* Should only be set through set_charclasses as charclass table needs */
74 /* to be updated when you reset these */
75 const cst_string *p_whitespacesymbols;
76 const cst_string *p_singlecharsymbols;
77 const cst_string *p_prepunctuationsymbols;
78 const cst_string *p_postpunctuationsymbols;
79
80 cst_string charclass[256];
81
82 /* To allow externally specified reading functions e.g. epub/xml */
83 int (*open)(struct cst_tokenstream_struct *ts, const char *filename);
84 void (*close)(struct cst_tokenstream_struct *ts);
85 int (*eof)(struct cst_tokenstream_struct *ts);
86 int (*seek)(struct cst_tokenstream_struct *ts, int pos);
87 int (*tell)(struct cst_tokenstream_struct *ts);
88 int (*size)(struct cst_tokenstream_struct *ts);
89 int (*getc)(struct cst_tokenstream_struct *ts);
90} cst_tokenstream;
91
/*
 * Character-class flags.  Every non-zero flag is a distinct single bit,
 * so any combination of them fits in one mask.  Bit 0 (value 1) is not
 * used by any flag defined here.
 */
#define TS_CHARCLASS_NONE        0
#define TS_CHARCLASS_WHITESPACE  2
#define TS_CHARCLASS_SINGLECHAR  4
#define TS_CHARCLASS_PREPUNCT    8
#define TS_CHARCLASS_POSTPUNCT   16
#define TS_CHARCLASS_QUOTE       32

/*
 * Yields the bits of CLASS that are set in TS's charclass entry for
 * character C (zero when none are set).
 *
 * C is converted to unsigned char before indexing so that a negative
 * plain-char value cannot index before the start of the table.  C and
 * CLASS are parenthesized so that expression arguments such as
 * `A | B` or `c + 1` are evaluated as a unit before the cast and the
 * bitwise AND are applied.
 */
#define ts_charclass(C,CLASS,TS) ((TS)->charclass[(unsigned char)(C)] & (CLASS))

101extern const cst_string * const cst_ts_default_whitespacesymbols;
102extern const cst_string * const cst_ts_default_prepunctuationsymbols;
103extern const cst_string * const cst_ts_default_postpunctuationsymbols;
104extern const cst_string * const cst_ts_default_singlecharsymbols;
105
106/* Public functions for tokenstream manipulation */
107cst_tokenstream *ts_open(const char *filename,
108 const cst_string *whitespacesymbols,
109 const cst_string *singlecharsymbols,
110 const cst_string *prepunctsymbols,
111 const cst_string *postpunctsymbols);
112cst_tokenstream *ts_open_string(const cst_string *string,
113 const cst_string *whitespacesymbols,
114 const cst_string *singlecharsymbols,
115 const cst_string *prepunctsymbols,
116 const cst_string *postpunctsymbols);
117cst_tokenstream *ts_open_generic(const char *filename,
118 const cst_string *whitespacesymbols,
119 const cst_string *singlecharsymbols,
120 const cst_string *prepunctsymbols,
121 const cst_string *postpunctsymbols,
122 void *streamtype_data,
123 int (*open)(cst_tokenstream *ts,
124 const char *filename),
125 void (*close)(cst_tokenstream *ts),
126 int (*eof)(cst_tokenstream *ts),
127 int (*seek)(cst_tokenstream *ts, int pos),
128 int (*tell)(cst_tokenstream *ts),
129 int (*size)(cst_tokenstream *ts),
130 int (*getc)(cst_tokenstream *ts));
131void ts_close(cst_tokenstream *ts);
132
/*
 * Declared __inline when _WIN32 is defined and as an ordinary function
 * otherwise.  No definition is visible in this header.
 *
 * A commented-out body was left next to this declaration; it is kept
 * here for reference (NOTE(review): confirm it matches the definition
 * that is actually compiled):
 *
 *     Get the expected length of UTF8 sequence given its most
 *     significant byte:
 *
 *     return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1;
 */
#ifdef _WIN32
__inline int ts_utf8_sequence_length(char c0);
#else
int ts_utf8_sequence_length(char c0);
#endif

144int ts_eof(cst_tokenstream *ts);
145const cst_string *ts_get(cst_tokenstream *ts);
146
147const cst_string *ts_get_quoted_token(cst_tokenstream *ts,
148 char quote,
149 char escape);
150/* Externally specified ts interfaces may need this */
151cst_string private_ts_getc(cst_tokenstream *ts);
152
153
154void set_charclasses(cst_tokenstream *ts,
155 const cst_string *whitespace,
156 const cst_string *singlecharsymbols,
157 const cst_string *prepunctuation,
158 const cst_string *postpunctuation);
159
160int ts_read(void *buff, int size, int num, cst_tokenstream *ts);
161
162int ts_set_stream_pos(cst_tokenstream *ts,int pos);
163int ts_get_stream_pos(cst_tokenstream *ts);
164int ts_get_stream_size(cst_tokenstream *ts);
165
#endif /* _CST_TOKENSTREAM_H__ */

/* source code of include/flite/cst_tokenstream.h */