| 1 | /*************************************************************************/ |
| 2 | /* */ |
| 3 | /* Language Technologies Institute */ |
| 4 | /* Carnegie Mellon University */ |
| 5 | /* Copyright (c) 1999 */ |
| 6 | /* All Rights Reserved. */ |
| 7 | /* */ |
| 8 | /* Permission is hereby granted, free of charge, to use and distribute */ |
| 9 | /* this software and its documentation without restriction, including */ |
| 10 | /* without limitation the rights to use, copy, modify, merge, publish, */ |
| 11 | /* distribute, sublicense, and/or sell copies of this work, and to */ |
| 12 | /* permit persons to whom this work is furnished to do so, subject to */ |
| 13 | /* the following conditions: */ |
| 14 | /* 1. The code must retain the above copyright notice, this list of */ |
| 15 | /* conditions and the following disclaimer. */ |
| 16 | /* 2. Any modifications must be clearly marked as such. */ |
| 17 | /* 3. Original authors' names are not deleted. */ |
| 18 | /* 4. The authors' names are not used to endorse or promote products */ |
| 19 | /* derived from this software without specific prior written */ |
| 20 | /* permission. */ |
| 21 | /* */ |
| 22 | /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */ |
| 23 | /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ |
| 24 | /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ |
| 25 | /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */ |
| 26 | /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ |
| 27 | /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ |
| 28 | /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ |
| 29 | /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ |
| 30 | /* THIS SOFTWARE. */ |
| 31 | /* */ |
| 32 | /*************************************************************************/ |
| 33 | /* Author: Alan W Black (awb@cs.cmu.edu) */ |
| 34 | /* Date: July 1999 */ |
| 35 | /*************************************************************************/ |
| 36 | /* */ |
| 37 | /* Tokenizer for strings and files */ |
| 38 | /* */ |
| 39 | /*************************************************************************/ |
| 40 | #ifndef _CST_TOKENSTREAM_H__ |
| 41 | #define _CST_TOKENSTREAM_H__ |
| 42 | |
| 43 | #include "cst_alloc.h" |
| 44 | #include "cst_string.h" |
| 45 | #include "cst_file.h" |
| 46 | #include "cst_features.h" |
| 47 | |
| 48 | typedef struct cst_tokenstream_struct { |
| 49 | cst_file fd; |
| 50 | int file_pos; |
| 51 | int line_number; |
| 52 | int eof_flag; |
| 53 | cst_string *string_buffer; |
| 54 | |
| 55 | int current_char; |
| 56 | |
| 57 | int token_pos; |
| 58 | int ws_max; |
| 59 | cst_string *whitespace; |
| 60 | int prep_max; |
| 61 | cst_string *prepunctuation; |
| 62 | int token_max; |
| 63 | cst_string *token; |
| 64 | int postp_max; |
| 65 | cst_string *postpunctuation; |
| 66 | |
| 67 | cst_features *tags; /* e.g xml tags */ |
| 68 | /* if set will find token boundaries at every utf8 character */ |
| 69 | int utf8_explode_mode; |
| 70 | |
| 71 | void *streamtype_data; |
| 72 | |
| 73 | /* Should only be set through set_charclasses as charclass table needs */ |
| 74 | /* to be updated when you reset these */ |
| 75 | const cst_string *p_whitespacesymbols; |
| 76 | const cst_string *p_singlecharsymbols; |
| 77 | const cst_string *p_prepunctuationsymbols; |
| 78 | const cst_string *p_postpunctuationsymbols; |
| 79 | |
| 80 | cst_string charclass[256]; |
| 81 | |
| 82 | /* To allow externally specified reading functions e.g. epub/xml */ |
| 83 | int (*open)(struct cst_tokenstream_struct *ts, const char *filename); |
| 84 | void (*close)(struct cst_tokenstream_struct *ts); |
| 85 | int (*eof)(struct cst_tokenstream_struct *ts); |
| 86 | int (*seek)(struct cst_tokenstream_struct *ts, int pos); |
| 87 | int (*tell)(struct cst_tokenstream_struct *ts); |
| 88 | int (*size)(struct cst_tokenstream_struct *ts); |
| 89 | int (*getc)(struct cst_tokenstream_struct *ts); |
| 90 | } cst_tokenstream; |
| 91 | |
/*
 * Character-class bit masks, meant to be tested against the per-character
 * charclass table with ts_charclass().  Each class owns one bit so several
 * classes can be combined; bit 0x01 is not assigned here.
 */
#define TS_CHARCLASS_NONE        0x00
#define TS_CHARCLASS_WHITESPACE  0x02
#define TS_CHARCLASS_SINGLECHAR  0x04
#define TS_CHARCLASS_PREPUNCT    0x08
#define TS_CHARCLASS_POSTPUNCT   0x10
#define TS_CHARCLASS_QUOTE       0x20
| 98 | |
/*
 * Evaluates to nonzero when character C has any of the bits in CLASS set
 * in TS's charclass table, and to zero otherwise.
 *
 *   C     - character to look up; the whole expression is converted to
 *           unsigned char so that negative char values cannot produce a
 *           negative index
 *   CLASS - one TS_CHARCLASS_* mask, or several OR-ed together
 *   TS    - pointer to an object with a charclass[] member
 *
 * Every argument is parenthesized in the expansion: without that, an
 * OR-ed CLASS such as (A | B) would expand to (x & A) | B, and the cast
 * would apply only to the first operand of an expression passed as C.
 */
#define ts_charclass(C,CLASS,TS) ((TS)->charclass[(unsigned char)(C)] & (CLASS))
| 100 | |
| 101 | extern const cst_string * const cst_ts_default_whitespacesymbols; |
| 102 | extern const cst_string * const cst_ts_default_prepunctuationsymbols; |
| 103 | extern const cst_string * const cst_ts_default_postpunctuationsymbols; |
| 104 | extern const cst_string * const cst_ts_default_singlecharsymbols; |
| 105 | |
| 106 | /* Public functions for tokenstream manipulation */ |
| 107 | cst_tokenstream *ts_open(const char *filename, |
| 108 | const cst_string *whitespacesymbols, |
| 109 | const cst_string *singlecharsymbols, |
| 110 | const cst_string *prepunctsymbols, |
| 111 | const cst_string *postpunctsymbols); |
| 112 | cst_tokenstream *ts_open_string(const cst_string *string, |
| 113 | const cst_string *whitespacesymbols, |
| 114 | const cst_string *singlecharsymbols, |
| 115 | const cst_string *prepunctsymbols, |
| 116 | const cst_string *postpunctsymbols); |
| 117 | cst_tokenstream *ts_open_generic(const char *filename, |
| 118 | const cst_string *whitespacesymbols, |
| 119 | const cst_string *singlecharsymbols, |
| 120 | const cst_string *prepunctsymbols, |
| 121 | const cst_string *postpunctsymbols, |
| 122 | void *streamtype_data, |
| 123 | int (*open)(cst_tokenstream *ts, |
| 124 | const char *filename), |
| 125 | void (*close)(cst_tokenstream *ts), |
| 126 | int (*eof)(cst_tokenstream *ts), |
| 127 | int (*seek)(cst_tokenstream *ts, int pos), |
| 128 | int (*tell)(cst_tokenstream *ts), |
| 129 | int (*size)(cst_tokenstream *ts), |
| 130 | int (*getc)(cst_tokenstream *ts)); |
| 131 | void ts_close(cst_tokenstream *ts); |
| 132 | |
/*
 * Get the expected length of a UTF8 sequence given its most significant
 * byte c0.  Only the declaration is provided here; on _WIN32 it carries
 * the __inline specifier.
 *
 * An inline body that used to accompany this declaration (kept here
 * for reference) computed:
 *     (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1
 */
#ifdef _WIN32
__inline
#endif
int ts_utf8_sequence_length(char c0);
| 143 | |
| 144 | int ts_eof(cst_tokenstream *ts); |
| 145 | const cst_string *ts_get(cst_tokenstream *ts); |
| 146 | |
| 147 | const cst_string *ts_get_quoted_token(cst_tokenstream *ts, |
| 148 | char quote, |
| 149 | char escape); |
| 150 | /* Externally specified ts interfaces may need this */ |
| 151 | cst_string private_ts_getc(cst_tokenstream *ts); |
| 152 | |
| 153 | |
| 154 | void set_charclasses(cst_tokenstream *ts, |
| 155 | const cst_string *whitespace, |
| 156 | const cst_string *singlecharsymbols, |
| 157 | const cst_string *prepunctuation, |
| 158 | const cst_string *postpunctuation); |
| 159 | |
| 160 | int ts_read(void *buff, int size, int num, cst_tokenstream *ts); |
| 161 | |
| 162 | int ts_set_stream_pos(cst_tokenstream *ts,int pos); |
| 163 | int ts_get_stream_pos(cst_tokenstream *ts); |
| 164 | int ts_get_stream_size(cst_tokenstream *ts); |
| 165 | |
| 166 | #endif |
| 167 | |