| 1 | /*************************************************************************/ | 
| 2 | /*                                                                       */ | 
| 3 | /*                  Language Technologies Institute                      */ | 
| 4 | /*                     Carnegie Mellon University                        */ | 
| 5 | /*                        Copyright (c) 1999                             */ | 
| 6 | /*                        All Rights Reserved.                           */ | 
| 7 | /*                                                                       */ | 
| 8 | /*  Permission is hereby granted, free of charge, to use and distribute  */ | 
| 9 | /*  this software and its documentation without restriction, including   */ | 
| 10 | /*  without limitation the rights to use, copy, modify, merge, publish,  */ | 
| 11 | /*  distribute, sublicense, and/or sell copies of this work, and to      */ | 
| 12 | /*  permit persons to whom this work is furnished to do so, subject to   */ | 
| 13 | /*  the following conditions:                                            */ | 
| 14 | /*   1. The code must retain the above copyright notice, this list of    */ | 
| 15 | /*      conditions and the following disclaimer.                         */ | 
| 16 | /*   2. Any modifications must be clearly marked as such.                */ | 
| 17 | /*   3. Original authors' names are not deleted.                         */ | 
| 18 | /*   4. The authors' names are not used to endorse or promote products   */ | 
| 19 | /*      derived from this software without specific prior written        */ | 
| 20 | /*      permission.                                                      */ | 
| 21 | /*                                                                       */ | 
| 22 | /*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */ | 
| 23 | /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */ | 
| 24 | /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */ | 
| 25 | /*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */ | 
| 26 | /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */ | 
| 27 | /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */ | 
| 28 | /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */ | 
| 29 | /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */ | 
| 30 | /*  THIS SOFTWARE.                                                       */ | 
| 31 | /*                                                                       */ | 
| 32 | /*************************************************************************/ | 
| 33 | /*             Author:  Alan W Black (awb@cs.cmu.edu)                    */ | 
| 34 | /*               Date:  July 1999                                        */ | 
| 35 | /*************************************************************************/ | 
| 36 | /*                                                                       */ | 
| 37 | /*  Tokenizer for strings and files                                      */ | 
| 38 | /*                                                                       */ | 
| 39 | /*************************************************************************/ | 
| 40 | #ifndef _CST_TOKENSTREAM_H__ | 
| 41 | #define _CST_TOKENSTREAM_H__ | 
| 42 |  | 
| 43 | #include "cst_alloc.h" | 
| 44 | #include "cst_string.h" | 
| 45 | #include "cst_file.h" | 
| 46 | #include "cst_features.h" | 
| 47 |  | 
| 48 | typedef struct  cst_tokenstream_struct { | 
| 49 |     cst_file fd; | 
| 50 |     int file_pos; | 
| 51 |     int line_number; | 
| 52 |     int eof_flag; | 
| 53 |     cst_string *string_buffer; | 
| 54 |  | 
| 55 |     int current_char; | 
| 56 |  | 
| 57 |     int token_pos; | 
| 58 |     int ws_max; | 
| 59 |     cst_string *whitespace; | 
| 60 |     int prep_max; | 
| 61 |     cst_string *prepunctuation; | 
| 62 |     int token_max; | 
| 63 |     cst_string *token; | 
| 64 |     int postp_max; | 
| 65 |     cst_string *postpunctuation; | 
| 66 |  | 
| 67 |     cst_features *tags;  /* e.g xml tags */ | 
| 68 |     /* if set will find token boundaries at every utf8 character */ | 
| 69 |     int utf8_explode_mode;   | 
| 70 |  | 
| 71 |     void *streamtype_data; | 
| 72 |  | 
| 73 |     /* Should only be set through set_charclasses as charclass table needs */ | 
| 74 |     /* to be updated when you reset these                                  */ | 
| 75 |     const cst_string *p_whitespacesymbols; | 
| 76 |     const cst_string *p_singlecharsymbols; | 
| 77 |     const cst_string *p_prepunctuationsymbols; | 
| 78 |     const cst_string *p_postpunctuationsymbols; | 
| 79 |  | 
| 80 |     cst_string charclass[256]; | 
| 81 |  | 
| 82 |     /* To allow externally specified reading functions e.g. epub/xml */ | 
| 83 |     int (*open)(struct cst_tokenstream_struct *ts, const char *filename); | 
| 84 |     void (*close)(struct cst_tokenstream_struct *ts); | 
| 85 |     int (*eof)(struct cst_tokenstream_struct *ts); | 
| 86 |     int (*seek)(struct cst_tokenstream_struct *ts, int pos); | 
| 87 |     int (*tell)(struct cst_tokenstream_struct *ts); | 
| 88 |     int (*size)(struct cst_tokenstream_struct *ts); | 
| 89 |     int (*getc)(struct cst_tokenstream_struct *ts); | 
| 90 | } cst_tokenstream; | 
| 91 |  | 
/*
 * Character-class flags held in the charclass[] table of a tokenstream.
 * Every non-zero flag is a distinct bit, so several of them can be OR-ed
 * together, both in a table entry and in a ts_charclass() query.
 */
#define TS_CHARCLASS_NONE        0
#define TS_CHARCLASS_WHITESPACE  2
#define TS_CHARCLASS_SINGLECHAR  4
#define TS_CHARCLASS_PREPUNCT    8
#define TS_CHARCLASS_POSTPUNCT  16
#define TS_CHARCLASS_QUOTE      32

/*
 * ts_charclass(C, CLASS, TS)
 *
 *   C      character to classify; converted to unsigned char so that a
 *          negative char value still indexes inside the 256-entry table
 *   CLASS  one TS_CHARCLASS_* flag, or several OR-ed together
 *   TS     pointer to the tokenstream whose table is consulted
 *
 * Evaluates to the subset of CLASS bits set for C (non-zero when C belongs
 * to at least one of the requested classes, 0 otherwise).
 *
 * Every argument is parenthesised in the expansion: '&' binds tighter than
 * '|', so without the parentheses a query such as
 * ts_charclass(c, A | B, ts) would expand to (entry & A) | B and be
 * non-zero for every character.
 */
#define ts_charclass(C,CLASS,TS) ((TS)->charclass[(unsigned char)(C)] & (CLASS))
| 100 |  | 
| 101 | extern const cst_string * const cst_ts_default_whitespacesymbols; | 
| 102 | extern const cst_string * const cst_ts_default_prepunctuationsymbols; | 
| 103 | extern const cst_string * const cst_ts_default_postpunctuationsymbols; | 
| 104 | extern const cst_string * const cst_ts_default_singlecharsymbols; | 
| 105 |  | 
| 106 | /* Public functions for tokenstream manipulation */ | 
| 107 | cst_tokenstream *ts_open(const char *filename, | 
| 108 | 			 const cst_string *whitespacesymbols, | 
| 109 | 			 const cst_string *singlecharsymbols, | 
| 110 | 			 const cst_string *prepunctsymbols, | 
| 111 | 			 const cst_string *postpunctsymbols); | 
| 112 | cst_tokenstream *ts_open_string(const cst_string *string, | 
| 113 | 				const cst_string *whitespacesymbols, | 
| 114 | 				const cst_string *singlecharsymbols, | 
| 115 | 				const cst_string *prepunctsymbols, | 
| 116 | 				const cst_string *postpunctsymbols); | 
| 117 | cst_tokenstream *ts_open_generic(const char *filename, | 
| 118 |                                  const cst_string *whitespacesymbols, | 
| 119 |                                  const cst_string *singlecharsymbols, | 
| 120 |                                  const cst_string *prepunctsymbols, | 
| 121 |                                  const cst_string *postpunctsymbols, | 
| 122 |                                  void *streamtype_data, | 
| 123 |                                  int (*open)(cst_tokenstream *ts, | 
| 124 |                                              const char *filename), | 
| 125 |                                  void (*close)(cst_tokenstream *ts), | 
| 126 |                                  int (*eof)(cst_tokenstream *ts), | 
| 127 |                                  int (*seek)(cst_tokenstream *ts, int pos), | 
| 128 |                                  int (*tell)(cst_tokenstream *ts), | 
| 129 |                                  int (*size)(cst_tokenstream *ts), | 
| 130 |                                  int (*getc)(cst_tokenstream *ts)); | 
| 131 | void ts_close(cst_tokenstream *ts); | 
| 132 |  | 
/*
 * ts_utf8_sequence_length: expected length of a UTF-8 sequence, given its
 * most significant (leading) byte c0.
 *
 * Only the declaration lives here; on _WIN32 it carries the __inline
 * specifier.
 * NOTE(review): a commented-out body that used to follow this declaration
 * computed
 *     ((0xE5000000 >> ((c0 >> 3) & 0x1E)) & 3) + 1
 * -- confirm that the real definition still matches.
 */
#if defined(_WIN32)
__inline int ts_utf8_sequence_length(char c0);
#else
int ts_utf8_sequence_length(char c0);
#endif
| 143 |  | 
| 144 | int ts_eof(cst_tokenstream *ts); | 
| 145 | const cst_string *ts_get(cst_tokenstream *ts); | 
| 146 |  | 
| 147 | const cst_string *ts_get_quoted_token(cst_tokenstream *ts, | 
| 148 | 				char quote, | 
| 149 | 				char escape); | 
| 150 | /* Externally specified ts interfaces may need this */ | 
| 151 | cst_string private_ts_getc(cst_tokenstream *ts); | 
| 152 |  | 
| 153 |  | 
| 154 | void set_charclasses(cst_tokenstream *ts, | 
| 155 | 		     const cst_string *whitespace, | 
| 156 | 		     const cst_string *singlecharsymbols, | 
| 157 | 		     const cst_string *prepunctuation, | 
| 158 | 		     const cst_string *postpunctuation); | 
| 159 |  | 
| 160 | int ts_read(void *buff, int size, int num, cst_tokenstream *ts); | 
| 161 |  | 
| 162 | int ts_set_stream_pos(cst_tokenstream *ts,int pos); | 
| 163 | int ts_get_stream_pos(cst_tokenstream *ts); | 
| 164 | int ts_get_stream_size(cst_tokenstream *ts); | 
| 165 |  | 
| 166 | #endif | 
| 167 |  |