1 | /* |
2 | * Summary: interface for an HTML 4.0 non-verifying parser |
3 | * Description: this module implements an HTML 4.0 non-verifying parser |
4 | * with API compatible with the XML parser ones. It should |
5 | * be able to parse "real world" HTML, even if severely |
6 | * broken from a specification point of view. |
7 | * |
8 | * Copy: See Copyright for the status of this software. |
9 | * |
10 | * Author: Daniel Veillard |
11 | */ |
12 | |
13 | #ifndef __HTML_PARSER_H__ |
14 | #define __HTML_PARSER_H__ |
15 | #include <libxml/xmlversion.h> |
16 | #include <libxml/parser.h> |
17 | |
18 | #ifdef LIBXML_HTML_ENABLED |
19 | |
20 | #ifdef __cplusplus |
21 | extern "C" { |
22 | #endif |
23 | |
24 | /* |
25 | * Most of the back-end structures from XML and HTML are shared. |
26 | */ |
27 | typedef xmlParserCtxt htmlParserCtxt; |
28 | typedef xmlParserCtxtPtr htmlParserCtxtPtr; |
29 | typedef xmlParserNodeInfo htmlParserNodeInfo; |
30 | typedef xmlSAXHandler htmlSAXHandler; |
31 | typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; |
32 | typedef xmlParserInput htmlParserInput; |
33 | typedef xmlParserInputPtr htmlParserInputPtr; |
34 | typedef xmlDocPtr htmlDocPtr; |
35 | typedef xmlNodePtr htmlNodePtr; |
36 | |
/*
 * Internal description of an HTML element.  One layout serves both
 * HTML 4.01 and XHTML 1.0, which share the same structure.
 */
struct _htmlElemDesc {
    const char *name;           /* the tag name */
    char startTag;              /* whether the start tag can be implied */
    char endTag;                /* whether the end tag can be implied */
    char saveEndTag;            /* whether the end tag should be saved */
    char empty;                 /* is this an empty element? */
    char depr;                  /* is this a deprecated element? */
    char dtd;                   /* 1: only in Loose DTD, 2: only Frameset one */
    char isinline;              /* 0: block element, 1: inline element */
    const char *desc;           /* the description */

    /*
     * NRK Jan.2003: fields encapsulating HTML structure.
     *
     * Known limitations: this is a very limited representation.  It
     * records only whether sub-elements are *allowed*, never whether
     * they are *required*, and it does not say where CDATA and PCDATA
     * are allowed.  Element relationships that are not fully
     * represented are flagged with the word MODIFIER.
     */
    const char **subelts;       /* allowed sub-elements of this element */
    const char *defaultsubelt;  /* sub-element suggested for auto-repair
                                   if necessary, or NULL */
    const char **attrs_opt;     /* optional attributes */
    const char **attrs_depr;    /* additional deprecated attributes */
    const char **attrs_req;     /* required attributes */
};
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc *htmlElemDescPtr;
71 | |
/*
 * Internal description of an HTML entity.
 */
struct _htmlEntityDesc {
    unsigned int value;     /* the UNICODE value for the character */
    const char *name;       /* the entity name */
    const char *desc;       /* the description */
};
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;
82 | |
83 | /* |
84 | * There is only few public functions. |
85 | */ |
86 | XMLPUBFUN const htmlElemDesc * XMLCALL |
87 | htmlTagLookup (const xmlChar *tag); |
88 | XMLPUBFUN const htmlEntityDesc * XMLCALL |
89 | htmlEntityLookup(const xmlChar *name); |
90 | XMLPUBFUN const htmlEntityDesc * XMLCALL |
91 | htmlEntityValueLookup(unsigned int value); |
92 | |
93 | XMLPUBFUN int XMLCALL |
94 | htmlIsAutoClosed(htmlDocPtr doc, |
95 | htmlNodePtr elem); |
96 | XMLPUBFUN int XMLCALL |
97 | htmlAutoCloseTag(htmlDocPtr doc, |
98 | const xmlChar *name, |
99 | htmlNodePtr elem); |
100 | XMLPUBFUN const htmlEntityDesc * XMLCALL |
101 | htmlParseEntityRef(htmlParserCtxtPtr ctxt, |
102 | const xmlChar **str); |
103 | XMLPUBFUN int XMLCALL |
104 | htmlParseCharRef(htmlParserCtxtPtr ctxt); |
105 | XMLPUBFUN void XMLCALL |
106 | htmlParseElement(htmlParserCtxtPtr ctxt); |
107 | |
108 | XMLPUBFUN htmlParserCtxtPtr XMLCALL |
109 | htmlNewParserCtxt(void); |
110 | |
111 | XMLPUBFUN htmlParserCtxtPtr XMLCALL |
112 | htmlCreateMemoryParserCtxt(const char *buffer, |
113 | int size); |
114 | |
115 | XMLPUBFUN int XMLCALL |
116 | htmlParseDocument(htmlParserCtxtPtr ctxt); |
117 | XMLPUBFUN htmlDocPtr XMLCALL |
118 | htmlSAXParseDoc (const xmlChar *cur, |
119 | const char *encoding, |
120 | htmlSAXHandlerPtr sax, |
121 | void *userData); |
122 | XMLPUBFUN htmlDocPtr XMLCALL |
123 | htmlParseDoc (const xmlChar *cur, |
124 | const char *encoding); |
125 | XMLPUBFUN htmlDocPtr XMLCALL |
126 | htmlSAXParseFile(const char *filename, |
127 | const char *encoding, |
128 | htmlSAXHandlerPtr sax, |
129 | void *userData); |
130 | XMLPUBFUN htmlDocPtr XMLCALL |
131 | htmlParseFile (const char *filename, |
132 | const char *encoding); |
133 | XMLPUBFUN int XMLCALL |
134 | UTF8ToHtml (unsigned char *out, |
135 | int *outlen, |
136 | const unsigned char *in, |
137 | int *inlen); |
138 | XMLPUBFUN int XMLCALL |
139 | htmlEncodeEntities(unsigned char *out, |
140 | int *outlen, |
141 | const unsigned char *in, |
142 | int *inlen, int quoteChar); |
143 | XMLPUBFUN int XMLCALL |
144 | htmlIsScriptAttribute(const xmlChar *name); |
145 | XMLPUBFUN int XMLCALL |
146 | htmlHandleOmittedElem(int val); |
147 | |
#ifdef LIBXML_PUSH_ENABLED
/**
 * Interfaces for the Push mode.
 *
 * Only declared when LIBXML_PUSH_ENABLED is defined.
 */
XMLPUBFUN htmlParserCtxtPtr XMLCALL
    htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
                             void *user_data,
                             const char *chunk,
                             int size,
                             const char *filename,
                             xmlCharEncoding enc);
XMLPUBFUN int XMLCALL
    htmlParseChunk(htmlParserCtxtPtr ctxt,
                   const char *chunk,
                   int size,
                   int terminate);
#endif /* LIBXML_PUSH_ENABLED */
165 | |
166 | XMLPUBFUN void XMLCALL |
167 | htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); |
168 | |
169 | /* |
170 | * New set of simpler/more flexible APIs |
171 | */ |
/**
 * htmlParserOption:
 *
 * This is the set of HTML parser options that can be passed down
 * to the htmlReadDoc() and similar calls.
 */
/*
 * Bit flags; each enumerator occupies a distinct bit so that several
 * can be OR'ed together into a single int.
 */
typedef enum {
    HTML_PARSE_RECOVER    = 1 << 0,     /* Relaxed parsing */
    HTML_PARSE_NODEFDTD   = 1 << 2,     /* do not default a doctype if not found */
    HTML_PARSE_NOERROR    = 1 << 5,     /* suppress error reports */
    HTML_PARSE_NOWARNING  = 1 << 6,     /* suppress warning reports */
    HTML_PARSE_PEDANTIC   = 1 << 7,     /* pedantic error reporting */
    HTML_PARSE_NOBLANKS   = 1 << 8,     /* remove blank nodes */
    HTML_PARSE_NONET      = 1 << 11,    /* Forbid network access */
    HTML_PARSE_NOIMPLIED  = 1 << 13,    /* Do not add implied html/body... elements */
    HTML_PARSE_COMPACT    = 1 << 16,    /* compact small text nodes */
    HTML_PARSE_IGNORE_ENC = 1 << 21     /* ignore internal document encoding hint */
} htmlParserOption;
190 | |
191 | XMLPUBFUN void XMLCALL |
192 | htmlCtxtReset (htmlParserCtxtPtr ctxt); |
193 | XMLPUBFUN int XMLCALL |
194 | htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, |
195 | int options); |
196 | XMLPUBFUN htmlDocPtr XMLCALL |
197 | htmlReadDoc (const xmlChar *cur, |
198 | const char *URL, |
199 | const char *encoding, |
200 | int options); |
201 | XMLPUBFUN htmlDocPtr XMLCALL |
202 | htmlReadFile (const char *URL, |
203 | const char *encoding, |
204 | int options); |
205 | XMLPUBFUN htmlDocPtr XMLCALL |
206 | htmlReadMemory (const char *buffer, |
207 | int size, |
208 | const char *URL, |
209 | const char *encoding, |
210 | int options); |
211 | XMLPUBFUN htmlDocPtr XMLCALL |
212 | htmlReadFd (int fd, |
213 | const char *URL, |
214 | const char *encoding, |
215 | int options); |
216 | XMLPUBFUN htmlDocPtr XMLCALL |
217 | htmlReadIO (xmlInputReadCallback ioread, |
218 | xmlInputCloseCallback ioclose, |
219 | void *ioctx, |
220 | const char *URL, |
221 | const char *encoding, |
222 | int options); |
223 | XMLPUBFUN htmlDocPtr XMLCALL |
224 | htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, |
225 | const xmlChar *cur, |
226 | const char *URL, |
227 | const char *encoding, |
228 | int options); |
229 | XMLPUBFUN htmlDocPtr XMLCALL |
230 | htmlCtxtReadFile (xmlParserCtxtPtr ctxt, |
231 | const char *filename, |
232 | const char *encoding, |
233 | int options); |
234 | XMLPUBFUN htmlDocPtr XMLCALL |
235 | htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, |
236 | const char *buffer, |
237 | int size, |
238 | const char *URL, |
239 | const char *encoding, |
240 | int options); |
241 | XMLPUBFUN htmlDocPtr XMLCALL |
242 | htmlCtxtReadFd (xmlParserCtxtPtr ctxt, |
243 | int fd, |
244 | const char *URL, |
245 | const char *encoding, |
246 | int options); |
247 | XMLPUBFUN htmlDocPtr XMLCALL |
248 | htmlCtxtReadIO (xmlParserCtxtPtr ctxt, |
249 | xmlInputReadCallback ioread, |
250 | xmlInputCloseCallback ioclose, |
251 | void *ioctx, |
252 | const char *URL, |
253 | const char *encoding, |
254 | int options); |
255 | |
/*
 * NRK/Jan2003: further knowledge of HTML structure.
 *
 * HTML_REQUIRED (0xc) deliberately includes the HTML_VALID bit, so that
 * (status & HTML_VALID) is TRUE for required items as well.
 */
typedef enum {
    HTML_NA         = 0,                    /* something we don't check at all */
    HTML_INVALID    = 1 << 0,
    HTML_DEPRECATED = 1 << 1,
    HTML_VALID      = 1 << 2,
    HTML_REQUIRED   = (1 << 3) | HTML_VALID
} htmlStatus;
265 | |
266 | /* Using htmlElemDesc rather than name here, to emphasise the fact |
267 | that otherwise there's a lookup overhead |
268 | */ |
269 | XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; |
270 | XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; |
271 | XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; |
272 | XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ; |
/**
 * htmlDefaultSubelement:
 * @elt: HTML element
 *
 * Expands to the defaultsubelt member of the element description that
 * @elt points to.  Both the argument and the whole expansion are
 * parenthesised so that arguments such as `&desc` or `table + i`
 * select the member of the intended object.
 *
 * Returns the default subelement for this element
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
/**
 * htmlElementAllowedHereDesc:
 * @parent: HTML parent element
 * @elt: HTML element
 *
 * Checks whether an HTML element description may be a
 * direct child of the specified element.
 *
 * Forwards to htmlElementAllowedHere() with the name of @elt.  The name
 * member is declared `const char *` while htmlElementAllowedHere()
 * takes `const xmlChar *`; the explicit cast keeps the call well-typed
 * where those pointer types are not implicitly convertible (for
 * instance when this header is compiled as C++).
 *
 * Returns 1 if allowed; 0 otherwise.
 */
#define htmlElementAllowedHereDesc(parent,elt) \
    htmlElementAllowedHere((parent), (const xmlChar *)(elt)->name)
/**
 * htmlRequiredAttrs:
 * @elt: HTML element
 *
 * Expands to the attrs_req member of the element description that
 * @elt points to.
 *
 * Returns the attributes required for the specified element.
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
299 | |
300 | |
301 | #ifdef __cplusplus |
302 | } |
303 | #endif |
304 | |
305 | #endif /* LIBXML_HTML_ENABLED */ |
306 | #endif /* __HTML_PARSER_H__ */ |
307 | |