2 * Summary: interface for an HTML 4.0 non-verifying parser
3 * Description: this module implements an HTML 4.0 non-verifying parser
4 * with API compatible with the XML parser ones. It should
5 * be able to parse "real world" HTML, even if severely
6 * broken from a specification point of view.
8 * Copy: See Copyright for the status of this software.
10 * Author: Daniel Veillard
11 * Portion Copyright © 2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
22 #include <stdapis/libxml2/libxml2_parser.h>
29 * Most of the back-end structures from XML and HTML are shared.
31 typedef xmlParserCtxt htmlParserCtxt;
32 typedef xmlParserCtxtPtr htmlParserCtxtPtr;
33 typedef xmlParserNodeInfo htmlParserNodeInfo;
34 typedef xmlSAXHandler htmlSAXHandler;
35 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
36 typedef xmlParserInput htmlParserInput;
37 typedef xmlParserInputPtr htmlParserInputPtr;
38 typedef xmlDocPtr htmlDocPtr;
39 typedef xmlNodePtr htmlNodePtr;
42 * Internal description of an HTML element, representing HTML 4.01
43 * and XHTML 1.0 (which share the same structure).
45 typedef struct _htmlElemDesc htmlElemDesc;
46 typedef htmlElemDesc *htmlElemDescPtr;
47 struct _htmlElemDesc {
48 const char *name; /* The tag name */
49 char startTag; /* Whether the start tag can be implied */
50 char endTag; /* Whether the end tag can be implied */
51 char saveEndTag; /* Whether the end tag should be saved */
52 char empty; /* Is this an empty element ? */
53 char depr; /* Is this a deprecated element ? */
54 char dtd; /* 1: only in Loose DTD, 2: only Frameset one */
55 char isinline; /* is this a block 0 or inline 1 element */
56 const char *desc; /* the description */
59 * New fields encapsulating HTML structure
62 * This is a very limited representation. It fails to tell us when
63 * an element *requires* subelements (we only have whether they're
64 * allowed or not), and it doesn't tell us where CDATA and PCDATA
65 * are allowed. Some element relationships are not fully represented:
66 * these are flagged with the word MODIFIER
68 const char** subelts; /* allowed sub-elements of this element */
69 const char* defaultsubelt; /* subelement for suggested auto-repair
70 if necessary or NULL */
71 const char** attrs_opt; /* Optional Attributes */
72 const char** attrs_depr; /* Additional deprecated attributes */
73 const char** attrs_req; /* Required attributes */
77 * Internal description of an HTML entity.
79 typedef struct _htmlEntityDesc htmlEntityDesc;
80 typedef htmlEntityDesc *htmlEntityDescPtr;
81 struct _htmlEntityDesc {
82 unsigned int value; /* the UNICODE value for the character */
83 const char *name; /* The entity name */
84 const char *desc; /* the description */
87 #if defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT)
89 * There are only a few public functions.
91 XMLPUBFUN const htmlElemDesc * XMLCALL
92 htmlTagLookup (const xmlChar *tag);
94 #endif /* LIBXML_HTML_ENABLED || XMLENGINE_XSLT */
96 #ifdef LIBXML_HTML_ENABLED
98 XMLPUBFUN const htmlEntityDesc * XMLCALL
99 htmlEntityLookup(const xmlChar *name);
100 XMLPUBFUN const htmlEntityDesc * XMLCALL
101 htmlEntityValueLookup(unsigned int value);
103 XMLPUBFUN int XMLCALL
104 htmlIsAutoClosed(htmlDocPtr doc,
106 XMLPUBFUN int XMLCALL
107 htmlAutoCloseTag(htmlDocPtr doc,
110 XMLPUBFUN const htmlEntityDesc * XMLCALL
111 htmlParseEntityRef(htmlParserCtxtPtr ctxt,
112 const xmlChar **str);
113 XMLPUBFUN int XMLCALL
114 htmlParseCharRef(htmlParserCtxtPtr ctxt);
115 XMLPUBFUN void XMLCALL
116 htmlParseElement(htmlParserCtxtPtr ctxt);
118 XMLPUBFUN htmlParserCtxtPtr XMLCALL
119 htmlCreateMemoryParserCtxt(const char *buffer,
122 XMLPUBFUN int XMLCALL
123 htmlParseDocument(htmlParserCtxtPtr ctxt);
124 XMLPUBFUN htmlDocPtr XMLCALL
125 htmlSAXParseDoc (xmlChar *cur,
126 const char *encoding,
127 htmlSAXHandlerPtr sax,
129 XMLPUBFUN htmlDocPtr XMLCALL
130 htmlParseDoc (xmlChar *cur,
131 const char *encoding);
132 XMLPUBFUN htmlDocPtr XMLCALL
133 htmlSAXParseFile(const char *filename,
134 const char *encoding,
135 htmlSAXHandlerPtr sax,
137 XMLPUBFUN htmlDocPtr XMLCALL
138 htmlParseFile (const char *filename,
139 const char *encoding);
140 XMLPUBFUN int XMLCALL
141 UTF8ToHtml (unsigned char *out,
143 const unsigned char *in,
145 XMLPUBFUN int XMLCALL
146 htmlEncodeEntities(unsigned char *out,
148 const unsigned char *in,
149 int *inlen, int quoteChar);
150 XMLPUBFUN int XMLCALL
151 htmlIsScriptAttribute(const xmlChar *name);
152 XMLPUBFUN int XMLCALL
153 htmlHandleOmittedElem(int val);
155 #ifdef LIBXML_PUSH_ENABLED
157 * Interfaces for the Push mode.
159 XMLPUBFUN htmlParserCtxtPtr XMLCALL
160 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
164 const char *filename,
165 xmlCharEncoding enc);
166 XMLPUBFUN int XMLCALL
167 htmlParseChunk (htmlParserCtxtPtr ctxt,
171 #endif /* LIBXML_PUSH_ENABLED */
173 XMLPUBFUN void XMLCALL
174 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
177 * New set of simpler/more flexible APIs
182 * This is the set of HTML parser options that can be passed down
183 * to the htmlReadDoc() and similar calls.
186 HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */
187 HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
188 HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */
189 HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */
190 HTML_PARSE_NONET = 1<<11 /* Forbid network access */
193 XMLPUBFUN void XMLCALL
194 htmlCtxtReset (htmlParserCtxtPtr ctxt);
195 XMLPUBFUN int XMLCALL
196 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
198 XMLPUBFUN htmlDocPtr XMLCALL
199 htmlReadDoc (const xmlChar *cur,
201 const char *encoding,
203 XMLPUBFUN htmlDocPtr XMLCALL
204 htmlReadFile (const char *URL,
205 const char *encoding,
207 XMLPUBFUN htmlDocPtr XMLCALL
208 htmlReadMemory (const char *buffer,
211 const char *encoding,
213 XMLPUBFUN htmlDocPtr XMLCALL
216 const char *encoding,
218 XMLPUBFUN htmlDocPtr XMLCALL
219 htmlReadIO (xmlInputReadCallback ioread,
220 xmlInputCloseCallback ioclose,
223 const char *encoding,
225 XMLPUBFUN htmlDocPtr XMLCALL
226 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt,
229 const char *encoding,
231 XMLPUBFUN htmlDocPtr XMLCALL
232 htmlCtxtReadFile (xmlParserCtxtPtr ctxt,
233 const char *filename,
234 const char *encoding,
236 XMLPUBFUN htmlDocPtr XMLCALL
237 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt,
241 const char *encoding,
243 XMLPUBFUN htmlDocPtr XMLCALL
244 htmlCtxtReadFd (xmlParserCtxtPtr ctxt,
247 const char *encoding,
249 XMLPUBFUN htmlDocPtr XMLCALL
250 htmlCtxtReadIO (xmlParserCtxtPtr ctxt,
251 xmlInputReadCallback ioread,
252 xmlInputCloseCallback ioclose,
255 const char *encoding,
258 /* NRK/Jan2003: further knowledge of HTML structure
261 HTML_NA = 0 , /* something we don't check at all */
263 HTML_DEPRECATED = 0x2 ,
265 HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */
268 /* Using htmlElemDesc rather than name here, to emphasise the fact
269 that otherwise there's a lookup overhead
271 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
272 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
273 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
274 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ;
/**
 * htmlDefaultSubelement:
 * @param elt HTML element description (used as a pointer: the macro
 *            applies `->` to it)
 *
 * Returns the default subelement for this element, i.e. the
 * `defaultsubelt` field of the description.
 *
 * The argument is parenthesized so that any pointer-valued expression
 * (for example `base + i` or `&table[i]`) expands correctly; `->` binds
 * tighter than `+` and unary `&`, so the bare form would mis-parse.
 */
#define htmlDefaultSubelement(elt) (elt)->defaultsubelt
283 * htmlElementAllowedHereDesc:
284 * @param parent HTML parent element
285 * @param elt HTML element
287 * Checks whether an HTML element description may be a
288 * direct child of the specified element.
290 * Returns 1 if allowed; 0 otherwise.
292 #define htmlElementAllowedHereDesc(parent,elt) \
293 htmlElementAllowedHere((parent), (elt)->name)
296 * @param elt HTML element
298 * Returns the attributes required for the specified element.
300 #define htmlRequiredAttrs(elt) (elt)->attrs_req
303 #endif /* LIBXML_HTML_ENABLED */
309 #endif /* HTML_PARSER_H */