HTMLparser

HTMLparser - interface for an HTML 4.0 non-verifying parser

This module implements an HTML 4.0 non-verifying parser with an API compatible with that of the XML parser. It should be able to parse "real world" HTML, even if severely broken from a specification point of view.

Author(s): Daniel Veillard

Synopsis

#define htmlDefaultSubelement(elt);
#define htmlElementAllowedHereDesc(parent, elt);
#define htmlRequiredAttrs(elt);
typedef xmlDocPtr htmlDocPtr;
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc * htmlElemDescPtr;
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc * htmlEntityDescPtr;
typedef xmlNodePtr htmlNodePtr;
typedef xmlParserCtxt htmlParserCtxt;
typedef xmlParserCtxtPtr htmlParserCtxtPtr;
typedef xmlParserInput htmlParserInput;
typedef xmlParserInputPtr htmlParserInputPtr;
typedef xmlParserNodeInfo htmlParserNodeInfo;
typedef enum htmlParserOption;
typedef xmlSAXHandler htmlSAXHandler;
typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
typedef enum htmlStatus;
int	UTF8ToHtml			(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen);
htmlStatus	htmlAttrAllowed		(const htmlElemDesc * elt,
const xmlChar * attr,
int legacy);
int	htmlAutoCloseTag		(htmlDocPtr doc,
const xmlChar * name,
htmlNodePtr elem);
htmlParserCtxtPtr	htmlCreateFileParserCtxt	(const char * filename,
const char * encoding);
htmlParserCtxtPtr	htmlCreateMemoryParserCtxt	(const char * buffer,
int size);
htmlParserCtxtPtr	htmlCreatePushParserCtxt	(htmlSAXHandlerPtr sax,
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc);
htmlDocPtr	htmlCtxtParseDocument	(htmlParserCtxtPtr ctxt,
xmlParserInputPtr input);
htmlDocPtr	htmlCtxtReadDoc		(htmlParserCtxtPtr ctxt,
const xmlChar * str,
const char * URL,
const char * encoding,
int options);
htmlDocPtr	htmlCtxtReadFd		(htmlParserCtxtPtr ctxt,
int fd,
const char * URL,
const char * encoding,
int options);
htmlDocPtr	htmlCtxtReadFile	(htmlParserCtxtPtr ctxt,
const char * filename,
const char * encoding,
int options);
htmlDocPtr	htmlCtxtReadIO		(htmlParserCtxtPtr ctxt,
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options);
htmlDocPtr	htmlCtxtReadMemory	(htmlParserCtxtPtr ctxt,
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options);
void	htmlCtxtReset			(htmlParserCtxtPtr ctxt);
int	htmlCtxtUseOptions		(htmlParserCtxtPtr ctxt,
int options);
int	htmlElementAllowedHere		(const htmlElemDesc * parent,
const xmlChar * elt);
htmlStatus	htmlElementStatusHere	(const htmlElemDesc * parent,
const htmlElemDesc * elt);
int	htmlEncodeEntities		(unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen,
int quoteChar);
const htmlEntityDesc *	htmlEntityLookup	(const xmlChar * name);
const htmlEntityDesc *	htmlEntityValueLookup	(unsigned int value);
void	htmlFreeParserCtxt		(htmlParserCtxtPtr ctxt);
int	htmlHandleOmittedElem		(int val);
void	htmlInitAutoClose		(void);
int	htmlIsAutoClosed		(htmlDocPtr doc,
htmlNodePtr elem);
int	htmlIsScriptAttribute		(const xmlChar * name);
htmlParserCtxtPtr	htmlNewParserCtxt	(void);
htmlParserCtxtPtr	htmlNewSAXParserCtxt	(const htmlSAXHandler * sax,
void * userData);
htmlStatus	htmlNodeStatus		(htmlNodePtr node,
int legacy);
int	htmlParseCharRef		(htmlParserCtxtPtr ctxt);
int	htmlParseChunk			(htmlParserCtxtPtr ctxt,
const char * chunk,
int size,
int terminate);
htmlDocPtr	htmlParseDoc		(const xmlChar * cur,
const char * encoding);
int	htmlParseDocument		(htmlParserCtxtPtr ctxt);
void	htmlParseElement		(htmlParserCtxtPtr ctxt);
const htmlEntityDesc *	htmlParseEntityRef	(htmlParserCtxtPtr ctxt,
const xmlChar ** str);
htmlDocPtr	htmlParseFile		(const char * filename,
const char * encoding);
htmlDocPtr	htmlReadDoc		(const xmlChar * str,
const char * url,
const char * encoding,
int options);
htmlDocPtr	htmlReadFd		(int fd,
const char * url,
const char * encoding,
int options);
htmlDocPtr	htmlReadFile		(const char * filename,
const char * encoding,
int options);
htmlDocPtr	htmlReadIO		(xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * url,
const char * encoding,
int options);
htmlDocPtr	htmlReadMemory		(const char * buffer,
int size,
const char * url,
const char * encoding,
int options);
htmlDocPtr	htmlSAXParseDoc		(const xmlChar * cur,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData);
htmlDocPtr	htmlSAXParseFile	(const char * filename,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData);
const htmlElemDesc *	htmlTagLookup	(const xmlChar * tag);

Description

Details

Macro htmlDefaultSubelement

#define htmlDefaultSubelement(elt);

Returns the default subelement for this element

elt: HTML element

Macro htmlElementAllowedHereDesc

#define htmlElementAllowedHereDesc(parent, elt);

Checks whether an HTML element description may be a direct child of the specified element. Returns 1 if allowed; 0 otherwise.

parent: HTML parent element
elt: HTML element

Macro htmlRequiredAttrs

#define htmlRequiredAttrs(elt);

Returns the attributes required for the specified element.

elt: HTML element

Typedef htmlDocPtr

xmlDocPtr htmlDocPtr;


Structure htmlElemDesc

struct _htmlElemDesc {
    const char *	name	: The tag name
    char	startTag	: Whether the start tag can be implied
    char	endTag	: Whether the end tag can be implied
    char	saveEndTag	: Whether the end tag should be saved
    char	empty	: Is this an empty element ?
    char	depr	: Is this a deprecated element ?
    char	dtd	: 1: only in Loose DTD, 2: only Frameset one
    char	isinline	: is this a block 0 or inline 1 element
    const char *	desc	: the description (NRK Jan.2003: the following are new fields encapsulating HTML structure)
    const char **	subelts	: allowed sub-elements of this element
    const char *	defaultsubelt	: subelement for suggested auto-repair if necessary or NULL
    const char **	attrs_opt	: Optional Attributes
    const char **	attrs_depr	: Additional deprecated attributes
    const char **	attrs_req	: Required attributes
} htmlElemDesc;


Typedef htmlElemDescPtr

htmlElemDesc * htmlElemDescPtr;


Structure htmlEntityDesc

struct _htmlEntityDesc {
    unsigned int	value	: the UNICODE value for the character
    const char *	name	: The entity name
    const char *	desc	: the description
} htmlEntityDesc;


Typedef htmlEntityDescPtr

htmlEntityDesc * htmlEntityDescPtr;


Typedef htmlNodePtr

xmlNodePtr htmlNodePtr;


Typedef htmlParserCtxt

xmlParserCtxt htmlParserCtxt;


Typedef htmlParserCtxtPtr

xmlParserCtxtPtr htmlParserCtxtPtr;


Typedef htmlParserInput

xmlParserInput htmlParserInput;


Typedef htmlParserInputPtr

xmlParserInputPtr htmlParserInputPtr;


Typedef htmlParserNodeInfo

xmlParserNodeInfo htmlParserNodeInfo;


Enum htmlParserOption

enum htmlParserOption {
    HTML_PARSE_RECOVER = 1 /* Relaxed parsing */
    HTML_PARSE_NODEFDTD = 4 /* do not default a doctype if not found */
    HTML_PARSE_NOERROR = 32 /* suppress error reports */
    HTML_PARSE_NOWARNING = 64 /* suppress warning reports */
    HTML_PARSE_PEDANTIC = 128 /* pedantic error reporting */
    HTML_PARSE_NOBLANKS = 256 /* remove blank nodes */
    HTML_PARSE_NONET = 2048 /* Forbid network access */
    HTML_PARSE_NOIMPLIED = 8192 /* Do not add implied html/body... elements */
    HTML_PARSE_COMPACT = 65536 /* compact small text nodes */
    HTML_PARSE_IGNORE_ENC = 2097152 /*  ignore internal document encoding hint */
};


Typedef htmlSAXHandler

xmlSAXHandler htmlSAXHandler;


Typedef htmlSAXHandlerPtr

xmlSAXHandlerPtr htmlSAXHandlerPtr;


Enum htmlStatus

enum htmlStatus {
    HTML_NA = 0 /* something we don't check at all */
    HTML_INVALID = 1
    HTML_DEPRECATED = 2
    HTML_VALID = 4
    HTML_REQUIRED = 12 /*  VALID bit set so ( & HTML_VALID ) is TRUE */
};


Variable htmlDefaultSAXHandler

const xmlSAXHandlerV1 htmlDefaultSAXHandler;

DEPRECATED: This handler is unused and will be removed from future versions. Default old SAX v1 handler for HTML, builds the DOM tree


UTF8ToHtml ()

int	UTF8ToHtml			(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)

Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.

out: a pointer to an array of bytes to store the result
outlen: the length of @out
in: a pointer to an array of UTF-8 chars
inlen: the length of @in
Returns: 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is non-negative, else unpredictable. The value of @outlen after return is the number of octets produced.

htmlAttrAllowed ()

htmlStatus	htmlAttrAllowed		(const htmlElemDesc * elt, 
const xmlChar * attr,
int legacy)

Checks whether an attribute is valid for an element Has full knowledge of Required and Deprecated attributes

elt: HTML element
attr: HTML attribute
legacy: whether to allow deprecated attributes
Returns: one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID

htmlAutoCloseTag ()

int	htmlAutoCloseTag		(htmlDocPtr doc, 
const xmlChar * name,
htmlNodePtr elem)

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks whether the element or one of its children would autoclose the given tag.

doc: the HTML document
name: The tag name
elem: the HTML element
Returns: 1 if autoclose, 0 otherwise

htmlCreateFileParserCtxt ()

htmlParserCtxtPtr	htmlCreateFileParserCtxt	(const char * filename, 
const char * encoding)

DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile. Create a parser context to read from a file. A non-NULL encoding overrides encoding declarations in the document. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.

filename: the filename
encoding: optional encoding
Returns: the new parser context or NULL if a memory allocation failed.

htmlCreateMemoryParserCtxt ()

htmlParserCtxtPtr	htmlCreateMemoryParserCtxt	(const char * buffer, 
int size)

DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadMemory. Create a parser context for an HTML in-memory document. The input buffer must not contain any terminating null bytes.

buffer: a pointer to a char array
size: the size of the array
Returns: the new parser context or NULL

htmlCreatePushParserCtxt ()

htmlParserCtxtPtr	htmlCreatePushParserCtxt	(htmlSAXHandlerPtr sax, 
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc)

Create a parser context for using the HTML parser in push mode.

sax: a SAX handler (optional)
user_data: The user data returned on SAX callbacks (optional)
chunk: a pointer to an array of chars (optional)
size: number of chars in the array
filename: only used for error reporting (optional)
enc: encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
Returns: the new parser context or NULL if a memory allocation failed.

htmlCtxtParseDocument ()

htmlDocPtr	htmlCtxtParseDocument	(htmlParserCtxtPtr ctxt, 
xmlParserInputPtr input)

Parse an HTML document and return the resulting document tree. Available since 2.13.0.

ctxt: an HTML parser context
input: parser input
Returns: the resulting document tree or NULL

htmlCtxtReadDoc ()

htmlDocPtr	htmlCtxtReadDoc		(htmlParserCtxtPtr ctxt, 
const xmlChar * str,
const char * URL,
const char * encoding,
int options)

Parse an HTML in-memory document and build a tree. See htmlCtxtUseOptions for details.

ctxt: an HTML parser context
str: a pointer to a zero terminated string
URL: only used for error reporting (optional)
encoding: the document encoding (optional)
options: a combination of htmlParserOptions
Returns: the resulting document tree

htmlCtxtReadFd ()

htmlDocPtr	htmlCtxtReadFd		(htmlParserCtxtPtr ctxt, 
int fd,
const char * URL,
const char * encoding,
int options)

Parse an HTML from a file descriptor and build a tree. See htmlCtxtUseOptions for details. NOTE that the file descriptor will not be closed when the context is freed or reset.

ctxt: an HTML parser context
fd: an open file descriptor
URL: only used for error reporting (optional)
encoding: the document encoding (optional)
options: a combination of htmlParserOptions
Returns: the resulting document tree

htmlCtxtReadFile ()

htmlDocPtr	htmlCtxtReadFile	(htmlParserCtxtPtr ctxt, 
const char * filename,
const char * encoding,
int options)

Parse an HTML file from the filesystem, the network or a user-defined resource loader. See xmlNewInputURL and htmlCtxtUseOptions for details.

ctxt: an HTML parser context
filename: a file or URL
encoding: the document encoding (optional)
options: a combination of htmlParserOptions
Returns: the resulting document tree

htmlCtxtReadIO ()

htmlDocPtr	htmlCtxtReadIO		(htmlParserCtxtPtr ctxt, 
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)

Parse an HTML document from I/O functions and source and build a tree. See xmlNewInputIO and htmlCtxtUseOptions for details.

ctxt: an HTML parser context
ioread: an I/O read function
ioclose: an I/O close function
ioctx: an I/O handler
URL: the base URL to use for the document
encoding: the document encoding, or NULL
options: a combination of htmlParserOption(s)
Returns: the resulting document tree

htmlCtxtReadMemory ()

htmlDocPtr	htmlCtxtReadMemory	(htmlParserCtxtPtr ctxt, 
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options)

Parse an HTML in-memory document and build a tree. The input buffer must not contain any terminating null bytes. See htmlCtxtUseOptions for details.

ctxt: an HTML parser context
buffer: a pointer to a char array
size: the size of the array
URL: only used for error reporting (optional)
encoding: the document encoding (optional)
options: a combination of htmlParserOptions
Returns: the resulting document tree

htmlCtxtReset ()

void	htmlCtxtReset			(htmlParserCtxtPtr ctxt)

Reset a parser context

ctxt: an HTML parser context

htmlCtxtUseOptions ()

int	htmlCtxtUseOptions		(htmlParserCtxtPtr ctxt, 
int options)

Applies the options to the parser context

ctxt: an HTML parser context
options: a combination of htmlParserOption(s)
Returns: 0 in case of success, the set of unknown or unimplemented options in case of error.

htmlElementAllowedHere ()

int	htmlElementAllowedHere		(const htmlElemDesc * parent, 
const xmlChar * elt)

Checks whether an HTML element may be a direct child of a parent element. Note - doesn't check for deprecated elements

parent: HTML parent element
elt: HTML element
Returns: 1 if allowed; 0 otherwise.

htmlElementStatusHere ()

htmlStatus	htmlElementStatusHere	(const htmlElemDesc * parent, 
const htmlElemDesc * elt)

Checks whether an HTML element may be a direct child of a parent element, and if so whether it is valid or deprecated.

parent: HTML parent element
elt: HTML element
Returns: one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID

htmlEncodeEntities ()

int	htmlEncodeEntities		(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen,
int quoteChar)

Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.

out: a pointer to an array of bytes to store the result
outlen: the length of @out
in: a pointer to an array of UTF-8 chars
inlen: the length of @in
quoteChar: the quote character to escape (' or ") or zero.
Returns: 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is non-negative, else unpredictable. The value of @outlen after return is the number of octets produced.

htmlEntityLookup ()

const htmlEntityDesc *	htmlEntityLookup	(const xmlChar * name)

Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly; a hash table is really needed.

name: the entity name
Returns: the associated htmlEntityDescPtr if found, NULL otherwise.

htmlEntityValueLookup ()

const htmlEntityDesc *	htmlEntityValueLookup	(unsigned int value)

Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly; a hash table is really needed.

value: the entity's unicode value
Returns: the associated htmlEntityDescPtr if found, NULL otherwise.

htmlFreeParserCtxt ()

void	htmlFreeParserCtxt		(htmlParserCtxtPtr ctxt)

Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.

ctxt: an HTML parser context

htmlHandleOmittedElem ()

int	htmlHandleOmittedElem		(int val)

DEPRECATED: Use HTML_PARSE_NOIMPLIED Set and return the previous value for handling HTML omitted tags.

val: int 0 or 1
Returns: the previous value: 0 for no handling, 1 for auto insertion.

htmlInitAutoClose ()

void	htmlInitAutoClose		(void)

DEPRECATED: This is a no-op.


htmlIsAutoClosed ()

int	htmlIsAutoClosed		(htmlDocPtr doc, 
htmlNodePtr elem)

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks whether a tag is autoclosed by one of its children.

doc: the HTML document
elem: the HTML element
Returns: 1 if autoclosed, 0 otherwise

htmlIsScriptAttribute ()

int	htmlIsScriptAttribute		(const xmlChar * name)

Check if an attribute is of content type Script

name: an attribute name
Returns: 1 if the attribute is a script, 0 otherwise

htmlNewParserCtxt ()

htmlParserCtxtPtr	htmlNewParserCtxt	(void)

Allocate and initialize a new HTML parser context. This can be used to parse HTML documents into DOM trees with functions like htmlCtxtReadFile or htmlCtxtReadMemory. See htmlCtxtUseOptions for parser options. See xmlCtxtSetErrorHandler for advanced error handling. See xmlNewInputURL, xmlNewInputMemory, xmlNewInputIO and similar functions for advanced input control. See htmlNewSAXParserCtxt for custom SAX parsers.

Returns: the htmlParserCtxtPtr or NULL in case of allocation error

htmlNewSAXParserCtxt ()

htmlParserCtxtPtr	htmlNewSAXParserCtxt	(const htmlSAXHandler * sax, 
void * userData)

Allocate and initialize a new HTML SAX parser context. If userData is NULL, the parser context will be passed as user data. Available since 2.11.0. If you want to support older versions, it's best to invoke htmlNewParserCtxt and set ctxt->sax with struct assignment. Also see htmlNewParserCtxt.

sax: SAX handler
userData: user data
Returns: the htmlParserCtxtPtr or NULL in case of allocation error

htmlNodeStatus ()

htmlStatus	htmlNodeStatus		(htmlNodePtr node, 
int legacy)

Checks whether the tree node is valid. Experimental (the author only uses the HTML enhancements in a SAX parser)

node: an htmlNodePtr in a tree
legacy: whether to allow deprecated elements (YES is faster here for Element nodes)
Returns: for Element nodes, a return from htmlElementAllowedHere (if legacy allowed) or htmlElementStatusHere (otherwise); for Attribute nodes, a return from htmlAttrAllowed; for other nodes, HTML_NA (no checks performed).

htmlParseCharRef ()

int	htmlParseCharRef		(htmlParserCtxtPtr ctxt)

DEPRECATED: Internal function, don't use. Parse a character reference: [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'

ctxt: an HTML parser context
Returns: the value parsed (as an int)

htmlParseChunk ()

int	htmlParseChunk			(htmlParserCtxtPtr ctxt, 
const char * chunk,
int size,
int terminate)

Parse a chunk of memory in push parser mode. Assumes that the parser context was initialized with htmlCreatePushParserCtxt. The last chunk, which will often be empty, must be marked with the @terminate flag. With the default SAX callbacks, the resulting document will be available in ctxt->myDoc. This pointer will not be freed by the library. If the document isn't well-formed, ctxt->myDoc is set to NULL.

ctxt: an HTML parser context
chunk: chunk of memory
size: size of chunk in bytes
terminate: last chunk indicator
Returns: an xmlParserErrors code (0 on success).

htmlParseDoc ()

htmlDocPtr	htmlParseDoc		(const xmlChar * cur, 
const char * encoding)

DEPRECATED: Use htmlReadDoc. Parse an HTML in-memory document and build a tree. This function uses deprecated global parser options.

cur: a pointer to an array of xmlChar
encoding: the encoding (optional)
Returns: the resulting document tree

htmlParseDocument ()

int	htmlParseDocument		(htmlParserCtxtPtr ctxt)

Parse an HTML document and invoke the SAX handlers. This is useful if you're only interested in custom SAX callbacks. If you want a document tree, use htmlCtxtParseDocument.

ctxt: an HTML parser context
Returns: 0, -1 in case of error.

htmlParseElement ()

void	htmlParseElement		(htmlParserCtxtPtr ctxt)

DEPRECATED: Internal function, don't use. Parse an HTML element. This function is highly recursive and is kept for compatibility with previous code versions. [39] element ::= EmptyElemTag | STag content ETag [41] Attribute ::= Name Eq AttValue

ctxt: an HTML parser context

htmlParseEntityRef ()

const htmlEntityDesc *	htmlParseEntityRef	(htmlParserCtxtPtr ctxt, 
const xmlChar ** str)

DEPRECATED: Internal function, don't use. Parse an HTML entity reference: [68] EntityRef ::= '&' Name ';'

ctxt: an HTML parser context
str: location to store the entity name
Returns: the associated htmlEntityDescPtr if found, or NULL otherwise, if non-NULL *str will have to be freed by the caller.

htmlParseFile ()

htmlDocPtr	htmlParseFile		(const char * filename, 
const char * encoding)

Parse an HTML file and build a tree. See xmlNewInputURL for details.

filename: the filename
encoding: encoding (optional)
Returns: the resulting document tree

htmlReadDoc ()

htmlDocPtr	htmlReadDoc		(const xmlChar * str, 
const char * url,
const char * encoding,
int options)

Convenience function to parse an HTML document from a zero-terminated string. See htmlCtxtReadDoc for details.

str: a pointer to a zero terminated string
url: only used for error reporting (optional)
encoding: the document encoding (optional)
options: a combination of htmlParserOptions
Returns: the resulting document tree.

htmlReadFd ()

htmlDocPtr	htmlReadFd		(int fd, 
const char * url,
const char * encoding,
int options)

Convenience function to parse an HTML document from a file descriptor. NOTE that the file descriptor will not be closed when the context is freed or reset. See htmlCtxtReadFd for details.

fd: an open file descriptor
url: only used for error reporting (optional)
encoding: the document encoding, or NULL
options: a combination of htmlParserOptions
Returns: the resulting document tree

htmlReadFile ()

htmlDocPtr	htmlReadFile		(const char * filename, 
const char * encoding,
int options)

Convenience function to parse an HTML file from the filesystem, the network or a global user-defined resource loader. See htmlCtxtReadFile for details.

filename: a file or URL
encoding: the document encoding (optional)
options: a combination of htmlParserOptions
Returns: the resulting document tree.

htmlReadIO ()

htmlDocPtr	htmlReadIO		(xmlInputReadCallback ioread, 
xmlInputCloseCallback ioclose,
void * ioctx,
const char * url,
const char * encoding,
int options)

Convenience function to parse an HTML document from I/O functions and context. See htmlCtxtReadIO for details.

ioread: an I/O read function
ioclose: an I/O close function (optional)
ioctx: an I/O handler
url: only used for error reporting (optional)
encoding: the document encoding (optional)
options: a combination of htmlParserOption(s)
Returns: the resulting document tree

htmlReadMemory ()

htmlDocPtr	htmlReadMemory		(const char * buffer, 
int size,
const char * url,
const char * encoding,
int options)

Convenience function to parse an HTML document from memory. The input buffer must not contain any terminating null bytes. See htmlCtxtReadMemory for details.

buffer: a pointer to a char array
size: the size of the array
url: only used for error reporting (optional)
encoding: the document encoding, or NULL
options: a combination of htmlParserOption(s)
Returns: the resulting document tree

htmlSAXParseDoc ()

htmlDocPtr	htmlSAXParseDoc		(const xmlChar * cur, 
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)

DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc. Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree.

cur: a pointer to an array of xmlChar
encoding: a free form C string describing the HTML document encoding, or NULL
sax: the SAX handler block
userData: if using SAX, this pointer will be provided on callbacks.
Returns: the resulting document tree unless SAX is NULL or the document is not well formed.

htmlSAXParseFile ()

htmlDocPtr	htmlSAXParseFile	(const char * filename, 
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)

DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile. Parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, fall back to the default DOM tree building routines.

filename: the filename
encoding: encoding (optional)
sax: the SAX handler block
userData: if using SAX, this pointer will be provided on callbacks.
Returns: the resulting document tree unless SAX is NULL or the document is not well formed.

htmlTagLookup ()

const htmlElemDesc *	htmlTagLookup	(const xmlChar * tag)

Lookup the HTML tag in the ElementTable

tag: The tag name in lowercase
Returns: the related htmlElemDescPtr or NULL if not found.