""" An interface to html5lib that mimics the lxml.html interface. """ import sys import string
from html5lib import HTMLParser as _HTMLParser from html5lib.treebuilders.etree_lxml import TreeBuilder from lxml import etree from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
# python3 compatibility try: _strings = basestring except NameError: _strings = (bytes, str) try: from urllib2 import urlopen except ImportError: from urllib.request import urlopen try: from urlparse import urlparse except ImportError: from urllib.parse import urlparse
class HTMLParser(_HTMLParser):
    """An html5lib HTML parser with lxml as tree."""

    def __init__(self, strict=False, **kwargs):
        _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)

try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """An html5lib XHTML Parser with lxml as tree."""

        def __init__(self, strict=False, **kwargs):
            _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder,
                                  **kwargs)

    xhtml_parser = XHTMLParser()

def _find_tag(tree, tag):
    elem = tree.find(tag)
    if elem is not None:
        return elem
    return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))

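# The helper above is needed because html5lib/lxml put parsed elements into
# the XHTML namespace, so a plain ``tree.find('head')`` can miss them.  A
# minimal sketch of the behaviour (illustrative only, not executed here):
#
#   doc = document_fromstring('<p>x</p>')
#   doc.find('head')                                # may be None
#   doc.find('{http://www.w3.org/1999/xhtml}head')  # the <head> element
#   _find_tag(doc, 'head')                          # tries both forms
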
def document_fromstring(html, guess_charset=None, parser=None):
    """Parse a whole document, returning the root element.

    If `guess_charset` is true, or if the input is not Unicode but a byte
    string, the `chardet` library will perform charset guessing on the
    string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if guess_charset is None and isinstance(html, bytes):
        # html5lib does not accept useChardet as an argument when it detects
        # that the input would already produce unicode objects, so only
        # enable charset guessing for byte input.
        guess_charset = True
    if guess_charset is not None:
        options['useChardet'] = guess_charset
    return parser.parse(html, **options).getroot()

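# Example usage (a sketch, not executed at import time).  html5lib builds a
# namespaced tree, so tags carry the XHTML namespace:
#
#   root = document_fromstring('<!DOCTYPE html><title>t</title><p>hi</p>')
#   root.tag  # '{http://www.w3.org/1999/xhtml}html'
#   body = _find_tag(root, 'body')
#   body[0].text  # 'hi'
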
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If `no_leading_text` is
    true, it is an error for leading text to be present, and the result is
    always a list of elements only.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if guess_charset is None and isinstance(html, bytes):
        # html5lib does not accept useChardet as an argument when it detects
        # that the input would already produce unicode objects, so only
        # enable charset guessing for byte input.
        guess_charset = False
    if guess_charset is not None:
        options['useChardet'] = guess_charset
    children = parser.parseFragment(html, 'div', **options)
    if children and isinstance(children[0], _strings):
        if no_leading_text:
            if children[0].strip():
                raise etree.ParserError('There is leading text: %r' %
                                        children[0])
            del children[0]
    return children

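# Example usage (a sketch, not executed at import time; element reprs are
# abbreviated):
#
#   fragments_fromstring('hello <b>world</b>')
#   # -> ['hello ', <b element>]   (leading text kept as a plain string)
#   fragments_fromstring('<a>one</a><a>two</a>', no_leading_text=True)
#   # -> [<a element>, <a element>]
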
def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node will
    be created to encapsulate the HTML in a single element.  In this case,
    leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not accept_leading_text)

    if create_parent:
        if not isinstance(create_parent, _strings):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    result.tail = None
    return result

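# Example usage (a sketch, not executed at import time):
#
#   el = fragment_fromstring('<p>one</p>')           # exactly one element
#   el = fragment_fromstring('text <p>one</p>', create_parent=True)
#   # el is a new <div> wrapper: el.text == 'text ', el[0] is the <p>
#   el = fragment_fromstring('<p>a</p><p>b</p>')     # raises ParserError
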
def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    If `guess_charset` is true, or if the input is not Unicode but a byte
    string, the `chardet` library will perform charset guessing on the
    string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    start = html[:50]
    if isinstance(start, bytes):
        # Allow text comparison in python3.
        # Decode as ascii, that also covers latin-1 and utf-8 for the
        # characters we need.
        start = start.decode('ascii', 'replace')

    start = start.lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        return doc

    head = _find_tag(doc, 'head')

    # if the head is not empty we have a full document
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

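# Example usage (a sketch, not executed at import time):
#
#   fromstring('<!DOCTYPE html><html><body><p>x</p></body></html>')
#   # -> the <html> element (input looks like a full document)
#   fromstring('<p>x</p>')
#   # -> the single <p> element
#   fromstring('<p>a</p><p>b</p>')
#   # -> a synthetic <div> container (block-level content, no single root)
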
def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).
    """
    if parser is None:
        parser = html_parser
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            # assume that file-like objects return Unicode more often than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument when it detects
    # that the input would already produce unicode objects, so only pass
    # it when charset guessing was requested or defaulted on.
    if guess_charset:
        options['useChardet'] = guess_charset
    return parser.parse(fp, **options)

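# Example usage (a sketch; the URL is a placeholder, not a real endpoint):
#
#   tree = parse('page.html')            # file path, read in binary mode
#   root = tree.getroot()
#   tree = parse('http://example.com/')  # URL, charset guessing on by default
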
def _looks_like_url(str):
    scheme = urlparse(str)[0]
    if not scheme:
        return False
    elif (sys.platform == 'win32' and
          scheme in string.ascii_letters and
          len(scheme) == 1):
        # looks like a 'normal' absolute path
        return False
    else:
        return True

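# Example behaviour (a sketch; the last case only applies when
# sys.platform == 'win32', where a single-letter "scheme" is really a
# drive letter):
#
#   _looks_like_url('http://example.com/')  # True
#   _looks_like_url('page.html')            # False (no scheme)
#   _looks_like_url(r'C:\docs\page.html')   # False on Windows
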
html_parser = HTMLParser()