1  """ 
  2  An interface to html5lib that mimics the lxml.html interface. 
  3  """ 
  4   
  5  import sys 
  6  import string 
  7   
  8  from html5lib import HTMLParser as _HTMLParser 
  9  from html5lib.treebuilders.etree_lxml import TreeBuilder 
 10   
 11  from lxml import etree 
 12  from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element 
 13   
 14   
 15  try: 
 16      _strings = basestring 
 17  except NameError: 
 18      _strings = (bytes, str) 
 19  try: 
 20      from urllib2 import urlopen 
 21  except ImportError: 
 22      from urllib.request import urlopen 
 23  try: 
 24      from urlparse import urlparse 
 25  except ImportError: 
 26      from urllib.parse import urlparse 
 27   
 29      """An html5lib HTML parser with lxml as tree.""" 
 30   
 31 -    def __init__(self, strict=False, **kwargs): 
   33   
 34   
 35  try: 
 36      from html5lib import XHTMLParser as _XHTMLParser 
 37  except ImportError: 
 38      pass 
 39  else: 
 41          """An html5lib XHTML Parser with lxml as tree.""" 
 42   
 43 -        def __init__(self, strict=False, **kwargs): 
   45   
 46      xhtml_parser = XHTMLParser() 
 47   
 48   
 54   
 55   
 65   
 66   
 69      """Parses several HTML elements, returning a list of elements. 
 70   
 71      The first item in the list may be a string.  If no_leading_text is true, 
 72      then it will be an error if there is leading text, and it will always be 
 73      a list of only elements. 
 74   
 75      If `guess_charset` is `True` and the text was not unicode but a 
 76      bytestring, the `chardet` library will perform charset guessing on the 
 77      string. 
 78      """ 
 79      if not isinstance(html, _strings): 
 80          raise TypeError('string required') 
 81   
 82      if parser is None: 
 83          parser = html_parser 
 84   
 85      children = parser.parseFragment(html, 'div', useChardet=guess_charset) 
 86      if children and isinstance(children[0], _strings): 
 87          if no_leading_text: 
 88              if children[0].strip(): 
 89                  raise etree.ParserError('There is leading text: %r' % 
 90                                          children[0]) 
 91              del children[0] 
 92      return children 
  93   
 94   
 97      """Parses a single HTML element; it is an error if there is more than 
 98      one element, or if anything but whitespace precedes or follows the 
 99      element. 
100   
101      If create_parent is true (or is a tag name) then a parent node 
102      will be created to encapsulate the HTML in a single element.  In 
103      this case, leading or trailing text is allowed. 
104      """ 
105      if not isinstance(html, _strings): 
106          raise TypeError('string required') 
107   
108      accept_leading_text = bool(create_parent) 
109   
110      elements = fragments_fromstring( 
111          html, guess_charset=guess_charset, parser=parser, 
112          no_leading_text=not accept_leading_text) 
113   
114      if create_parent: 
115          if not isinstance(create_parent, _strings): 
116              create_parent = 'div' 
117          new_root = Element(create_parent) 
118          if elements: 
119              if isinstance(elements[0], _strings): 
120                  new_root.text = elements[0] 
121                  del elements[0] 
122              new_root.extend(elements) 
123          return new_root 
124   
125      if not elements: 
126          raise etree.ParserError('No elements found') 
127      if len(elements) > 1: 
128          raise etree.ParserError('Multiple elements found') 
129      result = elements[0] 
130      if result.tail and result.tail.strip(): 
131          raise etree.ParserError('Element followed by text: %r' % result.tail) 
132      result.tail = None 
133      return result 
 134   
135   
def fromstring(html, guess_charset=True, parser=None):
    """Parse the html, returning a single element/document.

    The input is always parsed as a complete document first; the result
    is then narrowed down depending on what the input looked like:

    * if the text starts with ``<html`` or ``<!doctype`` (ignoring case
      and leading whitespace), the whole document is returned;
    * if the parsed ``head`` has any children, the whole document is
      returned;
    * if ``body`` holds exactly one child with only whitespace around
      it, that child is returned;
    * otherwise ``body`` itself is returned, renamed to ``div`` when it
      contains a block level tag and to ``span`` when it does not.

    Only the first 50 characters of the input are inspected when looking
    for the ``<html``/``<!doctype`` marker.

    `guess_charset` and `parser` are passed on to
    ``document_fromstring``.  Raises ``TypeError`` if `html` is not a
    string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    document = document_fromstring(
        html, parser=parser, guess_charset=guess_charset)

    # Sniff the beginning of the input to see whether the caller handed
    # us a full document.
    prefix = html[:50]
    if isinstance(prefix, bytes):
        # Only ASCII markup matters here, so undecodable bytes can be
        # replaced without affecting the comparison below.
        prefix = prefix.decode('ascii', 'replace')
    prefix = prefix.lstrip().lower()
    if prefix.startswith(('<html', '<!doctype')):
        return document

    # A populated <head> means the input carried document-level content.
    if len(_find_tag(document, 'head')):
        return document

    body = _find_tag(document, 'body')

    # A lone element with nothing but whitespace before and after it is
    # handed back directly.
    if len(body) == 1:
        only_child = body[0]
        leading = body.text
        trailing = only_child.tail
        if (not (leading and leading.strip())
                and not (trailing and trailing.strip())):
            return only_child

    # Mixed content: reuse <body> as the wrapper, under a tag name that
    # suits what it contains.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
 183   
184   
def parse(filename_url_or_file, guess_charset=True, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If `filename_url_or_file` is not a string it is treated as a
    file-like object, handed to the parser as is and left open; closing
    it remains the caller's job.  A string is opened here, either with
    ``urlopen`` when ``_looks_like_url`` accepts it or as a binary file
    otherwise, and that handle is closed again before returning, even
    if parsing raises.

    `guess_charset` is forwarded to the parser as ``useChardet``.  When
    `parser` is None the module-level ``html_parser`` is used.
    """
    if parser is None:
        parser = html_parser

    if not isinstance(filename_url_or_file, _strings):
        # The stream belongs to the caller: read it, but do not close it.
        return parser.parse(filename_url_or_file, useChardet=guess_charset)

    if _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
    else:
        fp = open(filename_url_or_file, 'rb')

    # We opened this handle, so we release it.  try/finally is used
    # instead of ``with`` because the response object returned by the
    # Python 2 ``urllib2.urlopen`` does not support the with statement.
    # NOTE(review): this assumes parser.parse() has finished reading the
    # stream by the time it returns - confirm for custom parsers.
    try:
        return parser.parse(fp, useChardet=guess_charset)
    finally:
        fp.close()
 199   
200   
202      scheme = urlparse(str)[0] 
203      if not scheme: 
204          return False 
205      elif (sys.platform == 'win32' and 
206              scheme in string.ascii_letters 
207              and len(scheme) == 1): 
208           
209          return False 
210      else: 
211          return True 
 212   
213   
214  html_parser = HTMLParser() 
215