1  """ 
  2  An interface to html5lib that mimics the lxml.html interface. 
  3  """ 
  4  import sys 
  5  import string 
  6   
  7  from html5lib import HTMLParser as _HTMLParser 
  8  from html5lib.treebuilders.etree_lxml import TreeBuilder 
  9  from lxml import etree 
 10  from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag 
 11   
 12   
 13  try: 
 14      _strings = basestring 
 15  except NameError: 
 16      _strings = (bytes, str) 
 17  try: 
 18      from urllib2 import urlopen 
 19  except ImportError: 
 20      from urllib.request import urlopen 
 21  try: 
 22      from urlparse import urlparse 
 23  except ImportError: 
 24      from urllib.parse import urlparse 
 25   
 26   
 28      """An html5lib HTML parser with lxml as tree.""" 
 29   
 30 -    def __init__(self, strict=False, **kwargs): 
   32   
 33   
 34  try: 
 35      from html5lib import XHTMLParser as _XHTMLParser 
 36  except ImportError: 
 37      pass 
 38  else: 
 40          """An html5lib XHTML Parser with lxml as tree.""" 
 41   
 42 -        def __init__(self, strict=False, **kwargs): 
   44   
 45      xhtml_parser = XHTMLParser() 
 46   
 47   
 53   
 54   
 56      """ 
 57      Parse a string as a whole document and return its root element. 
 58   
 59      If `guess_charset` is true, or if the input is not Unicode but a 
 60      byte string, the `chardet` library will perform charset guessing 
 61      on the string. 
 62      """ 
 63      if not isinstance(html, _strings): 
 64          raise TypeError('string required') 
 65   
 66      if parser is None: 
 67          parser = html_parser 
 68   
 69      options = {} 
 70      if guess_charset is None and isinstance(html, bytes): 
 71           
 72           
 73          guess_charset = True 
 74      if guess_charset is not None: 
 75          options['useChardet'] = guess_charset 
 76      return parser.parse(html, **options).getroot() 
  77   
 78   
 81      """Parses several HTML elements, returning a list of elements. 
 82   
 83      The first item in the list may be a string.  If no_leading_text is true, 
 84      then it will be an error if there is leading text, and it will always be 
 85      a list of only elements. 
 86   
 87      If `guess_charset` is true, the `chardet` library will perform charset 
 88      guessing on the string. 
 89      """ 
 90      if not isinstance(html, _strings): 
 91          raise TypeError('string required') 
 92   
 93      if parser is None: 
 94          parser = html_parser 
 95   
 96      options = {} 
 97      if guess_charset is None and isinstance(html, bytes): 
 98           
 99           
100          guess_charset = False 
101      if guess_charset is not None: 
102          options['useChardet'] = guess_charset 
103      children = parser.parseFragment(html, 'div', **options) 
104      if children and isinstance(children[0], _strings): 
105          if no_leading_text: 
106              if children[0].strip(): 
107                  raise etree.ParserError('There is leading text: %r' % 
108                                          children[0]) 
109              del children[0] 
110      return children 
 111   
112   
115      """Parses a single HTML element; it is an error if there is more than 
116      one element, or if anything but whitespace precedes or follows the 
117      element. 
118   
119      If 'create_parent' is true (or is a tag name) then a parent node 
120      will be created to encapsulate the HTML in a single element.  In 
121      this case, leading or trailing text is allowed. 
122   
123      If `guess_charset` is true, the `chardet` library will perform charset 
124      guessing on the string. 
125      """ 
126      if not isinstance(html, _strings): 
127          raise TypeError('string required') 
128   
129      accept_leading_text = bool(create_parent) 
130   
131      elements = fragments_fromstring( 
132          html, guess_charset=guess_charset, parser=parser, 
133          no_leading_text=not accept_leading_text) 
134   
135      if create_parent: 
136          if not isinstance(create_parent, _strings): 
137              create_parent = 'div' 
138          new_root = Element(create_parent) 
139          if elements: 
140              if isinstance(elements[0], _strings): 
141                  new_root.text = elements[0] 
142                  del elements[0] 
143              new_root.extend(elements) 
144          return new_root 
145   
146      if not elements: 
147          raise etree.ParserError('No elements found') 
148      if len(elements) > 1: 
149          raise etree.ParserError('Multiple elements found') 
150      result = elements[0] 
151      if result.tail and result.tail.strip(): 
152          raise etree.ParserError('Element followed by text: %r' % result.tail) 
153      result.tail = None 
154      return result 
 155   
156   
def fromstring(html, guess_charset=None, parser=None):
    """Parse ``html`` and return a single element or a whole document.

    It is not known up front whether the text is a fragment or a complete
    document, so it is parsed as a document first and the result is then
    narrowed down to what was most likely passed in:

    * the parsed document, if the text starts with ``<html`` or
      ``<!doctype`` (case-insensitive, after leading whitespace within the
      first 50 characters), or if the parsed ``<head>`` has children;
    * the only child of ``<body>``, if there is exactly one child and no
      non-whitespace text around it;
    * otherwise the ``<body>`` element itself, renamed to ``div`` when it
      contains a block-level tag and to ``span`` when it does not.

    ``guess_charset`` and ``parser`` are forwarded unchanged to
    ``document_fromstring``.

    Raises ``TypeError`` if ``html`` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    def _is_blank(text):
        # True for None, '' and whitespace-only text.
        return not text or not text.strip()

    document = document_fromstring(html, parser=parser,
                                   guess_charset=guess_charset)

    # Sniff the beginning of the input: an explicit doctype or <html> tag
    # means the caller handed us a full document.
    prefix = html[:50]
    if isinstance(prefix, bytes):
        # Decode so that the comparison below is done on text; undecodable
        # bytes are replaced, which does not affect the ASCII markers we
        # are looking for.
        prefix = prefix.decode('ascii', 'replace')
    prefix = prefix.lstrip().lower()
    if prefix.startswith(('<html', '<!doctype')):
        return document

    # A non-empty <head> also indicates a full document.
    head_element = _find_tag(document, 'head')
    if len(head_element):
        return document

    body_element = _find_tag(document, 'body')

    # Exactly one child with nothing but whitespace around it: the input
    # was most likely that single element.
    if len(body_element) == 1:
        if _is_blank(body_element.text) and _is_blank(body_element[-1].tail):
            return body_element[0]

    # Otherwise the body holds everything that was passed in.  Reuse it as
    # a neutral container, since <body> itself implies too much structure.
    body_element.tag = (
        'div' if _contains_block_level_tag(body_element) else 'span')
    return body_element
 209   
210   
def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    ``parser`` defaults to the module-level ``html_parser``.

    A file or URL connection that this function opens itself is closed
    again before returning (also when parsing raises).  A file-like object
    supplied by the caller is never closed here.
    """
    if parser is None:
        parser = html_parser

    # Tracks whether we own the stream and are therefore responsible for
    # closing it.
    opened_here = False
    if not isinstance(filename_url_or_file, _strings):
        # Caller-supplied file-like object.
        fp = filename_url_or_file
        if guess_charset is None:
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        opened_here = True
        if guess_charset is None:
            guess_charset = True
    else:
        # Plain file path; read as bytes so the charset can be detected.
        fp = open(filename_url_or_file, 'rb')
        opened_here = True
        if guess_charset is None:
            guess_charset = True

    # Only forward ``useChardet`` when detection is requested; a false
    # value is left out entirely rather than passed as ``useChardet=False``.
    options = {}
    if guess_charset:
        options['useChardet'] = guess_charset

    # try/finally instead of ``with``: the Python 2 ``urllib2`` response
    # object offers ``close()`` but is not a context manager.
    # NOTE(review): assumes ``parser.parse`` consumes the whole stream
    # before returning (html5lib's parser does) - confirm for custom parsers.
    try:
        return parser.parse(fp, **options)
    finally:
        if opened_here:
            fp.close()
 245   
246   
248      scheme = urlparse(str)[0] 
249      if not scheme: 
250          return False 
251      elif (sys.platform == 'win32' and 
252              scheme in string.ascii_letters 
253              and len(scheme) == 1): 
254           
255          return False 
256      else: 
257          return True 
 258   
259   
260  html_parser = HTMLParser() 
261