1   
   2   
   3   
   4   
   5   
   6   
   7   
   8   
   9   
  10   
  11   
  12   
  13   
  14   
  15   
  16   
  17   
  18   
  19   
  20   
  21   
  22   
  23   
  24   
  25   
  26   
  27   
  28   
  29   
  30   
  31  """The ``lxml.html`` tool set for HTML handling. 
  32  """ 
  33   
  34  from __future__ import absolute_import 
  35   
  36  __all__ = [ 
  37      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  38      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  39      'find_rel_links', 'find_class', 'make_links_absolute', 
  40      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] 
  41   
  42   
  43  import copy 
  44  import sys 
  45  import re 
  46  from functools import partial 
  47   
  48  try: 
  49       
  50      from collections.abc import MutableMapping, MutableSet 
  51  except ImportError: 
  52      from collections import MutableMapping, MutableSet 
  53   
  54  from .. import etree 
  55  from . import defs 
  56  from ._setmixin import SetMixin 
  57   
  58  try: 
  59      from urlparse import urljoin 
  60  except ImportError: 
  61       
  62      from urllib.parse import urljoin 
  63   
  64  try: 
  65      unicode 
  66  except NameError: 
  67       
  68      unicode = str 
  69  try: 
  70      basestring 
  71  except NameError: 
  72       
  73      basestring = (str, bytes) 
  77      if not s: 
  78          return s 
  79      if sys.version_info[0] >= 3: 
  80          sub = re.compile(r"^(\s*)u'", re.M).sub 
  81      else: 
  82          sub = re.compile(r"^(\s*)b'", re.M).sub 
  83      return sub(r"\1'", s) 
   84   
  85   
# Namespace URI of XHTML; bound to the prefix ``x`` in the XPath
# expressions below.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Precompiled XPath expressions.  The first three match both the
# un-namespaced tag and the same tag in the XHTML namespace, searching the
# context element itself and all of its descendants.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches elements whose whitespace-normalised ``class`` attribute contains
# ``$class_name`` as a complete, space-delimited token.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Matches elements whose ``id`` attribute equals ``$id``.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# Evaluates the XPath ``string()`` function on the context element.
_collect_string_content = etree.XPath("string()")
# Iterates over ``url(...)`` occurrences (case-insensitive); group 1 is the
# argument, including any surrounding single or double quotes.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterates over ``@import "..."`` occurrences; group 1 is the text between
# the double quotes.  Only the double-quoted form is recognised.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# Matches <label> elements anywhere in the document (plain or XHTML) whose
# ``for`` attribute equals ``$id``.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Matches one run of non-space characters.
_archive_re = re.compile(r'[^ ]+')
# Searches a string of the form ``<prefix>; [url=]<url>`` and captures the
# trailing part as the named group ``url`` (case-insensitive).
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
 108      if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": 
 109          return s[1:-1], pos+1 
 110      else: 
 111          return s,pos 
  112   
 123   
 130   
 133      """Provides access to an element's class attribute as a set-like collection. 
 134      Usage:: 
 135   
 136          >>> el = fromstring('<p class="hidden large">Text</p>') 
 137          >>> classes = el.classes  # or: classes = Classes(el.attrib) 
 138          >>> classes |= ['block', 'paragraph'] 
 139          >>> el.get('class') 
 140          'hidden large block paragraph' 
 141          >>> classes.toggle('hidden') 
 142          False 
 143          >>> el.get('class') 
 144          'large block paragraph' 
 145          >>> classes -= ('some', 'classes', 'block') 
 146          >>> el.get('class') 
 147          'large paragraph' 
 148      """ 
 150          self._attributes = attributes 
 151          self._get_class_value = partial(attributes.get, 'class', '') 
  152   
 153 -    def add(self, value): 
  154          """ 
 155          Add a class. 
 156   
 157          This has no effect if the class is already present. 
 158          """ 
 159          if not value or re.search(r'\s', value): 
 160              raise ValueError("Invalid class name: %r" % value) 
 161          classes = self._get_class_value().split() 
 162          if value in classes: 
 163              return 
 164          classes.append(value) 
 165          self._attributes['class'] = ' '.join(classes) 
  166   
 168          """ 
 169          Remove a class if it is currently present. 
 170   
 171          If the class is not present, do nothing. 
 172          """ 
 173          if not value or re.search(r'\s', value): 
 174              raise ValueError("Invalid class name: %r" % value) 
 175          classes = [name for name in self._get_class_value().split() 
 176                     if name != value] 
 177          if classes: 
 178              self._attributes['class'] = ' '.join(classes) 
 179          elif 'class' in self._attributes: 
 180              del self._attributes['class'] 
  181   
 183          """ 
 184          Remove a class; it must currently be present. 
 185   
 186          If the class is not present, raise a KeyError. 
 187          """ 
 188          if not value or re.search(r'\s', value): 
 189              raise ValueError("Invalid class name: %r" % value) 
 190          super(Classes, self).remove(value) 
  191   
 195   
 197          return iter(self._get_class_value().split()) 
  198   
 200          return len(self._get_class_value().split()) 
  201   
 202       
 203   
 205          """ 
 206          Add all names from 'values'. 
 207          """ 
 208          classes = self._get_class_value().split() 
 209          extended = False 
 210          for value in values: 
 211              if value not in classes: 
 212                  classes.append(value) 
 213                  extended = True 
 214          if extended: 
 215              self._attributes['class'] = ' '.join(classes) 
  216   
 218          """ 
 219          Add a class name if it isn't there yet, or remove it if it exists. 
 220   
 221          Returns true if the class was added (and is now enabled) and 
 222          false if it was removed (and is now disabled). 
 223          """ 
 224          if not value or re.search(r'\s', value): 
 225              raise ValueError("Invalid class name: %r" % value) 
 226          classes = self._get_class_value().split() 
 227          try: 
 228              classes.remove(value) 
 229              enabled = False 
 230          except ValueError: 
 231              classes.append(value) 
 232              enabled = True 
 233          if classes: 
 234              self._attributes['class'] = ' '.join(classes) 
 235          else: 
 236              del self._attributes['class'] 
 237          return enabled 
   238   
 241   
    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        # Delegates to the ``set`` implementation that follows HtmlElement in
        # the method resolution order.
        # NOTE(review): the enclosing class header is not visible here.
        # ``super(HtmlElement, self)`` requires ``self`` to be an HtmlElement
        # instance -- confirm that holds for every class that gets this method.
        super(HtmlElement, self).set(key, value)
  250   
 251      @property 
 253          """ 
 254          A set-like wrapper around the 'class' attribute. 
 255          """ 
 256          return Classes(self.attrib) 
  257   
 258      @classes.setter 
 266   
 267      @property 
 269          """ 
 270          Returns the base URL, given when the page was parsed. 
 271   
 272          Use with ``urlparse.urljoin(el.base_url, href)`` to get 
 273          absolute URLs. 
 274          """ 
 275          return self.getroottree().docinfo.URL 
  276   
 277      @property 
 283   
 284      @property 
 286          """ 
        Return the <body> element.  Can be called from a child element
        to get the document's body.
 289          """ 
 290          return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] 
  291   
 292      @property 
 294          """ 
 295          Returns the <head> element.  Can be called from a child 
 296          element to get the document's head. 
 297          """ 
 298          return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] 
  299   
 300      @property 
 302          """ 
 303          Get or set any <label> element associated with this element. 
 304          """ 
 305          id = self.get('id') 
 306          if not id: 
 307              return None 
 308          result = _label_xpath(self, id=id) 
 309          if not result: 
 310              return None 
 311          else: 
 312              return result[0] 
  313   
 314      @label.setter 
 316          id = self.get('id') 
 317          if not id: 
 318              raise TypeError( 
 319                  "You cannot set a label for an element (%r) that has no id" 
 320                  % self) 
 321          if _nons(label.tag) != 'label': 
 322              raise TypeError( 
 323                  "You can only assign label to a label element (not %r)" 
 324                  % label) 
 325          label.set('for', id) 
  326   
 327      @label.deleter 
 332   
 334          """ 
 335          Removes this element from the tree, including its children and 
 336          text.  The tail text is joined to the previous element or 
 337          parent. 
 338          """ 
 339          parent = self.getparent() 
 340          assert parent is not None 
 341          if self.tail: 
 342              previous = self.getprevious() 
 343              if previous is None: 
 344                  parent.text = (parent.text or '') + self.tail 
 345              else: 
 346                  previous.tail = (previous.tail or '') + self.tail 
 347          parent.remove(self) 
  348   
 350          """ 
 351          Remove the tag, but not its children or text.  The children and text 
 352          are merged into the parent. 
 353   
 354          Example:: 
 355   
 356              >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') 
 357              >>> h.find('.//b').drop_tag() 
 358              >>> print(tostring(h, encoding='unicode')) 
 359              <div>Hello World!</div> 
 360          """ 
 361          parent = self.getparent() 
 362          assert parent is not None 
 363          previous = self.getprevious() 
 364          if self.text and isinstance(self.tag, basestring): 
 365               
 366              if previous is None: 
 367                  parent.text = (parent.text or '') + self.text 
 368              else: 
 369                  previous.tail = (previous.tail or '') + self.text 
 370          if self.tail: 
 371              if len(self): 
 372                  last = self[-1] 
 373                  last.tail = (last.tail or '') + self.tail 
 374              elif previous is None: 
 375                  parent.text = (parent.text or '') + self.tail 
 376              else: 
 377                  previous.tail = (previous.tail or '') + self.tail 
 378          index = parent.index(self) 
 379          parent[index:index+1] = self[:] 
  380   
 382          """ 
 383          Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements. 
 384          """ 
 385          rel = rel.lower() 
 386          return [el for el in _rel_links_xpath(self) 
 387                  if el.get('rel').lower() == rel] 
  388   
 390          """ 
 391          Find any elements with the given class name. 
 392          """ 
 393          return _class_xpath(self, class_name=class_name) 
  394   
 396          """ 
 397          Get the first element in a document with the given id.  If none is 
 398          found, return the default argument if provided or raise KeyError 
 399          otherwise. 
 400   
 401          Note that there can be more than one element with the same id, 
 402          and this isn't uncommon in HTML documents found in the wild. 
 403          Browsers return only the first match, and this function does 
 404          the same. 
 405          """ 
 406          try: 
 407               
 408               
 409              return _id_xpath(self, id=id)[0] 
 410          except IndexError: 
 411              if default: 
 412                  return default[0] 
 413              else: 
 414                  raise KeyError(id) 
  415   
    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).

        The value is obtained by evaluating the precompiled XPath
        ``string()`` expression with this element as context.
        """
        return _collect_string_content(self)
  421   
    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        ``translator`` is passed through unchanged to the selector
        (default: ``'html'``).

        Equivalent to
        ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Imported inside the method, so lxml.cssselect is only loaded when
        # this method is actually called.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)
  434   
 435       
 436       
 437       
 438   
 439 -    def make_links_absolute(self, base_url=None, resolve_base_href=True, 
 440                              handle_failures=None): 
  441          """ 
 442          Make all links in the document absolute, given the 
 443          ``base_url`` for the document (the full URL where the document 
 444          came from), or if no ``base_url`` is given, then the ``.base_url`` 
 445          of the document. 
 446   
 447          If ``resolve_base_href`` is true, then any ``<base href>`` 
 448          tags in the document are used *and* removed from the document. 
 449          If it is false then any such tag is ignored. 
 450   
 451          If ``handle_failures`` is None (default), a failure to process 
 452          a URL will abort the processing.  If set to 'ignore', errors 
 453          are ignored.  If set to 'discard', failing URLs will be removed. 
 454          """ 
 455          if base_url is None: 
 456              base_url = self.base_url 
 457              if base_url is None: 
 458                  raise TypeError( 
 459                      "No base_url given, and the document has no base_url") 
 460          if resolve_base_href: 
 461              self.resolve_base_href() 
 462   
 463          if handle_failures == 'ignore': 
 464              def link_repl(href): 
 465                  try: 
 466                      return urljoin(base_url, href) 
 467                  except ValueError: 
 468                      return href 
  469          elif handle_failures == 'discard': 
 470              def link_repl(href): 
 471                  try: 
 472                      return urljoin(base_url, href) 
 473                  except ValueError: 
 474                      return None 
  475          elif handle_failures is None: 
 476              def link_repl(href): 
 477                  return urljoin(base_url, href) 
 478          else: 
 479              raise ValueError( 
 480                  "unexpected value for handle_failures: %r" % handle_failures) 
 481   
 482          self.rewrite_links(link_repl) 
 483   
 485          """ 
 486          Find any ``<base href>`` tag in the document, and apply its 
 487          values to all links found in the document.  Also remove the 
 488          tag once it has been applied. 
 489   
 490          If ``handle_failures`` is None (default), a failure to process 
 491          a URL will abort the processing.  If set to 'ignore', errors 
 492          are ignored.  If set to 'discard', failing URLs will be removed. 
 493          """ 
 494          base_href = None 
 495          basetags = self.xpath('//base[@href]|//x:base[@href]', 
 496                                namespaces={'x': XHTML_NAMESPACE}) 
 497          for b in basetags: 
 498              base_href = b.get('href') 
 499              b.drop_tree() 
 500          if not base_href: 
 501              return 
 502          self.make_links_absolute(base_href, resolve_base_href=False, 
 503                                   handle_failures=handle_failures) 
  504   
 506          """ 
 507          Yield (element, attribute, link, pos), where attribute may be None 
 508          (indicating the link is in the text).  ``pos`` is the position 
 509          where the link occurs; often 0, but sometimes something else in 
 510          the case of links in stylesheets or style tags. 
 511   
 512          Note: <base href> is *not* taken into account in any way.  The 
 513          link you get is exactly the link in the document. 
 514   
 515          Note: multiple links inside of a single text string or 
 516          attribute value are returned in reversed order.  This makes it 
 517          possible to replace or delete them from the text string value 
 518          based on their reported text positions.  Otherwise, a 
 519          modification at one text position can change the positions of 
 520          links reported later on. 
 521          """ 
 522          link_attrs = defs.link_attrs 
 523          for el in self.iter(etree.Element): 
 524              attribs = el.attrib 
 525              tag = _nons(el.tag) 
 526              if tag == 'object': 
 527                  codebase = None 
 528                   
 529                   
 530                  if 'codebase' in attribs: 
 531                      codebase = el.get('codebase') 
 532                      yield (el, 'codebase', codebase, 0) 
 533                  for attrib in ('classid', 'data'): 
 534                      if attrib in attribs: 
 535                          value = el.get(attrib) 
 536                          if codebase is not None: 
 537                              value = urljoin(codebase, value) 
 538                          yield (el, attrib, value, 0) 
 539                  if 'archive' in attribs: 
 540                      for match in _archive_re.finditer(el.get('archive')): 
 541                          value = match.group(0) 
 542                          if codebase is not None: 
 543                              value = urljoin(codebase, value) 
 544                          yield (el, 'archive', value, match.start()) 
 545              else: 
 546                  for attrib in link_attrs: 
 547                      if attrib in attribs: 
 548                          yield (el, attrib, attribs[attrib], 0) 
 549              if tag == 'meta': 
 550                  http_equiv = attribs.get('http-equiv', '').lower() 
 551                  if http_equiv == 'refresh': 
 552                      content = attribs.get('content', '') 
 553                      match = _parse_meta_refresh_url(content) 
 554                      url = (match.group('url') if match else content).strip() 
 555                       
 556                       
 557                      if url: 
 558                          url, pos = _unquote_match( 
 559                              url, match.start('url') if match else content.find(url)) 
 560                          yield (el, 'content', url, pos) 
 561              elif tag == 'param': 
 562                  valuetype = el.get('valuetype') or '' 
 563                  if valuetype.lower() == 'ref': 
 564                       
 565                       
 566                       
 567                       
 568                       
 569                       
 570                      yield (el, 'value', el.get('value'), 0) 
 571              elif tag == 'style' and el.text: 
 572                  urls = [ 
 573                       
 574                      _unquote_match(match.group(1), match.start(1))[::-1] 
 575                      for match in _iter_css_urls(el.text) 
 576                      ] + [ 
 577                      (match.start(1), match.group(1)) 
 578                      for match in _iter_css_imports(el.text) 
 579                      ] 
 580                  if urls: 
 581                       
 582                       
 583                       
 584                      urls.sort(reverse=True) 
 585                      for start, url in urls: 
 586                          yield (el, None, url, start) 
 587              if 'style' in attribs: 
 588                  urls = list(_iter_css_urls(attribs['style'])) 
 589                  if urls: 
 590                       
 591                      for match in urls[::-1]: 
 592                          url, start = _unquote_match(match.group(1), match.start(1)) 
 593                          yield (el, 'style', url, start) 
  594   
 595 -    def rewrite_links(self, link_repl_func, resolve_base_href=True, 
 596                        base_href=None): 
  597          """ 
 598          Rewrite all the links in the document.  For each link 
 599          ``link_repl_func(link)`` will be called, and the return value 
 600          will replace the old link. 
 601   
 602          Note that links may not be absolute (unless you first called 
 603          ``make_links_absolute()``), and may be internal (e.g., 
 604          ``'#anchor'``).  They can also be values like 
 605          ``'mailto:email'`` or ``'javascript:expr'``. 
 606   
 607          If you give ``base_href`` then all links passed to 
 608          ``link_repl_func()`` will take that into account. 
 609   
 610          If the ``link_repl_func`` returns None, the attribute or 
 611          tag text will be removed completely. 
 612          """ 
 613          if base_href is not None: 
 614               
 615               
 616              self.make_links_absolute( 
 617                  base_href, resolve_base_href=resolve_base_href) 
 618          elif resolve_base_href: 
 619              self.resolve_base_href() 
 620   
 621          for el, attrib, link, pos in self.iterlinks(): 
 622              new_link = link_repl_func(link.strip()) 
 623              if new_link == link: 
 624                  continue 
 625              if new_link is None: 
 626                   
 627                  if attrib is None: 
 628                      el.text = '' 
 629                  else: 
 630                      del el.attrib[attrib] 
 631                  continue 
 632   
 633              if attrib is None: 
 634                  new = el.text[:pos] + new_link + el.text[pos+len(link):] 
 635                  el.text = new 
 636              else: 
 637                  cur = el.get(attrib) 
 638                  if not pos and len(cur) == len(link): 
 639                      new = new_link   
 640                  else: 
 641                      new = cur[:pos] + new_link + cur[pos+len(link):] 
 642                  el.set(attrib, new) 
  643   
 646      """ 
 647      An object that represents a method on an element as a function; 
 648      the function takes either an element or an HTML string.  It 
 649      returns whatever the function normally returns, or if the function 
 650      works in-place (and so returns None) it returns a serialized form 
 651      of the resulting document. 
 652      """ 
 658          result_type = type(doc) 
 659          if isinstance(doc, basestring): 
 660              if 'copy' in kw: 
 661                  raise TypeError( 
 662                      "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name) 
 663              doc = fromstring(doc, **kw) 
 664          else: 
 665              if 'copy' in kw: 
 666                  make_a_copy = kw.pop('copy') 
 667              else: 
 668                  make_a_copy = self.copy 
 669              if make_a_copy: 
 670                  doc = copy.deepcopy(doc) 
 671          meth = getattr(doc, self.name) 
 672          result = meth(*args, **kw) 
 673           
 674          if result is None: 
 675               
 676              return _transform_result(result_type, doc) 
 677          else: 
 678              return result 
   679   
 680   
# Module-level function forms of the link-related element methods.  With
# ``copy=True`` an element argument is deep-copied before the method runs on
# it, unless the caller overrides that with an explicit ``copy`` keyword.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
 691   
 697   
 701   
 702   
 703 -class HtmlEntity(etree.EntityBase, HtmlMixin): 
  705   
 708      """A lookup scheme for HTML Element classes. 
 709   
 710      To create a lookup instance with different Element classes, pass a tag 
 711      name mapping of Element classes in the ``classes`` keyword argument and/or 
 712      a tag name mapping of Mixin classes in the ``mixins`` keyword argument. 
 713      The special key '*' denotes a Mixin class that should be mixed into all 
 714      Element classes. 
 715      """ 
 716      _default_element_classes = {} 
 717   
 718 -    def __init__(self, classes=None, mixins=None): 
  735   
 736 -    def lookup(self, node_type, document, namespace, name): 
   747   
 748   
 749   
 750   
 751   
 752   
 753  _looks_like_full_html_unicode = re.compile( 
 754      unicode(r'^\s*<(?:html|!doctype)'), re.I).match 
 755  _looks_like_full_html_bytes = re.compile( 
 756      r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match 
 771   
 775      """Parses several HTML elements, returning a list of elements. 
 776   
 777      The first item in the list may be a string. 
 778      If no_leading_text is true, then it will be an error if there is 
 779      leading text, and it will always be a list of only elements. 
 780   
 781      base_url will set the document's base_url attribute 
 782      (and the tree's docinfo.URL). 
 783      """ 
 784      if parser is None: 
 785          parser = html_parser 
 786       
 787      if isinstance(html, bytes): 
 788          if not _looks_like_full_html_bytes(html): 
 789               
 790              html = ('<html><body>'.encode('ascii') + html + 
 791                      '</body></html>'.encode('ascii')) 
 792      else: 
 793          if not _looks_like_full_html_unicode(html): 
 794              html = '<html><body>%s</body></html>' % html 
 795      doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) 
 796      assert _nons(doc.tag) == 'html' 
 797      bodies = [e for e in doc if _nons(e.tag) == 'body'] 
 798      assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html)) 
 799      body = bodies[0] 
 800      elements = [] 
 801      if no_leading_text and body.text and body.text.strip(): 
 802          raise etree.ParserError( 
 803              "There is leading text: %r" % body.text) 
 804      if body.text and body.text.strip(): 
 805          elements.append(body.text) 
 806      elements.extend(body) 
 807       
 808       
 809      return elements 
  810   
 814      """ 
 815      Parses a single HTML element; it is an error if there is more than 
 816      one element, or if anything but whitespace precedes or follows the 
 817      element. 
 818   
 819      If ``create_parent`` is true (or is a tag name) then a parent node 
 820      will be created to encapsulate the HTML in a single element.  In this 
 821      case, leading or trailing text is also allowed, as are multiple elements 
 822      as result of the parsing. 
 823   
 824      Passing a ``base_url`` will set the document's ``base_url`` attribute 
 825      (and the tree's docinfo.URL). 
 826      """ 
 827      if parser is None: 
 828          parser = html_parser 
 829   
 830      accept_leading_text = bool(create_parent) 
 831   
 832      elements = fragments_fromstring( 
 833          html, parser=parser, no_leading_text=not accept_leading_text, 
 834          base_url=base_url, **kw) 
 835   
 836      if create_parent: 
 837          if not isinstance(create_parent, basestring): 
 838              create_parent = 'div' 
 839          new_root = Element(create_parent) 
 840          if elements: 
 841              if isinstance(elements[0], basestring): 
 842                  new_root.text = elements[0] 
 843                  del elements[0] 
 844              new_root.extend(elements) 
 845          return new_root 
 846   
 847      if not elements: 
 848          raise etree.ParserError('No elements found') 
 849      if len(elements) > 1: 
 850          raise etree.ParserError( 
 851              "Multiple elements found (%s)" 
 852              % ', '.join([_element_name(e) for e in elements])) 
 853      el = elements[0] 
 854      if el.tail and el.tail.strip(): 
 855          raise etree.ParserError( 
 856              "Element followed by text: %r" % el.tail) 
 857      el.tail = None 
 858      return el 
  859   
 860   
 861 -def fromstring(html, base_url=None, parser=None, **kw): 
  927   
 928   
 929 -def parse(filename_or_url, parser=None, base_url=None, **kw): 
  930      """ 
 931      Parse a filename, URL, or file-like object into an HTML document 
 932      tree.  Note: this returns a tree, not an element.  Use 
 933      ``parse(...).getroot()`` to get the document root. 
 934   
 935      You can override the base URL with the ``base_url`` keyword.  This 
 936      is most useful when parsing from a file-like object. 
 937      """ 
 938      if parser is None: 
 939          parser = html_parser 
 940      return etree.parse(filename_or_url, parser, base_url=base_url, **kw) 
  941   
 950   
 953      if isinstance(el, etree.CommentBase): 
 954          return 'comment' 
 955      elif isinstance(el, basestring): 
 956          return 'string' 
 957      else: 
 958          return _nons(el.tag) 
  959   
1079   
1080   
1081  HtmlElementClassLookup._default_element_classes['form'] = FormElement 
1120   
1123      if not url: 
1124          raise ValueError("cannot submit, no URL provided") 
1125       
1126      try: 
1127          from urllib import urlencode, urlopen 
1128      except ImportError:  
1129          from urllib.request import urlopen 
1130          from urllib.parse import urlencode 
1131      if method == 'GET': 
1132          if '?' in url: 
1133              url += '&' 
1134          else: 
1135              url += '?' 
1136          url += urlencode(values) 
1137          data = None 
1138      else: 
1139          data = urlencode(values) 
1140          if not isinstance(data, bytes): 
1141              data = data.encode('ASCII') 
1142      return urlopen(url, data) 
 1143   
1146   
1154          raise KeyError( 
1155              "You cannot remove keys from ElementDict") 
 1159          return item in self.inputs 
 1164   
1166          return '<%s for form %s>' % ( 
1167              self.__class__.__name__, 
1168              self.inputs.form._name()) 
  1169   
1236   
1267   
1268   
1269 -class TextareaElement(InputMixin, HtmlElement): 
 1270      """ 
1271      ``<textarea>`` element.  You can get the name with ``.name`` and 
1272      get/set the value with ``.value`` 
1273      """ 
1274      @property 
1276          """ 
1277          Get/set the value (which is the contents of this element) 
1278          """ 
1279          content = self.text or '' 
1280          if self.tag.startswith("{%s}" % XHTML_NAMESPACE): 
1281              serialisation_method = 'xml' 
1282          else: 
1283              serialisation_method = 'html' 
1284          for el in self: 
1285               
1286              content += etree.tostring( 
1287                  el, method=serialisation_method, encoding='unicode') 
1288          return content 
 1289   
1290      @value.setter 
    def value(self, value):
        # Remove all child elements first: the getter builds the value from
        # the element text plus the serialised children, so leftover
        # children would otherwise still show up in the value.
        del self[:]
        self.text = value
 1294   
1295      @value.deleter 
1297          self.text = '' 
1298          del self[:] 
  1299   
1300   
# Store TextareaElement under the 'textarea' key of the lookup's default
# element-class table (presumably keyed by tag name -- confirm against
# HtmlElementClassLookup).
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1305      """ 
1306      ``<select>`` element.  You can get the name with ``.name``. 
1307   
1308      ``.value`` will be the value of the selected option, unless this 
1309      is a multi-select element (``<select multiple>``), in which case 
1310      it will be a set-like object.  In either case ``.value_options`` 
1311      gives the possible values. 
1312   
1313      The boolean attribute ``.multiple`` shows if this is a 
1314      multi-select. 
1315      """ 
1316      @property 
1318          """ 
1319          Get/set the value of this select (the selected option). 
1320   
1321          If this is a multi-select, this is a set-like object that 
1322          represents all the selected options. 
1323          """ 
1324          if self.multiple: 
1325              return MultipleSelectOptions(self) 
1326          for el in _options_xpath(self): 
1327              if el.get('selected') is not None: 
1328                  value = el.get('value') 
1329                  if value is None: 
1330                      value = (el.text or '').strip() 
1331                  return value 
1332          return None 
 1333   
1334      @value.setter 
1335 -    def value(self, value): 
 1336          if self.multiple: 
1337              if isinstance(value, basestring): 
1338                  raise TypeError("You must pass in a sequence") 
1339              values = self.value 
1340              values.clear() 
1341              values.update(value) 
1342              return 
1343          checked_option = None 
1344          if value is not None: 
1345              for el in _options_xpath(self): 
1346                  opt_value = el.get('value') 
1347                  if opt_value is None: 
1348                      opt_value = (el.text or '').strip() 
1349                  if opt_value == value: 
1350                      checked_option = el 
1351                      break 
1352              else: 
1353                  raise ValueError( 
1354                      "There is no option with the value of %r" % value) 
1355          for el in _options_xpath(self): 
1356              if 'selected' in el.attrib: 
1357                  del el.attrib['selected'] 
1358          if checked_option is not None: 
1359              checked_option.set('selected', '') 
 1360   
1361      @value.deleter 
1368   
1369      @property 
1382   
1383      @property 
1385          """ 
1386          Boolean attribute: is there a ``multiple`` attribute on this element. 
1387          """ 
1388          return 'multiple' in self.attrib 
 1389   
1390      @multiple.setter 
1392          if value: 
1393              self.set('multiple', '') 
1394          elif 'multiple' in self.attrib: 
1395              del self.attrib['multiple'] 
  1396   
1397   
# Store SelectElement under the 'select' key of the lookup's default
# element-class table (presumably keyed by tag name -- confirm against
# HtmlElementClassLookup).
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1402      """ 
1403      Represents all the selected options in a ``<select multiple>`` element. 
1404   
1405      You can add to this set-like option to select an option, or remove 
1406      to unselect the option. 
1407      """ 
1408   
1410          self.select = select 
 1411   
1412      @property 
1414          """ 
1415          Iterator of all the ``<option>`` elements. 
1416          """ 
1417          return iter(_options_xpath(self.select)) 
 1418   
1420          for option in self.options: 
1421              if 'selected' in option.attrib: 
1422                  opt_value = option.get('value') 
1423                  if opt_value is None: 
1424                      opt_value = (option.text or '').strip() 
1425                  yield opt_value 
 1426   
1427 -    def add(self, item): 
 1428          for option in self.options: 
1429              opt_value = option.get('value') 
1430              if opt_value is None: 
1431                  opt_value = (option.text or '').strip() 
1432              if opt_value == item: 
1433                  option.set('selected', '') 
1434                  break 
1435          else: 
1436              raise ValueError( 
1437                  "There is no option with the value %r" % item) 
 1438   
1440          for option in self.options: 
1441              opt_value = option.get('value') 
1442              if opt_value is None: 
1443                  opt_value = (option.text or '').strip() 
1444              if opt_value == item: 
1445                  if 'selected' in option.attrib: 
1446                      del option.attrib['selected'] 
1447                  else: 
1448                      raise ValueError( 
1449                          "The option %r is not currently selected" % item) 
1450                  break 
1451          else: 
1452              raise ValueError( 
1453                  "There is not option with the value %r" % item) 
 1454   
1456          return '<%s {%s} for select name=%r>' % ( 
1457              self.__class__.__name__, 
1458              ', '.join([repr(v) for v in self]), 
1459              self.select.name) 
  1460   
1463      """ 
1464      This object represents several ``<input type=radio>`` elements 
1465      that have the same name. 
1466   
1467      You can use this like a list, but also use the property 
1468      ``.value`` to check/uncheck inputs.  Also you can use 
1469      ``.value_options`` to get the possible values. 
1470      """ 
1471      @property 
1473          """ 
1474          Get/set the value, which checks the radio with that value (and 
1475          unchecks any other value). 
1476          """ 
1477          for el in self: 
1478              if 'checked' in el.attrib: 
1479                  return el.get('value') 
1480          return None 
 1481   
1482      @value.setter 
1483 -    def value(self, value): 
 1484          checked_option = None 
1485          if value is not None: 
1486              for el in self: 
1487                  if el.get('value') == value: 
1488                      checked_option = el 
1489                      break 
1490              else: 
1491                  raise ValueError("There is no radio input with the value %r" % value) 
1492          for el in self: 
1493              if 'checked' in el.attrib: 
1494                  del el.attrib['checked'] 
1495          if checked_option is not None: 
1496              checked_option.set('checked', '') 
 1497   
1498      @value.deleter 
1501   
1502      @property 
1504          """ 
1505          Returns a list of all the possible values. 
1506          """ 
1507          return [el.get('value') for el in self] 
 1508   
1510          return '%s(%s)' % ( 
1511              self.__class__.__name__, 
1512              list.__repr__(self)) 
  1513   
1516      """ 
1517      Represents a group of checkboxes (``<input type=checkbox>``) that 
1518      have the same name. 
1519   
1520      In addition to using this like a list, the ``.value`` attribute 
1521      returns a set-like object that you can add to or remove from to 
1522      check and uncheck checkboxes.  You can also use ``.value_options`` 
1523      to get the possible values. 
1524      """ 
1525      @property 
1527          """ 
1528          Return a set-like object that can be modified to check or 
1529          uncheck individual checkboxes according to their value. 
1530          """ 
1531          return CheckboxValues(self) 
 1532   
1533      @value.setter 
1534 -    def value(self, value): 
 1542   
1543      @value.deleter 
1546   
1547      @property 
1549          """ 
1550          Returns a list of all the possible values. 
1551          """ 
1552          return [el.get('value') for el in self] 
 1553   
1555          return '%s(%s)' % ( 
1556              self.__class__.__name__, list.__repr__(self)) 
  1557   
1560      """ 
1561      Represents the values of the checked checkboxes in a group of 
1562      checkboxes with the same name. 
1563      """ 
1564   
1567   
1569          return iter([ 
1570              el.get('value') 
1571              for el in self.group 
1572              if 'checked' in el.attrib]) 
 1573   
1574 -    def add(self, value): 
 1575          for el in self.group: 
1576              if el.get('value') == value: 
1577                  el.set('checked', '') 
1578                  break 
1579          else: 
1580              raise KeyError("No checkbox with value %r" % value) 
 1581   
1583          for el in self.group: 
1584              if el.get('value') == value: 
1585                  if 'checked' in el.attrib: 
1586                      del el.attrib['checked'] 
1587                  else: 
1588                      raise KeyError( 
1589                          "The checkbox with value %r was already unchecked" % value) 
1590                  break 
1591          else: 
1592              raise KeyError( 
1593                  "No checkbox with value %r" % value) 
 1594   
1596          return '<%s {%s} for checkboxes name=%r>' % ( 
1597              self.__class__.__name__, 
1598              ', '.join([repr(v) for v in self]), 
1599              self.group.name) 
  1600   
1694   
1695   
# Store InputElement under the 'input' key of the lookup's default
# element-class table (presumably keyed by tag name -- confirm against
# HtmlElementClassLookup).
HtmlElementClassLookup._default_element_classes['input'] = InputElement
1700      """ 
1701      Represents a ``<label>`` element. 
1702   
1703      Label elements are linked to other elements with their ``for`` 
1704      attribute.  You can access this element with ``label.for_element``. 
1705      """ 
1706      @property 
1708          """ 
1709          Get/set the element this label points to.  Return None if it 
1710          can't be found. 
1711          """ 
1712          id = self.get('for') 
1713          if not id: 
1714              return None 
1715          return self.body.get_element_by_id(id) 
 1716   
1717      @for_element.setter 
1719          id = other.get('id') 
1720          if not id: 
1721              raise TypeError( 
1722                  "Element %r has no id attribute" % other) 
1723          self.set('for', id) 
 1724   
1725      @for_element.deleter 
 1730   
1731   
# Store LabelElement under the 'label' key of the lookup's default
# element-class table (presumably keyed by tag name -- confirm against
# HtmlElementClassLookup).
HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1752   
1755      """Convert all tags in an XHTML tree to HTML by removing their 
1756      XHTML namespace. 
1757      """ 
1758      try: 
1759          xhtml = xhtml.getroot() 
1760      except AttributeError: 
1761          pass 
1762      prefix = "{%s}" % XHTML_NAMESPACE 
1763      prefix_len = len(prefix) 
1764      for el in xhtml.iter(prefix + "*"): 
1765          el.tag = el.tag[prefix_len:] 
 1766   
1767   
1768   
1769   
# Bound ``sub`` methods of two compiled patterns that match a
# ``<meta http-equiv="Content-Type" ...>`` tag: one pattern for text
# strings and one (ASCII-encoded) for byte strings.  tostring() calls them
# with an empty replacement to strip that tag from serialised HTML.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1774   
1775   
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Pass the name ``'unicode'`` as ``encoding`` to serialise
    to a Unicode string instead.

    The ``method`` argument selects the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    Pass ``with_tail=False`` to leave out the tail text of the top-level
    element that is being serialised.

    The ``doctype`` option takes a plain string that will be serialised
    before the XML tree.  Note that passing in non well-formed content
    here will make the XML output non well-formed.  Also, an existing
    doctype in the document tree will not be removed when serialising an
    ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print, encoding=encoding,
        with_tail=with_tail, doctype=doctype)
    # The Content-Type meta tag is only stripped from HTML output, and only
    # when the caller did not ask to keep it.
    if method != 'html' or include_meta_content_type:
        return serialised
    # Use the regex matching the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
 1848   
1849   
# Run the docstring (which contains doctest examples) through
# __fix_docstring.  NOTE(review): presumably this normalises the u''/b''
# string prefixes in the expected doctest output for the running Python
# version -- confirm against the helper's definition.
tostring.__doc__ = __fix_docstring(tostring.__doc__)
1854      """ 
1855      Open the HTML document in a web browser, saving it to a temporary 
1856      file to open it.  Note that this does not delete the file after 
1857      use.  This is mainly meant for debugging. 
1858      """ 
1859      import os 
1860      import webbrowser 
1861      import tempfile 
1862      if not isinstance(doc, etree._ElementTree): 
1863          doc = etree.ElementTree(doc) 
1864      handle, fn = tempfile.mkstemp(suffix='.html') 
1865      f = os.fdopen(handle, 'wb') 
1866      try: 
1867          doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8") 
1868      finally: 
1869           
1870          f.close() 
1871      url = 'file://' + fn.replace(os.path.sep, '/') 
1872      print(url) 
1873      webbrowser.open(url) 
 1874   
1875   
1876   
1877   
1878   
1879   
1880 -class HTMLParser(etree.HTMLParser): 
 1881      """An HTML parser that is configured to return lxml.html Element 
1882      objects. 
1883      """ 
 1887   
1890      """An XML parser that is configured to return lxml.html Element 
1891      objects. 
1892   
1893      Note that this parser is not really XHTML aware unless you let it 
1894      load a DTD that declares the HTML entities.  To do this, make sure 
1895      you have the XHTML DTDs installed in your catalogs, and create the 
1896      parser like this:: 
1897   
1898          >>> parser = XHTMLParser(load_dtd=True) 
1899   
1900      If you additionally want to validate the document, use this:: 
1901   
1902          >>> parser = XHTMLParser(dtd_validation=True) 
1903   
1904      For catalog support, see http://www.xmlsoft.org/catalog.html. 
1905      """ 
 1909   
1912      """Create a new HTML Element. 
1913   
1914      This can also be used for XHTML documents. 
1915      """ 
1916      v = html_parser.makeelement(*args, **kw) 
1917      return v 
 1918   
1919   
# Module-level parser instances created once at import time, with default
# arguments.  ``html_parser`` is the one this module uses to create new
# elements via ``html_parser.makeelement()``.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
1922