1   
   2   
   3   
   4   
   5   
   6   
   7   
   8   
   9   
  10   
  11   
  12   
  13   
  14   
  15   
  16   
  17   
  18   
  19   
  20   
  21   
  22   
  23   
  24   
  25   
  26   
  27   
  28   
  29   
  30   
  31  """The ``lxml.html`` tool set for HTML handling. 
  32  """ 
  33   
  34  from __future__ import absolute_import 
  35   
  36  __all__ = [ 
  37      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  38      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  39      'find_rel_links', 'find_class', 'make_links_absolute', 
  40      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] 
  41   
  42   
  43  import copy 
  44  import sys 
  45  import re 
  46  from functools import partial 
  47   
  48  try: 
  49       
  50      from collections.abc import MutableMapping, MutableSet 
  51  except ImportError: 
  52      from collections import MutableMapping, MutableSet 
  53   
  54  from .. import etree 
  55  from . import defs 
  56  from ._setmixin import SetMixin 
  57   
  58  try: 
  59      from urlparse import urljoin 
  60  except ImportError: 
  61       
  62      from urllib.parse import urljoin 
  63   
# Compatibility aliases: probe for the ``unicode`` and ``basestring`` builtins
# and define stand-ins when the name lookup fails.
try:
    unicode
except NameError:
    # No ``unicode`` builtin available: use ``str`` for text strings.
    unicode = str
try:
    basestring
except NameError:
    # No ``basestring`` builtin available: ``isinstance(x, basestring)``
    # then accepts both ``str`` and ``bytes``.
    basestring = (str, bytes)
  77      if not s: 
  78          return s 
  79      if sys.version_info[0] >= 3: 
  80          sub = re.compile(r"^(\s*)u'", re.M).sub 
  81      else: 
  82          sub = re.compile(r"^(\s*)b'", re.M).sub 
  83      return sub(r"\1'", s) 
   84   
  85   
# Namespace URI of XHTML.  It is bound to the ``x`` prefix in the XPath
# expressions below so that plain and XHTML-namespaced tags both match.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# <a> elements that carry a ``rel`` attribute (context node or descendants).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
# <option> elements (context node or descendants).
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
# <form> elements (context node or descendants).
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Elements whose whitespace-normalised ``class`` attribute contains
# ``$class_name`` as a complete space-delimited token.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements whose ``id`` attribute equals ``$id``.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath ``string()`` value of the context node.
_collect_string_content = etree.XPath("string()")
# Iterates over ``url(...)`` occurrences (case-insensitive); group 1 is the
# argument, still including its surrounding quotes if it has any.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterates over ``@import "..."`` occurrences (double-quoted form only);
# group 1 is the text between the quotes.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# <label> elements anywhere in the document whose ``for`` equals ``$id``.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Matches one run of non-space characters.
_archive_re = re.compile(r'[^ ]+')
# Searches for a ';' and captures the rest of the string as the named group
# ``url``, skipping an optional ``url=`` prefix (case-insensitive).
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
 108      if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": 
 109          return s[1:-1], pos+1 
 110      else: 
 111          return s,pos 
  112   
 123   
 130   
 133      """Provides access to an element's class attribute as a set-like collection. 
 134      Usage:: 
 135   
 136          >>> el = fromstring('<p class="hidden large">Text</p>') 
 137          >>> classes = el.classes  # or: classes = Classes(el.attrib) 
 138          >>> classes |= ['block', 'paragraph'] 
 139          >>> el.get('class') 
 140          'hidden large block paragraph' 
 141          >>> classes.toggle('hidden') 
 142          False 
 143          >>> el.get('class') 
 144          'large block paragraph' 
 145          >>> classes -= ('some', 'classes', 'block') 
 146          >>> el.get('class') 
 147          'large paragraph' 
 148      """ 
 150          self._attributes = attributes 
 151          self._get_class_value = partial(attributes.get, 'class', '') 
  152   
 153 -    def add(self, value): 
  154          """ 
 155          Add a class. 
 156   
 157          This has no effect if the class is already present. 
 158          """ 
 159          if not value or re.search(r'\s', value): 
 160              raise ValueError("Invalid class name: %r" % value) 
 161          classes = self._get_class_value().split() 
 162          if value in classes: 
 163              return 
 164          classes.append(value) 
 165          self._attributes['class'] = ' '.join(classes) 
  166   
 168          """ 
 169          Remove a class if it is currently present. 
 170   
 171          If the class is not present, do nothing. 
 172          """ 
 173          if not value or re.search(r'\s', value): 
 174              raise ValueError("Invalid class name: %r" % value) 
 175          classes = [name for name in self._get_class_value().split() 
 176                     if name != value] 
 177          if classes: 
 178              self._attributes['class'] = ' '.join(classes) 
 179          elif 'class' in self._attributes: 
 180              del self._attributes['class'] 
  181   
 183          """ 
 184          Remove a class; it must currently be present. 
 185   
 186          If the class is not present, raise a KeyError. 
 187          """ 
 188          if not value or re.search(r'\s', value): 
 189              raise ValueError("Invalid class name: %r" % value) 
 190          super(Classes, self).remove(value) 
  191   
 195   
 197          return iter(self._get_class_value().split()) 
  198   
 200          return len(self._get_class_value().split()) 
  201   
 202       
 203   
 205          """ 
 206          Add all names from 'values'. 
 207          """ 
 208          classes = self._get_class_value().split() 
 209          extended = False 
 210          for value in values: 
 211              if value not in classes: 
 212                  classes.append(value) 
 213                  extended = True 
 214          if extended: 
 215              self._attributes['class'] = ' '.join(classes) 
  216   
 218          """ 
 219          Add a class name if it isn't there yet, or remove it if it exists. 
 220   
 221          Returns true if the class was added (and is now enabled) and 
 222          false if it was removed (and is now disabled). 
 223          """ 
 224          if not value or re.search(r'\s', value): 
 225              raise ValueError("Invalid class name: %r" % value) 
 226          classes = self._get_class_value().split() 
 227          try: 
 228              classes.remove(value) 
 229              enabled = False 
 230          except ValueError: 
 231              classes.append(value) 
 232              enabled = True 
 233          if classes: 
 234              self._attributes['class'] = ' '.join(classes) 
 235          else: 
 236              del self._attributes['class'] 
 237          return enabled 
   238   
 241   
 242      @property 
 244          """ 
 245          A set-like wrapper around the 'class' attribute. 
 246          """ 
 247          return Classes(self.attrib) 
  248   
 249      @classes.setter 
 257   
 258      @property 
 260          """ 
 261          Returns the base URL, given when the page was parsed. 
 262   
 263          Use with ``urlparse.urljoin(el.base_url, href)`` to get 
 264          absolute URLs. 
 265          """ 
 266          return self.getroottree().docinfo.URL 
  267   
 268      @property 
 274   
 275      @property 
 277          """ 
        Return the <body> element.  Can be called from a child element
        to get the document's body.
 280          """ 
 281          return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] 
  282   
 283      @property 
 285          """ 
 286          Returns the <head> element.  Can be called from a child 
 287          element to get the document's head. 
 288          """ 
 289          return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] 
  290   
 291      @property 
 293          """ 
 294          Get or set any <label> element associated with this element. 
 295          """ 
 296          id = self.get('id') 
 297          if not id: 
 298              return None 
 299          result = _label_xpath(self, id=id) 
 300          if not result: 
 301              return None 
 302          else: 
 303              return result[0] 
  304   
 305      @label.setter 
 307          id = self.get('id') 
 308          if not id: 
 309              raise TypeError( 
 310                  "You cannot set a label for an element (%r) that has no id" 
 311                  % self) 
 312          if _nons(label.tag) != 'label': 
 313              raise TypeError( 
 314                  "You can only assign label to a label element (not %r)" 
 315                  % label) 
 316          label.set('for', id) 
  317   
 318      @label.deleter 
 323   
 325          """ 
 326          Removes this element from the tree, including its children and 
 327          text.  The tail text is joined to the previous element or 
 328          parent. 
 329          """ 
 330          parent = self.getparent() 
 331          assert parent is not None 
 332          if self.tail: 
 333              previous = self.getprevious() 
 334              if previous is None: 
 335                  parent.text = (parent.text or '') + self.tail 
 336              else: 
 337                  previous.tail = (previous.tail or '') + self.tail 
 338          parent.remove(self) 
  339   
 341          """ 
 342          Remove the tag, but not its children or text.  The children and text 
 343          are merged into the parent. 
 344   
 345          Example:: 
 346   
 347              >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') 
 348              >>> h.find('.//b').drop_tag() 
 349              >>> print(tostring(h, encoding='unicode')) 
 350              <div>Hello World!</div> 
 351          """ 
 352          parent = self.getparent() 
 353          assert parent is not None 
 354          previous = self.getprevious() 
 355          if self.text and isinstance(self.tag, basestring): 
 356               
 357              if previous is None: 
 358                  parent.text = (parent.text or '') + self.text 
 359              else: 
 360                  previous.tail = (previous.tail or '') + self.text 
 361          if self.tail: 
 362              if len(self): 
 363                  last = self[-1] 
 364                  last.tail = (last.tail or '') + self.tail 
 365              elif previous is None: 
 366                  parent.text = (parent.text or '') + self.tail 
 367              else: 
 368                  previous.tail = (previous.tail or '') + self.tail 
 369          index = parent.index(self) 
 370          parent[index:index+1] = self[:] 
  371   
 373          """ 
 374          Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements. 
 375          """ 
 376          rel = rel.lower() 
 377          return [el for el in _rel_links_xpath(self) 
 378                  if el.get('rel').lower() == rel] 
  379   
 381          """ 
 382          Find any elements with the given class name. 
 383          """ 
 384          return _class_xpath(self, class_name=class_name) 
  385   
 387          """ 
 388          Get the first element in a document with the given id.  If none is 
 389          found, return the default argument if provided or raise KeyError 
 390          otherwise. 
 391   
 392          Note that there can be more than one element with the same id, 
 393          and this isn't uncommon in HTML documents found in the wild. 
 394          Browsers return only the first match, and this function does 
 395          the same. 
 396          """ 
 397          try: 
 398               
 399               
 400              return _id_xpath(self, id=id)[0] 
 401          except IndexError: 
 402              if default: 
 403                  return default[0] 
 404              else: 
 405                  raise KeyError(id) 
  406   
 407 -    def text_content(self): 
  408          """ 
 409          Return the text content of the tag (and the text in any children). 
 410          """ 
 411          return _collect_string_content(self) 
  412   
 413 -    def cssselect(self, expr, translator='html'): 
  414          """ 
 415          Run the CSS expression on this element and its children, 
 416          returning a list of the results. 
 417   
 418          Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) 
 419          -- note that pre-compiling the expression can provide a substantial 
 420          speedup. 
 421          """ 
 422           
 423          from lxml.cssselect import CSSSelector 
 424          return CSSSelector(expr, translator=translator)(self) 
  425   
 426       
 427       
 428       
 429   
 430 -    def make_links_absolute(self, base_url=None, resolve_base_href=True, 
 431                              handle_failures=None): 
  432          """ 
 433          Make all links in the document absolute, given the 
 434          ``base_url`` for the document (the full URL where the document 
 435          came from), or if no ``base_url`` is given, then the ``.base_url`` 
 436          of the document. 
 437   
 438          If ``resolve_base_href`` is true, then any ``<base href>`` 
 439          tags in the document are used *and* removed from the document. 
 440          If it is false then any such tag is ignored. 
 441   
 442          If ``handle_failures`` is None (default), a failure to process 
 443          a URL will abort the processing.  If set to 'ignore', errors 
 444          are ignored.  If set to 'discard', failing URLs will be removed. 
 445          """ 
 446          if base_url is None: 
 447              base_url = self.base_url 
 448              if base_url is None: 
 449                  raise TypeError( 
 450                      "No base_url given, and the document has no base_url") 
 451          if resolve_base_href: 
 452              self.resolve_base_href() 
 453   
 454          if handle_failures == 'ignore': 
 455              def link_repl(href): 
 456                  try: 
 457                      return urljoin(base_url, href) 
 458                  except ValueError: 
 459                      return href 
  460          elif handle_failures == 'discard': 
 461              def link_repl(href): 
 462                  try: 
 463                      return urljoin(base_url, href) 
 464                  except ValueError: 
 465                      return None 
  466          elif handle_failures is None: 
 467              def link_repl(href): 
 468                  return urljoin(base_url, href) 
 469          else: 
 470              raise ValueError( 
 471                  "unexpected value for handle_failures: %r" % handle_failures) 
 472   
 473          self.rewrite_links(link_repl) 
 474   
 476          """ 
 477          Find any ``<base href>`` tag in the document, and apply its 
 478          values to all links found in the document.  Also remove the 
 479          tag once it has been applied. 
 480   
 481          If ``handle_failures`` is None (default), a failure to process 
 482          a URL will abort the processing.  If set to 'ignore', errors 
 483          are ignored.  If set to 'discard', failing URLs will be removed. 
 484          """ 
 485          base_href = None 
 486          basetags = self.xpath('//base[@href]|//x:base[@href]', 
 487                                namespaces={'x': XHTML_NAMESPACE}) 
 488          for b in basetags: 
 489              base_href = b.get('href') 
 490              b.drop_tree() 
 491          if not base_href: 
 492              return 
 493          self.make_links_absolute(base_href, resolve_base_href=False, 
 494                                   handle_failures=handle_failures) 
  495   
 497          """ 
 498          Yield (element, attribute, link, pos), where attribute may be None 
 499          (indicating the link is in the text).  ``pos`` is the position 
 500          where the link occurs; often 0, but sometimes something else in 
 501          the case of links in stylesheets or style tags. 
 502   
 503          Note: <base href> is *not* taken into account in any way.  The 
 504          link you get is exactly the link in the document. 
 505   
 506          Note: multiple links inside of a single text string or 
 507          attribute value are returned in reversed order.  This makes it 
 508          possible to replace or delete them from the text string value 
 509          based on their reported text positions.  Otherwise, a 
 510          modification at one text position can change the positions of 
 511          links reported later on. 
 512          """ 
 513          link_attrs = defs.link_attrs 
 514          for el in self.iter(etree.Element): 
 515              attribs = el.attrib 
 516              tag = _nons(el.tag) 
 517              if tag == 'object': 
 518                  codebase = None 
 519                   
 520                   
 521                  if 'codebase' in attribs: 
 522                      codebase = el.get('codebase') 
 523                      yield (el, 'codebase', codebase, 0) 
 524                  for attrib in ('classid', 'data'): 
 525                      if attrib in attribs: 
 526                          value = el.get(attrib) 
 527                          if codebase is not None: 
 528                              value = urljoin(codebase, value) 
 529                          yield (el, attrib, value, 0) 
 530                  if 'archive' in attribs: 
 531                      for match in _archive_re.finditer(el.get('archive')): 
 532                          value = match.group(0) 
 533                          if codebase is not None: 
 534                              value = urljoin(codebase, value) 
 535                          yield (el, 'archive', value, match.start()) 
 536              else: 
 537                  for attrib in link_attrs: 
 538                      if attrib in attribs: 
 539                          yield (el, attrib, attribs[attrib], 0) 
 540              if tag == 'meta': 
 541                  http_equiv = attribs.get('http-equiv', '').lower() 
 542                  if http_equiv == 'refresh': 
 543                      content = attribs.get('content', '') 
 544                      match = _parse_meta_refresh_url(content) 
 545                      url = (match.group('url') if match else content).strip() 
 546                       
 547                       
 548                      if url: 
 549                          url, pos = _unquote_match( 
 550                              url, match.start('url') if match else content.find(url)) 
 551                          yield (el, 'content', url, pos) 
 552              elif tag == 'param': 
 553                  valuetype = el.get('valuetype') or '' 
 554                  if valuetype.lower() == 'ref': 
 555                       
 556                       
 557                       
 558                       
 559                       
 560                       
 561                      yield (el, 'value', el.get('value'), 0) 
 562              elif tag == 'style' and el.text: 
 563                  urls = [ 
 564                       
 565                      _unquote_match(match.group(1), match.start(1))[::-1] 
 566                      for match in _iter_css_urls(el.text) 
 567                      ] + [ 
 568                      (match.start(1), match.group(1)) 
 569                      for match in _iter_css_imports(el.text) 
 570                      ] 
 571                  if urls: 
 572                       
 573                       
 574                       
 575                      urls.sort(reverse=True) 
 576                      for start, url in urls: 
 577                          yield (el, None, url, start) 
 578              if 'style' in attribs: 
 579                  urls = list(_iter_css_urls(attribs['style'])) 
 580                  if urls: 
 581                       
 582                      for match in urls[::-1]: 
 583                          url, start = _unquote_match(match.group(1), match.start(1)) 
 584                          yield (el, 'style', url, start) 
  585   
 586 -    def rewrite_links(self, link_repl_func, resolve_base_href=True, 
 587                        base_href=None): 
  588          """ 
 589          Rewrite all the links in the document.  For each link 
 590          ``link_repl_func(link)`` will be called, and the return value 
 591          will replace the old link. 
 592   
 593          Note that links may not be absolute (unless you first called 
 594          ``make_links_absolute()``), and may be internal (e.g., 
 595          ``'#anchor'``).  They can also be values like 
 596          ``'mailto:email'`` or ``'javascript:expr'``. 
 597   
 598          If you give ``base_href`` then all links passed to 
 599          ``link_repl_func()`` will take that into account. 
 600   
 601          If the ``link_repl_func`` returns None, the attribute or 
 602          tag text will be removed completely. 
 603          """ 
 604          if base_href is not None: 
 605               
 606               
 607              self.make_links_absolute( 
 608                  base_href, resolve_base_href=resolve_base_href) 
 609          elif resolve_base_href: 
 610              self.resolve_base_href() 
 611   
 612          for el, attrib, link, pos in self.iterlinks(): 
 613              new_link = link_repl_func(link.strip()) 
 614              if new_link == link: 
 615                  continue 
 616              if new_link is None: 
 617                   
 618                  if attrib is None: 
 619                      el.text = '' 
 620                  else: 
 621                      del el.attrib[attrib] 
 622                  continue 
 623   
 624              if attrib is None: 
 625                  new = el.text[:pos] + new_link + el.text[pos+len(link):] 
 626                  el.text = new 
 627              else: 
 628                  cur = el.get(attrib) 
 629                  if not pos and len(cur) == len(link): 
 630                      new = new_link   
 631                  else: 
 632                      new = cur[:pos] + new_link + cur[pos+len(link):] 
 633                  el.set(attrib, new) 
  634   
 637      """ 
 638      An object that represents a method on an element as a function; 
 639      the function takes either an element or an HTML string.  It 
 640      returns whatever the function normally returns, or if the function 
 641      works in-place (and so returns None) it returns a serialized form 
 642      of the resulting document. 
 643      """ 
 649          result_type = type(doc) 
 650          if isinstance(doc, basestring): 
 651              if 'copy' in kw: 
 652                  raise TypeError( 
 653                      "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name) 
 654              doc = fromstring(doc, **kw) 
 655          else: 
 656              if 'copy' in kw: 
 657                  make_a_copy = kw.pop('copy') 
 658              else: 
 659                  make_a_copy = self.copy 
 660              if make_a_copy: 
 661                  doc = copy.deepcopy(doc) 
 662          meth = getattr(doc, self.name) 
 663          result = meth(*args, **kw) 
 664           
 665          if result is None: 
 666               
 667              return _transform_result(result_type, doc) 
 668          else: 
 669              return result 
   670   
 671   
# Module-level function counterparts of the element methods of the same name,
# each built by wrapping the method name in _MethodFunc.
# NOTE(review): ``copy`` looks like the default for whether an element
# argument is deep-copied before the method is applied (True for the
# functions that modify links in place) -- confirm against _MethodFunc.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
 682   
 687   
 691   
 692   
 693 -class HtmlEntity(etree.EntityBase, HtmlMixin): 
  695   
 698      """A lookup scheme for HTML Element classes. 
 699   
 700      To create a lookup instance with different Element classes, pass a tag 
 701      name mapping of Element classes in the ``classes`` keyword argument and/or 
 702      a tag name mapping of Mixin classes in the ``mixins`` keyword argument. 
 703      The special key '*' denotes a Mixin class that should be mixed into all 
 704      Element classes. 
 705      """ 
 706      _default_element_classes = {} 
 707   
 708 -    def __init__(self, classes=None, mixins=None): 
  725   
 726 -    def lookup(self, node_type, document, namespace, name): 
   737   
 738   
 739   
 740   
 741   
 742   
 743  _looks_like_full_html_unicode = re.compile( 
 744      unicode(r'^\s*<(?:html|!doctype)'), re.I).match 
 745  _looks_like_full_html_bytes = re.compile( 
 746      r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match 
 761   
 765      """ 
 766      Parses several HTML elements, returning a list of elements. 
 767   
 768      The first item in the list may be a string (though leading 
 769      whitespace is removed).  If no_leading_text is true, then it will 
 770      be an error if there is leading text, and it will always be a list 
 771      of only elements. 
 772   
 773      base_url will set the document's base_url attribute (and the tree's docinfo.URL) 
 774      """ 
 775      if parser is None: 
 776          parser = html_parser 
 777       
 778      if isinstance(html, bytes): 
 779          if not _looks_like_full_html_bytes(html): 
 780               
 781              html = ('<html><body>'.encode('ascii') + html + 
 782                      '</body></html>'.encode('ascii')) 
 783      else: 
 784          if not _looks_like_full_html_unicode(html): 
 785              html = '<html><body>%s</body></html>' % html 
 786      doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) 
 787      assert _nons(doc.tag) == 'html' 
 788      bodies = [e for e in doc if _nons(e.tag) == 'body'] 
 789      assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html)) 
 790      body = bodies[0] 
 791      elements = [] 
 792      if no_leading_text and body.text and body.text.strip(): 
 793          raise etree.ParserError( 
 794              "There is leading text: %r" % body.text) 
 795      if body.text and body.text.strip(): 
 796          elements.append(body.text) 
 797      elements.extend(body) 
 798       
 799       
 800      return elements 
  801   
 805      """ 
 806      Parses a single HTML element; it is an error if there is more than 
 807      one element, or if anything but whitespace precedes or follows the 
 808      element. 
 809   
 810      If ``create_parent`` is true (or is a tag name) then a parent node 
 811      will be created to encapsulate the HTML in a single element.  In this 
 812      case, leading or trailing text is also allowed, as are multiple elements 
 813      as result of the parsing. 
 814   
 815      Passing a ``base_url`` will set the document's ``base_url`` attribute 
 816      (and the tree's docinfo.URL). 
 817      """ 
 818      if parser is None: 
 819          parser = html_parser 
 820   
 821      accept_leading_text = bool(create_parent) 
 822   
 823      elements = fragments_fromstring( 
 824          html, parser=parser, no_leading_text=not accept_leading_text, 
 825          base_url=base_url, **kw) 
 826   
 827      if create_parent: 
 828          if not isinstance(create_parent, basestring): 
 829              create_parent = 'div' 
 830          new_root = Element(create_parent) 
 831          if elements: 
 832              if isinstance(elements[0], basestring): 
 833                  new_root.text = elements[0] 
 834                  del elements[0] 
 835              new_root.extend(elements) 
 836          return new_root 
 837   
 838      if not elements: 
 839          raise etree.ParserError('No elements found') 
 840      if len(elements) > 1: 
 841          raise etree.ParserError( 
 842              "Multiple elements found (%s)" 
 843              % ', '.join([_element_name(e) for e in elements])) 
 844      el = elements[0] 
 845      if el.tail and el.tail.strip(): 
 846          raise etree.ParserError( 
 847              "Element followed by text: %r" % el.tail) 
 848      el.tail = None 
 849      return el 
  850   
 851   
 852 -def fromstring(html, base_url=None, parser=None, **kw): 
  918   
 919   
 920 -def parse(filename_or_url, parser=None, base_url=None, **kw): 
  921      """ 
 922      Parse a filename, URL, or file-like object into an HTML document 
 923      tree.  Note: this returns a tree, not an element.  Use 
 924      ``parse(...).getroot()`` to get the document root. 
 925   
 926      You can override the base URL with the ``base_url`` keyword.  This 
 927      is most useful when parsing from a file-like object. 
 928      """ 
 929      if parser is None: 
 930          parser = html_parser 
 931      return etree.parse(filename_or_url, parser, base_url=base_url, **kw) 
  932   
 941   
 944      if isinstance(el, etree.CommentBase): 
 945          return 'comment' 
 946      elif isinstance(el, basestring): 
 947          return 'string' 
 948      else: 
 949          return _nons(el.tag) 
  950   
1070   
1071   
# Register FormElement under the 'form' key of the default element class map.
HtmlElementClassLookup._default_element_classes['form'] = FormElement
1111   
1114      if not url: 
1115          raise ValueError("cannot submit, no URL provided") 
1116       
1117      try: 
1118          from urllib import urlencode, urlopen 
1119      except ImportError:  
1120          from urllib.request import urlopen 
1121          from urllib.parse import urlencode 
1122      if method == 'GET': 
1123          if '?' in url: 
1124              url += '&' 
1125          else: 
1126              url += '?' 
1127          url += urlencode(values) 
1128          data = None 
1129      else: 
1130          data = urlencode(values) 
1131      return urlopen(url, data) 
 1132   
1135   
1143          raise KeyError( 
1144              "You cannot remove keys from ElementDict") 
 1148          return item in self.inputs 
 1153   
1155          return '<%s for form %s>' % ( 
1156              self.__class__.__name__, 
1157              self.inputs.form._name()) 
  1158   
1225   
1256   
1257   
1258 -class TextareaElement(InputMixin, HtmlElement): 
 1259      """ 
1260      ``<textarea>`` element.  You can get the name with ``.name`` and 
1261      get/set the value with ``.value`` 
1262      """ 
1263      @property 
1265          """ 
1266          Get/set the value (which is the contents of this element) 
1267          """ 
1268          content = self.text or '' 
1269          if self.tag.startswith("{%s}" % XHTML_NAMESPACE): 
1270              serialisation_method = 'xml' 
1271          else: 
1272              serialisation_method = 'html' 
1273          for el in self: 
1274               
1275              content += etree.tostring( 
1276                  el, method=serialisation_method, encoding='unicode') 
1277          return content 
 1278   
    @value.setter
    def value(self, value):
        # Remove all child elements first so that the element's content
        # afterwards consists of the assigned text only.
        del self[:]
        self.text = value
 1283   
1284      @value.deleter 
1286          self.text = '' 
1287          del self[:] 
  1288   
1289   
# Register TextareaElement under the 'textarea' key of the default element
# class map.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1294      """ 
1295      ``<select>`` element.  You can get the name with ``.name``. 
1296   
1297      ``.value`` will be the value of the selected option, unless this 
1298      is a multi-select element (``<select multiple>``), in which case 
1299      it will be a set-like object.  In either case ``.value_options`` 
1300      gives the possible values. 
1301   
1302      The boolean attribute ``.multiple`` shows if this is a 
1303      multi-select. 
1304      """ 
1305      @property 
1307          """ 
1308          Get/set the value of this select (the selected option). 
1309   
1310          If this is a multi-select, this is a set-like object that 
1311          represents all the selected options. 
1312          """ 
1313          if self.multiple: 
1314              return MultipleSelectOptions(self) 
1315          for el in _options_xpath(self): 
1316              if el.get('selected') is not None: 
1317                  value = el.get('value') 
1318                  if value is None: 
1319                      value = el.text or '' 
1320                  if value: 
1321                      value = value.strip() 
1322                  return value 
1323          return None 
 1324   
1325      @value.setter 
1326 -    def value(self, value): 
 1327          if self.multiple: 
1328              if isinstance(value, basestring): 
1329                  raise TypeError("You must pass in a sequence") 
1330              values = self.value 
1331              values.clear() 
1332              values.update(value) 
1333              return 
1334          checked_option = None 
1335          if value is not None: 
1336              value = value.strip() 
1337              for el in _options_xpath(self): 
1338                  opt_value = el.get('value') 
1339                  if opt_value is None: 
1340                      opt_value = el.text or '' 
1341                  if opt_value: 
1342                      opt_value = opt_value.strip() 
1343                  if opt_value == value: 
1344                      checked_option = el 
1345                      break 
1346              else: 
1347                  raise ValueError( 
1348                      "There is no option with the value of %r" % value) 
1349          for el in _options_xpath(self): 
1350              if 'selected' in el.attrib: 
1351                  del el.attrib['selected'] 
1352          if checked_option is not None: 
1353              checked_option.set('selected', '') 
 1354   
1355      @value.deleter 
1362   
1363      @property 
1378   
1379      @property 
1381          """ 
1382          Boolean attribute: is there a ``multiple`` attribute on this element. 
1383          """ 
1384          return 'multiple' in self.attrib 
 1385   
1386      @multiple.setter 
1388          if value: 
1389              self.set('multiple', '') 
1390          elif 'multiple' in self.attrib: 
1391              del self.attrib['multiple'] 
  1392   
1393   
# Map the 'select' tag name to SelectElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1398      """ 
1399      Represents all the selected options in a ``<select multiple>`` element. 
1400   
1401      You can add to this set-like option to select an option, or remove 
1402      to unselect the option. 
1403      """ 
1404   
1406          self.select = select 
 1407   
1408      @property 
1410          """ 
1411          Iterator of all the ``<option>`` elements. 
1412          """ 
1413          return iter(_options_xpath(self.select)) 
 1414   
1416          for option in self.options: 
1417              if 'selected' in option.attrib: 
1418                  opt_value = option.get('value') 
1419                  if opt_value is None: 
1420                      opt_value = option.text or '' 
1421                  if opt_value: 
1422                      opt_value = opt_value.strip() 
1423                  yield opt_value 
 1424   
1425 -    def add(self, item): 
 1426          for option in self.options: 
1427              opt_value = option.get('value') 
1428              if opt_value is None: 
1429                  opt_value = option.text or '' 
1430              if opt_value: 
1431                  opt_value = opt_value.strip() 
1432              if opt_value == item: 
1433                  option.set('selected', '') 
1434                  break 
1435          else: 
1436              raise ValueError( 
1437                  "There is no option with the value %r" % item) 
 1438   
1440          for option in self.options: 
1441              opt_value = option.get('value') 
1442              if opt_value is None: 
1443                  opt_value = option.text or '' 
1444              if opt_value: 
1445                  opt_value = opt_value.strip() 
1446              if opt_value == item: 
1447                  if 'selected' in option.attrib: 
1448                      del option.attrib['selected'] 
1449                  else: 
1450                      raise ValueError( 
1451                          "The option %r is not currently selected" % item) 
1452                  break 
1453          else: 
1454              raise ValueError( 
1455                  "There is not option with the value %r" % item) 
 1456   
1458          return '<%s {%s} for select name=%r>' % ( 
1459              self.__class__.__name__, 
1460              ', '.join([repr(v) for v in self]), 
1461              self.select.name) 
  1462   
1465      """ 
1466      This object represents several ``<input type=radio>`` elements 
1467      that have the same name. 
1468   
1469      You can use this like a list, but also use the property 
1470      ``.value`` to check/uncheck inputs.  Also you can use 
1471      ``.value_options`` to get the possible values. 
1472      """ 
1473      @property 
1475          """ 
1476          Get/set the value, which checks the radio with that value (and 
1477          unchecks any other value). 
1478          """ 
1479          for el in self: 
1480              if 'checked' in el.attrib: 
1481                  return el.get('value') 
1482          return None 
 1483   
1484      @value.setter 
1485 -    def value(self, value): 
 1486          checked_option = None 
1487          if value is not None: 
1488              for el in self: 
1489                  if el.get('value') == value: 
1490                      checked_option = el 
1491                      break 
1492              else: 
1493                  raise ValueError("There is no radio input with the value %r" % value) 
1494          for el in self: 
1495              if 'checked' in el.attrib: 
1496                  del el.attrib['checked'] 
1497          if checked_option is not None: 
1498              checked_option.set('checked', '') 
 1499   
1500      @value.deleter 
1503   
1504      @property 
1506          """ 
1507          Returns a list of all the possible values. 
1508          """ 
1509          return [el.get('value') for el in self] 
 1510   
1512          return '%s(%s)' % ( 
1513              self.__class__.__name__, 
1514              list.__repr__(self)) 
  1515   
1518      """ 
1519      Represents a group of checkboxes (``<input type=checkbox>``) that 
1520      have the same name. 
1521   
1522      In addition to using this like a list, the ``.value`` attribute 
1523      returns a set-like object that you can add to or remove from to 
1524      check and uncheck checkboxes.  You can also use ``.value_options`` 
1525      to get the possible values. 
1526      """ 
1527      @property 
1529          """ 
1530          Return a set-like object that can be modified to check or 
1531          uncheck individual checkboxes according to their value. 
1532          """ 
1533          return CheckboxValues(self) 
 1534   
1535      @value.setter 
1536 -    def value(self, value): 
 1544   
1545      @value.deleter 
1548   
1549      @property 
1551          """ 
1552          Returns a list of all the possible values. 
1553          """ 
1554          return [el.get('value') for el in self] 
 1555   
1557          return '%s(%s)' % ( 
1558              self.__class__.__name__, list.__repr__(self)) 
  1559   
1562      """ 
1563      Represents the values of the checked checkboxes in a group of 
1564      checkboxes with the same name. 
1565      """ 
1566   
1569   
1571          return iter([ 
1572              el.get('value') 
1573              for el in self.group 
1574              if 'checked' in el.attrib]) 
 1575   
1576 -    def add(self, value): 
 1577          for el in self.group: 
1578              if el.get('value') == value: 
1579                  el.set('checked', '') 
1580                  break 
1581          else: 
1582              raise KeyError("No checkbox with value %r" % value) 
 1583   
1585          for el in self.group: 
1586              if el.get('value') == value: 
1587                  if 'checked' in el.attrib: 
1588                      del el.attrib['checked'] 
1589                  else: 
1590                      raise KeyError( 
1591                          "The checkbox with value %r was already unchecked" % value) 
1592                  break 
1593          else: 
1594              raise KeyError( 
1595                  "No checkbox with value %r" % value) 
 1596   
1598          return '<%s {%s} for checkboxes name=%r>' % ( 
1599              self.__class__.__name__, 
1600              ', '.join([repr(v) for v in self]), 
1601              self.group.name) 
  1602   
1696   
1697   
# Map the 'input' tag name to InputElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['input'] = InputElement
1702      """ 
1703      Represents a ``<label>`` element. 
1704   
1705      Label elements are linked to other elements with their ``for`` 
1706      attribute.  You can access this element with ``label.for_element``. 
1707      """ 
1708      @property 
1710          """ 
1711          Get/set the element this label points to.  Return None if it 
1712          can't be found. 
1713          """ 
1714          id = self.get('for') 
1715          if not id: 
1716              return None 
1717          return self.body.get_element_by_id(id) 
 1718   
1719      @for_element.setter 
1721          id = other.get('id') 
1722          if not id: 
1723              raise TypeError( 
1724                  "Element %r has no id attribute" % other) 
1725          self.set('for', id) 
 1726   
1727      @for_element.deleter 
 1732   
1733   
# Map the 'label' tag name to LabelElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1754   
1757      """Convert all tags in an XHTML tree to HTML by removing their 
1758      XHTML namespace. 
1759      """ 
1760      try: 
1761          xhtml = xhtml.getroot() 
1762      except AttributeError: 
1763          pass 
1764      prefix = "{%s}" % XHTML_NAMESPACE 
1765      prefix_len = len(prefix) 
1766      for el in xhtml.iter(prefix + "*"): 
1767          el.tag = el.tag[prefix_len:] 
 1768   
1769   
1770   
1771   
# Bound ``sub`` methods of two equivalent patterns matching a
# ``<meta http-equiv="Content-Type" ...>`` tag: one compiled from a text
# pattern, the other from the same pattern encoded to ASCII bytes, so that
# tostring() can post-process both text and byte-string output.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1776   
1777   
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: when ``method`` is 'html' and ``include_meta_content_type`` is
    false (the default), any ``<meta http-equiv="Content-Type" ...>`` tag
    is stripped from the serialised output.  Pass a true value to keep
    the output of the serialiser untouched.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc,
        method=method,
        pretty_print=pretty_print,
        encoding=encoding,
        with_tail=with_tail,
        doctype=doctype,
    )
    # Only HTML output is post-processed, and only when the caller did not
    # ask to keep the Content-Type meta tag.
    if include_meta_content_type or method != 'html':
        return serialised
    # The serialiser returns text or bytes depending on ``encoding``; pick
    # the substitution helper of the matching string type.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(b'', serialised)
 1850   
1851   
# Rewrite the doctest examples for the running interpreter: the helper
# strips the leading ``u'`` string prefix on Python 3 and the ``b'`` prefix
# on Python 2 from lines of the docstring.
tostring.__doc__ = __fix_docstring(tostring.__doc__)
1856      """ 
1857      Open the HTML document in a web browser, saving it to a temporary 
1858      file to open it.  Note that this does not delete the file after 
1859      use.  This is mainly meant for debugging. 
1860      """ 
1861      import os 
1862      import webbrowser 
1863      import tempfile 
1864      if not isinstance(doc, etree._ElementTree): 
1865          doc = etree.ElementTree(doc) 
1866      handle, fn = tempfile.mkstemp(suffix='.html') 
1867      f = os.fdopen(handle, 'wb') 
1868      try: 
1869          doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8") 
1870      finally: 
1871           
1872          f.close() 
1873      url = 'file://' + fn.replace(os.path.sep, '/') 
1874      print(url) 
1875      webbrowser.open(url) 
 1876   
1877   
1878   
1879   
1880   
1881   
1882 -class HTMLParser(etree.HTMLParser): 
 1883      """An HTML parser that is configured to return lxml.html Element 
1884      objects. 
1885      """ 
 1889   
1892      """An XML parser that is configured to return lxml.html Element 
1893      objects. 
1894   
1895      Note that this parser is not really XHTML aware unless you let it 
1896      load a DTD that declares the HTML entities.  To do this, make sure 
1897      you have the XHTML DTDs installed in your catalogs, and create the 
1898      parser like this:: 
1899   
1900          >>> parser = XHTMLParser(load_dtd=True) 
1901   
1902      If you additionally want to validate the document, use this:: 
1903   
1904          >>> parser = XHTMLParser(dtd_validation=True) 
1905   
1906      For catalog support, see http://www.xmlsoft.org/catalog.html. 
1907      """ 
 1911   
1914      """Create a new HTML Element. 
1915   
1916      This can also be used for XHTML documents. 
1917      """ 
1918      v = html_parser.makeelement(*args, **kw) 
1919      return v 
 1920   
1921   
# Module-level parser instances, created once at import time with default
# settings.  ``html_parser`` is also the parser whose ``makeelement()`` is
# used to create new HTML elements in this module.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
1924