1   
   2   
   3   
   4   
   5   
   6   
   7   
   8   
   9   
  10   
  11   
  12   
  13   
  14   
  15   
  16   
  17   
  18   
  19   
  20   
  21   
  22   
  23   
  24   
  25   
  26   
  27   
  28   
  29   
  30   
  31  """The ``lxml.html`` tool set for HTML handling. 
  32  """ 
  33   
  34  from __future__ import absolute_import 
  35   
  36  __all__ = [ 
  37      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  38      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  39      'find_rel_links', 'find_class', 'make_links_absolute', 
  40      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] 
  41   
  42   
  43  import copy 
  44  import sys 
  45  import re 
  46  from functools import partial 
  47   
  48  try: 
  49       
  50      from collections.abc import MutableMapping, MutableSet 
  51  except ImportError: 
  52      from collections import MutableMapping, MutableSet 
  53   
  54  from .. import etree 
  55  from . import defs 
  56  from ._setmixin import SetMixin 
  57   
  58  try: 
  59      from urlparse import urljoin 
  60  except ImportError: 
  61       
  62      from urllib.parse import urljoin 
  63   
  64  try: 
  65      unicode 
  66  except NameError: 
  67       
  68      unicode = str 
  69  try: 
  70      basestring 
  71  except NameError: 
  72       
  73      basestring = (str, bytes) 
  77      if not s: 
  78          return s 
  79      if sys.version_info[0] >= 3: 
  80          sub = re.compile(r"^(\s*)u'", re.M).sub 
  81      else: 
  82          sub = re.compile(r"^(\s*)b'", re.M).sub 
  83      return sub(r"\1'", s) 
   84   
  85   
  86  XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" 
  87   
  88  _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]", 
  89                                 namespaces={'x':XHTML_NAMESPACE}) 
  90  _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option", 
  91                               namespaces={'x':XHTML_NAMESPACE}) 
  92  _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form", 
  93                             namespaces={'x':XHTML_NAMESPACE}) 
  94   
  95  _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") 
  96  _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") 
  97  _collect_string_content = etree.XPath("string()") 
  98  _iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer 
  99  _iter_css_imports = re.compile(r'@import "(.*?)"').finditer 
 100  _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]", 
 101                             namespaces={'x':XHTML_NAMESPACE}) 
 102  _archive_re = re.compile(r'[^ ]+') 
 103  _parse_meta_refresh_url = re.compile( 
 104      r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search 
 108      if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": 
 109          return s[1:-1], pos+1 
 110      else: 
 111          return s,pos 
  112   
 123   
 130   
 133      """Provides access to an element's class attribute as a set-like collection. 
 134      Usage:: 
 135   
 136          >>> el = fromstring('<p class="hidden large">Text</p>') 
 137          >>> classes = el.classes  # or: classes = Classes(el.attrib) 
 138          >>> classes |= ['block', 'paragraph'] 
 139          >>> el.get('class') 
 140          'hidden large block paragraph' 
 141          >>> classes.toggle('hidden') 
 142          False 
 143          >>> el.get('class') 
 144          'large block paragraph' 
 145          >>> classes -= ('some', 'classes', 'block') 
 146          >>> el.get('class') 
 147          'large paragraph' 
 148      """ 
 150          self._attributes = attributes 
 151          self._get_class_value = partial(attributes.get, 'class', '') 
  152   
 153 -    def add(self, value): 
  154          """ 
 155          Add a class. 
 156   
 157          This has no effect if the class is already present. 
 158          """ 
 159          if not value or re.search(r'\s', value): 
 160              raise ValueError("Invalid class name: %r" % value) 
 161          classes = self._get_class_value().split() 
 162          if value in classes: 
 163              return 
 164          classes.append(value) 
 165          self._attributes['class'] = ' '.join(classes) 
  166   
 168          """ 
 169          Remove a class if it is currently present. 
 170   
 171          If the class is not present, do nothing. 
 172          """ 
 173          if not value or re.search(r'\s', value): 
 174              raise ValueError("Invalid class name: %r" % value) 
 175          classes = [name for name in self._get_class_value().split() 
 176                     if name != value] 
 177          if classes: 
 178              self._attributes['class'] = ' '.join(classes) 
 179          elif 'class' in self._attributes: 
 180              del self._attributes['class'] 
  181   
 183          """ 
 184          Remove a class; it must currently be present. 
 185   
 186          If the class is not present, raise a KeyError. 
 187          """ 
 188          if not value or re.search(r'\s', value): 
 189              raise ValueError("Invalid class name: %r" % value) 
 190          super(Classes, self).remove(value) 
  191   
 195   
 197          return iter(self._get_class_value().split()) 
  198   
 200          return len(self._get_class_value().split()) 
  201   
 202       
 203   
 205          """ 
 206          Add all names from 'values'. 
 207          """ 
 208          classes = self._get_class_value().split() 
 209          extended = False 
 210          for value in values: 
 211              if value not in classes: 
 212                  classes.append(value) 
 213                  extended = True 
 214          if extended: 
 215              self._attributes['class'] = ' '.join(classes) 
  216   
 218          """ 
 219          Add a class name if it isn't there yet, or remove it if it exists. 
 220   
 221          Returns true if the class was added (and is now enabled) and 
 222          false if it was removed (and is now disabled). 
 223          """ 
 224          if not value or re.search(r'\s', value): 
 225              raise ValueError("Invalid class name: %r" % value) 
 226          classes = self._get_class_value().split() 
 227          try: 
 228              classes.remove(value) 
 229              enabled = False 
 230          except ValueError: 
 231              classes.append(value) 
 232              enabled = True 
 233          if classes: 
 234              self._attributes['class'] = ' '.join(classes) 
 235          else: 
 236              del self._attributes['class'] 
 237          return enabled 
   238   
 241   
 242 -    def set(self, key, value=None): 
  243          """set(self, key, value=None) 
 244   
 245          Sets an element attribute.  If no value is provided, or if the value is None, 
 246          creates a 'boolean' attribute without value, e.g. "<form novalidate></form>" 
 247          for ``form.set('novalidate')``. 
 248          """ 
 249          super(HtmlElement, self).set(key, value) 
  250   
 251      @property 
 253          """ 
 254          A set-like wrapper around the 'class' attribute. 
 255          """ 
 256          return Classes(self.attrib) 
  257   
 258      @classes.setter 
 266   
 267      @property 
 269          """ 
 270          Returns the base URL, given when the page was parsed. 
 271   
 272          Use with ``urlparse.urljoin(el.base_url, href)`` to get 
 273          absolute URLs. 
 274          """ 
 275          return self.getroottree().docinfo.URL 
  276   
 277      @property 
 283   
 284      @property 
 286          """ 
 287          Return the <body> element.  Can be called from a child element 
        to get the document's body.
 289          """ 
 290          return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] 
  291   
 292      @property 
 294          """ 
 295          Returns the <head> element.  Can be called from a child 
 296          element to get the document's head. 
 297          """ 
 298          return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] 
  299   
 300      @property 
 302          """ 
 303          Get or set any <label> element associated with this element. 
 304          """ 
 305          id = self.get('id') 
 306          if not id: 
 307              return None 
 308          result = _label_xpath(self, id=id) 
 309          if not result: 
 310              return None 
 311          else: 
 312              return result[0] 
  313   
 314      @label.setter 
 316          id = self.get('id') 
 317          if not id: 
 318              raise TypeError( 
 319                  "You cannot set a label for an element (%r) that has no id" 
 320                  % self) 
 321          if _nons(label.tag) != 'label': 
 322              raise TypeError( 
 323                  "You can only assign label to a label element (not %r)" 
 324                  % label) 
 325          label.set('for', id) 
  326   
 327      @label.deleter 
 332   
 334          """ 
 335          Removes this element from the tree, including its children and 
 336          text.  The tail text is joined to the previous element or 
 337          parent. 
 338          """ 
 339          parent = self.getparent() 
 340          assert parent is not None 
 341          if self.tail: 
 342              previous = self.getprevious() 
 343              if previous is None: 
 344                  parent.text = (parent.text or '') + self.tail 
 345              else: 
 346                  previous.tail = (previous.tail or '') + self.tail 
 347          parent.remove(self) 
  348   
 350          """ 
 351          Remove the tag, but not its children or text.  The children and text 
 352          are merged into the parent. 
 353   
 354          Example:: 
 355   
 356              >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') 
 357              >>> h.find('.//b').drop_tag() 
 358              >>> print(tostring(h, encoding='unicode')) 
 359              <div>Hello World!</div> 
 360          """ 
 361          parent = self.getparent() 
 362          assert parent is not None 
 363          previous = self.getprevious() 
 364          if self.text and isinstance(self.tag, basestring): 
 365               
 366              if previous is None: 
 367                  parent.text = (parent.text or '') + self.text 
 368              else: 
 369                  previous.tail = (previous.tail or '') + self.text 
 370          if self.tail: 
 371              if len(self): 
 372                  last = self[-1] 
 373                  last.tail = (last.tail or '') + self.tail 
 374              elif previous is None: 
 375                  parent.text = (parent.text or '') + self.tail 
 376              else: 
 377                  previous.tail = (previous.tail or '') + self.tail 
 378          index = parent.index(self) 
 379          parent[index:index+1] = self[:] 
  380   
 382          """ 
 383          Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements. 
 384          """ 
 385          rel = rel.lower() 
 386          return [el for el in _rel_links_xpath(self) 
 387                  if el.get('rel').lower() == rel] 
  388   
 390          """ 
 391          Find any elements with the given class name. 
 392          """ 
 393          return _class_xpath(self, class_name=class_name) 
  394   
 396          """ 
 397          Get the first element in a document with the given id.  If none is 
 398          found, return the default argument if provided or raise KeyError 
 399          otherwise. 
 400   
 401          Note that there can be more than one element with the same id, 
 402          and this isn't uncommon in HTML documents found in the wild. 
 403          Browsers return only the first match, and this function does 
 404          the same. 
 405          """ 
 406          try: 
 407               
 408               
 409              return _id_xpath(self, id=id)[0] 
 410          except IndexError: 
 411              if default: 
 412                  return default[0] 
 413              else: 
 414                  raise KeyError(id) 
  415   
 416 -    def text_content(self): 
  417          """ 
 418          Return the text content of the tag (and the text in any children). 
 419          """ 
 420          return _collect_string_content(self) 
  421   
 422 -    def cssselect(self, expr, translator='html'): 
  423          """ 
 424          Run the CSS expression on this element and its children, 
 425          returning a list of the results. 
 426   
 427          Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) 
 428          -- note that pre-compiling the expression can provide a substantial 
 429          speedup. 
 430          """ 
 431           
 432          from lxml.cssselect import CSSSelector 
 433          return CSSSelector(expr, translator=translator)(self) 
  434   
 435       
 436       
 437       
 438   
 439 -    def make_links_absolute(self, base_url=None, resolve_base_href=True, 
 440                              handle_failures=None): 
  441          """ 
 442          Make all links in the document absolute, given the 
 443          ``base_url`` for the document (the full URL where the document 
 444          came from), or if no ``base_url`` is given, then the ``.base_url`` 
 445          of the document. 
 446   
 447          If ``resolve_base_href`` is true, then any ``<base href>`` 
 448          tags in the document are used *and* removed from the document. 
 449          If it is false then any such tag is ignored. 
 450   
 451          If ``handle_failures`` is None (default), a failure to process 
 452          a URL will abort the processing.  If set to 'ignore', errors 
 453          are ignored.  If set to 'discard', failing URLs will be removed. 
 454          """ 
 455          if base_url is None: 
 456              base_url = self.base_url 
 457              if base_url is None: 
 458                  raise TypeError( 
 459                      "No base_url given, and the document has no base_url") 
 460          if resolve_base_href: 
 461              self.resolve_base_href() 
 462   
 463          if handle_failures == 'ignore': 
 464              def link_repl(href): 
 465                  try: 
 466                      return urljoin(base_url, href) 
 467                  except ValueError: 
 468                      return href 
  469          elif handle_failures == 'discard': 
 470              def link_repl(href): 
 471                  try: 
 472                      return urljoin(base_url, href) 
 473                  except ValueError: 
 474                      return None 
  475          elif handle_failures is None: 
 476              def link_repl(href): 
 477                  return urljoin(base_url, href) 
 478          else: 
 479              raise ValueError( 
 480                  "unexpected value for handle_failures: %r" % handle_failures) 
 481   
 482          self.rewrite_links(link_repl) 
 483   
 485          """ 
 486          Find any ``<base href>`` tag in the document, and apply its 
 487          values to all links found in the document.  Also remove the 
 488          tag once it has been applied. 
 489   
 490          If ``handle_failures`` is None (default), a failure to process 
 491          a URL will abort the processing.  If set to 'ignore', errors 
 492          are ignored.  If set to 'discard', failing URLs will be removed. 
 493          """ 
 494          base_href = None 
 495          basetags = self.xpath('//base[@href]|//x:base[@href]', 
 496                                namespaces={'x': XHTML_NAMESPACE}) 
 497          for b in basetags: 
 498              base_href = b.get('href') 
 499              b.drop_tree() 
 500          if not base_href: 
 501              return 
 502          self.make_links_absolute(base_href, resolve_base_href=False, 
 503                                   handle_failures=handle_failures) 
  504   
 506          """ 
 507          Yield (element, attribute, link, pos), where attribute may be None 
 508          (indicating the link is in the text).  ``pos`` is the position 
 509          where the link occurs; often 0, but sometimes something else in 
 510          the case of links in stylesheets or style tags. 
 511   
 512          Note: <base href> is *not* taken into account in any way.  The 
 513          link you get is exactly the link in the document. 
 514   
 515          Note: multiple links inside of a single text string or 
 516          attribute value are returned in reversed order.  This makes it 
 517          possible to replace or delete them from the text string value 
 518          based on their reported text positions.  Otherwise, a 
 519          modification at one text position can change the positions of 
 520          links reported later on. 
 521          """ 
 522          link_attrs = defs.link_attrs 
 523          for el in self.iter(etree.Element): 
 524              attribs = el.attrib 
 525              tag = _nons(el.tag) 
 526              if tag == 'object': 
 527                  codebase = None 
 528                   
 529                   
 530                  if 'codebase' in attribs: 
 531                      codebase = el.get('codebase') 
 532                      yield (el, 'codebase', codebase, 0) 
 533                  for attrib in ('classid', 'data'): 
 534                      if attrib in attribs: 
 535                          value = el.get(attrib) 
 536                          if codebase is not None: 
 537                              value = urljoin(codebase, value) 
 538                          yield (el, attrib, value, 0) 
 539                  if 'archive' in attribs: 
 540                      for match in _archive_re.finditer(el.get('archive')): 
 541                          value = match.group(0) 
 542                          if codebase is not None: 
 543                              value = urljoin(codebase, value) 
 544                          yield (el, 'archive', value, match.start()) 
 545              else: 
 546                  for attrib in link_attrs: 
 547                      if attrib in attribs: 
 548                          yield (el, attrib, attribs[attrib], 0) 
 549              if tag == 'meta': 
 550                  http_equiv = attribs.get('http-equiv', '').lower() 
 551                  if http_equiv == 'refresh': 
 552                      content = attribs.get('content', '') 
 553                      match = _parse_meta_refresh_url(content) 
 554                      url = (match.group('url') if match else content).strip() 
 555                       
 556                       
 557                      if url: 
 558                          url, pos = _unquote_match( 
 559                              url, match.start('url') if match else content.find(url)) 
 560                          yield (el, 'content', url, pos) 
 561              elif tag == 'param': 
 562                  valuetype = el.get('valuetype') or '' 
 563                  if valuetype.lower() == 'ref': 
 564                       
 565                       
 566                       
 567                       
 568                       
 569                       
 570                      yield (el, 'value', el.get('value'), 0) 
 571              elif tag == 'style' and el.text: 
 572                  urls = [ 
 573                       
 574                      _unquote_match(match.group(1), match.start(1))[::-1] 
 575                      for match in _iter_css_urls(el.text) 
 576                      ] + [ 
 577                      (match.start(1), match.group(1)) 
 578                      for match in _iter_css_imports(el.text) 
 579                      ] 
 580                  if urls: 
 581                       
 582                       
 583                       
 584                      urls.sort(reverse=True) 
 585                      for start, url in urls: 
 586                          yield (el, None, url, start) 
 587              if 'style' in attribs: 
 588                  urls = list(_iter_css_urls(attribs['style'])) 
 589                  if urls: 
 590                       
 591                      for match in urls[::-1]: 
 592                          url, start = _unquote_match(match.group(1), match.start(1)) 
 593                          yield (el, 'style', url, start) 
  594   
 595 -    def rewrite_links(self, link_repl_func, resolve_base_href=True, 
 596                        base_href=None): 
  597          """ 
 598          Rewrite all the links in the document.  For each link 
 599          ``link_repl_func(link)`` will be called, and the return value 
 600          will replace the old link. 
 601   
 602          Note that links may not be absolute (unless you first called 
 603          ``make_links_absolute()``), and may be internal (e.g., 
 604          ``'#anchor'``).  They can also be values like 
 605          ``'mailto:email'`` or ``'javascript:expr'``. 
 606   
 607          If you give ``base_href`` then all links passed to 
 608          ``link_repl_func()`` will take that into account. 
 609   
 610          If the ``link_repl_func`` returns None, the attribute or 
 611          tag text will be removed completely. 
 612          """ 
 613          if base_href is not None: 
 614               
 615               
 616              self.make_links_absolute( 
 617                  base_href, resolve_base_href=resolve_base_href) 
 618          elif resolve_base_href: 
 619              self.resolve_base_href() 
 620   
 621          for el, attrib, link, pos in self.iterlinks(): 
 622              new_link = link_repl_func(link.strip()) 
 623              if new_link == link: 
 624                  continue 
 625              if new_link is None: 
 626                   
 627                  if attrib is None: 
 628                      el.text = '' 
 629                  else: 
 630                      del el.attrib[attrib] 
 631                  continue 
 632   
 633              if attrib is None: 
 634                  new = el.text[:pos] + new_link + el.text[pos+len(link):] 
 635                  el.text = new 
 636              else: 
 637                  cur = el.get(attrib) 
 638                  if not pos and len(cur) == len(link): 
 639                      new = new_link   
 640                  else: 
 641                      new = cur[:pos] + new_link + cur[pos+len(link):] 
 642                  el.set(attrib, new) 
  643   
 646      """ 
 647      An object that represents a method on an element as a function; 
 648      the function takes either an element or an HTML string.  It 
 649      returns whatever the function normally returns, or if the function 
 650      works in-place (and so returns None) it returns a serialized form 
 651      of the resulting document. 
 652      """ 
 658          result_type = type(doc) 
 659          if isinstance(doc, basestring): 
 660              if 'copy' in kw: 
 661                  raise TypeError( 
 662                      "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name) 
 663              doc = fromstring(doc, **kw) 
 664          else: 
 665              if 'copy' in kw: 
 666                  make_a_copy = kw.pop('copy') 
 667              else: 
 668                  make_a_copy = self.copy 
 669              if make_a_copy: 
 670                  doc = copy.deepcopy(doc) 
 671          meth = getattr(doc, self.name) 
 672          result = meth(*args, **kw) 
 673           
 674          if result is None: 
 675               
 676              return _transform_result(result_type, doc) 
 677          else: 
 678              return result 
   679   
 680   
 681  find_rel_links = _MethodFunc('find_rel_links', copy=False) 
 682  find_class = _MethodFunc('find_class', copy=False) 
 683  make_links_absolute = _MethodFunc('make_links_absolute', copy=True) 
 684  resolve_base_href = _MethodFunc('resolve_base_href', copy=True) 
 685  iterlinks = _MethodFunc('iterlinks', copy=False) 
 686  rewrite_links = _MethodFunc('rewrite_links', copy=True) 
 691   
 697   
 701   
 702   
 703 -class HtmlEntity(etree.EntityBase, HtmlMixin): 
  705   
 708      """A lookup scheme for HTML Element classes. 
 709   
 710      To create a lookup instance with different Element classes, pass a tag 
 711      name mapping of Element classes in the ``classes`` keyword argument and/or 
 712      a tag name mapping of Mixin classes in the ``mixins`` keyword argument. 
 713      The special key '*' denotes a Mixin class that should be mixed into all 
 714      Element classes. 
 715      """ 
 716      _default_element_classes = {} 
 717   
 718 -    def __init__(self, classes=None, mixins=None): 
  735   
 736 -    def lookup(self, node_type, document, namespace, name): 
   747   
 748   
 749   
 750   
 751   
 752   
 753  _looks_like_full_html_unicode = re.compile( 
 754      unicode(r'^\s*<(?:html|!doctype)'), re.I).match 
 755  _looks_like_full_html_bytes = re.compile( 
 756      r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match 
 771   
 775      """Parses several HTML elements, returning a list of elements. 
 776   
 777      The first item in the list may be a string. 
 778      If no_leading_text is true, then it will be an error if there is 
 779      leading text, and it will always be a list of only elements. 
 780   
 781      base_url will set the document's base_url attribute 
 782      (and the tree's docinfo.URL). 
 783      """ 
 784      if parser is None: 
 785          parser = html_parser 
 786       
 787      if isinstance(html, bytes): 
 788          if not _looks_like_full_html_bytes(html): 
 789               
 790              html = ('<html><body>'.encode('ascii') + html + 
 791                      '</body></html>'.encode('ascii')) 
 792      else: 
 793          if not _looks_like_full_html_unicode(html): 
 794              html = '<html><body>%s</body></html>' % html 
 795      doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) 
 796      assert _nons(doc.tag) == 'html' 
 797      bodies = [e for e in doc if _nons(e.tag) == 'body'] 
 798      assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html)) 
 799      body = bodies[0] 
 800      elements = [] 
 801      if no_leading_text and body.text and body.text.strip(): 
 802          raise etree.ParserError( 
 803              "There is leading text: %r" % body.text) 
 804      if body.text and body.text.strip(): 
 805          elements.append(body.text) 
 806      elements.extend(body) 
 807       
 808       
 809      return elements 
  810   
 814      """ 
 815      Parses a single HTML element; it is an error if there is more than 
 816      one element, or if anything but whitespace precedes or follows the 
 817      element. 
 818   
 819      If ``create_parent`` is true (or is a tag name) then a parent node 
 820      will be created to encapsulate the HTML in a single element.  In this 
 821      case, leading or trailing text is also allowed, as are multiple elements 
 822      as result of the parsing. 
 823   
 824      Passing a ``base_url`` will set the document's ``base_url`` attribute 
 825      (and the tree's docinfo.URL). 
 826      """ 
 827      if parser is None: 
 828          parser = html_parser 
 829   
 830      accept_leading_text = bool(create_parent) 
 831   
 832      elements = fragments_fromstring( 
 833          html, parser=parser, no_leading_text=not accept_leading_text, 
 834          base_url=base_url, **kw) 
 835   
 836      if create_parent: 
 837          if not isinstance(create_parent, basestring): 
 838              create_parent = 'div' 
 839          new_root = Element(create_parent) 
 840          if elements: 
 841              if isinstance(elements[0], basestring): 
 842                  new_root.text = elements[0] 
 843                  del elements[0] 
 844              new_root.extend(elements) 
 845          return new_root 
 846   
 847      if not elements: 
 848          raise etree.ParserError('No elements found') 
 849      if len(elements) > 1: 
 850          raise etree.ParserError( 
 851              "Multiple elements found (%s)" 
 852              % ', '.join([_element_name(e) for e in elements])) 
 853      el = elements[0] 
 854      if el.tail and el.tail.strip(): 
 855          raise etree.ParserError( 
 856              "Element followed by text: %r" % el.tail) 
 857      el.tail = None 
 858      return el 
  859   
 860   
 861 -def fromstring(html, base_url=None, parser=None, **kw): 
  927   
 928   
 929 -def parse(filename_or_url, parser=None, base_url=None, **kw): 
  930      """ 
 931      Parse a filename, URL, or file-like object into an HTML document 
 932      tree.  Note: this returns a tree, not an element.  Use 
 933      ``parse(...).getroot()`` to get the document root. 
 934   
 935      You can override the base URL with the ``base_url`` keyword.  This 
 936      is most useful when parsing from a file-like object. 
 937      """ 
 938      if parser is None: 
 939          parser = html_parser 
 940      return etree.parse(filename_or_url, parser, base_url=base_url, **kw) 
  941   
 950   
 953      if isinstance(el, etree.CommentBase): 
 954          return 'comment' 
 955      elif isinstance(el, basestring): 
 956          return 'string' 
 957      else: 
 958          return _nons(el.tag) 
  959   
1079   
1080   
1081  HtmlElementClassLookup._default_element_classes['form'] = FormElement 
1120   
1123      if not url: 
1124          raise ValueError("cannot submit, no URL provided") 
1125       
1126      try: 
1127          from urllib import urlencode, urlopen 
1128      except ImportError:  
1129          from urllib.request import urlopen 
1130          from urllib.parse import urlencode 
1131      if method == 'GET': 
1132          if '?' in url: 
1133              url += '&' 
1134          else: 
1135              url += '?' 
1136          url += urlencode(values) 
1137          data = None 
1138      else: 
1139          data = urlencode(values) 
1140          if not isinstance(data, bytes): 
1141              data = data.encode('ASCII') 
1142      return urlopen(url, data) 
 1143   
1146   
1154          raise KeyError( 
1155              "You cannot remove keys from ElementDict") 
 1159          return item in self.inputs 
 1164   
1166          return '<%s for form %s>' % ( 
1167              self.__class__.__name__, 
1168              self.inputs.form._name()) 
  1169   
1236   
1267   
1268   
1269 -class TextareaElement(InputMixin, HtmlElement): 
 1270      """ 
1271      ``<textarea>`` element.  You can get the name with ``.name`` and 
1272      get/set the value with ``.value`` 
1273      """ 
1274      @property 
1276          """ 
1277          Get/set the value (which is the contents of this element) 
1278          """ 
1279          content = self.text or '' 
1280          if self.tag.startswith("{%s}" % XHTML_NAMESPACE): 
1281              serialisation_method = 'xml' 
1282          else: 
1283              serialisation_method = 'html' 
1284          for el in self: 
1285               
1286              content += etree.tostring( 
1287                  el, method=serialisation_method, encoding='unicode') 
1288          return content 
 1289   
1290      @value.setter 
1291 -    def value(self, value): 
 1292          del self[:] 
1293          self.text = value 
 1294   
1295      @value.deleter 
1297          self.text = '' 
1298          del self[:] 
  1299   
1300   
# Register TextareaElement under the 'textarea' key of the lookup's
# default element classes.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1305      """ 
1306      ``<select>`` element.  You can get the name with ``.name``. 
1307   
1308      ``.value`` will be the value of the selected option, unless this 
1309      is a multi-select element (``<select multiple>``), in which case 
1310      it will be a set-like object.  In either case ``.value_options`` 
1311      gives the possible values. 
1312   
1313      The boolean attribute ``.multiple`` shows if this is a 
1314      multi-select. 
1315      """ 
1316      @property 
1318          """ 
1319          Get/set the value of this select (the selected option). 
1320   
1321          If this is a multi-select, this is a set-like object that 
1322          represents all the selected options. 
1323          """ 
1324          if self.multiple: 
1325              return MultipleSelectOptions(self) 
1326          options = _options_xpath(self) 
1327   
1328          try: 
1329              selected_option = next(el for el in reversed(options) if el.get('selected') is not None) 
1330          except StopIteration: 
1331              try: 
1332                  selected_option = next(el for el in options if el.get('disabled') is None) 
1333              except StopIteration: 
1334                  return None 
1335          value = selected_option.get('value') 
1336          if value is None: 
1337              value = (selected_option.text or '').strip() 
1338          return value 
 1339   
1340      @value.setter 
1341 -    def value(self, value): 
 1342          if self.multiple: 
1343              if isinstance(value, basestring): 
1344                  raise TypeError("You must pass in a sequence") 
1345              values = self.value 
1346              values.clear() 
1347              values.update(value) 
1348              return 
1349          checked_option = None 
1350          if value is not None: 
1351              for el in _options_xpath(self): 
1352                  opt_value = el.get('value') 
1353                  if opt_value is None: 
1354                      opt_value = (el.text or '').strip() 
1355                  if opt_value == value: 
1356                      checked_option = el 
1357                      break 
1358              else: 
1359                  raise ValueError( 
1360                      "There is no option with the value of %r" % value) 
1361          for el in _options_xpath(self): 
1362              if 'selected' in el.attrib: 
1363                  del el.attrib['selected'] 
1364          if checked_option is not None: 
1365              checked_option.set('selected', '') 
 1366   
1367      @value.deleter 
1374   
1375      @property 
1388   
1389      @property 
1391          """ 
1392          Boolean attribute: is there a ``multiple`` attribute on this element. 
1393          """ 
1394          return 'multiple' in self.attrib 
 1395   
1396      @multiple.setter 
1398          if value: 
1399              self.set('multiple', '') 
1400          elif 'multiple' in self.attrib: 
1401              del self.attrib['multiple'] 
  1402   
1403   
# Register SelectElement under the 'select' key of the lookup's default
# element classes.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1408      """ 
1409      Represents all the selected options in a ``<select multiple>`` element. 
1410   
    You can add to this set-like object to select an option, or remove
    from it to unselect the option.
1413      """ 
1414   
1416          self.select = select 
 1417   
1418      @property 
1420          """ 
1421          Iterator of all the ``<option>`` elements. 
1422          """ 
1423          return iter(_options_xpath(self.select)) 
 1424   
1426          for option in self.options: 
1427              if 'selected' in option.attrib: 
1428                  opt_value = option.get('value') 
1429                  if opt_value is None: 
1430                      opt_value = (option.text or '').strip() 
1431                  yield opt_value 
 1432   
1433 -    def add(self, item): 
 1434          for option in self.options: 
1435              opt_value = option.get('value') 
1436              if opt_value is None: 
1437                  opt_value = (option.text or '').strip() 
1438              if opt_value == item: 
1439                  option.set('selected', '') 
1440                  break 
1441          else: 
1442              raise ValueError( 
1443                  "There is no option with the value %r" % item) 
 1444   
1446          for option in self.options: 
1447              opt_value = option.get('value') 
1448              if opt_value is None: 
1449                  opt_value = (option.text or '').strip() 
1450              if opt_value == item: 
1451                  if 'selected' in option.attrib: 
1452                      del option.attrib['selected'] 
1453                  else: 
1454                      raise ValueError( 
1455                          "The option %r is not currently selected" % item) 
1456                  break 
1457          else: 
1458              raise ValueError( 
1459                  "There is not option with the value %r" % item) 
 1460   
1462          return '<%s {%s} for select name=%r>' % ( 
1463              self.__class__.__name__, 
1464              ', '.join([repr(v) for v in self]), 
1465              self.select.name) 
  1466   
1469      """ 
1470      This object represents several ``<input type=radio>`` elements 
1471      that have the same name. 
1472   
1473      You can use this like a list, but also use the property 
1474      ``.value`` to check/uncheck inputs.  Also you can use 
1475      ``.value_options`` to get the possible values. 
1476      """ 
1477      @property 
1479          """ 
1480          Get/set the value, which checks the radio with that value (and 
1481          unchecks any other value). 
1482          """ 
1483          for el in self: 
1484              if 'checked' in el.attrib: 
1485                  return el.get('value') 
1486          return None 
 1487   
1488      @value.setter 
1489 -    def value(self, value): 
 1490          checked_option = None 
1491          if value is not None: 
1492              for el in self: 
1493                  if el.get('value') == value: 
1494                      checked_option = el 
1495                      break 
1496              else: 
1497                  raise ValueError("There is no radio input with the value %r" % value) 
1498          for el in self: 
1499              if 'checked' in el.attrib: 
1500                  del el.attrib['checked'] 
1501          if checked_option is not None: 
1502              checked_option.set('checked', '') 
 1503   
1504      @value.deleter 
1507   
1508      @property 
1510          """ 
1511          Returns a list of all the possible values. 
1512          """ 
1513          return [el.get('value') for el in self] 
 1514   
1516          return '%s(%s)' % ( 
1517              self.__class__.__name__, 
1518              list.__repr__(self)) 
  1519   
1522      """ 
1523      Represents a group of checkboxes (``<input type=checkbox>``) that 
1524      have the same name. 
1525   
1526      In addition to using this like a list, the ``.value`` attribute 
1527      returns a set-like object that you can add to or remove from to 
1528      check and uncheck checkboxes.  You can also use ``.value_options`` 
1529      to get the possible values. 
1530      """ 
1531      @property 
1533          """ 
1534          Return a set-like object that can be modified to check or 
1535          uncheck individual checkboxes according to their value. 
1536          """ 
1537          return CheckboxValues(self) 
 1538   
1539      @value.setter 
1540 -    def value(self, value): 
 1548   
1549      @value.deleter 
1552   
1553      @property 
1555          """ 
1556          Returns a list of all the possible values. 
1557          """ 
1558          return [el.get('value') for el in self] 
 1559   
1561          return '%s(%s)' % ( 
1562              self.__class__.__name__, list.__repr__(self)) 
  1563   
1566      """ 
1567      Represents the values of the checked checkboxes in a group of 
1568      checkboxes with the same name. 
1569      """ 
1570   
1573   
1575          return iter([ 
1576              el.get('value') 
1577              for el in self.group 
1578              if 'checked' in el.attrib]) 
 1579   
1580 -    def add(self, value): 
 1581          for el in self.group: 
1582              if el.get('value') == value: 
1583                  el.set('checked', '') 
1584                  break 
1585          else: 
1586              raise KeyError("No checkbox with value %r" % value) 
 1587   
1589          for el in self.group: 
1590              if el.get('value') == value: 
1591                  if 'checked' in el.attrib: 
1592                      del el.attrib['checked'] 
1593                  else: 
1594                      raise KeyError( 
1595                          "The checkbox with value %r was already unchecked" % value) 
1596                  break 
1597          else: 
1598              raise KeyError( 
1599                  "No checkbox with value %r" % value) 
 1600   
1602          return '<%s {%s} for checkboxes name=%r>' % ( 
1603              self.__class__.__name__, 
1604              ', '.join([repr(v) for v in self]), 
1605              self.group.name) 
  1606   
1700   
1701   
# Register InputElement under the 'input' key of the lookup's default
# element classes.
HtmlElementClassLookup._default_element_classes['input'] = InputElement
1706      """ 
1707      Represents a ``<label>`` element. 
1708   
1709      Label elements are linked to other elements with their ``for`` 
1710      attribute.  You can access this element with ``label.for_element``. 
1711      """ 
1712      @property 
1714          """ 
1715          Get/set the element this label points to.  Return None if it 
1716          can't be found. 
1717          """ 
1718          id = self.get('for') 
1719          if not id: 
1720              return None 
1721          return self.body.get_element_by_id(id) 
 1722   
1723      @for_element.setter 
1725          id = other.get('id') 
1726          if not id: 
1727              raise TypeError( 
1728                  "Element %r has no id attribute" % other) 
1729          self.set('for', id) 
 1730   
1731      @for_element.deleter 
 1736   
1737   
# Register LabelElement under the 'label' key of the lookup's default
# element classes.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1758   
1761      """Convert all tags in an XHTML tree to HTML by removing their 
1762      XHTML namespace. 
1763      """ 
1764      try: 
1765          xhtml = xhtml.getroot() 
1766      except AttributeError: 
1767          pass 
1768      prefix = "{%s}" % XHTML_NAMESPACE 
1769      prefix_len = len(prefix) 
1770      for el in xhtml.iter(prefix + "*"): 
1771          el.tag = el.tag[prefix_len:] 
 1772   
1773   
1774   
1775   
# Bound ``sub`` methods of two compiled patterns (text and bytes flavour)
# that match a ``<meta http-equiv="Content-Type" ...>`` tag.  tostring()
# uses them to strip that tag from serialised HTML.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1780   
1781   
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: unless ``include_meta_content_type`` is true, any
    ``<meta http-equiv="Content-Type" ...>`` tag is stripped from the
    result when serialising with ``method='html'``.  If it is true, the
    output of the underlying serialiser is returned unchanged, including
    any such tag it contains.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc,
        method=method,
        pretty_print=pretty_print,
        encoding=encoding,
        with_tail=with_tail,
        doctype=doctype,
    )
    # Only HTML output is post-processed, and only when the caller did not
    # ask to keep the meta content-type tag.
    if include_meta_content_type or method != 'html':
        return serialised
    # The serialiser may hand back text or bytes; use the matching pattern.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
 1854   
1855   
# Adapt the doctest output literals to the running Python version:
# __fix_docstring drops the ``u'`` prefix on Python 3 and the ``b'``
# prefix otherwise.
tostring.__doc__ = __fix_docstring(tostring.__doc__)
1860      """ 
1861      Open the HTML document in a web browser, saving it to a temporary 
1862      file to open it.  Note that this does not delete the file after 
1863      use.  This is mainly meant for debugging. 
1864      """ 
1865      import os 
1866      import webbrowser 
1867      import tempfile 
1868      if not isinstance(doc, etree._ElementTree): 
1869          doc = etree.ElementTree(doc) 
1870      handle, fn = tempfile.mkstemp(suffix='.html') 
1871      f = os.fdopen(handle, 'wb') 
1872      try: 
1873          doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8") 
1874      finally: 
1875           
1876          f.close() 
1877      url = 'file://' + fn.replace(os.path.sep, '/') 
1878      print(url) 
1879      webbrowser.open(url) 
 1880   
1881   
1882   
1883   
1884   
1885   
1886 -class HTMLParser(etree.HTMLParser): 
 1887      """An HTML parser that is configured to return lxml.html Element 
1888      objects. 
1889      """ 
 1893   
1896      """An XML parser that is configured to return lxml.html Element 
1897      objects. 
1898   
1899      Note that this parser is not really XHTML aware unless you let it 
1900      load a DTD that declares the HTML entities.  To do this, make sure 
1901      you have the XHTML DTDs installed in your catalogs, and create the 
1902      parser like this:: 
1903   
1904          >>> parser = XHTMLParser(load_dtd=True) 
1905   
1906      If you additionally want to validate the document, use this:: 
1907   
1908          >>> parser = XHTMLParser(dtd_validation=True) 
1909   
1910      For catalog support, see http://www.xmlsoft.org/catalog.html. 
1911      """ 
 1915   
1918      """Create a new HTML Element. 
1919   
1920      This can also be used for XHTML documents. 
1921      """ 
1922      v = html_parser.makeelement(*args, **kw) 
1923      return v 
 1924   
1925   
# Module-level parser instances.  Element() creates its elements through
# ``html_parser.makeelement``.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
1928