1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  try: 
 10      from urlparse import urlsplit 
 11  except ImportError: 
 12       
 13      from urllib.parse import urlsplit 
 14  from lxml import etree 
 15  from lxml.html import defs 
 16  from lxml.html import fromstring, XHTML_NAMESPACE 
 17  from lxml.html import xhtml_to_html, _transform_result 
 18   
 19  try: 
 20      unichr 
 21  except NameError: 
 22       
 23      unichr = chr 
 24  try: 
 25      unicode 
 26  except NameError: 
 27       
 28      unicode = str 
 29  try: 
 30      bytes 
 31  except NameError: 
 32       
 33      bytes = str 
 34  try: 
 35      basestring 
 36  except NameError: 
 37      basestring = (str, bytes) 
 38   
 39   
 40  __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', 
 41             'word_break', 'word_break_html'] 
 42   
 43   
 44   
 45   
 46   
 47   
 48   
 49   
 50   
 51   
 52   
 53   
 54   
 55   
 56   
 57   
 58   
 59   
 60   
 61   
 62   
 63   
 64  _css_javascript_re = re.compile( 
 65      r'expression\s*\(.*?\)', re.S|re.I) 
 66   
 67   
 68  _css_import_re = re.compile( 
 69      r'@\s*import', re.I) 
 70   
 71   
 72   
 73  _is_image_dataurl = re.compile( 
 74      r'^data:image/.+;base64', re.I).search 
 75  _is_possibly_malicious_scheme = re.compile( 
 76      r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):', 
 77      re.I).search 
 82   
 83  _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub 
 84   
 85   
 86   
 87  _conditional_comment_re = re.compile( 
 88      r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) 
 89   
 90  _find_styled_elements = etree.XPath( 
 91      "descendant-or-self::*[@style]") 
 92   
 93  _find_external_links = etree.XPath( 
 94      ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" 
 95       "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), 
 96      namespaces={'x':XHTML_NAMESPACE}) 
 97   
 98   
100      """ 
101      Instances cleans the document of each of the possible offending 
102      elements.  The cleaning is controlled by attributes; you can 
103      override attributes in a subclass, or set them in the constructor. 
104   
105      ``scripts``: 
106          Removes any ``<script>`` tags. 
107   
108      ``javascript``: 
109          Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets 
110          as they could contain Javascript. 
111   
112      ``comments``: 
113          Removes any comments. 
114   
115      ``style``: 
116          Removes any style tags. 
117   
118      ``inline_style`` 
119          Removes any style attributes.  Defaults to the value of the ``style`` option. 
120   
121      ``links``: 
122          Removes any ``<link>`` tags 
123   
124      ``meta``: 
125          Removes any ``<meta>`` tags 
126   
127      ``page_structure``: 
128          Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. 
129   
130      ``processing_instructions``: 
131          Removes any processing instructions. 
132   
133      ``embedded``: 
134          Removes any embedded objects (flash, iframes) 
135   
136      ``frames``: 
137          Removes any frame-related tags 
138   
139      ``forms``: 
140          Removes any form tags 
141   
142      ``annoying_tags``: 
143          Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>`` 
144   
145      ``remove_tags``: 
146          A list of tags to remove.  Only the tags will be removed, 
147          their content will get pulled up into the parent tag. 
148   
149      ``kill_tags``: 
150          A list of tags to kill.  Killing also removes the tag's content, 
151          i.e. the whole subtree, not just the tag itself. 
152   
153      ``allow_tags``: 
154          A list of tags to include (default include all). 
155   
156      ``remove_unknown_tags``: 
157          Remove any tags that aren't standard parts of HTML. 
158   
159      ``safe_attrs_only``: 
160          If true, only include 'safe' attributes (specifically the list 
161          from the feedparser HTML sanitisation web site). 
162   
163      ``safe_attrs``: 
164          A set of attribute names to override the default list of attributes 
165          considered 'safe' (when safe_attrs_only=True). 
166   
167      ``add_nofollow``: 
168          If true, then any <a> tags will have ``rel="nofollow"`` added to them. 
169   
170      ``host_whitelist``: 
171          A list or set of hosts that you can use for embedded content 
172          (for content like ``<object>``, ``<link rel="stylesheet">``, etc). 
173          You can also implement/override the method 
174          ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to 
175          implement more complex rules for what can be embedded. 
176          Anything that passes this test will be shown, regardless of 
177          the value of (for instance) ``embedded``. 
178   
179          Note that this parameter might not work as intended if you do not 
180          make the links absolute before doing the cleaning. 
181   
182          Note that you may also need to set ``whitelist_tags``. 
183   
184      ``whitelist_tags``: 
185          A set of tags that can be included with ``host_whitelist``. 
186          The default is ``iframe`` and ``embed``; you may wish to 
187          include other tags like ``script``, or you may want to 
188          implement ``allow_embedded_url`` for more control.  Set to None to 
189          include all tags. 
190   
191      This modifies the document *in place*. 
192      """ 
193   
194      scripts = True 
195      javascript = True 
196      comments = True 
197      style = False 
198      inline_style = None 
199      links = True 
200      meta = True 
201      page_structure = True 
202      processing_instructions = True 
203      embedded = True 
204      frames = True 
205      forms = True 
206      annoying_tags = True 
207      remove_tags = None 
208      allow_tags = None 
209      kill_tags = None 
210      remove_unknown_tags = True 
211      safe_attrs_only = True 
212      safe_attrs = defs.safe_attrs 
213      add_nofollow = False 
214      host_whitelist = () 
215      whitelist_tags = set(['iframe', 'embed']) 
216   
225   
226       
227       
228      _tag_link_attrs = dict( 
229          script='src', 
230          link='href', 
231           
232           
233          applet=['code', 'object'], 
234          iframe='src', 
235          embed='src', 
236          layer='src', 
237           
238           
239           
240           
241           
242           
243           
244           
245          a='href', 
246          ) 
247   
249          """ 
250          Cleans the document. 
251          """ 
252          if hasattr(doc, 'getroot'): 
253               
254              doc = doc.getroot() 
255           
256          xhtml_to_html(doc) 
257           
258           
259          for el in doc.iter('image'): 
260              el.tag = 'img' 
261          if not self.comments: 
262               
263               
264              self.kill_conditional_comments(doc) 
265   
266          kill_tags = set(self.kill_tags or ()) 
267          remove_tags = set(self.remove_tags or ()) 
268          allow_tags = set(self.allow_tags or ()) 
269   
270          if self.scripts: 
271              kill_tags.add('script') 
272          if self.safe_attrs_only: 
273              safe_attrs = set(self.safe_attrs) 
274              for el in doc.iter(etree.Element): 
275                  attrib = el.attrib 
276                  for aname in attrib.keys(): 
277                      if aname not in safe_attrs: 
278                          del attrib[aname] 
279          if self.javascript: 
280              if not (self.safe_attrs_only and 
281                      self.safe_attrs == defs.safe_attrs): 
282                   
283                  for el in doc.iter(etree.Element): 
284                      attrib = el.attrib 
285                      for aname in attrib.keys(): 
286                          if aname.startswith('on'): 
287                              del attrib[aname] 
288              doc.rewrite_links(self._remove_javascript_link, 
289                                resolve_base_href=False) 
290               
291               
292              if not self.inline_style: 
293                  for el in _find_styled_elements(doc): 
294                      old = el.get('style') 
295                      new = _css_javascript_re.sub('', old) 
296                      new = _css_import_re.sub('', new) 
297                      if self._has_sneaky_javascript(new): 
298                           
299                          del el.attrib['style'] 
300                      elif new != old: 
301                          el.set('style', new) 
302              if not self.style: 
303                  for el in list(doc.iter('style')): 
304                      if el.get('type', '').lower().strip() == 'text/javascript': 
305                          el.drop_tree() 
306                          continue 
307                      old = el.text or '' 
308                      new = _css_javascript_re.sub('', old) 
309                       
310                      new = _css_import_re.sub('', old) 
311                      if self._has_sneaky_javascript(new): 
312                           
313                          el.text = '/* deleted */' 
314                      elif new != old: 
315                          el.text = new 
316          if self.comments or self.processing_instructions: 
317               
318               
319               
320              kill_tags.add(etree.Comment) 
321          if self.processing_instructions: 
322              kill_tags.add(etree.ProcessingInstruction) 
323          if self.style: 
324              kill_tags.add('style') 
325          if self.inline_style: 
326              etree.strip_attributes(doc, 'style') 
327          if self.links: 
328              kill_tags.add('link') 
329          elif self.style or self.javascript: 
330               
331               
332              for el in list(doc.iter('link')): 
333                  if 'stylesheet' in el.get('rel', '').lower(): 
334                       
335                      if not self.allow_element(el): 
336                          el.drop_tree() 
337          if self.meta: 
338              kill_tags.add('meta') 
339          if self.page_structure: 
340              remove_tags.update(('head', 'html', 'title')) 
341          if self.embedded: 
342               
343               
344               
345              for el in list(doc.iter('param')): 
346                  found_parent = False 
347                  parent = el.getparent() 
348                  while parent is not None and parent.tag not in ('applet', 'object'): 
349                      parent = parent.getparent() 
350                  if parent is None: 
351                      el.drop_tree() 
352              kill_tags.update(('applet',)) 
353               
354              remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param')) 
355          if self.frames: 
356               
357               
358               
359              kill_tags.update(defs.frame_tags) 
360          if self.forms: 
361              remove_tags.add('form') 
362              kill_tags.update(('button', 'input', 'select', 'textarea')) 
363          if self.annoying_tags: 
364              remove_tags.update(('blink', 'marquee')) 
365   
366          _remove = [] 
367          _kill = [] 
368          for el in doc.iter(): 
369              if el.tag in kill_tags: 
370                  if self.allow_element(el): 
371                      continue 
372                  _kill.append(el) 
373              elif el.tag in remove_tags: 
374                  if self.allow_element(el): 
375                      continue 
376                  _remove.append(el) 
377   
378          if _remove and _remove[0] == doc: 
379               
380               
381              el = _remove.pop(0) 
382              el.tag = 'div' 
383              el.attrib.clear() 
384          elif _kill and _kill[0] == doc: 
385               
386               
387              el = _kill.pop(0) 
388              if el.tag != 'html': 
389                  el.tag = 'div' 
390              el.clear() 
391   
392          _kill.reverse()  
393          for el in _kill: 
394              el.drop_tree() 
395          for el in _remove: 
396              el.drop_tag() 
397   
398          if self.remove_unknown_tags: 
399              if allow_tags: 
400                  raise ValueError( 
401                      "It does not make sense to pass in both allow_tags and remove_unknown_tags") 
402              allow_tags = set(defs.tags) 
403          if allow_tags: 
404              bad = [] 
405              for el in doc.iter(): 
406                  if el.tag not in allow_tags: 
407                      bad.append(el) 
408              if bad: 
409                  if bad[0] is doc: 
410                      el = bad.pop(0) 
411                      el.tag = 'div' 
412                      el.attrib.clear() 
413                  for el in bad: 
414                      el.drop_tag() 
415          if self.add_nofollow: 
416              for el in _find_external_links(doc): 
417                  if not self.allow_follow(el): 
418                      rel = el.get('rel') 
419                      if rel: 
420                          if ('nofollow' in rel 
421                                  and ' nofollow ' in (' %s ' % rel)): 
422                              continue 
423                          rel = '%s nofollow' % rel 
424                      else: 
425                          rel = 'nofollow' 
426                      el.set('rel', rel) 
 427   
429          """ 
430          Override to suppress rel="nofollow" on some anchors. 
431          """ 
432          return False 
 433   
435          if el.tag not in self._tag_link_attrs: 
436              return False 
437          attr = self._tag_link_attrs[el.tag] 
438          if isinstance(attr, (list, tuple)): 
439              for one_attr in attr: 
440                  url = el.get(one_attr) 
441                  if not url: 
442                      return False 
443                  if not self.allow_embedded_url(el, url): 
444                      return False 
445              return True 
446          else: 
447              url = el.get(attr) 
448              if not url: 
449                  return False 
450              return self.allow_embedded_url(el, url) 
 451   
453          if (self.whitelist_tags is not None 
454              and el.tag not in self.whitelist_tags): 
455              return False 
456          scheme, netloc, path, query, fragment = urlsplit(url) 
457          netloc = netloc.lower().split(':', 1)[0] 
458          if scheme not in ('http', 'https'): 
459              return False 
460          if netloc in self.host_whitelist: 
461              return True 
462          return False 
 463   
474   
476          bad = [] 
477          for el in doc.iter(iterate): 
478              if condition(el): 
479                  bad.append(el) 
480          for el in bad: 
481              el.drop_tree() 
 482   
490   
491      _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub 
492   
494          """ 
495          Depending on the browser, stuff like ``e x p r e s s i o n(...)`` 
496          can get interpreted, or ``expre/* stuff */ssion(...)``.  This 
497          checks for attempt to do stuff like this. 
498   
499          Typically the response will be to kill the entire style; if you 
500          have just a bit of Javascript in the style another rule will catch 
501          that and remove only the Javascript from the style; this catches 
502          more sneaky attempts. 
503          """ 
504          style = self._substitute_comments('', style) 
505          style = style.replace('\\', '') 
506          style = _substitute_whitespace('', style) 
507          style = style.lower() 
508          if 'javascript:' in style: 
509              return True 
510          if 'expression(' in style: 
511              return True 
512          return False 
 513   
 522   
# Module-level ``Cleaner`` instance with the class defaults, plus a
# shortcut to its ``clean_html`` method.
clean = Cleaner()
clean_html = clean.clean_html
525   
526   
527   
528   
529   
# Patterns used to find linkable text.  Each one must define the named
# groups ``body`` (used as the link text) and ``host`` (checked against
# the host blacklist).
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
    ]

# Tags whose text is never turned into links.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Hosts that are never linked to.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

# CSS classes that switch linking off for an element.
_avoid_classes = ['nolink']
545   
550      """ 
551      Turn any URLs into links. 
552   
553      It will search for links identified by the given regular 
554      expressions (by default mailto and http(s) links). 
555   
556      It won't link text in an element in avoid_elements, or an element 
557      with a class in avoid_classes.  It won't link to anything with a 
558      host that matches one of the regular expressions in avoid_hosts 
559      (default localhost and 127.0.0.1). 
560   
561      If you pass in an element, the element's tail will not be 
562      substituted, only the contents of the element. 
563      """ 
564      if el.tag in avoid_elements: 
565          return 
566      class_name = el.get('class') 
567      if class_name: 
568          class_name = class_name.split() 
569          for match_class in avoid_classes: 
570              if match_class in class_name: 
571                  return 
572      for child in list(el): 
573          autolink(child, link_regexes=link_regexes, 
574                   avoid_elements=avoid_elements, 
575                   avoid_hosts=avoid_hosts, 
576                   avoid_classes=avoid_classes) 
577          if child.tail: 
578              text, tail_children = _link_text( 
579                  child.tail, link_regexes, avoid_hosts, factory=el.makeelement) 
580              if tail_children: 
581                  child.tail = text 
582                  index = el.index(child) 
583                  el[index+1:index+1] = tail_children 
584      if el.text: 
585          text, pre_children = _link_text( 
586              el.text, link_regexes, avoid_hosts, factory=el.makeelement) 
587          if pre_children: 
588              el.text = text 
589              el[:0] = pre_children 
 590   
def _link_text(text, link_regexes, avoid_hosts, factory):
    """
    Split ``text`` into leading text plus a list of new ``<a>`` elements.

    ``link_regexes`` are compiled patterns that define the named groups
    ``body`` and ``host``; ``avoid_hosts`` are compiled patterns, and a
    match whose ``host`` group matches any of them is skipped.
    ``factory`` is called as ``factory('a')`` to create each anchor.

    Returns ``(leading_text, links)``.  Each anchor has ``href`` set to the
    matched text, ``text`` set to the ``body`` group, and ``tail`` set to
    the text between it and the next link (or the end of the input).  A
    single trailing ``.`` or ``,`` is left out of the link.
    """
    leading_text = ''
    links = []
    while True:
        # Among all patterns, pick the acceptable match that starts first
        # in the remaining text.
        best_match = None
        for regex in link_regexes:
            search_from = 0
            while True:
                match = regex.search(text, search_from)
                if match is None:
                    break
                host = match.group('host')
                if not any(host_regex.search(host)
                           for host_regex in avoid_hosts):
                    break
                # Blacklisted host: try the next candidate of this pattern.
                search_from = match.end()
            if match is None:
                continue
            if best_match is None or match.start() < best_match.start():
                best_match = match
        if best_match is None:
            # No more links; what is left is plain text.
            if links:
                assert not links[-1].tail
                links[-1].tail = text
            else:
                assert not leading_text
                leading_text = text
            break
        link = best_match.group(0)
        end = best_match.end()
        if link.endswith(('.', ',')):
            # Sentence punctuation is not part of the link.
            end -= 1
            link = link[:-1]
        prev_text = text[:best_match.start()]
        if links:
            assert not links[-1].tail
            links[-1].tail = prev_text
        else:
            assert not leading_text
            leading_text = prev_text
        anchor = factory('a')
        anchor.set('href', link)
        body = best_match.group('body')
        if not body:
            body = link
        if body.endswith(('.', ',')):
            body = body[:-1]
        anchor.text = body
        links.append(anchor)
        # Continue with whatever follows the link just consumed.
        text = text[end:]
    return leading_text, links
 648                   
657   
# The HTML-string variant shares the documentation of ``autolink``.
autolink_html.__doc__ = autolink.__doc__
659   
660   
661   
662   
663   
# Tags and CSS classes whose text is left alone by word breaking.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']
666   
671      """ 
672      Breaks any long words found in the body of the text (not attributes). 
673   
674      Doesn't effect any of the tags in avoid_elements, by default 
675      ``<textarea>`` and ``<pre>`` 
676   
677      Breaks words by inserting ​, which is a unicode character 
678      for Zero Width Space character.  This generally takes up no space 
679      in rendering, but does copy as a space, and in monospace contexts 
680      usually takes up space. 
681   
682      See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion 
683      """ 
684       
685       
686      if el.tag in _avoid_word_break_elements: 
687          return 
688      class_name = el.get('class') 
689      if class_name: 
690          dont_break = False 
691          class_name = class_name.split() 
692          for avoid in avoid_classes: 
693              if avoid in class_name: 
694                  dont_break = True 
695                  break 
696          if dont_break: 
697              return 
698      if el.text: 
699          el.text = _break_text(el.text, max_width, break_character) 
700      for child in el: 
701          word_break(child, max_width=max_width, 
702                     avoid_elements=avoid_elements, 
703                     avoid_classes=avoid_classes, 
704                     break_character=break_character) 
705          if child.tail: 
706              child.tail = _break_text(child.tail, max_width, break_character) 
 707   
713   
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with ``break_character`` inserted into every
    whitespace-delimited word longer than ``max_width`` characters.

    Whitespace is preserved.  Each word is rewritten where it stands, so a
    long word that also occurs inside another, longer word does not affect
    how that other word is broken.
    """
    def _break_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word

    return re.sub(r'\S+', _break_word, text)

# A non-letter character: the preferred place to break a word after.
_break_prefer_re = re.compile(r'[^a-z]', re.I)

def _insert_break(word, width, break_character):
    """
    Return ``word`` cut into pieces of at most ``width`` characters that
    are joined by ``break_character``.

    A piece ends right after the last non-letter character in it when that
    character lies within the final 10 characters of the piece; otherwise
    the piece is exactly ``width`` characters long.
    """
    if width <= 0:
        # A non-positive width would never consume any characters.
        return word
    pieces = []
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nicer break.
            if last_break.end() > width-10:
                start = word[:last_break.end()]
        pieces.append(start)
        word = word[len(start):]
    pieces.append(word)
    return break_character.join(pieces)
 741