import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str


############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # Tokenize each version of the document, tagging every token with
    # the version it belongs to, then walk the versions from oldest to
    # newest, copying annotations forward wherever a token is unchanged.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After all revisions have been merged, adjacent tokens that share
    # an annotation are combined so the output uses fewer spans.
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add the markup.
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result
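# Illustrative sketch (hypothetical tokens, not from the test suite): two
# adjacent tokens that both carry the annotation 'v1', with no tags between
# them, compress into a single token, e.g.
#     compress_tokens([token('Hello', trailing_whitespace=' '),
#                      token('World')])
# (with both .annotation attributes set to 'v1' beforehand) yields one token
# whose text is 'Hello World' and whose annotation is still 'v1'.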

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
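# Illustrative usage (a rough sketch; the exact whitespace and tag placement
# in the output depend on the tokenization and cleanup passes below):
#     htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>')
# should produce roughly
#     '<p><ins>Goodbye</ins> <del>Hello</del> World</p>'
# (modulo whitespace around the tags).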

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # The tokens isolate the portion of the content we care to diff;
    # difflib does the actual hard work of matching them up.  From the
    # opcodes we then build a new document out of pieces of both the
    # old and the new token streams.  Markup is generally taken from
    # the new document; markup from the old document is kept only on a
    # best-effort basis, and anything that cannot be resolved is
    # thrown away.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # Deletes are not emitted as <del> directly, because that could
    # produce an invalid document.  merge_delete inserts DEL_START and
    # DEL_END markers instead, and cleanup_delete resolves them into
    # real <del> tags once the whole document has been assembled.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # We don't want to put <ins> around unbalanced tags (their matching
    # open or close tag lives elsewhere in the document), so only the
    # balanced portion of the insert is wrapped.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end
        # with a space.
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # Move the trailing space outside of </ins>.
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)
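# Illustrative sketch (hypothetical chunks): merging the insert ['Goodbye ']
# into a document that currently ends with ['<p>', 'Hello '] leaves the doc as
#     ['<p>', 'Hello ', '<ins>', 'Goodbye', '</ins> ']
# -- the trailing space of the inserted text is moved outside of </ins>.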

# These are sentinels to represent the start and end of a <del>
# segment, until the cleanup phase turns them into proper markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)). """
    while 1:
        # Find the first pending DEL_START/DEL_END pair, splitting the
        # document into everything-before, the deleted span, and
        # everything-after.
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing left to do; the whole document has been processed.
            break
        # The deleted span may not be well-balanced markup, so work out
        # which opening/closing tags within it are unbalanced:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then move the split point forward or backward so the
        # unbalanced tags can pair up with markup outside the delete:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up the case where the word before the delete didn't
            # end with a space.
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # Move the trailing space outside of </del>.
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end
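# Illustrative sketch (hypothetical chunks, worked through the logic above):
#     split_unbalanced(['<p>', 'Hi', '</p>', '</div>'])
# gives ([], ['<p>', 'Hi', '</p>'], ['</div>']) -- the <p> pair is balanced,
# while the stray </div> is reported as an unbalanced end tag.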

def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
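# Illustrative sketch: ['a', DEL_START, 'b', DEL_END, 'c'] splits into
# (['a'], ['b'], ['c']); if no DEL_START is present, NoDeletes is raised.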

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word; we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag; we can't move past it
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag; we can't move the delete text
            # any earlier
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace; the whitespace is stored separately from the text, so
    the word still compares equal to the same word without a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, pre_tags=%r, post_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Split the document into chunks: one for each start tag, word,
    # and end tag.
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Then re-join the chunks into token objects.
    return fixup_chunks(chunks)
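# Illustrative sketch (hypothetical fragment): tokenize('<a href="/x">link</a> more')
# should yield roughly a token('link', pre_tags=['<a href="/x">'], ...), an
# href_token('/x') carrying the closing '</a> ', and a token('more').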

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html

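# Illustrative sketch: cleanup_html('<html><body><ins>new</ins> text</body></html>')
# keeps only the body contents and drops the <ins>/<del> markup, giving
# 'new text'.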

end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]
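# Illustrative sketch: split_trailing_whitespace('test\n\n') returns
# ('test', '\n\n'); a word with no trailing whitespace comes back unchanged
# with an empty second element.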

def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result
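# Illustrative sketch (hypothetical chunk list):
#     fixup_chunks(['<p>', 'Hello ', 'World', '</p>'])
# should produce two tokens, roughly
#     token('Hello', pre_tags=['<p>'], trailing_whitespace=' ')
#     token('World', post_tags=['</p>'])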


empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )

def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words. Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words
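# Illustrative sketch: split_words('Hello  World\n') returns
# ['Hello  ', 'World\n'], while None or all-whitespace text returns [].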

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)
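# Illustrative sketch: for an element parsed from '<b>hi</b> bye', start_tag()
# gives '<b>' and end_tag() gives '</b> ' -- the extra space reflects the
# whitespace that begins the element's tail.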

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')


def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
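# Note on the override above: the threshold is capped at a quarter of the
# shorter sequence, so for reasonably long token streams (8+ tokens) common
# runs of only one or two tokens are filtered out of the match list, while
# the zero-length terminator block that difflib appends is always kept.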

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()