1  """ 
  2  lxml-based doctest output comparison. 
  3   
  4  Note: normally, you should just import the `lxml.usedoctest` and 
  5  `lxml.html.usedoctest` modules from within a doctest, instead of this 
  6  one:: 
  7   
  8      >>> import lxml.usedoctest # for XML output 
  9   
 10      >>> import lxml.html.usedoctest # for HTML output 
 11   
 12  To use this module directly, you must call ``lxmldoctest.install()``, 
 13  which will cause doctest to use this in all subsequent calls. 
 14   
 15  This changes the way output is checked and comparisons are made for 
 16  XML or HTML-like content. 
 17   
 18  XML or HTML content is noticed because the example starts with ``<`` 
 19  (it's HTML if it starts with ``<html``).  You can also use the 
 20  ``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing. 
 21   
 22  Some rough wildcard-like things are allowed.  Whitespace is generally 
 23  ignored (except in attributes).  In text (attributes and text in the 
 24  body) you can use ``...`` as a wildcard.  In an example it also 
 25  matches any trailing tags in the element, though it does not match 
 26  leading tags.  You may create a tag ``<any>`` or include an ``any`` 
 27  attribute in the tag.  An ``any`` tag matches any tag, while the 
 28  attribute matches any and all attributes. 
 29   
When a match fails, the reformatted example and the actual output are
displayed (indented), and a rough diff-like output is given.  Anything
marked with ``+`` is in the output but wasn't supposed to be, and
similarly ``-`` means it's in the example but wasn't in the output.
 34   
 35  You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP`` 
 36  """ 
 37   
 38  from lxml import etree 
 39  import sys 
 40  import re 
 41  import doctest 
 42  try: 
 43      from html import escape as html_escape 
 44  except ImportError: 
 45      from cgi import escape as html_escape 
 46   
# Names exported by ``from <this module> import *``.
__all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker',
           'LHTMLOutputChecker', 'install', 'temp_install']

# ``basestring`` is only defined on Python 2.  Where the name is missing,
# fall back to a tuple of types so that ``isinstance(x, _basestring)``
# keeps working.
try:
    _basestring = basestring
except NameError:
    _basestring = (str, bytes)

# True when the interpreter's major version is 3 or higher.
_IS_PYTHON_3 = sys.version_info[0] >= 3

# Extra option flags registered with doctest; the module docstring describes
# them as forcing (PARSE_HTML / PARSE_XML) or disabling (NOPARSE_MARKUP)
# markup parsing for an example.
PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
PARSE_XML = doctest.register_optionflag('PARSE_XML')
NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP')

# Reference to doctest's checker class taken at import time; code further
# down in this file rebinds ``doctest.OutputChecker``, so this alias keeps
# pointing at whatever class was installed when this module was loaded.
OutputChecker = doctest.OutputChecker
 62   
 64      if v is None: 
 65          return None 
 66      else: 
 67          return v.strip() 
  68   
 71   
 72  _html_parser = etree.HTMLParser(recover=False, remove_blank_text=True) 
 73   
 76   
 77   
 78  _repr_re = re.compile(r'^<[^>]+ (at|object) ') 
 79  _norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+') 
 80   
 82   
 83      empty_tags = ( 
 84          'param', 'img', 'area', 'br', 'basefont', 'input', 
 85          'base', 'meta', 'link', 'col') 
 86   
 89   
 91          alt_self = getattr(self, '_temp_override_self', None) 
 92          if alt_self is not None: 
 93              super_method = self._temp_call_super_check_output 
 94              self = alt_self 
 95          else: 
 96              super_method = OutputChecker.check_output 
 97          parser = self.get_parser(want, got, optionflags) 
 98          if not parser: 
 99              return super_method( 
100                  self, want, got, optionflags) 
101          try: 
102              want_doc = parser(want) 
103          except etree.XMLSyntaxError: 
104              return False 
105          try: 
106              got_doc = parser(got) 
107          except etree.XMLSyntaxError: 
108              return False 
109          return self.compare_docs(want_doc, got_doc) 
 110   
126   
128          s = s.strip() 
129          return (s.startswith('<') 
130                  and not _repr_re.search(s)) 
 131   
133          if not self.tag_compare(want.tag, got.tag): 
134              return False 
135          if not self.text_compare(want.text, got.text, True): 
136              return False 
137          if not self.text_compare(want.tail, got.tail, True): 
138              return False 
139          if 'any' not in want.attrib: 
140              want_keys = sorted(want.attrib.keys()) 
141              got_keys = sorted(got.attrib.keys()) 
142              if want_keys != got_keys: 
143                  return False 
144              for key in want_keys: 
145                  if not self.text_compare(want.attrib[key], got.attrib[key], False): 
146                      return False 
147          if want.text != '...' or len(want): 
148              want_children = list(want) 
149              got_children = list(got) 
150              while want_children or got_children: 
151                  if not want_children or not got_children: 
152                      return False 
153                  want_first = want_children.pop(0) 
154                  got_first = got_children.pop(0) 
155                  if not self.compare_docs(want_first, got_first): 
156                      return False 
157                  if not got_children and want_first.tail == '...': 
158                      break 
159          return True 
 160   
161 -    def text_compare(self, want, got, strip): 
 162          want = want or '' 
163          got = got or '' 
164          if strip: 
165              want = norm_whitespace(want).strip() 
166              got = norm_whitespace(got).strip() 
167          want = '^%s$' % re.escape(want) 
168          want = want.replace(r'\.\.\.', '.*') 
169          if re.search(want, got): 
170              return True 
171          else: 
172              return False 
 173   
175          if want == 'any': 
176              return True 
177          if (not isinstance(want, _basestring) 
178              or not isinstance(got, _basestring)): 
179              return want == got 
180          want = want or '' 
181          got = got or '' 
182          if want.startswith('{...}'): 
183               
184              return want.split('}')[-1] == got.split('}')[-1] 
185          else: 
186              return want == got 
 187   
189          want = example.want 
190          parser = self.get_parser(want, got, optionflags) 
191          errors = [] 
192          if parser is not None: 
193              try: 
194                  want_doc = parser(want) 
195              except etree.XMLSyntaxError: 
196                  e = sys.exc_info()[1] 
197                  errors.append('In example: %s' % e) 
198              try: 
199                  got_doc = parser(got) 
200              except etree.XMLSyntaxError: 
201                  e = sys.exc_info()[1] 
202                  errors.append('In actual output: %s' % e) 
203          if parser is None or errors: 
204              value = OutputChecker.output_difference( 
205                  self, example, got, optionflags) 
206              if errors: 
207                  errors.append(value) 
208                  return '\n'.join(errors) 
209              else: 
210                  return value 
211          html = parser is html_fromstring 
212          diff_parts = [] 
213          diff_parts.append('Expected:') 
214          diff_parts.append(self.format_doc(want_doc, html, 2)) 
215          diff_parts.append('Got:') 
216          diff_parts.append(self.format_doc(got_doc, html, 2)) 
217          diff_parts.append('Diff:') 
218          diff_parts.append(self.collect_diff(want_doc, got_doc, html, 2)) 
219          return '\n'.join(diff_parts) 
 220   
222          if not html: 
223              return False 
224          if el.tag not in self.empty_tags: 
225              return False 
226          if el.text or len(el): 
227               
228              return False 
229          return True 
 230   
265   
272   
283       
289   
291          parts = [] 
292          if not len(want) and not len(got): 
293              parts.append(' '*indent) 
294              parts.append(self.collect_diff_tag(want, got)) 
295              if not self.html_empty_tag(got, html): 
296                  parts.append(self.collect_diff_text(want.text, got.text)) 
297                  parts.append(self.collect_diff_end_tag(want, got)) 
298              parts.append(self.collect_diff_text(want.tail, got.tail)) 
299              parts.append('\n') 
300              return ''.join(parts) 
301          parts.append(' '*indent) 
302          parts.append(self.collect_diff_tag(want, got)) 
303          parts.append('\n') 
304          if strip(want.text) or strip(got.text): 
305              parts.append(' '*indent) 
306              parts.append(self.collect_diff_text(want.text, got.text)) 
307              parts.append('\n') 
308          want_children = list(want) 
309          got_children = list(got) 
310          while want_children or got_children: 
311              if not want_children: 
312                  parts.append(self.format_doc(got_children.pop(0), html, indent+2, '+')) 
313                  continue 
314              if not got_children: 
315                  parts.append(self.format_doc(want_children.pop(0), html, indent+2, '-')) 
316                  continue 
317              parts.append(self.collect_diff( 
318                  want_children.pop(0), got_children.pop(0), html, indent+2)) 
319          parts.append(' '*indent) 
320          parts.append(self.collect_diff_end_tag(want, got)) 
321          parts.append('\n') 
322          if strip(want.tail) or strip(got.tail): 
323              parts.append(' '*indent) 
324              parts.append(self.collect_diff_text(want.tail, got.tail)) 
325              parts.append('\n') 
326          return ''.join(parts) 
 327   
329          if not self.tag_compare(want.tag, got.tag): 
330              tag = '%s (got: %s)' % (want.tag, got.tag) 
331          else: 
332              tag = got.tag 
333          attrs = [] 
334          any = want.tag == 'any' or 'any' in want.attrib 
335          for name, value in sorted(got.attrib.items()): 
336              if name not in want.attrib and not any: 
337                  attrs.append('+%s="%s"' % (name, self.format_text(value, False))) 
338              else: 
339                  if name in want.attrib: 
340                      text = self.collect_diff_text(want.attrib[name], value, False) 
341                  else: 
342                      text = self.format_text(value, False) 
343                  attrs.append('%s="%s"' % (name, text)) 
344          if not any: 
345              for name, value in sorted(want.attrib.items()): 
346                  if name in got.attrib: 
347                      continue 
348                  attrs.append('-%s="%s"' % (name, self.format_text(value, False))) 
349          if attrs: 
350              tag = '<%s %s>' % (tag, ' '.join(attrs)) 
351          else: 
352              tag = '<%s>' % tag 
353          return tag 
 354   
356          if want.tag != got.tag: 
357              tag = '%s (got: %s)' % (want.tag, got.tag) 
358          else: 
359              tag = got.tag 
360          return '</%s>' % tag 
 361   
362 -    def collect_diff_text(self, want, got, strip=True): 
 363          if self.text_compare(want, got, strip): 
364              if not got: 
365                  return '' 
366              return self.format_text(got, strip) 
367          text = '%s (got: %s)' % (want, got) 
368          return self.format_text(text, strip) 
  369   
373       
375      """ 
376      Install doctestcompare for all future doctests. 
377   
378      If html is true, then by default the HTML parser will be used; 
379      otherwise the XML parser is used. 
380      """ 
381      if html: 
382          doctest.OutputChecker = LHTMLOutputChecker 
383      else: 
384          doctest.OutputChecker = LXMLOutputChecker 
 385   
387      """ 
388      Use this *inside* a doctest to enable this checker for this 
389      doctest only. 
390   
391      If html is true, then by default the HTML parser will be used; 
392      otherwise the XML parser is used. 
393      """ 
394      if html: 
395          Checker = LHTMLOutputChecker 
396      else: 
397          Checker = LXMLOutputChecker 
398      frame = _find_doctest_frame() 
399      dt_self = frame.f_locals['self'] 
400      checker = Checker() 
401      old_checker = dt_self._checker 
402      dt_self._checker = checker 
403       
404       
405       
406       
407       
408       
409       
410       
411       
412      if _IS_PYTHON_3: 
413          check_func = frame.f_locals['check'].__func__ 
414          checker_check_func = checker.check_output.__func__ 
415      else: 
416          check_func = frame.f_locals['check'].im_func 
417          checker_check_func = checker.check_output.im_func 
418       
419       
420      doctest.etree = etree 
421      _RestoreChecker(dt_self, old_checker, checker, 
422                      check_func, checker_check_func, 
423                      del_module) 
 424   
426 -    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func, 
427                   del_module): 
 428          self.dt_self = dt_self 
429          self.checker = old_checker 
430          self.checker._temp_call_super_check_output = self.call_super 
431          self.checker._temp_override_self = new_checker 
432          self.check_func = check_func 
433          self.clone_func = clone_func 
434          self.del_module = del_module 
435          self.install_clone() 
436          self.install_dt_self() 
 438          if _IS_PYTHON_3: 
439              self.func_code = self.check_func.__code__ 
440              self.func_globals = self.check_func.__globals__ 
441              self.check_func.__code__ = self.clone_func.__code__ 
442          else: 
443              self.func_code = self.check_func.func_code 
444              self.func_globals = self.check_func.func_globals 
445              self.check_func.func_code = self.clone_func.func_code 
 447          if _IS_PYTHON_3: 
448              self.check_func.__code__ = self.func_code 
449          else: 
450              self.check_func.func_code = self.func_code 
 452          self.prev_func = self.dt_self._DocTestRunner__record_outcome 
453          self.dt_self._DocTestRunner__record_outcome = self 
 455          self.dt_self._DocTestRunner__record_outcome = self.prev_func 
 457          if self.del_module: 
458              import sys 
459              del sys.modules[self.del_module] 
460              if '.' in self.del_module: 
461                  package, module = self.del_module.rsplit('.', 1) 
462                  package_mod = sys.modules[package] 
463                  delattr(package_mod, module) 
 480      import sys 
481      frame = sys._getframe(1) 
482      while frame: 
483          l = frame.f_locals 
484          if 'BOOM' in l: 
485               
486              return frame 
487          frame = frame.f_back 
488      raise LookupError( 
489          "Could not find doctest (only use this function *inside* a doctest)") 
 490       
491  __test__ = { 
492      'basic': ''' 
493      >>> temp_install() 
494      >>> print """<xml a="1" b="2">stuff</xml>""" 
495      <xml b="2" a="1">...</xml> 
496      >>> print """<xml xmlns="http://example.com"><tag   attr="bar"   /></xml>""" 
497      <xml xmlns="..."> 
498        <tag attr="..." /> 
499      </xml> 
500      >>> print """<xml>blahblahblah<foo /></xml>""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS 
501      <xml>...foo /></xml> 
502      '''} 
503   
504  if __name__ == '__main__': 
505      import doctest 
506      doctest.testmod() 
507