1   
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os, os.path, sys 
  9   
 10  this_dir = os.path.dirname(__file__) 
     # Put this file's directory at the front of sys.path unless it is already
     # listed -- presumably so the `common_imports` import below resolves; confirm.
 11  if this_dir not in sys.path: 
 12      sys.path.insert(0, this_dir)  
 13   
 14  from common_imports import etree, html, StringIO, BytesIO, fileInTestDir, _bytes, _str 
 15  from common_imports import SillyFileLike, HelperTestCase, write_to_file, next 
 16   
 17  try: 
         # Bare name lookup: raises NameError when no builtin `unicode` exists.
 18      unicode 
 19  except NameError: 
         # Fall back to `str` so the name `unicode` is always bound in this module.
 20      unicode = str 
 21   
 22   
 24      """HTML parser test cases 
 25      """ 
 26      etree = etree 
         # The line above rebinds the imported `etree` at this indentation level;
         # the test bodies further down reach it as `self.etree`.
         # NOTE(review): the enclosing `class` header is not visible in this extract.
 27   
         # Well-formed fixture documents, wrapped by the `_bytes` helper
         # (presumably yielding byte strings -- confirm in common_imports).
 28      html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>") 
 29      html_str_pretty = _bytes("""\ 
 30  <html> 
 31  <head><title>test</title></head> 
 32  <body><h1>page title</h1></body> 
 33  </html> 
 34  """) 
         # Deliberately malformed markup: <title> is never closed, <h1> is closed
         # by </h3>, and </p> has no matching opening tag.
 35      broken_html_str = _bytes("<html><head><title>test" 
 36                               "<body><h1>page title</h3></p></html>") 
         # Same shape as html_str but with a non-ASCII character; the _bytes()
         # result is decoded as UTF-8, so this fixture is presumably text, not bytes.
 37      uhtml_str = _bytes( 
 38          "<html><head><title>test á</title></head>" 
 39          "<body><h1>page á title</h1></body></html>").decode('utf8') 
 40   
 44   
 49   
 57   
 59          if sys.maxunicode < 1114111: 
 60              return   
             # The guard above returns silently when sys.maxunicode is below 0x10FFFF
             # (1114111); presumably such builds cannot hold U+26007 as a single
             # character, which the length assertion below requires.
             # NOTE(review): the `def` header of this test is not visible in this extract.
             # The literal keeps an escaped \U00026007; decode('unicode_escape') turns
             # it into the actual character before the document is parsed.
 61          element = self.etree.HTML(_bytes( 
 62              '<html><body><p>\\U00026007</p></body></html>' 
 63          ).decode('unicode_escape')) 
             # The <p> text must come back as exactly one character, equal to U+26007.
 64          p_text = element.findtext('.//p') 
 65          self.assertEqual(1, len(p_text)) 
 66          self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'), 
 67                           p_text) 
  68   
 76   
 84   
 89   
 96   
 98          parser = self.etree.HTMLParser() 
             # Elements are created through the HTML parser's own factory.
             # NOTE(review): the `def` header of this test is not visible in this extract.
 99          Element = parser.makeelement 
100   
101          el = Element('name') 
             # '{}' (braces with nothing inside or after) must raise ValueError both
             # when creating an element and when assigned to an existing element's .tag.
102          self.assertRaises(ValueError, Element, '{}') 
103          self.assertRaises(ValueError, setattr, el, 'tag', '{}') 
104   
             # Same expectation for '{test}': braces with content but no name after them.
105          self.assertRaises(ValueError, Element, '{test}') 
106          self.assertRaises(ValueError, setattr, el, 'tag', '{test}') 
 107   
121   
123          parser = self.etree.HTMLParser() 
             # Elements are created through the HTML parser's own factory.
             # NOTE(review): the `def` header of this test is not visible in this extract.
124          Element = parser.makeelement 
125   
             # Tag names containing a single or double quote must raise ValueError,
             # with or without a leading '{test}' part.
126          self.assertRaises(ValueError, Element, 'p"name') 
127          self.assertRaises(ValueError, Element, "na'me") 
128          self.assertRaises(ValueError, Element, '{test}"name') 
129          self.assertRaises(ValueError, Element, "{test}name'") 
130   
             # Rejected assignments to .tag must leave the existing tag untouched.
131          el = Element('name') 
132          self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 
133          self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 
134          self.assertEqual(el.tag, "name") 
 135   
137          parser = self.etree.HTMLParser() 
             # Elements are created through the HTML parser's own factory.
             # NOTE(review): the `def` header of this test is not visible in this extract.
138          Element = parser.makeelement 
139   
             # Leading, trailing or embedded spaces in a tag name must raise ValueError.
140          self.assertRaises(ValueError, Element, ' name ') 
141          self.assertRaises(ValueError, Element, 'na me') 
142          self.assertRaises(ValueError, Element, '{test} name') 
143   
             # A rejected assignment to .tag must leave the existing tag untouched.
144          el = Element('name') 
145          self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 
146          self.assertEqual(el.tag, "name") 
 147   
157   
169   
171          parser = self.etree.HTMLParser() 
             # The parent comes from the HTML parser's factory; children are attempted
             # through the module-level SubElement function.
             # NOTE(review): the `def` header of this test is not visible in this extract.
172          Element = parser.makeelement 
173          SubElement = self.etree.SubElement 
174   
175          el = Element('name') 
             # Child tag names containing a quote must raise ValueError, with or
             # without a leading '{test}' part.
176          self.assertRaises(ValueError, SubElement, el, "name'") 
177          self.assertRaises(ValueError, SubElement, el, 'na"me') 
178          self.assertRaises(ValueError, SubElement, el, "{test}na'me") 
179          self.assertRaises(ValueError, SubElement, el, '{test}"name') 
 180   
190   
197   
205   
215   
217          text = _str('Søk på nettet') 
             # The text above is non-ASCII. The <head> built next declares
             # charset=UTF-8, but the document is then encoded as ISO-8859-1,
             # so the declaration is wrong.
             # NOTE(review): the `def` header of this test is not visible in this extract.
218          wrong_head = _str(''' 
219          <head> 
220            <meta http-equiv="Content-Type" 
221                  content="text/html; charset=UTF-8" /> 
222          </head>''') 
223          html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head, 
224                                                                          text) 
225                        ).encode('iso-8859-1') 
226   
             # Parsing with no parser argument is expected to fail with ParseError.
227          self.assertRaises(self.etree.ParseError, 
228                            self.etree.parse, 
229                            BytesIO(html_latin1)) 
230   
             # With an HTMLParser forced to ISO-8859-1, the <p> text must round-trip.
231          tree = self.etree.parse( 
232              BytesIO(html_latin1), 
233              self.etree.HTMLParser(encoding="iso-8859-1")) 
234          p = tree.find("//p") 
235          self.assertEqual(p.text, text) 
 236   
241   
247   
251   
264   
272   
273   
274   
275   
276   
277   
278   
279   
286   
301   
303          iterparse = self.etree.iterparse 
             # NOTE(review): BytesIO is handed a str literal here, not bytes; if BytesIO
             # is io.BytesIO this raises TypeError on Python 3 -- confirm against
             # common_imports. Other fixtures in this file wrap literals in _bytes().
             # NOTE(review): the `def` header of this test is not visible in this extract.
304          f = BytesIO( 
305              '<html><head><title>TITLE</title><body><p>P</p></body></html>') 
306   
             # No root is exposed before the iterator has been consumed.
307          iterator = iterparse(f, html=True) 
308          self.assertEqual(None, iterator.root) 
309   
             # With no `events` argument every event is 'end', reported innermost
             # element first: root[0][0], root[0], root[1][0], root[1], then root.
310          events = list(iterator) 
311          root = iterator.root 
312          self.assertTrue(root is not None) 
313          self.assertEqual( 
314              [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]), 
315               ('end', root[1]), ('end', root)], 
316              events) 
 317   
319          iterparse = self.etree.iterparse 
             # NOTE(review): BytesIO is handed a str literal here, not bytes; if BytesIO
             # is io.BytesIO this raises TypeError on Python 3 -- confirm against
             # common_imports.
             # NOTE(review): the `def` header of this test is not visible in this extract.
320          f = BytesIO( 
321              '<html><head><title>TITLE</title><body><p>P</p></body></html>') 
322   
323          iterator = iterparse(f, html=True) 
324          self.assertEqual(None, iterator.root) 
325   
             # First event: end of <title>; the root must still be unset.
326          event, element = next(iterator) 
327          self.assertEqual('end', event) 
328          self.assertEqual('title', element.tag) 
329          self.assertEqual(None, iterator.root) 
330          del element 
331   
             # Second event: end of <head>; the root must still be unset.
332          event, element = next(iterator) 
333          self.assertEqual('end', event) 
334          self.assertEqual('head', element.tag) 
335          self.assertEqual(None, iterator.root) 
336          del element 
             # The iterator is dropped without being exhausted -- presumably to check
             # that abandoning a partially consumed iterparse is safe.
337          del iterator 
 338   
340          iterparse = self.etree.iterparse 
             # The input has no <html> or <body> tags, an unclosed <title> and <p>,
             # and a stray </div>.
             # NOTE(review): BytesIO is handed a str literal here, not bytes; if BytesIO
             # is io.BytesIO this raises TypeError on Python 3 -- confirm against
             # common_imports.
             # NOTE(review): the `def` header of this test is not visible in this extract.
341          f = BytesIO('<head><title>TEST></head><p>P<br></div>') 
342   
343          iterator = iterparse(f, html=True) 
344          self.assertEqual(None, iterator.root) 
345   
             # Expected recovered tree: html > (head, body > p > br).
346          events = list(iterator) 
347          root = iterator.root 
348          self.assertTrue(root is not None) 
349          self.assertEqual('html', root.tag) 
350          self.assertEqual('head', root[0].tag) 
351          self.assertEqual('body', root[1].tag) 
352          self.assertEqual('p', root[1][0].tag) 
353          self.assertEqual('br', root[1][0][0].tag) 
             # All events are 'end', reported innermost element first.
354          self.assertEqual( 
355              [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]), 
356               ('end', root[1][0]), ('end', root[1]), ('end', root)], 
357              events) 
 358   
364   
366          iterparse = self.etree.iterparse 
             # The input here is whatever fileInTestDir("shakespeare.html") returns
             # (presumably a path inside the test directory -- confirm), not an
             # in-memory buffer.
             # NOTE(review): the `def` header of this test is not visible in this extract.
367          iterator = iterparse(fileInTestDir("shakespeare.html"), 
368                               html=True) 
369   
370          self.assertEqual(None, iterator.root) 
371          events = list(iterator) 
372          root = iterator.root 
373          self.assertTrue(root is not None) 
             # The fixture is expected to yield exactly 249 events, all of them 'end':
             # the filtered list of non-'end' events must be empty.
374          self.assertEqual(249, len(events)) 
375          self.assertFalse( 
376              [event for (event, element) in events if event != 'end']) 
 377   
379          iterparse = self.etree.iterparse 
             # NOTE(review): BytesIO is handed a str literal here, not bytes; if BytesIO
             # is io.BytesIO this raises TypeError on Python 3 -- confirm against
             # common_imports.
             # NOTE(review): the `def` header of this test is not visible in this extract.
380          f = BytesIO( 
381              '<html><head><title>TITLE</title><body><p>P</p></body></html>') 
382   
             # Only 'start' events are requested this time.
383          iterator = iterparse(f, html=True, events=('start',)) 
384          self.assertEqual(None, iterator.root) 
385   
             # 'start' events arrive outermost element first: root, root[0],
             # root[0][0], root[1], root[1][0].
386          events = list(iterator) 
387          root = iterator.root 
388          self.assertNotEqual(None, root) 
389          self.assertEqual( 
390              [('start', root), ('start', root[0]), ('start', root[0][0]), 
391                  ('start', root[1]), ('start', root[1][0])], 
392              events) 
 393   
404   
423   
438   
440          assertFalse  = self.assertFalse 
             # The bound method is captured in a local because inside Target's methods
             # `self` is the Target instance, not the test case.
             # NOTE(review): the `def` header of this test is not visible in this extract.
441          events = [] 
             # Minimal parser target: records start/end callbacks in `events`, checks
             # that every start() receives a falsy attrib, and returns "DONE" from close().
442          class Target(object): 
443              def start(self, tag, attrib): 
444                  events.append(("start", tag)) 
445                  assertFalse(attrib) 
 446              def end(self, tag): 
447                  events.append(("end", tag)) 
 448              def close(self): 
449                  return "DONE" 
450   
451          parser = self.etree.HTMLParser(target=Target()) 
452   
453          parser.feed("<html><body></body></html>") 
454          done = parser.close() 
455   
             # parser.close() must hand back the target's close() result, and the
             # callbacks must have arrived in document order.
456          self.assertEqual("DONE", done) 
457          self.assertEqual([ 
458              ("start", "html"), ("start", "body"), 
459              ("end", "body"), ("end", "html")], events) 
460   
462          assertFalse  = self.assertFalse 
             # The bound method is captured in a local because inside Target's methods
             # `self` is the Target instance, not the test case.
             # NOTE(review): the `def` header of this test is not visible in this extract.
463          events = [] 
             # Parser target that also implements doctype(); its positional arguments
             # are recorded as a tuple.
464          class Target(object): 
465              def start(self, tag, attrib): 
466                  events.append(("start", tag)) 
467                  assertFalse(attrib) 
 468              def end(self, tag): 
469                  events.append(("end", tag)) 
470              def doctype(self, *args): 
471                  events.append(("doctype", args)) 
472              def close(self): 
473                  return "DONE" 
474   
475          parser = self.etree.HTMLParser(target=Target()) 
476          parser.feed("<!DOCTYPE><html><body></body></html>") 
477          done = parser.close() 
478   
             # A bare <!DOCTYPE> is expected to reach doctype() as three None values,
             # ahead of the element callbacks.
479          self.assertEqual("DONE", done) 
480          self.assertEqual([ 
481              ("doctype", (None, None, None)), 
482              ("start", "html"), ("start", "body"), 
483              ("end", "body"), ("end", "html")], events) 
484   
486          assertFalse  = self.assertFalse 
             # The bound method is captured in a local because inside Target's methods
             # `self` is the Target instance, not the test case.
             # NOTE(review): the `def` header of this test is not visible in this extract.
487          events = [] 
             # Parser target that also implements doctype(); its positional arguments
             # are recorded as a tuple.
488          class Target(object): 
489              def start(self, tag, attrib): 
490                  events.append(("start", tag)) 
491                  assertFalse(attrib) 
 492              def end(self, tag): 
493                  events.append(("end", tag)) 
494              def doctype(self, *args): 
495                  events.append(("doctype", args)) 
496              def close(self): 
497                  return "DONE" 
498   
499          parser = self.etree.HTMLParser(target=Target()) 
500          parser.feed("<!DOCTYPE html><html><body></body></html>") 
501          done = parser.close() 
502   
             # <!DOCTYPE html> is expected to reach doctype() as ("html", None, None):
             # only the first of the three values is set.
503          self.assertEqual("DONE", done) 
504          self.assertEqual([ 
505              ("doctype", ("html", None, None)), 
506              ("start", "html"), ("start", "body"), 
507              ("end", "body"), ("end", "html")], events) 
508   
510          assertFalse  = self.assertFalse 
             # The bound method is captured in a local because inside Target's methods
             # `self` is the Target instance, not the test case.
             # NOTE(review): the `def` header of this test is not visible in this extract.
511          events = [] 
             # Parser target that also implements doctype(); its positional arguments
             # are recorded as a tuple.
512          class Target(object): 
513              def start(self, tag, attrib): 
514                  events.append(("start", tag)) 
515                  assertFalse(attrib) 
 516              def end(self, tag): 
517                  events.append(("end", tag)) 
518              def doctype(self, *args): 
519                  events.append(("doctype", args)) 
520              def close(self): 
521                  return "DONE" 
522   
523          parser = self.etree.HTMLParser(target=Target()) 
524          parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">' 
525                      '<html><body></body></html>') 
526          done = parser.close() 
527   
             # A full PUBLIC doctype is expected to reach doctype() as the root name,
             # the public identifier and the system identifier, in that order.
528          self.assertEqual("DONE", done) 
529          self.assertEqual([ 
530              ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")), 
531              ("start", "html"), ("start", "body"), 
532              ("end", "body"), ("end", "html")], events) 
533   
535          events = [] 
             # `events` records every callback the target receives before it raises.
             # NOTE(review): the `def` header of this test is not visible in this extract.
             # start() and end() raise different exception types, so the handlers
             # below can tell which callback's error propagated out of feed().
536          class Target(object): 
537              def start(self, tag, attrib): 
538                  events.append(("start", tag)) 
539                  raise ValueError("START") 
 540              def end(self, tag): 
541                  events.append(("end", tag)) 
542                  raise TypeError("END") 
543              def close(self): 
544                  return "DONE" 
545   
546          parser = self.etree.HTMLParser(target=Target()) 
             # Expected outcome: the ValueError from start() surfaces. A TypeError
             # (from end()) or no exception at all fails the test.
             # NOTE(review): the bare `assert` statements are stripped under -O, and
             # `self.assertTrue(False, msg)` could be `self.fail(msg)`.
547          try: 
548              parser.feed('<html><body>') 
549              parser.feed('</body></html>') 
550          except ValueError as exc: 
551              assert "START" in str(exc) 
552          except TypeError as exc: 
553              assert "END" in str(exc) 
554              self.assertTrue(False, "wrong exception raised") 
555          else: 
556              self.assertTrue(False, "no exception raised") 
557   
             # start('html') must have been seen; end('html') must never be reached.
558          self.assertTrue(("start", "html") in events, events) 
559          self.assertTrue(("end", "html") not in events, events) 
560   
562          events = [] 
             # `events` records every callback the target receives before it raises.
             # NOTE(review): the `def` header of this test is not visible in this extract.
             # start() and end() raise different exception types, so the handlers
             # below can tell which callback's error propagated out of fromstring().
563          class Target(object): 
564              def start(self, tag, attrib): 
565                  events.append(("start", tag)) 
566                  raise ValueError("START") 
 567              def end(self, tag): 
568                  events.append(("end", tag)) 
569                  raise TypeError("END") 
570              def close(self): 
571                  return "DONE" 
572   
573          parser = self.etree.HTMLParser(target=Target()) 
             # Same expectation as with feed(), but the whole document goes through
             # etree.fromstring() in one call: the ValueError from start() must surface.
             # NOTE(review): the bare `assert` statements are stripped under -O, and
             # `self.assertTrue(False, msg)` could be `self.fail(msg)`.
574          try: 
575              self.etree.fromstring('<html><body></body></html>', parser) 
576          except ValueError as exc: 
577              assert "START" in str(exc), str(exc) 
578          except TypeError as exc: 
579              assert "END" in str(exc), str(exc) 
580              self.assertTrue(False, "wrong exception raised") 
581          else: 
582              self.assertTrue(False, "no exception raised") 
583   
             # start('html') must have been seen; end('html') must never be reached.
584          self.assertTrue(("start", "html") in events, events) 
585          self.assertTrue(("end", "html") not in events, events) 
586   
588          doc = html.Element('html').getroottree() 
             # Starting from the tree of a fresh <html> element, set the public ID and
             # system URL through docinfo.
             # NOTE(review): the `def` header of this test is not visible in this extract.
589          doc.docinfo.public_id = "-//W3C//DTD XHTML 1.0 Strict//EN" 
590          doc.docinfo.system_url = \ 
591              "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" 
             # docinfo.doctype must now render the complete DOCTYPE declaration.
592          self.assertEqual(doc.docinfo.doctype, 
593                           '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">') 
             # Serializing the tree must emit that declaration, then the root element
             # carrying the XHTML namespace declaration.
594          self.assertEqual(self.etree.tostring(doc), 
595                           _bytes('''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> 
596  <html xmlns="http://www.w3.org/1999/xhtml"></html>''')) 
 597   
608   
618   
628   
634   
640   
641   
643      suite = unittest.TestSuite() 
         # Collect every test of HtmlParserTestCase into the suite and return it.
         # NOTE(review): unittest.makeSuite is deprecated since Python 3.11 and removed
         # in 3.13; unittest.defaultTestLoader.loadTestsFromTestCase is the replacement.
         # NOTE(review): the `def` header of this function is not visible in this extract.
644      suite.addTests([unittest.makeSuite(HtmlParserTestCase)]) 
645      return suite 
 646   
647   
648  if __name__ == '__main__': 
         # Running this file directly executes no tests; it only prints a hint
         # pointing at the test.py runner.
649      print('to test use test.py %s' % __file__) 
650