1  """External interface to the BeautifulSoup HTML parser. 
  2  """ 
  3   
  4  __all__ = ["fromstring", "parse", "convert_tree"] 
  5   
  6  import re 
  7  from lxml import etree, html 
  8   
  9  try: 
 10      from bs4 import ( 
 11          BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, 
 12          Declaration, Doctype) 
 13      _DECLARATION_OR_DOCTYPE = (Declaration, Doctype) 
 14  except ImportError: 
 15      from BeautifulSoup import ( 
 16          BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, 
 17          Declaration) 
 18      _DECLARATION_OR_DOCTYPE = Declaration 
 19   
 20   
 21 -def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs): 
  22      """Parse a string of HTML data into an Element tree using the 
 23      BeautifulSoup parser. 
 24   
 25      Returns the root ``<html>`` Element of the tree. 
 26   
 27      You can pass a different BeautifulSoup parser through the 
 28      `beautifulsoup` keyword, and a diffent Element factory function 
 29      through the `makeelement` keyword.  By default, the standard 
 30      ``BeautifulSoup`` class and the default factory of `lxml.html` are 
 31      used. 
 32      """ 
 33      return _parse(data, beautifulsoup, makeelement, **bsargs) 
  34   
 35   
 36 -def parse(file, beautifulsoup=None, makeelement=None, **bsargs): 
  37      """Parse a file into an ElemenTree using the BeautifulSoup parser. 
 38   
 39      You can pass a different BeautifulSoup parser through the 
 40      `beautifulsoup` keyword, and a diffent Element factory function 
 41      through the `makeelement` keyword.  By default, the standard 
 42      ``BeautifulSoup`` class and the default factory of `lxml.html` are 
 43      used. 
 44      """ 
 45      if not hasattr(file, 'read'): 
 46          file = open(file) 
 47      root = _parse(file, beautifulsoup, makeelement, **bsargs) 
 48      return etree.ElementTree(root) 
  49   
 52      """Convert a BeautifulSoup tree to a list of Element trees. 
 53   
 54      Returns a list instead of a single root Element to support 
 55      HTML-like soup with more than one root element. 
 56   
 57      You can pass a different Element factory through the `makeelement` 
 58      keyword. 
 59      """ 
 60      root = _convert_tree(beautiful_soup_tree, makeelement) 
 61      children = root.getchildren() 
 62      for child in children: 
 63          root.remove(child) 
 64      return children 
  65   
 66   
 67   
 68   
 69 -def _parse(source, beautifulsoup, makeelement, **bsargs): 
  70      if beautifulsoup is None: 
 71          beautifulsoup = BeautifulSoup 
 72      if hasattr(beautifulsoup, "HTML_ENTITIES"):   
 73          if 'convertEntities' not in bsargs: 
 74              bsargs['convertEntities'] = 'html' 
 75      if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"):   
 76          if 'features' not in bsargs: 
 77              bsargs['features'] = 'html.parser'   
 78      tree = beautifulsoup(source, **bsargs) 
 79      root = _convert_tree(tree, makeelement) 
 80       
 81      if len(root) == 1 and root[0].tag == "html": 
 82          return root[0] 
 83      root.tag = "html" 
 84      return root 
  85   
 86   
 87  _parse_doctype_declaration = re.compile( 
 88      r'(?:\s|[<!])*DOCTYPE\s*HTML' 
 89      r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?' 
 90      r'(?:\s+(\'[^\']*\'|"[^"]*"))?', 
 91      re.IGNORECASE).match 
 95       
 97          self.name = 'html' 
 98          self.attrs = [] 
 99          self.contents = contents 
 100   
 103   
106      if makeelement is None: 
107          makeelement = html.html_parser.makeelement 
108   
109       
110       
111       
112       
113       
114       
115      first_element_idx = last_element_idx = None 
116      html_root = declaration = None 
117      for i, e in enumerate(beautiful_soup_tree): 
118          if isinstance(e, Tag): 
119              if first_element_idx is None: 
120                  first_element_idx = i 
121              last_element_idx = i 
122              if html_root is None and e.name and e.name.lower() == 'html': 
123                  html_root = e 
124          elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE): 
125              declaration = e 
126   
127       
128       
129       
130       
131       
132      if first_element_idx is None: 
133          pre_root = post_root = [] 
134          roots = beautiful_soup_tree.contents 
135      else: 
136          pre_root = beautiful_soup_tree.contents[:first_element_idx] 
137          roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1] 
138          post_root = beautiful_soup_tree.contents[last_element_idx+1:] 
139   
140       
141      if html_root is not None: 
142           
143          i = roots.index(html_root) 
144          html_root.contents = roots[:i] + html_root.contents + roots[i+1:] 
145      else: 
146           
147          html_root = _PseudoTag(roots) 
148   
149      convert_node = _init_node_converters(makeelement) 
150   
151       
152      res_root = convert_node(html_root) 
153      prev = res_root 
154      for e in reversed(pre_root): 
155          converted = convert_node(e) 
156          if converted is not None: 
157              prev.addprevious(converted) 
158              prev = converted 
159   
160       
161      prev = res_root 
162      for e in post_root: 
163          converted = convert_node(e) 
164          if converted is not None: 
165              prev.addnext(converted) 
166              prev = converted 
167   
168      if declaration is not None: 
169          try: 
170               
171              doctype_string = declaration.output_ready() 
172          except AttributeError: 
173              doctype_string = declaration.string 
174   
175          match = _parse_doctype_declaration(doctype_string) 
176          if not match: 
177               
178               
179              pass 
180          else: 
181              external_id, sys_uri = match.groups() 
182              docinfo = res_root.getroottree().docinfo 
183               
184              docinfo.public_id = external_id and external_id[1:-1] 
185              docinfo.system_url = sys_uri and sys_uri[1:-1] 
186   
187      return res_root 
 188   
191      converters = {} 
192      ordered_node_types = [] 
193   
194      def converter(*types): 
195          def add(handler): 
196              for t in types: 
197                  converters[t] = handler 
198                  ordered_node_types.append(t) 
199              return handler 
 200          return add 
201   
202      def find_best_converter(node): 
203          for t in ordered_node_types: 
204              if isinstance(node, t): 
205                  return converters[t] 
206          return None 
207   
208      def convert_node(bs_node, parent=None): 
209           
210          try: 
211              handler = converters[type(bs_node)] 
212          except KeyError: 
213              handler = converters[type(bs_node)] = find_best_converter(bs_node) 
214          if handler is None: 
215              return None 
216          return handler(bs_node, parent) 
217   
218      def map_attrs(bs_attrs): 
219          if isinstance(bs_attrs, dict):   
220              attribs = {} 
221              for k, v in bs_attrs.items(): 
222                  if isinstance(v, list): 
223                      v = " ".join(v) 
224                  attribs[k] = unescape(v) 
225          else: 
226              attribs = dict((k, unescape(v)) for k, v in bs_attrs) 
227          return attribs 
228   
229      def append_text(parent, text): 
230          if len(parent) == 0: 
231              parent.text = (parent.text or '') + text 
232          else: 
233              parent[-1].tail = (parent[-1].tail or '') + text 
234   
235       
236   
237      @converter(Tag, _PseudoTag) 
238      def convert_tag(bs_node, parent): 
239          attrs = bs_node.attrs 
240          if parent is not None: 
241              attribs = map_attrs(attrs) if attrs else None 
242              res = etree.SubElement(parent, bs_node.name, attrib=attribs) 
243          else: 
244              attribs = map_attrs(attrs) if attrs else {} 
245              res = makeelement(bs_node.name, attrib=attribs) 
246   
247          for child in bs_node: 
248               
249              try: 
250                  handler = converters[type(child)] 
251              except KeyError: 
252                  pass 
253              else: 
254                  if handler is not None: 
255                      handler(child, res) 
256                  continue 
257              convert_node(child, res) 
258          return res 
259   
260      @converter(Comment) 
261      def convert_comment(bs_node, parent): 
262          res = html.HtmlComment(bs_node) 
263          if parent is not None: 
264              parent.append(res) 
265          return res 
266   
267      @converter(ProcessingInstruction) 
268      def convert_pi(bs_node, parent): 
269          if bs_node.endswith('?'): 
270               
271               
272              bs_node = bs_node[:-1] 
273          res = etree.ProcessingInstruction(*bs_node.split(' ', 1)) 
274          if parent is not None: 
275              parent.append(res) 
276          return res 
277   
278      @converter(NavigableString) 
279      def convert_text(bs_node, parent): 
280          if parent is not None: 
281              append_text(parent, unescape(bs_node)) 
282          return None 
283   
284      return convert_node 
285   
286   
287   
288   
289  try: 
290      from html.entities import name2codepoint   
291  except ImportError: 
292      from htmlentitydefs import name2codepoint 
293   
294   
# ``handle_entities(repl, text)`` substitutes every ``&name;`` reference in
# ``text``; group 1 of each match is the bare entity name.
handle_entities = re.compile(r"&(\w+);").sub


try:
    unichr
except NameError:
    # No ``unichr`` builtin is available here: fall back to ``chr``.
    unichr = chr
306      if not string: 
307          return '' 
308       
309      def unescape_entity(m): 
310          try: 
311              return unichr(name2codepoint[m.group(1)]) 
312          except KeyError: 
313              return m.group(0)   
 314      return handle_entities(unescape_entity, string) 
315