A cleanup tool for HTML.
|
character
|
unichr(i)
Return a Unicode string of one character with ordinal i; 0 <= i <= 0x10ffff. |
|
|
|
|
_is_image_dataurl(...)
search(string[, pos[, endpos]]) --> match object or None.
Scan through string looking for a match, and return a corresponding
match object instance. Return None if no position in the string matches. |
source code
|
|
|
|
_is_possibly_malicious_scheme(...)
search(string[, pos[, endpos]]) --> match object or None.
Scan through string looking for a match, and return a corresponding
match object instance. Return None if no position in the string matches. |
source code
|
|
|
|
|
|
|
_substitute_whitespace(...)
sub(repl, string[, count = 0]) --> newstring
Return the string obtained by replacing the leftmost non-overlapping
occurrences of pattern in string by the replacement repl. |
source code
|
|
|
|
|
|
|
autolink(el,
link_regexes=_link_regexes,
avoid_elements=_avoid_elements,
avoid_hosts=_avoid_hosts,
avoid_classes=_avoid_classes)
Turn any URLs into links. |
source code
|
|
|
|
_link_text(text,
link_regexes,
avoid_hosts,
factory) |
source code
|
|
|
|
|
|
|
word_break(el,
max_width=40,
avoid_elements=_avoid_word_break_elements,
avoid_classes=_avoid_word_break_classes,
break_character=unichr(0x200b))
Breaks any long words found in the body of the text (not attributes). |
source code
|
|
|
|
|
|
|
_break_text(text,
max_width,
break_character) |
source code
|
|
|
|
_insert_break(word,
width,
break_character) |
source code
|
|
|
|
basestring = str, bytes
|
|
|
_css_javascript_re = re.compile(r'(?is)expression\s*\(.*?\)')
|
|
|
_css_import_re = re.compile(r'(?i)@\s*import')
|
|
|
_conditional_comment_re = re.compile(r'(?is)\[if[\s\n\r]+.*?\]...
|
|
|
_find_styled_elements = descendant-or-self::*[@style]
|
|
|
_find_external_links = descendant-or-self::a [normalize-space...
|
|
|
clean = Cleaner()
|
|
|
_link_regexes = [re.compile(r'(?i)(?P<body>https?://(?P<host>[...
|
|
|
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select'...
|
|
|
_avoid_hosts = [re.compile(r'(?i)^localhost'), re.compile(r'(?...
|
|
|
_avoid_classes = ['nolink']
|
|
|
_avoid_word_break_elements = ['pre', 'textarea', 'code']
|
|
|
_avoid_word_break_classes = ['nobreak']
|
|
|
_break_prefer_re = re.compile(r'(?i)[^a-z]')
|
|
|
__package__ = 'lxml.html'
|
|
|
__test__ = {}
|