1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-02-21 07:12:51 +08:00

Refactor IDParser to search for elements by any attribute not just ID

This commit is contained in:
Nick Daniels 2012-12-19 14:21:14 +00:00
parent 70e9b6fd47
commit 11cdd6a765

View File

@ -201,10 +201,11 @@ def htmlentity_transform(matchobj):
return (u'&%s;' % entity) return (u'&%s;' % entity)
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(compat_html_parser.HTMLParser): class AttrParser(compat_html_parser.HTMLParser):
"""Modified HTMLParser that isolates a tag with the specified id""" """Modified HTMLParser that isolates a tag with the specified attribute"""
def __init__(self, id): def __init__(self, attribute, value):
self.id = id self.attribute = attribute
self.value = value
self.result = None self.result = None
self.started = False self.started = False
self.depth = {} self.depth = {}
@ -229,7 +230,7 @@ class IDParser(compat_html_parser.HTMLParser):
attrs = dict(attrs) attrs = dict(attrs)
if self.started: if self.started:
self.find_startpos(None) self.find_startpos(None)
if 'id' in attrs and attrs['id'] == self.id: if self.attribute in attrs and attrs[self.attribute] == self.value:
self.result = [tag] self.result = [tag]
self.started = True self.started = True
self.watch_startpos = True self.watch_startpos = True
@ -267,8 +268,12 @@ class IDParser(compat_html_parser.HTMLParser):
return '\n'.join(lines).strip() return '\n'.join(lines).strip()
def get_element_by_id(id, html): def get_element_by_id(id, html):
"""Return the content of the tag with the specified id in the passed HTML document""" """Return the content of the tag with the specified ID in the passed HTML document"""
parser = IDParser(id) return get_element_by_attribute("id", id, html)
def get_element_by_attribute(attribute, value, html):
"""Return the content of the tag with the specified attribute in the passed HTML document"""
parser = AttrParser(attribute, value)
try: try:
parser.loads(html) parser.loads(html)
except compat_html_parser.HTMLParseError: except compat_html_parser.HTMLParseError: