From 0269764b07e1b35422a9820fd0934213569b5b55 Mon Sep 17 00:00:00 2001 From: Jonathon Padfield Date: Sat, 11 Feb 2012 16:17:23 +1100 Subject: [PATCH 1/2] Changed GenericIE so all regex matches on a page are used to find video urls --- youtube-dl | 66 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/youtube-dl b/youtube-dl index be599a2b2..e3710a8db 100755 --- a/youtube-dl +++ b/youtube-dl @@ -2170,27 +2170,15 @@ class GenericIE(InfoExtractor): self.report_extraction(video_id) # Start with something easy: JW Player in SWFObject - mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) - if mobj is None: + matches = [mobj for mobj in re.finditer(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)] + + if len(matches) == 0: # Broaden the search a little bit - mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) - if mobj is None: + matches = [mobj for mobj in re.finditer(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)] + if len(matches) == 0: self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) return - # It's possible that one of the regexes - # matched, but returned an empty group: - if mobj.group(1) is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) - return - - video_url = urllib.unquote(mobj.group(1)) - video_id = os.path.basename(video_url) - - # here's a fun little line of code for you: - video_extension = os.path.splitext(video_id)[1][1:] - video_id = os.path.splitext(video_id)[0] - # it's tempting to parse this further, but you would # have to take into account all the variations like # Video Title - Site Name @@ -2212,21 +2200,35 @@ class GenericIE(InfoExtractor): return video_uploader = mobj.group(1).decode('utf-8') - try: - # Process video information - self._downloader.process_info({ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader, - 'upload_date': u'NA', - 'title': video_title, - 'stitle': simple_title, - 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, - }) - except UnavailableVideoError, err: - self._downloader.trouble(u'\nERROR: unable to download video') + for mobj in matches: + # It's possible that one of the regexes + # matched, but returned an empty group: + if mobj.group(1) is None: + self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + continue + + video_url = urllib.unquote(mobj.group(1)) + video_id = os.path.basename(video_url) + + # here's a fun little line of code for you: + video_extension = os.path.splitext(video_id)[1][1:] + video_id = os.path.splitext(video_id)[0] + + try: + # Process video information + self._downloader.process_info({ + 'id': video_id.decode('utf-8'), + 'url': video_url.decode('utf-8'), + 'uploader': video_uploader, + 'upload_date': u'NA', + 'title': video_title, + 'stitle': simple_title, + 'ext': video_extension.decode('utf-8'), + 'format': u'NA', + 'player_url': None, + }) + except UnavailableVideoError, err: + self._downloader.trouble(u'\nERROR: unable to download video') class YoutubeSearchIE(InfoExtractor): From 20b2a5596886dc8d95cef1cc248fe22a6e2c2143 Mon Sep 17 00:00:00 2001 From: Jonathon Padfield Date: Sun, 26 Feb 2012 11:00:52 +1100 Subject: [PATCH 2/2] Loosened regex on DailyMotion videos to allow for matching of private video urls. --- youtube-dl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube-dl b/youtube-dl index e3710a8db..07be181f0 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1593,7 +1593,7 @@ class MetacafeIE(InfoExtractor): class DailymotionIE(InfoExtractor): """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/(.+)' IE_NAME = u'dailymotion' def __init__(self, downloader=None): @@ -1608,7 +1608,7 @@ class DailymotionIE(InfoExtractor): self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id) def _real_extract(self, url): - # Extract id and simplified title from URL + # Extract id mobj = re.match(self._VALID_URL, url) if mobj is None: self._downloader.trouble(u'ERROR: invalid URL: %s' % url)