From 25537736eb6ff6feaaf98caa2ef4d14f1274e8ae Mon Sep 17 00:00:00 2001 From: Johny Mo Swag Date: Thu, 2 May 2013 15:14:30 -0700 Subject: [PATCH] Revert "Merge remote-tracking branch 'upstream/master'" This reverts commit 71525f0a7cfd0a0843717156469bc821c29b73f9, reversing changes made to 57cf2636d7ee60b2cfac0abcbf8292f3a56779b3. --- test/tests.json | 4 +- youtube_dl/InfoExtractors.py | 259 +++++++++++++++++++++++++++-------- 2 files changed, 201 insertions(+), 62 deletions(-) diff --git a/test/tests.json b/test/tests.json index 3c84dcf15..4eba27846 100644 --- a/test/tests.json +++ b/test/tests.json @@ -112,7 +112,7 @@ { "name": "Escapist", "url": "http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate", - "file": "6618-Breaking-Down-Baldurs-Gate.mp4", + "file": "6618-Breaking-Down-Baldurs-Gate.flv", "md5": "c6793dbda81388f4264c1ba18684a74d", "skip": "Fails with timeout on Travis" }, @@ -344,7 +344,7 @@ "file": "17258355236.mp4", "md5": "7c6a514d691b034ccf8567999e9e88a3", "info_dict": { - "title": "Calling all Pris! - A sample video from LeeAnn. (If you need an idea..." + "title": "A sample video from LeeAnn. (If you need an idea..." } }, { diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 80abbd711..f5bc41c68 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1069,7 +1069,13 @@ class VimeoIE(InfoExtractor): # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url, None, std_headers) - webpage = self._download_webpage(request, video_id) + try: + self.report_download_webpage(video_id) + webpage_bytes = compat_urllib_request.urlopen(request).read() + webpage = webpage_bytes.decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) + return # Now we begin extracting as much information as we can from what we # retrieved. First we extract the information common to all extractors, @@ -1676,6 +1682,10 @@ class YoutubePlaylistIE(InfoExtractor): """Receives a URL and returns True if suitable for this IE.""" return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def report_download_page(self, playlist_id, pagenum): + """Report attempt to download playlist page with given number.""" + self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) + def _real_extract(self, url): # Extract playlist id mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -1689,8 +1699,14 @@ class YoutubePlaylistIE(InfoExtractor): videos = [] while True: + self.report_download_page(playlist_id, page_num) + url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) - page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) + try: + page = compat_urllib_request.urlopen(url).read().decode('utf8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) + return try: response = json.loads(page) @@ -1729,6 +1745,10 @@ class YoutubeChannelIE(InfoExtractor): _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' IE_NAME = u'youtube:channel' + def report_download_page(self, channel_id, pagenum): + """Report attempt to download channel page with given number.""" + self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)) + def extract_videos_from_page(self, page): ids_in_page = [] for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page): @@ -1748,9 +1768,14 @@ class YoutubeChannelIE(InfoExtractor): video_ids = [] pagenum = 1 + self.report_download_page(channel_id, pagenum) url = self._TEMPLATE_URL % (channel_id, pagenum) - page = self._download_webpage(url, channel_id, - u'Downloading page #%s' % pagenum) + request = compat_urllib_request.Request(url) + try: + page = compat_urllib_request.urlopen(request).read().decode('utf8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) + return # Extract video identifiers ids_in_page = self.extract_videos_from_page(page) @@ -1761,9 +1786,14 @@ class YoutubeChannelIE(InfoExtractor): while True: pagenum = pagenum + 1 + self.report_download_page(channel_id, pagenum) url = self._MORE_PAGES_URL % (pagenum, channel_id) - page = self._download_webpage(url, channel_id, - u'Downloading page #%s' % pagenum) + request = compat_urllib_request.Request(url) + try: + page = compat_urllib_request.urlopen(request).read().decode('utf8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) + return page = json.loads(page) @@ -1790,6 +1820,11 @@ class YoutubeUserIE(InfoExtractor): _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' IE_NAME = u'youtube:user' + def report_download_page(self, username, start_index): + """Report attempt to download user page.""" + self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' % + (username, start_index, start_index + self._GDATA_PAGE_SIZE)) + def _real_extract(self, url): # Extract username mobj = re.match(self._VALID_URL, url) @@ -1809,10 +1844,15 @@ class YoutubeUserIE(InfoExtractor): while True: start_index = pagenum * self._GDATA_PAGE_SIZE + 1 + self.report_download_page(username, start_index) - gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) - page = self._download_webpage(gdata_url, username, - u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)) + request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)) + + try: + page = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) + return # Extract video identifiers ids_in_page = [] @@ -1846,6 +1886,11 @@ class BlipTVUserIE(InfoExtractor): _PAGE_SIZE = 12 IE_NAME = u'blip.tv:user' + def report_download_page(self, username, pagenum): + """Report attempt to download user page.""" + self.to_screen(u'user %s: Downloading video ids from page %d' % + (username, pagenum)) + def _real_extract(self, url): # Extract username mobj = re.match(self._VALID_URL, url) @@ -1857,9 +1902,15 @@ class BlipTVUserIE(InfoExtractor): page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' - page = self._download_webpage(url, username, u'Downloading user page') - mobj = re.search(r'data-users-id="([^"]+)"', page) - page_base = page_base % mobj.group(1) + request = compat_urllib_request.Request(url) + + try: + page = compat_urllib_request.urlopen(request).read().decode('utf-8') + mobj = re.search(r'data-users-id="([^"]+)"', page) + page_base = page_base % mobj.group(1) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) + return # Download video ids using BlipTV Ajax calls. Result size per @@ -1871,9 +1922,14 @@ class BlipTVUserIE(InfoExtractor): pagenum = 1 while True: + self.report_download_page(username, pagenum) url = page_base + "&page=" + str(pagenum) - page = self._download_webpage(url, username, - u'Downloading video ids from page %d' % pagenum) + request = compat_urllib_request.Request( url ) + try: + page = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download webpage: %s' % str(err)) + return # Extract video identifiers ids_in_page = [] @@ -2232,6 +2288,12 @@ class ComedyCentralIE(InfoExtractor): """Receives a URL and returns True if suitable for this IE.""" return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def report_config_download(self, episode_id, media_id): + self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id)) + + def report_index_download(self, episode_id): + self.to_screen(u'%s: Downloading show index' % episode_id) + def _print_formats(self, formats): print('Available formats:') for x in formats: @@ -2265,8 +2327,15 @@ class ComedyCentralIE(InfoExtractor): else: epTitle = mobj.group('episode') + req = compat_urllib_request.Request(url) self.report_extraction(epTitle) - webpage = self._download_webpage(url, epTitle) + try: + htmlHandle = compat_urllib_request.urlopen(req) + html = htmlHandle.read() + webpage = html.decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) + return if dlNewest: url = htmlHandle.geturl() mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -2294,9 +2363,12 @@ class ComedyCentralIE(InfoExtractor): uri = mMovieParams[0][1] indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri}) - indexXml = self._download_webpage(indexUrl, epTitle, - u'Downloading show index', - u'unable to download episode index') + self.report_index_download(epTitle) + try: + indexXml = compat_urllib_request.urlopen(indexUrl).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download episode index: ' + compat_str(err)) + return results = [] @@ -2311,8 +2383,13 @@ class ComedyCentralIE(InfoExtractor): configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' + compat_urllib_parse.urlencode({'uri': mediaId})) - configXml = self._download_webpage(configUrl, epTitle, - u'Downloading configuration for %s' % shortMediaId) + configReq = compat_urllib_request.Request(configUrl) + self.report_config_download(epTitle, shortMediaId) + try: + configXml = compat_urllib_request.urlopen(configReq).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) + return cdoc = xml.etree.ElementTree.fromstring(configXml) turls = [] @@ -2369,6 +2446,9 @@ class EscapistIE(InfoExtractor): _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P[^/]+)/(?P[^/?]+)[/?]?.*$' IE_NAME = u'escapist' + def report_config_download(self, showName): + self.to_screen(u'%s: Downloading configuration' % showName) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -2378,7 +2458,14 @@ class EscapistIE(InfoExtractor): videoId = mobj.group('episode') self.report_extraction(showName) - webPage = self._download_webpage(url, showName) + try: + webPage = compat_urllib_request.urlopen(url) + webPageBytes = webPage.read() + m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type']) + webPage = webPageBytes.decode(m.group(1) if m else 'utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download webpage: ' + compat_str(err)) + return descMatch = re.search('video|app)/ #If the page is only for videos or for a game (?P\d+)/? @@ -3488,13 +3627,14 @@ class WorldStarHipHopIE(InfoExtractor): def _real_extract(self, url): _src_url = r'so\.addVariable\("file","(.*?)"\)' - m = re.match(self._VALID_URL, url) - video_id = m.group('id') - - webpage_src = self._download_webpage(url, video_id) + webpage_src = compat_urllib_request.urlopen(url).read() + webpage_src = webpage_src.decode('utf-8') mobj = re.search(_src_url, webpage_src) + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + if mobj is not None: video_url = mobj.group(1) if 'mp4' in video_url: @@ -3848,13 +3988,12 @@ class KeekIE(InfoExtractor): return [info] class TEDIE(InfoExtractor): - _VALID_URL=r'''http://www\.ted\.com/ + _VALID_URL=r'''http://www.ted.com/ ( ((?Pplaylists)/(?P\d+)) # We have a playlist | ((?Ptalks)) # We have a simple talk ) - (/lang/(.*?))? # The url may contain the language /(?P\w+) # Here goes the name and then ".html" ''' @@ -4105,7 +4244,7 @@ class ARDIE(InfoExtractor): return [info] class TumblrIE(InfoExtractor): - _VALID_URL = r'http://(?P.*?)\.tumblr\.com/((post)|(video))/(?P\d*)/(.*?)' + _VALID_URL = r'http://(?P.*?).tumblr.com/((post)|(video))/(?P\d*)/(.*?)' def _real_extract(self, url): m_url = re.match(self._VALID_URL, url) @@ -4115,7 +4254,7 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage = self._download_webpage(url, video_id) - re_video = r'src=\\x22(?Phttp://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P.*?)\\x22' % (blog, video_id) + re_video = r'src=\\x22(?Phttp://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P.*?)\\x22' % (blog, video_id) video = re.search(re_video, webpage) if video is None: self.to_screen("No video founded") @@ -4128,8 +4267,8 @@ class TumblrIE(InfoExtractor): # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos - re_title = r'(?P<title>.*?)' - title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title')) + re_title = r'(.*?) - (?P<title>.*?)' + title = unescapeHTML(re.search(re_title, webpage).group('title')) return [{'id': video_id, 'url': video_url, @@ -4139,7 +4278,7 @@ class TumblrIE(InfoExtractor): }] class BandcampIE(InfoExtractor): - _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P.*)' + _VALID_URL = r'http://.*?.bandcamp.com/track/(?P<title>.*)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -4164,7 +4303,7 @@ class BandcampIE(InfoExtractor): mp3_info = info[u'downloads'][u'mp3-320'] # If we try to use this url it says the link has expired initial_url = mp3_info[u'url'] - re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$' + re_url = r'(?P<server>http://(.*?).bandcamp.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$' m_url = re.match(re_url, initial_url) #We build the url we will use to get the final track url # This url is build in Bandcamp in the script download_bunde_*.js