From fe52f9f9565dcb42e79c7ada654c95151cda97c4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 19 Feb 2014 11:35:35 +0100 Subject: [PATCH 01/58] Document prefered config location (#2407) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bd091be86..9ae838b58 100644 --- a/README.md +++ b/README.md @@ -246,7 +246,7 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\\youtube-dl.conf`. +You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\\youtube-dl.conf`. # OUTPUT TEMPLATE From 0d4b4865cc0cddbebf0093209f51d44a7d765e3e Mon Sep 17 00:00:00 2001 From: pulpe Date: Wed, 19 Feb 2014 16:13:45 +0100 Subject: [PATCH 02/58] [README.md] correct the test command --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9ae838b58..897603e3e 100644 --- a/README.md +++ b/README.md @@ -357,7 +357,7 @@ If you want to create a build of youtube-dl yourself, you'll need ### Adding support for a new site -If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor`, add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py Test_Download.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/). +If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor`, add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py TestDownload.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/). # BUGS From ccb079ee679ffe09694e5e0be3034db358478348 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Thu, 20 Feb 2014 01:42:15 +0700 Subject: [PATCH 03/58] [xhamster] Fix and improve --- youtube_dl/extractor/xhamster.py | 135 ++++++++++++++++--------------- 1 file changed, 69 insertions(+), 66 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index f6c515f7f..a75e1380d 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -4,51 +4,51 @@ import re from .common import InfoExtractor from ..utils import ( - compat_urllib_parse, ExtractorError, + unified_strdate, + str_to_int, + int_or_none, + parse_duration, ) class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" - _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P[0-9]+)/(?P.+?)\.html(?:\?.*)?' - _TESTS = [{ - 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', - 'file': '1509445.mp4', - 'md5': '8281348b8d3c53d39fffb377d24eac4e', - 'info_dict': { - "upload_date": "20121014", - "uploader_id": "Ruseful2011", - "title": "FemaleAgent Shy beauty takes the bait", - "age_limit": 18, + _VALID_URL = r'http://(?:www\.)?xhamster\.com/movies/(?P[0-9]+)/(?P.+?)\.html(?:\?.*)?' + _TESTS = [ + { + 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', + 'md5': '8281348b8d3c53d39fffb377d24eac4e', + 'info_dict': { + 'id': '1509445', + 'ext': 'mp4', + 'title': 'FemaleAgent Shy beauty takes the bait', + 'upload_date': '20121014', + 'uploader_id': 'Ruseful2011', + 'duration': 893, + 'age_limit': 18, + } + }, + { + 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', + 'md5': '4cbd8d56708ecb4fb4124c23e4acb81a', + 'info_dict': { + 'id': '2221348', + 'ext': 'mp4', + 'title': 'Britney Spears Sexy Booty', + 'upload_date': '20130914', + 'uploader_id': 'jojo747400', + 'duration': 200, + 'age_limit': 18, + } } - }, - { - 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', - 'file': '2221348.flv', - 'md5': 'e767b9475de189320f691f49c679c4c7', - 'info_dict': { - "upload_date": "20130914", - "uploader_id": "jojo747400", - "title": "Britney Spears Sexy Booty", - "age_limit": 18, - } - }] + ] def _real_extract(self,url): def extract_video_url(webpage): - mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': \'(?P[^\']+)\',', webpage) - if mobj is None: - raise ExtractorError('Unable to extract media URL') - if len(mobj.group('server')) == 0: - return compat_urllib_parse.unquote(mobj.group('file')) - else: - return mobj.group('server')+'/key='+mobj.group('file') - - def extract_mp4_video_url(webpage): - mp4 = re.search(r'', webpage) if mp4 is None: - return None + raise ExtractorError('Unable to extract media URL') else: return mp4.group(1) @@ -62,50 +62,48 @@ class XHamsterIE(InfoExtractor): mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo) webpage = self._download_webpage(mrss_url, video_id) - video_title = self._html_search_regex( - r'(?P<title>.+?) - xHamster\.com', webpage, 'title') + title = self._html_search_regex(r'(?P<title>.+?) - xHamster\.com', webpage, 'title') # Only a few videos have an description mobj = re.search(r'Description: ([^<]+)', webpage) - video_description = mobj.group(1) if mobj else None + description = mobj.group(1) if mobj else None - mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) - if mobj: - video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') - else: - video_upload_date = None - self._downloader.report_warning('Unable to extract upload date') + upload_date = self._html_search_regex(r'hint=\'(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}\'', + webpage, 'upload date', fatal=False) + if upload_date: + upload_date = unified_strdate(upload_date) - video_uploader_id = self._html_search_regex( - r']+>(?P[^<]+)', + uploader_id = self._html_search_regex(r']+>(?P[^<]+)', webpage, 'uploader id', default='anonymous') - video_thumbnail = self._search_regex( - r'\'image\':\'(?P[^\']+)\'', - webpage, 'thumbnail', fatal=False) + thumbnail = self._html_search_regex(r'', webpage, 'thumbnail', fatal=False) + + duration = parse_duration(self._html_search_regex(r'Runtime: (\d+:\d+)', + webpage, 'duration', fatal=False)) + + view_count = self._html_search_regex(r'Views: ([^<]+)', webpage, 'view count', fatal=False) + if view_count: + view_count = str_to_int(view_count) + + mobj = re.search(r"hint='(?P\d+) Likes / (?P\d+) Dislikes'", webpage) + (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None) + + mobj = re.search(r'Comments \((?P\d+)\)', webpage) + comment_count = mobj.group('commentcount') if mobj else 0 age_limit = self._rta_search(webpage) hd = is_hd(webpage) + video_url = extract_video_url(webpage) formats = [{ 'url': video_url, 'format_id': 'hd' if hd else 'sd', - 'preference': 0, + 'preference': 1, }] - video_mp4_url = extract_mp4_video_url(webpage) - if video_mp4_url is not None: - formats.append({ - 'url': video_mp4_url, - 'ext': 'mp4', - 'format_id': 'mp4-hd' if hd else 'mp4-sd', - 'preference': 1, - }) - if not hd: - webpage = self._download_webpage( - mrss_url + '?hd', video_id, note='Downloading HD webpage') + webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage') if is_hd(webpage): video_url = extract_video_url(webpage) formats.append({ @@ -118,11 +116,16 @@ class XHamsterIE(InfoExtractor): return { 'id': video_id, - 'title': video_title, - 'formats': formats, - 'description': video_description, - 'upload_date': video_upload_date, - 'uploader_id': video_uploader_id, - 'thumbnail': video_thumbnail, + 'title': title, + 'description': description, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'like_count': int_or_none(like_count), + 'dislike_count': int_or_none(dislike_count), + 'comment_count': int_or_none(comment_count), 'age_limit': age_limit, + 'formats': formats, } From 98c4b8fa1b7527884c5ebd6a3f51abdd6eea5abc Mon Sep 17 00:00:00 2001 From: "Anthony J. Bentley" Date: Wed, 19 Feb 2014 20:02:29 -0700 Subject: [PATCH 04/58] =?UTF-8?q?Fix=20minor=20typo:=20=E2=80=9Cto=20to?= =?UTF-8?q?=E2=80=9D=20=E2=86=92=20=E2=80=9Cto=E2=80=9D.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- youtube_dl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 897603e3e..49b62f13f 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ which means you can modify it, redistribute it or use it however you like. sure that you have sufficient permissions (run with sudo if needed) -i, --ignore-errors continue on download errors, for example to - to skip unavailable videos in a playlist + skip unavailable videos in a playlist --abort-on-error Abort downloading of further videos (in the playlist or the command line) if an error occurs diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index f843036c7..70608066c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -208,7 +208,7 @@ def parseOpts(overrideArguments=None): general.add_option('-U', '--update', action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)') general.add_option('-i', '--ignore-errors', - action='store_true', dest='ignoreerrors', help='continue on download errors, for example to to skip unavailable videos in a playlist', default=False) + action='store_true', dest='ignoreerrors', help='continue on download errors, for example to skip unavailable videos in a playlist', default=False) general.add_option('--abort-on-error', action='store_false', dest='ignoreerrors', help='Abort downloading of further videos (in the playlist or the command line) if an error occurs') From 280bc5dad651728e493b3b25a672a9aaef590683 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Thu, 20 Feb 2014 18:49:39 +0700 Subject: [PATCH 05/58] [bbccouk] Add friendly contry filter error message (#2184) --- youtube_dl/extractor/bbccouk.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 69d128974..75e608f99 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -162,6 +162,11 @@ class BBCCoUkIE(SubtitlesInfoExtractor): mobj = re.match(self._VALID_URL, url) group_id = mobj.group('id') + webpage = self._download_webpage(url, group_id, 'Downloading video page') + if re.search(r'id="emp-error" class="notinuk">', webpage): + raise ExtractorError('Currently BBC iPlayer TV programmes are available to play in the UK only', + expected=True) + playlist = self._download_xml('http://www.bbc.co.uk/iplayer/playlist/%s' % group_id, group_id, 'Downloading playlist XML') From 4fc946b546c2a471774646f7da291105f8a0cb99 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 20 Feb 2014 13:14:05 +0100 Subject: [PATCH 06/58] [generic] Add support for RSS feeds (Fixes #667) --- test/test_playlists.py | 9 +++++++++ youtube_dl/extractor/generic.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/test/test_playlists.py b/test/test_playlists.py index 1de9e8ec1..25bec9f1c 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -250,5 +250,14 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], 'python language') self.assertTrue(len(result['entries']) == 15) + def test_generic_rss_feed(self): + dl = FakeYDL() + ie = GenericIE(dl) + result = ie.extract('http://www.escapistmagazine.com/rss/videos/list/1.xml') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], 'http://www.escapistmagazine.com/rss/videos/list/1.xml') + self.assertEqual(result['title'], 'Zero Punctuation') + self.assertTrue(len(result['entries']) > 10) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5bcc78bf7..30160d59d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import os import re +import xml.etree.ElementTree from .common import InfoExtractor from .youtube import YoutubeIE @@ -159,6 +160,25 @@ class GenericIE(InfoExtractor): raise ExtractorError('Invalid URL protocol') return response + def _extract_rss(self, url, video_id, doc): + playlist_title = doc.find('./channel/title').text + playlist_desc_el = doc.find('./channel/description') + playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text + + entries = [{ + '_type': 'url', + 'url': e.find('link').text, + 'title': e.find('title').text, + } for e in doc.findall('./channel/item')] + + return { + '_type': 'playlist', + 'id': url, + 'title': playlist_title, + 'description': playlist_desc, + 'entries': entries, + } + def _real_extract(self, url): parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: @@ -219,6 +239,14 @@ class GenericIE(InfoExtractor): self.report_extraction(video_id) + # Is it an RSS feed? + try: + doc = xml.etree.ElementTree.fromstring(webpage) + if doc.tag == 'rss': + return self._extract_rss(url, video_id, doc) + except xml.etree.ElementTree.ParseError: + pass + # it's tempting to parse this further, but you would # have to take into account all the variations like # Video Title - Site Name From eae16eb67b4e0c7deea4a56cae19650aab809662 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 20 Feb 2014 13:14:21 +0100 Subject: [PATCH 07/58] release 2014.02.20 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b7ea461c3..b722f8175 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.02.19.1' +__version__ = '2014.02.20' From d68f0cdb238edc552a285135bc2684b6709f4f56 Mon Sep 17 00:00:00 2001 From: m0viefreak Date: Thu, 20 Feb 2014 18:19:03 +0100 Subject: [PATCH 08/58] [youtube] decrypt signature when downloading dash manifest --- youtube_dl/extractor/youtube.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 02c5ede74..5b0d30ed1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1366,12 +1366,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # Look for the DASH manifest - dash_manifest_url_lst = video_info.get('dashmpd') - if (dash_manifest_url_lst and dash_manifest_url_lst[0] and - self._downloader.params.get('youtube_include_dash_manifest', False)): + if (self._downloader.params.get('youtube_include_dash_manifest', False)): try: + # The DASH manifest used needs to be the one from the original video_webpage. + # The one found in get_video_info seems to be using different signatures. + # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage. + # Luckily, it seems, this case uses some kind of default signature (len == 86), so the + # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here. + if age_gate: + dash_manifest_url = video_info.get('dashmpd')[0]; + else: + x = re.search(r'ytplayer\.config = ({.*});', video_webpage) + x = json.loads(x.group(1)); + dash_manifest_url = x['args']['dashmpd'] + def decrypt_sig(mobj): + s = mobj.group(1) + dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) + return '/signature/%s' % dec_s + dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url) dash_doc = self._download_xml( - dash_manifest_url_lst[0], video_id, + dash_manifest_url, video_id, note=u'Downloading DASH manifest', errnote=u'Could not download DASH manifest') for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): From dbb94fb044abae465644bb14daa45b0658ef5cf0 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Fri, 21 Feb 2014 17:19:55 +0700 Subject: [PATCH 09/58] [youtube] Fix playlist extraction (Closes #2423, #2424, #2425) --- youtube_dl/extractor/youtube.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 02c5ede74..8e768ea4f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1443,9 +1443,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): | ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) )""" - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s' + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _MORE_PAGES_INDICATOR = r'data-link-type="next"' - _VIDEO_RE = r'href="/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)' + _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)' IE_NAME = u'youtube:playlist' def _real_initialize(self): @@ -1493,29 +1493,31 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): raise ExtractorError(u'For downloading YouTube.com top lists, use ' u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) + url = self._TEMPLATE_URL % playlist_id + page = self._download_webpage(url, playlist_id) + more_widget_html = content_html = page + # Extract the video ids from the playlist pages ids = [] for page_num in itertools.count(1): - url = self._TEMPLATE_URL % (playlist_id, page_num) - page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) - matches = re.finditer(self._VIDEO_RE, page) + matches = re.finditer(self._VIDEO_RE, content_html) # We remove the duplicates and the link with index 0 # (it's not the first video of the playlist) new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') ids.extend(new_ids) - if re.search(self._MORE_PAGES_INDICATOR, page) is None: + mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) + if not mobj: break - try: - playlist_title = self._og_search_title(page) - except RegexNotFoundError: - self.report_warning( - u'Playlist page is missing OpenGraph title, falling back ...', - playlist_id) - playlist_title = self._html_search_regex( - r'

(.*?)

', page, u'title') + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num) + content_html = more['content_html'] + more_widget_html = more['load_more_widget_html'] + + playlist_title = self._html_search_regex( + r'

\s*(.*?)\s*

', page, u'title') url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, playlist_title) From da362979887b163d09d67c84c788fa16d921e4bc Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Fri, 21 Feb 2014 17:57:19 +0700 Subject: [PATCH 10/58] [wimp] Modernize and replace test --- youtube_dl/extractor/wimp.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 9a6bb0c76..79fd53e0c 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -6,14 +6,15 @@ from .common import InfoExtractor class WimpIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?wimp\.com/([^/]+)/' + _VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/' _TEST = { - 'url': 'http://www.wimp.com/deerfence/', - 'file': 'deerfence.flv', - 'md5': '8b215e2e0168c6081a1cf84b2846a2b5', + 'url': 'http://www.wimp.com/maruexhausted/', + 'md5': 'f1acced123ecb28d9bb79f2479f2b6a1', 'info_dict': { - "title": "Watch Till End: Herd of deer jump over a fence.", - "description": "These deer look as fluid as running water when they jump over this fence as a herd. This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.", + 'id': 'maruexhausted', + 'ext': 'flv', + 'title': 'Maru is exhausted.', + 'description': 'md5:57e099e857c0a4ea312542b684a869b8', } } @@ -30,4 +31,4 @@ class WimpIE(InfoExtractor): 'title': self._og_search_title(webpage), 'thumbnail': self._og_search_thumbnail(webpage), 'description': self._og_search_description(webpage), - } + } \ No newline at end of file From 43e77ca4552d9076b893a63d576d424ba1eeb3cd Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 21 Feb 2014 12:16:03 +0100 Subject: [PATCH 11/58] release 2014.02.21 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b722f8175..c2660a316 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.02.20' +__version__ = '2014.02.21' From 3489b7d26c727dac604cf9ece562139372da9bb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 21 Feb 2014 15:15:58 +0100 Subject: [PATCH 12/58] [youtube] Simplify the decryption process for the manifest urls and add a test (closes #2422) --- youtube_dl/extractor/youtube.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 49cca4c63..e1ef90e38 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -297,6 +297,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"format": "141", }, }, + # DASH manifest with encrypted signature + { + u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA', + u'info_dict': { + u'id': u'IB3lcPjvWLA', + u'ext': u'm4a', + u'title': u'Afrojack - The Spark ft. Spree Wilson', + u'description': u'md5:3199ed45ee8836572865580804d7ac0f', + u'uploader': u'AfrojackVEVO', + u'uploader_id': u'AfrojackVEVO', + u'upload_date': u'20131011', + }, + u"params": { + u'youtube_include_dash_manifest': True, + u'format': '141', + }, + }, ] @@ -1272,8 +1289,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) if not mobj: raise ValueError('Could not find vevo ID') - info = json.loads(mobj.group(1)) - args = info['args'] + ytplayer_config = json.loads(mobj.group(1)) + args = ytplayer_config['args'] # Easy way to know if the 's' value is in url_encoded_fmt_stream_map # this signatures are encrypted if 'url_encoded_fmt_stream_map' not in args: @@ -1374,11 +1391,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Luckily, it seems, this case uses some kind of default signature (len == 86), so the # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here. if age_gate: - dash_manifest_url = video_info.get('dashmpd')[0]; + dash_manifest_url = video_info.get('dashmpd')[0] else: - x = re.search(r'ytplayer\.config = ({.*});', video_webpage) - x = json.loads(x.group(1)); - dash_manifest_url = x['args']['dashmpd'] + dash_manifest_url = ytplayer_config['args']['dashmpd'] def decrypt_sig(mobj): s = mobj.group(1) dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) From f7300c5c90a99d234a2c7a6d70f5b5baa9d35046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 21 Feb 2014 16:59:10 +0100 Subject: [PATCH 13/58] [generic] Fix on python 2.6 `ParseError` is not available, it raises `xml.parsers.expat.ExpatError`. The webpage needs to be encoded. --- youtube_dl/extractor/generic.py | 5 +++-- youtube_dl/utils.py | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 30160d59d..9a2e54d14 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -13,6 +13,7 @@ from ..utils import ( compat_urllib_parse, compat_urllib_request, compat_urlparse, + compat_xml_parse_error, ExtractorError, HEADRequest, @@ -241,10 +242,10 @@ class GenericIE(InfoExtractor): # Is it an RSS feed? try: - doc = xml.etree.ElementTree.fromstring(webpage) + doc = xml.etree.ElementTree.fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) - except xml.etree.ElementTree.ParseError: + except compat_xml_parse_error: pass # it's tempting to parse this further, but you would diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 057cd20d1..471516b8f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -174,6 +174,11 @@ try: except NameError: compat_chr = chr +try: + from xml.etree.ElementTree import ParseError as compat_xml_parse_error +except ImportError: # Python 2.6 + from xml.parsers.expat import ExpatError as compat_xml_parse_error + def compat_ord(c): if type(c) is int: return c else: return ord(c) From 3eb38acb43f30c94a9b93894b1d8171f3e6fa809 Mon Sep 17 00:00:00 2001 From: David Triendl Date: Fri, 21 Feb 2014 17:28:30 +0100 Subject: [PATCH 14/58] [BR] Add "BR" extractor Extractor for videos from the Bayerischer Rundfunk Mediathek[1]. Currently only supports videos. Audio and podcasts do not work yet with this extractor. 1: http://br.de/mediathek --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/br.py | 70 ++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 youtube_dl/extractor/br.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e35287f88..989482a9b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -19,6 +19,7 @@ from .bbccouk import BBCCoUkIE from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE +from .br import BRIE from .breakcom import BreakIE from .brightcove import BrightcoveIE from .c56 import C56IE diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py new file mode 100644 index 000000000..888c5c239 --- /dev/null +++ b/youtube_dl/extractor/br.py @@ -0,0 +1,70 @@ +# coding: utf-8 + +from .common import InfoExtractor + +class BRIE(InfoExtractor): + + IE_DESC = u"Bayerischer Rundfunk Mediathek" + _VALID_URL = r"^https?://(?:www\.)?br\.de/mediathek/video/(?:sendungen/)?(?:[a-z0-9\-]+\.html)$" + _BASE_URL = u"http://www.br.de" + + _TESTS = [] + + def _real_extract(self, url): + page = self._download_webpage(url, None) + xml_url = self._search_regex(r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/mediathek/video/[a-z0-9/~_.-]+)'}\)\);", page, "XMLURL") + xml = self._download_xml(self._BASE_URL + xml_url, None) + + videos = [] + for xml_video in xml.findall("video"): + video = {} + video["id"] = xml_video.get("externalId") + video["title"] = xml_video.find("title").text + video["formats"] = self._extract_formats(xml_video.find("assets")) + video["thumbnails"] = self._extract_thumbnails(xml_video.find("teaserImage/variants")) + video["thumbnail"] = video["thumbnails"][0]["url"] + video["description"] = " ".join(xml_video.find("shareTitle").text.splitlines()) + video["uploader"] = xml_video.find("author").text + video["upload_date"] = "".join(reversed(xml_video.find("broadcastDate").text.split("."))) + video["webpage_url"] = xml_video.find("permalink").text + videos.append(video) + + if len(videos) > 1: + self._downloader.report_warning(u'found multiple videos; please' + u'report this with the video URL to http://yt-dl.org/bug') + return videos[0] + + def _extract_formats(self, assets): + vformats = [] + for asset in assets.findall("asset"): + if asset.find("downloadUrl") is None: + continue + vformat = {} + vformat["url"] = asset.find("downloadUrl").text + vformat["ext"] = asset.find("mediaType").text + vformat["format_id"] = asset.get("type") + vformat["width"] = int(asset.find("frameWidth").text) + vformat["height"] = int(asset.find("frameHeight").text) + vformat["resolution"] = "%ix%i" % (vformat["width"], vformat["height"]) + vformat["tbr"] = int(asset.find("bitrateVideo").text) + vformat["abr"] = int(asset.find("bitrateAudio").text) + vformat["vcodec"] = asset.find("codecVideo").text + vformat["container"] = vformat["ext"] + vformat["filesize"] = int(asset.find("size").text) + vformat["preference"] = vformat["quality"] = -1 + vformat["format"] = "%s container with %i Kbps %s" % (vformat["container"], vformat["tbr"], vformat["vcodec"]) + vformats.append(vformat) + self._sort_formats(vformats) + return vformats + + def _extract_thumbnails(self, variants): + thumbnails = [] + for variant in variants.findall("variant"): + thumbnail = {} + thumbnail["url"] = self._BASE_URL + variant.find("url").text + thumbnail["width"] = int(variant.find("width").text) + thumbnail["height"] = int(variant.find("height").text) + thumbnail["resolution"] = "%ix%i" % (thumbnail["width"], thumbnail["height"]) + thumbnails.append(thumbnail) + thumbnails.sort(key = lambda x: x["width"] * x["height"], reverse=True) + return thumbnails From 7928024f5747eee6435c22df8395da4ed0aef4cd Mon Sep 17 00:00:00 2001 From: David Triendl Date: Fri, 21 Feb 2014 17:48:40 +0100 Subject: [PATCH 15/58] [BR] Add basic test --- youtube_dl/extractor/br.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 888c5c239..41e16f6cb 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -8,7 +8,19 @@ class BRIE(InfoExtractor): _VALID_URL = r"^https?://(?:www\.)?br\.de/mediathek/video/(?:sendungen/)?(?:[a-z0-9\-]+\.html)$" _BASE_URL = u"http://www.br.de" - _TESTS = [] + _TESTS = [ + { + u"url": u"http://www.br.de/mediathek/video/anselm-gruen-114.html", + u"file": u"2c8d81c5-6fb7-4a74-88d4-e768e5856532.mp4", + u"md5": u"c4f83cf0f023ba5875aba0bf46860df2", + u"info_dict": { + u"title": u"Feiern und Verzichten", + u"description": u"Anselm Grün: Feiern und Verzichten", + u"uploader": u"BR/Birgit Baier", + u"upload_date": u"20140301" + } + } + ] def _real_extract(self, url): page = self._download_webpage(url, None) From 1cf563d84baf84a208f4368b9b9f5b2e2b98ddd3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 21 Feb 2014 18:19:48 +0100 Subject: [PATCH 16/58] release 2014.02.21.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c2660a316..a92faa5a7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.02.21' +__version__ = '2014.02.21.1' From 9ddfd84e41b57343adac6d0677d91750686b8bc6 Mon Sep 17 00:00:00 2001 From: Johny Mo Swag Date: Sat, 22 Feb 2014 00:11:57 -0800 Subject: [PATCH 17/58] added trutubeIE --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/trutube.py | 43 +++++++++++++++++++++++++ youtube_dl/extractor/worldstarhiphop.py | 4 +-- 3 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 youtube_dl/extractor/trutube.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e35287f88..6377f8b4c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -224,6 +224,7 @@ from .tinypic import TinyPicIE from .toutv import TouTvIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .trutube import TruTubeIE from .tube8 import Tube8IE from .tudou import TudouIE from .tumblr import TumblrIE diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py new file mode 100644 index 000000000..37d3af0ca --- /dev/null +++ b/youtube_dl/extractor/trutube.py @@ -0,0 +1,43 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, +) + + +class TruTubeIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?(?Ptrutube\.tv/video/(?P.*/.*))' + _TEST = { + 'url': ('http://www.trutube.tv/video/20814/Ernst-Zundel-met-les-Jui' + 'fs-en-guarde-VOSTFR'), + 'md5': '9973aa3c2870626799d2ac4e36cfc3dc', + 'info_dict': { + u"title": u"TruTube.TV - Spitting in the face of die-versity", + u"ext": u"mp4" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('videoid') + + # Get webpage content + webpage = self._download_webpage(url, video_id) + + # Get the video title + video_title = self._html_search_regex(r'(?P<title>.*)', + webpage, 'title').strip() + + video_url = self._search_regex(r'(http://.*\.(?:mp4|flv))', + webpage, u'video URL') + + ext = video_url[-3:] + + return { + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'ext': ext + } diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index 3237596a3..fc9237a3f 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -22,8 +22,8 @@ class WorldStarHipHopIE(InfoExtractor): webpage_src = self._download_webpage(url, video_id) m_vevo_id = re.search(r'videoId=(.*?)&?', - webpage_src) - + webpage_src) + if m_vevo_id is not None: self.to_screen(u'Vevo video detected:') return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo') From 9767726b6689e25540530b6551296aed43a0808e Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Sat, 22 Feb 2014 16:45:03 +0700 Subject: [PATCH 18/58] [spankwire] Improve and modernize --- youtube_dl/extractor/spankwire.py | 65 +++++++++++++++++++------------ 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 3362b3db8..45703ec25 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import os import re from .common import InfoExtractor @@ -8,23 +7,27 @@ from ..utils import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urllib_parse, + unified_strdate, + str_to_int, + int_or_none, ) -from ..aes import ( - aes_decrypt_text -) +from ..aes import aes_decrypt_text class SpankwireIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?Pspankwire\.com/[^/]*/video(?P[0-9]+)/?)' + _VALID_URL = r'https?://(?:www\.)?(?Pspankwire\.com/[^/]*/video(?P[0-9]+)/?)' _TEST = { 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', - 'file': '103545.mp4', - 'md5': '1b3f55e345500552dbc252a3e9c1af43', + 'md5': '8bbfde12b101204b39e4b9fe7eb67095', 'info_dict': { - "uploader": "oreusz", - "title": "Buckcherry`s X Rated Music Video Crazy Bitch", - "description": "Crazy Bitch X rated music video.", - "age_limit": 18, + 'id': '103545', + 'ext': 'mp4', + 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', + 'description': 'Crazy Bitch X rated music video.', + 'uploader': 'oreusz', + 'uploader_id': '124697', + 'upload_date': '20070508', + 'age_limit': 18, } } @@ -37,13 +40,26 @@ class SpankwireIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - video_title = self._html_search_regex(r'

([^<]+)', webpage, 'title') - video_uploader = self._html_search_regex( - r'by:\s*]*>(.+?)', webpage, 'uploader', fatal=False) - thumbnail = self._html_search_regex( - r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False) + title = self._html_search_regex(r'

([^<]+)', webpage, 'title') description = self._html_search_regex( r'([^<]+)<', webpage, 'description', fatal=False) + thumbnail = self._html_search_regex( + r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False) + + uploader = self._html_search_regex( + r'by:\s*]*>(.+?)', webpage, 'uploader', fatal=False) + uploader_id = self._html_search_regex( + r'by:\s* on (.+?) at \d+:\d+', webpage, 'upload date', fatal=False) + if upload_date: + upload_date = unified_strdate(upload_date) + + view_count = self._html_search_regex( + r'
([^<]+) views
', webpage, 'view count', fatal=False) + if view_count: + view_count = str_to_int(view_count) + comment_count = int_or_none(self._html_search_regex( + r'\s*(\d+) Comments', webpage, 'comment count', fatal=False)) video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage))) if webpage.find('flashvars\.encrypted = "true"') != -1: @@ -53,16 +69,13 @@ class SpankwireIE(InfoExtractor): formats = [] for video_url in video_urls: path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] format = path.split('/')[4].split('_')[:2] resolution, bitrate_str = format format = "-".join(format) - height = int(resolution.rstrip('P')) - tbr = int(bitrate_str.rstrip('K')) - + height = int(resolution.rstrip('Pp')) + tbr = int(bitrate_str.rstrip('Kk')) formats.append({ 'url': video_url, - 'ext': extension, 'resolution': resolution, 'format': format, 'tbr': tbr, @@ -75,10 +88,14 @@ class SpankwireIE(InfoExtractor): return { 'id': video_id, - 'uploader': video_uploader, - 'title': video_title, - 'thumbnail': thumbnail, + 'title': title, 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'view_count': view_count, + 'comment_count': comment_count, 'formats': formats, 'age_limit': age_limit, } From 2e7b4cb714e021f3660e7d2a7ba1e9a575bc2e6f Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Sat, 22 Feb 2014 16:50:08 +0700 Subject: [PATCH 19/58] [spankwire] Fix uploader id regex --- youtube_dl/extractor/spankwire.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 45703ec25..2007a0013 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -49,7 +49,7 @@ class SpankwireIE(InfoExtractor): uploader = self._html_search_regex( r'by:\s*
]*>(.+?)', webpage, 'uploader', fatal=False) uploader_id = self._html_search_regex( - r'by:\s* on (.+?) at \d+:\d+', webpage, 'upload date', fatal=False) if upload_date: upload_date = unified_strdate(upload_date) From 0568c352f331549d2a0b961ba8c7a70e48daed78 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 22 Feb 2014 14:27:09 +0100 Subject: [PATCH 20/58] [canalc2] Modernize --- youtube_dl/extractor/canalc2.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index 3d8d7f9d2..c4fefefe4 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -1,4 +1,6 @@ # coding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -9,11 +11,12 @@ class Canalc2IE(InfoExtractor): _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P\d+)' _TEST = { - u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', - u'file': u'12163.mp4', - u'md5': u'060158428b650f896c542dfbb3d6487f', - u'info_dict': { - u'title': u'Terrasses du Numérique' + 'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + 'md5': '060158428b650f896c542dfbb3d6487f', + 'info_dict': { + 'id': '12163', + 'ext': 'mp4', + 'title': 'Terrasses du Numérique' } } @@ -28,10 +31,11 @@ class Canalc2IE(InfoExtractor): video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name title = self._html_search_regex( - r'class="evenement8">(.*?)', webpage, u'title') - - return {'id': video_id, - 'ext': 'mp4', - 'url': video_url, - 'title': title, - } + r'class="evenement8">(.*?)', webpage, 'title') + + return { + 'id': video_id, + 'ext': 'mp4', + 'url': video_url, + 'title': title, + } From 41d3ec5fba1b64be78f3ea823a604c28833a94cd Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 22 Feb 2014 14:36:04 +0100 Subject: [PATCH 21/58] [savefrom] Add extractor (Fixes #2434) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/savefrom.py | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/savefrom.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e35287f88..229bdc595 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -186,6 +186,7 @@ from .rutube import ( RutubeMovieIE, RutubePersonIE, ) +from .savefrom import SaveFromIE from .servingsys import ServingSysIE from .sina import SinaIE from .slashdot import SlashdotIE diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py new file mode 100644 index 000000000..198a08c1c --- /dev/null +++ b/youtube_dl/extractor/savefrom.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os.path +import re + +from .common import InfoExtractor + + +class SaveFromIE(InfoExtractor): + IE_NAME = 'savefrom.net' + _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P.*)$' + + _TEST = { + 'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com', + 'info_dict': { + 'id': 'UlVRAPW2WJY', + 'ext': 'mp4', + 'title': 'About Team Radical MMA | MMA Fighting', + 'upload_date': '20120816', + 'uploader': 'Howcast', + 'uploader_id': 'Howcast', + 'description': 'md5:4f0aac94361a12e1ce57d74f85265175', + }, + 'params': { + 'skip_download': True + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = os.path.splitext(url.split('/')[-1])[0] + return { + '_type': 'url', + 'id': video_id, + 'url': mobj.group('url'), + } From 491ed3dda2435992ae27dcefae22d3416d9aff12 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 22 Feb 2014 15:05:30 +0100 Subject: [PATCH 22/58] [trutube] Support multiple formats (#2433) --- test/test_download.py | 20 +++++++++++---- youtube_dl/extractor/trutube.py | 44 ++++++++++++++++++--------------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 7587a18aa..ff571c48f 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -18,6 +18,7 @@ from test.helper import ( import hashlib import io import json +import re import socket import youtube_dl.YoutubeDL @@ -137,12 +138,21 @@ def generator(test_case): with io.open(info_json_fn, encoding='utf-8') as infof: info_dict = json.load(infof) for (info_field, expected) in tc.get('info_dict', {}).items(): - if isinstance(expected, compat_str) and expected.startswith('md5:'): - got = 'md5:' + md5(info_dict.get(info_field)) - else: + if isinstance(expected, compat_str) and expected.startswith('re:'): got = info_dict.get(info_field) - self.assertEqual(expected, got, - u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) + match_str = expected[len('re:'):] + match_rex = re.compile(match_str) + + self.assertTrue( + isinstance(got, compat_str) and match_rex.match(got), + u'field %s (value: %r) should match %r' % (info_field, got, match_str)) + else: + if isinstance(expected, compat_str) and expected.startswith('md5:'): + got = 'md5:' + md5(info_dict.get(info_field)) + else: + got = info_dict.get(info_field) + self.assertEqual(expected, got, + u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) # If checkable fields are missing from the test case, print the info_dict test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py index 37d3af0ca..1166836de 100644 --- a/youtube_dl/extractor/trutube.py +++ b/youtube_dl/extractor/trutube.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -7,37 +9,39 @@ from ..utils import ( class TruTubeIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:www\.)?(?Ptrutube\.tv/video/(?P.*/.*))' + _VALID_URL = r'https?://(?:www\.)?trutube\.tv/video/(?P[0-9]+)/.*' _TEST = { - 'url': ('http://www.trutube.tv/video/20814/Ernst-Zundel-met-les-Jui' - 'fs-en-guarde-VOSTFR'), - 'md5': '9973aa3c2870626799d2ac4e36cfc3dc', + 'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-', + 'md5': 'c5b6e301b0a2040b074746cbeaa26ca1', 'info_dict': { - u"title": u"TruTube.TV - Spitting in the face of die-versity", - u"ext": u"mp4" + 'id': '14880', + 'ext': 'flv', + 'title': 'Ramses II - Proven To Be A Red Headed Caucasoid', + 'thumbnail': 're:^http:.*\.jpg$', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') - video_id = mobj.group('videoid') - - # Get webpage content webpage = self._download_webpage(url, video_id) + video_title = self._og_search_title(webpage).strip() + thumbnail = self._search_regex( + r"var splash_img = '([^']+)';", webpage, 'thumbnail', fatal=False) - # Get the video title - video_title = self._html_search_regex(r'(?P<title>.*)', - webpage, 'title').strip() - - video_url = self._search_regex(r'(http://.*\.(?:mp4|flv))', - webpage, u'video URL') - - ext = video_url[-3:] + all_formats = re.finditer( + r"var (?P[a-z]+)_video_file\s*=\s*'(?P[^']+)';", webpage) + formats = [{ + 'format_id': m.group('key'), + 'quality': -i, + 'url': m.group('url'), + } for i, m in enumerate(all_formats)] + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, 'title': video_title, - 'ext': ext - } + 'formats': formats, + 'thumbnail': thumbnail, + } From 0f8f097183a149144eb3c3c9b4e598c0842edd7e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 22 Feb 2014 15:06:07 +0100 Subject: [PATCH 23/58] [release.sh] Do not run tests by default We are at the point that testing takes waay too long for a release cycle, and fails way too often. Tests through travis are a better indicator than testing just before release. --- devscripts/release.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/devscripts/release.sh b/devscripts/release.sh index 323acf8cf..72e708c7f 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -14,9 +14,9 @@ set -e -skip_tests=false -if [ "$1" = '--skip-test' ]; then - skip_tests=true +skip_tests=true +if [ "$1" = '--run-tests' ]; then + skip_tests=false shift fi From 5e0b6523440e785e044df3295712c051382e5515 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 22 Feb 2014 15:07:25 +0100 Subject: [PATCH 24/58] release 2014.02.22 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a92faa5a7..290d54f05 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.02.21.1' +__version__ = '2014.02.22' From 06aabfc422182551481d5cbb09d44dcf0fd25f15 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 22 Feb 2014 20:17:26 +0100 Subject: [PATCH 25/58] [br] Simplify --- youtube_dl/extractor/br.py | 123 ++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 63 deletions(-) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 41e16f6cb..2256f47bb 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -1,82 +1,79 @@ # coding: utf-8 +from __future__ import unicode_literals + +import re from .common import InfoExtractor + class BRIE(InfoExtractor): + IE_DESC = "Bayerischer Rundfunk Mediathek" + _VALID_URL = r"^https?://(?:www\.)?br\.de/mediathek/video/(?:sendungen/)?(?P[a-z0-9\-]+)\.html$" + _BASE_URL = "http://www.br.de" - IE_DESC = u"Bayerischer Rundfunk Mediathek" - _VALID_URL = r"^https?://(?:www\.)?br\.de/mediathek/video/(?:sendungen/)?(?:[a-z0-9\-]+\.html)$" - _BASE_URL = u"http://www.br.de" - - _TESTS = [ - { - u"url": u"http://www.br.de/mediathek/video/anselm-gruen-114.html", - u"file": u"2c8d81c5-6fb7-4a74-88d4-e768e5856532.mp4", - u"md5": u"c4f83cf0f023ba5875aba0bf46860df2", - u"info_dict": { - u"title": u"Feiern und Verzichten", - u"description": u"Anselm Grün: Feiern und Verzichten", - u"uploader": u"BR/Birgit Baier", - u"upload_date": u"20140301" - } + _TEST = { + "url": "http://www.br.de/mediathek/video/anselm-gruen-114.html", + "md5": "c4f83cf0f023ba5875aba0bf46860df2", + "info_dict": { + "id": "2c8d81c5-6fb7-4a74-88d4-e768e5856532", + "ext": "mp4", + "title": "Feiern und Verzichten", + "description": "Anselm Grün: Feiern und Verzichten", + "uploader": "BR/Birgit Baier", + "upload_date": "20140301" } - ] + } def _real_extract(self, url): - page = self._download_webpage(url, None) - xml_url = self._search_regex(r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/mediathek/video/[a-z0-9/~_.-]+)'}\)\);", page, "XMLURL") + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + page = self._download_webpage(url, display_id) + xml_url = self._search_regex( + r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/mediathek/video/[a-z0-9/~_.-]+)'}\)\);", page, "XMLURL") xml = self._download_xml(self._BASE_URL + xml_url, None) - videos = [] - for xml_video in xml.findall("video"): - video = {} - video["id"] = xml_video.get("externalId") - video["title"] = xml_video.find("title").text - video["formats"] = self._extract_formats(xml_video.find("assets")) - video["thumbnails"] = self._extract_thumbnails(xml_video.find("teaserImage/variants")) - video["thumbnail"] = video["thumbnails"][0]["url"] - video["description"] = " ".join(xml_video.find("shareTitle").text.splitlines()) - video["uploader"] = xml_video.find("author").text - video["upload_date"] = "".join(reversed(xml_video.find("broadcastDate").text.split("."))) - video["webpage_url"] = xml_video.find("permalink").text - videos.append(video) + videos = [{ + "id": xml_video.get("externalId"), + "title": xml_video.find("title").text, + "formats": self._extract_formats(xml_video.find("assets")), + "thumbnails": self._extract_thumbnails(xml_video.find("teaserImage/variants")), + "description": " ".join(xml_video.find("shareTitle").text.splitlines()), + "uploader": xml_video.find("author").text, + "upload_date": "".join(reversed(xml_video.find("broadcastDate").text.split("."))), + "webpage_url": xml_video.find("permalink").text, + } for xml_video in xml.findall("video")] if len(videos) > 1: - self._downloader.report_warning(u'found multiple videos; please' - u'report this with the video URL to http://yt-dl.org/bug') + self._downloader.report_warning( + 'found multiple videos; please ' + 'report this with the video URL to http://yt-dl.org/bug') + if not videos: + raise ExtractorError('No video entries found') return videos[0] def _extract_formats(self, assets): - vformats = [] - for asset in assets.findall("asset"): - if asset.find("downloadUrl") is None: - continue - vformat = {} - vformat["url"] = asset.find("downloadUrl").text - vformat["ext"] = asset.find("mediaType").text - vformat["format_id"] = asset.get("type") - vformat["width"] = int(asset.find("frameWidth").text) - vformat["height"] = int(asset.find("frameHeight").text) - vformat["resolution"] = "%ix%i" % (vformat["width"], vformat["height"]) - vformat["tbr"] = int(asset.find("bitrateVideo").text) - vformat["abr"] = int(asset.find("bitrateAudio").text) - vformat["vcodec"] = asset.find("codecVideo").text - vformat["container"] = vformat["ext"] - vformat["filesize"] = int(asset.find("size").text) - vformat["preference"] = vformat["quality"] = -1 - vformat["format"] = "%s container with %i Kbps %s" % (vformat["container"], vformat["tbr"], vformat["vcodec"]) - vformats.append(vformat) - self._sort_formats(vformats) - return vformats + formats = [{ + "url": asset.find("downloadUrl").text, + "ext": asset.find("mediaType").text, + "format_id": asset.get("type"), + "width": int(asset.find("frameWidth").text), + "height": int(asset.find("frameHeight").text), + "tbr": int(asset.find("bitrateVideo").text), + "abr": int(asset.find("bitrateAudio").text), + "vcodec": asset.find("codecVideo").text, + "container": asset.find("mediaType").text, + "filesize": int(asset.find("size").text), + } for asset in assets.findall("asset") + if asset.find("downloadUrl") is not None] + + self._sort_formats(formats) + return formats def _extract_thumbnails(self, variants): - thumbnails = [] - for variant in variants.findall("variant"): - thumbnail = {} - thumbnail["url"] = self._BASE_URL + variant.find("url").text - thumbnail["width"] = int(variant.find("width").text) - thumbnail["height"] = int(variant.find("height").text) - thumbnail["resolution"] = "%ix%i" % (thumbnail["width"], thumbnail["height"]) - thumbnails.append(thumbnail) - thumbnails.sort(key = lambda x: x["width"] * x["height"], reverse=True) + thumbnails = [{ + "url": self._BASE_URL + variant.find("url").text, + "width": int(variant.find("width").text), + "height": int(variant.find("height").text), + } for variant in variants.findall("variant")] + thumbnails.sort(key=lambda x: x["width"] * x["height"], reverse=True) return thumbnails From f38da667317579672d909dbb604ab003172af9e0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 22 Feb 2014 20:19:41 +0100 Subject: [PATCH 26/58] Credit @soult for br --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 70608066c..84f29a1a5 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -46,6 +46,7 @@ __authors__ = ( 'Andreas Schmitz', 'Michael Kaiser', 'Niklas Laxström', + 'David Triendl', ) __license__ = 'Public Domain' From 3e123c1e2887cb144f1b73cabb78ab34c43d10bc Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Sun, 23 Feb 2014 04:50:05 +0700 Subject: [PATCH 27/58] [videobam] Add support for videobam.com (Closes #2411) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/videobam.py | 79 ++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 youtube_dl/extractor/videobam.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3badcc238..6dccd5ae7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -241,6 +241,7 @@ from .vesti import VestiIE from .vevo import VevoIE from .vice import ViceIE from .viddler import ViddlerIE +from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE diff --git a/youtube_dl/extractor/videobam.py b/youtube_dl/extractor/videobam.py new file mode 100644 index 000000000..3366d982c --- /dev/null +++ b/youtube_dl/extractor/videobam.py @@ -0,0 +1,79 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import int_or_none + + +class VideoBamIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?videobam\.com/(?:videos/download/)?(?P[a-zA-Z]+)' + + _TESTS = [ + { + 'url': 'http://videobam.com/OiJQM', + 'md5': 'db471f27763a531f10416a0c58b5a1e0', + 'info_dict': { + 'id': 'OiJQM', + 'ext': 'mp4', + 'title': 'Is Alcohol Worse Than Ecstasy?', + 'description': 'md5:d25b96151515c91debc42bfbb3eb2683', + 'uploader': 'frihetsvinge', + }, + }, + { + 'url': 'http://videobam.com/pqLvq', + 'md5': 'd9a565b5379a99126ef94e1d7f9a383e', + 'note': 'HD video', + 'info_dict': { + 'id': 'pqLvq', + 'ext': 'mp4', + } + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + page = self._download_webpage('http://videobam.com/%s' % video_id, video_id, 'Downloading page') + + formats = [] + + for preference, format_id in enumerate(['low', 'high']): + mobj = re.search(r"%s: '(?P[^']+)'" % format_id, page) + if not mobj: + continue + formats.append({ + 'url': mobj.group('url'), + 'ext': 'mp4', + 'format_id': format_id, + 'preference': preference, + }) + + if not formats: + player_config = json.loads(self._html_search_regex(r'var player_config = ({.+?});', page, 'player config')) + formats = [{ + 'url': item['url'], + 'ext': 'mp4', + } for item in player_config['playlist'] if 'autoPlay' in item] + + self._sort_formats(formats) + + title = self._og_search_title(page, default='VideoBam', fatal=False) + description = self._og_search_description(page, default=None) + thumbnail = self._og_search_thumbnail(page) + uploader = self._html_search_regex(r'Upload by ([^<]+)', page, 'uploader', fatal=False, default=None) + view_count = int_or_none( + self._html_search_regex(r'Views: (\d+) ', page, 'view count', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'view_count': view_count, + 'formats': formats, + } \ No newline at end of file From 1df96e59cec2dfaced6b1fedc0db1efb39094f2b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 22 Feb 2014 23:03:00 +0100 Subject: [PATCH 28/58] [f4m] Clean up --- youtube_dl/downloader/f4m.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 2a870a758..4e6abfe10 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -12,7 +12,6 @@ from .http import HttpFD from ..utils import ( struct_pack, struct_unpack, - compat_urllib_request, compat_urlparse, format_bytes, encodeFilename, @@ -117,8 +116,8 @@ class FlvReader(io.BytesIO): self.read_unsigned_char() # flags self.read(3) - # BootstrapinfoVersion - bootstrap_info_version = self.read_unsigned_int() + + self.read_unsigned_int() # BootstrapinfoVersion # Profile,Live,Update,Reserved self.read(1) # time scale @@ -127,15 +126,15 @@ class FlvReader(io.BytesIO): self.read_unsigned_long_long() # SmpteTimeCodeOffset self.read_unsigned_long_long() - # MovieIdentifier - movie_identifier = self.read_string() + + self.read_string() # MovieIdentifier server_count = self.read_unsigned_char() # ServerEntryTable for i in range(server_count): self.read_string() quality_count = self.read_unsigned_char() # QualityEntryTable - for i in range(server_count): + for i in range(quality_count): self.read_string() # DrmData self.read_string() From 521ee823343544806cf7db95c32abf337aee8aab Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 22 Feb 2014 23:03:12 +0100 Subject: [PATCH 29/58] Fix imports --- youtube_dl/extractor/br.py | 1 + youtube_dl/extractor/trutube.py | 3 --- youtube_dl/extractor/youtube.py | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 2256f47bb..5fcc1084a 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ExtractorError class BRIE(InfoExtractor): diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py index 1166836de..57f956683 100644 --- a/youtube_dl/extractor/trutube.py +++ b/youtube_dl/extractor/trutube.py @@ -3,9 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class TruTubeIE(InfoExtractor): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e1ef90e38..cf0bd4ae3 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -29,7 +29,6 @@ from ..utils import ( ExtractorError, int_or_none, PagedList, - RegexNotFoundError, unescapeHTML, unified_strdate, orderedSet, From 23c2baadb3e48e46d263e1e9150e173ea92baa6a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 22 Feb 2014 23:15:41 +0100 Subject: [PATCH 30/58] [videobam] Set age_limit to 18 From [their ToS](http://videobam.com/terms): "User must be eighteen 18[sic] years of age or older to use or access this web site." --- youtube_dl/extractor/videobam.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/videobam.py b/youtube_dl/extractor/videobam.py index 3366d982c..cdfff05ae 100644 --- a/youtube_dl/extractor/videobam.py +++ b/youtube_dl/extractor/videobam.py @@ -76,4 +76,5 @@ class VideoBamIE(InfoExtractor): 'uploader': uploader, 'view_count': view_count, 'formats': formats, + 'age_limit': 18, } \ No newline at end of file From 2fcc873c4cdfda024ee5fec2be38512e22cb1963 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 22 Feb 2014 23:17:56 +0100 Subject: [PATCH 31/58] release 2014.02.22.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 290d54f05..b166da7c7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.02.22' +__version__ = '2014.02.22.1' From 78b373975d21f9573c1b32c53aa3c05ab4a490f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 23 Feb 2014 12:08:30 +0100 Subject: [PATCH 32/58] [vine] Fix uploader extraction --- youtube_dl/extractor/vine.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index e14ff91d4..a919129a6 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -30,8 +30,9 @@ class VineIE(InfoExtractor): video_url = self._html_search_meta('twitter:player:stream', webpage, 'video URL') - uploader = self._html_search_regex(r'

(.*?)

', - webpage, 'uploader', fatal=False, flags=re.DOTALL) + twitter_title = self._html_search_meta('twitter:title', webpage, + 'twitter title') + uploader = re.sub('\'s post on Vine', '', twitter_title) return { 'id': video_id, From a25f2f990ab7a02902059daeea42d492f9d4f205 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 23 Feb 2014 12:20:58 +0100 Subject: [PATCH 33/58] [breakcom] Fix info json extraction --- youtube_dl/extractor/breakcom.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 8ec6dda49..66b9190a9 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -23,8 +23,8 @@ class BreakIE(InfoExtractor): video_id = mobj.group(1).split("-")[-1] embed_url = 'http://www.break.com/embed/%s' % video_id webpage = self._download_webpage(embed_url, video_id) - info_json = self._search_regex(r'var embedVars = ({.*?});', webpage, - 'info json', flags=re.DOTALL) + info_json = self._search_regex(r'var embedVars = ({.*})\s*?', + 'webpage', 'info json', flags=re.DOTALL) info = json.loads(info_json) video_url = info['videoUri'] m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) From 9b77f951c76cc4db41266100a4ed1ca1a2acdd4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 23 Feb 2014 12:28:44 +0100 Subject: [PATCH 34/58] [breakcom] Fix error when calling _search_regex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I passed `’webpage’` instead of the variable `webpage`. --- youtube_dl/extractor/breakcom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 66b9190a9..85635d1cc 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -24,7 +24,7 @@ class BreakIE(InfoExtractor): embed_url = 'http://www.break.com/embed/%s' % video_id webpage = self._download_webpage(embed_url, video_id) info_json = self._search_regex(r'var embedVars = ({.*})\s*?', - 'webpage', 'info json', flags=re.DOTALL) + webpage, 'info json', flags=re.DOTALL) info = json.loads(info_json) video_url = info['videoUri'] m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) From 7ff5d5c2e23cb1329d0505eb7f2bcc850f8d3047 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Sun, 23 Feb 2014 19:00:51 +0700 Subject: [PATCH 35/58] Add one more format to unified_strdate --- youtube_dl/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 471516b8f..25e40a837 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -779,6 +779,7 @@ def unified_strdate(date_str): '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f0Z', '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M', ] for expression in format_expressions: From f919201ecccc6ac208b86d281439eb669d373765 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Sun, 23 Feb 2014 19:01:15 +0700 Subject: [PATCH 36/58] [vine] Extract more metadata and support low format --- youtube_dl/extractor/vine.py | 46 ++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index a919129a6..5bbc8ba88 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -1,8 +1,10 @@ from __future__ import unicode_literals import re +import json from .common import InfoExtractor +from ..utils import unified_strdate class VineIE(InfoExtractor): @@ -13,32 +15,46 @@ class VineIE(InfoExtractor): 'info_dict': { 'id': 'b9KOOWX7HUx', 'ext': 'mp4', - 'uploader': 'Jack Dorsey', 'title': 'Chicken.', + 'description': 'Chicken.', + 'upload_date': '20130519', + 'uploader': 'Jack Dorsey', + 'uploader_id': '76', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage_url = 'https://vine.co/v/' + video_id - webpage = self._download_webpage(webpage_url, video_id) - self.report_extraction(video_id) + webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id) - video_url = self._html_search_meta('twitter:player:stream', webpage, - 'video URL') + data = json.loads(self._html_search_regex( + r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data')) - twitter_title = self._html_search_meta('twitter:title', webpage, - 'twitter title') - uploader = re.sub('\'s post on Vine', '', twitter_title) + formats = [ + { + 'url': data['videoLowURL'], + 'ext': 'mp4', + 'format_id': 'low', + }, + { + 'url': data['videoUrl'], + 'ext': 'mp4', + 'format_id': 'standard', + } + ] return { 'id': video_id, - 'url': video_url, - 'ext': 'mp4', 'title': self._og_search_title(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': uploader, - } + 'description': data['description'], + 'thumbnail': data['thumbnailUrl'], + 'upload_date': unified_strdate(data['created']), + 'uploader': data['username'], + 'uploader_id': data['userIdStr'], + 'like_count': data['likes']['count'], + 'comment_count': data['comments']['count'], + 'repost_count': data['reposts']['count'], + 'formats': formats, + } \ No newline at end of file From bc2f773b4f738c7fdaac1786d4796c69c5b57295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 23 Feb 2014 17:17:36 +0100 Subject: [PATCH 37/58] [youtube:playlist] Fix mixes extraction (fixes #2444) --- youtube_dl/extractor/youtube.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index cf0bd4ae3..f868b1929 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1488,11 +1488,16 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix') - title_span = (get_element_by_attribute('class', 'title long-title', webpage) or - get_element_by_attribute('class', 'title ', webpage)) + search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) + title_span = (search_title('playlist-title') or + search_title('title long-title') or search_title('title')) title = clean_html(title_span) - video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id) - ids = orderedSet(re.findall(video_re, webpage)) + video_re = r'''(?x)data-index="\d+".*? + data-video-username="(.*?)".*? + href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id) + matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL)) + # Some of the videos may have beend deleted, their username field is empty + ids = [video_id for (username, video_id) in matches if username] url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, title) From ffe8fe356a14e38ab8b5e8329fce015d93b93bc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 23 Feb 2014 18:06:51 +0100 Subject: [PATCH 38/58] [normalboots] Fix video url extraction --- youtube_dl/extractor/normalboots.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py index 81b7855b0..1dce5430f 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/youtube_dl/extractor/normalboots.py @@ -49,7 +49,7 @@ class NormalbootsIE(InfoExtractor): player_url = self._html_search_regex(r'[\S]+)"', webpage, 'url') player_page = self._download_webpage(player_url, video_id) - video_url = u'http://player.screenwavemedia.com/' + self._html_search_regex(r"'file':\s'(?P[0-9A-Za-z-_\.]+)'", player_page, 'file') + video_url = self._html_search_regex(r"file:\s'(?P[^']+\.mp4)'", player_page, 'file') info['url'] = video_url info['title'] = video_title From 92661c994b4f01bb39204d05652f57e7f6491579 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 23 Feb 2014 18:28:22 +0100 Subject: [PATCH 39/58] [normalboots] Modernize and simplify --- youtube_dl/extractor/normalboots.py | 66 ++++++++++++----------------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py index 1dce5430f..25e71a56e 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/youtube_dl/extractor/normalboots.py @@ -1,61 +1,51 @@ +# encoding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor from ..utils import ( - ExtractorError, unified_strdate, ) + class NormalbootsIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?normalboots\.com/video/(?P[0-9a-z-]*)/?$' + _VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P[0-9a-z-]*)/?$' _TEST = { - u'url': u'http://normalboots.com/video/home-alone-games-jontron/', - u'file': u'home-alone-games-jontron.mp4', - u'md5': u'8bf6de238915dd501105b44ef5f1e0f6', - u'info_dict': { - u'title': u'Home Alone Games - JonTron - NormalBoots', - u'description': u'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for \u2018Tense Battle Theme\u2019:\xa0http://www.youtube.com/Kiamet/', - u'uploader': u'JonTron', - u'upload_date': u'20140125', + 'url': 'http://normalboots.com/video/home-alone-games-jontron/', + 'md5': '8bf6de238915dd501105b44ef5f1e0f6', + 'info_dict': { + 'id': 'home-alone-games-jontron', + 'ext': 'mp4', + 'title': 'Home Alone Games - JonTron - NormalBoots', + 'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/', + 'uploader': 'JonTron', + 'upload_date': '20140125', } } - + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('videoid') - - info = { - 'id': video_id, - 'uploader': None, - 'upload_date': None, - } - - if url[:4] != 'http': - url = 'http://' + url - + webpage = self._download_webpage(url, video_id) - video_title = self._og_search_title(webpage) - video_description = self._og_search_description(webpage) - video_thumbnail = self._og_search_thumbnail(webpage) video_uploader = self._html_search_regex(r'Posted\sby\s(?P[A-Za-z]*)\s', webpage, 'uploader') - raw_upload_date = self._html_search_regex('[A-Za-z]+, (?P.*)', + raw_upload_date = self._html_search_regex('[A-Za-z]+, (?P.*)', webpage, 'date') video_upload_date = unified_strdate(raw_upload_date) - video_upload_date = unified_strdate(raw_upload_date) - + player_url = self._html_search_regex(r'[\S]+)"', webpage, 'url') player_page = self._download_webpage(player_url, video_id) video_url = self._html_search_regex(r"file:\s'(?P[^']+\.mp4)'", player_page, 'file') - - info['url'] = video_url - info['title'] = video_title - info['description'] = video_description - info['thumbnail'] = video_thumbnail - info['uploader'] = video_uploader - info['upload_date'] = video_upload_date - - return info + + return { + 'id': video_id, + 'url': video_url, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader': video_uploader, + 'upload_date': video_upload_date, + } From a2dafe2887acc745b70eff811a2932098048cd64 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Mon, 24 Feb 2014 12:51:06 +0700 Subject: [PATCH 40/58] [youtube] Fix mix video regex Attributes' order in
  • is arbitrary and changes every time playlist page is fetched, so we can't rely on `data-index` to be before `data-video-username`. --- youtube_dl/extractor/youtube.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f868b1929..f65052a89 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1492,11 +1492,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): title_span = (search_title('playlist-title') or search_title('title long-title') or search_title('title')) title = clean_html(title_span) - video_re = r'''(?x)data-index="\d+".*? - data-video-username="(.*?)".*? + video_re = r'''(?x)data-video-username="(.*?)".*? href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id) matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL)) - # Some of the videos may have beend deleted, their username field is empty + # Some of the videos may have been deleted, their username field is empty ids = [video_id for (username, video_id) in matches if username] url_results = self._ids_to_results(ids) From 919052d0947a2b73ca5ac40c05a450c8a965688a Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Mon, 24 Feb 2014 13:47:47 +0700 Subject: [PATCH 41/58] [zdf] Fix podcast extraction and use unicode literals (Closes #2446) --- youtube_dl/extractor/zdf.py | 70 ++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 829f002cf..3b1ac4e9f 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,4 +1,5 @@ # coding: utf-8 +from __future__ import unicode_literals import re @@ -13,52 +14,42 @@ class ZDFIE(InfoExtractor): _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P#)?/(.*beitrag/(?:video/)?)(?P[0-9]+)(?:/[^/?]+)?(?:\?.*)?' _TEST = { - u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt", - u"file": u"2037704.webm", - u"info_dict": { - u"upload_date": u"20131127", - u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".", - u"uploader": u"spezial", - u"title": u"ZDFspezial - Ende des Machtpokers" + 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt', + 'info_dict': { + 'id': '2037704', + 'ext': 'webm', + 'title': 'ZDFspezial - Ende des Machtpokers', + 'description': 'Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial "Ende des Machtpokers - Große Koalition für Deutschland".', + 'duration': 1022, + 'uploader': 'spezial', + 'uploader_id': '225948', + 'upload_date': '20131127', }, - u"skip": u"Videos on ZDF.de are depublicised in short order", + 'skip': 'Videos on ZDF.de are depublicised in short order', } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') - xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id doc = self._download_xml( xml_url, video_id, - note=u'Downloading video info', - errnote=u'Failed to download video info') + note='Downloading video info', + errnote='Failed to download video info') title = doc.find('.//information/title').text description = doc.find('.//information/detail').text + duration = int(doc.find('.//details/lengthSec').text) uploader_node = doc.find('.//details/originChannelTitle') uploader = None if uploader_node is None else uploader_node.text - duration_str = doc.find('.//details/length').text - duration_m = re.match(r'''(?x)^ - (?P[0-9]{2}) - :(?P[0-9]{2}) - :(?P[0-9]{2}) - (?:\.(?P[0-9]+)?) - ''', duration_str) - duration = ( - ( - (int(duration_m.group('hours')) * 60 * 60) + - (int(duration_m.group('minutes')) * 60) + - int(duration_m.group('seconds')) - ) - if duration_m - else None - ) + uploader_id_node = doc.find('.//details/originChannelId') + uploader_id = None if uploader_id_node is None else uploader_id_node.text upload_date = unified_strdate(doc.find('.//details/airtime').text) def xml_to_format(fnode): video_url = fnode.find('url').text - is_available = u'http://www.metafilegenerator' not in video_url + is_available = 'http://www.metafilegenerator' not in video_url format_id = fnode.attrib['basetype'] format_m = re.match(r'''(?x) @@ -71,22 +62,28 @@ class ZDFIE(InfoExtractor): quality = fnode.find('./quality').text abr = int(fnode.find('./audioBitrate').text) // 1000 - vbr = int(fnode.find('./videoBitrate').text) // 1000 + vbr_node = fnode.find('./videoBitrate') + vbr = None if vbr_node is None else int(vbr_node.text) // 1000 - format_note = u'' + width_node = fnode.find('./width') + width = None if width_node is None else int_or_none(width_node.text) + height_node = fnode.find('./height') + height = None if height_node is None else int_or_none(height_node.text) + + format_note = '' if not format_note: format_note = None return { - 'format_id': format_id + u'-' + quality, + 'format_id': format_id + '-' + quality, 'url': video_url, 'ext': ext, 'acodec': format_m.group('acodec'), 'vcodec': format_m.group('vcodec'), 'abr': abr, 'vbr': vbr, - 'width': int_or_none(fnode.find('./width').text), - 'height': int_or_none(fnode.find('./height').text), + 'width': width, + 'height': height, 'filesize': int_or_none(fnode.find('./filesize').text), 'format_note': format_note, 'protocol': proto, @@ -103,9 +100,10 @@ class ZDFIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'formats': formats, 'description': description, - 'uploader': uploader, 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, 'upload_date': upload_date, - } + 'formats': formats, + } \ No newline at end of file From 0bf5cf9886bf3ba5ff52b782be306aec056617b3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 24 Feb 2014 09:44:22 +0100 Subject: [PATCH 42/58] release 2014.02.24 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b166da7c7..0d6ddc194 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.02.22.1' +__version__ = '2014.02.24' From bc3be21d59e03f11f2b839dc5d5cfbb3352eff45 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 24 Feb 2014 09:53:48 +0100 Subject: [PATCH 43/58] [iprima] Clean up a little bit --- youtube_dl/extractor/iprima.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index dde482998..bf5e44d88 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -10,7 +10,7 @@ from ..utils import compat_urllib_request class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://play\.iprima\.cz/(?P.+)/(?P.+)' + _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P[^?#]+)' _TESTS = [{ 'url': 'http://play.iprima.cz/particka/particka-92', @@ -29,7 +29,7 @@ class IPrimaIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('videoid') + video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) @@ -44,18 +44,19 @@ class IPrimaIE(InfoExtractor): base_url = ''.join(re.findall(r"embed\['stream'\] = '(.+?)'.+'(\?auth=)'.+'(.+?)';", playerpage)[1]) zoneGEO = self._html_search_regex(r'"zoneGEO":(.+?),', webpage, 'zoneGEO') - if zoneGEO != '0': - base_url = base_url.replace('token', 'token_'+zoneGEO) + base_url = base_url.replace('token', 'token_' + zoneGEO) formats = [] for format_id in ['lq', 'hq', 'hd']: - filename = self._html_search_regex(r'"%s_id":(.+?),' % format_id, webpage, 'filename') + filename = self._html_search_regex( + r'"%s_id":(.+?),' % format_id, webpage, 'filename') if filename == 'null': continue - real_id = self._search_regex(r'Prima-[0-9]{10}-([0-9]+)_', filename, 'real video id') + real_id = self._search_regex( + r'Prima-[0-9]{10}-([0-9]+)_', filename, 'real video id') if format_id == 'lq': quality = 0 @@ -63,13 +64,13 @@ class IPrimaIE(InfoExtractor): quality = 1 elif format_id == 'hd': quality = 2 - filename = 'hq/'+filename + filename = 'hq/' + filename formats.append({ 'format_id': format_id, 'url': base_url, 'quality': quality, - 'play_path': 'mp4:'+filename.replace('"', '')[:-4], + 'play_path': 'mp4:' + filename.replace('"', '')[:-4], 'rtmp_live': True, 'ext': 'flv', }) From 973f2532f58b030b8d9b220d184efd2e91db6796 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 24 Feb 2014 10:12:36 +0100 Subject: [PATCH 44/58] [iprima] Add support for -WEB URLs (Closes #2449) --- youtube_dl/extractor/iprima.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index bf5e44d88..22820e51b 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -22,10 +22,21 @@ class IPrimaIE(InfoExtractor): 'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg', }, 'params': { - 'skip_download': True, + 'skip_download': True, # requires rtmpdump }, - }, - ] + }, { + 'url': 'http://play.iprima.cz/particka/tchibo-particka-jarni-moda', + 'info_dict': { + 'id': '9718337', + 'ext': 'flv', + 'title': 'Tchibo Partička - Jarní móda', + 'description': 'md5:589f8f59f414220621ff8882eb3ce7be', + 'thumbnail': 're:^http:.*\.jpg$', + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -33,9 +44,9 @@ class IPrimaIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - player_url = 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' % ( - floor(random()*1073741824), - floor(random()*1073741824)) + player_url = ( + 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' % + (floor(random()*1073741824), floor(random()*1073741824)) req = compat_urllib_request.Request(player_url) req.add_header('Referer', url) @@ -56,7 +67,8 @@ class IPrimaIE(InfoExtractor): continue real_id = self._search_regex( - r'Prima-[0-9]{10}-([0-9]+)_', filename, 'real video id') + r'Prima-(?:[0-9]{10}|WEB)-([0-9]+)[-_]', + filename, 'real video id') if format_id == 'lq': quality = 0 From 3cfe79147359c2e9ecbb20caf1e77655dfcf75d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 24 Feb 2014 13:50:53 +0100 Subject: [PATCH 45/58] [iprima] Add missing `)` --- youtube_dl/extractor/iprima.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 22820e51b..7956e7624 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -47,6 +47,7 @@ class IPrimaIE(InfoExtractor): player_url = ( 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' % (floor(random()*1073741824), floor(random()*1073741824)) + ) req = compat_urllib_request.Request(player_url) req.add_header('Referer', url) From cd7ee7aa444ac85ddf6ca29645195f331d96d139 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 24 Feb 2014 14:00:31 +0100 Subject: [PATCH 46/58] [nbc] Modernize --- youtube_dl/extractor/nbc.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index e8bbfff7b..ff750de3f 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -8,12 +10,13 @@ class NBCNewsIE(InfoExtractor): _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P\d+)' _TEST = { - u'url': u'http://www.nbcnews.com/video/nbc-news/52753292', - u'file': u'52753292.flv', - u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179', - u'info_dict': { - u'title': u'Crew emerges after four-month Mars food study', - u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1', + 'url': 'http://www.nbcnews.com/video/nbc-news/52753292', + 'md5': '47abaac93c6eaf9ad37ee6c4463a5179', + 'info_dict': { + 'id': '52753292', + 'ext': 'flv', + 'title': 'Crew emerges after four-month Mars food study', + 'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1', }, } @@ -23,10 +26,11 @@ class NBCNewsIE(InfoExtractor): all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) info = all_info.find('video') - return {'id': video_id, - 'title': info.find('headline').text, - 'ext': 'flv', - 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, - 'description': compat_str(info.find('caption').text), - 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, - } + return { + 'id': video_id, + 'title': info.find('headline').text, + 'ext': 'flv', + 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, + 'description': compat_str(info.find('caption').text), + 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, + } From 9e57ce716f03daa8ae27979af757819078595d6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 24 Feb 2014 14:18:12 +0100 Subject: [PATCH 47/58] [academicearth] Fix extraction The courses seems to be no longer available, changed the test to a playlist. --- test/test_playlists.py | 10 +++++----- youtube_dl/extractor/academicearth.py | 9 +++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index 25bec9f1c..07c85b322 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -170,12 +170,12 @@ class TestPlaylists(unittest.TestCase): def test_AcademicEarthCourse(self): dl = FakeYDL() ie = AcademicEarthCourseIE(dl) - result = ie.extract('http://academicearth.org/courses/building-dynamic-websites/') + result = ie.extract('http://academicearth.org/playlists/laws-of-nature/') self.assertIsPlaylist(result) - self.assertEqual(result['id'], 'building-dynamic-websites') - self.assertEqual(result['title'], 'Building Dynamic Websites') - self.assertEqual(result['description'], u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.") - self.assertEqual(len(result['entries']), 10) + self.assertEqual(result['id'], 'laws-of-nature') + self.assertEqual(result['title'], 'Laws of Nature') + self.assertEqual(result['description'],u'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.')# u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.") + self.assertEqual(len(result['entries']), 4) def test_ivi_compilation(self): dl = FakeYDL() diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py index 72f81d01a..f62173282 100644 --- a/youtube_dl/extractor/academicearth.py +++ b/youtube_dl/extractor/academicearth.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class AcademicEarthCourseIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/(?:courses|playlists)/(?P[^?#/]+)' + _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P[^?#/]+)' IE_NAME = 'AcademicEarth:Course' def _real_extract(self, url): @@ -14,12 +14,13 @@ class AcademicEarthCourseIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) title = self._html_search_regex( - r'

    (.*?)

    ', webpage, u'title') + r'

    ]*?>(.*?)

    ', webpage, u'title') description = self._html_search_regex( - r'

    (.*?)

    ', + r'

    ]*?>(.*?)

    ', webpage, u'description', fatal=False) + print(description) urls = re.findall( - r'

    ', + r'
  • \s*?', webpage) entries = [self.url_result(u) for u in urls] From b732f3581f606aaa98bd84ab18ff07653391ead9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 24 Feb 2014 14:20:17 +0100 Subject: [PATCH 48/58] [academicearth] Remove debug print --- youtube_dl/extractor/academicearth.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py index f62173282..59d3bbba4 100644 --- a/youtube_dl/extractor/academicearth.py +++ b/youtube_dl/extractor/academicearth.py @@ -18,7 +18,6 @@ class AcademicEarthCourseIE(InfoExtractor): description = self._html_search_regex( r'

    ]*?>(.*?)

    ', webpage, u'description', fatal=False) - print(description) urls = re.findall( r'
  • \s*?', webpage) From 47610c4d3e6f8b7f3c70974c3287988db68c891a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 24 Feb 2014 14:35:26 +0100 Subject: [PATCH 49/58] [cinemassacre] Fix extraction Now we download over http, we don't need rtmpdump. --- youtube_dl/extractor/cinemassacre.py | 33 ++++++++++------------------ 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index f0d08cebf..acc18dbe2 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -11,28 +11,22 @@ class CinemassacreIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?(?Pcinemassacre\.com/(?P[0-9]{4})/(?P[0-9]{2})/(?P[0-9]{2})/.+?)(?:[/?].*)?' _TESTS = [{ u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', - u'file': u'19911.flv', + u'file': u'19911.mp4', + u'md5': u'fde81fbafaee331785f58cd6c0d46190', u'info_dict': { u'upload_date': u'20121110', u'title': u'“Angry Video Game Nerd: The Movie” – Trailer', u'description': u'md5:fb87405fcb42a331742a0dce2708560b', }, - u'params': { - # rtmp download - u'skip_download': True, - }, }, { u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', - u'file': u'521be8ef82b16.flv', + u'file': u'521be8ef82b16.mp4', + u'md5': u'd72f10cd39eac4215048f62ab477a511', u'info_dict': { u'upload_date': u'20131002', u'title': u'The Mummy’s Hand (1940)', }, - u'params': { - # rtmp download - u'skip_download': True, - }, }] def _real_extract(self, url): @@ -55,26 +49,21 @@ class CinemassacreIE(InfoExtractor): video_description = None playerdata = self._download_webpage(playerdata_url, video_id) - url = self._html_search_regex(r'\'streamer\': \'(?P[^\']+)\'', playerdata, u'url') - sd_file = self._html_search_regex(r'\'file\': \'(?P[^\']+)\'', playerdata, u'sd_file') - hd_file = self._html_search_regex(r'\'?file\'?: "(?P[^"]+)"', playerdata, u'hd_file') - video_thumbnail = self._html_search_regex(r'\'image\': \'(?P[^\']+)\'', playerdata, u'thumbnail', fatal=False) + sd_url = self._html_search_regex(r'file: \'(?P[^\']+)\', label: \'SD\'', playerdata, u'sd_file') + hd_url= self._html_search_regex(r'file: \'(?P[^\']+)\', label: \'HD\'', playerdata, u'hd_file') + video_thumbnail = self._html_search_regex(r'image: \'(?P[^\']+)\'', playerdata, u'thumbnail', fatal=False) formats = [ { - 'url': url, - 'play_path': 'mp4:' + sd_file, - 'rtmp_live': True, # workaround - 'ext': 'flv', + 'url': sd_url, + 'ext': 'mp4', 'format': 'sd', 'format_id': 'sd', }, { - 'url': url, - 'play_path': 'mp4:' + hd_file, - 'rtmp_live': True, # workaround - 'ext': 'flv', + 'url': hd_url, + 'ext': 'mp4', 'format': 'hd', 'format_id': 'hd', }, From 5bb67dbfea5f64d7eaa2ae1584b8d4ec5aa6f5d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 24 Feb 2014 14:44:29 +0100 Subject: [PATCH 50/58] [cinemassacre] Modernize --- youtube_dl/extractor/cinemassacre.py | 60 ++++++++++++++-------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index acc18dbe2..bfbffefdc 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -1,4 +1,5 @@ # encoding: utf-8 +from __future__ import unicode_literals import re from .common import InfoExtractor @@ -8,51 +9,52 @@ from ..utils import ( class CinemassacreIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?(?Pcinemassacre\.com/(?P[0-9]{4})/(?P[0-9]{2})/(?P[0-9]{2})/.+?)(?:[/?].*)?' - _TESTS = [{ - u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', - u'file': u'19911.mp4', - u'md5': u'fde81fbafaee331785f58cd6c0d46190', - u'info_dict': { - u'upload_date': u'20121110', - u'title': u'“Angry Video Game Nerd: The Movie” – Trailer', - u'description': u'md5:fb87405fcb42a331742a0dce2708560b', + _VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P[0-9]{4})/(?P[0-9]{2})/(?P[0-9]{2})/.+?' + _TESTS = [ + { + 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', + 'file': '19911.mp4', + 'md5': 'fde81fbafaee331785f58cd6c0d46190', + 'info_dict': { + 'upload_date': '20121110', + 'title': '“Angry Video Game Nerd: The Movie” – Trailer', + 'description': 'md5:fb87405fcb42a331742a0dce2708560b', + }, }, - }, - { - u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', - u'file': u'521be8ef82b16.mp4', - u'md5': u'd72f10cd39eac4215048f62ab477a511', - u'info_dict': { - u'upload_date': u'20131002', - u'title': u'The Mummy’s Hand (1940)', - }, - }] + { + 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', + 'file': '521be8ef82b16.mp4', + 'md5': 'd72f10cd39eac4215048f62ab477a511', + 'info_dict': { + 'upload_date': '20131002', + 'title': 'The Mummy’s Hand (1940)', + }, + } + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - webpage_url = u'http://' + mobj.group('url') - webpage = self._download_webpage(webpage_url, None) # Don't know video id yet + webpage = self._download_webpage(url, None) # Don't know video id yet video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') mobj = re.search(r'src="(?Phttp://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P.+?))"', webpage) if not mobj: - raise ExtractorError(u'Can\'t extract embed url and video id') - playerdata_url = mobj.group(u'embed_url') - video_id = mobj.group(u'video_id') + raise ExtractorError('Can\'t extract embed url and video id') + playerdata_url = mobj.group('embed_url') + video_id = mobj.group('video_id') video_title = self._html_search_regex(r'(?P<title>.+?)\|', - webpage, u'title') + webpage, 'title') video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>', - webpage, u'description', flags=re.DOTALL, fatal=False) + webpage, 'description', flags=re.DOTALL, fatal=False) if len(video_description) == 0: video_description = None playerdata = self._download_webpage(playerdata_url, video_id) - sd_url = self._html_search_regex(r'file: \'(?P<sd_file>[^\']+)\', label: \'SD\'', playerdata, u'sd_file') - hd_url= self._html_search_regex(r'file: \'(?P<hd_file>[^\']+)\', label: \'HD\'', playerdata, u'hd_file') - video_thumbnail = self._html_search_regex(r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, u'thumbnail', fatal=False) + sd_url = self._html_search_regex(r'file: \'(?P<sd_file>[^\']+)\', label: \'SD\'', playerdata, 'sd_file') + hd_url = self._html_search_regex(r'file: \'(?P<hd_file>[^\']+)\', label: \'HD\'', playerdata, 'hd_file') + video_thumbnail = self._html_search_regex(r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False) formats = [ { From d6f0d8664977f72f54aaf644e87586d4b0e0be2d Mon Sep 17 00:00:00 2001 From: "Sergey M." <dstftw@gmail.com> Date: Mon, 24 Feb 2014 22:01:19 +0700 Subject: [PATCH 51/58] [novamov] Improve _VALID_URL --- youtube_dl/extractor/novamov.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index 6af8d934c..22a382457 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -10,7 +10,7 @@ from ..utils import ( class NovamovIE(InfoExtractor): - _VALID_URL = r'http://(?:(?:www\.)?novamov\.com/video/|(?:(?:embed|www)\.)novamov\.com/embed\.php\?v=)(?P<videoid>[a-z\d]{13})' + _VALID_URL = r'http://(?:(?:www\.)?novamov\.com/video/|(?:(?:embed|www)\.)novamov\.com/embed\.php\?(?:.*?&)?v=)(?P<videoid>[a-z\d]{13})' _TEST = { 'url': 'http://www.novamov.com/video/4rurhn9x446jj', From ce78943ae1be8cc24ec43f97dc67d34010ae08f7 Mon Sep 17 00:00:00 2001 From: "Sergey M." <dstftw@gmail.com> Date: Mon, 24 Feb 2014 23:30:09 +0700 Subject: [PATCH 52/58] [novamov] Generalize extractor --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/novamov.py | 42 ++++++++++++++++++-------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6dccd5ae7..8eff3df41 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -161,7 +161,7 @@ from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE from .ninegag import NineGagIE from .normalboots import NormalbootsIE -from .novamov import NovamovIE +from .novamov import NovaMovIE from .nowness import NownessIE from .nowvideo import NowVideoIE from .ooyala import OoyalaIE diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index 22a382457..fd310e219 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -9,14 +9,25 @@ from ..utils import ( ) -class NovamovIE(InfoExtractor): - _VALID_URL = r'http://(?:(?:www\.)?novamov\.com/video/|(?:(?:embed|www)\.)novamov\.com/embed\.php\?(?:.*?&)?v=)(?P<videoid>[a-z\d]{13})' +class NovaMovIE(InfoExtractor): + IE_NAME = 'novamov' + IE_DESC = 'NovaMov' + + _VALID_URL = r'http://(?:(?:www\.)?%(host)s/video/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<videoid>[a-z\d]{13})' % {'host': 'novamov\.com'} + + _HOST = 'www.novamov.com' + + _FILE_DELETED_REGEX = r'This file no longer exists on our servers!</h2>' + _FILEKEY_REGEX = r'flashvars\.filekey="(?P<filekey>[^"]+)";' + _TITLE_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>' + _DESCRIPTION_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>' _TEST = { 'url': 'http://www.novamov.com/video/4rurhn9x446jj', - 'file': '4rurhn9x446jj.flv', 'md5': '7205f346a52bbeba427603ba10d4b935', 'info_dict': { + 'id': '4rurhn9x446jj', + 'ext': 'flv', 'title': 'search engine optimization', 'description': 'search engine optimization is used to rank the web page in the google search engine' }, @@ -27,31 +38,26 @@ class NovamovIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - page = self._download_webpage('http://www.novamov.com/video/%s' % video_id, - video_id, 'Downloading video page') + page = self._download_webpage( + 'http://%s/video/%s' % (self._HOST, video_id), video_id, 'Downloading video page') - if re.search(r'This file no longer exists on our servers!</h2>', page) is not None: + if re.search(self._FILE_DELETED_REGEX, page) is not None: raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) - filekey = self._search_regex( - r'flashvars\.filekey="(?P<filekey>[^"]+)";', page, 'filekey') + filekey = self._search_regex(self._FILEKEY_REGEX, page, 'filekey') - title = self._html_search_regex( - r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>', - page, 'title', fatal=False) + title = self._html_search_regex(self._TITLE_REGEX, page, 'title', fatal=False) - description = self._html_search_regex( - r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>', - page, 'description', fatal=False) + description = self._html_search_regex(self._DESCRIPTION_REGEX, page, 'description', default='', fatal=False) api_response = self._download_webpage( - 'http://www.novamov.com/api/player.api.php?key=%s&file=%s' % (filekey, video_id), - video_id, 'Downloading video api response') + 'http://%s/api/player.api.php?key=%s&file=%s' % (self._HOST, filekey, video_id), video_id, + 'Downloading video api response') response = compat_urlparse.parse_qs(api_response) if 'error_msg' in response: - raise ExtractorError('novamov returned error: %s' % response['error_msg'][0], expected=True) + raise ExtractorError('%s returned error: %s' % (self.IE_NAME, response['error_msg'][0]), expected=True) video_url = response['url'][0] @@ -60,4 +66,4 @@ class NovamovIE(InfoExtractor): 'url': video_url, 'title': title, 'description': description - } + } \ No newline at end of file From f1c9dfcc010611adf145f74d86047b7387b62025 Mon Sep 17 00:00:00 2001 From: "Sergey M." <dstftw@gmail.com> Date: Mon, 24 Feb 2014 23:30:58 +0700 Subject: [PATCH 53/58] [nowvideo] Rewrite based on novamov extractor --- youtube_dl/extractor/nowvideo.py | 64 ++++++++++++-------------------- 1 file changed, 23 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py index 168ca8b9f..dd665874d 100644 --- a/youtube_dl/extractor/nowvideo.py +++ b/youtube_dl/extractor/nowvideo.py @@ -1,46 +1,28 @@ -import re +from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import compat_urlparse +from .novamov import NovaMovIE -class NowVideoIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.(?:ch|sx)/video/(?P<id>\w+)' +class NowVideoIE(NovaMovIE): + IE_NAME = 'nowvideo' + IE_DESC = 'NowVideo' + + _VALID_URL = r'http://(?:(?:www\.)?%(host)s/video/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<videoid>[a-z\d]{13})' % {'host': 'nowvideo\.(?:ch|sx|eu)'} + + _HOST = 'www.nowvideo.ch' + + _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' + _FILEKEY_REGEX = r'var fkzd="([^"]+)";' + _TITLE_REGEX = r'<h4>([^<]+)</h4>' + _DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>' + _TEST = { - u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa', - u'file': u'0mw0yow7b6dxa.flv', - u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817', - u'info_dict': { - u"title": u"youtubedl test video _BaW_jenozKc.mp4" + 'url': 'http://www.nowvideo.ch/video/0mw0yow7b6dxa', + 'md5': 'f8fbbc8add72bd95b7850c6a02fc8817', + 'info_dict': { + 'id': '0mw0yow7b6dxa', + 'ext': 'flv', + 'title': 'youtubedl test video _BaW_jenozKc.mp4', + 'description': 'Description', } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - webpage_url = 'http://www.nowvideo.ch/video/' + video_id - embed_url = 'http://embed.nowvideo.ch/embed.php?v=' + video_id - webpage = self._download_webpage(webpage_url, video_id) - embed_page = self._download_webpage(embed_url, video_id, - u'Downloading embed page') - - self.report_extraction(video_id) - - video_title = self._html_search_regex(r'<h4>(.*)</h4>', - webpage, u'video title') - - video_key = self._search_regex(r'var fkzd="(.*)";', - embed_page, u'video key') - - api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key) - api_response = self._download_webpage(api_call, video_id, - u'Downloading API page') - video_url = compat_urlparse.parse_qs(api_response)[u'url'][0] - - return [{ - 'id': video_id, - 'url': video_url, - 'ext': 'flv', - 'title': video_title, - }] + } \ No newline at end of file From f6acbdecf483513166287a41cba5eed928404dc2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 24 Feb 2014 17:31:09 +0100 Subject: [PATCH 54/58] [podomatic] Use unicode_literals --- youtube_dl/extractor/podomatic.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py index 58200971b..19ad45c98 100644 --- a/youtube_dl/extractor/podomatic.py +++ b/youtube_dl/extractor/podomatic.py @@ -1,7 +1,10 @@ +from __future__ import unicode_literals + import json import re from .common import InfoExtractor +from ..utils import int_or_none class PodomaticIE(InfoExtractor): @@ -9,14 +12,14 @@ class PodomaticIE(InfoExtractor): _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)' _TEST = { - u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00", - u"file": u"2009-01-02T16_03_35-08_00.mp3", - u"md5": u"84bb855fcf3429e6bf72460e1eed782d", - u"info_dict": { - u"uploader": u"Science Teaching Tips", - u"uploader_id": u"scienceteachingtips", - u"title": u"64. When the Moon Hits Your Eye", - u"duration": 446, + "url": "http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00", + "file": "2009-01-02T16_03_35-08_00.mp3", + "md5": "84bb855fcf3429e6bf72460e1eed782d", + "info_dict": { + "uploader": "Science Teaching Tips", + "uploader_id": "scienceteachingtips", + "title": "64. When the Moon Hits Your Eye", + "duration": 446, } } @@ -36,7 +39,7 @@ class PodomaticIE(InfoExtractor): uploader = data['podcast'] title = data['title'] thumbnail = data['imageLocation'] - duration = int(data['length'] / 1000.0) + duration = int_or_none(data.get('length'), 1000) return { 'id': video_id, From b1c6c32f785d2e0ec943da0bc88609bcd4409ae3 Mon Sep 17 00:00:00 2001 From: "Sergey M." <dstftw@gmail.com> Date: Mon, 24 Feb 2014 23:37:42 +0700 Subject: [PATCH 55/58] [generic] Add support for nowvideo embedded videos --- youtube_dl/extractor/generic.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9a2e54d14..22d1b5daa 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -363,11 +363,17 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group(1), 'Mpora') - # Look for embedded Novamov player + # Look for embedded NovaMov player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage) if mobj is not None: - return self.url_result(mobj.group('url'), 'Novamov') + return self.url_result(mobj.group('url'), 'NovaMov') + + # Look for embedded NowVideo player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?nowvideo\.(?:ch|sx|eu)/embed\.php.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'NowVideo') # Look for embedded Facebook player mobj = re.search( From 2bfe4ead4ba13e1fed69e70ef49cec9dfa7f84a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 24 Feb 2014 22:01:34 +0100 Subject: [PATCH 56/58] [veoh] Allow to download videos with age protection (fixes #2455) --- youtube_dl/extractor/veoh.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index baa57f343..c90feefd2 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -4,6 +4,7 @@ import re import json from .common import InfoExtractor +from ..utils import compat_urllib_request class VeohIE(InfoExtractor): @@ -24,6 +25,13 @@ class VeohIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) + age_limit = 0 + if 'class="adultwarning-container"' in webpage: + self.report_age_confirmation() + age_limit = 18 + request = compat_urllib_request.Request(url) + request.add_header('Cookie', 'confirmedAdult=true') + webpage = self._download_webpage(request, video_id) m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage) if m_youtube is not None: @@ -44,4 +52,5 @@ class VeohIE(InfoExtractor): 'thumbnail': info.get('highResImage') or info.get('medResImage'), 'description': info['description'], 'view_count': info['views'], + 'age_limit': age_limit, } From 62e609ab771140b185e98ed085445d40b751cbfc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 25 Feb 2014 01:43:17 +0100 Subject: [PATCH 57/58] Ignore BOM in batch files (Fixes #2450) --- test/test_utils.py | 11 +++++++++++ youtube_dl/__init__.py | 13 ++++++------- youtube_dl/utils.py | 17 +++++++++++++++++ 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 84553b943..4e3c37fb4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -9,6 +9,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Various small unit tests +import io import xml.etree.ElementTree #from youtube_dl.utils import htmlentity_transform @@ -21,6 +22,7 @@ from youtube_dl.utils import ( orderedSet, PagedList, parse_duration, + read_batch_urls, sanitize_filename, shell_quote, smuggle_url, @@ -250,5 +252,14 @@ class TestUtil(unittest.TestCase): def test_struct_unpack(self): self.assertEqual(struct_unpack(u'!B', b'\x00'), (0,)) + def test_read_batch_urls(self): + f = io.StringIO(u'''\xef\xbb\xbf foo + bar\r + baz + # More after this line\r + ; or after this + bam''') + self.assertEqual(read_batch_urls(f), [u'foo', u'bar', u'baz', u'bam']) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 84f29a1a5..2aaafd37a 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -71,6 +71,7 @@ from .utils import ( get_cachedir, MaxDownloadsReached, preferredencoding, + read_batch_urls, SameFileError, setproctitle, std_headers, @@ -552,21 +553,19 @@ def _real_main(argv=None): sys.exit(0) # Batch file verification - batchurls = [] + batch_urls = [] if opts.batchfile is not None: try: if opts.batchfile == '-': batchfd = sys.stdin else: - batchfd = open(opts.batchfile, 'r') - batchurls = batchfd.readlines() - batchurls = [x.strip() for x in batchurls] - batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)] + batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore') + batch_urls = read_batch_urls(batchfd) if opts.verbose: - write_string(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n') + write_string(u'[debug] Batch file urls: ' + repr(batch_urls) + u'\n') except IOError: sys.exit(u'ERROR: batch file could not be read') - all_urls = batchurls + args + all_urls = batch_urls + args all_urls = [url.strip() for url in all_urls] _enc = preferredencoding() all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 25e40a837..0c482631a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import contextlib import ctypes import datetime import email.utils @@ -1245,3 +1246,19 @@ except TypeError: else: struct_pack = struct.pack struct_unpack = struct.unpack + + +def read_batch_urls(batch_fd): + def fixup(url): + if not isinstance(url, compat_str): + url = url.decode('utf-8', 'replace') + BOM_UTF8 = u'\xef\xbb\xbf' + if url.startswith(BOM_UTF8): + url = url[len(BOM_UTF8):] + url = url.strip() + if url.startswith(('#', ';', ']')): + return False + return url + + with contextlib.closing(batch_fd) as fd: + return [url for url in map(fixup, fd) if url] From cc3a3b6b47eace5ccaeb51a8579701593715d8c4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 25 Feb 2014 01:45:10 +0100 Subject: [PATCH 58/58] release 2014.02.25 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0d6ddc194..2b1eee34d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.02.24' +__version__ = '2014.02.25'