From 8da531359e8dc5299b438195ac75c30100ae05df Mon Sep 17 00:00:00 2001 From: sahutd Date: Sat, 18 Jan 2014 20:45:53 +0530 Subject: [PATCH 001/339] Added dropbox support. issue #2055 --- youtube_dl/extractor/dropbox.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 youtube_dl/extractor/dropbox.py diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py new file mode 100644 index 000000000..0df025cd3 --- /dev/null +++ b/youtube_dl/extractor/dropbox.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +class DropBoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dropbox.com/s/(?P[a-zA-Z0-9]{15})/(?P.*)' + _TEST = { + 'url': 'https://www.dropbox.com/s/mcnzehi9wo55th4/20131219_085616.mp4', + 'file': '20131219_085616.mp4', + 'md5': '2cec58eb277054eca0dbaaf3bdc72564', + + } + + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + video_id=mobj.group('id') + title=mobj.group('title') + webpage = self._download_webpage(url, video_id) + video_url=url+'?dl=1' + return{ + 'id':video_id, + 'title':title, + 'formats': [{ + 'url': video_url, + 'vcodec': 'none', + }] + + } + \ No newline at end of file From f2ffd10bb202ba4bb1111a86901a0f8d1f225998 Mon Sep 17 00:00:00 2001 From: sahutd <sahutd@users.noreply.github.com> Date: Sat, 18 Jan 2014 20:48:43 +0530 Subject: [PATCH 002/339] Update __init__.py --- youtube_dl/extractor/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d66f7b026..81c06b586 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -47,6 +47,7 @@ from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE from .defense import DefenseGouvFrIE +from .dropbox import DropBoxIE from .ebaumsworld import EbaumsWorldIE from .ehow import EHowIE from 
.eighttracks import EightTracksIE From 2a7c35dd46ad90872feb2ca1baf85f226624cc4c Mon Sep 17 00:00:00 2001 From: sahutd <sahutd@gmail.com> Date: Sat, 18 Jan 2014 20:50:42 +0530 Subject: [PATCH 003/339] added dropbox support --- youtube_dl/extractor/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d66f7b026..81c06b586 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -47,6 +47,7 @@ from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE from .defense import DefenseGouvFrIE +from .dropbox import DropBoxIE from .ebaumsworld import EbaumsWorldIE from .ehow import EHowIE from .eighttracks import EightTracksIE From dd27fd1739ad7fed878ecab17e51001c336d1190 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 19 Jan 2014 05:47:20 +0100 Subject: [PATCH 004/339] [youtube] Download DASH manifest If given, download and parse the DASH manifest file, in order to get ultra-HQ formats. 
Fixes #2166 --- youtube_dl/YoutubeDL.py | 2 + youtube_dl/extractor/common.py | 1 + youtube_dl/extractor/youtube.py | 96 ++++++++++++++++++++++++--------- youtube_dl/utils.py | 4 +- 4 files changed, 76 insertions(+), 27 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a0ab89b3d..11f88f128 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1091,6 +1091,8 @@ class YoutubeDL(object): res += 'audio' if fdict.get('abr') is not None: res += '@%3dk' % fdict['abr'] + if fdict.get('asr') is not None: + res += ' (%5dHz)' % fdict['asr'] if fdict.get('filesize') is not None: if res: res += ', ' diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 692d828da..56c54a5ce 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -63,6 +63,7 @@ class InfoExtractor(object): * tbr Average bitrate of audio and video in KBit/s * abr Average audio bitrate in KBit/s * acodec Name of the audio codec in use + * asr Audio sampling rate in Hertz * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use * filesize The number of bytes, if known in advance diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bf3fde610..b943f19f9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,6 +27,7 @@ from ..utils import ( get_element_by_id, get_element_by_attribute, ExtractorError, + int_or_none, unescapeHTML, unified_strdate, orderedSet, @@ -269,6 +270,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"uploader_id": u"setindia" } }, + { + u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I", + u"file": u"a9LDPn-MO4I.m4a", + u"note": u"256k DASH audio (format 141) via DASH manifest", + u"params": { + u"format": "141" + }, + u"info_dict": { + u"upload_date": "20121002", + u"uploader_id": "8KVIDEO", + u"description": "No description available.", + u"uploader": "8KVIDEO", + u"title": 
"UHDTV TEST 8K VIDEO.mp4" + } + }, ] @@ -1066,18 +1082,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_id = mobj.group(2) return video_id - def _get_video_url_list(self, url_map): - """ - Transform a dictionary in the format {itag:url} to a list of (itag, url) - with the requested formats. - """ - existing_formats = [x for x in self._formats if x in url_map] - if len(existing_formats) == 0: - raise ExtractorError(u'no known formats available for video') - video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats - video_url_list.reverse() # order worst to best - return video_url_list - def _extract_from_m3u8(self, manifest_url, video_id): url_map = {} def _get_urls(_manifest): @@ -1251,7 +1255,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_annotations = self._extract_annotations(video_id) # Decide which formats to download - try: mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) if not mobj: @@ -1276,9 +1279,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): except ValueError: pass + def _map_to_format_list(urlmap): + formats = [] + for itag, video_real_url in urlmap.items(): + dct = { + 'format_id': itag, + 'url': video_real_url, + 'player_url': player_url, + } + dct.update(self._formats[itag]) + formats.append(dct) + return formats + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() - video_url_list = [('_rtmp', video_info['conn'][0])] + formats = [{ + 'format_id': '_rtmp', + 'protocol': 'rtmp', + 'url': video_info['conn'][0], + 'player_url': player_url, + }] elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1: encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0] if 'rtmpe%3Dyes' in encoded_url_map: @@ -1323,23 +1343,49 @@ class YoutubeIE(YoutubeBaseInfoExtractor, 
SubtitlesInfoExtractor): if 'ratebypass' not in url: url += '&ratebypass=yes' url_map[url_data['itag'][0]] = url - video_url_list = self._get_video_url_list(url_map) + formats = _map_to_format_list(url_map) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id) - video_url_list = self._get_video_url_list(url_map) + formats = _map_to_format_list(url_map) else: raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') - formats = [] - for itag, video_real_url in video_url_list: - dct = { - 'format_id': itag, - 'url': video_real_url, - 'player_url': player_url, - } - dct.update(self._formats[itag]) - formats.append(dct) + # Look for the DASH manifest + dash_manifest_url_lst = video_info.get('dashmpd') + if dash_manifest_url_lst and dash_manifest_url_lst[0]: + try: + dash_doc = self._download_xml( + dash_manifest_url_lst[0], video_id, + note=u'Downloading DASH manifest', + errnote=u'Could not download DASH manifest') + for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): + url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') + if url_el is None: + continue + format_id = r.attrib['id'] + video_url = url_el.text + filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) + f = { + 'format_id': format_id, + 'url': video_url, + 'width': int_or_none(r.attrib.get('width')), + 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), + 'asr': int_or_none(r.attrib.get('audioSamplingRate')), + 'filesize': filesize, + } + try: + existing_format = next( + fo for fo in formats + if fo['format_id'] == format_id) + except StopIteration: + f.update(self._formats.get(format_id, {})) + formats.append(f) + else: + existing_format.update(f) + + except (ExtractorError, KeyError) as e: + self.report_warning(u'Skipping DASH manifest: %s' % e, video_id) self._sort_formats(formats) diff --git a/youtube_dl/utils.py 
b/youtube_dl/utils.py index 73fe1ad0a..879394d88 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1128,8 +1128,8 @@ class HEADRequest(compat_urllib_request.Request): return "HEAD" -def int_or_none(v): - return v if v is None else int(v) +def int_or_none(v, scale=1): + return v if v is None else (int(v) // scale) def parse_duration(s): From 6b79f40c3d37d33ac944241e205df8c5c4bbabca Mon Sep 17 00:00:00 2001 From: sahutd <sahutd@gmail.com> Date: Sun, 19 Jan 2014 10:20:26 +0530 Subject: [PATCH 005/339] Added support for Dropbox --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/dropbox.py | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 81c06b586..5605e917b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -47,7 +47,7 @@ from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE from .defense import DefenseGouvFrIE -from .dropbox import DropBoxIE +from .dropbox import DropboxIE from .ebaumsworld import EbaumsWorldIE from .ehow import EHowIE from .eighttracks import EightTracksIE diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 0df025cd3..e4d60d17a 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -5,13 +5,15 @@ import re from .common import InfoExtractor -class DropBoxIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dropbox.com/s/(?P<id>[a-zA-Z0-9]{15})/(?P<title>.*)' +class DropboxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/s/(?P<id>[a-zA-Z0-9]{15})/(?P<title>[^?#]*)' _TEST = { - 'url': 'https://www.dropbox.com/s/mcnzehi9wo55th4/20131219_085616.mp4', - 'file': '20131219_085616.mp4', - 'md5': '2cec58eb277054eca0dbaaf3bdc72564', - + u'url': u'https://www.dropbox.com/s/mcnzehi9wo55th4/20131219_085616.mp4', + u'file': u'mcnzehi9wo55th4.mp4', + u'md5': 
u'2cec58eb277054eca0dbaaf3bdc72564', + u'info_dict': { + u'title': '20131219_085616' + } } @@ -24,10 +26,7 @@ class DropBoxIE(InfoExtractor): return{ 'id':video_id, 'title':title, - 'formats': [{ - 'url': video_url, - 'vcodec': 'none', - }] + 'url':video_url } \ No newline at end of file From ce4e242a6f360e9b7cbf9675d55a14eb1e595a7f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 19 Jan 2014 06:14:24 +0100 Subject: [PATCH 006/339] [dropbox] PEP8 and simplify (#2171) --- youtube_dl/extractor/dropbox.py | 36 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index e4d60d17a..44f827e89 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -5,28 +5,26 @@ import re from .common import InfoExtractor + class DropboxIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/s/(?P<id>[a-zA-Z0-9]{15})/(?P<title>[^?#]*)' _TEST = { - u'url': u'https://www.dropbox.com/s/mcnzehi9wo55th4/20131219_085616.mp4', - u'file': u'mcnzehi9wo55th4.mp4', - u'md5': u'2cec58eb277054eca0dbaaf3bdc72564', - u'info_dict': { - u'title': '20131219_085616' + 'url': 'https://www.dropbox.com/s/mcnzehi9wo55th4/20131219_085616.mp4', + 'file': 'mcnzehi9wo55th4.mp4', + 'md5': '2cec58eb277054eca0dbaaf3bdc72564', + 'info_dict': { + 'title': '20131219_085616' } } - - - def _real_extract(self,url): + + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id=mobj.group('id') - title=mobj.group('title') - webpage = self._download_webpage(url, video_id) - video_url=url+'?dl=1' - return{ - 'id':video_id, - 'title':title, - 'url':video_url - - } - \ No newline at end of file + video_id = mobj.group('id') + title = mobj.group('title') + video_url = url + '?dl=1' + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + } From 4cf393bb4baf7fd9b2e8643012dd17ab61184310 Mon Sep 17 00:00:00 2001 From: 
Philipp Hagemeister <phihag@phihag.de> Date: Sun, 19 Jan 2014 06:16:40 +0100 Subject: [PATCH 007/339] [dropbox] Correct test case (#2171) --- youtube_dl/extractor/dropbox.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 44f827e89..d74981eea 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import os.path import re from .common import InfoExtractor @@ -11,7 +12,7 @@ class DropboxIE(InfoExtractor): _TEST = { 'url': 'https://www.dropbox.com/s/mcnzehi9wo55th4/20131219_085616.mp4', 'file': 'mcnzehi9wo55th4.mp4', - 'md5': '2cec58eb277054eca0dbaaf3bdc72564', + 'md5': 'f6d65b1b326e82fd7ab7720bea3dacae', 'info_dict': { 'title': '20131219_085616' } @@ -20,7 +21,7 @@ class DropboxIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - title = mobj.group('title') + title = os.path.splitext(mobj.group('title'))[0] video_url = url + '?dl=1' return { From efb1bb90a06d95f838e2854b6dd523749d8e8955 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 19 Jan 2014 11:38:48 +0100 Subject: [PATCH 008/339] [myspace] Add support for song urls (fixes #2040) --- youtube_dl/extractor/myspace.py | 88 +++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py index 050f54a5a..462b8b957 100644 --- a/youtube_dl/extractor/myspace.py +++ b/youtube_dl/extractor/myspace.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import json @@ -8,41 +10,75 @@ from ..utils import ( class MySpaceIE(InfoExtractor): - _VALID_URL = r'https?://myspace\.com/([^/]+)/video/[^/]+/(?P<id>\d+)' + _VALID_URL = 
r'https?://myspace\.com/([^/]+)/(?:video/[^/]+/|music/song/.*?)(?P<id>\d+)' - _TEST = { - u'url': u'https://myspace.com/coldplay/video/viva-la-vida/100008689', - u'info_dict': { - u'id': u'100008689', - u'ext': u'flv', - u'title': u'Viva La Vida', - u'description': u'The official Viva La Vida video, directed by Hype Williams', - u'uploader': u'Coldplay', - u'uploader_id': u'coldplay', + _TESTS = [ + { + 'url': 'https://myspace.com/coldplay/video/viva-la-vida/100008689', + 'info_dict': { + 'id': '100008689', + 'ext': 'flv', + 'title': 'Viva La Vida', + 'description': 'The official Viva La Vida video, directed by Hype Williams', + 'uploader': 'Coldplay', + 'uploader_id': 'coldplay', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, }, - u'params': { - # rtmp download - u'skip_download': True, + # song + { + 'url': 'https://myspace.com/spiderbags/music/song/darkness-in-my-heart-39008454-27041242', + 'info_dict': { + 'id': '39008454', + 'ext': 'flv', + 'title': 'Darkness In My Heart', + 'uploader_id': 'spiderbags', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, }, - } + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - context = json.loads(self._search_regex(r'context = ({.*?});', webpage, - u'context')) - video = context['video'] - rtmp_url, play_path = video['streamUrl'].split(';', 1) - return { - 'id': compat_str(video['mediaId']), - 'title': video['title'], + if 'music/song' in url: + # songs don't store any useful info in the 'context' variable + def search_data(name): + return self._search_regex(r'data-%s="(.*?)"' % name, webpage, + name) + streamUrl = search_data('stream-url') + info = { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'uploader_id': search_data('artist-username'), + 'thumbnail': self._og_search_thumbnail(webpage), + } + else: + context = json.loads(self._search_regex(r'context = ({.*?});', webpage, 
+ u'context')) + video = context['video'] + streamUrl = video['streamUrl'] + info = { + 'id': compat_str(video['mediaId']), + 'title': video['title'], + 'description': video['description'], + 'thumbnail': video['imageUrl'], + 'uploader': video['artistName'], + 'uploader_id': video['artistUsername'], + } + + rtmp_url, play_path = streamUrl.split(';', 1) + info.update({ 'url': rtmp_url, 'play_path': play_path, 'ext': 'flv', - 'description': video['description'], - 'thumbnail': video['imageUrl'], - 'uploader': video['artistName'], - 'uploader_id': video['artistUsername'], - } + }) + return info From 5016f3eac879455a08cf7df0282fe59af9f3facf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 20 Jan 2014 02:44:08 +0100 Subject: [PATCH 009/339] [myspace] More robust mediatype check --- youtube_dl/extractor/myspace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py index 462b8b957..c16939f54 100644 --- a/youtube_dl/extractor/myspace.py +++ b/youtube_dl/extractor/myspace.py @@ -10,7 +10,7 @@ from ..utils import ( class MySpaceIE(InfoExtractor): - _VALID_URL = r'https?://myspace\.com/([^/]+)/(?:video/[^/]+/|music/song/.*?)(?P<id>\d+)' + _VALID_URL = r'https?://myspace\.com/([^/]+)/(?P<mediatype>video/[^/]+/|music/song/.*?)(?P<id>\d+)' _TESTS = [ { @@ -49,7 +49,7 @@ class MySpaceIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - if 'music/song' in url: + if mobj.group('mediatype').startswith('music/song'): # songs don't store any useful info in the 'context' variable def search_data(name): return self._search_regex(r'data-%s="(.*?)"' % name, webpage, From c91778f8c0ba120378cb806f694fdc3f94a5634c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 20 Jan 2014 02:45:49 +0100 Subject: [PATCH 010/339] [youtube] Fall back to header if playlist title is not available Sometimes (in about 10% 
of requests), the og:title is missing for a weird reason. See #2170 for an example --- youtube_dl/extractor/youtube.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bf3fde610..248b30ffb 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,6 +27,7 @@ from ..utils import ( get_element_by_id, get_element_by_attribute, ExtractorError, + RegexNotFoundError, unescapeHTML, unified_strdate, orderedSet, @@ -1448,7 +1449,14 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): if re.search(self._MORE_PAGES_INDICATOR, page) is None: break - playlist_title = self._og_search_title(page) + try: + playlist_title = self._og_search_title(page) + except RegexNotFoundError: + self.report_warning( + u'Playlist page is missing OpenGraph title, falling back ...', + playlist_id) + playlist_title = self._html_search_regex( + r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title') url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, playlist_title) From b7ab05908440915c6c5faa541abe00c62a88bc27 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 20 Jan 2014 11:36:47 +0100 Subject: [PATCH 011/339] Add infrastructure for paged lists This commit allows to download pages in playlists as needed instead of all at once. Before this commit, youtube-dl http://www.youtube.com/user/ANNnewsCH/videos --playlist-end 2 --skip-download took quite some time - now it's almost instantaneous. As an example, the youtube:user extractor has been converted. 
Fixes #2175 --- test/test_utils.py | 22 +++++++++++++++++ youtube_dl/YoutubeDL.py | 23 +++++++++++------ youtube_dl/extractor/youtube.py | 28 ++++++++------------- youtube_dl/utils.py | 44 +++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 25 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index bee355ee0..349c1107f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -18,6 +18,7 @@ from youtube_dl.utils import ( find_xpath_attr, get_meta_content, orderedSet, + PagedList, parse_duration, sanitize_filename, shell_quote, @@ -200,5 +201,26 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('9:12:43'), 33163) self.assertEqual(parse_duration('x:y'), None) + def test_paged_list(self): + def testPL(size, pagesize, sliceargs, expected): + def get_page(pagenum): + firstid = pagenum * pagesize + upto = min(size, pagenum * pagesize + pagesize) + for i in range(firstid, upto): + yield i + + pl = PagedList(get_page, pagesize) + got = pl.getslice(*sliceargs) + self.assertEqual(got, expected) + + testPL(5, 2, (), [0, 1, 2, 3, 4]) + testPL(5, 2, (1,), [1, 2, 3, 4]) + testPL(5, 2, (2,), [2, 3, 4]) + testPL(5, 2, (4,), [4]) + testPL(5, 2, (0, 3), [0, 1, 2]) + testPL(5, 2, (1, 4), [1, 2, 3]) + testPL(5, 2, (2, 99), [2, 3, 4]) + testPL(5, 2, (20, 99), []) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a0ab89b3d..2ad6f1028 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -39,6 +39,7 @@ from .utils import ( locked_file, make_HTTPS_handler, MaxDownloadsReached, + PagedList, PostProcessingError, platform_name, preferredencoding, @@ -575,19 +576,27 @@ class YoutubeDL(object): playlist_results = [] - n_all_entries = len(ie_result['entries']) playliststart = self.params.get('playliststart', 1) - 1 playlistend = self.params.get('playlistend', None) # For backwards compatibility, interpret -1 as whole list if playlistend == -1: playlistend = None 
- entries = ie_result['entries'][playliststart:playlistend] - n_entries = len(entries) - - self.to_screen( - "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" % - (ie_result['extractor'], playlist, n_all_entries, n_entries)) + if isinstance(ie_result['entries'], list): + n_all_entries = len(ie_result['entries']) + entries = ie_result['entries'][playliststart:playlistend] + n_entries = len(entries) + self.to_screen( + "[%s] playlist %s: Collected %d video ids (downloading %d of them)" % + (ie_result['extractor'], playlist, n_all_entries, n_entries)) + else: + assert isinstance(ie_result['entries'], PagedList) + entries = ie_result['entries'].getslice( + playliststart, playlistend) + n_entries = len(entries) + self.to_screen( + "[%s] playlist %s: Downloading %d videos" % + (ie_result['extractor'], playlist, n_entries)) for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries)) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 248b30ffb..dd1a58f3f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,6 +27,7 @@ from ..utils import ( get_element_by_id, get_element_by_attribute, ExtractorError, + PagedList, RegexNotFoundError, unescapeHTML, unified_strdate, @@ -1580,44 +1581,35 @@ class YoutubeUserIE(InfoExtractor): # page by page until there are no video ids - it means we got # all of them. 
- url_results = [] - - for pagenum in itertools.count(0): + def download_page(pagenum): start_index = pagenum * self._GDATA_PAGE_SIZE + 1 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) - page = self._download_webpage(gdata_url, username, - u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)) + page = self._download_webpage( + gdata_url, username, + u'Downloading video ids from %d to %d' % ( + start_index, start_index + self._GDATA_PAGE_SIZE)) try: response = json.loads(page) except ValueError as err: raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) if 'entry' not in response['feed']: - # Number of videos is a multiple of self._MAX_RESULTS - break + return # Extract video identifiers entries = response['feed']['entry'] for entry in entries: title = entry['title']['$t'] video_id = entry['id']['$t'].split('/')[-1] - url_results.append({ + yield { '_type': 'url', 'url': video_id, 'ie_key': 'Youtube', 'id': 'video_id', 'title': title, - }) - - # A little optimization - if current page is not - # "full", ie. does not contain PAGE_SIZE video ids then - # we can assume that this page is the last one - there - # are no more ids on further pages - no need to query - # again. 
- - if len(entries) < self._GDATA_PAGE_SIZE: - break + } + url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) return self.playlist_result(url_results, playlist_title=username) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 73fe1ad0a..ff124d9e8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -6,6 +6,7 @@ import datetime import email.utils import errno import gzip +import itertools import io import json import locale @@ -1161,3 +1162,46 @@ def check_executable(exe, args=[]): except OSError: return False return exe + + +class PagedList(object): + def __init__(self, pagefunc, pagesize): + self._pagefunc = pagefunc + self._pagesize = pagesize + + def getslice(self, start=0, end=None): + res = [] + for pagenum in itertools.count(start // self._pagesize): + firstid = pagenum * self._pagesize + nextfirstid = pagenum * self._pagesize + self._pagesize + if start >= nextfirstid: + continue + + page_results = list(self._pagefunc(pagenum)) + + startv = ( + start % self._pagesize + if firstid <= start < nextfirstid + else 0) + + endv = ( + ((end - 1) % self._pagesize) + 1 + if (end is not None and firstid <= end <= nextfirstid) + else None) + + if startv != 0 or endv is not None: + page_results = page_results[startv:endv] + res.extend(page_results) + + # A little optimization - if current page is not "full", ie. does + # not contain page_size videos then we can assume that this page + # is the last one - there are no more ids on further pages - + # i.e. no need to query again. 
+ if len(page_results) + startv < self._pagesize: + break + + # If we got the whole page, but the next page is not interesting, + # break out early as well + if end == nextfirstid: + break + return res From b853d2e1555dbb4a09fe3d7857c6d2bc044646f4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 20 Jan 2014 11:44:37 +0100 Subject: [PATCH 012/339] release 2014.01.20 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b9c25c4a9..87285763a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.17.2' +__version__ = '2014.01.20' From 5aafe895fce2a7be9595cb2e56b7bd73a748e6b6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 20 Jan 2014 22:11:34 +0100 Subject: [PATCH 013/339] Correct XML ampersand fixup --- test/test_utils.py | 14 ++++++++++++++ youtube_dl/extractor/clipsyndicate.py | 4 ++-- youtube_dl/extractor/metacritic.py | 4 ++-- youtube_dl/extractor/mtv.py | 6 ++---- youtube_dl/utils.py | 7 +++++-- 5 files changed, 25 insertions(+), 10 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index bee355ee0..a17483ada 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -16,6 +16,7 @@ from youtube_dl.utils import ( DateRange, encodeFilename, find_xpath_attr, + fix_xml_ampersands, get_meta_content, orderedSet, parse_duration, @@ -200,5 +201,18 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('9:12:43'), 33163) self.assertEqual(parse_duration('x:y'), None) + def test_fix_xml_ampersands(self): + self.assertEqual( + fix_xml_ampersands('"&x=y&z=a'), '"&x=y&z=a') + self.assertEqual( + fix_xml_ampersands('"&x=y&wrong;&z=a'), + '"&x=y&wrong;&z=a') + self.assertEqual( + fix_xml_ampersands('&'><"'), + '&'><"') + self.assertEqual( + fix_xml_ampersands('Ӓ᪼'), 'Ӓ᪼') + self.assertEqual(fix_xml_ampersands('&#&#'), '&#&#') + if __name__ == 
'__main__': unittest.main() diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index c60089ad3..9ab6a4ab6 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -3,7 +3,7 @@ import re from .common import InfoExtractor from ..utils import ( find_xpath_attr, - fix_xml_all_ampersand, + fix_xml_ampersands ) @@ -33,7 +33,7 @@ class ClipsyndicateIE(InfoExtractor): pdoc = self._download_xml( 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, video_id, u'Downloading video info', - transform_source=fix_xml_all_ampersand) + transform_source=fix_xml_ampersands) track_doc = pdoc.find('trackList/track') def find_param(name): diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index f3ff0e8bb..465ac4916 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -4,7 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( - fix_xml_all_ampersand, + fix_xml_ampersands, ) @@ -27,7 +27,7 @@ class MetacriticIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # The xml is not well formatted, there are raw '&' info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, - video_id, 'Downloading info xml', transform_source=fix_xml_all_ampersand) + video_id, 'Downloading info xml', transform_source=fix_xml_ampersands) clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) formats = [] diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index f1cf41e2d..c4fa16fb6 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_parse, ExtractorError, + fix_xml_ampersands, ) def _media_xml_tag(tag): @@ -83,12 +84,9 @@ class MTVServicesInfoExtractor(InfoExtractor): video_id = self._id_from_uri(uri) data = 
compat_urllib_parse.urlencode({'uri': uri}) - def fix_ampersand(s): - """ Fix unencoded ampersand in XML """ - return s.replace(u'& ', '& ') idoc = self._download_xml( self._FEED_URL + '?' + data, video_id, - u'Downloading info', transform_source=fix_ampersand) + u'Downloading info', transform_source=fix_xml_ampersands) return [self._get_video_info(item) for item in idoc.findall('.//item')] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 73fe1ad0a..70f284149 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1092,9 +1092,12 @@ def month_by_name(name): return None -def fix_xml_all_ampersand(xml_str): +def fix_xml_ampersands(xml_str): """Replace all the '&' by '&' in XML""" - return xml_str.replace(u'&', u'&') + return re.sub( + r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)', + u'&', + xml_str) def setproctitle(title): From b60016e83139ace517fc823cf2b22756e64c2e63 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 21 Jan 2014 01:39:39 +0100 Subject: [PATCH 014/339] Deal with implicitly UTF-16 decoded webpages These webpages don't specify an encoding and rely on the BOM --- youtube_dl/extractor/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 692d828da..6c5d77e58 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -220,6 +220,8 @@ class InfoExtractor(object): webpage_bytes[:1024]) if m: encoding = m.group(1).decode('ascii') + elif webpage_bytes.startswith(b'\xff\xfe'): + encoding = 'utf-16' else: encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): From 3486df383ba3b2f799c5d65bc563bba8dd8c5903 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 21 Jan 2014 01:40:34 +0100 Subject: [PATCH 015/339] [generic] Improve testcase --- youtube_dl/extractor/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 839530982..3e96cb15f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -92,11 +92,12 @@ class GenericIE(InfoExtractor): # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', + 'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4', 'md5': '5644c6ca5d5782c1d0d350dad9bd840c', 'info_dict': { 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', 'ext': 'mp4', - 'title': '2cc213299525360.mov', #that's what we get + 'title': '2cc213299525360.mov', # that's what we get }, }, ] From 9d4288b2d4a47d36a2a8fa116f1023251e436cdc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 21 Jan 2014 01:41:13 +0100 Subject: [PATCH 016/339] [extractor/common] Clarify when and when not we generate the filename --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6c5d77e58..582eb4f5b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -238,7 +238,7 @@ class InfoExtractor(object): except AttributeError: url = url_or_request if len(url) > 200: - h = hashlib.md5(url).hexdigest() + h = u'___' + hashlib.md5(url).hexdigest() url = url[:200 - len(h)] + h raw_filename = ('%s_%s.dump' % (video_id, url)) filename = sanitize_filename(raw_filename, restricted=True) From 7b0817e8e189ced899b64bfc3190b8f6218f04a3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 21 Jan 2014 02:09:49 +0100 Subject: [PATCH 017/339] [servingsys] Add support This also adds support for brightcove advertisements. 
Fixes #2181 --- youtube_dl/YoutubeDL.py | 1 + youtube_dl/__init__.py | 6 ++- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/brightcove.py | 51 +++++++++++++++++++--- youtube_dl/extractor/servingsys.py | 70 ++++++++++++++++++++++++++++++ 5 files changed, 121 insertions(+), 8 deletions(-) create mode 100644 youtube_dl/extractor/servingsys.py diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a0ab89b3d..dc8aa788c 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -151,6 +151,7 @@ class YoutubeDL(object): bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi debug_printtraffic:Print out sent and received HTTP traffic + include_ads: Download ads as well The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 82b1ff4f4..a948b1d90 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -238,7 +238,10 @@ def parseOpts(overrideArguments=None): selection.add_option('--download-archive', metavar='FILE', dest='download_archive', help='Download only videos not listed in the archive file. 
Record the IDs of all downloaded videos in it.') - + selection.add_option( + '--include-ads', dest='include_ads', + action='store_true', + help='Download advertisements as well (experimental)') authentication.add_option('-u', '--username', dest='username', metavar='USERNAME', help='account username') @@ -716,6 +719,7 @@ def _real_main(argv=None): 'bidi_workaround': opts.bidi_workaround, 'debug_printtraffic': opts.debug_printtraffic, 'prefer_ffmpeg': opts.prefer_ffmpeg, + 'include_ads': opts.include_ads, } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5605e917b..7b374f7b9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -152,6 +152,7 @@ from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE from .rutube import RutubeIE +from .servingsys import ServingSysIE from .sina import SinaIE from .slashdot import SlashdotIE from .slideshare import SlideshareIE diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 8ac38f4aa..b873dc0d4 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -9,9 +9,11 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_parse, find_xpath_attr, + fix_xml_ampersands, compat_urlparse, compat_str, compat_urllib_request, + compat_parse_qs, ExtractorError, unsmuggle_url, @@ -83,17 +85,30 @@ class BrightcoveIE(InfoExtractor): lambda m: m.group(1) + '/>', object_str) # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608 object_str = object_str.replace('<--', '<!--') + object_str = fix_xml_ampersands(object_str) object_doc = xml.etree.ElementTree.fromstring(object_str) - assert 'BrightcoveExperience' in object_doc.attrib['class'] - params = { - 'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'], - } + + fv_el = find_xpath_attr(object_doc, 
'./param', 'name', 'flashVars') + flashvars = dict( + (k, v[0]) + for k, v in compat_parse_qs(fv_el.attrib['value']).items()) + def find_param(name): + if name in flashvars: + return flashvars[name] node = find_xpath_attr(object_doc, './param', 'name', name) if node is not None: return node.attrib['value'] return None + + params = {} + + playerID = find_param('playerID') + if playerID is None: + raise ExtractorError('Cannot find player ID') + params['playerID'] = playerID + playerKey = find_param('playerKey') # Not all pages define this value if playerKey is not None: @@ -114,8 +129,12 @@ class BrightcoveIE(InfoExtractor): if it can't be found """ m_brightcove = re.search( - r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', - webpage, re.DOTALL) + r'''(?sx)<object + (?: + :[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 | + [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/ + ).+?</object>''', + webpage) if m_brightcove is not None: return cls._build_brighcove_url(m_brightcove.group()) else: @@ -156,6 +175,7 @@ class BrightcoveIE(InfoExtractor): info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json') info = json.loads(info)['data'] video_info = info['programmedContent']['videoPlayer']['mediaDTO'] + video_info['_youtubedl_adServerURL'] = info.get('adServerURL') return self._extract_video_info(video_info) @@ -193,6 +213,23 @@ class BrightcoveIE(InfoExtractor): info.update({ 'url': video_info['FLVFullLengthURL'], }) - else: + + if self._downloader.params.get('include_ads', False): + adServerURL = video_info.get('_youtubedl_adServerURL') + if adServerURL: + ad_info = { + '_type': 'url', + 'url': adServerURL, + } + if 'url' in info: + return { + '_type': 'playlist', + 'title': info['title'], + 'entries': [ad_info, info], + } + else: + return ad_info + + if 'url' not in info: raise ExtractorError('Unable to extract video url for %s' % info['id']) return info diff --git 
a/youtube_dl/extractor/servingsys.py b/youtube_dl/extractor/servingsys.py new file mode 100644 index 000000000..7ba237dde --- /dev/null +++ b/youtube_dl/extractor/servingsys.py @@ -0,0 +1,70 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, +) + + +class ServingSysIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^.]+\.)?serving-sys\.com/BurstingPipe/adServer\.bs\?.*?&pli=(?P<id>[0-9]+)' + + _TEST = { + 'url': 'http://bs.serving-sys.com/BurstingPipe/adServer.bs?cn=is&c=23&pl=VAST&pli=5349193&PluID=0&pos=7135&ord=[timestamp]&cim=1?', + 'playlist': [{ + 'file': '29955898.flv', + 'md5': 'baed851342df6846eb8677a60a011a0f', + 'info_dict': { + 'title': 'AdAPPter_Hyundai_demo (1)', + 'duration': 74, + 'tbr': 1378, + 'width': 640, + 'height': 400, + }, + }, { + 'file': '29907998.flv', + 'md5': '979b4da2655c4bc2d81aeb915a8c5014', + 'info_dict': { + 'title': 'AdAPPter_Hyundai_demo (2)', + 'duration': 34, + 'width': 854, + 'height': 480, + 'tbr': 516, + }, + }], + 'params': { + 'playlistend': 2, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + pl_id = mobj.group('id') + + vast_doc = self._download_xml(url, pl_id) + title = vast_doc.find('.//AdTitle').text + media = vast_doc.find('.//MediaFile').text + info_url = self._search_regex(r'&adData=([^&]+)&', media, 'info URL') + + doc = self._download_xml(info_url, pl_id, 'Downloading video info') + entries = [{ + '_type': 'video', + 'id': a.attrib['id'], + 'title': '%s (%s)' % (title, a.attrib['assetID']), + 'url': a.attrib['URL'], + 'duration': int_or_none(a.attrib.get('length')), + 'tbr': int_or_none(a.attrib.get('bitrate')), + 'height': int_or_none(a.attrib.get('height')), + 'width': int_or_none(a.attrib.get('width')), + } for a in doc.findall('.//AdditionalAssets/asset')] + + return { + '_type': 'playlist', + 'id': pl_id, + 'title': title, + 'entries': entries, + } + + \ No newline at end of file From 
29895011315f015d0881fd21350f301efc74f22c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 21 Jan 2014 14:07:41 +0100 Subject: [PATCH 018/339] release 2014.01.21 --- README.md | 1 + youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cf0bb7b65..aafdd3e51 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ which means you can modify it, redistribute it or use it however you like. --download-archive FILE Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. + --include-ads Download advertisements as well (experimental) ## Download Options: -r, --rate-limit LIMIT maximum download rate in bytes per second (e.g. diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 87285763a..ca47958be 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.20' +__version__ = '2014.01.21' From 9d11a41fe4f1f70682640e8522565827047dbf89 Mon Sep 17 00:00:00 2001 From: Mike Col <MikeCol@gmx.net> Date: Tue, 21 Jan 2014 14:12:59 +0100 Subject: [PATCH 019/339] [redtube] Add support for thumbnails Signed-off-by: Philipp Hagemeister <phihag@phihag.de> --- youtube_dl/extractor/redtube.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 5c4cd2068..ca2e0aae7 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -33,6 +33,10 @@ class RedTubeIE(InfoExtractor): r'<h1 class="videoTitle[^"]*">(.+?)</h1>', webpage, u'title') + video_thumbnail = self._html_search_regex( + r'playerInnerHTML.+?<img\s+src="(.+?)"', + webpage, u'thumbnail', fatal=False) + # No self-labeling, but they describe themselves as # "Home of Videos Porno" age_limit = 18 @@ -42,5 +46,6 @@ class RedTubeIE(InfoExtractor): 'url': video_url, 'ext': video_extension, 'title': video_title, + 'thumbnail': video_thumbnail, 
'age_limit': age_limit, } From 032b3df5afd7bf5db053221d46a4b7a5e58df45d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 21 Jan 2014 14:16:44 +0100 Subject: [PATCH 020/339] [redtube] Use unicode_literals --- youtube_dl/extractor/redtube.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index ca2e0aae7..4295cf93a 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -6,14 +8,14 @@ from .common import InfoExtractor class RedTubeIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' _TEST = { - u'url': u'http://www.redtube.com/66418', - u'file': u'66418.mp4', + 'url': 'http://www.redtube.com/66418', + 'file': '66418.mp4', # md5 varies from time to time, as in # https://travis-ci.org/rg3/youtube-dl/jobs/14052463#L295 - #u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d', - u'info_dict': { - u"title": u"Sucked on a toilet", - u"age_limit": 18, + #'md5': u'7b8c22b5e7098a3e1c09709df1126d2d', + 'info_dict': { + "title": "Sucked on a toilet", + "age_limit": 18, } } @@ -42,10 +44,10 @@ class RedTubeIE(InfoExtractor): age_limit = 18 return { - 'id': video_id, - 'url': video_url, - 'ext': video_extension, - 'title': video_title, + 'id': video_id, + 'url': video_url, + 'ext': video_extension, + 'title': video_title, 'thumbnail': video_thumbnail, 'age_limit': age_limit, } From 608bf69880a09b5a7b409b584d15a66afdbc5ce5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 21 Jan 2014 17:29:04 +0100 Subject: [PATCH 021/339] [vk] avoid built-in names --- youtube_dl/extractor/vk.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 90d8a6d07..2719becbf 100644 --- 
a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -33,13 +33,13 @@ class VKIE(InfoExtractor): if m_yt is not None: self.to_screen(u'Youtube video detected') return self.url_result(m_yt.group(1), 'Youtube') - vars_json = self._search_regex(r'var vars = ({.*?});', info_page, u'vars') - vars = json.loads(vars_json) + data_json = self._search_regex(r'var vars = ({.*?});', info_page, u'vars') + data = json.loads(data_json) return { - 'id': compat_str(vars['vid']), - 'url': vars['url240'], - 'title': unescapeHTML(vars['md_title']), - 'thumbnail': vars['jpg'], - 'uploader': vars['md_author'], + 'id': compat_str(data['vid']), + 'url': data['url240'], + 'title': unescapeHTML(data['md_title']), + 'thumbnail': data['jpg'], + 'uploader': data['md_author'], } From 94a23d2a1ed94af8bb80898194f03c38a5dcdb1d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 21 Jan 2014 17:32:03 +0100 Subject: [PATCH 022/339] [vk] Use unicode_literals --- youtube_dl/extractor/vk.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 2719becbf..02729506c 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -1,4 +1,6 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re import json @@ -10,17 +12,16 @@ from ..utils import ( class VKIE(InfoExtractor): - IE_NAME = u'vk.com' + IE_NAME = 'vk.com' _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)' _TEST = { - u'url': u'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', - u'md5': u'0deae91935c54e00003c2a00646315f0', - u'info_dict': { - u'id': u'162222515', - u'ext': u'flv', - u'title': u'ProtivoGunz - Хуёвая песня', - u'uploader': u'Noize MC', + 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', + 'file': '162222515.flv', + 'md5': '0deae91935c54e00003c2a00646315f0', + 'info_dict': { + 'title': 'ProtivoGunz - 
Хуёвая песня', + 'uploader': 'Noize MC', }, } @@ -33,7 +34,7 @@ class VKIE(InfoExtractor): if m_yt is not None: self.to_screen(u'Youtube video detected') return self.url_result(m_yt.group(1), 'Youtube') - data_json = self._search_regex(r'var vars = ({.*?});', info_page, u'vars') + data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars') data = json.loads(data_json) return { From 9834872bf63b4e03b66c5e3b8f306556e735d8c5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 21 Jan 2014 18:10:14 +0100 Subject: [PATCH 023/339] [facebook] Add support for embeds Example URL: http://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html --- youtube_dl/extractor/facebook.py | 9 +++++++-- youtube_dl/extractor/generic.py | 6 ++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 4556079c8..8f9154c0e 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -17,7 +17,12 @@ from ..utils import ( class FacebookIE(InfoExtractor): """Information Extractor for Facebook""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:[^#?]*#!/)?(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)' + _VALID_URL = r'''(?x) + (?:https?://)?(?:\w+\.)?facebook\.com/ + (?:[^#?]*\#!/)? + (?:video/video\.php|photo\.php|video/embed)\?(?:.*?) 
+ (?:v|video_id)=(?P<id>[0-9]+) + (?:.*)''' _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _NETRC_MACHINE = 'facebook' @@ -90,7 +95,7 @@ class FacebookIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('ID') + video_id = mobj.group('id') url = 'https://www.facebook.com/video/video.php?v=%s' % video_id webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3e96cb15f..91536075d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -319,6 +319,12 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'Novamov') + # Look for embedded Facebook player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https://www.facebook.com/video/embed.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Facebook') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: From 913f32929b36ec5d0f9685b7c80ac37855ebaf51 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 21 Jan 2014 18:21:44 +0100 Subject: [PATCH 024/339] [vk] Add support for HQ videos (Fixes #2187) --- youtube_dl/extractor/vk.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 02729506c..f13ba1c8e 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -15,7 +15,7 @@ class VKIE(InfoExtractor): IE_NAME = 'vk.com' _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)' - _TEST = { + _TESTS = [{ 'url': 
'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', 'file': '162222515.flv', 'md5': '0deae91935c54e00003c2a00646315f0', @@ -23,7 +23,16 @@ class VKIE(InfoExtractor): 'title': 'ProtivoGunz - Хуёвая песня', 'uploader': 'Noize MC', }, - } + }, + { + 'url': 'http://vk.com/video4643923_163339118', + 'file': '163339118.mp4', + 'md5': 'f79bccb5cd182b1f43502ca5685b2b36', + 'info_dict': { + 'uploader': 'Elvira Dzhonik', + 'title': 'Dream Theater - Hollow Years Live at Budokan 720*', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -37,10 +46,18 @@ class VKIE(InfoExtractor): data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars') data = json.loads(data_json) + formats = [{ + 'format_id': k, + 'url': v, + 'width': int(k[len('url'):]), + } for k, v in data.items() + if k.startswith('url')] + self._sort_formats(formats) + return { 'id': compat_str(data['vid']), - 'url': data['url240'], + 'formats': formats, 'title': unescapeHTML(data['md_title']), - 'thumbnail': data['jpg'], - 'uploader': data['md_author'], + 'thumbnail': data.get('jpg'), + 'uploader': data.get('md_author'), } From 4a3b72771fa86b1d4321a9643c893276c26078a4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 21 Jan 2014 18:21:53 +0100 Subject: [PATCH 025/339] release 2014.01.21.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ca47958be..0a21e896d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.21' +__version__ = '2014.01.21.1' From ef9f2ba7afe0966b7d65158b663f9fcc11db3fd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 21 Jan 2014 19:44:47 +0100 Subject: [PATCH 026/339] [mtv] Use unicode_literals --- youtube_dl/extractor/mtv.py | 44 +++++++++++++++++++------------------ 1 file changed, 23 
insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index c4fa16fb6..e24f22656 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import xml.etree.ElementTree @@ -36,7 +38,7 @@ class MTVServicesInfoExtractor(InfoExtractor): def _extract_video_formats(self, metadataXml): if '/error_country_block.swf' in metadataXml: - raise ExtractorError(u'This video is not available from your country.', expected=True) + raise ExtractorError('This video is not available from your country.', expected=True) mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8')) formats = [] @@ -60,11 +62,11 @@ class MTVServicesInfoExtractor(InfoExtractor): self.report_extraction(video_id) mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url'] # Remove the templates, like &device={device} - mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', u'', mediagen_url) + mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) if 'acceptMethods' not in mediagen_url: mediagen_url += '&acceptMethods=fms' mediagen_page = self._download_webpage(mediagen_url, video_id, - u'Downloading video urls') + 'Downloading video urls') description_node = itemdoc.find('description') if description_node is not None: @@ -86,7 +88,7 @@ class MTVServicesInfoExtractor(InfoExtractor): idoc = self._download_xml( self._FEED_URL + '?' 
+ data, video_id, - u'Downloading info', transform_source=fix_xml_ampersands) + 'Downloading info', transform_source=fix_xml_ampersands) return [self._get_video_info(item) for item in idoc.findall('.//item')] @@ -99,25 +101,25 @@ class MTVIE(MTVServicesInfoExtractor): _TESTS = [ { - u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', - u'file': u'853555.mp4', - u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8', - u'info_dict': { - u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"', - u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + 'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', + 'file': '853555.mp4', + 'md5': '850f3f143316b1e71fa56a4edfd6e0f8', + 'info_dict': { + 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"', + 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', }, }, { - u'add_ie': ['Vevo'], - u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', - u'file': u'USCJY1331283.mp4', - u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', - u'info_dict': { - u'title': u'Everything Has Changed', - u'upload_date': u'20130606', - u'uploader': u'Taylor Swift', + 'add_ie': ['Vevo'], + 'url': 'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', + 'file': 'USCJY1331283.mp4', + 'md5': '73b4e7fcadd88929292fe52c3ced8caf', + 'info_dict': { + 'title': 'Everything Has Changed', + 'upload_date': '20130606', + 'uploader': 'Taylor Swift', }, - u'skip': u'VEVO is only available in some countries', + 'skip': 'VEVO is only available in some countries', }, ] @@ -136,8 +138,8 @@ class MTVIE(MTVServicesInfoExtractor): webpage, re.DOTALL) if m_vevo: vevo_id = m_vevo.group(1); - self.to_screen(u'Vevo video detected: %s' % vevo_id) + self.to_screen('Vevo video detected: %s' % vevo_id) return self.url_result('vevo:%s' % vevo_id, ie='Vevo') - uri = 
self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri') + uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri') return self._get_videos_info(uri) From e4f320a4d044b690721016e36972cd547ee787d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 21 Jan 2014 19:57:38 +0100 Subject: [PATCH 027/339] =?UTF-8?q?[mtv]=20Check=20for=20geo-blocked=20vid?= =?UTF-8?q?eos=20in=20the=20xml=20document,=20not=20in=20the=20xml?= =?UTF-8?q?=E2=80=99s=20string?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allows to use the `_download_xml` method --- youtube_dl/extractor/mtv.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index e24f22656..485c1fd7d 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -36,10 +35,9 @@ class MTVServicesInfoExtractor(InfoExtractor): else: return thumb_node.attrib['url'] - def _extract_video_formats(self, metadataXml): - if '/error_country_block.swf' in metadataXml: + def _extract_video_formats(self, mdoc): + if re.match(r'.*/error_country_block\.swf$', mdoc.find('.//src').text) is not None: raise ExtractorError('This video is not available from your country.', expected=True) - mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8')) formats = [] for rendition in mdoc.findall('.//rendition'): @@ -65,8 +63,8 @@ class MTVServicesInfoExtractor(InfoExtractor): mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) if 'acceptMethods' not in mediagen_url: mediagen_url += '&acceptMethods=fms' - mediagen_page = self._download_webpage(mediagen_url, video_id, - 'Downloading video urls') + mediagen_doc = self._download_xml(mediagen_url, 
video_id, + 'Downloading video urls') description_node = itemdoc.find('description') if description_node is not None: @@ -76,7 +74,7 @@ class MTVServicesInfoExtractor(InfoExtractor): return { 'title': itemdoc.find('title').text, - 'formats': self._extract_video_formats(mediagen_page), + 'formats': self._extract_video_formats(mediagen_doc), 'id': video_id, 'thumbnail': self._get_thumbnail_url(uri, itemdoc), 'description': description, From 8d9453b9e852b585cd7d0228c126d36b682af42f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 21 Jan 2014 20:54:47 +0100 Subject: [PATCH 028/339] Add an extractor for spike.com (#2072) Added a generic _real_extract to MTVServicesInfoExtractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/mtv.py | 13 +++++++++++++ youtube_dl/extractor/spike.py | 19 +++++++++++++++++++ 3 files changed, 33 insertions(+) create mode 100644 youtube_dl/extractor/spike.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7b374f7b9..d37f0a178 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -171,6 +171,7 @@ from .southparkstudios import ( from .space import SpaceIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE +from .spike import SpikeIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE from .steam import SteamIE diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 485c1fd7d..517115501 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -7,6 +7,8 @@ from ..utils import ( compat_urllib_parse, ExtractorError, fix_xml_ampersands, + url_basename, + RegexNotFoundError, ) def _media_xml_tag(tag): @@ -89,6 +91,17 @@ class MTVServicesInfoExtractor(InfoExtractor): 'Downloading info', transform_source=fix_xml_ampersands) return [self._get_video_info(item) for item in idoc.findall('.//item')] 
+ def _real_extract(self, url): + title = url_basename(url) + webpage = self._download_webpage(url, title) + try: + # the url is in the format http://media.mtvnservices.com/fb/{mgid}.swf + fb_url = self._og_search_video_url(webpage) + mgid = url_basename(fb_url).rpartition('.')[0] + except RegexNotFoundError: + mgid = self._search_regex(r'data-mgid="(.*?)"', webpage, u'mgid') + return self._get_videos_info(mgid) + class MTVIE(MTVServicesInfoExtractor): _VALID_URL = r'''(?x)^https?:// diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py new file mode 100644 index 000000000..71a9aaa24 --- /dev/null +++ b/youtube_dl/extractor/spike.py @@ -0,0 +1,19 @@ +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor + + +class SpikeIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://www\.spike\.com/(video-clips|episodes)/.+' + _TEST = { + 'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle', + 'md5': '1a9265f32b0c375793d6c4ce45255256', + 'info_dict': { + 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', + 'ext': 'mp4', + 'title': 'Can Allen Ride A Hundred Year-Old Motorcycle?', + 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', + }, + } + + _FEED_URL = 'http://www.spike.com/feeds/mrss/' From bc4ba05fcbb20dfead6796b0878427b51c9f150a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 21 Jan 2014 20:59:31 +0100 Subject: [PATCH 029/339] [mtv] Add an extractor for mtviggy.com (#2072) --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/mtv.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d37f0a178..4d6aeabdf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -119,7 +119,10 @@ from .mit import TechTVMITIE, MITIE from .mixcloud 
import MixcloudIE from .mpora import MporaIE from .mofosex import MofosexIE -from .mtv import MTVIE +from .mtv import ( + MTVIE, + MTVIggyIE, +) from .muzu import MuzuTVIE from .myspace import MySpaceIE from .myspass import MySpassIE diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 517115501..127fbeb4e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -154,3 +154,17 @@ class MTVIE(MTVServicesInfoExtractor): uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri') return self._get_videos_info(uri) + + +class MTVIggyIE(MTVServicesInfoExtractor): + IE_NAME = 'mtviggy.com' + _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+' + _TEST = { + 'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/', + 'info_dict': { + 'id': '984696', + 'ext': 'mp4', + 'title': 'Short', + } + } + _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/' From d614aa40e35825e1cde7c92fc6092d226afe4898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 21 Jan 2014 21:53:10 +0100 Subject: [PATCH 030/339] [brightcove] Fix check for url in the result MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It may have the ‘formats’ field instead of ‘url’. 
--- youtube_dl/extractor/brightcove.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index b873dc0d4..e13c040f8 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -230,6 +230,6 @@ class BrightcoveIE(InfoExtractor): else: return ad_info - if 'url' not in info: + if 'url' not in info and not info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info From 47917f24c499f7949b04a23c35459ca69adae62d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 21 Jan 2014 22:04:46 +0100 Subject: [PATCH 031/339] [brightcove] Fix extraction of embedded videos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There was a leading ‘:’ in the regex. The ‘flashvars’ parameter is not always available. --- youtube_dl/extractor/brightcove.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index e13c040f8..e1c45d1f0 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -90,9 +90,12 @@ class BrightcoveIE(InfoExtractor): object_doc = xml.etree.ElementTree.fromstring(object_str) fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') - flashvars = dict( - (k, v[0]) - for k, v in compat_parse_qs(fv_el.attrib['value']).items()) + if fv_el is not None: + flashvars = dict( + (k, v[0]) + for k, v in compat_parse_qs(fv_el.attrib['value']).items()) + else: + flashvars = {} def find_param(name): if name in flashvars: @@ -131,7 +134,7 @@ class BrightcoveIE(InfoExtractor): m_brightcove = re.search( r'''(?sx)<object (?: - :[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 | + [^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 | 
[^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/ ).+?</object>''', webpage) From 90834c78fed7b383efac8cb1b8adb9f864992c98 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 00:17:33 +0100 Subject: [PATCH 032/339] [mtv] Fix title for gametrailers (Fixes #2188) We now prefer the title including the category, because that title is what is presented at the actual sites. --- youtube_dl/extractor/gametrailers.py | 2 +- youtube_dl/extractor/mtv.py | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index d82a5d4b2..617578e72 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -10,7 +10,7 @@ class GametrailersIE(MTVServicesInfoExtractor): u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4', u'md5': u'4c8e67681a0ea7ec241e8c09b3ea8cf7', u'info_dict': { - u'title': u'E3 2013: Debut Trailer', + u'title': u'Mirror\'s Edge 2|E3 2013: Debut Trailer', u'description': u'Faith is back! 
Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', }, } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index c4fa16fb6..8385929e0 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -5,9 +5,11 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_parse, ExtractorError, + find_xpath_attr, fix_xml_ampersands, ) + def _media_xml_tag(tag): return '{http://search.yahoo.com/mrss/}%s' % tag @@ -72,8 +74,21 @@ class MTVServicesInfoExtractor(InfoExtractor): else: description = None + title_el = None + if title_el is None: + title_el = find_xpath_attr( + itemdoc, './/{http://search.yahoo.com/mrss/}category', + 'scheme', 'urn:mtvn:video_title') + if title_el is None: + title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title') + if title_el is None: + title_el = itemdoc.find('.//title') + title = title_el.text + if title is None: + raise ExtractorError('Could not find video title') + return { - 'title': itemdoc.find('title').text, + 'title': title, 'formats': self._extract_video_formats(mediagen_page), 'id': video_id, 'thumbnail': self._get_thumbnail_url(uri, itemdoc), From 32dac6943d7e00203bb11695016178c3a23fb135 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 00:07:02 +0100 Subject: [PATCH 033/339] [mtv] Use unicode_literals --- youtube_dl/extractor/mtv.py | 44 +++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 8385929e0..32cfa3632 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import xml.etree.ElementTree @@ -38,7 +40,7 @@ class MTVServicesInfoExtractor(InfoExtractor): def _extract_video_formats(self, metadataXml): if '/error_country_block.swf' in metadataXml: - raise 
ExtractorError(u'This video is not available from your country.', expected=True) + raise ExtractorError('This video is not available from your country.', expected=True) mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8')) formats = [] @@ -62,11 +64,11 @@ class MTVServicesInfoExtractor(InfoExtractor): self.report_extraction(video_id) mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url'] # Remove the templates, like &device={device} - mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', u'', mediagen_url) + mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) if 'acceptMethods' not in mediagen_url: mediagen_url += '&acceptMethods=fms' mediagen_page = self._download_webpage(mediagen_url, video_id, - u'Downloading video urls') + 'Downloading video urls') description_node = itemdoc.find('description') if description_node is not None: @@ -101,7 +103,7 @@ class MTVServicesInfoExtractor(InfoExtractor): idoc = self._download_xml( self._FEED_URL + '?' 
+ data, video_id, - u'Downloading info', transform_source=fix_xml_ampersands) + 'Downloading info', transform_source=fix_xml_ampersands) return [self._get_video_info(item) for item in idoc.findall('.//item')] @@ -114,25 +116,25 @@ class MTVIE(MTVServicesInfoExtractor): _TESTS = [ { - u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', - u'file': u'853555.mp4', - u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8', - u'info_dict': { - u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"', - u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + 'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', + 'file': '853555.mp4', + 'md5': '850f3f143316b1e71fa56a4edfd6e0f8', + 'info_dict': { + 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"', + 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', }, }, { - u'add_ie': ['Vevo'], - u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', - u'file': u'USCJY1331283.mp4', - u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', - u'info_dict': { - u'title': u'Everything Has Changed', - u'upload_date': u'20130606', - u'uploader': u'Taylor Swift', + 'add_ie': ['Vevo'], + 'url': 'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', + 'file': 'USCJY1331283.mp4', + 'md5': '73b4e7fcadd88929292fe52c3ced8caf', + 'info_dict': { + 'title': 'Everything Has Changed', + 'upload_date': '20130606', + 'uploader': 'Taylor Swift', }, - u'skip': u'VEVO is only available in some countries', + 'skip': 'VEVO is only available in some countries', }, ] @@ -151,8 +153,8 @@ class MTVIE(MTVServicesInfoExtractor): webpage, re.DOTALL) if m_vevo: vevo_id = m_vevo.group(1); - self.to_screen(u'Vevo video detected: %s' % vevo_id) + self.to_screen('Vevo video detected: %s' % vevo_id) return self.url_result('vevo:%s' % vevo_id, ie='Vevo') - uri = 
self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri') + uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri') return self._get_videos_info(uri) From 06769acd717191ed61ce639314975816a8f7969c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 00:03:46 +0100 Subject: [PATCH 034/339] [gametrailers] Use unicode_literals Conflicts: youtube_dl/extractor/gametrailers.py --- youtube_dl/extractor/gametrailers.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index 617578e72..c1fdd770e 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .mtv import MTVServicesInfoExtractor @@ -6,12 +8,12 @@ from .mtv import MTVServicesInfoExtractor class GametrailersIE(MTVServicesInfoExtractor): _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)' _TEST = { - u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', - u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4', - u'md5': u'4c8e67681a0ea7ec241e8c09b3ea8cf7', - u'info_dict': { - u'title': u'Mirror\'s Edge 2|E3 2013: Debut Trailer', - u'description': u'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', + 'url': 'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', + 'file': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4', + 'md5': '4c8e67681a0ea7ec241e8c09b3ea8cf7', + 'info_dict': { + 'title': 'Mirror\'s Edge 2|E3 2013: Debut Trailer', + 'description': 'Faith is back! 
Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', }, } @@ -23,5 +25,5 @@ class GametrailersIE(MTVServicesInfoExtractor): webpage = self._download_webpage(url, video_id) mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"', r'data-contentId=\'(?P<mgid>mgid:.*?)\''], - webpage, u'mgid') + webpage, 'mgid') return self._get_videos_info(mgid) From 398edd06895d1815aca8549ec900ab8e1d1e3149 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 00:21:41 +0100 Subject: [PATCH 035/339] release 2014.01.22 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0a21e896d..9401e0a0b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.21.1' +__version__ = '2014.01.22' From a70515c0fd46f83111a0ccb63a70fe3df27fde3e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 00:27:17 +0100 Subject: [PATCH 036/339] [servingsys] Do not run test on travis Apparently, even the advertisers do geoblocking now!? From the US, this isn't outright blocked, but there are no videos returned. 
--- youtube_dl/extractor/servingsys.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/servingsys.py b/youtube_dl/extractor/servingsys.py index 7ba237dde..1dc551d5c 100644 --- a/youtube_dl/extractor/servingsys.py +++ b/youtube_dl/extractor/servingsys.py @@ -36,7 +36,8 @@ class ServingSysIE(InfoExtractor): }], 'params': { 'playlistend': 2, - } + }, + 'skip': 'Blocked in the US [sic]', } def _real_extract(self, url): From 00122de6a9d215651154274ad01ac799d580ed96 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 01:04:07 +0100 Subject: [PATCH 037/339] [gametrailers/mtv] Fix pre-3.x compatibility function for find_xpath_attr Fixes #2189 --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 70f284149..9ab7288cc 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -224,7 +224,7 @@ if sys.version_info >= (2,7): def find_xpath_attr(node, xpath, key, val): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z]+$', key) - assert re.match(r'^[a-zA-Z0-9@\s]*$', val) + assert re.match(r'^[a-zA-Z0-9@\s:.]*$', val) expr = xpath + u"[@%s='%s']" % (key, val) return node.find(expr) else: From 99f770caa8e064358fcc03308e17f347791b25b1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 01:55:50 +0100 Subject: [PATCH 038/339] [hotnewhiphop] Retrieve media key --- youtube_dl/extractor/hotnewhiphop.py | 67 +++++++++++++++++++--------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 0ee74fb38..a106f81d2 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -1,17 +1,25 @@ +from __future__ import unicode_literals + import re import base64 from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + 
compat_urllib_request, + ExtractorError, + HEADRequest, +) class HotNewHipHopIE(InfoExtractor): _VALID_URL = r'http://www\.hotnewhiphop.com/.*\.(?P<id>.*)\.html' _TEST = { - u'url': u"http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html", - u'file': u'1435540.mp3', - u'md5': u'2c2cd2f76ef11a9b3b581e8b232f3d96', - u'info_dict': { - u"title": u'Freddie Gibbs "Lay It Down"' + 'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html', + 'file': '1435540.mp3', + 'md5': '2c2cd2f76ef11a9b3b581e8b232f3d96', + 'info_dict': { + 'title': 'Freddie Gibbs - Lay It Down' } } @@ -21,24 +29,41 @@ class HotNewHipHopIE(InfoExtractor): webpage_src = self._download_webpage(url, video_id) - video_url_base64 = self._search_regex(r'data-path="(.*?)"', - webpage_src, u'video URL', fatal=False) + video_url_base64 = self._search_regex( + r'data-path="(.*?)"', webpage_src, u'video URL', fatal=False) - if video_url_base64 == None: - video_url = self._search_regex(r'"contentUrl" content="(.*?)"', webpage_src, - u'video URL') + if video_url_base64 is None: + video_url = self._search_regex( + r'"contentUrl" content="(.*?)"', webpage_src, u'video URL') return self.url_result(video_url, ie='Youtube') - video_url = base64.b64decode(video_url_base64).decode('utf-8') + reqdata = compat_urllib_parse.urlencode([ + ('mediaType', 's'), + ('mediaId', video_id), + ]) + r = compat_urllib_request.Request( + 'http://www.hotnewhiphop.com/ajax/media/getActions/', data=reqdata) + r.add_header('Content-Type', 'application/x-www-form-urlencoded') + mkd = self._download_json( + r, video_id, note='Requesting media key', + errnote='Could not download media key') + if 'mediaKey' not in mkd: + raise ExtractorError('Did not get a media key') - video_title = self._html_search_regex(r"<title>(.*)", - webpage_src, u'title') + redirect_url = base64.b64decode(video_url_base64).decode('utf-8') + redirect_req = HEADRequest(redirect_url) + req = self._request_webpage( + redirect_req, 
video_id, + note='Resolving final URL', errnote='Could not resolve final URL') + video_url = req.geturl() + if video_url.endswith('.html'): + raise ExtractorError('Redirect failed') - results = [{ - 'id': video_id, - 'url' : video_url, - 'title' : video_title, - 'thumbnail' : self._og_search_thumbnail(webpage_src), - 'ext' : 'mp3', - }] - return results + video_title = self._og_search_title(webpage_src).strip() + + return { + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'thumbnail': self._og_search_thumbnail(webpage_src), + } From 2250865fb09062418e78d79618fefe8817235488 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 22 Jan 2014 02:01:23 +0100 Subject: [PATCH 039/339] [Wimp] Use new URL relay method --- youtube_dl/extractor/wimp.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 82a626e0e..9a6bb0c76 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -1,5 +1,6 @@ +from __future__ import unicode_literals + import re -import base64 from .common import InfoExtractor @@ -7,12 +8,12 @@ from .common import InfoExtractor class WimpIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?wimp\.com/([^/]+)/' _TEST = { - u'url': u'http://www.wimp.com/deerfence/', - u'file': u'deerfence.flv', - u'md5': u'8b215e2e0168c6081a1cf84b2846a2b5', - u'info_dict': { - u"title": u"Watch Till End: Herd of deer jump over a fence.", - u"description": u"These deer look as fluid as running water when they jump over this fence as a herd. 
This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.", + 'url': 'http://www.wimp.com/deerfence/', + 'file': 'deerfence.flv', + 'md5': '8b215e2e0168c6081a1cf84b2846a2b5', + 'info_dict': { + "title": "Watch Till End: Herd of deer jump over a fence.", + "description": "These deer look as fluid as running water when they jump over this fence as a herd. This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.", } } @@ -20,13 +21,12 @@ class WimpIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - googleString = self._search_regex("googleCode = '(.*?)'", webpage, 'file url') - googleString = base64.b64decode(googleString).decode('ascii') - final_url = self._search_regex('","(.*?)"', googleString, u'final video url') + video_url = self._search_regex( + r's1\.addVariable\("file",\s*"([^"]+)"\);', webpage, 'video URL') return { 'id': video_id, - 'url': final_url, + 'url': video_url, 'title': self._og_search_title(webpage), 'thumbnail': self._og_search_thumbnail(webpage), 'description': self._og_search_description(webpage), From af1588c05f308ffa2fdabaa2fc4c38673b4217d8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 22 Jan 2014 02:04:51 +0100 Subject: [PATCH 040/339] [mtv] Update tests and xpath function for new title extraction --- youtube_dl/extractor/mtv.py | 2 +- youtube_dl/extractor/spike.py | 2 +- youtube_dl/utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index af889a8af..51f91ef1d 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -180,7 +180,7 @@ class MTVIggyIE(MTVServicesInfoExtractor): 'info_dict': { 'id': '984696', 'ext': 'mp4', - 'title': 'Short', + 'title': 
'Arcade Fire: Behind the Scenes at the Biggest Music Experiment Yet', } } _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/' diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index 71a9aaa24..56682ac45 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -11,7 +11,7 @@ class SpikeIE(MTVServicesInfoExtractor): 'info_dict': { 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', 'ext': 'mp4', - 'title': 'Can Allen Ride A Hundred Year-Old Motorcycle?', + 'title': 'Auction Hunters|Can Allen Ride A Hundred Year-Old Motorcycle?', 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', }, } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9ab7288cc..6c00973bd 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -224,7 +224,7 @@ if sys.version_info >= (2,7): def find_xpath_attr(node, xpath, key, val): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z]+$', key) - assert re.match(r'^[a-zA-Z0-9@\s:.]*$', val) + assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val) expr = xpath + u"[@%s='%s']" % (key, val) return node.find(expr) else: From d3a1c7191731ba391167f3b3b04e08982349be8c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 22 Jan 2014 02:16:40 +0100 Subject: [PATCH 041/339] [ringtv] Fix and add news extraction --- youtube_dl/extractor/ringtv.py | 53 +++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/ringtv.py b/youtube_dl/extractor/ringtv.py index 1b08c3167..9fbdb9fcb 100644 --- a/youtube_dl/extractor/ringtv.py +++ b/youtube_dl/extractor/ringtv.py @@ -1,37 +1,44 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor class RingTVIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?ringtv\.craveonline\.com/videos/video/([^/]+)' + _VALID_URL = r'(?:http://)?(?:www\.)?ringtv\.craveonline\.com/(?Pnews|videos/video)/(?P[^/?#]+)' _TEST = { - u"url": 
u"http://ringtv.craveonline.com/videos/video/746619-canelo-alvarez-talks-about-mayweather-showdown", - u"file": u"746619.mp4", - u"md5": u"7c46b4057d22de32e0a539f017e64ad3", - u"info_dict": { - u"title": u"Canelo Alvarez talks about Mayweather showdown", - u"description": u"Saul \\\"Canelo\\\" Alvarez spoke to the media about his Sept. 14 showdown with Floyd Mayweather after their kick-off presser in NYC. Canelo is motivated and confident that he will have the speed and gameplan to beat the pound-for-pound king." + "url": "http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30", + "file": "857645.mp4", + "md5": "d25945f5df41cdca2d2587165ac28720", + "info_dict": { + "title": 'Video: Luis Collazo says Victor Ortiz "better not quit on Jan. 30" - Ring TV', + "description": 'Luis Collazo is excited about his Jan. 30 showdown with fellow former welterweight titleholder Victor Ortiz at Barclays Center in his hometown of Brooklyn. The SuperBowl week fight headlines a Golden Boy Live! card on Fox Sports 1.', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1).split('-')[0] + video_id = mobj.group('id').split('-')[0] webpage = self._download_webpage(url, video_id) - title = self._search_regex(r'(.+?)', - webpage, 'video title').replace(' | RingTV','') - description = self._search_regex(r'
(.+?)
', - webpage, 'Description') - final_url = "http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/conversion/%s.mp4" %(str(video_id)) - thumbnail_url = "http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/snapshots/%s.jpg" %(str(video_id)) - ext = final_url.split('.')[-1] - return [{ - 'id' : video_id, - 'url' : final_url, - 'ext' : ext, - 'title' : title, - 'thumbnail' : thumbnail_url, - 'description' : description, - }] + + if mobj.group('type') == 'news': + video_id = self._search_regex( + r'''(?x)]+src="http://cms\.springboardplatform\.com/ + embed_iframe/[0-9]+/video/([0-9]+)/''', + webpage, 'real video ID') + title = self._og_search_title(webpage) + description = self._html_search_regex( + r'addthis:description="([^"]+)"', + webpage, 'description', fatal=False) + final_url = "http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/conversion/%s.mp4" % video_id + thumbnail_url = "http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/snapshots/%s.jpg" % video_id + + return { + 'id': video_id, + 'url': final_url, + 'title': title, + 'thumbnail': thumbnail_url, + 'description': description, + } From 6fd2957163880b0233a94a884cce1e80ee7cf4c3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 22 Jan 2014 02:17:00 +0100 Subject: [PATCH 042/339] release 2014.01.22.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9401e0a0b..0103131a7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.22' +__version__ = '2014.01.22.1' From 90f479b6d567ff6d7581250eec3ffd16b0f10a4d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 22 Jan 2014 03:04:10 +0100 Subject: [PATCH 043/339] [novamov] Skip tests --- youtube_dl/extractor/novamov.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index 48ee00da3..6af8d934c 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -19,7 +19,8 @@ class NovamovIE(InfoExtractor): 'info_dict': { 'title': 'search engine optimization', 'description': 'search engine optimization is used to rank the web page in the google search engine' - } + }, + 'skip': '"Invalid token" errors abound (in web interface as well as youtube-dl, there is nothing we can do about it.)' } def _real_extract(self, url): From 48c63f1653e1c35f8dcb318fb51d6fc4281116ac Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 22 Jan 2014 03:09:21 +0100 Subject: [PATCH 044/339] [d8] disable test; video got deleted --- youtube_dl/extractor/d8.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/d8.py b/youtube_dl/extractor/d8.py index a56842b16..093164122 100644 --- a/youtube_dl/extractor/d8.py +++ b/youtube_dl/extractor/d8.py @@ -1,22 +1,25 @@ # encoding: utf-8 +from __future__ import unicode_literal + from .canalplus import CanalplusIE class D8IE(CanalplusIE): _VALID_URL = r'https?://www\.d8\.tv/.*?/(?P.*)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/d8/%s' - IE_NAME = u'd8.tv' + IE_NAME = 'd8.tv' _TEST = { - u'url': u'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html', - u'file': u'966289.flv', - u'info_dict': { - u'title': u'Campagne intime - Documentaire exceptionnel', - u'description': u'md5:d2643b799fb190846ae09c61e59a859f', - u'upload_date': u'20131108', + 'url': 'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html', + 'file': '966289.flv', + 'info_dict': { + 'title': 'Campagne intime - Documentaire exceptionnel', + 'description': 'md5:d2643b799fb190846ae09c61e59a859f', + 'upload_date': '20131108', }, - u'params': { + 'params': { # rtmp - u'skip_download': True, + 'skip_download': True, }, + 'skip': 'videos get deleted after 
a while', } From 43030f36db60d1262525e8182c2bb8f8bc68f260 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 22 Jan 2014 03:10:31 +0100 Subject: [PATCH 045/339] [d8] typo --- youtube_dl/extractor/d8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/d8.py b/youtube_dl/extractor/d8.py index 093164122..6b26ff2e3 100644 --- a/youtube_dl/extractor/d8.py +++ b/youtube_dl/extractor/d8.py @@ -1,5 +1,5 @@ # encoding: utf-8 -from __future__ import unicode_literal +from __future__ import unicode_literals from .canalplus import CanalplusIE From d7b51547c0220b28d6914f4721a7d0ff11d8d98e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 22 Jan 2014 03:41:19 +0100 Subject: [PATCH 046/339] [imdb:list] Switch to loading the webpage The RSS method seems to be defunct. --- youtube_dl/extractor/imdb.py | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index f40769eac..1763af020 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -67,23 +67,16 @@ class ImdbListIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) list_id = mobj.group('id') - - # RSS XML is sometimes malformed - rss = self._download_webpage('http://rss.imdb.com/list/%s' % list_id, list_id, 'Downloading list RSS') - list_title = self._html_search_regex(r'(.*?)', rss, 'list title') - - # Export is independent of actual author_id, but returns 404 if no author_id is provided. - # However, passing dummy author_id seems to be enough. 
- csv = self._download_webpage('http://www.imdb.com/list/export?list_id=%s&author_id=ur00000000' % list_id, - list_id, 'Downloading list CSV') - - entries = [] - for item in csv.split('\n')[1:]: - cols = item.split(',') - if len(cols) < 2: - continue - item_id = cols[1][1:-1] - if item_id.startswith('vi'): - entries.append(self.url_result('http://www.imdb.com/video/imdb/%s' % item_id, 'Imdb')) - + + webpage = self._download_webpage(url, list_id) + list_code = self._search_regex( + r'(?s)(.*?)class="see-more"', + webpage, 'list code') + entries = [ + self.url_result('http://www.imdb.com' + m, 'Imdb') + for m in re.findall(r'href="(/video/imdb/vi[^"]+)"', webpage)] + + list_title = self._html_search_regex( + r'

(.*?)

', webpage, 'list title') + return self.playlist_result(entries, list_id, list_title) From 780ee4e501b1234e51c3530a93930962bea42f34 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 22 Jan 2014 03:49:16 +0100 Subject: [PATCH 047/339] [comedycentral] Adapt testcase In contrast to other sites, ComedyCentral seems to understand how to sensibly use MTV IE, but the additional text shouldn't hurt. --- youtube_dl/extractor/comedycentral.py | 2 +- youtube_dl/extractor/mtv.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 27bd8256e..6d55a07af 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -23,7 +23,7 @@ class ComedyCentralIE(MTVServicesInfoExtractor): u'info_dict': { u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354', u'ext': u'mp4', - u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother', + u'title': u'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother', u'description': u'After a certain point, breastfeeding becomes c**kblocking.', }, } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 51f91ef1d..ceb3aa37e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -89,6 +89,7 @@ class MTVServicesInfoExtractor(InfoExtractor): title = title_el.text if title is None: raise ExtractorError('Could not find video title') + title = title.strip() return { 'title': title, From a4a028323e7a09c1d716de20c951f37b5b7c44f7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 22 Jan 2014 03:50:49 +0100 Subject: [PATCH 048/339] [comedycentral] Use unicode_literals --- youtube_dl/extractor/comedycentral.py | 62 ++++++++++++++------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 6d55a07af..f0ad5c9a3 100644 --- 
a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -15,16 +17,16 @@ class ComedyCentralIE(MTVServicesInfoExtractor): _VALID_URL = r'''(?x)https?://(?:www.)?comedycentral.com/ (video-clips|episodes|cc-studios|video-collections) /(?P.*)''' - _FEED_URL = u'http://comedycentral.com/feeds/mrss/' + _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TEST = { - u'url': u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', - u'md5': u'4167875aae411f903b751a21f357f1ee', - u'info_dict': { - u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354', - u'ext': u'mp4', - u'title': u'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother', - u'description': u'After a certain point, breastfeeding becomes c**kblocking.', + 'url': 'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', + 'md5': '4167875aae411f903b751a21f357f1ee', + 'info_dict': { + 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', + 'ext': 'mp4', + 'title': 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother', + 'description': 'After a certain point, breastfeeding becomes c**kblocking.', }, } @@ -33,12 +35,12 @@ class ComedyCentralIE(MTVServicesInfoExtractor): title = mobj.group('title') webpage = self._download_webpage(url, title) mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"', - webpage, u'mgid') + webpage, 'mgid') return self._get_videos_info(mgid) class ComedyCentralShowsIE(InfoExtractor): - IE_DESC = u'The Daily Show / Colbert Report' + IE_DESC = 'The Daily Show / Colbert Report' # urls can be abbreviations like :thedailyshow or :colbert # urls for episodes like: # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day @@ -55,14 +57,14 @@ class 
ComedyCentralShowsIE(InfoExtractor): extended-interviews/(?P<interID>[0-9]+)/playlist_tds_extended_(?P<interview_title>.*?)/.*?))) $""" _TEST = { - u'url': u'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart', - u'file': u'422212.mp4', - u'md5': u'4e2f5cb088a83cd8cdb7756132f9739d', - u'info_dict': { - u"upload_date": u"20121214", - u"description": u"Kristen Stewart", - u"uploader": u"thedailyshow", - u"title": u"thedailyshow-kristen-stewart part 1" + 'url': 'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart', + 'file': '422212.mp4', + 'md5': '4e2f5cb088a83cd8cdb7756132f9739d', + 'info_dict': { + "upload_date": "20121214", + "description": "Kristen Stewart", + "uploader": "thedailyshow", + "title": "thedailyshow-kristen-stewart part 1" } } @@ -94,20 +96,20 @@ class ComedyCentralShowsIE(InfoExtractor): def _transform_rtmp_url(rtmp_video_url): m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url) if not m: - raise ExtractorError(u'Cannot transform RTMP url') + raise ExtractorError('Cannot transform RTMP url') base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' return base + m.group('finalid') def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) if mobj.group('shortname'): if mobj.group('shortname') in ('tds', 'thedailyshow'): - url = u'http://www.thedailyshow.com/full-episodes/' + url = 'http://www.thedailyshow.com/full-episodes/' else: - url = u'http://www.colbertnation.com/full-episodes/' + url = 'http://www.colbertnation.com/full-episodes/' mobj = re.match(self._VALID_URL, url, re.VERBOSE) assert mobj is not None @@ -133,9 +135,9 @@ class ComedyCentralShowsIE(InfoExtractor): url = htmlHandle.geturl() mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: - raise 
ExtractorError(u'Invalid redirected URL: ' + url) + raise ExtractorError('Invalid redirected URL: ' + url) if mobj.group('episode') == '': - raise ExtractorError(u'Redirected URL is still not specific: ' + url) + raise ExtractorError('Redirected URL is still not specific: ' + url) epTitle = mobj.group('episode') mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage) @@ -147,15 +149,15 @@ class ComedyCentralShowsIE(InfoExtractor): altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage) if len(altMovieParams) == 0: - raise ExtractorError(u'unable to find Flash URL in webpage ' + url) + raise ExtractorError('unable to find Flash URL in webpage ' + url) else: mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])] uri = mMovieParams[0][1] indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri}) idoc = self._download_xml(indexUrl, epTitle, - u'Downloading show index', - u'unable to download episode index') + 'Downloading show index', + 'unable to download episode index') results = [] @@ -170,7 +172,7 @@ class ComedyCentralShowsIE(InfoExtractor): configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' 
+ compat_urllib_parse.urlencode({'uri': mediaId})) cdoc = self._download_xml(configUrl, epTitle, - u'Downloading configuration for %s' % shortMediaId) + 'Downloading configuration for %s' % shortMediaId) turls = [] for rendition in cdoc.findall('.//rendition'): @@ -178,7 +180,7 @@ class ComedyCentralShowsIE(InfoExtractor): turls.append(finfo) if len(turls) == 0: - self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found') + self._downloader.report_error('unable to download ' + mediaId + ': No videos found') continue formats = [] @@ -192,7 +194,7 @@ class ComedyCentralShowsIE(InfoExtractor): 'width': w, }) - effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1) + effTitle = showId + '-' + epTitle + ' part ' + compat_str(partNum+1) results.append({ 'id': shortMediaId, 'formats': formats, From c39f7013e1070a55729d143817690a282a5b3f10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 22 Jan 2014 10:51:17 +0100 Subject: [PATCH 049/339] [gametrailers] Use the generic `_real_extract` provided by the base class --- youtube_dl/extractor/gametrailers.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index c1fdd770e..66b3b50d4 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .mtv import MTVServicesInfoExtractor @@ -18,12 +16,3 @@ class GametrailersIE(MTVServicesInfoExtractor): } _FEED_URL = 'http://www.gametrailers.com/feeds/mrss' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"', - r'data-contentId=\'(?P<mgid>mgid:.*?)\''], - webpage, 'mgid') - return self._get_videos_info(mgid) 
From 407ae733ab24dbe4fdbd5b996aef6438f16834a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 22 Jan 2014 11:06:03 +0100 Subject: [PATCH 050/339] =?UTF-8?q?[cspan]=20Make=20=E2=80=98www=E2=80=99?= =?UTF-8?q?=20optional=20and=20improve=20the=20regex=20for=20extracting=20?= =?UTF-8?q?the=20id=20(fixes=20#2194)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- youtube_dl/extractor/cspan.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index a2cbd4d8d..521bbdee0 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -10,7 +10,7 @@ from ..utils import ( class CSpanIE(InfoExtractor): - _VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)' + _VALID_URL = r'http://(?:www\.)?c-spanvideo\.org/program/(?P<name>.*)' IE_DESC = 'C-SPAN' _TEST = { 'url': 'http://www.c-spanvideo.org/program/HolderonV', @@ -24,9 +24,9 @@ class CSpanIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - prog_name = mobj.group(1) + prog_name = mobj.group('name') webpage = self._download_webpage(url, prog_name) - video_id = self._search_regex(r'programid=(.*?)&', webpage, 'video id') + video_id = self._search_regex(r'prog(?:ram)?id=(.*?)&', webpage, 'video id') title = self._html_search_regex( r'<!-- title -->\n\s*<h1[^>]*>(.*?)</h1>', webpage, 'title') From 47739636a912d0704b3089326145851706d6cbc1 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Wed, 22 Jan 2014 17:25:32 +0700 Subject: [PATCH 051/339] [space] Add support for mobile URLs --- youtube_dl/extractor/space.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py index 11455e0fa..4a3e52ad8 100644 --- a/youtube_dl/extractor/space.py +++ b/youtube_dl/extractor/space.py @@ -6,7 
+6,7 @@ from ..utils import RegexNotFoundError, ExtractorError class SpaceIE(InfoExtractor): - _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html' + _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html' _TEST = { u'add_ie': ['Brightcove'], u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', From 4bbf139aa75cace056989411003e4a1b6b2616e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 22 Jan 2014 11:35:17 +0100 Subject: [PATCH 052/339] [southparkstudios] Use the generic `_real_extract` provided by the base class --- youtube_dl/extractor/mtv.py | 9 ++++++--- youtube_dl/extractor/southparkstudios.py | 8 -------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index ceb3aa37e..f6f31bfdc 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -112,9 +112,12 @@ class MTVServicesInfoExtractor(InfoExtractor): title = url_basename(url) webpage = self._download_webpage(url, title) try: - # the url is in the format http://media.mtvnservices.com/fb/{mgid}.swf - fb_url = self._og_search_video_url(webpage) - mgid = url_basename(fb_url).rpartition('.')[0] + # the url can be http://media.mtvnservices.com/fb/{mgid}.swf + # or http://media.mtvnservices.com/{mgid} + og_url = self._og_search_video_url(webpage) + mgid = url_basename(og_url) + if mgid.endswith('.swf'): + mgid = mgid[:-4] except RegexNotFoundError: mgid = self._search_regex(r'data-mgid="(.*?)"', webpage, u'mgid') return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py index fd90cc5dd..9f8d3a5fa 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -18,14 +18,6 @@ class 
SouthParkStudiosIE(MTVServicesInfoExtractor): }, }] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - url = u'http://www.' + mobj.group(u'url') - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"', - webpage, u'mgid') - return self._get_videos_info(mgid) class SouthparkDeIE(SouthParkStudiosIE): IE_NAME = u'southpark.de' From 130f12985a9b44781680083d9bcc200a79e9395d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 22 Jan 2014 11:44:26 +0100 Subject: [PATCH 053/339] [comedycentral] Use the generic `_real_extract` provided by the base class --- youtube_dl/extractor/comedycentral.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index f0ad5c9a3..3333d433b 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -30,14 +30,6 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }, } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - webpage = self._download_webpage(url, title) - mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"', - webpage, 'mgid') - return self._get_videos_info(mgid) - class ComedyCentralShowsIE(InfoExtractor): IE_DESC = 'The Daily Show / Colbert Report' From 04b4d394d92e325095fbf096f3f62bc4013fc785 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 14:16:43 +0100 Subject: [PATCH 054/339] Add new --default-search option (#2193) --- youtube_dl/YoutubeDL.py | 2 ++ youtube_dl/__init__.py | 7 ++++++- youtube_dl/extractor/generic.py | 15 +++++++++++++-- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index dc8aa788c..c6430d367 100644 --- a/youtube_dl/YoutubeDL.py +++ 
b/youtube_dl/YoutubeDL.py @@ -152,6 +152,8 @@ class YoutubeDL(object): support, using fridibi debug_printtraffic:Print out sent and received HTTP traffic include_ads: Download ads as well + default_search: Prepend this string if an input url is not valid. + 'auto' for elaborate guessing The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index a948b1d90..44047888d 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -199,7 +199,9 @@ def parseOpts(overrideArguments=None): general.add_option( '--bidi-workaround', dest='bidi_workaround', action='store_true', help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') - + general.add_option('--default-search', + dest='default_search', metavar='PREFIX', + help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.') selection.add_option( '--playlist-start', @@ -619,6 +621,8 @@ def _real_main(argv=None): date = DateRange.day(opts.date) else: date = DateRange(opts.dateafter, opts.datebefore) + if opts.default_search not in ('auto', None) and ':' not in opts.default_search: + parser.error(u'--default-search invalid; did you forget a colon (:) at the end?') # --all-sub automatically sets --write-sub if --write-auto-sub is not given # this was the old behaviour if only --all-sub was given. 
@@ -720,6 +724,7 @@ def _real_main(argv=None): 'debug_printtraffic': opts.debug_printtraffic, 'prefer_ffmpeg': opts.prefer_ffmpeg, 'include_ads': opts.include_ads, + 'default_search': opts.default_search, } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 91536075d..e1933837d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -162,8 +162,19 @@ class GenericIE(InfoExtractor): def _real_extract(self, url): parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: - self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') - return self.url_result('http://' + url) + default_search = self._downloader.params.get('default_search') + if default_search is None: + default_search = 'auto' + + if default_search == 'auto': + if '/' in url: + self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') + return self.url_result('http://' + url) + else: + return self.url_result('ytsearch:' + url) + else: + assert ':' in default_search + return self.url_result(default_search + url) video_id = os.path.splitext(url.split('/')[-1])[0] self.to_screen('%s: Requesting header' % video_id) From a70c83768e30c99479af04c7cb229545743a9134 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 14:33:16 +0100 Subject: [PATCH 055/339] release 2014.01.22.2 --- README.md | 4 ++++ youtube_dl/version.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index aafdd3e51..3a28a1854 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,10 @@ which means you can modify it, redistribute it or use it however you like. --bidi-workaround Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH + --default-search PREFIX Use this prefix for unqualified URLs. 
For example + "gvsearch2:" downloads two videos from google + videos for youtube-dl "large apple". By default + (with value "auto") youtube-dl guesses. ## Video Selection: --playlist-start NUMBER playlist video to start at (default is 1) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0103131a7..3b67546b2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.22.1' +__version__ = '2014.01.22.2' From ba7678f9cc1099313f3fa9221538116a24e8e627 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 14:47:29 +0100 Subject: [PATCH 056/339] Add -f bestaudio (Fixes #2163) --- test/test_YoutubeDL.py | 30 ++++++++++++++++++++++++++++++ youtube_dl/YoutubeDL.py | 12 ++++++++++++ youtube_dl/__init__.py | 2 +- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 01de10e31..3adb9f344 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -150,6 +150,36 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], u'35') + def test_format_selection_audio(self): + formats = [ + {u'format_id': u'audio-low', u'ext': u'webm', 'preference': 1, 'vcodec': 'none'}, + {u'format_id': u'audio-mid', u'ext': u'webm', 'preference': 2, 'vcodec': 'none'}, + {u'format_id': u'audio-high', u'ext': u'flv', 'preference': 3, 'vcodec': 'none'}, + {u'format_id': u'vid', u'ext': u'mp4', 'preference': 4}, + ] + info_dict = {u'formats': formats, u'extractor': u'test'} + + ydl = YDL({'format': u'bestaudio'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], u'audio-high') + + ydl = YDL({'format': u'worstaudio'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], u'audio-low') + + formats = [ + {u'format_id': 
u'vid-low', u'ext': u'mp4', 'preference': 1}, + {u'format_id': u'vid-high', u'ext': u'mp4', 'preference': 2}, + ] + info_dict = {u'formats': formats, u'extractor': u'test'} + + ydl = YDL({'format': u'bestaudio/worstaudio/best'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], u'vid-high') + def test_youtube_format_selection(self): order = [ '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '36', '17', '13', diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c6430d367..9f15616fa 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -637,6 +637,18 @@ class YoutubeDL(object): return available_formats[-1] elif format_spec == 'worst': return available_formats[0] + elif format_spec == 'bestaudio': + audio_formats = [ + f for f in available_formats + if f.get('vcodec') == 'none'] + if audio_formats: + return audio_formats[-1] + elif format_spec == 'worstaudio': + audio_formats = [ + f for f in available_formats + if f.get('vcodec') == 'none'] + if audio_formats: + return audio_formats[0] else: extensions = ['mp4', 'flv', 'webm', '3gp'] if format_spec in extensions: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 44047888d..4db97ad3c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -257,7 +257,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', action='store', dest='format', metavar='FORMAT', default='best', - help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported') + help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. 
You can also use the special names "best", "bestaudio", and "worst"') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') video_format.add_option('--prefer-free-formats', From 8908741806e248049c98657718caf00c0ae33bd0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 14:47:58 +0100 Subject: [PATCH 057/339] Use unicode_literals in test_YoutubeDL --- test/test_YoutubeDL.py | 126 +++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 62 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 3adb9f344..37e7b9b28 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +from __future__ import unicode_literals + # Allow direct execution import os import sys @@ -30,155 +32,155 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {u'ext': u'webm', u'height': 460}, - {u'ext': u'mp4', u'height': 460}, + {'ext': 'webm', 'height': 460}, + {'ext': 'mp4', 'height': 460}, ] - info_dict = {u'formats': formats, u'extractor': u'test'} + info_dict = {'formats': formats, 'extractor': 'test'} yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded[u'ext'], u'webm') + self.assertEqual(downloaded['ext'], 'webm') # Different resolution => download best quality (mp4) ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {u'ext': u'webm', u'height': 720}, - {u'ext': u'mp4', u'height': 1080}, + {'ext': 'webm', 'height': 720}, + {'ext': 'mp4', 'height': 1080}, ] - info_dict[u'formats'] = formats + info_dict['formats'] = formats yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - 
self.assertEqual(downloaded[u'ext'], u'mp4') + self.assertEqual(downloaded['ext'], 'mp4') # No prefer_free_formats => prefer mp4 and flv for greater compatibilty ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {u'ext': u'webm', u'height': 720}, - {u'ext': u'mp4', u'height': 720}, - {u'ext': u'flv', u'height': 720}, + {'ext': 'webm', 'height': 720}, + {'ext': 'mp4', 'height': 720}, + {'ext': 'flv', 'height': 720}, ] - info_dict[u'formats'] = formats + info_dict['formats'] = formats yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded[u'ext'], u'mp4') + self.assertEqual(downloaded['ext'], 'mp4') ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {u'ext': u'flv', u'height': 720}, - {u'ext': u'webm', u'height': 720}, + {'ext': 'flv', 'height': 720}, + {'ext': 'webm', 'height': 720}, ] - info_dict[u'formats'] = formats + info_dict['formats'] = formats yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded[u'ext'], u'flv') + self.assertEqual(downloaded['ext'], 'flv') def test_format_limit(self): formats = [ - {u'format_id': u'meh', u'url': u'http://example.com/meh', 'preference': 1}, - {u'format_id': u'good', u'url': u'http://example.com/good', 'preference': 2}, - {u'format_id': u'great', u'url': u'http://example.com/great', 'preference': 3}, - {u'format_id': u'excellent', u'url': u'http://example.com/exc', 'preference': 4}, + {'format_id': 'meh', 'url': 'http://example.com/meh', 'preference': 1}, + {'format_id': 'good', 'url': 'http://example.com/good', 'preference': 2}, + {'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3}, + {'format_id': 'excellent', 'url': 'http://example.com/exc', 'preference': 4}, ] info_dict = { - u'formats': formats, u'extractor': u'test', 'id': 'testvid'} + 
'formats': formats, 'extractor': 'test', 'id': 'testvid'} ydl = YDL() ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded[u'format_id'], u'excellent') + self.assertEqual(downloaded['format_id'], 'excellent') ydl = YDL({'format_limit': 'good'}) assert ydl.params['format_limit'] == 'good' ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded[u'format_id'], u'good') + self.assertEqual(downloaded['format_id'], 'good') ydl = YDL({'format_limit': 'great', 'format': 'all'}) ydl.process_ie_result(info_dict.copy()) - self.assertEqual(ydl.downloaded_info_dicts[0][u'format_id'], u'meh') - self.assertEqual(ydl.downloaded_info_dicts[1][u'format_id'], u'good') - self.assertEqual(ydl.downloaded_info_dicts[2][u'format_id'], u'great') + self.assertEqual(ydl.downloaded_info_dicts[0]['format_id'], 'meh') + self.assertEqual(ydl.downloaded_info_dicts[1]['format_id'], 'good') + self.assertEqual(ydl.downloaded_info_dicts[2]['format_id'], 'great') self.assertTrue('3' in ydl.msgs[0]) ydl = YDL() ydl.params['format_limit'] = 'excellent' ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded[u'format_id'], u'excellent') + self.assertEqual(downloaded['format_id'], 'excellent') def test_format_selection(self): formats = [ - {u'format_id': u'35', u'ext': u'mp4', 'preference': 1}, - {u'format_id': u'45', u'ext': u'webm', 'preference': 2}, - {u'format_id': u'47', u'ext': u'webm', 'preference': 3}, - {u'format_id': u'2', u'ext': u'flv', 'preference': 4}, + {'format_id': '35', 'ext': 'mp4', 'preference': 1}, + {'format_id': '45', 'ext': 'webm', 'preference': 2}, + {'format_id': '47', 'ext': 'webm', 'preference': 3}, + {'format_id': '2', 'ext': 'flv', 'preference': 4}, ] - info_dict = {u'formats': formats, u'extractor': u'test'} + info_dict = {'formats': formats, 'extractor': 'test'} - ydl = YDL({'format': u'20/47'}) + ydl = 
YDL({'format': '20/47'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], u'47') + self.assertEqual(downloaded['format_id'], '47') - ydl = YDL({'format': u'20/71/worst'}) + ydl = YDL({'format': '20/71/worst'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], u'35') + self.assertEqual(downloaded['format_id'], '35') ydl = YDL() ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], u'2') + self.assertEqual(downloaded['format_id'], '2') - ydl = YDL({'format': u'webm/mp4'}) + ydl = YDL({'format': 'webm/mp4'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], u'47') + self.assertEqual(downloaded['format_id'], '47') - ydl = YDL({'format': u'3gp/40/mp4'}) + ydl = YDL({'format': '3gp/40/mp4'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], u'35') + self.assertEqual(downloaded['format_id'], '35') def test_format_selection_audio(self): formats = [ - {u'format_id': u'audio-low', u'ext': u'webm', 'preference': 1, 'vcodec': 'none'}, - {u'format_id': u'audio-mid', u'ext': u'webm', 'preference': 2, 'vcodec': 'none'}, - {u'format_id': u'audio-high', u'ext': u'flv', 'preference': 3, 'vcodec': 'none'}, - {u'format_id': u'vid', u'ext': u'mp4', 'preference': 4}, + {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none'}, + {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none'}, + {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none'}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 4}, ] - info_dict = {u'formats': formats, u'extractor': u'test'} + info_dict = {'formats': formats, 'extractor': 'test'} - ydl = YDL({'format': u'bestaudio'}) + ydl = 
YDL({'format': 'bestaudio'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], u'audio-high') + self.assertEqual(downloaded['format_id'], 'audio-high') - ydl = YDL({'format': u'worstaudio'}) + ydl = YDL({'format': 'worstaudio'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], u'audio-low') + self.assertEqual(downloaded['format_id'], 'audio-low') formats = [ - {u'format_id': u'vid-low', u'ext': u'mp4', 'preference': 1}, - {u'format_id': u'vid-high', u'ext': u'mp4', 'preference': 2}, + {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1}, + {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2}, ] - info_dict = {u'formats': formats, u'extractor': u'test'} + info_dict = {'formats': formats, 'extractor': 'test'} - ydl = YDL({'format': u'bestaudio/worstaudio/best'}) + ydl = YDL({'format': 'bestaudio/worstaudio/best'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], u'vid-high') + self.assertEqual(downloaded['format_id'], 'vid-high') def test_youtube_format_selection(self): order = [ @@ -230,17 +232,17 @@ class TestFormatSelection(unittest.TestCase): def test_prepare_filename(self): info = { - u'id': u'1234', - u'ext': u'mp4', - u'width': None, + 'id': '1234', + 'ext': 'mp4', + 'width': None, } def fname(templ): ydl = YoutubeDL({'outtmpl': templ}) return ydl.prepare_filename(info) - self.assertEqual(fname(u'%(id)s.%(ext)s'), u'1234.mp4') - self.assertEqual(fname(u'%(id)s-%(width)s.%(ext)s'), u'1234-NA.mp4') + self.assertEqual(fname('%(id)s.%(ext)s'), '1234.mp4') + self.assertEqual(fname('%(id)s-%(width)s.%(ext)s'), '1234-NA.mp4') # Replace missing fields with 'NA' - self.assertEqual(fname(u'%(uploader_date)s-%(id)s.%(ext)s'), u'NA-1234.mp4') + self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4') if __name__ == 
'__main__': From de3ef3ed5865fb0579062b03c25354f2587c780f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 14:53:23 +0100 Subject: [PATCH 058/339] Default to -f bestaudio when only audio is requested --- youtube_dl/YoutubeDL.py | 2 +- youtube_dl/__init__.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 9f15616fa..1491f8908 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -713,7 +713,7 @@ class YoutubeDL(object): self.list_formats(info_dict) return - req_format = self.params.get('format', 'best') + req_format = self.params.get('format') if req_format is None: req_format = 'best' formats_to_download = [] diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 4db97ad3c..c15e6ef7e 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -256,7 +256,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', - action='store', dest='format', metavar='FORMAT', default='best', + action='store', dest='format', metavar='FORMAT', default=None, help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestaudio", and "worst"') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') @@ -624,6 +624,10 @@ def _real_main(argv=None): if opts.default_search not in ('auto', None) and ':' not in opts.default_search: parser.error(u'--default-search invalid; did you forget a colon (:) at the end?') + # Do not download videos when there are audio-only formats + if opts.extractaudio and not opts.keepvideo and opts.format is None: + opts.format = 'bestaudio/best' + # --all-sub automatically sets --write-sub if --write-auto-sub is not given # this was the old behaviour if only --all-sub was given. 
if opts.allsubtitles and (opts.writeautomaticsub == False): From cd8a562267762933a0890c84457fd19dc5a42936 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 14:53:36 +0100 Subject: [PATCH 059/339] release 2014.01.22.3 --- README.md | 4 +++- youtube_dl/version.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3a28a1854..84c43e630 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,9 @@ which means you can modify it, redistribute it or use it however you like. ## Video Format Options: -f, --format FORMAT video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" - and "-f flv" are also supported + and "-f flv" are also supported. You can also use + the special names "best", "bestaudio", and + "worst" --all-formats download all available video formats --prefer-free-formats prefer free video formats unless a specific one is requested diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 3b67546b2..7daf9b48d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.22.2' +__version__ = '2014.01.22.3' From 79bf58f9b550d6112719ea6156bc639a427aa28e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 14:55:45 +0100 Subject: [PATCH 060/339] Document -f worstaudio as well --- README.md | 4 ++-- youtube_dl/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 84c43e630..3fe126c65 100644 --- a/README.md +++ b/README.md @@ -159,8 +159,8 @@ which means you can modify it, redistribute it or use it however you like. -f, --format FORMAT video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. 
You can also use - the special names "best", "bestaudio", and - "worst" + the special names "best", "bestaudio", "worst", + and "worstaudio" --all-formats download all available video formats --prefer-free-formats prefer free video formats unless a specific one is requested diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c15e6ef7e..568c5e6d0 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -257,7 +257,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', action='store', dest='format', metavar='FORMAT', default=None, - help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestaudio", and "worst"') + help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestaudio", "worst", and "worstaudio"') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') video_format.add_option('--prefer-free-formats', From 11577ec0542163cbae5ad97869ea56bbd46bbc37 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 15:10:00 +0100 Subject: [PATCH 061/339] [cspan] Disable test It works fine from all my machines, no matter where, but from travis, we get lots of 403s. Maybe another project is scraping CSPAN from travis and they're blocking the travis machines? 
--- youtube_dl/extractor/cspan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 521bbdee0..e54009622 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -20,6 +20,7 @@ class CSpanIE(InfoExtractor): 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in [Shelby County v. Holder] in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', }, + 'skip': 'Regularly fails on travis, for unknown reasons', } def _real_extract(self, url): From 714d709a31a8fbb8a0aee94df59730673c4c035b Mon Sep 17 00:00:00 2001 From: Mike Col <MikeCol@gmx.net> Date: Wed, 22 Jan 2014 19:01:41 +0100 Subject: [PATCH 062/339] [xvideos] Fix thumbnail extraction Signed-off-by: Philipp Hagemeister <phihag@phihag.de> --- youtube_dl/extractor/xvideos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 90138d7e5..4ee538b50 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -35,8 +35,8 @@ class XVideosIE(InfoExtractor): webpage, u'title') # Extract video thumbnail - video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', - webpage, u'thumbnail', fatal=False) + video_thumbnail = self._search_regex(r'url_bigthumb=(.+?)&', + webpage, u'thumbnail', fatal=False) info = { 'id': video_id, From d7975ea28785e1d2c344ee65acff873edb43e914 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 19:02:48 +0100 Subject: [PATCH 063/339] [xvideos] Simplify --- youtube_dl/extractor/xvideos.py | 30 +++++++++++++++--------------- 1 file changed, 
15 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 4ee538b50..85e99e1b0 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -9,12 +11,12 @@ from ..utils import ( class XVideosIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)' _TEST = { - u'url': u'http://www.xvideos.com/video939581/funny_porns_by_s_-1', - u'file': u'939581.flv', - u'md5': u'1d0c835822f0a71a7bf011855db929d0', - u'info_dict': { - u"title": u"Funny Porns By >>>>S<<<<<< -1", - u"age_limit": 18, + 'url': 'http://www.xvideos.com/video939581/funny_porns_by_s_-1', + 'file': '939581.flv', + 'md5': '1d0c835822f0a71a7bf011855db929d0', + 'info_dict': { + "title": "Funny Porns By >>>>S<<<<<< -1", + "age_limit": 18, } } @@ -27,18 +29,18 @@ class XVideosIE(InfoExtractor): self.report_extraction(video_id) # Extract video URL - video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&', - webpage, u'video URL')) + video_url = compat_urllib_parse.unquote( + self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL')) # Extract title - video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID', - webpage, u'title') + video_title = self._html_search_regex( + r'<title>(.*?)\s+-\s+XVID', webpage, 'title') # Extract video thumbnail - video_thumbnail = self._search_regex(r'url_bigthumb=(.+?)&', - webpage, u'thumbnail', fatal=False) + video_thumbnail = self._search_regex( + r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) - info = { + return { 'id': video_id, 'url': video_url, 'uploader': None, @@ -49,5 +51,3 @@ class XVideosIE(InfoExtractor): 'description': None, 'age_limit': 18, } - - return [info] From c4cd138b920033ea6d7138b1af8f26197dbab042 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 21:01:52 
+0100 Subject: [PATCH 064/339] release 2014.01.22.4 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7daf9b48d..8514c07f0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.22.3' +__version__ = '2014.01.22.4' From bd2d82a5d3f119644d125e0b0a71dca738733fb4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 21:41:28 +0100 Subject: [PATCH 065/339] [newgrounds] Simplify --- youtube_dl/extractor/newgrounds.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py index 2ef80bce0..5cb83ba14 100644 --- a/youtube_dl/extractor/newgrounds.py +++ b/youtube_dl/extractor/newgrounds.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import json import re @@ -8,12 +10,12 @@ from ..utils import determine_ext class NewgroundsIE(InfoExtractor): _VALID_URL = r'(?:https?://)?(?:www\.)?newgrounds\.com/audio/listen/(?P<id>\d+)' _TEST = { - u'url': u'http://www.newgrounds.com/audio/listen/549479', - u'file': u'549479.mp3', - u'md5': u'fe6033d297591288fa1c1f780386f07a', - u'info_dict': { - u"title": u"B7 - BusMode", - u"uploader": u"Burn7", + 'url': 'http://www.newgrounds.com/audio/listen/549479', + 'file': '549479.mp3', + 'md5': 'fe6033d297591288fa1c1f780386f07a', + 'info_dict': { + "title": "B7 - BusMode", + "uploader": "Burn7", } } @@ -22,17 +24,19 @@ class NewgroundsIE(InfoExtractor): music_id = mobj.group('id') webpage = self._download_webpage(url, music_id) - title = self._html_search_regex(r',"name":"([^"]+)",', webpage, u'music title') - uploader = self._html_search_regex(r',"artist":"([^"]+)",', webpage, u'music uploader') + title = self._html_search_regex( + r',"name":"([^"]+)",', webpage, 'music title') + uploader = self._html_search_regex( + r',"artist":"([^"]+)",', 
webpage, 'music uploader') - music_url_json_string = self._html_search_regex(r'({"url":"[^"]+"),', webpage, u'music url') + '}' + music_url_json_string = self._html_search_regex( + r'({"url":"[^"]+"),', webpage, 'music url') + '}' music_url_json = json.loads(music_url_json_string) music_url = music_url_json['url'] return { - 'id': music_id, - 'title': title, - 'url': music_url, + 'id': music_id, + 'title': title, + 'url': music_url, 'uploader': uploader, - 'ext': determine_ext(music_url), } From dd26ced164f834a337956ab57a014a8afd8b9131 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 21:43:33 +0100 Subject: [PATCH 066/339] Add __len__ to PagedLists --- youtube_dl/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ed5ee222f..8fa4cb67f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1172,6 +1172,10 @@ class PagedList(object): self._pagefunc = pagefunc self._pagesize = pagesize + def __len__(self): + # This is only useful for tests + return len(self.getslice()) + def getslice(self, start=0, end=None): res = [] for pagenum in itertools.count(start // self._pagesize): From 4919603f66aac39f81ce90b3beca47db59d9384d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 21:56:37 +0100 Subject: [PATCH 067/339] [youtube] Make DASH manifest download conditional for now DASH download fails on many videos (all with encrypted signatures? not sure yet), for example 07FYdnEawAQ, with a 403. 
--- youtube_dl/__init__.py | 8 +++++++- youtube_dl/extractor/youtube.py | 12 +++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 568c5e6d0..870145c36 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -244,6 +244,10 @@ def parseOpts(overrideArguments=None): '--include-ads', dest='include_ads', action='store_true', help='Download advertisements as well (experimental)') + verbosity.add_option( + '--youtube-include-dash-manifest', action='store_true', + dest='youtube_include_dash_manifest', default=False, + help='Try to download the DASH manifest on YouTube videos (experimental)') authentication.add_option('-u', '--username', dest='username', metavar='USERNAME', help='account username') @@ -348,7 +352,8 @@ def parseOpts(overrideArguments=None): help=optparse.SUPPRESS_HELP) verbosity.add_option('--print-traffic', dest='debug_printtraffic', action='store_true', default=False, - help=optparse.SUPPRESS_HELP) + help='Display sent and read HTTP traffic') + filesystem.add_option('-t', '--title', action='store_true', dest='usetitle', help='use title in file name (default)', default=False) @@ -729,6 +734,7 @@ def _real_main(argv=None): 'prefer_ffmpeg': opts.prefer_ffmpeg, 'include_ads': opts.include_ads, 'default_search': opts.default_search, + 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 57b8fdff7..175763309 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -276,16 +276,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I", u"file": u"a9LDPn-MO4I.m4a", u"note": u"256k DASH audio (format 141) via DASH manifest", - u"params": { - u"format": "141" - }, u"info_dict": { u"upload_date": "20121002", u"uploader_id": "8KVIDEO", u"description": 
"No description available.", u"uploader": "8KVIDEO", u"title": "UHDTV TEST 8K VIDEO.mp4" - } + }, + u"params": { + u"youtube_include_dash_manifest": True, + u"format": "141", + }, }, ] @@ -1355,7 +1356,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Look for the DASH manifest dash_manifest_url_lst = video_info.get('dashmpd') - if dash_manifest_url_lst and dash_manifest_url_lst[0]: + if (dash_manifest_url_lst and dash_manifest_url_lst[0] and + self._downloader.params.get('youtube_include_dash_manifest', False)): try: dash_doc = self._download_xml( dash_manifest_url_lst[0], video_id, From 780083dbc6b1377c69c81aa5b4e794699fcb39ed Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 21:57:17 +0100 Subject: [PATCH 068/339] release 2014.01.22.5 --- README.md | 383 ++++++++++++++++++++++-------------------- youtube_dl/version.py | 2 +- 2 files changed, 206 insertions(+), 179 deletions(-) diff --git a/README.md b/README.md index 3fe126c65..5b6626374 100644 --- a/README.md +++ b/README.md @@ -14,200 +14,227 @@ your Unix box, on Windows or on Mac OS X. It is released to the public domain, which means you can modify it, redistribute it or use it however you like. # OPTIONS - -h, --help print this help text and exit - --version print program version and exit - -U, --update update this program to latest version. 
Make sure - that you have sufficient permissions (run with - sudo if needed) - -i, --ignore-errors continue on download errors, for example to to - skip unavailable videos in a playlist - --abort-on-error Abort downloading of further videos (in the - playlist or the command line) if an error occurs - --dump-user-agent display the current browser identification - --user-agent UA specify a custom user agent - --referer REF specify a custom referer, use if the video access - is restricted to one domain - --list-extractors List all supported extractors and the URLs they - would handle - --extractor-descriptions Output descriptions of all supported extractors - --proxy URL Use the specified HTTP/HTTPS proxy. Pass in an - empty string (--proxy "") for direct connection - --no-check-certificate Suppress HTTPS certificate validation. - --cache-dir DIR Location in the filesystem where youtube-dl can - store some downloaded information permanently. By - default $XDG_CACHE_HOME/youtube-dl or ~/.cache - /youtube-dl . At the moment, only YouTube player - files (for videos with obfuscated signatures) are - cached, but that may change. - --no-cache-dir Disable filesystem caching - --socket-timeout None Time to wait before giving up, in seconds - --bidi-workaround Work around terminals that lack bidirectional - text support. Requires bidiv or fribidi - executable in PATH - --default-search PREFIX Use this prefix for unqualified URLs. For example - "gvsearch2:" downloads two videos from google - videos for youtube-dl "large apple". By default - (with value "auto") youtube-dl guesses. + -h, --help print this help text and exit + --version print program version and exit + -U, --update update this program to latest version. 
Make + sure that you have sufficient permissions + (run with sudo if needed) + -i, --ignore-errors continue on download errors, for example to + skip unavailable videos in a playlist + --abort-on-error Abort downloading of further videos (in the + playlist or the command line) if an error + occurs + --dump-user-agent display the current browser identification + --user-agent UA specify a custom user agent + --referer REF specify a custom referer, use if the video + access is restricted to one domain + --list-extractors List all supported extractors and the URLs + they would handle + --extractor-descriptions Output descriptions of all supported + extractors + --proxy URL Use the specified HTTP/HTTPS proxy. Pass in + an empty string (--proxy "") for direct + connection + --no-check-certificate Suppress HTTPS certificate validation. + --cache-dir DIR Location in the filesystem where youtube-dl + can store some downloaded information + permanently. By default $XDG_CACHE_HOME + /youtube-dl or ~/.cache/youtube-dl . At the + moment, only YouTube player files (for + videos with obfuscated signatures) are + cached, but that may change. + --no-cache-dir Disable filesystem caching + --socket-timeout SECONDS Time to wait before giving up, in seconds + --bidi-workaround Work around terminals that lack + bidirectional text support. Requires bidiv + or fribidi executable in PATH + --default-search PREFIX Use this prefix for unqualified URLs. For + example "gvsearch2:" downloads two videos + from google videos for youtube-dl "large + apple". By default (with value "auto") + youtube-dl guesses.
## Video Selection: - --playlist-start NUMBER playlist video to start at (default is 1) - --playlist-end NUMBER playlist video to end at (default is last) - --match-title REGEX download only matching titles (regex or caseless - sub-string) - --reject-title REGEX skip download for matching titles (regex or - caseless sub-string) - --max-downloads NUMBER Abort after downloading NUMBER files - --min-filesize SIZE Do not download any videos smaller than SIZE - (e.g. 50k or 44.6m) - --max-filesize SIZE Do not download any videos larger than SIZE (e.g. - 50k or 44.6m) - --date DATE download only videos uploaded in this date - --datebefore DATE download only videos uploaded on or before this - date (i.e. inclusive) - --dateafter DATE download only videos uploaded on or after this - date (i.e. inclusive) - --min-views COUNT Do not download any videos with less than COUNT - views - --max-views COUNT Do not download any videos with more than COUNT - views - --no-playlist download only the currently playing video - --age-limit YEARS download only videos suitable for the given age - --download-archive FILE Download only videos not listed in the archive - file. Record the IDs of all downloaded videos in - it. - --include-ads Download advertisements as well (experimental) + --playlist-start NUMBER playlist video to start at (default is 1) + --playlist-end NUMBER playlist video to end at (default is last) + --match-title REGEX download only matching titles (regex or + caseless sub-string) + --reject-title REGEX skip download for matching titles (regex or + caseless sub-string) + --max-downloads NUMBER Abort after downloading NUMBER files + --min-filesize SIZE Do not download any videos smaller than + SIZE (e.g. 50k or 44.6m) + --max-filesize SIZE Do not download any videos larger than SIZE + (e.g. 50k or 44.6m) + --date DATE download only videos uploaded in this date + --datebefore DATE download only videos uploaded on or before + this date (i.e. 
inclusive) + --dateafter DATE download only videos uploaded on or after + this date (i.e. inclusive) + --min-views COUNT Do not download any videos with less than + COUNT views + --max-views COUNT Do not download any videos with more than + COUNT views + --no-playlist download only the currently playing video + --age-limit YEARS download only videos suitable for the given + age + --download-archive FILE Download only videos not listed in the + archive file. Record the IDs of all + downloaded videos in it. + --include-ads Download advertisements as well + (experimental) ## Download Options: - -r, --rate-limit LIMIT maximum download rate in bytes per second (e.g. - 50K or 4.2M) - -R, --retries RETRIES number of retries (default is 10) - --buffer-size SIZE size of download buffer (e.g. 1024 or 16K) - (default is 1024) - --no-resize-buffer do not automatically adjust the buffer size. By - default, the buffer size is automatically resized - from an initial value of SIZE. + -r, --rate-limit LIMIT maximum download rate in bytes per second + (e.g. 50K or 4.2M) + -R, --retries RETRIES number of retries (default is 10) + --buffer-size SIZE size of download buffer (e.g. 1024 or 16K) + (default is 1024) + --no-resize-buffer do not automatically adjust the buffer + size. By default, the buffer size is + automatically resized from an initial value + of SIZE. ## Filesystem Options: - -t, --title use title in file name (default) - --id use only video ID in file name - -l, --literal [deprecated] alias of --title - -A, --auto-number number downloaded files starting from 00000 - -o, --output TEMPLATE output filename template. 
Use %(title)s to get - the title, %(uploader)s for the uploader name, - %(uploader_id)s for the uploader nickname if - different, %(autonumber)s to get an automatically - incremented number, %(ext)s for the filename - extension, %(format)s for the format description - (like "22 - 1280x720" or "HD"), %(format_id)s for - the unique id of the format (like Youtube's - itags: "137"), %(upload_date)s for the upload - date (YYYYMMDD), %(extractor)s for the provider - (youtube, metacafe, etc), %(id)s for the video - id, %(playlist)s for the playlist the video is - in, %(playlist_index)s for the position in the - playlist and %% for a literal percent. Use - to - output to stdout. Can also be used to download to - a different directory, for example with -o '/my/d - ownloads/%(uploader)s/%(title)s-%(id)s.%(ext)s' . - --autonumber-size NUMBER Specifies the number of digits in %(autonumber)s - when it is present in output filename template or - --auto-number option is given - --restrict-filenames Restrict filenames to only ASCII characters, and - avoid "&" and spaces in filenames - -a, --batch-file FILE file containing URLs to download ('-' for stdin) - --load-info FILE json file containing the video information - (created with the "--write-json" option) - -w, --no-overwrites do not overwrite files - -c, --continue force resume of partially downloaded files. By - default, youtube-dl will resume downloads if - possible. 
- --no-continue do not resume partially downloaded files (restart - from beginning) - --cookies FILE file to read cookies from and dump cookie jar in - --no-part do not use .part files - --no-mtime do not use the Last-modified header to set the - file modification time - --write-description write video description to a .description file - --write-info-json write video metadata to a .info.json file - --write-annotations write video annotations to a .annotation file - --write-thumbnail write thumbnail image to disk + -t, --title use title in file name (default) + --id use only video ID in file name + -l, --literal [deprecated] alias of --title + -A, --auto-number number downloaded files starting from 00000 + -o, --output TEMPLATE output filename template. Use %(title)s to + get the title, %(uploader)s for the + uploader name, %(uploader_id)s for the + uploader nickname if different, + %(autonumber)s to get an automatically + incremented number, %(ext)s for the + filename extension, %(format)s for the + format description (like "22 - 1280x720" or + "HD"), %(format_id)s for the unique id of + the format (like Youtube's itags: "137"), + %(upload_date)s for the upload date + (YYYYMMDD), %(extractor)s for the provider + (youtube, metacafe, etc), %(id)s for the + video id, %(playlist)s for the playlist the + video is in, %(playlist_index)s for the + position in the playlist and %% for a + literal percent. Use - to output to stdout. + Can also be used to download to a different + directory, for example with -o '/my/downloa + ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' . 
+ --autonumber-size NUMBER Specifies the number of digits in + %(autonumber)s when it is present in output + filename template or --auto-number option + is given + --restrict-filenames Restrict filenames to only ASCII + characters, and avoid "&" and spaces in + filenames + -a, --batch-file FILE file containing URLs to download ('-' for + stdin) + --load-info FILE json file containing the video information + (created with the "--write-info-json" option) + -w, --no-overwrites do not overwrite files + -c, --continue force resume of partially downloaded files. + By default, youtube-dl will resume + downloads if possible. + --no-continue do not resume partially downloaded files + (restart from beginning) + --cookies FILE file to read cookies from and dump cookie + jar in + --no-part do not use .part files + --no-mtime do not use the Last-modified header to set + the file modification time + --write-description write video description to a .description + file + --write-info-json write video metadata to a .info.json file + --write-annotations write video annotations to a .annotation + file + --write-thumbnail write thumbnail image to disk ## Verbosity / Simulation Options: - -q, --quiet activates quiet mode - -s, --simulate do not download the video and do not write - anything to disk - --skip-download do not download the video - -g, --get-url simulate, quiet but print URL - -e, --get-title simulate, quiet but print title - --get-id simulate, quiet but print id - --get-thumbnail simulate, quiet but print thumbnail URL - --get-description simulate, quiet but print video description - --get-duration simulate, quiet but print video length - --get-filename simulate, quiet but print output filename - --get-format simulate, quiet but print output format - -j, --dump-json simulate, quiet but print JSON information - --newline output progress bar as new lines - --no-progress do not print progress bar - --console-title display progress in console titlebar - -v, --verbose print various
debugging information - --dump-intermediate-pages print downloaded pages to debug problems (very - verbose) - --write-pages Write downloaded intermediary pages to files in - the current directory to debug problems + --youtube-include-dash-manifest Try to download the DASH manifest on + YouTube videos (experimental) + -q, --quiet activates quiet mode + -s, --simulate do not download the video and do not write + anything to disk + --skip-download do not download the video + -g, --get-url simulate, quiet but print URL + -e, --get-title simulate, quiet but print title + --get-id simulate, quiet but print id + --get-thumbnail simulate, quiet but print thumbnail URL + --get-description simulate, quiet but print video description + --get-duration simulate, quiet but print video length + --get-filename simulate, quiet but print output filename + --get-format simulate, quiet but print output format + -j, --dump-json simulate, quiet but print JSON information + --newline output progress bar as new lines + --no-progress do not print progress bar + --console-title display progress in console titlebar + -v, --verbose print various debugging information + --dump-intermediate-pages print downloaded pages to debug problems + (very verbose) + --write-pages Write downloaded intermediary pages to + files in the current directory to debug + problems + --print-traffic Display sent and read HTTP traffic ## Video Format Options: - -f, --format FORMAT video format code, specify the order of - preference using slashes: "-f 22/17/18". "-f mp4" - and "-f flv" are also supported. 
You can also use - the special names "best", "bestaudio", "worst", - and "worstaudio" - --all-formats download all available video formats - --prefer-free-formats prefer free video formats unless a specific one - is requested - --max-quality FORMAT highest quality format to download - -F, --list-formats list all available formats + -f, --format FORMAT video format code, specify the order of + preference using slashes: "-f 22/17/18". + "-f mp4" and "-f flv" are also supported. + You can also use the special names "best", + "bestaudio", "worst", and "worstaudio" + --all-formats download all available video formats + --prefer-free-formats prefer free video formats unless a specific + one is requested + --max-quality FORMAT highest quality format to download + -F, --list-formats list all available formats ## Subtitle Options: - --write-sub write subtitle file - --write-auto-sub write automatic subtitle file (youtube only) - --all-subs downloads all the available subtitles of the - video - --list-subs lists all available subtitles for the video - --sub-format FORMAT subtitle format (default=srt) ([sbv/vtt] youtube - only) - --sub-lang LANGS languages of the subtitles to download (optional) - separated by commas, use IETF language tags like - 'en,pt' + --write-sub write subtitle file + --write-auto-sub write automatic subtitle file (youtube + only) + --all-subs downloads all the available subtitles of + the video + --list-subs lists all available subtitles for the video + --sub-format FORMAT subtitle format (default=srt) ([sbv/vtt] + youtube only) + --sub-lang LANGS languages of the subtitles to download + (optional) separated by commas, use IETF + language tags like 'en,pt' ## Authentication Options: - -u, --username USERNAME account username - -p, --password PASSWORD account password - -n, --netrc use .netrc authentication data - --video-password PASSWORD video password (vimeo, smotri) + -u, --username USERNAME account username + -p, --password PASSWORD account 
password + -n, --netrc use .netrc authentication data + --video-password PASSWORD video password (vimeo, smotri) ## Post-processing Options: - -x, --extract-audio convert video files to audio-only files (requires - ffmpeg or avconv and ffprobe or avprobe) - --audio-format FORMAT "best", "aac", "vorbis", "mp3", "m4a", "opus", or - "wav"; best by default - --audio-quality QUALITY ffmpeg/avconv audio quality specification, insert - a value between 0 (better) and 9 (worse) for VBR - or a specific bitrate like 128K (default 5) - --recode-video FORMAT Encode the video to another format if necessary - (currently supported: mp4|flv|ogg|webm) - -k, --keep-video keeps the video file on disk after the post- - processing; the video is erased by default - --no-post-overwrites do not overwrite post-processed files; the post- - processed files are overwritten by default - --embed-subs embed subtitles in the video (only for mp4 - videos) - --add-metadata write metadata to the video file - --xattrs write metadata to the video file's xattrs (using - dublin core and xdg standards) - --prefer-avconv Prefer avconv over ffmpeg for running the - postprocessors (default) - --prefer-ffmpeg Prefer ffmpeg over avconv for running the - postprocessors + -x, --extract-audio convert video files to audio-only files + (requires ffmpeg or avconv and ffprobe or + avprobe) + --audio-format FORMAT "best", "aac", "vorbis", "mp3", "m4a", + "opus", or "wav"; best by default + --audio-quality QUALITY ffmpeg/avconv audio quality specification, + insert a value between 0 (better) and 9 + (worse) for VBR or a specific bitrate like + 128K (default 5) + --recode-video FORMAT Encode the video to another format if + necessary (currently supported: + mp4|flv|ogg|webm) + -k, --keep-video keeps the video file on disk after the + post-processing; the video is erased by + default + --no-post-overwrites do not overwrite post-processed files; the + post-processed files are overwritten by + default + --embed-subs embed 
subtitles in the video (only for mp4 + videos) + --add-metadata write metadata to the video file + --xattrs write metadata to the video file's xattrs + (using dublin core and xdg standards) + --prefer-avconv Prefer avconv over ffmpeg for running the + postprocessors (default) + --prefer-ffmpeg Prefer ffmpeg over avconv for running the + postprocessors # CONFIGURATION diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8514c07f0..2978afb0e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.22.4' +__version__ = '2014.01.22.5' From 8b1be5cd73d4b9af7ac4bcf24cec0383ba674c24 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 22:17:53 +0100 Subject: [PATCH 069/339] Move --youtube-include-dash-manifest into correct option group --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 870145c36..96a031110 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -244,7 +244,7 @@ def parseOpts(overrideArguments=None): '--include-ads', dest='include_ads', action='store_true', help='Download advertisements as well (experimental)') - verbosity.add_option( + selection.add_option( '--youtube-include-dash-manifest', action='store_true', dest='youtube_include_dash_manifest', default=False, help='Try to download the DASH manifest on YouTube videos (experimental)') From 12ed57418c9c46b483ccd326a13724d91c2b911d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 22 Jan 2014 22:31:19 +0100 Subject: [PATCH 070/339] [gamespot] Fix regexp --- youtube_dl/extractor/gamespot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 380ebbe55..c9598ad3a 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -28,7 +28,7 @@ class 
GameSpotIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) page_id = mobj.group('page_id') webpage = self._download_webpage(url, page_id) - data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video') + data_video_json = self._search_regex(r'data-video=["\'](.*?)["\']', webpage, 'data video') data_video = json.loads(unescapeHTML(data_video_json)) # Transform the manifest url to a link to the mp4 files @@ -36,7 +36,7 @@ class GameSpotIE(InfoExtractor): f4m_url = data_video['videoStreams']['f4m_stream'] f4m_path = compat_urlparse.urlparse(f4m_url).path QUALITIES_RE = r'((,\d+)+,?)' - qualities = self._search_regex(QUALITIES_RE, f4m_path, u'qualities').strip(',').split(',') + qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',') http_path = f4m_path[1:].split('/', 1)[1] http_template = re.sub(QUALITIES_RE, r'%s', http_path) http_template = http_template.replace('.csmil/manifest.f4m', '') From 066f6a06305c715c94054ea00734e9259d5a2257 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 23 Jan 2014 00:12:47 +0100 Subject: [PATCH 071/339] [nowness] Add support --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/brightcove.py | 2 +- youtube_dl/extractor/nowness.py | 49 ++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 youtube_dl/extractor/nowness.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 4d6aeabdf..118982ff8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -136,6 +136,7 @@ from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE from .ninegag import NineGagIE from .novamov import NovamovIE +from .nowness import NownessIE from .nowvideo import NowVideoIE from .ooyala import OoyalaIE from .orf import ORFIE diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index e1c45d1f0..443294e6f 
100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -198,7 +198,7 @@ class BrightcoveIE(InfoExtractor): def _extract_video_info(self, video_info): info = { 'id': compat_str(video_info['id']), - 'title': video_info['displayName'], + 'title': video_info['displayName'].strip(), 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), 'uploader': video_info.get('publisherName'), diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py new file mode 100644 index 000000000..b1bcb7e54 --- /dev/null +++ b/youtube_dl/extractor/nowness.py @@ -0,0 +1,49 @@ +from __future__ import unicode_literals + +import re + +from .brightcove import BrightcoveIE +from .common import InfoExtractor +from ..utils import ( + ExtractorError, +) + + +class NownessIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nowness\.com/[^?#]*?/(?P<id>[0-9]+)/(?P<slug>[^/]+?)(?:$|[?#])' + + _TEST = { + 'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation', + 'file': '2520295746001.mp4', + 'md5': '0ece2f70a7bd252c7b00f3070182d418', + 'info_dict': { + 'description': 'Candor: The Art of Gesticulation', + 'uploader': 'Nowness', + 'title': 'Candor: The Art of Gesticulation', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('slug') + + webpage = self._download_webpage(url, video_id) + player_url = self._search_regex( + r'"([^"]+/content/issue-[0-9.]+.js)"', webpage, 'player URL') + real_id = self._search_regex( + r'\sdata-videoId="([0-9]+)"', webpage, 'internal video ID') + + player_code = self._download_webpage( + player_url, video_id, + note='Downloading player JavaScript', + errnote='Player download failed') + player_code = player_code.replace("'+d+'", real_id) + + bc_url = BrightcoveIE._extract_brightcove_url(player_code) + if bc_url is None: + raise ExtractorError('Could not find 
player definition') + return { + '_type': 'url', + 'url': bc_url, + 'ie_key': 'Brightcove', + } From c35b1b07e2ca9dfac0cb897fd5ad047bebf15400 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 23 Jan 2014 00:13:00 +0100 Subject: [PATCH 072/339] release 2014.01.23 --- README.md | 4 ++-- youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5b6626374..0eec2e0cb 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,8 @@ which means you can modify it, redistribute it or use it however you like. downloaded videos in it. --include-ads Download advertisements as well (experimental) + --youtube-include-dash-manifest Try to download the DASH manifest on + YouTube videos (experimental) ## Download Options: -r, --rate-limit LIMIT maximum download rate in bytes per second @@ -150,8 +152,6 @@ which means you can modify it, redistribute it or use it however you like. --write-thumbnail write thumbnail image to disk ## Verbosity / Simulation Options: - --youtube-include-dash-manifest Try to download the DASH manifest on - YouTube videos (experimental) -q, --quiet activates quiet mode -s, --simulate do not download the video and do not write anything to disk diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2978afb0e..5182f71c7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.22.5' +__version__ = '2014.01.23' From 65d781128aee4f0a0609f6140ba1b232b9794781 Mon Sep 17 00:00:00 2001 From: Mike Col <MikeCol@gmx.net> Date: Thu, 23 Jan 2014 03:51:09 +0100 Subject: [PATCH 073/339] [xhamster] Add support for hd video Signed-off-by: Philipp Hagemeister <phihag@phihag.de> --- youtube_dl/extractor/xhamster.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index ef9997ee4..fe13024d5 100644 --- a/youtube_dl/extractor/xhamster.py 
+++ b/youtube_dl/extractor/xhamster.py @@ -45,6 +45,13 @@ class XHamsterIE(InfoExtractor): else: return mobj.group('server')+'/key='+mobj.group('file') + def extract_mp4_video_url(webpage): + mp4 = re.search(r'<a href=\"(.+?)\" class=\"mp4Play\"',webpage) + if mp4 is None: + return None + else: + return mp4.group(1) + def is_hd(webpage): return webpage.find('<div class=\'icon iconHD\'') != -1 @@ -80,14 +87,25 @@ class XHamsterIE(InfoExtractor): age_limit = self._rta_search(webpage) - video_url = extract_video_url(webpage) hd = is_hd(webpage) + + video_url = extract_video_url(webpage) formats = [{ 'url': video_url, 'ext': determine_ext(video_url), 'format': 'hd' if hd else 'sd', 'format_id': 'hd' if hd else 'sd', }] + + video_mp4_url = extract_mp4_video_url(webpage) + if (not video_mp4_url is None) and (formats[0]['ext'] != 'mp4'): + formats.append( { + 'url': video_mp4_url, + 'ext': 'mp4', + 'format': 'hd' if hd else 'sd', + 'format_id': 'hd' if hd else 'sd', + }) + if not hd: webpage = self._download_webpage(mrss_url+'?hd', video_id) if is_hd(webpage): From 35409e11014da3fddd3df405e1503dba91f1f208 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 23 Jan 2014 03:52:59 +0100 Subject: [PATCH 074/339] [xhamster] Use unicode_literals --- youtube_dl/extractor/xhamster.py | 52 +++++++++++++++++--------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index fe13024d5..833d2d1b4 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -13,25 +15,25 @@ class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' 
_TESTS = [{ - u'url': u'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', - u'file': u'1509445.flv', - u'md5': u'9f48e0e8d58e3076bb236ff412ab62fa', - u'info_dict': { - u"upload_date": u"20121014", - u"uploader_id": u"Ruseful2011", - u"title": u"FemaleAgent Shy beauty takes the bait", - u"age_limit": 18, + 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', + 'file': '1509445.flv', + 'md5': '9f48e0e8d58e3076bb236ff412ab62fa', + 'info_dict': { + "upload_date": "20121014", + "uploader_id": "Ruseful2011", + "title": "FemaleAgent Shy beauty takes the bait", + "age_limit": 18, } }, { - u'url': u'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', - u'file': u'2221348.flv', - u'md5': u'e767b9475de189320f691f49c679c4c7', - u'info_dict': { - u"upload_date": u"20130914", - u"uploader_id": u"jojo747400", - u"title": u"Britney Spears Sexy Booty", - u"age_limit": 18, + 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', + 'file': '2221348.flv', + 'md5': 'e767b9475de189320f691f49c679c4c7', + 'info_dict': { + "upload_date": "20130914", + "uploader_id": "jojo747400", + "title": "Britney Spears Sexy Booty", + "age_limit": 18, } }] @@ -63,7 +65,7 @@ class XHamsterIE(InfoExtractor): webpage = self._download_webpage(mrss_url, video_id) video_title = self._html_search_regex(r'<title>(?P<title>.+?) 
- xHamster\.com', - webpage, u'title') + webpage, 'title') # Only a few videos have an description mobj = re.search('Description: (?P[^<]+)', webpage) @@ -80,10 +82,10 @@ class XHamsterIE(InfoExtractor): self._downloader.report_warning(u'Unable to extract upload date') video_uploader_id = self._html_search_regex(r']+>(?P[^<]+)', - webpage, u'uploader id', default=u'anonymous') + webpage, 'uploader id', default=u'anonymous') video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', - webpage, u'thumbnail', fatal=False) + webpage, 'thumbnail', fatal=False) age_limit = self._rta_search(webpage) @@ -99,12 +101,12 @@ class XHamsterIE(InfoExtractor): video_mp4_url = extract_mp4_video_url(webpage) if (not video_mp4_url is None) and (formats[0]['ext'] != 'mp4'): - formats.append( { - 'url': video_mp4_url, - 'ext': 'mp4', - 'format': 'hd' if hd else 'sd', - 'format_id': 'hd' if hd else 'sd', - }) + formats.append({ + 'url': video_mp4_url, + 'ext': 'mp4', + 'format': 'hd' if hd else 'sd', + 'format_id': 'hd' if hd else 'sd', + }) if not hd: webpage = self._download_webpage(mrss_url+'?hd', video_id) From 22ff1c4a9374b45359db5d9bab6abf26aaaf0f73 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 04:04:35 +0100 Subject: [PATCH 075/339] [xhamster] Futher simplification --- youtube_dl/extractor/xhamster.py | 52 +++++++++++++++----------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 833d2d1b4..d317f29f2 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -6,7 +6,6 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_parse, unescapeHTML, - determine_ext, ExtractorError, ) @@ -16,11 +15,11 @@ class XHamsterIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P[0-9]+)/(?P.+?)\.html(?:\?.*)?' 
_TESTS = [{ 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', - 'file': '1509445.flv', - 'md5': '9f48e0e8d58e3076bb236ff412ab62fa', + 'file': '1509445.mp4', + 'md5': '8281348b8d3c53d39fffb377d24eac4e', 'info_dict': { - "upload_date": "20121014", - "uploader_id": "Ruseful2011", + "upload_date": "20121014", + "uploader_id": "Ruseful2011", "title": "FemaleAgent Shy beauty takes the bait", "age_limit": 18, } @@ -41,7 +40,7 @@ class XHamsterIE(InfoExtractor): def extract_video_url(webpage): mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': \'(?P[^\']+)\',', webpage) if mobj is None: - raise ExtractorError(u'Unable to extract media URL') + raise ExtractorError('Unable to extract media URL') if len(mobj.group('server')) == 0: return compat_urllib_parse.unquote(mobj.group('file')) else: @@ -55,7 +54,7 @@ class XHamsterIE(InfoExtractor): return mp4.group(1) def is_hd(webpage): - return webpage.find('
(?P.+?) - xHamster\.com', - webpage, 'title') + video_title = self._html_search_regex( + r'(?P<title>.+?) - xHamster\.com', webpage, 'title') # Only a few videos have an description - mobj = re.search('Description: (?P[^<]+)', webpage) - if mobj: - video_description = unescapeHTML(mobj.group('description')) - else: - video_description = None + mobj = re.search(r'Description: ([^<]+)', webpage) + video_description = mobj.group(1) if mobj else None mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) if mobj: video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') else: video_upload_date = None - self._downloader.report_warning(u'Unable to extract upload date') + self._downloader.report_warning('Unable to extract upload date') - video_uploader_id = self._html_search_regex(r']+>(?P[^<]+)', - webpage, 'uploader id', default=u'anonymous') + video_uploader_id = self._html_search_regex( + r']+>(?P[^<]+)', + webpage, 'uploader id', default='anonymous') - video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', + video_thumbnail = self._search_regex( + r'\'image\':\'(?P[^\']+)\'', webpage, 'thumbnail', fatal=False) age_limit = self._rta_search(webpage) hd = is_hd(webpage) - video_url = extract_video_url(webpage) formats = [{ 'url': video_url, - 'ext': determine_ext(video_url), - 'format': 'hd' if hd else 'sd', 'format_id': 'hd' if hd else 'sd', + 'preference': 0, }] video_mp4_url = extract_mp4_video_url(webpage) - if (not video_mp4_url is None) and (formats[0]['ext'] != 'mp4'): + if video_mp4_url is not None: formats.append({ 'url': video_mp4_url, 'ext': 'mp4', - 'format': 'hd' if hd else 'sd', - 'format_id': 'hd' if hd else 'sd', + 'format_id': 'mp4-hd' if hd else 'mp4-sd', + 'preference': 1, }) if not hd: - webpage = self._download_webpage(mrss_url+'?hd', video_id) + webpage = self._download_webpage( + mrss_url + '?hd', video_id, note='Downloading HD 
webpage') if is_hd(webpage): video_url = extract_video_url(webpage) formats.append({ 'url': video_url, - 'ext': determine_ext(video_url), - 'format': 'hd', 'format_id': 'hd', + 'preference': 2, }) + self._sort_formats(formats) + return { 'id': video_id, 'title': video_title, From 924f47f7b6d08f5a93558d3239b6bf94c71dc245 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 04:05:58 +0100 Subject: [PATCH 076/339] [rottentomatoes] Use unicode_literals --- youtube_dl/extractor/rottentomatoes.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py index c79c39413..c1500b82f 100644 --- a/youtube_dl/extractor/rottentomatoes.py +++ b/youtube_dl/extractor/rottentomatoes.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + from .videodetective import VideoDetectiveIE @@ -7,10 +9,10 @@ class RottenTomatoesIE(VideoDetectiveIE): _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P\d+)' _TEST = { - u'url': u'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', - u'file': '613340.mp4', - u'info_dict': { - u'title': u'TOY STORY 3', - u'description': u'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', + 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', + 'file': '613340.mp4', + 'info_dict': { + 'title': 'TOY STORY 3', + 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', }, } From 18a25c5d78528dd27e2a92aa0d44088a8c04bcf0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 10:24:44 +0100 Subject: [PATCH 077/339] Clarify update output (Fixes #2205) No, we are not intentionally hiding the version number. Why would we? 
--- youtube_dl/update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/update.py b/youtube_dl/update.py index cd9670166..273083761 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -90,7 +90,7 @@ def update_self(to_screen, verbose): to_screen(u'youtube-dl is up to date (%s)' % __version__) return - to_screen(u'Updating to version ' + version_id + '...') + to_screen(u'Updating to version ' + version_id + ' ...') version = versions_info['versions'][version_id] print_notes(to_screen, versions_info['versions']) From d3e5bbf437082bc43ff71a93eaaedb65c0b8917f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 10:36:47 +0100 Subject: [PATCH 078/339] Correct --max-downloads with --ignore-errors --- youtube_dl/YoutubeDL.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a48e8ba23..f30bc090a 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -517,6 +517,8 @@ class YoutubeDL(object): except ExtractorError as de: # An error we somewhat expected self.report_error(compat_str(de), de.format_traceback()) break + except MaxDownloadsReached: + raise except Exception as e: if self.params.get('ignoreerrors', False): self.report_error(compat_str(e), tb=compat_str(traceback.format_exc())) From 9dab1b7f28f29ffa6b2ccfe09791725589ad1c4c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 10:37:34 +0100 Subject: [PATCH 079/339] release 2014.01.23.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5182f71c7..4c714f9f7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.23' +__version__ = '2014.01.23.1' From 76f270a46a47c7df2d75ca8da437c12c729fc380 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 23 Jan 2014 14:00:29 +0100 Subject: [PATCH 
080/339] [sina] use unicode_literals --- youtube_dl/extractor/sina.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index 74a87fe56..7548696a7 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -1,4 +1,5 @@ # coding: utf-8 +from __future__ import unicode_literals import re @@ -20,11 +21,11 @@ class SinaIE(InfoExtractor): ''' _TEST = { - u'url': u'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898', - u'file': u'110028898.flv', - u'md5': u'd65dd22ddcf44e38ce2bf58a10c3e71f', - u'info_dict': { - u'title': u'《中国新闻》 朝鲜要求巴拿马立即释放被扣船员', + 'url': 'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898', + 'file': '110028898.flv', + 'md5': 'd65dd22ddcf44e38ce2bf58a10c3e71f', + 'info_dict': { + 'title': '《中国新闻》 朝鲜要求巴拿马立即释放被扣船员', } } @@ -35,10 +36,10 @@ class SinaIE(InfoExtractor): def _extract_video(self, video_id): data = compat_urllib_parse.urlencode({'vid': video_id}) url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data, - video_id, u'Downloading video url') + video_id, 'Downloading video url') image_page = self._download_webpage( 'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data, - video_id, u'Downloading thumbnail info') + video_id, 'Downloading thumbnail info') return {'id': video_id, 'url': url_doc.find('./durl/url').text, @@ -52,7 +53,7 @@ class SinaIE(InfoExtractor): video_id = mobj.group('id') if mobj.group('token') is not None: # The video id is in the redirected url - self.to_screen(u'Getting video id') + self.to_screen('Getting video id') request = compat_urllib_request.Request(url) request.get_method = lambda: 'HEAD' (_, urlh) = self._download_webpage_handle(request, 'NA', False) @@ -60,6 +61,6 @@ class SinaIE(InfoExtractor): elif video_id is None: pseudo_id = mobj.group('pseudo_id') webpage = self._download_webpage(url, pseudo_id) - 
video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, u'video id') + video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, 'video id') return self._extract_video(video_id) From 8b769664c47fb0aa4ceeb2341c61cf72d757c524 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 23 Jan 2014 14:03:14 +0100 Subject: [PATCH 081/339] [sina] Recognize http://video.sina.com.cn/v/b/{id}-*.html urls (fixes #2212) --- youtube_dl/extractor/sina.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index 7548696a7..2909ef18b 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -13,21 +13,31 @@ from ..utils import ( class SinaIE(InfoExtractor): _VALID_URL = r'''https?://(.*?\.)?video\.sina\.com\.cn/ ( - (.+?/(((?P\d+).html)|(.*?(\#|(vid=))(?P\d+?)($|&)))) + (.+?/(((?P\d+).html)|(.*?(\#|(vid=)|b/)(?P\d+?)($|&|\-)))) | # This is used by external sites like Weibo (api/sinawebApi/outplay.php/(?P.+?)\.swf) ) ''' - _TEST = { - 'url': 'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898', - 'file': '110028898.flv', - 'md5': 'd65dd22ddcf44e38ce2bf58a10c3e71f', - 'info_dict': { - 'title': '《中国新闻》 朝鲜要求巴拿马立即释放被扣船员', - } - } + _TESTS = [ + { + 'url': 'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898', + 'file': '110028898.flv', + 'md5': 'd65dd22ddcf44e38ce2bf58a10c3e71f', + 'info_dict': { + 'title': '《中国新闻》 朝鲜要求巴拿马立即释放被扣船员', + } + }, + { + 'url': 'http://video.sina.com.cn/v/b/101314253-1290078633.html', + 'info_dict': { + 'id': '101314253', + 'ext': 'flv', + 'title': '军方提高对朝情报监视级别', + }, + }, + ] @classmethod def suitable(cls, url): From 975d35dbab002fbe65172a4a764b6d10787f9eb6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 16:14:54 +0100 Subject: [PATCH 082/339] [youtube:truncated_url] Also match mail subscription links (#2214) --- 
youtube_dl/extractor/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 175763309..6deb56447 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1805,7 +1805,10 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list - _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$' + _VALID_URL = r'''(?x) + (?:https?://)?[^/]+/watch\?feature=[a-z_]+$| + (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$ + ''' def _real_extract(self, url): raise ExtractorError( From 8c61d9a9b11cd768a7f93da00daa5ce6aec7d92c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 18:50:04 +0100 Subject: [PATCH 083/339] Mention default for -f (Fixes #2215) --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 96a031110..294fccb44 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -261,7 +261,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', action='store', dest='format', metavar='FORMAT', default=None, - help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestaudio", "worst", and "worstaudio"') + help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestaudio", "worst", and "worstaudio". 
By default, youtube-dl will pick the best quality.') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') video_format.add_option('--prefer-free-formats', From fd28827864f94aee1cb4103179b6c4965f0b6641 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 18:56:36 +0100 Subject: [PATCH 084/339] Do not count unmatched videos for --max-downloads (Fixes #2211) --- youtube_dl/YoutubeDL.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f30bc090a..f202ba4f0 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -396,10 +396,6 @@ class YoutubeDL(object): except UnicodeEncodeError: self.to_screen('[download] The file has already been downloaded') - def increment_downloads(self): - """Increment the ordinal that assigns a number to each file.""" - self._num_downloads += 1 - def prepare_filename(self, info_dict): """Generate the output filename.""" try: @@ -773,8 +769,11 @@ class YoutubeDL(object): """Process a single resolved IE result.""" assert info_dict.get('_type', 'video') == 'video' - #We increment the download the download count here to match the previous behaviour. 
- self.increment_downloads() + + max_downloads = self.params.get('max_downloads') + if max_downloads is not None: + if self._num_downloads >= int(max_downloads): + raise MaxDownloadsReached() info_dict['fulltitle'] = info_dict['title'] if len(info_dict['title']) > 200: @@ -791,10 +790,7 @@ class YoutubeDL(object): self.to_screen('[download] ' + reason) return - max_downloads = self.params.get('max_downloads') - if max_downloads is not None: - if self._num_downloads > int(max_downloads): - raise MaxDownloadsReached() + self._num_downloads += 1 filename = self.prepare_filename(info_dict) From 629be17af42fdb448a36a007900697d4998c3b4b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 19:05:05 +0100 Subject: [PATCH 085/339] release 2014.01.23.2 --- README.md | 4 +++- youtube_dl/version.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0eec2e0cb..54d59ea3e 100644 --- a/README.md +++ b/README.md @@ -181,7 +181,9 @@ which means you can modify it, redistribute it or use it however you like. preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", - "bestaudio", "worst", and "worstaudio" + "bestaudio", "worst", and "worstaudio". By + default, youtube-dl will pick the best + quality. 
--all-formats download all available video formats --prefer-free-formats prefer free video formats unless a specific one is requested diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4c714f9f7..bcec90fe4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.23.1' +__version__ = '2014.01.23.2' From 0b65e5d40f9d6d9a25fd463a4ab0db95022c534e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 23:21:42 +0100 Subject: [PATCH 086/339] [youtube] Do not break upon unknown formats --- youtube_dl/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6deb56447..8816f4f80 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1290,7 +1290,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'url': video_real_url, 'player_url': player_url, } - dct.update(self._formats[itag]) + if itag in self._formats: + dct.update(self._formats[itag]) formats.append(dct) return formats From 63ef36e8d8fb7846b344918a39a9d2df05dfac56 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 23:28:29 +0100 Subject: [PATCH 087/339] Add build instructions (Fixes #2218) --- README.md | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 54d59ea3e..d795ef6f2 100644 --- a/README.md +++ b/README.md @@ -325,11 +325,27 @@ Since June 2012 (#342) youtube-dl is packed as an executable zipfile, simply unz To run the exe you need to install first the [Microsoft Visual C++ 2008 Redistributable Package](http://www.microsoft.com/en-us/download/details.aspx?id=29). -# COPYRIGHT +# BUILD INSTRUCTIONS -youtube-dl is released into the public domain by the copyright holders. 
+Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution. -This README file was originally written by Daniel Bolton () and is likewise released into the public domain. +To run youtube-dl as a developer, you don't need to build anything either. Simply execute + + python -m youtube_dl + +To run the test, simply invoke your favorite test runner, or execute a test file directly; any of the following work: + + python -m unittest discover + python test/test_download.py + nosetests + +If you want to create a build of youtube-dl yourself, you'll need + +* python +* make +* pandoc +* zip +* nosetests # BUGS @@ -388,3 +404,9 @@ Only post features that you (or an incapicated friend you can personally talk to ### Is your question about youtube-dl? It may sound strange, but some bug reports we receive are completely unrelated to youtube-dl and relate to a different or even the reporter's own application. Please make sure that you are actually using youtube-dl. If you are using a UI for youtube-dl, report the bug to the maintainer of the actual application providing the UI. On the other hand, if your UI for youtube-dl fails in some way you believe is related to youtube-dl, by all means, go ahead and report the bug. + +# COPYRIGHT + +youtube-dl is released into the public domain by the copyright holders. + +This README file was originally written by Daniel Bolton () and is likewise released into the public domain. 
From 67ccb7719715d8edaee291f7ab4f5d5caad3d48f Mon Sep 17 00:00:00 2001 From: Pierre Rudloff Date: Thu, 23 Jan 2014 23:42:34 +0100 Subject: [PATCH 088/339] Removed websurg extractor --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/websurg.py | 59 -------------------------------- 2 files changed, 60 deletions(-) delete mode 100644 youtube_dl/extractor/websurg.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 118982ff8..90c6a8fdb 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -218,7 +218,6 @@ from .vine import VineIE from .viki import VikiIE from .vk import VKIE from .wat import WatIE -from .websurg import WeBSurgIE from .weibo import WeiboIE from .wimp import WimpIE from .wistia import WistiaIE diff --git a/youtube_dl/extractor/websurg.py b/youtube_dl/extractor/websurg.py deleted file mode 100644 index 43953bfdd..000000000 --- a/youtube_dl/extractor/websurg.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding: utf-8 - -import re - -from ..utils import ( - compat_urllib_request, - compat_urllib_parse -) - -from .common import InfoExtractor - -class WeBSurgIE(InfoExtractor): - IE_NAME = u'websurg.com' - _VALID_URL = r'http://.*?\.websurg\.com/MEDIA/\?noheader=1&doi=(.*)' - - _TEST = { - u'url': u'http://www.websurg.com/MEDIA/?noheader=1&doi=vd01en4012', - u'file': u'vd01en4012.mp4', - u'params': { - u'skip_download': True, - }, - u'skip': u'Requires login information', - } - - _LOGIN_URL = 'http://www.websurg.com/inc/login/login_div.ajax.php?login=1' - - def _real_initialize(self): - - login_form = { - 'username': self._downloader.params['username'], - 'password': self._downloader.params['password'], - 'Submit': 1 - } - - request = compat_urllib_request.Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) - request.add_header( - 'Content-Type', 'application/x-www-form-urlencoded;charset=utf-8') - compat_urllib_request.urlopen(request).info() - webpage = 
self._download_webpage(self._LOGIN_URL, '', 'Logging in') - - if webpage != 'OK': - self._downloader.report_error( - u'Unable to log in: bad username/password') - - def _real_extract(self, url): - video_id = re.match(self._VALID_URL, url).group(1) - - webpage = self._download_webpage(url, video_id) - - url_info = re.search(r'streamer="(.*?)" src="(.*?)"', webpage) - - return {'id': video_id, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'ext' : 'mp4', - 'url' : url_info.group(1) + '/' + url_info.group(2), - 'thumbnail': self._og_search_thumbnail(webpage) - } From 1394ce65b4c25426cbb4dd1d97adc4cc7d0ba80b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 23:54:06 +0100 Subject: [PATCH 089/339] [youtube] Add new formats (Fixes #2221) --- youtube_dl/YoutubeDL.py | 13 +++++++++++-- youtube_dl/extractor/common.py | 1 + youtube_dl/extractor/youtube.py | 5 +++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f202ba4f0..1e94d8ac6 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1094,9 +1094,15 @@ class YoutubeDL(object): res += fdict['format_note'] + ' ' if fdict.get('tbr') is not None: res += '%4dk ' % fdict['tbr'] + if fdict.get('container') is not None: + if res: + res += ', ' + res += '%s container' % fdict['container'] if (fdict.get('vcodec') is not None and fdict.get('vcodec') != 'none'): - res += '%-5s' % fdict['vcodec'] + if res: + res += ', ' + res += fdict['vcodec'] if fdict.get('vbr') is not None: res += '@' elif fdict.get('vbr') is not None and fdict.get('abr') is not None: @@ -1106,7 +1112,10 @@ class YoutubeDL(object): if fdict.get('acodec') is not None: if res: res += ', ' - res += '%-5s' % fdict['acodec'] + if fdict['acodec'] == 'none': + res += 'video only' + else: + res += '%-5s' % fdict['acodec'] elif fdict.get('abr') is not None: if res: res += ', ' diff --git 
a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 02a82dc57..aa48bd4e6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -66,6 +66,7 @@ class InfoExtractor(object): * asr Audio sampling rate in Hertz * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use + * container Name of the container format * filesize The number of bytes, if known in advance * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8816f4f80..f70dca77c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -207,6 +207,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50}, # Dash webm + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '168': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40}, '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40}, '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 
'format_note': 'DASH webm', 'preference': -40}, From f265fc123881bb0fe8c3d9d13169f64aa64ca85c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 23 Jan 2014 23:55:53 +0100 Subject: [PATCH 090/339] release 2014.01.23.3 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index bcec90fe4..f845d99ab 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.23.2' +__version__ = '2014.01.23.3' From 2c5bae429ae62b925b37039f687def6676b4b459 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 24 Jan 2014 00:06:26 +0100 Subject: [PATCH 091/339] [youtube] Fix new formats --- youtube_dl/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f70dca77c..1bc2dc22b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -209,7 +209,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '168': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 
'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40}, From 886fa723247abd449b80c3800b5fb52e435d8941 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 24 Jan 2014 00:06:55 +0100 Subject: [PATCH 092/339] release 2014.01.23.4 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f845d99ab..0701961a5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.23.3' +__version__ = '2014.01.23.4' From 008af8660b7d0075882ac7d22c03dca67b5e64f1 Mon Sep 17 00:00:00 2001 From: Pornophage Date: Sat, 25 Jan 2014 01:46:52 +0100 Subject: [PATCH 093/339] Add cliphunter extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/cliphunter.py | 69 ++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 youtube_dl/extractor/cliphunter.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 118982ff8..80fc1f6ae 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -27,6 +27,7 @@ from .cbs import CBSIE from .channel9 import Channel9IE from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE +from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE from .cmt import CMTIE from .cnn import CNNIE diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py new file mode 100644 index 000000000..9c90e2f98 --- /dev/null +++ b/youtube_dl/extractor/cliphunter.py @@ -0,0 +1,69 @@ +import re +import string + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, +) + 
+translation_table = ( + '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12' + '\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#:%.\'=)*+,-./0123' + '456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]&_`hbcevofhdjknamoutsstupwrli{' + '|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f' + '\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1' + '\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3' + '\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5' + '\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7' + '\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9' + '\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb' + '\xfc\xfd\xfe\xff' +) + + +class CliphunterIE(InfoExtractor): + """Information Extractor for Cliphunter""" + IE_NAME = u'cliphunter' + + _VALID_URL = (r'(?:http://)?(?:www\.)?cliphunter\.com/w/' + '(?P<id>[0-9]+)/' + '(?P.+?)(?:\?.*)?') + _TESTS = [{ + u'url': u'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', + u'file': u'1012420.flv', + u'md5': u'49f72e2fd2977e6e518be9836dcf861e', + u'info_dict': { + u"title": u"Fun Jynx Maze solo", + } + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + pl_fiji = re.search(r'pl_fiji = \'([^\']+)\'', webpage).group(1) + pl_c_qual = re.search(r'pl_c_qual = "(.)"', webpage).group(1) + video_title = re.search(r'mediaTitle = "([^"]+)"', webpage).group(1) + + video_url = string.translate(pl_fiji.encode(), translation_table) + + formats = [{ + 'url': video_url, + 'ext': determine_ext(video_url), + 'format': pl_c_qual, + 'format_id': pl_c_qual, + }] + + return { + 'id': video_id, + 'title': video_title, + 'formats': formats, + 'description': '', + } From
bacb5e4f440a7ac3997282c3603032e575bb518b Mon Sep 17 00:00:00 2001 From: Pornophage Date: Sat, 25 Jan 2014 02:34:08 +0100 Subject: [PATCH 094/339] Minor fixes Remove empty description Set correct md5 test --- youtube_dl/extractor/cliphunter.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 9c90e2f98..2d8c09630 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -32,12 +32,11 @@ class CliphunterIE(InfoExtractor): _TESTS = [{ u'url': u'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', u'file': u'1012420.flv', - u'md5': u'49f72e2fd2977e6e518be9836dcf861e', + u'md5': u'15e7740f30428abf70f4223478dc1225', u'info_dict': { - u"title": u"Fun Jynx Maze solo", + u'title': u'Fun Jynx Maze solo', } - }, - ] + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -65,5 +64,4 @@ class CliphunterIE(InfoExtractor): 'id': video_id, 'title': video_title, 'formats': formats, - 'description': '', } From 352d08e3e51011975a6c45bc60842a18f28e96ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 25 Jan 2014 11:31:30 +0100 Subject: [PATCH 095/339] Add an extractor for freespeech.org (closes #2234) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/freespeech.py | 37 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/freespeech.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 118982ff8..05c782d58 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -72,6 +72,7 @@ from .francetv import ( CultureboxIE, ) from .freesound import FreesoundIE +from .freespeech import FreespeechIE from .funnyordie import FunnyOrDieIE from .gamekings import GamekingsIE from .gamespot import GameSpotIE diff --git a/youtube_dl/extractor/freespeech.py 
b/youtube_dl/extractor/freespeech.py new file mode 100644 index 000000000..bb253ebba --- /dev/null +++ b/youtube_dl/extractor/freespeech.py @@ -0,0 +1,37 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor + + +class FreespeechIE(InfoExtractor): + IE_NAME = 'freespeech.org' + _VALID_URL = r'https://www.freespeech.org/video/(?P<title>.+)' + _TEST = { + 'add_ie': ['Youtube'], + 'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0', + 'info_dict': { + 'id': 'poKsVCZ64uU', + 'ext': 'mp4', + 'title': 'Obama, Romney Campaign in Colorado Ahead of Debate', + 'description': 'Obama, Romney Campaign in Colorado Ahead of Debate', + 'uploader': 'freespeechtv', + 'uploader_id': 'freespeechtv', + 'upload_date': '20121002', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = mobj.group('title') + webpage = self._download_webpage(url, title) + info_json = self._search_regex(r'jQuery.extend\(Drupal.settings, ({.*?})\);', webpage, 'info') + info = json.loads(info_json) + + return { + '_type': 'url', + 'url': info['jw_player']['basic_video_node_player']['file'], + 'ie_key': 'Youtube', + } From 59188de1136208cb6c06113a6af0708d3544b87f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 25 Jan 2014 11:48:08 +0100 Subject: [PATCH 096/339] =?UTF-8?q?Properly=20escape=20=E2=80=98.=E2=80=99?= =?UTF-8?q?=20in=20some=20=5FVALID=5FURL=20properties?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- youtube_dl/extractor/condenast.py | 2 +- youtube_dl/extractor/freespeech.py | 2 +- youtube_dl/extractor/hotnewhiphop.py | 2 +- youtube_dl/extractor/vimeo.py | 10 +++++----- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 03b75b80d..91c1c1348 100644 ---
a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -30,7 +30,7 @@ class CondeNastIE(InfoExtractor): 'vanityfair': 'Vanity Fair', } - _VALID_URL = r'http://(video|www).(?P<site>%s).com/(?P<type>watch|series|video)/(?P<id>.+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'http://(video|www)\.(?P<site>%s)\.com/(?P<type>watch|series|video)/(?P<id>.+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) _TEST = { diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py index bb253ebba..c210177f7 100644 --- a/youtube_dl/extractor/freespeech.py +++ b/youtube_dl/extractor/freespeech.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class FreespeechIE(InfoExtractor): IE_NAME = 'freespeech.org' - _VALID_URL = r'https://www.freespeech.org/video/(?P<title>.+)' + _VALID_URL = r'https://www\.freespeech\.org/video/(?P<title>.+)' _TEST = { 'add_ie': ['Youtube'], 'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0', diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index a106f81d2..80b48b1b3 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -13,7 +13,7 @@ from ..utils import ( class HotNewHipHopIE(InfoExtractor): - _VALID_URL = r'http://www\.hotnewhiphop.com/.*\.(?P<id>.*)\.html' + _VALID_URL = r'http://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html' _TEST = { 'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html', 'file': '1435540.mp3', diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 193675549..a50170ce7 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -291,7 +291,7 @@ class VimeoIE(InfoExtractor): class VimeoChannelIE(InfoExtractor): IE_NAME = 'vimeo:channel' - _VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)' + _VALID_URL = 
r'(?:https?://)?vimeo\.com/channels/(?P<id>[^/]+)' _MORE_PAGES_INDICATOR = r'<a.+?rel="next"' _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"' @@ -327,7 +327,7 @@ class VimeoChannelIE(InfoExtractor): class VimeoUserIE(VimeoChannelIE): IE_NAME = 'vimeo:user' - _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)(?:/videos|[#?]|$)' + _VALID_URL = r'(?:https?://)?vimeo\.com/(?P<name>[^/]+)(?:/videos|[#?]|$)' _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>' @classmethod @@ -344,7 +344,7 @@ class VimeoUserIE(VimeoChannelIE): class VimeoAlbumIE(VimeoChannelIE): IE_NAME = 'vimeo:album' - _VALID_URL = r'(?:https?://)?vimeo.\com/album/(?P<id>\d+)' + _VALID_URL = r'(?:https?://)?vimeo\.com/album/(?P<id>\d+)' _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>' def _page_url(self, base_url, pagenum): @@ -358,7 +358,7 @@ class VimeoAlbumIE(VimeoChannelIE): class VimeoGroupsIE(VimeoAlbumIE): IE_NAME = 'vimeo:group' - _VALID_URL = r'(?:https?://)?vimeo.\com/groups/(?P<name>[^/]+)' + _VALID_URL = r'(?:https?://)?vimeo\.com/groups/(?P<name>[^/]+)' def _extract_list_title(self, webpage): return self._og_search_title(webpage) @@ -372,7 +372,7 @@ class VimeoGroupsIE(VimeoAlbumIE): class VimeoReviewIE(InfoExtractor): IE_NAME = 'vimeo:review' IE_DESC = 'Review pages on vimeo' - _VALID_URL = r'(?:https?://)?vimeo.\com/[^/]+/review/(?P<id>[^/]+)' + _VALID_URL = r'(?:https?://)?vimeo\.com/[^/]+/review/(?P<id>[^/]+)' _TEST = { 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'file': '75524534.mp4', From f945612bd03642fa01ea7220ebe3e8eae99a9052 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 25 Jan 2014 14:18:54 +0100 Subject: [PATCH 097/339] [rtlnow] Simplify --- youtube_dl/extractor/rtlnow.py | 150 +++++++++++++++++---------------- 1 file changed, 77 insertions(+), 73 deletions(-) diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index a43d6ced5..cd50f708d 100644 --- 
a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -1,4 +1,7 @@ # encoding: utf-8 + +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -12,78 +15,77 @@ class RTLnowIE(InfoExtractor): """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" _VALID_URL = r'(?:http://)?(?P<url>(?P<domain>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' _TESTS = [{ - u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', - u'file': u'90419.flv', - u'info_dict': { - u'upload_date': u'20070416', - u'title': u'Ahornallee - Folge 1 - Der Einzug', - u'description': u'Folge 1 - Der Einzug', + 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', + 'file': '90419.flv', + 'info_dict': { + 'upload_date': '20070416', + 'title': 'Ahornallee - Folge 1 - Der Einzug', + 'description': 'Folge 1 - Der Einzug', }, - u'params': { - u'skip_download': True, + 'params': { + 'skip_download': True, }, - u'skip': u'Only works from Germany', + 'skip': 'Only works from Germany', }, { - u'url': u'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', - u'file': u'69756.flv', - u'info_dict': { - u'upload_date': u'20120519', - u'title': u'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit...', - u'description': u'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.', - u'thumbnail': u'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg', + 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', + 'file': '69756.flv', + 'info_dict': { + 'upload_date': '20120519', + 'title': 'Ärger im Revier - Ein junger 
Ladendieb, ein handfester Streit...', + 'description': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.', + 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg', }, - u'params': { - u'skip_download': True, + 'params': { + 'skip_download': True, }, - u'skip': u'Only works from Germany', + 'skip': 'Only works from Germany', }, { - u'url': u'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', - u'file': u'13883.flv', - u'info_dict': { - u'upload_date': u'20090627', - u'title': u'Voxtours - Südafrika-Reporter II', - u'description': u'Südafrika-Reporter II', + 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', + 'file': '13883.flv', + 'info_dict': { + 'upload_date': '20090627', + 'title': 'Voxtours - Südafrika-Reporter II', + 'description': 'Südafrika-Reporter II', }, - u'params': { - u'skip_download': True, + 'params': { + 'skip_download': True, }, }, { - u'url': u'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1', - u'file': u'99205.flv', - u'info_dict': { - u'upload_date': u'20080928', - u'title': u'Medicopter 117 - Angst!', - u'description': u'Angst!', - u'thumbnail': u'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg' + 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1', + 'file': '99205.flv', + 'info_dict': { + 'upload_date': '20080928', + 'title': 'Medicopter 117 - Angst!', + 'description': 'Angst!', + 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg' }, - u'params': { - u'skip_download': True, + 'params': { + 'skip_download': True, }, }, { - u'url': u'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10', - u'file': u'124903.flv', - u'info_dict': { - u'upload_date': u'20130101', - u'title': u'Top Gear vom 01.01.2013', - u'description': u'Episode 1', + 'url': 
'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10', + 'file': '124903.flv', + 'info_dict': { + 'upload_date': '20130101', + 'title': 'Top Gear vom 01.01.2013', + 'description': 'Episode 1', }, - u'params': { - u'skip_download': True, + 'params': { + 'skip_download': True, }, - u'skip': u'Only works from Germany', + 'skip': 'Only works from Germany', }] - - def _real_extract(self,url): + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - webpage_url = u'http://' + mobj.group('url') - video_page_url = u'http://' + mobj.group('domain') + u'/' - video_id = mobj.group(u'video_id') + webpage_url = 'http://' + mobj.group('url') + video_page_url = 'http://' + mobj.group('domain') + '/' + video_id = mobj.group('video_id') webpage = self._download_webpage(webpage_url, video_id) @@ -94,51 +96,53 @@ class RTLnowIE(InfoExtractor): msg = clean_html(note_m.group(1)) raise ExtractorError(msg) - video_title = self._html_search_regex(r'<title>(?P<title>[^<]+?)( \| [^<]*)?', - webpage, u'title') - playerdata_url = self._html_search_regex(r'\'playerdata\': \'(?P[^\']+)\'', - webpage, u'playerdata_url') + video_title = self._html_search_regex( + r'(?P<title>[^<]+?)( \| [^<]*)?', + webpage, 'title') + playerdata_url = self._html_search_regex( + r'\'playerdata\': \'(?P[^\']+)\'', + webpage, 'playerdata_url') playerdata = self._download_webpage(playerdata_url, video_id) mobj = re.search(r'<!\[CDATA\[(?P<description>.+?)(?:\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr)?\]\]>', playerdata) if mobj: - video_description = mobj.group(u'description') + video_description = mobj.group('description') if mobj.group('upload_date_Y'): video_upload_date = mobj.group('upload_date_Y') elif mobj.group('upload_date_y'): - video_upload_date = u'20' + mobj.group('upload_date_y') + video_upload_date = '20' + 
mobj.group('upload_date_y') else: video_upload_date = None if video_upload_date: - video_upload_date += mobj.group('upload_date_m')+mobj.group('upload_date_d') + video_upload_date += mobj.group('upload_date_m') + mobj.group('upload_date_d') else: video_description = None video_upload_date = None - self._downloader.report_warning(u'Unable to extract description and upload date') + self._downloader.report_warning('Unable to extract description and upload date') # Thumbnail: not every video has an thumbnail mobj = re.search(r'', webpage) if mobj: - video_thumbnail = mobj.group(u'thumbnail') + video_thumbnail = mobj.group('thumbnail') else: video_thumbnail = None mobj = re.search(r']+>rtmpe://(?:[^/]+/){2})(?P[^\]]+)\]\]>', playerdata) if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - video_url = mobj.group(u'url') - video_play_path = u'mp4:' + mobj.group(u'play_path') - video_player_url = video_page_url + u'includes/vodplayer.swf' + raise ExtractorError('Unable to extract media URL') + video_url = mobj.group('url') + video_play_path = 'mp4:' + mobj.group('play_path') + video_player_url = video_page_url + 'includes/vodplayer.swf' - return [{ - 'id': video_id, - 'url': video_url, - 'play_path': video_play_path, - 'page_url': video_page_url, - 'player_url': video_player_url, - 'ext': 'flv', - 'title': video_title, + return { + 'id': video_id, + 'url': video_url, + 'play_path': video_play_path, + 'page_url': video_page_url, + 'player_url': video_player_url, + 'ext': 'flv', + 'title': video_title, 'description': video_description, 'upload_date': video_upload_date, - 'thumbnail': video_thumbnail, - }] + 'thumbnail': video_thumbnail, + } From 944d65c762cc8426bb10093d11dbb94ea5dc21cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 25 Jan 2014 15:31:58 +0100 Subject: [PATCH 098/339] =?UTF-8?q?[extractor/common]=20Encode=20the=20url?= 
=?UTF-8?q?=20when=20calculating=20the=20md5=20with=20`=E2=80=94write-page?= =?UTF-8?q?s`=20option?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This doesn’t cause any problem in python 2.*, but on python 3 the `md5` function only accepts bytes. --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index aa48bd4e6..3cf742a3b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -240,7 +240,7 @@ class InfoExtractor(object): except AttributeError: url = url_or_request if len(url) > 200: - h = u'___' + hashlib.md5(url).hexdigest() + h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest() url = url[:200 - len(h)] + h raw_filename = ('%s_%s.dump' % (video_id, url)) filename = sanitize_filename(raw_filename, restricted=True) From f89197d73e14d33ea580b5fdaed0e84e4b6851a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 25 Jan 2014 12:02:43 +0100 Subject: [PATCH 099/339] Some pep8 style fixes --- youtube_dl/YoutubeDL.py | 6 +++--- youtube_dl/downloader/__init__.py | 2 +- youtube_dl/downloader/common.py | 1 - youtube_dl/downloader/http.py | 6 +++--- youtube_dl/downloader/mplayer.py | 4 ++-- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 1e94d8ac6..42cbcf699 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -331,7 +331,7 @@ class YoutubeDL(object): def __exit__(self, *args): self.restore_console_title() - + if self.params.get('cookiefile') is not None: self.cookiejar.save() @@ -710,10 +710,10 @@ class YoutubeDL(object): # TODO Central sorting goes here - if formats[0] is not info_dict: + if formats[0] is not info_dict: # only set the 'formats' fields if the original info_dict list them # otherwise we end up with a circular reference, the first (and 
unique) - # element in the 'formats' field in info_dict is info_dict itself, + # element in the 'formats' field in info_dict is info_dict itself, # wich can't be exported to json info_dict['formats'] = formats if self.params.get('listformats', None): diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index f19b490f1..0d9eb0001 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, ) + def get_suitable_downloader(info_dict): """Get the downloader class that can handle the info dict.""" url = info_dict['url'] @@ -20,4 +21,3 @@ def get_suitable_downloader(info_dict): return MplayerFD else: return HttpFD - diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 10143d56a..5a068aa8b 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -314,4 +314,3 @@ class FileDownloader(object): if the download is successful. """ self._progress_hooks.append(ph) - diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 8407727ba..748f9f3ad 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -27,7 +27,7 @@ class HttpFD(FileDownloader): request = compat_urllib_request.Request(url, None, headers) if self.params.get('test', False): - request.add_header('Range','bytes=0-10240') + request.add_header('Range', 'bytes=0-10240') # Establish possible resume length if os.path.isfile(encodeFilename(tmpfilename)): @@ -39,7 +39,7 @@ class HttpFD(FileDownloader): if resume_len != 0: if self.params.get('continuedl', False): self.report_resuming_byte(resume_len) - request.add_header('Range','bytes=%d-' % resume_len) + request.add_header('Range', 'bytes=%d-' % resume_len) open_mode = 'ab' else: resume_len = 0 @@ -100,7 +100,7 @@ class HttpFD(FileDownloader): if data_len is not None: data_len = int(data_len) + resume_len min_data_len = self.params.get("min_filesize", None) - 
max_data_len = self.params.get("max_filesize", None) + max_data_len = self.params.get("max_filesize", None) if min_data_len is not None and data_len < min_data_len: self.to_screen(u'\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) return False diff --git a/youtube_dl/downloader/mplayer.py b/youtube_dl/downloader/mplayer.py index 67e0e4189..4de7f15f4 100644 --- a/youtube_dl/downloader/mplayer.py +++ b/youtube_dl/downloader/mplayer.py @@ -18,10 +18,10 @@ class MplayerFD(FileDownloader): try: subprocess.call(['mplayer', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) except (OSError, IOError): - self.report_error(u'MMS or RTSP download detected but "%s" could not be run' % args[0] ) + self.report_error(u'MMS or RTSP download detected but "%s" could not be run' % args[0]) return False - # Download using mplayer. + # Download using mplayer. retval = subprocess.call(args) if retval == 0: fsize = os.path.getsize(encodeFilename(tmpfilename)) From beddbc2ad10864868d8537a65928f88a58d729c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 25 Jan 2014 15:47:03 +0100 Subject: [PATCH 100/339] [youtube:toplist] Make the regex for finding the playlist link more flexible `title={foo}` may not be at the end of the `href` string. 
--- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1bc2dc22b..1c74e3fc9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1529,7 +1529,7 @@ class YoutubeTopListIE(YoutubePlaylistIE): channel = mobj.group('chann') title = mobj.group('title') query = compat_urllib_parse.urlencode({'title': title}) - playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query) + playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query) channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title) link = self._html_search_regex(playlist_re, channel_page, u'list') url = compat_urlparse.urljoin('https://www.youtube.com/', link) From 48f9678a32377491bf0967cb13cf21f2d7704126 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 25 Jan 2014 16:55:59 +0100 Subject: [PATCH 101/339] [test/youtube_lists] Change the list used for testing the Top Lists extractor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ‘Top tracks’ list is not always present in the channel page --- test/test_youtube_lists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index d9fe5af4e..de157f657 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -120,7 +120,7 @@ class TestYoutubeLists(unittest.TestCase): def test_youtube_toplist(self): dl = FakeYDL() ie = YoutubeTopListIE(dl) - result = ie.extract('yttoplist:music:Top Tracks') + result = ie.extract('yttoplist:music:Trending') entries = result['entries'] self.assertTrue(len(entries) >= 5) From 38c2e5b8d5ac616dbfd5fff3b023583fe5c3a30d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 25 Jan 2014 17:11:55 +0100 Subject: [PATCH 102/339] [youtube] Use https: in more urls 
--- youtube_dl/extractor/youtube.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1c74e3fc9..c3fbbc0de 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -40,7 +40,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' - _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' + _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -1014,7 +1014,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _get_available_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( - 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, + 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, video_id, note=False) except ExtractorError as err: self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) @@ -1030,7 +1030,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), 'name': unescapeHTML(l[0]).encode('utf-8'), }) - url = u'http://www.youtube.com/api/timedtext?' + params + url = u'https://www.youtube.com/api/timedtext?' 
+ params sub_lang_list[lang] = url if not sub_lang_list: self._downloader.report_warning(u'video doesn\'t have subtitles') @@ -1554,7 +1554,7 @@ class YoutubeChannelIE(InfoExtractor): IE_DESC = u'YouTube.com channels' _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" _MORE_PAGES_INDICATOR = 'yt-uix-load-more' - _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' + _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' IE_NAME = u'youtube:channel' def extract_videos_from_page(self, page): @@ -1610,9 +1610,9 @@ class YoutubeChannelIE(InfoExtractor): class YoutubeUserIE(InfoExtractor): IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)' _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' - _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' + _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s' _GDATA_PAGE_SIZE = 50 - _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' + _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' IE_NAME = u'youtube:user' @classmethod @@ -1743,7 +1743,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): action = 'action_load_system_feed' if self._PERSONAL_FEED: action = 'action_load_personal_feed' - return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME) + return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME) @property def IE_NAME(self): From 5700e7792aed45d6504ae957610d8254d5bb073f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 25 Jan 2014 
17:22:41 +0100 Subject: [PATCH 103/339] [youtube] Encode the data when submitting the form for confirming the age Needed on python 3 --- youtube_dl/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c3fbbc0de..87a5a452e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -111,7 +111,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'next_url': '/', 'action_confirm': 'Confirm', } - req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) + req = compat_urllib_request.Request(self._AGE_URL, + compat_urllib_parse.urlencode(age_form).encode('ascii')) self._download_webpage( req, None, From 53bfd6b24c48ae052b73e9ab19a9c9906d57fa44 Mon Sep 17 00:00:00 2001 From: sahutd Date: Sun, 26 Jan 2014 14:05:34 +0530 Subject: [PATCH 104/339] Added support for Discovery Issue #2227 --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/discovery.py | 40 +++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 youtube_dl/extractor/discovery.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 05c782d58..ba3d4ac0e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -47,6 +47,7 @@ from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE from .defense import DefenseGouvFrIE +from .discovery import DiscoveryIE from .dropbox import DropboxIE from .ebaumsworld import EbaumsWorldIE from .ehow import EHowIE diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py new file mode 100644 index 000000000..14fca3cae --- /dev/null +++ b/youtube_dl/extractor/discovery.py @@ -0,0 +1,40 @@ +from __future__ import unicode_literals + +import re +import json +from .common import InfoExtractor + + +class DiscoveryIE(InfoExtractor): + _VALID_URL = 
r'http://dsc\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P[a-zA-Z0-9\-]*)(.htm)?' + _TEST = { + 'url': 'http://dsc.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', + 'file': 'mission-impossible-outtakes.mp4', + 'md5': 'e12614f9ee303a6ccef415cb0793eba2', + 'info_dict': { + 'title': 'MythBusters: Mission Impossible Outtakes' + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + title = self._search_regex( + r'(?<=\"name\": ")(?P.*?)(?=\"\,)', webpage, r'Filename') + duration = int(self._search_regex( + r'(?<=\"duration\"\: )(?P<duration>.*?)(?=,)', webpage, r'Duration')) + formats_raw = self._search_regex( + r'(?<=\"mp4\":)(.*?)(}])', webpage, r'formats') + '}]' + formats_json = json.loads(formats_raw) + formats = [] + for f in formats_json: + formats.append( + {'url': f['src'], r'ext': r'mp4', 'tbr': int(f['bitrate'][:-1])}) + + return { + 'id': video_id, + 'duration': duration, + 'title': title, + 'formats': formats + } From db1f388878db8ce2ae6473a5447a5aa6c9ea86f1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 27 Jan 2014 05:47:30 +0100 Subject: [PATCH 105/339] [huffpost] Add support --- youtube_dl/downloader/__init__.py | 5 ++- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/common.py | 2 +- youtube_dl/extractor/generic.py | 8 +++- youtube_dl/extractor/huffpost.py | 70 +++++++++++++++++++++++++++++++ 5 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 youtube_dl/extractor/huffpost.py diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 0d9eb0001..aaa92bc75 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + from .common import FileDownloader from .hls import HlsFD from .http import HttpFD @@ -12,10 +14,11 @@ from ..utils import 
( def get_suitable_downloader(info_dict): """Get the downloader class that can handle the info dict.""" url = info_dict['url'] + protocol = info_dict.get('protocol') if url.startswith('rtmp'): return RtmpFD - if determine_ext(url) == u'm3u8': + if (protocol == 'm3u8') or (protocol is None and determine_ext(url) == 'm3u8'): return HlsFD if url.startswith('mms') or url.startswith('rtsp'): return MplayerFD diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8daf995b9..5de90d6d9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -83,6 +83,7 @@ from .googlesearch import GoogleSearchIE from .hark import HarkIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE +from .huffpost import HuffPostIE from .hypem import HypemIE from .ign import IGNIE, OneUPIE from .imdb import ( diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3cf742a3b..db1ca9edb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -71,7 +71,7 @@ class InfoExtractor(object): * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. - "http", "https", "rtsp", "rtmp" or so. + "http", "https", "rtsp", "rtmp", "m3u8" or so. * preference Order number of this format. If this field is present and not None, the formats get sorted by this field. 
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e1933837d..829e5894f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -332,10 +332,16 @@ class GenericIE(InfoExtractor): # Look for embedded Facebook player mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https://www.facebook.com/video/embed.+?)\1', webpage) + r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'Facebook') + # Look for embedded Huffington Post player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live.huffingtonpost\.com/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'HuffPost') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py new file mode 100644 index 000000000..b47114ab4 --- /dev/null +++ b/youtube_dl/extractor/huffpost.py @@ -0,0 +1,70 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + unified_strdate, +) + + +class HuffPostIE(InfoExtractor): + IE_DESC = 'Huffington Post' + _VALID_URL = r'''(?x) + https?://(embed\.)?live\.huffingtonpost\.com/ + (?: + r/segment/[^/]+/| + HPLEmbedPlayer/\?segmentId= + ) + (?P<id>[0-9a-f]+)''' + + _TEST = { + 'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677', + 'file': '52dd3e4b02a7602131000677.mp4', + 'md5': 'TODO', + 'info_dict': { + 'title': 'TODO', + 'description': 'TODO', + 'duration': 1549, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id + data = self._download_json(api_url, 
video_id)['data'] + + video_title = data['title'] + duration = parse_duration(data['running_time']) + upload_date = unified_strdate(data['schedule']['started_at']) + + thumbnails = [] + for url in data['images'].values(): + m = re.match('.*-([0-9]+x[0-9]+)\.', url) + if not m: + continue + thumbnails.append({ + 'url': url, + 'resolution': m.group(1), + }) + + formats = [{ + 'format': key, + 'format_id': key.replace('/', '.'), + 'ext': 'mp4', + 'url': url, + 'vcodec': 'none' if key.startswith('audio/') else None, + } for key, url in data['sources']['live'].items()] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_title, + 'formats': formats, + 'duration': duration, + 'upload_date': upload_date, + 'thumbnails': thumbnails, + } From 6c57e8a063b7e34208ac9e225786e8152943303a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 27 Jan 2014 06:22:15 +0100 Subject: [PATCH 106/339] [setup.py] Only print a warning if documentation files are missing (Fixes #780) --- setup.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 1f45159cd..03e7b358e 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,9 @@ from __future__ import print_function +import os.path import pkg_resources +import warnings import sys try: @@ -44,12 +46,24 @@ py2exe_params = { if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe': params = py2exe_params else: + files_spec = [ + ('etc/bash_completion.d', ['youtube-dl.bash-completion']), + ('share/doc/youtube_dl', ['README.txt']), + ('share/man/man1', ['youtube-dl.1']) + ] + root = os.path.dirname(os.path.abspath(__file__)) + data_files = [] + for dirname, files in files_spec: + resfiles = [] + for fn in files: + if not os.path.exists(fn): + warnings.warn('Skipping file %s since it is not present. Type make to build all automatically generated files.' 
% fn) + else: + resfiles.append(fn) + data_files.append((dirname, resfiles)) + params = { - 'data_files': [ # Installing system-wide would require sudo... - ('etc/bash_completion.d', ['youtube-dl.bash-completion']), - ('share/doc/youtube_dl', ['README.txt']), - ('share/man/man1', ['youtube-dl.1']) - ] + 'data_files': data_files, } if setuptools_available: params['entry_points'] = {'console_scripts': ['youtube-dl = youtube_dl:main']} From d16076ff3efcfb4817a0ff1a48f7d84e6f46f8f3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 27 Jan 2014 06:55:35 +0100 Subject: [PATCH 107/339] [huffpost] Fix extractor --- youtube_dl/extractor/huffpost.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py index b47114ab4..0d1ea6802 100644 --- a/youtube_dl/extractor/huffpost.py +++ b/youtube_dl/extractor/huffpost.py @@ -22,11 +22,12 @@ class HuffPostIE(InfoExtractor): _TEST = { 'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677', 'file': '52dd3e4b02a7602131000677.mp4', - 'md5': 'TODO', + 'md5': '55f5e8981c1c80a64706a44b74833de8', 'info_dict': { - 'title': 'TODO', - 'description': 'TODO', + 'title': 'Legalese It! with @MikeSacksHP', + 'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. 
', 'duration': 1549, + 'upload_date': '20140124', } } @@ -39,7 +40,8 @@ class HuffPostIE(InfoExtractor): video_title = data['title'] duration = parse_duration(data['running_time']) - upload_date = unified_strdate(data['schedule']['started_at']) + upload_date = unified_strdate(data['schedule']['starts_at']) + description = data.get('description') thumbnails = [] for url in data['images'].values(): @@ -58,11 +60,21 @@ class HuffPostIE(InfoExtractor): 'url': url, 'vcodec': 'none' if key.startswith('audio/') else None, } for key, url in data['sources']['live'].items()] + if data.get('fivemin_id'): + fid = data['fivemin_id'] + fcat = str(int(fid) // 100 + 1) + furl = 'http://avideos.5min.com/2/' + fcat[-3:] + '/' + fcat + '/' + fid + '.mp4' + formats.append({ + 'format': 'fivemin', + 'url': furl, + 'preference': 1, + }) self._sort_formats(formats) return { 'id': video_id, 'title': video_title, + 'description': description, 'formats': formats, 'duration': duration, 'upload_date': upload_date, From a17d16d59cc39e786d572000df34bcf3f1e3804a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 27 Jan 2014 07:05:28 +0100 Subject: [PATCH 108/339] [la7] Add support --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/la7.py | 56 ++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 youtube_dl/extractor/la7.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5de90d6d9..be3cada98 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -107,6 +107,7 @@ from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE from .keek import KeekIE +from .la7 import LA7IE from .liveleak import LiveLeakIE from .livestream import LivestreamIE, LivestreamOriginalIE from .lynda import ( diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py new file mode 100644 index 000000000..a91b94ee9 
--- /dev/null +++ b/youtube_dl/extractor/la7.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, +) + + +class LA7IE(InfoExtractor): + IE_NAME = 'la7.tv' + _VALID_URL = r'https?://(?:www\.)?la7\.tv/richplayer/\?assetid=(?P<id>[0-9]+)' + + _TEST = { + 'url': 'http://www.la7.tv/richplayer/?assetid=50355319', + 'file': '50355319.mp4', + 'md5': 'ec7d1f0224d20ba293ab56cf2259651f', + 'info_dict': { + 'title': 'IL DIVO', + 'description': 'Un film di Paolo Sorrentino con Toni Servillo, Anna Bonaiuto, Giulio Bosetti e Flavio Bucci', + 'duration': 6254, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + xml_url = 'http://www.la7.tv/repliche/content/index.php?contentId=%s' % video_id + doc = self._download_xml(xml_url, video_id) + + video_title = doc.find('title').text + description = doc.find('description').text + duration = parse_duration(doc.find('duration').text) + thumbnail = doc.find('img').text + view_count = int(doc.find('views').text) + + prefix = doc.find('.//fqdn').text.strip().replace('auto:', 'http:') + + formats = [{ + 'format': vnode.find('quality').text, + 'tbr': int(vnode.find('quality').text), + 'url': vnode.find('fms').text.strip().replace('mp4:', prefix), + } for vnode in doc.findall('.//videos/video')] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'view_count': view_count, + } From 25c67d257cb033ec90752d79583e94592e9896cd Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 27 Jan 2014 07:05:39 +0100 Subject: [PATCH 109/339] release 2014.01.27 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0701961a5..9e6c7a66f 100644 --- 
a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.23.4' +__version__ = '2014.01.27' From 1da1558f462136da3ff1cc3f042439c6d5a73920 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 27 Jan 2014 07:08:01 +0100 Subject: [PATCH 110/339] [la7] Support more URLs --- youtube_dl/extractor/la7.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py index a91b94ee9..6d61f9a90 100644 --- a/youtube_dl/extractor/la7.py +++ b/youtube_dl/extractor/la7.py @@ -10,7 +10,13 @@ from ..utils import ( class LA7IE(InfoExtractor): IE_NAME = 'la7.tv' - _VALID_URL = r'https?://(?:www\.)?la7\.tv/richplayer/\?assetid=(?P<id>[0-9]+)' + _VALID_URL = r'''(?x) + https?://(?:www\.)?la7\.tv/ + (?: + richplayer/\?assetid=| + \?contentId= + ) + (?P<id>[0-9]+)''' _TEST = { 'url': 'http://www.la7.tv/richplayer/?assetid=50355319', From ca3e054750cacf479b05bd0af1ac0fa4eff2d124 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 27 Jan 2014 07:09:55 +0100 Subject: [PATCH 111/339] release 2014.01.27.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9e6c7a66f..dd3c37007 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.27' +__version__ = '2014.01.27.1' From 7e8caf30c0552c64cd38eb3471cd6d49e5c1b20e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 27 Jan 2014 07:31:54 +0100 Subject: [PATCH 112/339] Throw an error if no video formats are found --- youtube_dl/extractor/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index db1ca9edb..f7478d459 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -466,6 +466,9 @@ class InfoExtractor(object): return 
RATING_TABLE.get(rating.lower(), None) def _sort_formats(self, formats): + if not formats: + raise ExtractorError(u'No video formats found') + def _formats_key(f): # TODO remove the following workaround from ..utils import determine_ext From c060b774467f499bc946ae024bc9fc4ecfbc6d67 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 27 Jan 2014 07:36:18 +0100 Subject: [PATCH 113/339] [tumblr] Use unicode_literals --- youtube_dl/extractor/tumblr.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index ad5840ca2..abbbb9661 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -9,11 +11,11 @@ from ..utils import ( class TumblrIE(InfoExtractor): _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)' _TEST = { - u'url': u'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', - u'file': u'54196191430.mp4', - u'md5': u'479bb068e5b16462f5176a6828829767', - u'info_dict': { - u"title": u"tatiana maslany news" + 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', + 'file': '54196191430.mp4', + 'md5': '479bb068e5b16462f5176a6828829767', + 'info_dict': { + "title": "tatiana maslany news" } } @@ -28,18 +30,18 @@ class TumblrIE(InfoExtractor): re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id) video = re.search(re_video, webpage) if video is None: - raise ExtractorError(u'Unable to extract video') + raise ExtractorError('Unable to extract video') video_url = video.group('video_url') ext = video.group('ext') video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22', - webpage, 
u'thumbnail', fatal=False) # We pick the first poster + webpage, 'thumbnail', fatal=False) # We pick the first poster if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '') # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos video_title = self._html_search_regex(r'<title>(?P<title>.*?)(?: \| Tumblr)?', - webpage, u'title', flags=re.DOTALL) + webpage, 'title', flags=re.DOTALL) return [{'id': video_id, 'url': video_url, From 67379078264fa1538ba6f3387873981f9cee3ab5 Mon Sep 17 00:00:00 2001 From: Mike Col Date: Mon, 27 Jan 2014 07:38:55 +0100 Subject: [PATCH 114/339] [tumblr] Fix thumbnail extraction Signed-off-by: Philipp Hagemeister --- youtube_dl/extractor/tumblr.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index abbbb9661..f7bc77c48 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -34,9 +34,11 @@ class TumblrIE(InfoExtractor): video_url = video.group('video_url') ext = video.group('ext') - video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P.*?)\\x22', + video_thumbnail = self._search_regex( + r'posters.*?\[\\x22(.*?)\\x22', webpage, 'thumbnail', fatal=False) # We pick the first poster - if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '') + if video_thumbnail: + video_thumbnail = video_thumbnail.replace('\\\\/', '/') # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos From 4d9be98dbce0d060adeb1748d2a40848efc40e6d Mon Sep 17 00:00:00 2001 From: MikeCol Date: Mon, 27 Jan 2014 07:42:30 +0100 Subject: [PATCH 115/339] Malemotion extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/malemotion.py | 58 ++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 youtube_dl/extractor/malemotion.py diff --git 
a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index be3cada98..cc992a64d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -115,6 +115,7 @@ from .lynda import ( LyndaCourseIE ) from .macgamestore import MacGameStoreIE +from .malemotion import MalemotionIE from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py new file mode 100644 index 000000000..62e99091d --- /dev/null +++ b/youtube_dl/extractor/malemotion.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, +) + +class MalemotionIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?malemotion\.com/video/(.+?)\.(?P.+?)(#|$)' + _TEST = { + 'url': 'http://malemotion.com/video/bien-dur.10ew', + 'file': '10ew.mp4', + 'md5': 'b3cc49f953b107e4a363cdff07d100ce', + 'info_dict': { + "title": "Bien dur", + "age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group("id") + + webpage = self._download_webpage(url, video_id) + + self.report_extraction(video_id) + + # Extract video URL + video_url = compat_urllib_parse.unquote( + self._search_regex(r'(.*?) 
Date: Mon, 27 Jan 2014 07:43:41 +0100 Subject: [PATCH 116/339] Credit @MikeCol for malemotion IE --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 294fccb44..08cf2f934 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -40,6 +40,7 @@ __authors__ = ( 'Michael Orlitzky', 'Chris Gahan', 'Saimadhav Heblikar', + 'Mike Col', ) __license__ = 'Public Domain' From efc867775e09f493cca95fc5cd2986db7b55a71b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 27 Jan 2014 07:55:30 +0100 Subject: [PATCH 117/339] [cliphunter] Simplify --- youtube_dl/extractor/cliphunter.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 2d8c09630..42d406820 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -1,9 +1,10 @@ +from __future__ import unicode_literals + import re import string from .common import InfoExtractor from ..utils import ( - determine_ext, ExtractorError, ) @@ -23,39 +24,34 @@ translation_table = ( class CliphunterIE(InfoExtractor): - """Information Extractor for Cliphunter""" - IE_NAME = u'cliphunter' + IE_NAME = 'cliphunter' _VALID_URL = (r'(?:http://)?(?:www\.)?cliphunter\.com/w/' '(?P[0-9]+)/' '(?P.+?)(?:\?.*)?') - _TESTS = [{ - u'url': u'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', - u'file': u'1012420.flv', - u'md5': u'15e7740f30428abf70f4223478dc1225', - u'info_dict': { - u'title': u'Fun Jynx Maze solo', + _TESTS = { + 'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', + 'file': '1012420.flv', + 'md5': '15e7740f30428abf70f4223478dc1225', + 'info_dict': { + 'title': 'Fun Jynx Maze solo', } - }] + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - video_id = mobj.group('id') webpage 
= self._download_webpage(url, video_id) - pl_fiji = re.search(r'pl_fiji = \'([^\']+)\'', webpage).group(1) - pl_c_qual = re.search(r'pl_c_qual = "(.)"', webpage).group(1) - video_title = re.search(r'mediaTitle = "([^"]+)"', webpage).group(1) + pl_fiji = self._search_regex(r'pl_fiji = \'([^\']+)\'', webpage, 'video data') + pl_c_qual = self._search_regex(r'pl_c_qual = "(.)"', webpage, 'video quality') + video_title = self._search_regex(r'mediaTitle = "([^"]+)"', webpage, 'title') video_url = string.translate(pl_fiji.encode(), translation_table) formats = [{ 'url': video_url, - 'ext': determine_ext(video_url), 'format': pl_c_qual, 'format_id': pl_c_qual, }] From b6d3a99678052bb85a187268dbd50e35fbde109c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 27 Jan 2014 12:39:39 +0100 Subject: [PATCH 118/339] [cliphunter] Simplify (#2233) --- youtube_dl/extractor/cliphunter.py | 40 ++++++++++++++---------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 42d406820..d891fa301 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -8,28 +8,22 @@ from ..utils import ( ExtractorError, ) -translation_table = ( - '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12' - '\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#:%.\'=)*+,-./0123' - '456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]&_`hbcevofhdjknamoutsstupwrli{' - '|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f' - '\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1' - '\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3' - '\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5' - '\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7' - '\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9' - 
'\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb' - '\xfc\xfd\xfe\xff' -) +translation_table = { + 'a': 'h', 'd': 'e', 'e': 'v', 'f': 'o', 'g': 'f', 'i': 'd', 'l': 'n', + 'm': 'a', 'n': 'm', 'p': 'u', 'q': 't', 'r': 's', 'v': 'p', 'x': 'r', + 'y': 'l', 'z': 'i', + '$': ':', '&': '.', '(': '=', '^': '&', '=': '/', +} class CliphunterIE(InfoExtractor): IE_NAME = 'cliphunter' - _VALID_URL = (r'(?:http://)?(?:www\.)?cliphunter\.com/w/' - '(?P[0-9]+)/' - '(?P.+?)(?:\?.*)?') - _TESTS = { + _VALID_URL = r'''(?x)http://(?:www\.)?cliphunter\.com/w/ + (?P[0-9]+)/ + (?P.+?)(?:$|[#\?]) + ''' + _TEST = { 'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', 'file': '1012420.flv', 'md5': '15e7740f30428abf70f4223478dc1225', @@ -44,15 +38,17 @@ class CliphunterIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - pl_fiji = self._search_regex(r'pl_fiji = \'([^\']+)\'', webpage, 'video data') - pl_c_qual = self._search_regex(r'pl_c_qual = "(.)"', webpage, 'video quality') - video_title = self._search_regex(r'mediaTitle = "([^"]+)"', webpage, 'title') + pl_fiji = self._search_regex( + r'pl_fiji = \'([^\']+)\'', webpage, 'video data') + pl_c_qual = self._search_regex( + r'pl_c_qual = "(.)"', webpage, 'video quality') + video_title = self._search_regex( + r'mediaTitle = "([^"]+)"', webpage, 'title') - video_url = string.translate(pl_fiji.encode(), translation_table) + video_url = ''.join(translation_table.get(c, c) for c in pl_fiji) formats = [{ 'url': video_url, - 'format': pl_c_qual, 'format_id': pl_c_qual, }] From 9b05bd42e590ee21daba80ec7fd3fd79991a3bca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 27 Jan 2014 12:41:30 +0100 Subject: [PATCH 119/339] [discovery] Extract more info and simplify --- youtube_dl/extractor/discovery.py | 36 ++++++++++++++++++------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/discovery.py 
b/youtube_dl/extractor/discovery.py index 14fca3cae..885944c5e 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re import json + from .common import InfoExtractor @@ -9,32 +10,37 @@ class DiscoveryIE(InfoExtractor): _VALID_URL = r'http://dsc\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P[a-zA-Z0-9\-]*)(.htm)?' _TEST = { 'url': 'http://dsc.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', - 'file': 'mission-impossible-outtakes.mp4', + 'file': '614784.mp4', 'md5': 'e12614f9ee303a6ccef415cb0793eba2', 'info_dict': { - 'title': 'MythBusters: Mission Impossible Outtakes' - } + 'title': 'MythBusters: Mission Impossible Outtakes', + 'description': ('Watch Jamie Hyneman and Adam Savage practice being' + ' each other -- to the point of confusing Jamie\'s dog -- and ' + 'don\'t miss Adam moon-walking as Jamie ... behind Jamie\'s' + ' back.'), + 'duration': 156, + }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = self._search_regex( - r'(?<=\"name\": ")(?P.*?)(?=\"\,)', webpage, r'Filename') - duration = int(self._search_regex( - r'(?<=\"duration\"\: )(?P<duration>.*?)(?=,)', webpage, r'Duration')) - formats_raw = self._search_regex( - r'(?<=\"mp4\":)(.*?)(}])', webpage, r'formats') + '}]' - formats_json = json.loads(formats_raw) + + video_list_json = self._search_regex(r'var videoListJSON = ({.*?});', + webpage, 'video list', flags=re.DOTALL) + video_list = json.loads(video_list_json) + info = video_list['clips'][0] formats = [] - for f in formats_json: + for f in info['mp4']: formats.append( {'url': f['src'], r'ext': r'mp4', 'tbr': int(f['bitrate'][:-1])}) return { - 'id': video_id, - 'duration': duration, - 'title': title, - 'formats': formats + 'id': info['contentId'], + 'title': video_list['name'], + 'formats': formats, + 
'description': info['videoCaption'], + 'thumbnail': info.get('videoStillURL') or info.get('thumbnailURL'), + 'duration': info['duration'], } From d1b30713fbea61427fb803696eeac5b51c4dad6e Mon Sep 17 00:00:00 2001 From: Matthew Franglen <matthew.franglen@semantico.net> Date: Mon, 27 Jan 2014 15:33:16 +0000 Subject: [PATCH 120/339] Add antigen compatible plugin description --- youtube-dl.plugin.zsh | 1 + 1 file changed, 1 insertion(+) create mode 100644 youtube-dl.plugin.zsh diff --git a/youtube-dl.plugin.zsh b/youtube-dl.plugin.zsh new file mode 100644 index 000000000..5ecc6e124 --- /dev/null +++ b/youtube-dl.plugin.zsh @@ -0,0 +1 @@ +export PATH=${PATH}:$(dirname $0) From f9b8549609554038cce3088a9cd21bd8d3f80c5c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 27 Jan 2014 18:40:10 +0100 Subject: [PATCH 121/339] [ard] Support multiple formats (Closes #2247) --- youtube_dl/extractor/ard.py | 87 ++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index dbf8eed99..b88f71bc4 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -1,22 +1,28 @@ +# coding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, ) + class ARDIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?' - _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>' - _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)' + _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?' 
+ _TEST = { - u'url': u'http://www.ardmediathek.de/das-erste/tagesschau-in-100-sek?documentId=14077640', - u'file': u'14077640.mp4', - u'md5': u'6ca8824255460c787376353f9e20bbd8', - u'info_dict': { - u"title": u"11.04.2013 09:23 Uhr - Tagesschau in 100 Sekunden" + 'url': 'http://www.ardmediathek.de/das-erste/guenther-jauch/edward-snowden-im-interview-held-oder-verraeter?documentId=19288786', + 'file': '19288786.mp4', + 'md5': '515bf47ce209fb3f5a61b7aad364634c', + 'info_dict': { + 'title': 'Edward Snowden im Interview - Held oder Verräter?', + 'description': 'Edward Snowden hat alles aufs Spiel gesetzt, um die weltweite \xdcberwachung durch die Geheimdienste zu enttarnen. Nun stellt sich der ehemalige NSA-Mitarbeiter erstmals weltweit in einem TV-Interview den Fragen eines NDR-Journalisten. Die Sendung vom Sonntagabend.', + 'thumbnail': 'http://www.ardmediathek.de/ard/servlet/contentblob/19/28/87/90/19288790/bild/2250037', }, - u'skip': u'Requires rtmpdump' + 'skip': 'Blocked outside of Germany', } def _real_extract(self, url): @@ -29,26 +35,49 @@ class ARDIE(InfoExtractor): else: video_id = m.group('video_id') - # determine title and media streams from webpage - html = self._download_webpage(url, video_id) - title = re.search(self._TITLE, html).group('title') - streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)] + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', webpage, 'title') + description = self._html_search_meta( + 'dcterms.abstract', webpage, 'description') + thumbnail = self._og_search_thumbnail(webpage) + + streams = [ + mo.groupdict() + for mo in re.finditer( + r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)', webpage)] if not streams: - assert '"fsk"' in html - raise ExtractorError(u'This video is only available after 8:00 pm') + if '"fsk"' in webpage: + raise 
ExtractorError('This video is only available after 20:00') - # choose default media type and highest quality for now - stream = max([s for s in streams if int(s["media_type"]) == 0], - key=lambda s: int(s["quality"])) + formats = [] + for s in streams: + format = { + 'quality': int(s['quality']), + } + if s.get('rtmp_url'): + format['protocol'] = 'rtmp' + format['url'] = s['rtmp_url'] + format['playpath'] = s['video_url'] + else: + format['url'] = s['video_url'] - # there's two possibilities: RTMP stream or HTTP download - info = {'id': video_id, 'title': title, 'ext': 'mp4'} - if stream['rtmp_url']: - self.to_screen(u'RTMP download detected') - assert stream['video_url'].startswith('mp4:') - info["url"] = stream["rtmp_url"] - info["play_path"] = stream['video_url'] - else: - assert stream["video_url"].endswith('.mp4') - info["url"] = stream["video_url"] - return [info] + quality_name = self._search_regex( + r'[,.]([a-zA-Z0-9_-]+),?\.mp4', format['url'], + 'quality name', default='NA') + format['format_id'] = '%s-%s-%s-%s' % ( + determine_ext(format['url']), quality_name, s['media_type'], + s['quality']) + + formats.append(format) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'thumbnail': thumbnail, + } From b21a918984ef1eaf551ca78f0a278ed27e8a3f49 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 27 Jan 2014 19:22:45 +0100 Subject: [PATCH 122/339] release 2014.01.27.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index dd3c37007..3c9952e17 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.27.1' +__version__ = '2014.01.27.2' From 075911d48ebbf0a7ea54565b08db867c99187d0f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 27 Jan 2014 23:47:22 +0100 Subject: [PATCH 123/339] 
[la7] Skip test on travis --- youtube_dl/extractor/la7.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py index 6d61f9a90..db2028e9f 100644 --- a/youtube_dl/extractor/la7.py +++ b/youtube_dl/extractor/la7.py @@ -26,7 +26,8 @@ class LA7IE(InfoExtractor): 'title': 'IL DIVO', 'description': 'Un film di Paolo Sorrentino con Toni Servillo, Anna Bonaiuto, Giulio Bosetti e Flavio Bucci', 'duration': 6254, - } + }, + 'skip': 'Blocked in the US', } def _real_extract(self, url): From 1547c8cc881b85f9f400f29c30b3a352eb679608 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Tue, 28 Jan 2014 06:56:09 +0700 Subject: [PATCH 124/339] [rutube] Add support for channels and movies --- youtube_dl/extractor/__init__.py | 6 ++- youtube_dl/extractor/rutube.py | 74 +++++++++++++++++++++++++++----- 2 files changed, 69 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1e8556124..19ca5d6b4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -161,7 +161,11 @@ from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE -from .rutube import RutubeIE +from .rutube import ( + RutubeIE, + RutubeChannelIE, + RutubeMovieIE +) from .servingsys import ServingSysIE from .sina import SinaIE from .slashdot import SlashdotIE diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index e3e9bc07f..2001a83ef 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -1,6 +1,9 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re import json +import itertools from .common import InfoExtractor from ..utils import ( @@ -11,26 +14,28 @@ from ..utils import ( class RutubeIE(InfoExtractor): + IE_NAME = 'rutube' + IE_DESC = 'Rutube videos' _VALID_URL = 
r'https?://rutube\.ru/video/(?P<long_id>\w+)' _TEST = { - u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', - u'file': u'3eac3b4561676c17df9132a9a1e62e3e.mp4', - u'info_dict': { - u'title': u'Раненный кенгуру забежал в аптеку', - u'uploader': u'NTDRussian', - u'uploader_id': u'29790', + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', + 'file': '3eac3b4561676c17df9132a9a1e62e3e.mp4', + 'info_dict': { + 'title': 'Раненный кенгуру забежал в аптеку', + 'uploader': 'NTDRussian', + 'uploader_id': '29790', }, - u'params': { + 'params': { # It requires ffmpeg (m3u8 download) - u'skip_download': True, + 'skip_download': True, }, } def _get_api_response(self, short_id, subpath): api_url = 'http://rutube.ru/api/play/%s/%s/?format=json' % (subpath, short_id) response_json = self._download_webpage(api_url, short_id, - u'Downloading %s json' % subpath) + 'Downloading %s json' % subpath) return json.loads(response_json) def _real_extract(self, url): @@ -45,7 +50,7 @@ class RutubeIE(InfoExtractor): author = trackinfo.get('author') or {} m3u8_url = trackinfo['video_balancer'].get('m3u8') if m3u8_url is None: - raise ExtractorError(u'Couldn\'t find m3u8 manifest url') + raise ExtractorError('Couldn\'t find m3u8 manifest url') return { 'id': trackinfo['id'], @@ -56,3 +61,52 @@ class RutubeIE(InfoExtractor): 'uploader': author.get('name'), 'uploader_id': compat_str(author['id']) if author else None, } + + +class RutubeChannelIE(InfoExtractor): + IE_NAME = 'rutube:channel' + IE_DESC = 'Rutube channels' + _VALID_URL = r'http://rutube\.ru/tags/video/(?P<id>\d+)' + + _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' + + def _extract_videos(self, channel_id, channel_title=None): + entries = [] + for pagenum in itertools.count(1): + response_json = self._download_webpage(self._PAGE_TEMPLATE % (channel_id, pagenum), + channel_id, 'Downloading page %s' % pagenum) + page = json.loads(response_json) + if 'detail' in page and 
page['detail'] == 'Not found': + raise ExtractorError('Channel %s does not exist' % channel_id, expected=True) + results = page['results'] + if len(results) == 0: + break; + entries.extend(self.url_result(v['video_url'], 'Rutube') for v in results) + if page['has_next'] is False: + break; + return self.playlist_result(entries, channel_id, channel_title) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + return self._extract_videos(channel_id) + + +class RutubeMovieIE(RutubeChannelIE): + IE_NAME = 'rutube:movie' + IE_DESC = 'Rutube movies' + _VALID_URL = r'http://rutube\.ru/metainfo/tv/(?P<id>\d+)' + + _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' + _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + movie_id = mobj.group('id') + movie_json = self._download_webpage(self._MOVIE_TEMPLATE % movie_id, movie_id, + 'Downloading movie JSON') + movie = json.loads(movie_json) + if 'detail' in movie and movie['detail'] == 'Not found': + raise ExtractorError('Movie %s does not exist' % movie_id, expected=True) + movie_name = movie['name'] + return self._extract_videos(movie_id, movie_name) \ No newline at end of file From 117bec936c18e9c3b1d467c8710075ed4376e254 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 28 Jan 2014 00:53:46 +0100 Subject: [PATCH 125/339] [brightcove] Parse URL from meta element if available (Fixes #2253) --- youtube_dl/extractor/brightcove.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 443294e6f..717e151d9 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -23,7 +23,6 @@ from ..utils import ( class BrightcoveIE(InfoExtractor): _VALID_URL = 
r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)' _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' - _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' _TESTS = [ { @@ -71,6 +70,17 @@ class BrightcoveIE(InfoExtractor): 'uploader': 'National Ballet of Canada', }, }, + { + # https://github.com/rg3/youtube-dl/issues/2253 + 'url': 'http://v.thestar.com/services/player/bcpid2071349530001?bckey=AQ~~,AAAAuO4KaJE~,gatFNwSKdGDmDpIYqNJ-fTHn_c4z_LH_&bctid=3101154703001', + 'file': '3101154703001.mp4', + 'md5': '0ba9446db037002366bab3b3eb30c88c', + 'info_dict': { + 'title': 'Still no power', + 'uploader': 'thestar.com', + 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', + } + } ] @classmethod @@ -131,6 +141,11 @@ class BrightcoveIE(InfoExtractor): """Try to extract the brightcove url from the wepbage, returns None if it can't be found """ + + url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage) + if url_m: + return url_m.group(1) + m_brightcove = re.search( r'''(?sx)<object (?: @@ -183,8 +198,9 @@ class BrightcoveIE(InfoExtractor): return self._extract_video_info(video_info) def _get_playlist_info(self, player_key): - playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key, - player_key, 'Downloading playlist information') + info_url = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' % player_key + playlist_info = self._download_webpage( + info_url, player_key, 'Downloading playlist information') json_data = json.loads(playlist_info) if 'videoList' not in json_data: From 9e8ee5455332a524b63a24af7542c04cc8729549 Mon Sep 17 00:00:00 2001 From: 
MikeCol <MikeCol@gmx.net> Date: Tue, 28 Jan 2014 01:41:18 +0100 Subject: [PATCH 126/339] VALID_URL changed to match different kinds of Tumblr-URLs --- youtube_dl/extractor/tumblr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index f7bc77c48..544369068 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -9,7 +9,7 @@ from ..utils import ( class TumblrIE(InfoExtractor): - _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)' + _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)($|/)' _TEST = { 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'file': '54196191430.mp4', From a2fb2a213452f0f8b4e9fea9518ad148e1b1e6d4 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Tue, 28 Jan 2014 08:19:45 +0700 Subject: [PATCH 127/339] [rutube] Improve video extractor --- youtube_dl/extractor/rutube.py | 47 +++++++++++++++++----------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 2001a83ef..6075296ff 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -7,8 +7,8 @@ import itertools from .common import InfoExtractor from ..utils import ( - compat_urlparse, compat_str, + unified_strdate, ExtractorError, ) @@ -32,20 +32,18 @@ class RutubeIE(InfoExtractor): }, } - def _get_api_response(self, short_id, subpath): - api_url = 'http://rutube.ru/api/play/%s/%s/?format=json' % (subpath, short_id) - response_json = self._download_webpage(api_url, short_id, - 'Downloading %s json' % subpath) - return json.loads(response_json) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) long_id = mobj.group('long_id') - webpage = self._download_webpage(url, long_id) - og_video = self._og_search_video_url(webpage) - 
short_id = compat_urlparse.urlparse(og_video).path[1:] - options = self._get_api_response(short_id, 'options') - trackinfo = self._get_api_response(short_id, 'trackinfo') + + api_response = self._download_webpage('http://rutube.ru/api/video/%s/?format=json' % long_id, + long_id, 'Downloading video JSON') + video = json.loads(api_response) + + api_response = self._download_webpage('http://rutube.ru/api/play/trackinfo/%s/?format=json' % long_id, + long_id, 'Downloading trackinfo JSON') + trackinfo = json.loads(api_response) + # Some videos don't have the author field author = trackinfo.get('author') or {} m3u8_url = trackinfo['video_balancer'].get('m3u8') @@ -53,13 +51,18 @@ class RutubeIE(InfoExtractor): raise ExtractorError('Couldn\'t find m3u8 manifest url') return { - 'id': trackinfo['id'], - 'title': trackinfo['title'], + 'id': video['id'], + 'title': video['title'], + 'description': video['description'], + 'duration': video['duration'], + 'view_count': video['hits'], 'url': m3u8_url, 'ext': 'mp4', - 'thumbnail': options['thumbnail_url'], + 'thumbnail': video['thumbnail_url'], 'uploader': author.get('name'), 'uploader_id': compat_str(author['id']) if author else None, + 'upload_date': unified_strdate(video['created_ts']), + 'age_limit': 18 if video['is_adult'] else 0, } @@ -73,15 +76,13 @@ class RutubeChannelIE(InfoExtractor): def _extract_videos(self, channel_id, channel_title=None): entries = [] for pagenum in itertools.count(1): - response_json = self._download_webpage(self._PAGE_TEMPLATE % (channel_id, pagenum), + api_response = self._download_webpage(self._PAGE_TEMPLATE % (channel_id, pagenum), channel_id, 'Downloading page %s' % pagenum) - page = json.loads(response_json) - if 'detail' in page and page['detail'] == 'Not found': - raise ExtractorError('Channel %s does not exist' % channel_id, expected=True) + page = json.loads(api_response) results = page['results'] if len(results) == 0: break; - entries.extend(self.url_result(v['video_url'], 'Rutube') for 
v in results) + entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results) if page['has_next'] is False: break; return self.playlist_result(entries, channel_id, channel_title) @@ -103,10 +104,8 @@ class RutubeMovieIE(RutubeChannelIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) movie_id = mobj.group('id') - movie_json = self._download_webpage(self._MOVIE_TEMPLATE % movie_id, movie_id, + api_response = self._download_webpage(self._MOVIE_TEMPLATE % movie_id, movie_id, 'Downloading movie JSON') - movie = json.loads(movie_json) - if 'detail' in movie and movie['detail'] == 'Not found': - raise ExtractorError('Movie %s does not exist' % movie_id, expected=True) + movie = json.loads(api_response) movie_name = movie['name'] return self._extract_videos(movie_id, movie_name) \ No newline at end of file From 87fac3238d09a4298d6cfa66f8753c734b81e5b7 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Tue, 28 Jan 2014 08:25:56 +0700 Subject: [PATCH 128/339] [rutube] Add channel test --- test/test_playlists.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_playlists.py b/test/test_playlists.py index 5eeba091e..3861224b0 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -33,6 +33,7 @@ from youtube_dl.extractor import ( ImdbListIE, KhanAcademyIE, EveryonesMixtapeIE, + RutubeChannelIE, ) @@ -219,6 +220,14 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['id'], 'm7m0jJAbMQi') self.assertEqual(result['title'], 'Driving') self.assertEqual(len(result['entries']), 24) + + def test_rutube_channel(self): + dl = FakeYDL() + ie = RutubeChannelIE(dl) + result = ie.extract('http://rutube.ru/tags/video/1409') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], '1409') + self.assertTrue(len(result['entries']) >= 34) if __name__ == '__main__': From e3a9f32f52bbd9be8b38d70a529808f123126ce7 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Tue, 28 Jan 2014 
08:47:17 +0700 Subject: [PATCH 129/339] [rutube] Add support for user videos --- youtube_dl/extractor/__init__.py | 3 ++- youtube_dl/extractor/rutube.py | 22 +++++++++++++++------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 19ca5d6b4..e89b5cf9d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -164,7 +164,8 @@ from .rtlnow import RTLnowIE from .rutube import ( RutubeIE, RutubeChannelIE, - RutubeMovieIE + RutubeMovieIE, + RutubePersonIE, ) from .servingsys import ServingSysIE from .sina import SinaIE diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 6075296ff..9a20facfd 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -16,7 +16,7 @@ from ..utils import ( class RutubeIE(InfoExtractor): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' - _VALID_URL = r'https?://rutube\.ru/video/(?P<long_id>\w+)' + _VALID_URL = r'https?://rutube\.ru/video/(?P<id>[\da-z]{32})' _TEST = { 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', @@ -34,14 +34,14 @@ class RutubeIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - long_id = mobj.group('long_id') + video_id = mobj.group('id') - api_response = self._download_webpage('http://rutube.ru/api/video/%s/?format=json' % long_id, - long_id, 'Downloading video JSON') + api_response = self._download_webpage('http://rutube.ru/api/video/%s/?format=json' % video_id, + video_id, 'Downloading video JSON') video = json.loads(api_response) - api_response = self._download_webpage('http://rutube.ru/api/play/trackinfo/%s/?format=json' % long_id, - long_id, 'Downloading trackinfo JSON') + api_response = self._download_webpage('http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id, + video_id, 'Downloading trackinfo JSON') trackinfo = json.loads(api_response) # Some videos don't have the author field @@ 
-108,4 +108,12 @@ class RutubeMovieIE(RutubeChannelIE): 'Downloading movie JSON') movie = json.loads(api_response) movie_name = movie['name'] - return self._extract_videos(movie_id, movie_name) \ No newline at end of file + return self._extract_videos(movie_id, movie_name) + + +class RutubePersonIE(RutubeChannelIE): + IE_NAME = 'rutube:person' + IE_DESC = 'Rutube person videos' + _VALID_URL = r'http://rutube\.ru/video/person/(?P<id>\d+)' + + _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' \ No newline at end of file From a3978a615950af6e990313820f93baddce067ee4 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Tue, 28 Jan 2014 09:12:23 +0700 Subject: [PATCH 130/339] [imdb] Fix duplicated entries bug --- youtube_dl/extractor/imdb.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 1763af020..7cee505c0 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -69,12 +69,9 @@ class ImdbListIE(InfoExtractor): list_id = mobj.group('id') webpage = self._download_webpage(url, list_id) - list_code = self._search_regex( - r'(?s)<div\s+class="list\sdetail">(.*?)class="see-more"', - webpage, 'list code') entries = [ self.url_result('http://www.imdb.com' + m, 'Imdb') - for m in re.findall(r'href="(/video/imdb/vi[^"]+)"', webpage)] + for m in re.findall(r'href="(/video/imdb/vi[^"]+)"\s+data-type="playlist"', webpage)] list_title = self._html_search_regex( r'<h1 class="header">(.*?)</h1>', webpage, 'list title') From 98669ed79c10bbcfc5678a520de98df79ca6b1b4 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Tue, 28 Jan 2014 09:13:08 +0700 Subject: [PATCH 131/339] [imdb] Fix playlist test --- test/test_playlists.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index 5eeba091e..a8f4a49f4 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py 
@@ -195,11 +195,11 @@ class TestPlaylists(unittest.TestCase): def test_imdb_list(self): dl = FakeYDL() ie = ImdbListIE(dl) - result = ie.extract('http://www.imdb.com/list/sMjedvGDd8U') + result = ie.extract('http://www.imdb.com/list/JFs9NWw6XI0') self.assertIsPlaylist(result) - self.assertEqual(result['id'], 'sMjedvGDd8U') - self.assertEqual(result['title'], 'Animated and Family Films') - self.assertTrue(len(result['entries']) >= 48) + self.assertEqual(result['id'], 'JFs9NWw6XI0') + self.assertEqual(result['title'], 'March 23, 2012 Releases') + self.assertEqual(len(result['entries']), 7) def test_khanacademy_topic(self): dl = FakeYDL() From 00ff8f92a54de32eb2bde54f3c52fe4acf854ee1 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Tue, 28 Jan 2014 09:31:14 +0700 Subject: [PATCH 132/339] [rutube] Update test --- youtube_dl/extractor/rutube.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 9a20facfd..83459c522 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -23,8 +23,11 @@ class RutubeIE(InfoExtractor): 'file': '3eac3b4561676c17df9132a9a1e62e3e.mp4', 'info_dict': { 'title': 'Раненный кенгуру забежал в аптеку', + 'description': 'http://www.ntdtv.ru ', + 'duration': 80, 'uploader': 'NTDRussian', 'uploader_id': '29790', + 'upload_date': '20131016', }, 'params': { # It requires ffmpeg (m3u8 download) From 37e3b90d5996bc14e6ced937907a36b02bb8e490 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 28 Jan 2014 03:32:07 +0100 Subject: [PATCH 133/339] [rutube] Simplify --- youtube_dl/extractor/rutube.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 9a20facfd..f58c775ba 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -15,7 +15,7 @@ from ..utils import ( class 
RutubeIE(InfoExtractor): IE_NAME = 'rutube' - IE_DESC = 'Rutube videos' + IE_DESC = 'Rutube videos' _VALID_URL = r'https?://rutube\.ru/video/(?P<id>[\da-z]{32})' _TEST = { @@ -68,7 +68,7 @@ class RutubeIE(InfoExtractor): class RutubeChannelIE(InfoExtractor): IE_NAME = 'rutube:channel' - IE_DESC = 'Rutube channels' + IE_DESC = 'Rutube channels' _VALID_URL = r'http://rutube\.ru/tags/video/(?P<id>\d+)' _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' @@ -76,15 +76,16 @@ class RutubeChannelIE(InfoExtractor): def _extract_videos(self, channel_id, channel_title=None): entries = [] for pagenum in itertools.count(1): - api_response = self._download_webpage(self._PAGE_TEMPLATE % (channel_id, pagenum), - channel_id, 'Downloading page %s' % pagenum) + api_response = self._download_webpage( + self._PAGE_TEMPLATE % (channel_id, pagenum), + channel_id, 'Downloading page %s' % pagenum) page = json.loads(api_response) results = page['results'] - if len(results) == 0: - break; + if not results: + break entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results) - if page['has_next'] is False: - break; + if not page['has_next']: + break return self.playlist_result(entries, channel_id, channel_title) def _real_extract(self, url): @@ -95,7 +96,7 @@ class RutubeChannelIE(InfoExtractor): class RutubeMovieIE(RutubeChannelIE): IE_NAME = 'rutube:movie' - IE_DESC = 'Rutube movies' + IE_DESC = 'Rutube movies' _VALID_URL = r'http://rutube\.ru/metainfo/tv/(?P<id>\d+)' _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' @@ -104,8 +105,9 @@ class RutubeMovieIE(RutubeChannelIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) movie_id = mobj.group('id') - api_response = self._download_webpage(self._MOVIE_TEMPLATE % movie_id, movie_id, - 'Downloading movie JSON') + api_response = self._download_webpage( + self._MOVIE_TEMPLATE % movie_id, movie_id, + 'Downloading movie JSON') movie = json.loads(api_response) 
movie_name = movie['name'] return self._extract_videos(movie_id, movie_name) @@ -116,4 +118,4 @@ class RutubePersonIE(RutubeChannelIE): IE_DESC = 'Rutube person videos' _VALID_URL = r'http://rutube\.ru/video/person/(?P<id>\d+)' - _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' \ No newline at end of file + _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' From 17ab4d3b5e3075d32a0a93422e40f61ef4919d7d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 28 Jan 2014 03:35:32 +0100 Subject: [PATCH 134/339] [brightcove] Move test to generic --- youtube_dl/extractor/brightcove.py | 11 ----------- youtube_dl/extractor/generic.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 717e151d9..9ccf923a6 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -69,17 +69,6 @@ class BrightcoveIE(InfoExtractor): 'description': 'md5:363109c02998fee92ec02211bd8000df', 'uploader': 'National Ballet of Canada', }, - }, - { - # https://github.com/rg3/youtube-dl/issues/2253 - 'url': 'http://v.thestar.com/services/player/bcpid2071349530001?bckey=AQ~~,AAAAuO4KaJE~,gatFNwSKdGDmDpIYqNJ-fTHn_c4z_LH_&bctid=3101154703001', - 'file': '3101154703001.mp4', - 'md5': '0ba9446db037002366bab3b3eb30c88c', - 'info_dict': { - 'title': 'Still no power', - 'uploader': 'thestar.com', - 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. 
To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', - } } ] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 829e5894f..48de379b7 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -78,6 +78,18 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # https://github.com/rg3/youtube-dl/issues/2253 + 'url': 'http://bcove.me/i6nfkrc3', + 'file': '3101154703001.mp4', + 'md5': '0ba9446db037002366bab3b3eb30c88c', + 'info_dict': { + 'title': 'Still no power', + 'uploader': 'thestar.com', + 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', + }, + 'add_ie': ['Brightcove'], + }, # Direct link to a video { 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', From 456895d9cfb9ee37eb9b328f1a96cee601904fb8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 28 Jan 2014 03:37:23 +0100 Subject: [PATCH 135/339] [tumblr] Test new URL format (#2255) --- test/test_all_urls.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 75547f42a..94cbce6e8 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -120,5 +120,9 @@ class TestAllURLsMatching(unittest.TestCase): def test_soundcloud_not_matching_sets(self): self.assertMatch('http://soundcloud.com/floex/sets/gone-ep', ['soundcloud:set']) + def test_tumblr(self): + self.assertMatch('http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', ['Tumblr']) + self.assertMatch('http://tatianamaslanydaily.tumblr.com/post/54196191430', ['Tumblr']) + if __name__ == '__main__': unittest.main() From bc1d1a5a71f5996d9516a01b69f28dfa4aa598a7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 28 Jan 
2014 03:37:42 +0100 Subject: [PATCH 136/339] release 2014.01.28 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 3c9952e17..8a2ff3210 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.27.2' +__version__ = '2014.01.28' From 4a192f817ee8f3283e0799a4b960b762a8590770 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 28 Jan 2014 03:44:19 +0100 Subject: [PATCH 137/339] release 2014.01.28.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8a2ff3210..aab85706a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.28' +__version__ = '2014.01.28.1' From e299f6d27f8ba003e22ccbc550309ea60e7229ad Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 28 Jan 2014 03:53:00 +0100 Subject: [PATCH 138/339] [pornhd] Fix --- youtube_dl/extractor/pornhd.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index e9ff8d1af..58f9c690e 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -7,12 +9,12 @@ from ..utils import compat_urllib_parse class PornHdIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)' _TEST = { - u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', - u'file': u'1962.flv', - u'md5': u'35272469887dca97abd30abecc6cdf75', - u'info_dict': { - u"title": u"sierra-day-gets-his-cum-all-over-herself-hd-porn-video", - u"age_limit": 18, + 'url': 
'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', + 'file': '1962.flv', + 'md5': '35272469887dca97abd30abecc6cdf75', + 'info_dict': { + "title": "sierra-day-gets-his-cum-all-over-herself-hd-porn-video", + "age_limit": 18, } } @@ -24,9 +26,13 @@ class PornHdIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'&hd=(http.+?)&', webpage, u'video URL') - video_url = compat_urllib_parse.unquote(video_url) + next_url = self._html_search_regex( + r'&hd=(http.+?)&', webpage, 'video URL') + next_url = compat_urllib_parse.unquote(next_url) + + video_url = self._download_webpage( + next_url, video_id, note='Retrieving video URL', + errnote='Could not retrieve video URL') age_limit = 18 return { From 869baf35659945c755bb901146d456e5c21b425e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 28 Jan 2014 18:37:42 +0100 Subject: [PATCH 139/339] [funnyordie] Simplify and use unicode_literals --- youtube_dl/extractor/funnyordie.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 2ccdb7073..7c40e6753 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -6,13 +8,16 @@ from .common import InfoExtractor class FunnyOrDieIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$' _TEST = { - u'url': u'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version', - u'file': u'0732f586d7.mp4', - u'md5': u'f647e9e90064b53b6e046e75d0241fbd', - u'info_dict': { - u"description": u"Lyrics changed to match the video. Spoken cameo by Obscurus Lupa (from ThatGuyWithTheGlasses.com). 
Based on a concept by Dustin McLean (DustFilms.com). Performed, edited, and written by David A. Scott.", - u"title": u"Heart-Shaped Box: Literal Video Version" - } + 'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version', + 'file': '0732f586d7.mp4', + 'md5': 'f647e9e90064b53b6e046e75d0241fbd', + 'info_dict': { + 'description': ('Lyrics changed to match the video. Spoken cameo ' + 'by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a ' + 'concept by Dustin McLean (DustFilms.com). Performed, edited, ' + 'and written by David A. Scott.'), + 'title': 'Heart-Shaped Box: Literal Video Version', + }, } def _real_extract(self, url): @@ -23,13 +28,12 @@ class FunnyOrDieIE(InfoExtractor): video_url = self._search_regex( [r'type="video/mp4" src="(.*?)"', r'src="([^>]*?)" type=\'video/mp4\''], - webpage, u'video URL', flags=re.DOTALL) + webpage, 'video URL', flags=re.DOTALL) - info = { + return { 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), } - return [info] From 5aaca50d60cf2dfbc548dd14dc0289dcd0a4d89b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 28 Jan 2014 18:47:31 +0100 Subject: [PATCH 140/339] [keek] Simplify and use unicode_literals --- youtube_dl/extractor/keek.py | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py index a7b88d2d9..5d679e88d 100644 --- a/youtube_dl/extractor/keek.py +++ b/youtube_dl/extractor/keek.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -5,36 +7,34 @@ from .common import InfoExtractor class KeekIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)' - IE_NAME = u'keek' + IE_NAME = 'keek' _TEST = { - u'url': 
u'https://www.keek.com/ytdl/keeks/NODfbab', - u'file': u'NODfbab.mp4', - u'md5': u'9b0636f8c0f7614afa4ea5e4c6e57e83', - u'info_dict': { - u"uploader": u"ytdl", - u"title": u"test chars: \"'/\\\u00e4<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de ." - } + 'url': 'https://www.keek.com/ytdl/keeks/NODfbab', + 'file': 'NODfbab.mp4', + 'md5': '9b0636f8c0f7614afa4ea5e4c6e57e83', + 'info_dict': { + 'uploader': 'ytdl', + 'title': 'test chars: "\'/\\\u00e4<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de .', + }, } def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') - video_url = u'http://cdn.keek.com/keek/video/%s' % video_id - thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id + video_url = 'http://cdn.keek.com/keek/video/%s' % video_id + thumbnail = 'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - video_title = self._og_search_title(webpage) + uploader = self._html_search_regex( + r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', + webpage, 'uploader', fatal=False) - uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', - webpage, u'uploader', fatal=False) - - info = { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': video_title, - 'thumbnail': thumbnail, - 'uploader': uploader + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': self._og_search_title(webpage), + 'thumbnail': thumbnail, + 'uploader': uploader } - return [info] From ed85007039c2cd23638a318f1750160f99e703a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 28 Jan 2014 18:55:06 +0100 Subject: [PATCH 141/339] [ninegag] Use unicode_literals --- youtube_dl/extractor/ninegag.py | 16 +++++++++------- 1 file changed, 9 
insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index ea986c00e..2b7236be5 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import json import re @@ -9,13 +11,13 @@ class NineGagIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?9gag\.tv/v/(?P<id>[0-9]+)' _TEST = { - u"url": u"http://9gag.tv/v/1912", - u"file": u"1912.mp4", - u"info_dict": { - u"description": u"This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)", - u"title": u"\"People Are Awesome 2013\" Is Absolutely Awesome" + "url": "http://9gag.tv/v/1912", + "file": "1912.mp4", + "info_dict": { + "description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)", + "title": "\"People Are Awesome 2013\" Is Absolutely Awesome" }, - u'add_ie': [u'Youtube'] + 'add_ie': ['Youtube'] } def _real_extract(self, url): @@ -25,7 +27,7 @@ class NineGagIE(InfoExtractor): webpage = self._download_webpage(url, video_id) data_json = self._html_search_regex(r'''(?x) <div\s*id="tv-video"\s*data-video-source="youtube"\s* - data-video-meta="([^"]+)"''', webpage, u'video metadata') + data-video-meta="([^"]+)"''', webpage, 'video metadata') data = json.loads(data_json) From ffe8f62d27762e8627e53ab1081f433663eb307c Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Wed, 29 Jan 2014 01:52:57 +0700 Subject: [PATCH 142/339] [smotri] Simplify login and use unicode literals --- youtube_dl/extractor/smotri.py | 247 ++++++++++++++++----------------- 1 file changed, 121 insertions(+), 126 deletions(-) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 99f5b19d2..f249f013c 100644 --- a/youtube_dl/extractor/smotri.py +++ 
b/youtube_dl/extractor/smotri.py @@ -1,4 +1,5 @@ # encoding: utf-8 +from __future__ import unicode_literals import os.path import re @@ -16,76 +17,76 @@ from ..utils import ( class SmotriIE(InfoExtractor): - IE_DESC = u'Smotri.com' - IE_NAME = u'smotri' + IE_DESC = 'Smotri.com' + IE_NAME = 'smotri' _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))' _TESTS = [ # real video id 2610366 { - u'url': u'http://smotri.com/video/view/?id=v261036632ab', - u'file': u'v261036632ab.mp4', - u'md5': u'2a7b08249e6f5636557579c368040eb9', - u'info_dict': { - u'title': u'катастрофа с камер видеонаблюдения', - u'uploader': u'rbc2008', - u'uploader_id': u'rbc08', - u'upload_date': u'20131118', - u'description': u'катастрофа с камер видеонаблюдения, видео катастрофа с камер видеонаблюдения', - u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg', + 'url': 'http://smotri.com/video/view/?id=v261036632ab', + 'file': 'v261036632ab.mp4', + 'md5': '2a7b08249e6f5636557579c368040eb9', + 'info_dict': { + 'title': 'катастрофа с камер видеонаблюдения', + 'uploader': 'rbc2008', + 'uploader_id': 'rbc08', + 'upload_date': '20131118', + 'description': 'катастрофа с камер видеонаблюдения, видео катастрофа с камер видеонаблюдения', + 'thumbnail': 'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg', }, }, # real video id 57591 { - u'url': u'http://smotri.com/video/view/?id=v57591cb20', - u'file': u'v57591cb20.flv', - u'md5': u'830266dfc21f077eac5afd1883091bcd', - u'info_dict': { - u'title': u'test', - u'uploader': u'Support Photofile@photofile', - u'uploader_id': u'support-photofile', - u'upload_date': u'20070704', - u'description': u'test, видео test', - u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', + 'url': 'http://smotri.com/video/view/?id=v57591cb20', + 'file': 'v57591cb20.flv', + 'md5': '830266dfc21f077eac5afd1883091bcd', + 'info_dict': { + 'title': 'test', + 'uploader': 'Support Photofile@photofile', + 
'uploader_id': 'support-photofile', + 'upload_date': '20070704', + 'description': 'test, видео test', + 'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', }, }, # video-password { - u'url': u'http://smotri.com/video/view/?id=v1390466a13c', - u'file': u'v1390466a13c.mp4', - u'md5': u'f6331cef33cad65a0815ee482a54440b', - u'info_dict': { - u'title': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', - u'uploader': u'timoxa40', - u'uploader_id': u'timoxa40', - u'upload_date': u'20100404', - u'thumbnail': u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg', - u'description': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1, видео TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', + 'url': 'http://smotri.com/video/view/?id=v1390466a13c', + 'file': 'v1390466a13c.mp4', + 'md5': 'f6331cef33cad65a0815ee482a54440b', + 'info_dict': { + 'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', + 'uploader': 'timoxa40', + 'uploader_id': 'timoxa40', + 'upload_date': '20100404', + 'thumbnail': 'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg', + 'description': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1, видео TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', }, - u'params': { - u'videopassword': u'qwerty', + 'params': { + 'videopassword': 'qwerty', }, }, # age limit + video-password { - u'url': u'http://smotri.com/video/view/?id=v15408898bcf', - u'file': u'v15408898bcf.flv', - u'md5': u'91e909c9f0521adf5ee86fbe073aad70', - u'info_dict': { - u'title': u'этот ролик не покажут по ТВ', - u'uploader': u'zzxxx', - u'uploader_id': u'ueggb', - u'upload_date': u'20101001', - u'thumbnail': u'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', - u'age_limit': 18, - u'description': u'этот ролик не покажут по ТВ, видео этот ролик не покажут по ТВ', + 'url': 'http://smotri.com/video/view/?id=v15408898bcf', + 'file': 'v15408898bcf.flv', + 'md5': '91e909c9f0521adf5ee86fbe073aad70', + 'info_dict': { + 'title': 'этот ролик не покажут по ТВ', + 'uploader': 'zzxxx', + 'uploader_id': 
'ueggb', + 'upload_date': '20101001', + 'thumbnail': 'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', + 'age_limit': 18, + 'description': 'этот ролик не покажут по ТВ, видео этот ролик не покажут по ТВ', }, - u'params': { - u'videopassword': u'333' + 'params': { + 'videopassword': '333' } } ] - + _SUCCESS = 0 _PASSWORD_NOT_VERIFIED = 1 _PASSWORD_DETECTED = 2 @@ -106,71 +107,71 @@ class SmotriIE(InfoExtractor): # Download video JSON data video_json_url = 'http://smotri.com/vt.php?id=%s' % real_video_id - video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON') + video_json_page = self._download_webpage(video_json_url, video_id, 'Downloading video JSON') video_json = json.loads(video_json_page) - + status = video_json['status'] if status == self._VIDEO_NOT_FOUND: - raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) - elif status == self._PASSWORD_DETECTED: # The video is protected by a password, retry with + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + elif status == self._PASSWORD_DETECTED: # The video is protected by a password, retry with # video-password set video_password = self._downloader.params.get('videopassword', None) if not video_password: - raise ExtractorError(u'This video is protected by a password, use the --video-password option', expected=True) + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) video_json_url += '&md5pass=%s' % hashlib.md5(video_password.encode('utf-8')).hexdigest() - video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON (video-password set)') + video_json_page = self._download_webpage(video_json_url, video_id, 'Downloading video JSON (video-password set)') video_json = json.loads(video_json_page) status = video_json['status'] if status == self._PASSWORD_NOT_VERIFIED: - raise ExtractorError(u'Video password is invalid', expected=True) - + raise 
ExtractorError('Video password is invalid', expected=True) + if status != self._SUCCESS: - raise ExtractorError(u'Unexpected status value %s' % status) - + raise ExtractorError('Unexpected status value %s' % status) + # Extract the URL of the video video_url = video_json['file_data'] - + # Video JSON does not provide enough meta data # We will extract some from the video web page instead video_page_url = 'http://' + mobj.group('url') - video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page') + video_page = self._download_webpage(video_page_url, video_id, 'Downloading video page') # Warning if video is unavailable warning = self._html_search_regex( r'<div class="videoUnModer">(.*?)</div>', video_page, - u'warning message', default=None) + 'warning message', default=None) if warning is not None: self._downloader.report_warning( - u'Video %s may not be available; smotri said: %s ' % + 'Video %s may not be available; smotri said: %s ' % (video_id, warning)) # Adult content - if re.search(u'EroConfirmText">', video_page) is not None: + if re.search('EroConfirmText">', video_page) is not None: self.report_age_confirmation() confirm_string = self._html_search_regex( r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, - video_page, u'confirm string') + video_page, 'confirm string') confirm_url = video_page_url + '&confirm=%s' % confirm_string - video_page = self._download_webpage(confirm_url, video_id, u'Downloading video page (age confirmed)') + video_page = self._download_webpage(confirm_url, video_id, 'Downloading video page (age confirmed)') adult_content = True else: adult_content = False - + # Extract the rest of meta data - video_title = self._search_meta(u'name', video_page, u'title') + video_title = self._search_meta('name', video_page, 'title') if not video_title: video_title = os.path.splitext(url_basename(video_url))[0] - video_description = self._search_meta(u'description', video_page) - END_TEXT = u' на 
сайте Smotri.com' + video_description = self._search_meta('description', video_page) + END_TEXT = ' на сайте Smotri.com' if video_description and video_description.endswith(END_TEXT): video_description = video_description[:-len(END_TEXT)] - START_TEXT = u'Смотреть онлайн ролик ' + START_TEXT = 'Смотреть онлайн ролик ' if video_description and video_description.startswith(START_TEXT): video_description = video_description[len(START_TEXT):] - video_thumbnail = self._search_meta(u'thumbnail', video_page) + video_thumbnail = self._search_meta('thumbnail', video_page) - upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date') + upload_date_str = self._search_meta('uploadDate', video_page, 'upload date') if upload_date_str: upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) video_upload_date = ( @@ -183,8 +184,8 @@ class SmotriIE(InfoExtractor): ) else: video_upload_date = None - - duration_str = self._search_meta(u'duration', video_page) + + duration_str = self._search_meta('duration', video_page) if duration_str: duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str) video_duration = ( @@ -197,19 +198,19 @@ class SmotriIE(InfoExtractor): ) else: video_duration = None - + video_uploader = self._html_search_regex( - u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', - video_page, u'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL) - + '<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', + video_page, 'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL) + video_uploader_id = self._html_search_regex( - u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\\(.*?\'([^\']+)\'\\);">', - video_page, u'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL) - + '<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\\(.*?\'([^\']+)\'\\);">', + video_page, 'uploader 
id', fatal=False, flags=re.MULTILINE|re.DOTALL) + video_view_count = self._html_search_regex( - u'Общее количество просмотров.*?<span class="Number">(\\d+)</span>', - video_page, u'view count', fatal=False, flags=re.MULTILINE|re.DOTALL) - + 'Общее количество просмотров.*?<span class="Number">(\\d+)</span>', + video_page, 'view count', fatal=False, flags=re.MULTILINE|re.DOTALL) + return { 'id': video_id, 'url': video_url, @@ -227,8 +228,8 @@ class SmotriIE(InfoExtractor): class SmotriCommunityIE(InfoExtractor): - IE_DESC = u'Smotri.com community videos' - IE_NAME = u'smotri:community' + IE_DESC = 'Smotri.com community videos' + IE_NAME = 'smotri:community' _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)' def _real_extract(self, url): @@ -236,21 +237,21 @@ class SmotriCommunityIE(InfoExtractor): community_id = mobj.group('communityid') url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id - rss = self._download_xml(url, community_id, u'Downloading community RSS') + rss = self._download_xml(url, community_id, 'Downloading community RSS') entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] description_text = rss.find('./channel/description').text community_title = self._html_search_regex( - u'^Видео сообщества "([^"]+)"$', description_text, u'community title') + '^Видео сообщества "([^"]+)"$', description_text, 'community title') return self.playlist_result(entries, community_id, community_title) class SmotriUserIE(InfoExtractor): - IE_DESC = u'Smotri.com user videos' - IE_NAME = u'smotri:user' + IE_DESC = 'Smotri.com user videos' + IE_NAME = 'smotri:user' _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)' def _real_extract(self, url): @@ -258,22 +259,22 @@ class SmotriUserIE(InfoExtractor): user_id = mobj.group('userid') url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id - rss = 
self._download_xml(url, user_id, u'Downloading user RSS') + rss = self._download_xml(url, user_id, 'Downloading user RSS') entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] description_text = rss.find('./channel/description').text user_nickname = self._html_search_regex( - u'^Видео режиссера (.*)$', description_text, - u'user nickname') + '^Видео режиссера (.*)$', description_text, + 'user nickname') return self.playlist_result(entries, user_id, user_nickname) class SmotriBroadcastIE(InfoExtractor): - IE_DESC = u'Smotri.com broadcasts' - IE_NAME = u'smotri:broadcast' + IE_DESC = 'Smotri.com broadcasts' + IE_NAME = 'smotri:broadcast' _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*' def _real_extract(self, url): @@ -281,46 +282,40 @@ class SmotriBroadcastIE(InfoExtractor): broadcast_id = mobj.group('broadcastid') broadcast_url = 'http://' + mobj.group('url') - broadcast_page = self._download_webpage(broadcast_url, broadcast_id, u'Downloading broadcast page') + broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') - if re.search(u'>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None: - raise ExtractorError(u'Broadcast %s does not exist' % broadcast_id, expected=True) + if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None: + raise ExtractorError('Broadcast %s does not exist' % broadcast_id, expected=True) # Adult content - if re.search(u'EroConfirmText">', broadcast_page) is not None: + if re.search('EroConfirmText">', broadcast_page) is not None: (username, password) = self._get_login_info() if username is None: - raise ExtractorError(u'Erotic broadcasts allowed only for registered users, ' - u'use --username and --password options to provide account credentials.', expected=True) + raise ExtractorError('Erotic broadcasts 
allowed only for registered users, ' + 'use --username and --password options to provide account credentials.', expected=True) - # Log in - login_form_strs = { - u'login-hint53': '1', - u'confirm_erotic': '1', - u'login': username, - u'password': password, + login_form = { + 'login-hint53': '1', + 'confirm_erotic': '1', + 'login': username, + 'password': password, } - # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode - # chokes on unicode - login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) - login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') - login_url = broadcast_url + '/?no_redirect=1' - request = compat_urllib_request.Request(login_url, login_data) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - broadcast_page = self._download_webpage( - request, broadcast_id, note=u'Logging in and confirming age') - if re.search(u'>Неверный логин или пароль<', broadcast_page) is not None: - raise ExtractorError(u'Unable to log in: bad username or password', expected=True) + request = compat_urllib_request.Request(broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form)) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + broadcast_page = self._download_webpage(request, broadcast_id, 'Logging in and confirming age') + + if re.search('>Неверный логин или пароль<', broadcast_page) is not None: + raise ExtractorError('Unable to log in: bad username or password', expected=True) adult_content = True else: adult_content = False ticket = self._html_search_regex( - u'window\.broadcast_control\.addFlashVar\\(\'file\', \'([^\']+)\'\\);', - broadcast_page, u'broadcast ticket') + 'window\.broadcast_control\.addFlashVar\\(\'file\', \'([^\']+)\'\\);', + broadcast_page, 'broadcast ticket') url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket @@ -328,22 +323,22 @@ class SmotriBroadcastIE(InfoExtractor): if broadcast_password: url 
+= '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() - broadcast_json_page = self._download_webpage(url, broadcast_id, u'Downloading broadcast JSON') + broadcast_json_page = self._download_webpage(url, broadcast_id, 'Downloading broadcast JSON') try: broadcast_json = json.loads(broadcast_json_page) protected_broadcast = broadcast_json['_pass_protected'] == 1 if protected_broadcast and not broadcast_password: - raise ExtractorError(u'This broadcast is protected by a password, use the --video-password option', expected=True) + raise ExtractorError('This broadcast is protected by a password, use the --video-password option', expected=True) broadcast_offline = broadcast_json['is_play'] == 0 if broadcast_offline: - raise ExtractorError(u'Broadcast %s is offline' % broadcast_id, expected=True) + raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True) rtmp_url = broadcast_json['_server'] if not rtmp_url.startswith('rtmp://'): - raise ExtractorError(u'Unexpected broadcast rtmp URL') + raise ExtractorError('Unexpected broadcast rtmp URL') broadcast_playpath = broadcast_json['_streamName'] broadcast_thumbnail = broadcast_json['_imgURL'] @@ -354,8 +349,8 @@ class SmotriBroadcastIE(InfoExtractor): rtmp_conn = 'S:%s' % uuid.uuid4().hex except KeyError: if protected_broadcast: - raise ExtractorError(u'Bad broadcast password', expected=True) - raise ExtractorError(u'Unexpected broadcast JSON') + raise ExtractorError('Bad broadcast password', expected=True) + raise ExtractorError('Unexpected broadcast JSON') return { 'id': broadcast_id, From adc267eebf294e9aa27c8df97307a25cd0adf83f Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Wed, 29 Jan 2014 02:00:56 +0700 Subject: [PATCH 143/339] [channel9] Use unicode literals --- youtube_dl/extractor/channel9.py | 69 ++++++++++++++++---------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 
574881b70..1fce59dba 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -1,4 +1,5 @@ # encoding: utf-8 +from __future__ import unicode_literals import re @@ -11,38 +12,38 @@ class Channel9IE(InfoExtractor): The type of provided URL (video or playlist) is determined according to meta Search.PageType from web page HTML rather than URL itself, as it is - not always possible to do. + not always possible to do. ''' - IE_DESC = u'Channel 9' - IE_NAME = u'channel9' + IE_DESC = 'Channel 9' + IE_NAME = 'channel9' _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?' _TESTS = [ { - u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', - u'file': u'Events_TechEd_Australia_2013_KOS002.mp4', - u'md5': u'bbd75296ba47916b754e73c3a4bbdf10', - u'info_dict': { - u'title': u'Developer Kick-Off Session: Stuff We Love', - u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f', - u'duration': 4576, - u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', - u'session_code': u'KOS002', - u'session_day': u'Day 1', - u'session_room': u'Arena 1A', - u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ], + 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', + 'file': 'Events_TechEd_Australia_2013_KOS002.mp4', + 'md5': 'bbd75296ba47916b754e73c3a4bbdf10', + 'info_dict': { + 'title': 'Developer Kick-Off Session: Stuff We Love', + 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', + 'duration': 4576, + 'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', + 'session_code': 'KOS002', + 'session_day': 'Day 1', + 'session_room': 'Arena 1A', + 'session_speakers': [ 'Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen' ], }, }, { - u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', - 
u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4', - u'md5': u'b43ee4529d111bc37ba7ee4f34813e68', - u'info_dict': { - u'title': u'Self-service BI with Power BI - nuclear testing', - u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', - u'duration': 1540, - u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', - u'authors': [ u'Mike Wilmot' ], + 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', + 'file': 'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4', + 'md5': 'b43ee4529d111bc37ba7ee4f34813e68', + 'info_dict': { + 'title': 'Self-service BI with Power BI - nuclear testing', + 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', + 'duration': 1540, + 'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', + 'authors': [ 'Mike Wilmot' ], }, } ] @@ -60,7 +61,7 @@ class Channel9IE(InfoExtractor): return 0 units = m.group('units') try: - exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper()) + exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper()) except ValueError: return 0 size = float(m.group('size')) @@ -80,7 +81,7 @@ class Channel9IE(InfoExtractor): 'url': x.group('url'), 'format_id': x.group('quality'), 'format_note': x.group('note'), - 'format': u'%s (%s)' % (x.group('quality'), x.group('note')), + 'format': '%s (%s)' % (x.group('quality'), x.group('note')), 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate 'preference': self._known_formats.index(x.group('quality')), 'vcodec': 'none' if x.group('note') == 'Audio only' else None, @@ -91,10 +92,10 @@ class Channel9IE(InfoExtractor): return formats def _extract_title(self, html): - title = self._html_search_meta(u'title', html, u'title') + title = self._html_search_meta('title', html, 'title') if title is None: title = self._og_search_title(html) 
- TITLE_SUFFIX = u' (Channel 9)' + TITLE_SUFFIX = ' (Channel 9)' if title is not None and title.endswith(TITLE_SUFFIX): title = title[:-len(TITLE_SUFFIX)] return title @@ -110,7 +111,7 @@ class Channel9IE(InfoExtractor): m = re.search(DESCRIPTION_REGEX, html) if m is not None: return m.group('description') - return self._html_search_meta(u'description', html, u'description') + return self._html_search_meta('description', html, 'description') def _extract_duration(self, html): m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html) @@ -172,7 +173,7 @@ class Channel9IE(InfoExtractor): # Nothing to download if len(formats) == 0 and slides is None and zip_ is None: - self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path) + self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path) return # Extract meta @@ -244,7 +245,7 @@ class Channel9IE(InfoExtractor): return contents def _extract_list(self, content_path): - rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS') + rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS') entries = [self.url_result(session_url.text, 'Channel9') for session_url in rss.findall('./channel/item/link')] title_text = rss.find('./channel/title').text @@ -254,11 +255,11 @@ class Channel9IE(InfoExtractor): mobj = re.match(self._VALID_URL, url) content_path = mobj.group('contentpath') - webpage = self._download_webpage(url, content_path, u'Downloading web page') + webpage = self._download_webpage(url, content_path, 'Downloading web page') page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage) if page_type_m is None: - raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True) + raise ExtractorError('Search.PageType not found, don\'t know how to process this 
page', expected=True) page_type = page_type_m.group('pagetype') if page_type == 'List': # List page, may contain list of 'item'-like objects @@ -268,4 +269,4 @@ class Channel9IE(InfoExtractor): elif page_type == 'Session': # Event session page, may contain downloadable content return self._extract_session(webpage, content_path) else: - raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True) \ No newline at end of file + raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True) \ No newline at end of file From 459a53c2c295aaecc14e62af9df4e4ce4a218d1c Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Wed, 29 Jan 2014 02:07:29 +0700 Subject: [PATCH 144/339] [channel9] Remove unnecessary coding cookie --- youtube_dl/extractor/channel9.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 1fce59dba..3867d7850 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -1,4 +1,3 @@ -# encoding: utf-8 from __future__ import unicode_literals import re From ceb2b7d257a61c930a89f154da4dce6d720cea4e Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Wed, 29 Jan 2014 02:20:48 +0700 Subject: [PATCH 145/339] [ivi] Fix test and use unicode literals --- youtube_dl/extractor/ivi.py | 108 ++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 98d1d272a..18dd9cb1e 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -1,4 +1,5 @@ # encoding: utf-8 +from __future__ import unicode_literals import re import json @@ -11,38 +12,38 @@ from ..utils import ( class IviIE(InfoExtractor): - IE_DESC = u'ivi.ru' - IE_NAME = u'ivi' + IE_DESC = 'ivi.ru' + IE_NAME = 'ivi' _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)' _TESTS = [ # Single movie { - u'url': 
u'http://www.ivi.ru/watch/53141', - u'file': u'53141.mp4', - u'md5': u'6ff5be2254e796ed346251d117196cf4', - u'info_dict': { - u'title': u'Иван Васильевич меняет профессию', - u'description': u'md5:14d8eda24e9d93d29b5857012c6d6346', - u'duration': 5498, - u'thumbnail': u'http://thumbs.ivi.ru/f20.vcp.digitalaccess.ru/contents/d/1/c3c885163a082c29bceeb7b5a267a6.jpg', + 'url': 'http://www.ivi.ru/watch/53141', + 'file': '53141.mp4', + 'md5': '6ff5be2254e796ed346251d117196cf4', + 'info_dict': { + 'title': 'Иван Васильевич меняет профессию', + 'description': 'md5:b924063ea1677c8fe343d8a72ac2195f', + 'duration': 5498, + 'thumbnail': 'http://thumbs.ivi.ru/f20.vcp.digitalaccess.ru/contents/d/1/c3c885163a082c29bceeb7b5a267a6.jpg', }, - u'skip': u'Only works from Russia', + 'skip': 'Only works from Russia', }, # Serial's serie { - u'url': u'http://www.ivi.ru/watch/dezhurnyi_angel/74791', - u'file': u'74791.mp4', - u'md5': u'3e6cc9a848c1d2ebcc6476444967baa9', - u'info_dict': { - u'title': u'Дежурный ангел - 1 серия', - u'duration': 2490, - u'thumbnail': u'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg', + 'url': 'http://www.ivi.ru/watch/dezhurnyi_angel/74791', + 'file': '74791.mp4', + 'md5': '3e6cc9a848c1d2ebcc6476444967baa9', + 'info_dict': { + 'title': 'Дежурный ангел - 1 серия', + 'duration': 2490, + 'thumbnail': 'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg', }, - u'skip': u'Only works from Russia', + 'skip': 'Only works from Russia', } ] - + # Sorted by quality _known_formats = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ'] @@ -54,7 +55,7 @@ class IviIE(InfoExtractor): return m.group('description') if m is not None else None def _extract_comment_count(self, html): - m = re.search(u'(?s)<a href="#" id="view-comments" class="action-button dim gradient">\s*Комментарии:\s*(?P<commentcount>\d+)\s*</a>', html) + m = re.search('(?s)<a href="#" 
id="view-comments" class="action-button dim gradient">\s*Комментарии:\s*(?P<commentcount>\d+)\s*</a>', html) return int(m.group('commentcount')) if m is not None else 0 def _real_extract(self, url): @@ -63,49 +64,49 @@ class IviIE(InfoExtractor): api_url = 'http://api.digitalaccess.ru/api/json/' - data = {u'method': u'da.content.get', - u'params': [video_id, {u'site': u's183', - u'referrer': u'http://www.ivi.ru/watch/%s' % video_id, - u'contentid': video_id - } - ] + data = {'method': 'da.content.get', + 'params': [video_id, {'site': 's183', + 'referrer': 'http://www.ivi.ru/watch/%s' % video_id, + 'contentid': video_id + } + ] } request = compat_urllib_request.Request(api_url, json.dumps(data)) - video_json_page = self._download_webpage(request, video_id, u'Downloading video JSON') + video_json_page = self._download_webpage(request, video_id, 'Downloading video JSON') video_json = json.loads(video_json_page) - if u'error' in video_json: - error = video_json[u'error'] - if error[u'origin'] == u'NoRedisValidData': - raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) - raise ExtractorError(u'Unable to download video %s: %s' % (video_id, error[u'message']), expected=True) + if 'error' in video_json: + error = video_json['error'] + if error['origin'] == 'NoRedisValidData': + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + raise ExtractorError('Unable to download video %s: %s' % (video_id, error['message']), expected=True) - result = video_json[u'result'] + result = video_json['result'] formats = [{ - 'url': x[u'url'], - 'format_id': x[u'content_format'], - 'preference': self._known_formats.index(x[u'content_format']), - } for x in result[u'files'] if x[u'content_format'] in self._known_formats] + 'url': x['url'], + 'format_id': x['content_format'], + 'preference': self._known_formats.index(x['content_format']), + } for x in result['files'] if x['content_format'] in self._known_formats] self._sort_formats(formats) if 
not formats: - raise ExtractorError(u'No media links available for %s' % video_id) + raise ExtractorError('No media links available for %s' % video_id) - duration = result[u'duration'] - compilation = result[u'compilation'] - title = result[u'title'] + duration = result['duration'] + compilation = result['compilation'] + title = result['title'] title = '%s - %s' % (compilation, title) if compilation is not None else title - previews = result[u'preview'] + previews = result['preview'] previews.sort(key=lambda fmt: self._known_thumbnails.index(fmt['content_format'])) - thumbnail = previews[-1][u'url'] if len(previews) > 0 else None + thumbnail = previews[-1]['url'] if len(previews) > 0 else None - video_page = self._download_webpage(url, video_id, u'Downloading video page') + video_page = self._download_webpage(url, video_id, 'Downloading video page') description = self._extract_description(video_page) comment_count = self._extract_comment_count(video_page) @@ -121,8 +122,8 @@ class IviIE(InfoExtractor): class IviCompilationIE(InfoExtractor): - IE_DESC = u'ivi.ru compilations' - IE_NAME = u'ivi:compilation' + IE_DESC = 'ivi.ru compilations' + IE_NAME = 'ivi:compilation' _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$' def _extract_entries(self, html, compilation_id): @@ -135,22 +136,23 @@ class IviCompilationIE(InfoExtractor): season_id = mobj.group('seasonid') if season_id is not None: # Season link - season_page = self._download_webpage(url, compilation_id, u'Downloading season %s web page' % season_id) + season_page = self._download_webpage(url, compilation_id, 'Downloading season %s web page' % season_id) playlist_id = '%s/season%s' % (compilation_id, season_id) - playlist_title = self._html_search_meta(u'title', season_page, u'title') + playlist_title = self._html_search_meta('title', season_page, 'title') entries = self._extract_entries(season_page, compilation_id) else: # Compilation link - 
compilation_page = self._download_webpage(url, compilation_id, u'Downloading compilation web page') + compilation_page = self._download_webpage(url, compilation_id, 'Downloading compilation web page') playlist_id = compilation_id - playlist_title = self._html_search_meta(u'title', compilation_page, u'title') + playlist_title = self._html_search_meta('title', compilation_page, 'title') seasons = re.findall(r'<a href="/watch/%s/season(\d+)">[^<]+</a>' % compilation_id, compilation_page) if len(seasons) == 0: # No seasons in this compilation entries = self._extract_entries(compilation_page, compilation_id) else: entries = [] for season_id in seasons: - season_page = self._download_webpage('http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id), - compilation_id, u'Downloading season %s web page' % season_id) + season_page = self._download_webpage( + 'http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id), + compilation_id, 'Downloading season %s web page' % season_id) entries.extend(self._extract_entries(season_page, compilation_id)) return self.playlist_result(entries, playlist_id, playlist_title) \ No newline at end of file From 7eeb5bef24b0c76e5d01d8f4ac6f94fb70416bb5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 28 Jan 2014 21:57:38 +0100 Subject: [PATCH 146/339] [liveleak] Simplify --- youtube_dl/extractor/liveleak.py | 38 +++++++++++++------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 5ae57a77c..d01fd01e3 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -7,46 +9,36 @@ from ..utils import ( class LiveLeakIE(InfoExtractor): - _VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)' - IE_NAME = u'liveleak' _TEST = { - u'url': 
u'http://www.liveleak.com/view?i=757_1364311680', - u'file': u'757_1364311680.mp4', - u'md5': u'0813c2430bea7a46bf13acf3406992f4', - u'info_dict': { - u"description": u"extremely bad day for this guy..!", - u"uploader": u"ljfriel2", - u"title": u"Most unlucky car accident" + 'url': 'http://www.liveleak.com/view?i=757_1364311680', + 'file': '757_1364311680.mp4', + 'md5': '0813c2430bea7a46bf13acf3406992f4', + 'info_dict': { + 'description': 'extremely bad day for this guy..!', + 'uploader': 'ljfriel2', + 'title': 'Most unlucky car accident' } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('video_id') - webpage = self._download_webpage(url, video_id) - - video_url = self._search_regex(r'file: "(.*?)",', - webpage, u'video URL') + video_url = self._search_regex( + r'file: "(.*?)",', webpage, 'video URL') video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip() - video_description = self._og_search_description(webpage) + video_uploader = self._html_search_regex( + r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False) - video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>', - webpage, u'uploader', fatal=False) - - info = { - 'id': video_id, + return { + 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': video_title, 'description': video_description, 'uploader': video_uploader } - - return [info] From b11cec4162ea8deeda3bbaa081d402b76dfb3899 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 29 Jan 2014 11:16:12 +0100 Subject: [PATCH 147/339] [youtube:user] Fix id key (Fixes #1745) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 87a5a452e..54592d174 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1662,7 +1662,7 @@ class 
YoutubeUserIE(InfoExtractor): '_type': 'url', 'url': video_id, 'ie_key': 'Youtube', - 'id': 'video_id', + 'id': video_id, 'title': title, } url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) From 8db69786c2c1d7d846fec21541c0363676c211cf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 29 Jan 2014 11:16:28 +0100 Subject: [PATCH 148/339] release 2014.01.29 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index aab85706a..7158105c2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.28.1' +__version__ = '2014.01.29' From 26a78d4bbfdab15d05517f4d26c6753bfd148749 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 29 Jan 2014 15:16:18 +0100 Subject: [PATCH 149/339] [nba] Simplify and use unicode_literals Remove the commented parts for extracting the upload date --- youtube_dl/extractor/nba.py | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 0f178905b..7e421610e 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -1,48 +1,39 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class NBAIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$' _TEST = { - u'url': u'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - u'file': u'0021200253-okc-bkn-recap.nba.mp4', - u'md5': u'c0edcfc37607344e2ff8f13c378c88a4', - u'info_dict': { - u"description": u"Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.", - u"title": u"Thunder vs. 
Nets" - } + 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', + 'file': u'0021200253-okc-bkn-recap.nba.mp4', + 'md5': u'c0edcfc37607344e2ff8f13c378c88a4', + 'info_dict': { + 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', + 'title': 'Thunder vs. Nets', + }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' + video_url = 'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' shortened_video_id = video_id.rpartition('/')[2] title = self._og_search_title(webpage, default=shortened_video_id).replace('NBA.com: ', '') - # It isn't there in the HTML it returns to us - # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) - description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) - info = { + return { 'id': shortened_video_id, 'url': video_url, 'ext': 'mp4', 'title': title, - # 'uploader_date': uploader_date, 'description': description, } - return [info] From d4a21e0b4978b1207f48d6311cfec1ea14a62b76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 29 Jan 2014 15:21:12 +0100 Subject: [PATCH 150/339] [tutv] Simplify and use unicode_literals --- youtube_dl/extractor/tutv.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index 4e404fbf5..c980153ec 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -1,3 +1,4 @@ +from __future__ import 
unicode_literals import base64 import re @@ -6,15 +7,16 @@ from ..utils import ( compat_parse_qs, ) + class TutvIE(InfoExtractor): - _VALID_URL=r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)' + _VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)' _TEST = { - u'url': u'http://tu.tv/videos/noah-en-pabellon-cuahutemoc', - u'file': u'2742556.flv', - u'md5': u'5eb766671f69b82e528dc1e7769c5cb2', - u'info_dict': { - u"title": u"Noah en pabellon cuahutemoc" - } + 'url': 'http://tu.tv/videos/noah-en-pabellon-cuahutemoc', + 'file': '2742556.flv', + 'md5': '5eb766671f69b82e528dc1e7769c5cb2', + 'info_dict': { + 'title': 'Noah en pabellon cuahutemoc', + }, } def _real_extract(self, url): @@ -22,18 +24,15 @@ class TutvIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID') + internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, 'internal video ID') - data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) - data_content = self._download_webpage(data_url, video_id, note=u'Downloading video info') + data_url = 'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) + data_content = self._download_webpage(data_url, video_id, note='Downloading video info') data = compat_parse_qs(data_content) video_url = base64.b64decode(data['kpt'][0]).decode('utf-8') - ext = video_url.partition(u'?')[0].rpartition(u'.')[2] - info = { + return { 'id': internal_id, 'url': video_url, - 'ext': ext, 'title': self._og_search_title(webpage), } - return [info] From d882161d5a25b658c3be7f6c6c86f4c707d58bb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 29 Jan 2014 15:34:35 +0100 Subject: [PATCH 151/339] [infoq] Simplify and use unicode_literals --- youtube_dl/extractor/infoq.py | 39 ++++++++++++++--------------------- 1 file changed, 16 insertions(+), 23 
deletions(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index c79c589c7..7c208b85d 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -1,27 +1,27 @@ +from __future__ import unicode_literals + import base64 import re from .common import InfoExtractor from ..utils import ( compat_urllib_parse, - - ExtractorError, ) class InfoQIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' _TEST = { - u"name": u"InfoQ", - u"url": u"http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", - u"file": u"12-jan-pythonthings.mp4", - u"info_dict": { - u"description": u"Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.", - u"title": u"A Few of My Favorite [Python] Things" + "name": "InfoQ", + "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", + "file": "12-jan-pythonthings.mp4", + "info_dict": { + "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.", + "title": "A Few of My Favorite [Python] Things", + }, + "params": { + "skip_download": True, }, - u"params": { - u"skip_download": True - } } def _real_extract(self, url): @@ -31,32 +31,25 @@ class InfoQIE(InfoExtractor): self.report_extraction(url) # Extract video URL - mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video url') - real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8')) + encoded_id = self._search_regex(r"jsclassref ?= ?'([^']*)'", webpage, 'encoded id') + real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')) video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id # Extract title video_title = self._search_regex(r'contentTitle = 
"(.*?)";', - webpage, u'title') + webpage, 'title') # Extract description video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', - webpage, u'description', fatal=False) + webpage, 'description', fatal=False) video_filename = video_url.split('/')[-1] video_id, extension = video_filename.split('.') - info = { + return { 'id': video_id, 'url': video_url, - 'uploader': None, - 'upload_date': None, 'title': video_title, 'ext': extension, # Extension is always(?) mp4, but seems to be flv - 'thumbnail': None, 'description': video_description, } - - return [info] \ No newline at end of file From 245b612a366b7ccae9a6e17c6f223caa551b8426 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 29 Jan 2014 16:37:10 +0100 Subject: [PATCH 152/339] [rbmaradio] Simplify and use unicode_literals --- youtube_dl/extractor/rbmaradio.py | 51 ++++++++++++++++--------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index 4b6147a73..b9cb7abd1 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -1,3 +1,6 @@ +# encoding: utf-8 +from __future__ import unicode_literals + import json import re @@ -12,16 +15,16 @@ from ..utils import ( class RBMARadioIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$' _TEST = { - u'url': u'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011', - u'file': u'ford-lopatin-live-at-primavera-sound-2011.mp3', - u'md5': u'6bc6f9bcb18994b4c983bc3bf4384d95', - u'info_dict': { - u"uploader_id": u"ford-lopatin", - u"location": u"Spain", - u"description": u"Joel Ford and Daniel \u2019Oneohtrix Point Never\u2019 Lopatin fly their midified pop extravaganza to Spain. 
Live at Primavera Sound 2011.", - u"uploader": u"Ford & Lopatin", - u"title": u"Live at Primavera Sound 2011" - } + 'url': 'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011', + 'file': 'ford-lopatin-live-at-primavera-sound-2011.mp3', + 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', + 'info_dict': { + "uploader_id": "ford-lopatin", + "location": "Spain", + "description": "Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. Live at Primavera Sound 2011.", + "uploader": "Ford & Lopatin", + "title": "Live at Primavera Sound 2011", + }, } def _real_extract(self, url): @@ -31,26 +34,24 @@ class RBMARadioIE(InfoExtractor): webpage = self._download_webpage(url, video_id) json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', - webpage, u'json data', flags=re.MULTILINE) + webpage, 'json data', flags=re.MULTILINE) try: data = json.loads(json_data) except ValueError as e: - raise ExtractorError(u'Invalid JSON: ' + str(e)) + raise ExtractorError('Invalid JSON: ' + str(e)) video_url = data['akamai_url'] + '&cbr=256' url_parts = compat_urllib_parse_urlparse(video_url) - video_ext = url_parts.path.rpartition('.')[2] - info = { - 'id': video_id, - 'url': video_url, - 'ext': video_ext, - 'title': data['title'], - 'description': data.get('teaser_text'), - 'location': data.get('country_of_origin'), - 'uploader': data.get('host', {}).get('name'), - 'uploader_id': data.get('host', {}).get('slug'), - 'thumbnail': data.get('image', {}).get('large_url_2x'), - 'duration': data.get('duration'), + + return { + 'id': video_id, + 'url': video_url, + 'title': data['title'], + 'description': data.get('teaser_text'), + 'location': data.get('country_of_origin'), + 'uploader': data.get('host', {}).get('name'), + 'uploader_id': data.get('host', {}).get('slug'), + 'thumbnail': data.get('image', {}).get('large_url_2x'), + 'duration': data.get('duration'), } - return [info] From 0b76600deb1d084cac9387b417a5d1756f7e13a6 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 29 Jan 2014 16:44:21 +0100 Subject: [PATCH 153/339] [youjizz] Simplify and use unicode_literals --- youtube_dl/extractor/youjizz.py | 42 +++++++++++++++++---------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index e971b5b4b..fcb5ff758 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -9,12 +11,12 @@ from ..utils import ( class YouJizzIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$' _TEST = { - u'url': u'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', - u'file': u'2189178.flv', - u'md5': u'07e15fa469ba384c7693fd246905547c', - u'info_dict': { - u"title": u"Zeichentrick 1", - u"age_limit": 18, + 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', + 'file': '2189178.flv', + 'md5': '07e15fa469ba384c7693fd246905547c', + 'info_dict': { + "title": "Zeichentrick 1", + "age_limit": 18, } } @@ -30,12 +32,12 @@ class YouJizzIE(InfoExtractor): # Get the video title video_title = self._html_search_regex(r'<title>(?P<title>.*)', - webpage, u'title').strip() + webpage, 'title').strip() # Get the embed page result = re.search(r'https?://www.youjizz.com/videos/embed/(?P[0-9]+)', webpage) if result is None: - raise ExtractorError(u'ERROR: unable to extract embed page') + raise ExtractorError('ERROR: unable to extract embed page') embed_page_url = result.group(0).strip() video_id = result.group('videoid') @@ -47,23 +49,23 @@ class YouJizzIE(InfoExtractor): if m_playlist is not None: playlist_url = m_playlist.group('playlist') playlist_page = self._download_webpage(playlist_url, video_id, - u'Downloading playlist page') + 'Downloading playlist 
page') m_levels = list(re.finditer(r'[^"]+)"\)\);', - webpage, u'video URL') + webpage, 'video URL') - info = {'id': video_id, - 'url': video_url, - 'title': video_title, - 'ext': 'flv', - 'format': 'flv', - 'player_url': embed_page_url, - 'age_limit': age_limit} - - return [info] + return { + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'ext': 'flv', + 'format': 'flv', + 'player_url': embed_page_url, + 'age_limit': age_limit, + } From fb2a706d11a52156f56aaa8751e40177494ababa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 29 Jan 2014 16:55:23 +0100 Subject: [PATCH 154/339] [myspass] Simplify and use unicode_literals --- youtube_dl/extractor/myspass.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index 4becddee6..4fa0575f8 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals import os.path from .common import InfoExtractor @@ -11,13 +12,13 @@ from ..utils import ( class MySpassIE(InfoExtractor): _VALID_URL = r'http://www\.myspass\.de/.*' _TEST = { - u'url': u'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', - u'file': u'11741.mp4', - u'md5': u'0b49f4844a068f8b33f4b7c88405862b', - u'info_dict': { - u"description": u"Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? 
Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?", - u"title": u"Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2" - } + 'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', + 'file': '11741.mp4', + 'md5': '0b49f4844a068f8b33f4b7c88405862b', + 'info_dict': { + "description": "Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?", + "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2", + }, } def _real_extract(self, url): @@ -37,12 +38,11 @@ class MySpassIE(InfoExtractor): # extract values from metadata url_flv_el = metadata.find('url_flv') if url_flv_el is None: - raise ExtractorError(u'Unable to extract download url') + raise ExtractorError('Unable to extract download url') video_url = url_flv_el.text - extension = os.path.splitext(video_url)[1][1:] title_el = metadata.find('title') if title_el is None: - raise ExtractorError(u'Unable to extract title') + raise ExtractorError('Unable to extract title') title = title_el.text format_id_el = metadata.find('format_id') if format_id_el is None: @@ -59,13 +59,12 @@ class MySpassIE(InfoExtractor): thumbnail = imagePreview_el.text else: thumbnail = None - info = { + + return { 'id': video_id, 'url': video_url, 'title': title, - 'ext': extension, 'format': format, 'thumbnail': thumbnail, - 'description': description + 'description': description, } - return [info] From 0c708f11cbde6339adbf9a8477f673e25e68e920 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 29 Jan 2014 18:03:32 +0100 Subject: [PATCH 155/339] [bloomberg] Fix ooyala url extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added a 
helper method to InfoExtractor for searching the ‘twitter:player’ meta property. Now the OoyalaIE also recognizes the ‘ec’ parameter in the url as the embed code. --- youtube_dl/extractor/bloomberg.py | 4 ++-- youtube_dl/extractor/common.py | 4 ++++ youtube_dl/extractor/ooyala.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index d18bc7e0c..df2cff81c 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -24,5 +24,5 @@ class BloombergIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') webpage = self._download_webpage(url, name) - ooyala_code = self._search_regex(r'.+?)(&|$)' + _VALID_URL = r'https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=(?P.+?)(&|$)' _TEST = { # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video From c3f51436bf3b36572b403d3240c947e6961bd25f Mon Sep 17 00:00:00 2001 From: dst Date: Thu, 30 Jan 2014 04:26:46 +0700 Subject: [PATCH 156/339] Improve some regexes for embedded players --- youtube_dl/extractor/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 48de379b7..e0d51996d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -254,7 +254,7 @@ class GenericIE(InfoExtractor): # Look for embedded (iframe) Vimeo player mobj = re.search( - r']+?src="((?:https?:)?//player.vimeo.com/video/.+?)"', webpage) + r']+?src="((?:https?:)?//player\.vimeo\.com/video/.+?)"', webpage) if mobj: player_url = unescapeHTML(mobj.group(1)) surl = smuggle_url(player_url, {'Referer': url}) @@ -262,7 +262,7 @@ class GenericIE(InfoExtractor): # Look for embedded (swf embed) Vimeo player mobj = re.search( - r']+?src="(https?://(?:www\.)?vimeo.com/moogaloop.swf.+?)"', webpage) + 
r']+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) if mobj: return self.url_result(mobj.group(1), 'Vimeo') @@ -332,7 +332,7 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group(1), 'Aparat') # Look for MPORA videos - mobj = re.search(r'', page) if not mobj: raise ExtractorError('No media found') From 463b33461637b0f2c98772228dd9d7eae171b560 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Fri, 14 Feb 2014 23:12:15 +0700 Subject: [PATCH 307/339] [ndr] Replace 404 test --- youtube_dl/extractor/ndr.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 811ef5201..0650f9564 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -26,15 +26,15 @@ class NDRIE(InfoExtractor): }, }, { - 'url': 'http://www.ndr.de/903/audio191719.html', - 'md5': '41ed601768534dd18a9ae34d84798129', + 'url': 'http://www.ndr.de/info/audio51535.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', 'note': 'Audio file', 'info_dict': { - 'id': '191719', + 'id': '51535', 'ext': 'mp3', - 'title': '"Es war schockierend"', - 'description': 'md5:ed7ff8364793545021a6355b97e95f10', - 'duration': 112, + 'title': 'La Valette entgeht der Hinrichtung', + 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', + 'duration': 884, } } ] From 66c43a53e4b1b4d4e530ae4dcded2d382d51b264 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Laxstr=C3=B6m?= Date: Fri, 14 Feb 2014 18:14:28 +0200 Subject: [PATCH 308/339] Add support for video.helsinki.fi archives --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/helsinki.py | 51 ++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 youtube_dl/extractor/helsinki.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3a8cd8a58..9490df0d8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -91,6 +91,7 @@ from .generic 
import GenericIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .hark import HarkIE +from .helsinki import HelsinkiIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .huffpost import HuffPostIE diff --git a/youtube_dl/extractor/helsinki.py b/youtube_dl/extractor/helsinki.py new file mode 100644 index 000000000..2a54f3cca --- /dev/null +++ b/youtube_dl/extractor/helsinki.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class HelsinkiIE(InfoExtractor): + _VALID_URL = r'https?://video\.helsinki\.fi/Arkisto/flash\.php\?id=(?P\d+)' + _TEST = { + 'url': 'http://video.helsinki.fi/Arkisto/flash.php?id=20258', + 'md5': 'cd829201b890905682eb194cbdea55d7', + 'info_dict': { + 'id': '20258', + 'ext': 'mp4', + 'title': 'Tietotekniikkafoorumi-iltapäivä', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + vid = mobj.group('id') + webpage = self._download_webpage(url, vid) + formats = [] + mobj = re.search('file=((\w+):[^&]+)', webpage) + if mobj: formats.append({ + 'ext': mobj.group(2), + 'play_path': mobj.group(1), + 'url': 'rtmp://flashvideo.it.helsinki.fi/vod/', + 'player_url': 'http://video.helsinki.fi/player.swf', + 'format_note': 'sd' + }) + + mobj = re.search('hd\.file=((\w+):[^&]+)', webpage) + if mobj: formats.append({ + 'ext': mobj.group(2), + 'play_path': mobj.group(1), + 'url': 'rtmp://flashvideo.it.helsinki.fi/vod/', + 'player_url': 'http://video.helsinki.fi/player.swf', + 'format_note': 'hd' + }) + + return { + 'id': vid, + 'title': self._og_search_title(webpage).replace('Video: ', ''), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': formats + } From 3165dc4d9f8cf637c544802bbbb519d7b3578f08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 15 Feb 2014 13:04:31 
+0100 Subject: [PATCH 309/339] [france2.fr:generation-quoi] Skip test The videos seem to not be available outside France --- youtube_dl/extractor/francetv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index ae342341c..51eb97b2f 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -184,6 +184,7 @@ class GenerationQuoiIE(InfoExtractor): # It uses Dailymotion 'skip_download': True, }, + 'skip': 'Only available from France', } def _real_extract(self, url): From e68abba91099eddee5e84bac6b6228a3b6fb1a95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 15 Feb 2014 13:12:41 +0100 Subject: [PATCH 310/339] [sohu] Skip test Only available from China --- youtube_dl/extractor/sohu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 2b9bf0cb7..bebcafb62 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -17,6 +17,7 @@ class SohuIE(InfoExtractor): u'info_dict': { u'title': u'MV:Far East Movement《The Illest》', }, + u'skip': u'Only available from China', } def _real_extract(self, url): From 99043c2ea5a670587b005a9cae33cd138a515290 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 15 Feb 2014 13:17:31 +0100 Subject: [PATCH 311/339] Replace test for dailymotion users --- test/test_playlists.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index e0eb05460..1de9e8ec1 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -55,10 +55,10 @@ class TestPlaylists(unittest.TestCase): def test_dailymotion_user(self): dl = FakeYDL() ie = DailymotionUserIE(dl) - result = ie.extract('http://www.dailymotion.com/user/generation-quoi/') + result = ie.extract('https://www.dailymotion.com/user/nqtv') 
self.assertIsPlaylist(result) - self.assertEqual(result['title'], 'Génération Quoi') - self.assertTrue(len(result['entries']) >= 26) + self.assertEqual(result['title'], 'Rémi Gaillard') + self.assertTrue(len(result['entries']) >= 100) def test_vimeo_channel(self): dl = FakeYDL() From b53466e1680db3d710415329674c887d38af46c5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 15 Feb 2014 16:24:43 +0100 Subject: [PATCH 312/339] Fix f4m downloading on Python 2.6 --- test/test_utils.py | 4 ++++ youtube_dl/downloader/f4m.py | 11 ++++++----- youtube_dl/utils.py | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 97c408ebf..1ca5f5af8 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -25,6 +25,7 @@ from youtube_dl.utils import ( shell_quote, smuggle_url, str_to_int, + struct_unpack, timeconvert, unescapeHTML, unified_strdate, @@ -237,5 +238,8 @@ class TestUtil(unittest.TestCase): testPL(5, 2, (2, 99), [2, 3, 4]) testPL(5, 2, (20, 99), []) + def test_struct_unpack(self): + self.assertEqual(struct_unpack(u'!B', b'\x00'), (0,)) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 9a6c03556..052751106 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -4,13 +4,14 @@ import base64 import io import itertools import os -from struct import unpack, pack import time import xml.etree.ElementTree as etree from .common import FileDownloader from .http import HttpFD from ..utils import ( + struct_pack, + struct_unpack, compat_urllib_request, compat_urlparse, format_bytes, @@ -27,13 +28,13 @@ class FlvReader(io.BytesIO): # Utility functions for reading numbers and strings def read_unsigned_long_long(self): - return unpack('!Q', self.read(8))[0] + return struct_unpack('!Q', self.read(8))[0] def read_unsigned_int(self): - return unpack('!I', self.read(4))[0] + return struct_unpack('!I', 
self.read(4))[0] def read_unsigned_char(self): - return unpack('!B', self.read(1))[0] + return struct_unpack('!B', self.read(1))[0] def read_string(self): res = b'' @@ -196,7 +197,7 @@ def write_flv_header(stream, metadata): # Script data stream.write(b'\x12') # Size of the metadata with 3 bytes - stream.write(pack('!L', len(metadata))[1:]) + stream.write(struct_pack('!L', len(metadata))[1:]) stream.write(b'\x00\x00\x00\x00\x00\x00\x00') stream.write(metadata) # Magic numbers extracted from the output files produced by AdobeHDS.php diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 67c6af507..dd03f058f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -17,6 +17,7 @@ import platform import re import ssl import socket +import struct import subprocess import sys import traceback @@ -1220,3 +1221,20 @@ def uppercase_escape(s): return re.sub( r'\\U([0-9a-fA-F]{8})', lambda m: compat_chr(int(m.group(1), base=16)), s) + +try: + struct.pack(u'!I', 0) +except TypeError: + # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument + def struct_pack(spec, *args): + if isinstance(spec, compat_str): + spec = spec.encode('ascii') + return struct.pack(spec, *args) + + def struct_unpack(spec, *args): + if isinstance(spec, compat_str): + spec = spec.encode('ascii') + return struct.unpack(spec, *args) +else: + struct_pack = struct.pack + struct_unpack = struct.unpack From 07ad22b8afb41aa0ef000a67532a6498c0edc592 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 15 Feb 2014 16:30:11 +0100 Subject: [PATCH 313/339] [youtube:search] Mark "no results found" error as expected --- youtube_dl/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8c2c4dfa2..a81036843 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1694,7 +1694,8 @@ class YoutubeSearchIE(SearchInfoExtractor): api_response = 
data['data'] if 'items' not in api_response: - raise ExtractorError(u'[youtube] No video results') + raise ExtractorError( + u'[youtube] No video results', expected=True) new_ids = list(video['id'] for video in api_response['items']) video_ids += new_ids From 1824b4816974bbd8d026ed44185ad888e6fb6d70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 15 Feb 2014 17:09:49 +0100 Subject: [PATCH 314/339] [f4m] Download only the first fragment with the `--test` option --- youtube_dl/downloader/f4m.py | 11 ++++++++++- youtube_dl/extractor/syfy.py | 5 +---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 052751106..2a870a758 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -224,7 +224,13 @@ class F4mFD(FileDownloader): self.to_screen('[download] Downloading f4m manifest') manifest = self.ydl.urlopen(man_url).read() self.report_destination(filename) - http_dl = HttpQuietDownloader(self.ydl, {'continuedl': True, 'quiet': True, 'noprogress': True}) + http_dl = HttpQuietDownloader(self.ydl, + { + 'continuedl': True, + 'quiet': True, + 'noprogress': True, + 'test': self.params.get('test', False), + }) doc = etree.fromstring(manifest) formats = [(int(f.attrib.get('bitrate', -1)), f) for f in doc.findall(_add_ns('media'))] @@ -235,6 +241,9 @@ class F4mFD(FileDownloader): metadata = base64.b64decode(media.find(_add_ns('metadata')).text) boot_info = read_bootstrap_info(bootstrap) fragments_list = build_fragments_list(boot_info) + if self.params.get('test', False): + # We only download the first fragment + fragments_list = fragments_list[:1] total_frags = len(fragments_list) tmpfilename = self.temp_name(filename) diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py index 502d43ec4..8809a57fe 100644 --- a/youtube_dl/extractor/syfy.py +++ b/youtube_dl/extractor/syfy.py @@ -10,16 +10,13 @@ class 
SyfyIE(InfoExtractor): _TEST = { 'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458', + 'md5': 'e07de1d52c7278adbb9b9b1c93a66849', 'info_dict': { 'id': 'NmqMrGnXvmO1', 'ext': 'flv', 'title': 'George Lucas has Advice for his Daughter', 'description': 'Listen to what insights George Lucas give his daughter Amanda.', }, - 'params': { - # f4m download - 'skip_download': True, - }, 'add_ie': ['ThePlatform'], } From 00cf938aa54af446d20d01f5184d09880af803f3 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Sun, 16 Feb 2014 06:11:38 +0700 Subject: [PATCH 315/339] [nfb] Add rtmp app field to format --- youtube_dl/extractor/nfb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py index a8c514f53..e88566c69 100644 --- a/youtube_dl/extractor/nfb.py +++ b/youtube_dl/extractor/nfb.py @@ -74,7 +74,8 @@ class NFBIE(InfoExtractor): description = media.find('description').text # It seems assets always go from lower to better quality, so no need to sort formats = [{ - 'url': x.find('default/streamerURI').text + '/', + 'url': x.find('default/streamerURI').text, + 'app': x.find('default/streamerURI').text.split('/', 3)[3], 'play_path': x.find('default/url').text, 'rtmp_live': False, 'ext': 'mp4', From 03635e2a711483ad3cf0bf5cdbde173fa37593c9 Mon Sep 17 00:00:00 2001 From: Michael Kaiser Date: Sun, 16 Feb 2014 18:10:04 +0100 Subject: [PATCH 316/339] Add support for 4tube.com. 
--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/fourtube.py | 57 ++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 youtube_dl/extractor/fourtube.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8715da7db..7b247e124 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -73,6 +73,7 @@ from .fktv import ( FKTVPosteckeIE, ) from .flickr import FlickrIE +from .fourtube import FourTubeIE from .franceinter import FranceInterIE from .francetv import ( PluzzIE, diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py new file mode 100644 index 000000000..8e7a7e156 --- /dev/null +++ b/youtube_dl/extractor/fourtube.py @@ -0,0 +1,57 @@ +import re + +from .common import InfoExtractor +from ..utils import compat_urllib_request + +class FourTubeIE(InfoExtractor): + IE_NAME = '4tube' + _VALID_URL = r'(?:https?://)?www\.4tube\.com/videos/(?P\d+)/.*' + + _TEST = { + 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '209733', + 'ext': 'mp4', + 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black' + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage_url = 'http://www.4tube.com/videos/' + video_id + webpage = self._download_webpage(webpage_url, video_id) + + self.report_extraction(video_id) + + playlist_json = self._html_search_regex(r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, u'Playlist') + media_id = self._search_regex(r'idMedia:\s*(\d+)', playlist_json, u"Media Id") + thumbnail_url = self._search_regex(r'image:\s*"([^"]*)', playlist_json, u'Thumbnail') + sources = self._search_regex(r'sources:\s*\[([^\]]*)\]', playlist_json, u'Sources').split(',') + title = self._search_regex(r'title:\s*"([^"]*)', playlist_json, u'Title') + + 
token_url = "http://tkn.4tube.com/{0}/desktop/{1}".format(media_id, "+".join(sources)) + headers = { + b'Content-Type': b'application/x-www-form-urlencoded', + b'Origin': b'http://www.4tube.com', + } + token_req = compat_urllib_request.Request(token_url, b'{}', headers) + tokens = self._download_json(token_req, video_id) + + formats = [{ + 'url': tokens[format]['token'], + 'format_id': format + 'p', + 'resolution': format + 'p', + 'quality': int(format), + } for format in sources] + + return [{ + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail_url, + 'age_limit': 18, + 'webpage_url': webpage_url, + }] From 9032dc28a63e0bed6d6d49f445c2a1f3499de911 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Mon, 17 Feb 2014 02:05:15 +0700 Subject: [PATCH 317/339] [vk] Add login feature (Closes #2206) --- youtube_dl/extractor/vk.py | 84 ++++++++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index f13ba1c8e..b8299c237 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -6,6 +6,9 @@ import json from .common import InfoExtractor from ..utils import ( + ExtractorError, + compat_urllib_request, + compat_urllib_parse, compat_str, unescapeHTML, ) @@ -15,30 +18,78 @@ class VKIE(InfoExtractor): IE_NAME = 'vk.com' _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P.*?)(?:\?|%2F|$)' - _TESTS = [{ - 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', - 'file': '162222515.flv', - 'md5': '0deae91935c54e00003c2a00646315f0', - 'info_dict': { - 'title': 'ProtivoGunz - Хуёвая песня', - 'uploader': 'Noize MC', + _TESTS = [ + { + 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', + 'md5': '0deae91935c54e00003c2a00646315f0', + 'info_dict': { + 'id': '162222515', + 'ext': 'flv', + 'title': 'ProtivoGunz - Хуёвая песня', + 'uploader': 'Noize MC', + 'duration': 195, + }, }, - }, - { - 'url': 
'http://vk.com/video4643923_163339118', - 'file': '163339118.mp4', - 'md5': 'f79bccb5cd182b1f43502ca5685b2b36', - 'info_dict': { - 'uploader': 'Elvira Dzhonik', - 'title': 'Dream Theater - Hollow Years Live at Budokan 720*', + { + 'url': 'http://vk.com/video4643923_163339118', + 'md5': 'f79bccb5cd182b1f43502ca5685b2b36', + 'info_dict': { + 'id': '163339118', + 'ext': 'mp4', + 'uploader': 'Elvira Dzhonik', + 'title': 'Dream Theater - Hollow Years Live at Budokan 720*', + 'duration': 558, + } + }, + { + 'url': 'http://vk.com/video-8871596_164049491', + 'md5': 'a590bcaf3d543576c9bd162812387666', + 'note': 'Only available for registered users', + 'info_dict': { + 'id': '164049491', + 'ext': 'mp4', + 'uploader': 'Триллеры', + 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]\u00a0', + 'duration': 8352, + }, + 'skip': 'Requires vk account credentials', } - }] + ] + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_form = { + 'act': 'login', + 'role': 'al_frame', + 'expire': '1', + 'email': username, + 'pass': password, + } + + request = compat_urllib_request.Request('https://login.vk.com/?act=login', + compat_urllib_parse.urlencode(login_form).encode('utf-8')) + login_page = self._download_webpage(request, None, note='Logging in as %s' % username) + + if re.search(r'onLoginFailed', login_page): + raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) + + def _real_initialize(self): + self._login() def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id info_page = self._download_webpage(info_url, video_id) + + if re.search(r'Please log in or <', info_page): + raise ExtractorError('This video is only available for registered users, ' + 'use --username and --password options to provide account credentials.', expected=True) + m_yt = 
re.search(r'src="(http://www.youtube.com/.*?)"', info_page) if m_yt is not None: self.to_screen(u'Youtube video detected') @@ -60,4 +111,5 @@ class VKIE(InfoExtractor): 'title': unescapeHTML(data['md_title']), 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), + 'duration': data.get('duration') } From 5544e038ab034ed7003270dbf4e557e5df79e794 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Mon, 17 Feb 2014 02:17:10 +0700 Subject: [PATCH 318/339] [vk] Add entry for netrc authentication --- youtube_dl/extractor/vk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index b8299c237..a293b8875 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -17,6 +17,7 @@ from ..utils import ( class VKIE(InfoExtractor): IE_NAME = 'vk.com' _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P.*?)(?:\?|%2F|$)' + _NETRC_MACHINE = 'vk' _TESTS = [ { From 541cb26c0d3ab273ba2d6d42608216166fc30fba Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Mon, 17 Feb 2014 02:19:55 +0700 Subject: [PATCH 319/339] [smotri] Add entry for netrc authentication --- youtube_dl/extractor/smotri.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index f249f013c..540c55703 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -20,6 +20,7 @@ class SmotriIE(InfoExtractor): IE_DESC = 'Smotri.com' IE_NAME = 'smotri' _VALID_URL = r'^https?://(?:www\.)?(?Psmotri\.com/video/view/\?id=(?Pv(?P[0-9]+)[a-z0-9]{4}))' + _NETRC_MACHINE = 'smotri' _TESTS = [ # real video id 2610366 From 2db806b4aa0047002cf4c8b1ce6e3dd79ab8ee69 Mon Sep 17 00:00:00 2001 From: "Sergey M." 
Date: Mon, 17 Feb 2014 03:46:26 +0700 Subject: [PATCH 320/339] Improve parse_duration --- test/test_utils.py | 9 +++++++++ youtube_dl/utils.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 1ca5f5af8..84553b943 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -202,7 +202,16 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('1'), 1) self.assertEqual(parse_duration('1337:12'), 80232) self.assertEqual(parse_duration('9:12:43'), 33163) + self.assertEqual(parse_duration('12:00'), 720) + self.assertEqual(parse_duration('00:01:01'), 61) self.assertEqual(parse_duration('x:y'), None) + self.assertEqual(parse_duration('3h11m53s'), 11513) + self.assertEqual(parse_duration('62m45s'), 3765) + self.assertEqual(parse_duration('6m59s'), 419) + self.assertEqual(parse_duration('49s'), 49) + self.assertEqual(parse_duration('0h0m0s'), 0) + self.assertEqual(parse_duration('0m0s'), 0) + self.assertEqual(parse_duration('0s'), 0) def test_fix_xml_ampersands(self): self.assertEqual( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index dd03f058f..a192a420a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1144,7 +1144,7 @@ def parse_duration(s): return None m = re.match( - r'(?:(?:(?P[0-9]+):)?(?P[0-9]+):)?(?P[0-9]+)$', s) + r'(?:(?:(?P[0-9]+)[:h])?(?P[0-9]+)[:m])?(?P[0-9]+)s?$', s) if not m: return None res = int(m.group('secs')) From 0f99566c01d0cebc4553836f5159bccb04c2907c Mon Sep 17 00:00:00 2001 From: "Sergey M." 
Date: Mon, 17 Feb 2014 03:47:03 +0700 Subject: [PATCH 321/339] Add one more format in unified_strdate --- youtube_dl/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a192a420a..057cd20d1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -762,6 +762,7 @@ def unified_strdate(date_str): date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) format_expressions = [ '%d %B %Y', + '%d %b %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', From ae6cae78f194ba3031c47af9d10fa8ed84f05dab Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Mon, 17 Feb 2014 03:51:03 +0700 Subject: [PATCH 322/339] [4tube] Minor changes and extract more metadata --- youtube_dl/extractor/fourtube.py | 72 ++++++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index 8e7a7e156..8db7fc6cb 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -1,21 +1,34 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor -from ..utils import compat_urllib_request +from ..utils import ( + compat_urllib_request, + unified_strdate, + str_to_int, + parse_duration, +) +from youtube_dl.utils import clean_html + class FourTubeIE(InfoExtractor): IE_NAME = '4tube' - _VALID_URL = r'(?:https?://)?www\.4tube\.com/videos/(?P\d+)/.*' + _VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P\d+)' _TEST = { - 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', - 'md5': '6516c8ac63b03de06bc8eac14362db4f', - 'info_dict': { - 'id': '209733', - 'ext': 'mp4', - 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black' - } - } + 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '209733', + 'ext': 'mp4', + 'title': 'Hot Babe 
Holly Michaels gets her ass stuffed by black', + 'uploader': 'WCP Club', + 'uploader_id': 'wcp-club', + 'upload_date': '20131031', + 'duration': 583, + } + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -26,11 +39,29 @@ class FourTubeIE(InfoExtractor): self.report_extraction(video_id) - playlist_json = self._html_search_regex(r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, u'Playlist') - media_id = self._search_regex(r'idMedia:\s*(\d+)', playlist_json, u"Media Id") - thumbnail_url = self._search_regex(r'image:\s*"([^"]*)', playlist_json, u'Thumbnail') - sources = self._search_regex(r'sources:\s*\[([^\]]*)\]', playlist_json, u'Sources').split(',') - title = self._search_regex(r'title:\s*"([^"]*)', playlist_json, u'Title') + playlist_json = self._html_search_regex(r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, 'Playlist') + media_id = self._search_regex(r'idMedia:\s*(\d+)', playlist_json, 'Media Id') + sources = self._search_regex(r'sources:\s*\[([^\]]*)\]', playlist_json, 'Sources').split(',') + title = self._search_regex(r'title:\s*"([^"]*)', playlist_json, 'Title') + thumbnail_url = self._search_regex(r'image:\s*"([^"]*)', playlist_json, 'Thumbnail', fatal=False) + + uploader_str = self._search_regex(r'Uploaded by(.*?)', webpage, 'uploader', fatal=False) + mobj = re.search(r'(?P[^<]+)', uploader_str) + (uploader, uploader_id) = (mobj.group('name'), mobj.group('id')) if mobj else (clean_html(uploader_str), None) + + upload_date = None + view_count = None + duration = None + description = self._html_search_meta('description', webpage, 'description') + if description: + upload_date = self._search_regex(r'Published Date: (\d{2} [a-zA-Z]{3} \d{4})', description, 'upload date', + fatal=False) + if upload_date: + upload_date = unified_strdate(upload_date) + view_count = self._search_regex(r'Views: ([\d,\.]+)', description, 'view count', fatal=False) + if view_count: + view_count = str_to_int(view_count) + duration = 
parse_duration(self._search_regex(r'Length: (\d+m\d+s)', description, 'duration', fatal=False)) token_url = "http://tkn.4tube.com/{0}/desktop/{1}".format(media_id, "+".join(sources)) headers = { @@ -47,11 +78,18 @@ class FourTubeIE(InfoExtractor): 'quality': int(format), } for format in sources] - return [{ + self._sort_formats(formats) + + return { 'id': video_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail_url, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'view_count': view_count, + 'duration': duration, 'age_limit': 18, 'webpage_url': webpage_url, - }] + } \ No newline at end of file From 6d784e87f489eb754ffdc5be962c98fc8edfc395 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Mon, 17 Feb 2014 09:03:28 +0700 Subject: [PATCH 323/339] Credit @prutz1311 for normalboots.com (#2279) --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index e81366851..73844fbbb 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -41,6 +41,7 @@ __authors__ = ( 'Chris Gahan', 'Saimadhav Heblikar', 'Mike Col', + 'Oleg Prutz', 'Andreas Schmitz', ) From 0cea52cc183fbd5d9616b49ce49cb479ed9aad43 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Mon, 17 Feb 2014 09:07:36 +0700 Subject: [PATCH 324/339] Credit @pulpe for play.iprima.cz and stream.cz --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 73844fbbb..055fd7029 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -42,6 +42,7 @@ __authors__ = ( 'Saimadhav Heblikar', 'Mike Col', 'Oleg Prutz', + 'pulpe', 'Andreas Schmitz', ) From cbffec0c95043656a12b9900507bb8dc34b35e0b Mon Sep 17 00:00:00 2001 From: "Sergey M." 
Date: Mon, 17 Feb 2014 09:08:38 +0700 Subject: [PATCH 325/339] Credit @patheticpat for 4tube.com (#2398) --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 055fd7029..57aaff5da 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -44,6 +44,7 @@ __authors__ = ( 'Oleg Prutz', 'pulpe', 'Andreas Schmitz', + 'Michael Kaiser', ) __license__ = 'Public Domain' From 960f3171713cc2cdea3414f6dba500584a065ad8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 17 Feb 2014 11:32:30 +0100 Subject: [PATCH 326/339] [helsinki] Simplify --- youtube_dl/extractor/helsinki.py | 53 +++++++++++++++++++------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/helsinki.py b/youtube_dl/extractor/helsinki.py index 2a54f3cca..5268efa49 100644 --- a/youtube_dl/extractor/helsinki.py +++ b/youtube_dl/extractor/helsinki.py @@ -8,44 +8,55 @@ from .common import InfoExtractor class HelsinkiIE(InfoExtractor): + IE_DESC = 'helsinki.fi' _VALID_URL = r'https?://video\.helsinki\.fi/Arkisto/flash\.php\?id=(?P\d+)' _TEST = { 'url': 'http://video.helsinki.fi/Arkisto/flash.php?id=20258', - 'md5': 'cd829201b890905682eb194cbdea55d7', 'info_dict': { 'id': '20258', 'ext': 'mp4', 'title': 'Tietotekniikkafoorumi-iltapäivä', + 'description': 'md5:f5c904224d43c133225130fe156a5ee0', + }, + 'params': { + 'skip_download': True, # RTMP } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - vid = mobj.group('id') - webpage = self._download_webpage(url, vid) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) formats = [] - mobj = re.search('file=((\w+):[^&]+)', webpage) - if mobj: formats.append({ - 'ext': mobj.group(2), - 'play_path': mobj.group(1), - 'url': 'rtmp://flashvideo.it.helsinki.fi/vod/', - 'player_url': 'http://video.helsinki.fi/player.swf', - 'format_note': 'sd' - }) - mobj = re.search('hd\.file=((\w+):[^&]+)', 
webpage) - if mobj: formats.append({ - 'ext': mobj.group(2), - 'play_path': mobj.group(1), - 'url': 'rtmp://flashvideo.it.helsinki.fi/vod/', - 'player_url': 'http://video.helsinki.fi/player.swf', - 'format_note': 'hd' - }) + mobj = re.search(r'file=((\w+):[^&]+)', webpage) + if mobj: + formats.append({ + 'ext': mobj.group(2), + 'play_path': mobj.group(1), + 'url': 'rtmp://flashvideo.it.helsinki.fi/vod/', + 'player_url': 'http://video.helsinki.fi/player.swf', + 'format_note': 'sd', + 'quality': 0, + }) + + mobj = re.search(r'hd\.file=((\w+):[^&]+)', webpage) + if mobj: + formats.append({ + 'ext': mobj.group(2), + 'play_path': mobj.group(1), + 'url': 'rtmp://flashvideo.it.helsinki.fi/vod/', + 'player_url': 'http://video.helsinki.fi/player.swf', + 'format_note': 'hd', + 'quality': 1, + }) + + self._sort_formats(formats) return { - 'id': vid, + 'id': video_id, 'title': self._og_search_title(webpage).replace('Video: ', ''), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), - 'formats': formats + 'formats': formats, } From 96d1637082e29ac0b438638ae37466f546717319 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 17 Feb 2014 11:33:01 +0100 Subject: [PATCH 327/339] Credit @Nikerabbit for helsinki --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 57aaff5da..f843036c7 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -45,6 +45,7 @@ __authors__ = ( 'pulpe', 'Andreas Schmitz', 'Michael Kaiser', + 'Niklas Laxström', ) __license__ = 'Public Domain' From a0dfcdce5ef63769c887e759e331bb371f3c68e2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 17 Feb 2014 11:33:13 +0100 Subject: [PATCH 328/339] release 2014.02.17 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cfcadd3d1..a9fead95d 100644 --- 
a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.02.13' +__version__ = '2014.02.17' From ad5976b4d90da6921a5e72603f3b73c4597e6138 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 17 Feb 2014 11:44:24 +0100 Subject: [PATCH 329/339] [vimeo] Modernize test definition --- youtube_dl/extractor/vimeo.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4bc262049..c5ee84807 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -37,13 +37,14 @@ class VimeoIE(SubtitlesInfoExtractor): _TESTS = [ { 'url': 'http://vimeo.com/56015672#at=0', - 'file': '56015672.mp4', 'md5': '8879b6cc097e987f02484baf890129e5', 'info_dict': { - "upload_date": "20121220", - "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", - "uploader_id": "user7108434", - "uploader": "Filippo Valsorda", + 'id': '56015672', + 'ext': 'mp4', + "upload_date": "20121220", + "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", + "uploader_id": "user7108434", + "uploader": "Filippo Valsorda", "title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", }, }, From 2eb5d315d49b51f3594cda27f44773e826c6cccb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 17 Feb 2014 14:56:21 +0100 Subject: [PATCH 330/339] [youtube] Match more truncated URLs (Closes #2402) --- test/test_all_urls.py | 3 +++ youtube_dl/extractor/youtube.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index aa8e4e4bd..7a78005a3 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -68,6 +68,9 @@ class TestAllURLsMatching(unittest.TestCase): 
def test_youtube_show_matching(self): self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show']) + def test_youtube_truncated(self): + self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url']) + def test_justin_tv_channelid_matching(self): self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv")) self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv")) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a81036843..059cf8cbd 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1815,7 +1815,7 @@ class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list _VALID_URL = r'''(?x) - (?:https?://)?[^/]+/watch\?feature=[a-z_]+$| + (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$| (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$ ''' From 09c4d50944c6450efef7bd38a3de35369b6099c7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 17 Feb 2014 14:58:39 +0100 Subject: [PATCH 331/339] Fix indenting in README --- README.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 35876d979..bd091be86 100644 --- a/README.md +++ b/README.md @@ -281,12 +281,14 @@ Videos can be filtered by their upload date using the options `--date`, `--dateb Examples: - $ # Download only the videos uploaded in the last 6 months - $ youtube-dl --dateafter now-6months - $ # Download only the videos uploaded on January 1, 1970 - $ youtube-dl --date 19700101 - $ # will only download the videos uploaded in the 200x decade - $ youtube-dl --dateafter 20000101 --datebefore 20091231 + # Download only the videos uploaded in the last 6 months + $ youtube-dl --dateafter now-6months + + # Download only the videos uploaded on January 1, 1970 + $ youtube-dl --date 19700101 + + $ # will only download the videos uploaded in the 200x decade + $ youtube-dl --dateafter 20000101 
--datebefore 20091231 # FAQ From c7f0177fa73e5efe2b3b961d63e5784f5882db21 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Tue, 18 Feb 2014 00:26:12 +0700 Subject: [PATCH 332/339] [bbccouk] Skip test --- youtube_dl/extractor/bbccouk.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 6d785c0bf..57fdd00e4 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -38,7 +38,8 @@ class BBCCoUkIE(SubtitlesInfoExtractor): 'params': { # rtmp download 'skip_download': True, - } + }, + 'skip': 'Episode is no longer available on BBC iPlayer Radio', }, { 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/', From f7000f3a1b4df82e8fed886ce2f8b11bea7c86ab Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Wed, 19 Feb 2014 02:00:54 +0700 Subject: [PATCH 333/339] [youtube] Add support for yourepeat.com URLs (Closes #2397) --- youtube_dl/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 059cf8cbd..02c5ede74 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -138,13 +138,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| (?:www\.)?deturl\.com/www\.youtube\.com/| (?:www\.)?pwnyoutube\.com/| + (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ |(?: # or the v= param in all its forms - (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) + (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? 
# preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) v= From f2d0fc682370cd4e4cb8b32f66a26901fc47dfe6 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Wed, 19 Feb 2014 06:46:14 +0700 Subject: [PATCH 334/339] [bbccouk] Replace test This older episode is from 1994 and hopefully won't get deleted. --- youtube_dl/extractor/bbccouk.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 57fdd00e4..69d128974 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -13,13 +13,13 @@ class BBCCoUkIE(SubtitlesInfoExtractor): _TESTS = [ { - 'url': 'http://www.bbc.co.uk/programmes/p01q7wz1', + 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', 'info_dict': { - 'id': 'p01q7wz4', + 'id': 'b039d07m', 'ext': 'flv', - 'title': 'Friction: Blu Mar Ten guest mix: Blu Mar Ten - Guest Mix', - 'description': 'Blu Mar Ten deliver a Guest Mix for Friction.', - 'duration': 1936, + 'title': 'Kaleidoscope: Leonard Cohen', + 'description': 'md5:db4755d7a665ae72343779f7dacb402c', + 'duration': 1740, }, 'params': { # rtmp download From f6f01ea17b553dfb78e4305a665c966db342fcdb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 19 Feb 2014 01:04:24 +0100 Subject: [PATCH 335/339] [space] modernize --- youtube_dl/extractor/space.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py index 4a3e52ad8..d34aefeaa 100644 --- a/youtube_dl/extractor/space.py +++ b/youtube_dl/extractor/space.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -8,14 +10,14 @@ from ..utils import RegexNotFoundError, ExtractorError class SpaceIE(InfoExtractor): _VALID_URL = 
r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P[^/\.\?]*?)-video\.html' _TEST = { - u'add_ie': ['Brightcove'], - u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', - u'info_dict': { - u'id': u'2780937028001', - u'ext': u'mp4', - u'title': u'Huge Martian Landforms\' Detail Revealed By European Probe | Video', - u'description': u'md5:db81cf7f3122f95ed234b631a6ea1e61', - u'uploader': u'TechMedia Networks', + 'add_ie': ['Brightcove'], + 'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', + 'info_dict': { + 'id': '2780937028001', + 'ext': 'mp4', + 'title': 'Huge Martian Landforms\' Detail Revealed By European Probe | Video', + 'description': 'md5:db81cf7f3122f95ed234b631a6ea1e61', + 'uploader': 'TechMedia Networks', }, } From a9c7198a0b3f6a94c3f3132f8ea5e7585478f344 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 19 Feb 2014 01:06:16 +0100 Subject: [PATCH 336/339] [testurl] Add extractor This is a pseudo extractor that can be used to quickly look up test URLs, or test without the test harness. 
--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/testurl.py | 66 ++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 youtube_dl/extractor/testurl.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 725371883..e35287f88 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -216,6 +216,7 @@ from .sztvhu import SztvHuIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .testurl import TestURLIE from .tf1 import TF1IE from .theplatform import ThePlatformIE from .thisav import ThisAVIE diff --git a/youtube_dl/extractor/testurl.py b/youtube_dl/extractor/testurl.py new file mode 100644 index 000000000..bdc6e2064 --- /dev/null +++ b/youtube_dl/extractor/testurl.py @@ -0,0 +1,66 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class TestURLIE(InfoExtractor): + """ Allows adressing of the test cases as test:yout.*be_1 """ + + IE_DESC = False # Do not list + _VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$' + + def _real_extract(self, url): + from ..extractor import gen_extractors + + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + extractor_id = mobj.group('extractor') + all_extractors = gen_extractors() + + rex = re.compile(extractor_id, flags=re.IGNORECASE) + matching_extractors = [ + e for e in all_extractors if rex.search(e.IE_NAME)] + + if len(matching_extractors) == 0: + raise ExtractorError( + 'No extractors matching %r found' % extractor_id, + expected=True) + elif len(matching_extractors) > 1: + # Is it obvious which one to pick? 
+ try: + extractor = next( + ie for ie in matching_extractors + if ie.IE_NAME.lower() == extractor_id.lower()) + except StopIteration: + raise ExtractorError( + ('Found multiple matching extractors: %s' % + ' '.join(ie.IE_NAME for ie in matching_extractors)), + expected=True) + + num_str = mobj.group('num') + num = int(num_str) if num_str else 0 + + testcases = [] + t = getattr(extractor, '_TEST', None) + if t: + testcases.append(t) + testcases.extend(getattr(extractor, '_TESTS', [])) + + try: + tc = testcases[num] + except IndexError: + raise ExtractorError( + ('Test case %d not found, got only %d tests' % + (num, len(testcases))), + expected=True) + + self.to_screen('Test URL: %s' % tc['url']) + + return { + '_type': 'url', + 'url': tc['url'], + 'id': video_id, + } From c3771105396289799b0ce1f35228728894aff92d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 19 Feb 2014 01:08:16 +0100 Subject: [PATCH 337/339] release 2014.02.19 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a9fead95d..9e1c8a339 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.02.17' +__version__ = '2014.02.19' From 572a89cc4e4a4ff24ed68617b64c7d91cc82f716 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 19 Feb 2014 01:27:12 +0100 Subject: [PATCH 338/339] [liveleak] Add support for prochan embeds (Fixes #2406) --- youtube_dl/extractor/liveleak.py | 55 ++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 0a700d663..8e50e8f79 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -4,15 +4,17 @@ import json import re from .common import InfoExtractor +from ..utils import int_or_none class LiveLeakIE(InfoExtractor): _VALID_URL = 
r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)' _TESTS = [{ 'url': 'http://www.liveleak.com/view?i=757_1364311680', - 'file': '757_1364311680.mp4', 'md5': '0813c2430bea7a46bf13acf3406992f4', 'info_dict': { + 'id': '757_1364311680', + 'ext': 'mp4', 'description': 'extremely bad day for this guy..!', 'uploader': 'ljfriel2', 'title': 'Most unlucky car accident' @@ -20,25 +22,62 @@ class LiveLeakIE(InfoExtractor): }, { 'url': 'http://www.liveleak.com/view?i=f93_1390833151', - 'file': 'f93_1390833151.mp4', 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf', 'info_dict': { + 'id': 'f93_1390833151', + 'ext': 'mp4', 'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.', 'uploader': 'ARD_Stinkt', 'title': 'German Television does first Edward Snowden Interview (ENGLISH)', } + }, + { + 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', + 'md5': '42c6d97d54f1db107958760788c5f48f', + 'info_dict': { + 'id': '4f7_1392687779', + 'ext': 'mp4', + 'description': "The guy with the cigarette seems amazingly nonchalant about the whole thing... 
I really hope my friends' reactions would be a bit stronger.\r\n\r\nAction-go to 0:55.", + 'uploader': 'CapObveus', + 'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck', + 'age_limit': 18, + } }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') webpage = self._download_webpage(url, video_id) + + video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip() + video_description = self._og_search_description(webpage) + video_uploader = self._html_search_regex( + r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False) + age_limit = int_or_none(self._search_regex( + r'you confirm that you are ([0-9]+) years and over.', + webpage, 'age limit', default=None)) + sources_raw = self._search_regex( r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) if sources_raw is None: - sources_raw = '[{ %s}]' % ( - self._search_regex(r'(file: ".*?"),', webpage, 'video URL')) + alt_source = self._search_regex( + r'(file: ".*?"),', webpage, 'video URL', default=None) + if alt_source: + sources_raw = '[{ %s}]' % alt_source + else: + # Maybe an embed? 
+ embed_url = self._search_regex( + r'<iframe[^>]+src="(http://www.prochan.com/embed\?[^"]+)"', + webpage, 'embed URL') + return { + '_type': 'url_transparent', + 'url': embed_url, + 'id': video_id, + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader, + 'age_limit': age_limit, + } sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw) sources = json.loads(sources_json) @@ -49,15 +88,11 @@ class LiveLeakIE(InfoExtractor): } for s in sources] self._sort_formats(formats) - video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip() - video_description = self._og_search_description(webpage) - video_uploader = self._html_search_regex( - r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False) - return { 'id': video_id, 'title': video_title, 'description': video_description, 'uploader': video_uploader, 'formats': formats, + 'age_limit': age_limit, } From 882907a8183fd026106de2159b5a2edb7f65ce22 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 19 Feb 2014 01:27:22 +0100 Subject: [PATCH 339/339] release 2014.02.19.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9e1c8a339..b7ea461c3 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.02.19' +__version__ = '2014.02.19.1'