From f870544302f75bee0d96f6a8623c8ff270beca89 Mon Sep 17 00:00:00 2001 From: fnord Date: Mon, 13 Jul 2015 07:41:38 -0500 Subject: [PATCH 0001/1286] Add support for democracynow.org Supports downloading clips or entire shows. Subtitle support --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/democracynow.py | 100 +++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 youtube_dl/extractor/democracynow.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index cbaa07391..5cc03b875 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -112,6 +112,7 @@ from .daum import DaumIE from .dbtv import DBTVIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE +from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py new file mode 100644 index 000000000..1c9b36052 --- /dev/null +++ b/youtube_dl/extractor/democracynow.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import time +import hmac +import hashlib +import itertools +import re +from ..utils import ( + ExtractorError, + int_or_none, + parse_age_limit, + parse_iso8601, +) +from ..compat import compat_urllib_request +from .common import InfoExtractor + + +class DemocracynowIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?democracynow.org/?(?P[^\?]*)' + IE_NAME = 'democracynow' + _TESTS = [{ + 'url': 'http://www.democracynow.org/shows/2015/7/3', + 'info_dict': { + 'id': '2015-0703-001', + 'ext': 'mp4', + 'title': 'July 03, 2015 - Democracy Now!', + 'description': 'A daily independent global news hour with Amy Goodman & Juan Gonz\xe1lez "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs', + 'uploader': 'Democracy Now', + 'upload_date': None, + }, + },{ + 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', + 'info_dict': { + 'id': '2015-0703-001', + 'ext': 'mp4', + 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag', + 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21', + 'uploader': 'Democracy Now', + 'upload_date': None, + }, + + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + base_host = re.search(r'^(.+?://[^/]+)', url).group(1) + if display_id == '': + display_id = 'home' + webpage = self._download_webpage(url, display_id) + re_desc = re.search(r'[^/]+)/(?:dn)?(?P[^/]+?)\.(?P[^\.\?]+)(?P\?|$)',url) + if video_id == None: + video_id = purl.group('fn') + if js.get('start') != None: + url += '&' if purl.group('hasparams') == '?' else '?' 
+ url = url + 'start='+str(js.get('start')) + formats.append({ + 'format_id': purl.group('dir'), + 'ext': purl.group('ext'), + 'url': url, + }) + self._sort_formats(formats) + ret = { + 'id': video_id, + 'title': js.get('title'), + 'description': description, + 'uploader': 'Democracy Now', +# 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'formats': formats, + } + return ret +# \ No newline at end of file From eb08081330f5ef52d66140589137ae1bb05eee5f Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:57:08 -0500 Subject: [PATCH 0002/1286] democracynow: correct syntax --- youtube_dl/extractor/democracynow.py | 43 +++++++++------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 1c9b36052..973bb437b 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -1,19 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import time -import hmac -import hashlib -import itertools import re -from ..utils import ( - ExtractorError, - int_or_none, - parse_age_limit, - parse_iso8601, -) -from ..compat import compat_urllib_request from .common import InfoExtractor @@ -30,7 +18,7 @@ class DemocracynowIE(InfoExtractor): 'uploader': 'Democracy Now', 'upload_date': None, }, - },{ + }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', 'info_dict': { 'id': '2015-0703-001', @@ -40,7 +28,6 @@ class DemocracynowIE(InfoExtractor): 'uploader': 'Democracy Now', 'upload_date': None, }, - }] def _real_extract(self, url): @@ -49,7 +36,7 @@ class DemocracynowIE(InfoExtractor): if display_id == '': display_id = 'home' webpage = self._download_webpage(url, display_id) - re_desc = re.search(r'[^/]+)/(?:dn)?(?P[^/]+?)\.(?P[^\.\?]+)(?P\?|$)',url) - if video_id == None: + purl = re.search(r'/(?P[^/]+)/(?:dn)?(?P[^/]+?)\.(?P[^\.\?]+)(?P\?|$)', url) + if video_id is None: video_id = purl.group('fn') - if js.get('start') != None: + if js.get('start') is not None: url += '&' if purl.group('hasparams') == '?' else '?' 
- url = url + 'start='+str(js.get('start')) + url = url + 'start=' + str(js.get('start')) formats.append({ 'format_id': purl.group('dir'), 'ext': purl.group('ext'), @@ -92,9 +79,7 @@ class DemocracynowIE(InfoExtractor): 'title': js.get('title'), 'description': description, 'uploader': 'Democracy Now', -# 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, } return ret -# \ No newline at end of file From f57f84f606b246db4f102fc5bc55e64e4f7a3d60 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 16:38:40 -0500 Subject: [PATCH 0003/1286] Twitter: get and describe video from status urls --- youtube_dl/extractor/twitter.py | 44 +++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1aaa06305..a65252cc6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -70,3 +70,47 @@ class TwitterCardIE(InfoExtractor): 'duration': duration, 'formats': formats, } + + +class TwitterIE(TwitterCardIE): + _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P[^/]+/status/\d+)' + + _TESTS = [{ + 'url': 'https://m.twitter.com/thereaIbanksy/status/614301758345490432', + 'md5': '8bbccb487bd7a31349b775915fcd412f', + 'info_dict': { + 'id': '614301758345490432', + 'ext': 'mp4', + 'title': 'thereaIbanksy - This time lapse is so pretty \U0001f60d\U0001f60d', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 29.5, + 'description': 'banksy on Twitter: "This time lapse is so pretty \U0001f60d\U0001f60d http://t.co/QB8DDbqiR1"', + 'uploader': 'banksy', + 'uploader_id': 'thereaIbanksy', + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + username, twid = re.match(r'([^/]+)/status/(\d+)', id).groups() + name = username + url = re.sub(r'https?://(m|mobile)\.', 'https://', url) + webpage = self._download_webpage(url, 'tweet: ' + url) + description = unescapeHTML(self._search_regex('\s*(.+?)\s*', webpage, 'title')) + title = description.replace('\n', ' ') + splitdesc = re.match(r'^(.+?)\s*on Twitter:\s* "(.+?)"$', title) + if splitdesc: + name, title = splitdesc.groups() + title = re.sub(r'\s*https?://[^ ]+', '', title) # strip 'https -_t.co_BJYgOjSeGA' junk from filenames + card_id = self._search_regex(r'["\']/i/cards/tfw/v1/(\d+)', webpage, '/i/card/...') + card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id + return { + '_type': 'url_transparent', + 'ie_key': 'TwitterCard', + 'uploader_id': username, + 'uploader': name, + 'url': card_url, + 'webpage_url': url, + 'description': description, + 'title': username + ' - ' + title, + } From c3dea3f878133f3cbdad9e548609d3077572af66 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 16:45:36 -0500 Subject: [PATCH 0004/1286] Twittercard: support vmapurl method --- youtube_dl/extractor/twitter.py | 47 ++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index a65252cc6..1dd43ff3c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -12,17 +12,30 @@ from ..utils import ( class TwitterCardIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P\d+)' - _TEST = { - 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', - 'md5': 'a74f50b310c83170319ba16de6955192', - 'info_dict': { - 'id': '560070183650213889', - 'ext': 'mp4', - 'title': 'TwitterCard', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 30.033, + 
_TESTS = [ + { + 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', + 'md5': 'a74f50b310c83170319ba16de6955192', + 'info_dict': { + 'id': '560070183650213889', + 'ext': 'mp4', + 'title': 'TwitterCard', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 30.033, + } }, - } + { + 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', + 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8', + 'info_dict': { + 'id': '623160978427936768', + 'ext': 'mp4', + 'title': 'TwitterCard', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 80.155, + }, + } + ] def _real_extract(self, url): video_id = self._match_id(url) @@ -44,6 +57,20 @@ class TwitterCardIE(InfoExtractor): unescapeHTML(self._search_regex( r'data-player-config="([^"]+)"', webpage, 'data player config')), video_id) + if 'playlist' not in config: + if 'vmapUrl' in config: + webpage = self._download_webpage(config['vmapUrl'], video_id + ' (xml)') + video_url = self._search_regex( + r'\s*', webpage, 'data player config (xml)') + f = { + 'url': video_url, + } + ext = re.search(r'\.([a-z0-9]{2,4})(\?.+)?$', video_url) + if ext: + f['ext'] = ext.group(1) + formats.append(f) + break # same video regardless of UA + continue video_url = config['playlist'][0]['source'] From 9e7e0dffd5e3e3c959e8d99a5e236b9099886fe9 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 16:56:35 -0500 Subject: [PATCH 0005/1286] Actually add the extractor --- youtube_dl/extractor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 50da08830..5c03bf8e8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -651,7 +651,7 @@ from .twitch import ( TwitchBookmarksIE, TwitchStreamIE, ) -from .twitter import TwitterCardIE +from .twitter import TwitterCardIE, TwitterIE from .ubu import UbuIE from .udemy import ( UdemyIE, From 984e4d487520bd2a860b31b3165416c879b28096 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 24 Jun 2015 01:13:23 +0100 Subject: [PATCH 0006/1286] [googledrive] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/googledrive.py | 106 ++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 youtube_dl/extractor/googledrive.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3cfa804ec..6655d7eb5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -209,6 +209,7 @@ from .globo import GloboIE from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE +from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py new file mode 100644 index 000000000..8c611fa47 --- /dev/null +++ b/youtube_dl/extractor/googledrive.py @@ -0,0 +1,106 @@ +from .common import InfoExtractor +from ..utils import RegexNotFoundError + +class GoogleDriveIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P.+?)(?:&|/|$)' + _TEST = { + 'url': 'https://drive.google.com/file/d/0BzpExh0WzJF0NlR5WUlxdEVsY0U/edit?pli=1', + 'info_dict': { + 'id': '0BzpExh0WzJF0NlR5WUlxdEVsY0U', + 'ext': 'mp4', + 'title': '[AHSH] Fairy Tail S2 - 01 [720p].mp4', + } + } + _formats = { + '5': {'ext': 'flv'}, + '6': {'ext': 
'flv'}, + '13': {'ext': '3gp'}, + '17': {'ext': '3gp'}, + '18': {'ext': 'mp4'}, + '22': {'ext': 'mp4'}, + '34': {'ext': 'flv'}, + '35': {'ext': 'flv'}, + '36': {'ext': '3gp'}, + '37': {'ext': 'mp4'}, + '38': {'ext': 'mp4'}, + '43': {'ext': 'webm'}, + '44': {'ext': 'webm'}, + '45': {'ext': 'webm'}, + '46': {'ext': 'webm'}, + '59': {'ext': 'mp4'} + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://docs.google.com/file/d/'+video_id, video_id, encoding='unicode_escape' + ) + try: + title = self._html_search_regex( + r'"title","(?P.*?)"', + webpage, + 'title', + group='title' + ) + fmt_stream_map = self._html_search_regex( + r'"fmt_stream_map","(?P<fmt_stream_map>.*?)"', + webpage, + 'fmt_stream_map', + group='fmt_stream_map' + ) + fmt_list = self._html_search_regex( + r'"fmt_list","(?P<fmt_list>.*?)"', + webpage, + 'fmt_list', + group='fmt_list' + ) +# timestamp = self._html_search_regex( +# r'"timestamp","(?P<timestamp>.*?)"', +# webpage, +# 'timestamp', +# group='timestamp' +# ) + length_seconds = self._html_search_regex( + r'"length_seconds","(?P<length_seconds>.*?)"', + webpage, + 'length_seconds', + group='length_seconds' + ) + except RegexNotFoundError: + try: + reason = self._html_search_regex( + r'"reason","(?P<reason>.*?)"', + webpage, + 'reason', + group='reason' + ) + self.report_warning(reason) + return + except RegexNotFoundError: + self.report_warning('not a video') + return + + fmt_stream_map = fmt_stream_map.split(',') + fmt_list = fmt_list.split(',') + formats = [] + for i in range(len(fmt_stream_map)): + fmt_id, fmt_url = fmt_stream_map[i].split('|') + resolution = fmt_list[i].split('/')[1] + width, height = resolution.split('x') + formats.append({ + 'url': fmt_url, + 'format_id': fmt_id, + 'resolution': resolution, + 'width': int(width), + 'height': int(height), + 'ext': self._formats[fmt_id]['ext'] + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, +# 'timestamp': int(timestamp), + 'duration': int(length_seconds), + 'formats': formats + } From f120a7ab5e9c560a8114f9662e2f213243a945b0 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 24 Jun 2015 14:56:19 +0100 Subject: [PATCH 0007/1286] change the _TEST info --- youtube_dl/extractor/googledrive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 8c611fa47..e3d5c3418 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,11 +4,11 @@ from ..utils import RegexNotFoundError class GoogleDriveIE(InfoExtractor): _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P<id>.+?)(?:&|/|$)' _TEST = { - 'url': 'https://drive.google.com/file/d/0BzpExh0WzJF0NlR5WUlxdEVsY0U/edit?pli=1', + 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'info_dict': { - 'id': '0BzpExh0WzJF0NlR5WUlxdEVsY0U', + 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', - 'title': '[AHSH] Fairy Tail S2 - 01 [720p].mp4', + 'title': 'Big Buck Bunny.mp4', } } _formats = { From 3e5f3df1729846a33631dd38a887cd1d81a727c1 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 29 Jun 2015 07:53:21 +0100 Subject: [PATCH 0008/1286] move the embed to a separate class --- youtube_dl/extractor/googledrive.py | 31 ++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff 
--git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index e3d5c3418..ac891b275 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -1,8 +1,37 @@ +import re + from .common import InfoExtractor from ..utils import RegexNotFoundError +class GoogleDriveEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)' + _TEST = { + 'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview', + 'info_dict': { + 'id': '0B8KB9DRosYGKMXNoeWxqa3JYclE', + 'ext': 'mp4', + 'title': 'Jimmy Fallon Sings Since You\'ve Been Gone.wmv', + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)', + webpage) + if mobj: + return 'https://drive.google.com/file/d/%s' % mobj.group('id') + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url', + 'ie-key': 'GoogleDrive', + 'url': 'https://drive.google.com/file/d/%s' % video_id + } + class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P<id>.+?)(?:&|/|$)' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9-]{28})' _TEST = { 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'info_dict': { From 2d651a2d02885cddf1752b45497e9113d3a3d403 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 29 Jun 2015 07:55:44 +0100 Subject: [PATCH 0009/1286] import google drive embed class --- youtube_dl/extractor/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6655d7eb5..02e18a0da 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -209,7 +209,10 @@ from .globo import GloboIE from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE -from .googledrive import GoogleDriveIE +from .googledrive import ( + GoogleDriveEmbedIE, + GoogleDriveIE, +) from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE From 653789afc72d1a225b971541fb633dd768d58942 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 29 Jun 2015 08:01:30 +0100 Subject: [PATCH 0010/1286] add google drive embeds --- youtube_dl/extractor/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6d2efb22e..3f7b094db 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -48,6 +48,7 @@ from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE +from .googledrive import GoogleDriveEmbedIE class GenericIE(InfoExtractor): @@ -1599,6 +1600,11 @@ class GenericIE(InfoExtractor): if nbc_sports_url: return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + # Look for Google Drive embeds + google_drive_url = GoogleDriveEmbedIE._extract_url(webpage) + if google_drive_url: + return self.url_result(google_drive_url, 'GoogleDrive') + # Look for UDN embeds mobj = re.search( 
r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage) From 3b3d531965f0f36c20f5fa8557481c144170653f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 17 Jul 2015 14:17:19 +0100 Subject: [PATCH 0011/1286] fix embed regex --- youtube_dl/extractor/googledrive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index ac891b275..c82c9037f 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,7 +4,7 @@ from .common import InfoExtractor from ..utils import RegexNotFoundError class GoogleDriveEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)' + _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})' _TEST = { 'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview', 'info_dict': { @@ -17,7 +17,7 @@ class GoogleDriveEmbedIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)', + r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', webpage) if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id') @@ -31,7 +31,7 @@ class GoogleDriveEmbedIE(InfoExtractor): } class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9-]{28})' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9_-]{28})' _TEST = { 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'info_dict': { From d1cc05e17eccccb7ee6473574c6a4f887104baeb Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 17 Jul 2015 14:37:21 +0100 Subject: [PATCH 0012/1286] remove unnecessary regex group names --- youtube_dl/extractor/googledrive.py | 32 ++++++++++++----------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index c82c9037f..6d9bcfefd 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -62,46 +62,40 @@ class GoogleDriveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://docs.google.com/file/d/'+video_id, video_id, encoding='unicode_escape' + 'http://docs.google.com/file/d/' + video_id, video_id, encoding='unicode_escape' ) try: title = self._html_search_regex( - r'"title","(?P<title>.*?)"', + r'"title"\s+,\s+"[^"]+', webpage, - 'title', - group='title' + 'title' ) fmt_stream_map = self._html_search_regex( - r'"fmt_stream_map","(?P<fmt_stream_map>.*?)"', + r'"fmt_stream_map"\s+,\s+"[^"]+', webpage, - 'fmt_stream_map', - group='fmt_stream_map' + 'fmt_stream_map' ) fmt_list = self._html_search_regex( - r'"fmt_list","(?P<fmt_list>.*?)"', + r'"fmt_list"\s+,\s+"[^"]+', webpage, - 'fmt_list', - group='fmt_list' + 'fmt_list' ) # timestamp = self._html_search_regex( -# r'"timestamp","(?P<timestamp>.*?)"', +# r'"timestamp"\s+,\s+"[^"]+', # webpage, -# 'timestamp', -# group='timestamp' +# 'timestamp' # ) length_seconds = self._html_search_regex( - 
r'"length_seconds","(?P<length_seconds>.*?)"', + r'"length_seconds"\s+,\s+"[^"]+', webpage, - 'length_seconds', - group='length_seconds' + 'length_seconds' ) except RegexNotFoundError: try: reason = self._html_search_regex( - r'"reason","(?P<reason>.*?)"', + r'"reason","[^"]+', webpage, - 'reason', - group='reason' + 'reason' ) self.report_warning(reason) return From 36dbca87848fc5698d3e0b89380c7bcec741ceaf Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 17 Jul 2015 14:52:01 +0100 Subject: [PATCH 0013/1286] fix recursive error --- youtube_dl/extractor/googledrive.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 6d9bcfefd..a3d9b4450 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -26,7 +26,7 @@ class GoogleDriveEmbedIE(InfoExtractor): video_id = self._match_id(url) return { '_type': 'url', - 'ie-key': 'GoogleDrive', + 'ie_key': 'GoogleDrive', 'url': 'https://drive.google.com/file/d/%s' % video_id } @@ -66,34 +66,34 @@ class GoogleDriveIE(InfoExtractor): ) try: title = self._html_search_regex( - r'"title"\s+,\s+"[^"]+', + r'"title"\s*,\s*"([^"]+)', webpage, 'title' ) fmt_stream_map = self._html_search_regex( - r'"fmt_stream_map"\s+,\s+"[^"]+', + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt_stream_map' ) fmt_list = self._html_search_regex( - r'"fmt_list"\s+,\s+"[^"]+', + r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list' ) # timestamp = self._html_search_regex( -# r'"timestamp"\s+,\s+"[^"]+', +# r'"timestamp"\s*,\s*"([^"]+)', # webpage, # 'timestamp' # ) length_seconds = self._html_search_regex( - r'"length_seconds"\s+,\s+"[^"]+', + r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length_seconds' ) except RegexNotFoundError: try: reason = self._html_search_regex( - r'"reason","[^"]+', + r'"reason","([^"]+)', webpage, 'reason' ) From 8e92d21ebf6f17e14c9e916f22e49f27529556af Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 18 Jul 2015 23:31:14 +0100 Subject: [PATCH 0014/1286] [googledrive] raise ExtractorError instead of warning --- youtube_dl/extractor/googledrive.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index a3d9b4450..7bc7b7a0d 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -1,7 +1,10 @@ import re from .common import InfoExtractor -from ..utils import RegexNotFoundError +from ..utils import ( + RegexNotFoundError, + ExtractorError, +) class GoogleDriveEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})' @@ -97,10 +100,10 @@ class GoogleDriveIE(InfoExtractor): webpage, 'reason' ) - self.report_warning(reason) + raise ExtractorError(reason) return except RegexNotFoundError: - self.report_warning('not a video') + raise ExtractorError('not a video') return fmt_stream_map = fmt_stream_map.split(',') From 9f4921bfa0ce3a48d2f93b4946f361116cfde5e9 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 3 Sep 2015 00:29:53 +0100 Subject: [PATCH 0015/1286] [dcn] add show extraction and support for other types of urls --- youtube_dl/extractor/__init__.py | 6 ++- youtube_dl/extractor/dcn.py | 81 ++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 4 deletions(-) diff --git 
a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 39b05ce8f..d4a3e8ab0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -118,7 +118,11 @@ from .dailymotion import ( ) from .daum import DaumIE from .dbtv import DBTVIE -from .dcn import DCNIE +from .dcn import ( + DCNGeneralIE, + DCNVideoIE, + DCNShowIE, +) from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .dfb import DFBIE diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 82261e25c..352d35c7a 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_urllib_parse, @@ -12,10 +14,33 @@ from ..utils import ( ) -class DCNIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)' +class DCNGeneralIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' + + def _real_extract(self, url): + show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() + url = '' + ie_key = '' + if video_id and int(video_id) > 0: + url = 'http://www.dcndigital.ae/#/media/%s' % video_id + ie_key = 'DCNVideo' + else: + ie_key = 'DCNShow' + if season_id and int(season_id) > 0: + url = 'http://www.dcndigital.ae/#/program/season/%s' % season_id + else: + url = 'http://www.dcndigital.ae/#/program/%s' % show_id + return { + 'url': url, + '_type': 'url', + 'ie_key': ie_key + } + + +class DCNVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media)/(?P<id>\d+)' _TEST = { - 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887', + 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', 'info_dict': { 'id': '17375', @@ -82,3 +107,53 @@ class DCNIE(InfoExtractor): 'timestamp': timestamp, 'formats': formats, } + + +class DCNShowIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' + _TEST = { + 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', + 'info_dict': + { + 'id': '205024', + 'title': 'محاضرات الشيخ الشعراوي', + 'description': '', + }, + 'playlist_mincount': 27, + } + + def _real_extract(self, url): + show_id, season_id = re.match(self._VALID_URL, url).groups() + data = {} + if season_id: + request = compat_urllib_request.Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + season = self._download_json(request, season_id) + show_id = season['id'] + data['season'] = season_id + data['show_id'] = show_id + request = compat_urllib_request.Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/show', + compat_urllib_parse.urlencode(data), + { + 'Origin': 'http://www.dcndigital.ae', + 'Content-Type': 'application/x-www-form-urlencoded' + }) + show = self._download_json(request, show_id) + title = show['cat'].get('title_en') or show['cat']['title_ar'] + description = show['cat'].get('description_en') or 
show['cat'].get('description_ar') + entries = [] + for video in show['videos']: + entries.append({ + 'url': 'http://www.dcndigital.ae/#/media/%s' % video['id'], + '_type': 'url', + 'ie_key': 'DCNVideo', + }) + return { + 'id': show_id, + 'title': title, + 'description': description, + 'entries': entries, + '_type': 'playlist', + } From b477da2094db30a232f67edf3b342dc460aa14d4 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 3 Sep 2015 16:59:10 +0100 Subject: [PATCH 0016/1286] correct the extractor name and id and remove unnecessary request --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/dcn.py | 28 ++++++++++++++++++---------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d4a3e8ab0..4e41d9bf9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -121,7 +121,7 @@ from .dbtv import DBTVIE from .dcn import ( DCNGeneralIE, DCNVideoIE, - DCNShowIE, + DCNSeasonIE, ) from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 352d35c7a..8a36c10f6 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -11,6 +11,8 @@ from ..compat import ( from ..utils import ( int_or_none, parse_iso8601, + smuggle_url, + unsmuggle_url, ) @@ -25,9 +27,9 @@ class DCNGeneralIE(InfoExtractor): url = 'http://www.dcndigital.ae/#/media/%s' % video_id ie_key = 'DCNVideo' else: - ie_key = 'DCNShow' + ie_key = 'DCNSeason' if season_id and int(season_id) > 0: - url = 'http://www.dcndigital.ae/#/program/season/%s' % season_id + url = smuggle_url('http://www.dcndigital.ae/#/program/season/%s' % season_id, {'show_id': show_id}) else: url = 'http://www.dcndigital.ae/#/program/%s' % show_id return { @@ -38,6 +40,7 @@ class DCNGeneralIE(InfoExtractor): class DCNVideoIE(InfoExtractor): + IE_NAME = 'dcn:video' _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media)/(?P<id>\d+)' _TEST = { 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', @@ -109,13 +112,14 @@ class DCNVideoIE(InfoExtractor): } -class DCNShowIE(InfoExtractor): +class DCNSeasonIE(InfoExtractor): + IE_NAME = 'dcn:season' _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' _TEST = { 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', 'info_dict': { - 'id': '205024', + 'id': '7910', 'title': 'محاضرات الشيخ الشعراوي', 'description': '', }, @@ -123,15 +127,18 @@ class DCNShowIE(InfoExtractor): } def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) show_id, season_id = re.match(self._VALID_URL, url).groups() data = {} if season_id: - request = compat_urllib_request.Request( - 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, - headers={'Origin': 'http://www.dcndigital.ae'}) - season = self._download_json(request, season_id) - show_id = season['id'] data['season'] = season_id + show_id = smuggled_data.get('show_id') + if show_id is None: + request = compat_urllib_request.Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + season = self._download_json(request, 
season_id) + show_id = season['id'] data['show_id'] = show_id request = compat_urllib_request.Request( 'http://admin.mangomolo.com/analytics/index.php/plus/show', @@ -141,6 +148,7 @@ class DCNShowIE(InfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded' }) show = self._download_json(request, show_id) + season_id = season_id or show['default_season'] title = show['cat'].get('title_en') or show['cat']['title_ar'] description = show['cat'].get('description_en') or show['cat'].get('description_ar') entries = [] @@ -151,7 +159,7 @@ class DCNShowIE(InfoExtractor): 'ie_key': 'DCNVideo', }) return { - 'id': show_id, + 'id': season_id, 'title': title, 'description': description, 'entries': entries, From 8e2898edf930830260ab6b294c8866e7651a01a6 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 4 Sep 2015 15:42:09 +0100 Subject: [PATCH 0017/1286] [dcn] add support for live streams and catchup videos --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/dcn.py | 62 +++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 4e41d9bf9..677c75564 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -121,6 +121,7 @@ from .dbtv import DBTVIE from .dcn import ( DCNGeneralIE, DCNVideoIE, + DCNLiveIE, DCNSeasonIE, ) from .dctp import DctpTvIE diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 8a36c10f6..2e8fff660 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re +import base64 from .common import InfoExtractor from ..compat import ( @@ -41,7 +42,7 @@ class DCNGeneralIE(InfoExtractor): class DCNVideoIE(InfoExtractor): IE_NAME = 'dcn:video' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' _TEST = { 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', 'info_dict': @@ -112,6 +113,65 @@ class DCNVideoIE(InfoExtractor): } +class DCNLiveIE(InfoExtractor): + IE_NAME = 'dcn:live' + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)' + _TEST = { + 'url': 'http://www.dcndigital.ae/#/live/6/dubai-tv', + 'info_dict': + { + 'id': '6', + 'ext': 'mp4', + 'title': 'Dubai Al Oula', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + + request = compat_urllib_request.Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + + channel = self._download_json(request, channel_id) + title = channel.get('title_en') or channel['title_ar'] + + webpage = self._download_webpage( + 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' 
+ + compat_urllib_parse.urlencode({ + 'id': base64.b64encode(channel['user_id'].encode()).decode(), + 'channelid': base64.b64encode(channel['id'].encode()).decode(), + 'signature': channel['signature'], + 'countries': 'Q0M=', + 'filter': 'DENY', + }), channel_id) + + m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url') + formats = self._extract_m3u8_formats( + m3u8_url, channel_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + + rtsp_url = self._search_regex( + r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) + + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': title, + 'formats': formats, + 'is_live': True, + } + + class DCNSeasonIE(InfoExtractor): IE_NAME = 'dcn:season' _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' From 486375154cb7d79bd084879467bc70550104b555 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 5 Sep 2015 11:30:42 +0100 Subject: [PATCH 0018/1286] correct season info extraction and simplify --- youtube_dl/extractor/dcn.py | 64 ++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 2e8fff660..8b360a9d7 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -25,19 +25,13 @@ class DCNGeneralIE(InfoExtractor): url = '' ie_key = '' if video_id and int(video_id) > 0: - url = 'http://www.dcndigital.ae/#/media/%s' % video_id - ie_key = 'DCNVideo' + return self.url_result('http://www.dcndigital.ae/#/media/%s' % video_id, 'DCNVideo') else: - ie_key = 'DCNSeason' if season_id and int(season_id) > 0: url = smuggle_url('http://www.dcndigital.ae/#/program/season/%s' % season_id, {'show_id': show_id}) else: url = 'http://www.dcndigital.ae/#/program/%s' % show_id - return { - 'url': url, - '_type': 'url', - 'ie_key': ie_key - } + return self.url_result(url, 'DCNSeason') class DCNVideoIE(InfoExtractor): @@ -71,6 +65,11 @@ class DCNVideoIE(InfoExtractor): video = self._download_json(request, video_id) title = video.get('title_en') or video['title_ar'] + img = video.get('img') + thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None + duration = int_or_none(video.get('duration')) + description = video.get('description_en') or video.get('description_ar') + timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ') webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' 
@@ -96,12 +95,6 @@ class DCNVideoIE(InfoExtractor): self._sort_formats(formats) - img = video.get('img') - thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None - duration = int_or_none(video.get('duration')) - description = video.get('description_en') or video.get('description_ar') - timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ') - return { 'id': video_id, 'title': title, @@ -122,7 +115,9 @@ class DCNLiveIE(InfoExtractor): { 'id': '6', 'ext': 'mp4', - 'title': 'Dubai Al Oula', + 'title': 're:^Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': 're:^https?://.*\.png$', + 'is_live': True, }, 'params': { # m3u8 download @@ -139,10 +134,14 @@ class DCNLiveIE(InfoExtractor): channel = self._download_json(request, channel_id) title = channel.get('title_en') or channel['title_ar'] + img = channel.get('thumbnail') + thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None + description = channel.get('description_en') or channel.get('description_ar') + timestamp = parse_iso8601(channel.get('create_time') or channel.get('update_time'), ' ') webpage = self._download_webpage( - 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' - + compat_urllib_parse.urlencode({ + 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + + compat_urllib_parse.urlencode({ 'id': base64.b64encode(channel['user_id'].encode()).decode(), 'channelid': base64.b64encode(channel['id'].encode()).decode(), 'signature': channel['signature'], @@ -166,7 +165,9 @@ class DCNLiveIE(InfoExtractor): return { 'id': channel_id, - 'title': title, + 'title': self._live_title(title), + 'description': description, + 'thumbnail': thumbnail, 'formats': formats, 'is_live': True, } @@ -181,7 +182,6 @@ class DCNSeasonIE(InfoExtractor): { 'id': '7910', 'title': 'محاضرات الشيخ الشعراوي', - 'description': '', }, 'playlist_mincount': 27, } @@ -189,6 +189,7 @@ class DCNSeasonIE(InfoExtractor): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) show_id, season_id = re.match(self._VALID_URL, url).groups() + data = {} if season_id: data['season'] = season_id @@ -207,21 +208,18 @@ class DCNSeasonIE(InfoExtractor): 'Origin': 'http://www.dcndigital.ae', 'Content-Type': 'application/x-www-form-urlencoded' }) + show = self._download_json(request, show_id) season_id = season_id or show['default_season'] - title = show['cat'].get('title_en') or show['cat']['title_ar'] - description = show['cat'].get('description_en') or show['cat'].get('description_ar') + season = {} + for _ in show['seasons']: + if _['id'] == season_id: + season = _ + break + title = season.get('title_en') or season['title_ar'] + entries = [] for video in show['videos']: - entries.append({ - 'url': 'http://www.dcndigital.ae/#/media/%s' % video['id'], - '_type': 'url', - 'ie_key': 'DCNVideo', - }) - return { - 'id': season_id, - 'title': title, - 'description': description, - 'entries': entries, - '_type': 'playlist', - } + entries.append(self.url_result('http://www.dcndigital.ae/#/media/%s' % video['id'], 'DCNVideo')) + + return self.playlist_result(entries, season_id, title) From 8b55cadc83f198e0fa6bac7158f9b05826f39257 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 7 Sep 2015 16:39:01 +0100 Subject: [PATCH 0019/1286] [canal13cl] fix info extraction --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/canal13cl.py | 48 ------------------- youtube_dl/extractor/tele13.py | 77 
+++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 49 deletions(-) delete mode 100644 youtube_dl/extractor/canal13cl.py create mode 100644 youtube_dl/extractor/tele13.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5d2ea39d0..661b53e63 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -67,7 +67,6 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) -from .canal13cl import Canal13clIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cbs import CBSIE @@ -612,6 +611,7 @@ from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .tele13 import Tele13IE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py deleted file mode 100644 index 93241fefe..000000000 --- a/youtube_dl/extractor/canal13cl.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class Canal13clIE(InfoExtractor): - _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', - 'md5': '4cb1fa38adcad8fea88487a078831755', - 'info_dict': { - 'id': '1403022125', - 'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', - 'ext': 'mp4', - 'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda', - 'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. 
Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - title = self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True) - description = self._html_search_meta( - 'twitter:description', webpage, 'description') - url = self._html_search_regex( - r'articuloVideo = \"(.*?)\"', webpage, 'url') - real_id = self._search_regex( - r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id) - thumbnail = self._html_search_regex( - r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail') - - return { - 'id': real_id, - 'display_id': display_id, - 'url': url, - 'title': title, - 'description': description, - 'ext': 'mp4', - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py new file mode 100644 index 000000000..5d89e757f --- /dev/null +++ b/youtube_dl/extractor/tele13.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import js_to_json + + +class Tele13IE(InfoExtractor): + _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' + _TESTS = [ + { + 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'md5': '4cb1fa38adcad8fea88487a078831755', + 'info_dict': { + 'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'ext': 'mp4', + 'title': 'El c\u00edrculo de hierro de Michelle Bachelet en su regreso a La Moneda', + } + }, + { + 'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', + 'md5': '65d1ae54812c96f4b345dd21d3bb1adc', + 'info_dict': { + 'id': 'rOoKv2OMpOw', + 'ext': 'mp4', + 'title': 'Shooting star seen on 7-Sep-2015', + 'description': 'md5:a1cd2e74f6ee6851552c9cf5851d6b06', + 'uploader': 'Porjai Jaturongkhakun', + 'upload_date': '20150906', + 'uploader_id': 'UCnLY_3ezwNcDSC_Wc6suZxw', + }, + 'add_ie': ['Youtube'], + } + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + setup_js = self._parse_json( + js_to_json( + self._search_regex( + r"jwplayer\('player-vivo'\).setup\((\{.*?\})\)", + webpage, + 'setup code', + flags=re.DOTALL + ).replace('\n//', '') + ), + display_id + ) + title = setup_js['title'] + thumbnail = setup_js.get('image') or setup_js['playlist'][0].get('image') + description = self._html_search_meta( + 'description', webpage, 'description') + + formats = [] + for f in setup_js['playlist'][0]['sources']: + format_url = f['file'] + if format_url != '': + if '.m3u8' in format_url: + formats.extend(self._extract_m3u8_formats(format_url, display_id)) + else: + if 'youtube.com' in format_url: + return self.url_result(format_url, 'Youtube') + else: + formats.append({'url': format_url, 'format_id': f.get('label')}) + + return { + 'id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } From 436416afe2ea70dd6b55f8c9d699ddb0bdc1ec5f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 7 Sep 2015 21:13:49 +0100 Subject: [PATCH 0020/1286] [tele13] skip test --- youtube_dl/extractor/tele13.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git 
a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py index 5d89e757f..f1764eb2f 100644 --- a/youtube_dl/extractor/tele13.py +++ b/youtube_dl/extractor/tele13.py @@ -16,8 +16,12 @@ class Tele13IE(InfoExtractor): 'info_dict': { 'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', 'ext': 'mp4', - 'title': 'El c\u00edrculo de hierro de Michelle Bachelet en su regreso a La Moneda', - } + 'title': 'El círculo de hierro de Michelle Bachelet en su regreso a La Moneda', + }, + 'params': { + # HTTP Error 404: Not Found + 'skip_download': True, + }, }, { 'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', From 689fb748ee1ba8e61f99d21a3bcb1bc83b708649 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 11 Sep 2015 04:44:17 +0100 Subject: [PATCH 0021/1286] [utlis] add extract_attributes for extracting html tags attributes --- youtube_dl/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 206dd56bc..bcebf9cc5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -248,6 +248,14 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'): + attributes = re.findall(attributes_regex, attributes_str) + attributes_dict = {} + if attributes: + attributes_dict = {attribute_name: attribute_value for (attribute_name, attribute_value) in attributes} + return attributes_dict + + def clean_html(html): """Clean an HTML snippet into a readable string""" From ed1269000f24a6ddc683a295ff402ef3ded5c4fb Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 11 Sep 2015 04:46:21 +0100 Subject: [PATCH 0022/1286] [brightcove] add support for brightcove in page embed(fixes #6824) --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/brightcove.py | 92 ++++++++++++++++++++++++++++++ youtube_dl/extractor/generic.py | 21 ++++++- 3 files changed, 116 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 57f55b479..fcd9edec3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -59,7 +59,10 @@ from .bloomberg import BloombergIE from .bpb import BpbIE from .br import BRIE from .breakcom import BreakIE -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, +) from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4721c2293..a07c0888f 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -22,6 +22,10 @@ from ..utils import ( fix_xml_ampersands, unescapeHTML, unsmuggle_url, + js_to_json, + int_or_none, + parse_iso8601, + extract_attributes, ) @@ -346,3 +350,91 @@ class BrightcoveIE(InfoExtractor): if 'url' not in info and not info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info + + +class BrightcoveInPageEmbedIE(InfoExtractor): + _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P<video_id>\d+)' + TEST = { + 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'info_dict': { + 'id': '4463358922001', + 'ext': 
'flv', + 'title': 'Meet the man behind Popcorn Time', + 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'duration': 165768, + } + } + + @staticmethod + def _extract_url(webpage): + video_attributes = re.search(r'(?s)<video([^>]*)>.*?</(?:video|audio)>', webpage) + if video_attributes: + video_attributes = extract_attributes(video_attributes.group(), r'(?s)\s*data-(account|video-id|playlist-id|policy-key|player|embed)\s*=\s*["\']([^"\']+)["\']') + account_id = video_attributes.get('account') + player_id = video_attributes.get('player') + embed = video_attributes.get('embed') + video_id = video_attributes.get('video-id') + if account_id and player_id and embed and video_id: + return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (account_id, player_id, embed, video_id) + return None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + account_id, player_id, embed, video_id = mobj.groups() + + webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) + + catalog = self._parse_json( + js_to_json( + self._search_regex( + r'catalog\(({[^}]+})\);', + webpage, + 'catalog' + ) + ), + video_id + ) + policy_key = catalog['policyKey'] + + req = compat_urllib_request.Request( + 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id), + headers={'Accept': 'application/json;pk=%s' % policy_key}) + json_data = self._download_json(req, video_id) + + title = json_data['name'] + description = json_data.get('description') + thumbnail = json_data.get('name') + timestamp = parse_iso8601(json_data.get('published_at')) + duration = int_or_none(json_data.get('duration')) + + formats = [] + for source in json_data.get('sources'): + source_type = source.get('type') + if source_type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) + else: + src = source.get('src') + if src: + formats.append({ + 'url': src, + 'abr': source.get('avg_bitrate'), + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'filesize': source.get('size'), + 'container': source.get('container'), + 'vcodec': source.get('container'), + }) + else: + formats.extend(self._extract_f4m_formats(source.get('streaming_src'), video_id)) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec748ed9f..7a3a7f66b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -29,7 +29,10 @@ from ..utils import ( url_basename, xpath_text, ) -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, +) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@ -1012,6 +1015,17 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'cinemasnob', }, + }, + # BrightcoveInPageEmbed embed + { + 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', + 'info_dict': { + 'id': '4238694884001', + 'ext': 'flv', + 'title': 'Tabletop: Dread, Last Thoughts', + 'description': 'Tabletop: Dread, Last Thoughts', + 'duration': 51690, + }, } ] @@ -1288,6 +1302,11 @@ class GenericIE(InfoExtractor): 'entries': entries, } + # Look for Brightcove In Page Embed: + 
brightcove_in_page_embed_url = BrightcoveInPageEmbedIE._extract_url(webpage) + if brightcove_in_page_embed_url: + return self.url_result(brightcove_in_page_embed_url, 'BrightcoveInPageEmbed') + # Look for embedded rtl.nl player matches = re.findall( r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', From b306c439d7f2997ebf2a88385c73fe2d92227b76 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 23 Sep 2015 13:28:05 +0100 Subject: [PATCH 0023/1286] [cnet] fix extraction and extract more formats --- youtube_dl/extractor/cnet.py | 54 +++++++++++++++--------------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 5dd69bff7..2fac0d79d 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -4,9 +4,7 @@ from __future__ import unicode_literals import json from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) +from .theplatform import ThePlatformIE class CNETIE(InfoExtractor): @@ -15,29 +13,22 @@ class CNETIE(InfoExtractor): 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Hands-on with Microsoft Windows 8.1 Update', 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', - 'thumbnail': 're:^http://.*/flmswindows8.jpg$', 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', 'uploader': 'Sarah Mitroff', }, - 'params': { - 'skip_download': 'requires rtmpdump', - } }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', 'info_dict': { 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', - 'ext': 'flv', + 'ext': 'mp4', 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. 
#TDPothole', 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', }, - 'params': { - 'skip_download': True, # requires rtmpdump - }, }] def _real_extract(self, url): @@ -45,26 +36,13 @@ class CNETIE(InfoExtractor): webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"<div class=\"cnetVideoPlayer\"\s+.*?data-cnet-video-options='([^']+)'", + r"<div class=\"videoPlayer\"\s+.*?data-cnet-video-uvp-options='([^']+)'", webpage, 'data json') data = json.loads(data_json) - vdata = data['video'] - if not vdata: - vdata = data['videos'][0] - if not vdata: - raise ExtractorError('Cannot find video data') - - mpx_account = data['config']['players']['default']['mpx_account'] - vid = vdata['files'].get('rtmp', vdata['files']['hds']) - tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid) + vdata = data['videos'][0] video_id = vdata['id'] - title = vdata.get('headline') - if title is None: - title = vdata.get('title') - if title is None: - raise ExtractorError('Cannot find title!') - thumbnail = vdata.get('image', {}).get('path') + title = vdata['title'] author = vdata.get('author') if author: uploader = '%s %s' % (author['firstName'], author['lastName']) @@ -73,13 +51,27 @@ class CNETIE(InfoExtractor): uploader = None uploader_id = None + mpx_account = data['config']['uvpConfig']['default']['mpx_account'] + tp = ThePlatformIE(self._downloader) + formats = [] + subtitles = {} + description = vdata.get('description') + + for vid in vdata['files'].values(): + result = tp.extract(('http://link.theplatform.com/s/%s/%s' % (mpx_account, vid))) + formats.extend(result['formats']) + subtitles = self._merge_subtitles(subtitles, result['subtitles']) + description = description or result.get('description') + + self._sort_formats(formats) + return { - '_type': 'url_transparent', - 'url': tp_link, 'id': video_id, 'display_id': display_id, 'title': title, + 'description': description, 'uploader': uploader, 'uploader_id': uploader_id, - 'thumbnail': thumbnail, + 'subtitles': subtitles, + 'formats': formats, } From 53407e3f383ed80c67db9e06b8c3480257aa3184 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 23 Sep 2015 14:02:13 +0100 Subject: [PATCH 0024/1286] [brightcove] fix streaming_src extraction --- youtube_dl/extractor/brightcove.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index a07c0888f..e4a7befee 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -413,7 +413,7 @@ class BrightcoveInPageEmbedIE(InfoExtractor): if source_type == 'application/x-mpegURL': formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) else: - src = source.get('src') + src = source.get('src') or source.get('streaming_src') if src: formats.append({ 'url': src, @@ -424,8 +424,6 @@ class BrightcoveInPageEmbedIE(InfoExtractor): 'container': source.get('container'), 'vcodec': source.get('container'), }) - else: - formats.extend(self._extract_f4m_formats(source.get('streaming_src'), video_id)) self._sort_formats(formats) From 1f9fb20fcda76f165ce39b01fe907fc74c8054d3 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 25 Sep 2015 07:39:22 +0100 Subject: [PATCH 0025/1286] [nextmedia] update AppleDailyIE tests --- youtube_dl/extractor/nextmedia.py | 16 +++++++++++++--- 1 file 
changed, 13 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index c10784f6b..d1688457f 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -126,7 +126,8 @@ class AppleDailyIE(NextMediaIE): 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'md5:23c0aac567dc08c9c16a3161a2c2e3cd', 'upload_date': '20150128', - } + }, + 'skip': 'redirect to http://www.appledaily.com.tw/animation/', }, { # No thumbnail 'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003673/', @@ -140,10 +141,19 @@ class AppleDailyIE(NextMediaIE): }, 'expected_warnings': [ 'video thumbnail', - ] + ], + 'skip': 'redirect to http://www.appledaily.com.tw/animation/', }, { 'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/', - 'only_matching': True, + 'md5': 'eaa20e6b9df418c912d7f5dec2ba734d', + 'info_dict': { + 'id': '35770334', + 'ext': 'mp4', + 'title': '咖啡占卜測 XU裝熟指數', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:7b859991a6a4fedbdf3dd3b66545c748', + 'upload_date': '20140417', + }, }] _URL_PATTERN = r'\{url: \'(.+)\'\}' From 6aeba407db84a636fc2522b4f2344eac9e0c1fdb Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 25 Sep 2015 10:52:48 +0100 Subject: [PATCH 0026/1286] [jukebox] remove extractor and handle it using generic extractor --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/generic.py | 21 +++++++ youtube_dl/extractor/jukebox.py | 59 ------------------ youtube_dl/extractor/ultimedia.py | 99 +++++++++++++------------------ 4 files changed, 61 insertions(+), 119 deletions(-) delete mode 100644 youtube_dl/extractor/jukebox.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7272859db..1813c7e1b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -262,7 +262,6 @@ from .izlesene import IzleseneIE from .jadorecettepub import JadoreCettePubIE from .jeuxvideo import JeuxVideoIE from .jove import JoveIE -from .jukebox import JukeboxIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE from .kanalplay import KanalPlayIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8881a8a23..4d1f75e63 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -50,6 +50,7 @@ from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE from .screenwavemedia import ScreenwaveMediaIE +from .ultimedia import UltimediaIE class GenericIE(InfoExtractor): @@ -1029,6 +1030,21 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'cinemasnob', }, + }, + # Ultimedia embed + { + 'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html', + 'md5': '25551df6e7c7ab8096ceeeae048c5f64', + 'info_dict': { + 'id': 'r303r', + 'ext': 'mp4', + 'title': 'Kosheen - Pride (live)', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 293, + 'upload_date': '20081103', + 'timestamp': 1225733392, + 'uploader_id': '33m03', + }, } ] @@ -1751,6 +1767,11 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia') + # Look for Ulltimedia embeds + ultimedia_url = UltimediaIE._extract_url(webpage) + if ultimedia_url: + return self.url_result(self._proto_relative_url(ultimedia_url), 'Ultimedia') + # Look for AdobeTVVideo embeds mobj = re.search( 
r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py deleted file mode 100644 index da8068efc..000000000 --- a/youtube_dl/extractor/jukebox.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - RegexNotFoundError, - unescapeHTML, -) - - -class JukeboxIE(InfoExtractor): - _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html' - _TEST = { - 'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html', - 'info_dict': { - 'id': 'r303r', - 'ext': 'flv', - 'title': 'Kosheen-En Vivo Pride', - 'uploader': 'Kosheen', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - html = self._download_webpage(url, video_id) - iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url')) - - iframe_html = self._download_webpage(iframe_url, video_id, 'Downloading iframe') - if re.search(r'class="jkb_waiting"', iframe_html) is not None: - raise ExtractorError('Video is not available(in your country?)!') - - self.report_extraction(video_id) - - try: - video_url = self._search_regex(r'"config":{"file":"(?P<video_url>http:[^"]+\?mdtk=[0-9]+)"', - iframe_html, 'video url') - video_url = unescapeHTML(video_url).replace('\/', '/') - except RegexNotFoundError: - youtube_url = self._search_regex( - r'config":{"file":"(http:\\/\\/www\.youtube\.com\\/watch\?v=[^"]+)"', - iframe_html, 'youtube url') - youtube_url = unescapeHTML(youtube_url).replace('\/', '/') - self.to_screen('Youtube video detected') - return self.url_result(youtube_url, ie='Youtube') - - title = self._html_search_regex(r'<h1 class="inline">([^<]+)</h1>', - html, 'title') - artist = self._html_search_regex(r'<span id="infos_article_artist">([^<]+)</span>', - html, 'artist') - - return { - 'id': video_id, - 'url': video_url, - 'title': artist + '-' + title, - 'uploader': artist, - } diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py index c4751050e..45201332d 100644 --- a/youtube_dl/extractor/ultimedia.py +++ b/youtube_dl/extractor/ultimedia.py @@ -4,102 +4,83 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse -from ..utils import ( - ExtractorError, - qualities, - unified_strdate, - clean_html, -) +from ..utils import int_or_none class UltimediaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/default/index/video[^/]+/id/(?P<id>[\d+a-z]+)' + _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/deliver/(?P<type>generic|musique)(?:/[^/]+)*/(?:src|article)/(?P<id>[\d+a-z]+)' _TESTS = [{ # news - 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', + 'url': 'https://www.ultimedia.com/deliver/generic/iframe/mdtk/01601930/zone/1/src/s8uk0r/autoplay/yes/ad/no/width/714/height/435', 'md5': '276a0e49de58c7e85d32b057837952a2', 'info_dict': { 'id': 's8uk0r', 'ext': 'mp4', 'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées', - 'description': 'md5:3e5c8fd65791487333dda5db8aed32af', 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 74, 'upload_date': '20150317', + 'timestamp': 1426604939, + 'uploader_id': '3fszv', }, }, { # music - 'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8', + 'url': 
'https://www.ultimedia.com/deliver/musique/iframe/mdtk/01601930/zone/1/article/xvpfp8/autoplay/yes/ad/no/width/714/height/435', 'md5': '2ea3513813cf230605c7e2ffe7eca61c', 'info_dict': { 'id': 'xvpfp8', 'ext': 'mp4', - 'title': "Two - C'est la vie (Clip)", - 'description': 'Two', + 'title': 'Two - C\'est La Vie (clip)', 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 233, 'upload_date': '20150224', + 'timestamp': 1424760500, + 'uploader_id': '3rfzk', }, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)', + webpage) + if mobj: + return mobj.group('url') + def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + video_type, video_id = re.match(self._VALID_URL, url).groups() - deliver_url = self._proto_relative_url(self._search_regex( - r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?ultimedia\.com/deliver/[^"]+)"', - webpage, 'deliver URL'), compat_urllib_parse_urlparse(url).scheme + ':') - - deliver_page = self._download_webpage( - deliver_url, video_id, 'Downloading iframe page') - - if '>This video is currently not available' in deliver_page: - raise ExtractorError( - 'Video %s is currently not available' % video_id, expected=True) - - player = self._parse_json( - self._search_regex( - r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", - deliver_page, 'player'), + deliver_info = self._download_json( + 'http://www.ultimedia.com/deliver/video?video=%s&topic=%s' % (video_id, video_type), video_id) - quality = qualities(['flash', 'html5']) + yt_id = deliver_info.get('yt_id') + if yt_id: + return self.url_result(yt_id, 'Youtube') + + jwconf = deliver_info['jwconf'] + formats = [] - for mode in player['modes']: - video_url = mode.get('config', {}).get('file') - if not video_url: - continue - if re.match(r'https?://www\.youtube\.com/.+?', video_url): - return self.url_result(video_url, 'Youtube') + for source in jwconf['playlist'][0]['sources']: formats.append({ - 'url': video_url, - 'format_id': mode.get('type'), - 'quality': quality(mode.get('type')), + 'url': source['file'], + 'format_id': source.get('label'), }) + self._sort_formats(formats) - thumbnail = player.get('image') - - title = clean_html(( - self._html_search_regex( - r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>', - webpage, 'title', default=None) or - self._search_regex( - r"var\s+nameVideo\s*=\s*'([^']+)'", - deliver_page, 'title'))) - - description = clean_html(self._html_search_regex( - r'(?s)<span>Description</span>(.+?)</p>', webpage, - 'description', fatal=False)) - - upload_date = unified_strdate(self._search_regex( - r'Ajouté le\s*<span>([^<]+)', webpage, - 'upload date', fatal=False)) + title = deliver_info['title'] + thumbnail = jwconf.get('image') + duration = int_or_none(deliver_info.get('duration')) + timestamp = int_or_none(deliver_info.get('release_time')) + uploader_id = deliver_info.get('owner_id') return { 'id': video_id, 'title': title, - 'description': description, 'thumbnail': thumbnail, - 'upload_date': upload_date, + 'duration': duration, + 'timestamp': timestamp, + 'uploader_id': uploader_id, 'formats': formats, } From c01e1a96aa964ef6d5f0bf7675dbe34096b1d2c8 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 30 Sep 2015 11:20:43 +0100 Subject: [PATCH 0027/1286] [brightcove] fix test and fields extraction --- youtube_dl/extractor/brightcove.py | 18 
++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index e4a7befee..b41cee91b 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -354,14 +354,18 @@ class BrightcoveIE(InfoExtractor): class BrightcoveInPageEmbedIE(InfoExtractor): _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P<video_id>\d+)' - TEST = { + _TEST = { 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'md5': 'c8100925723840d4b0d243f7025703be', 'info_dict': { 'id': '4463358922001', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Meet the man behind Popcorn Time', - 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'description': 'md5:eac376a4fe366edc70279bfb681aea16', + 'timestamp': 1441391203, + 'upload_date': '20150904', 'duration': 165768, + 'uploader_id': '929656772001', } } @@ -403,7 +407,7 @@ class BrightcoveInPageEmbedIE(InfoExtractor): title = json_data['name'] description = json_data.get('description') - thumbnail = json_data.get('name') + thumbnail = json_data.get('thumbnail') timestamp = parse_iso8601(json_data.get('published_at')) duration = int_or_none(json_data.get('duration')) @@ -417,12 +421,13 @@ class BrightcoveInPageEmbedIE(InfoExtractor): if src: formats.append({ 'url': src, - 'abr': source.get('avg_bitrate'), + 'tbr': source.get('avg_bitrate'), 'width': int_or_none(source.get('width')), 'height': int_or_none(source.get('height')), 'filesize': source.get('size'), 'container': source.get('container'), - 'vcodec': source.get('container'), + 'vcodec': source.get('codec'), + 'ext': source.get('container').lower(), }) self._sort_formats(formats) @@ -435,4 +440,5 @@ class BrightcoveInPageEmbedIE(InfoExtractor): 'timestamp': timestamp, 'duration': duration, 'formats': formats, + 'uploader_id': account_id, } From 8fc226ef994a82f7b1050cdb72ec38922d3ab9cf Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 2 Oct 2015 17:24:30 +0100 Subject: [PATCH 0028/1286] [nba] extract all video formats and extract more info --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/nba.py | 102 +++++++++++++++++++++---------- 2 files changed, 74 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a73a1317e..78478b38b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -351,7 +351,10 @@ from .myvideo import MyVideoIE from .myvidster import MyVidsterIE from .nationalgeographic import NationalGeographicIE from .naver import NaverIE -from .nba import NBAIE +from .nba import ( + NBAIE, + NBAWatchIE, +) from .nbc import ( NBCIE, NBCNewsIE, diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 944096e1c..36ece5b64 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -2,62 +2,100 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - remove_end, parse_duration, + parse_iso8601, + int_or_none, ) -class NBAIE(InfoExtractor): - _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' +class NBABaseIE(InfoExtractor): + def _get_formats(self, video_id): + base_url = 'http://nba.cdn.turner.com/nba/big%s' % video_id + return [{ + 'url': base_url + '_nba_android_high.mp4', + 'width': 
480, + 'height': 320, + 'format_id': '320p', + },{ + 'url': base_url + '_640x360_664b.mp4', + 'width': 640, + 'height': 360, + 'format_id': '360p', + },{ + 'url': base_url + '_768x432_1404.mp4', + 'width': 768, + 'height': 432, + 'format_id': '432p', + },{ + 'url': base_url + '_1280x720.mp4', + 'width': 1280, + 'height': 720, + 'format_id': '720p', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + ret = self._extract_metadata(webpage, video_id) + ret['id'] = video_id.rpartition('/')[2] + ret['formats'] = self._get_formats(video_id) + return ret + + +class NBAIE(NBABaseIE): + IE_NAME = 'nba' + _VALID_URL = r'https?://(?:www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - 'md5': 'c0edcfc37607344e2ff8f13c378c88a4', + 'md5': '9d902940d2a127af3f7f9d2f3dc79c96', 'info_dict': { 'id': '0021200253-okc-bkn-recap.nba', 'ext': 'mp4', 'title': 'Thunder vs. Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, + 'timestamp': 1354680189, + 'upload_date': '20121205', }, }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, - }, { + }] + + def _extract_metadata(self, webpage, video_id): + return { + 'title': self._html_search_meta('name', webpage), + 'description': self._html_search_meta('description', webpage), + 'duration': parse_duration(self._html_search_meta('duration', webpage)), + 'thumbnail': self._html_search_meta('thumbnailUrl', webpage), + 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)) + } + +class NBAWatchIE(NBABaseIE): + IE_NAME = 'nba:watch' + _VALID_URL = r'https?://watch.nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' + _TESTS = [{ 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { 'id': '0041400301-cle-atl-recap.nba', 'ext': 'mp4', - 'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1', + 'title': 'Hawks vs. 
Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, - }, - 'params': { - 'skip_download': True, + 'timestamp': 1432094400, + 'upload_date': '20150520', } }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = 'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' - - shortened_video_id = video_id.rpartition('/')[2] - title = remove_end( - self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com') - - description = self._og_search_description(webpage) - duration_str = self._html_search_meta( - 'duration', webpage, 'duration', default=None) - if not duration_str: - duration_str = self._html_search_regex( - r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False) - duration = parse_duration(duration_str) - + def _extract_metadata(self, webpage, video_id): + program_id = self._search_regex(r'var\s+programId\s*=\s*(\d+);', webpage, 'program id') + metadata = self._download_json( + 'http://smbsolr.cdnak.neulion.com/solr_nbav6/nba/nba/mlt/?wt=json&fl=name,description,image,runtime,releaseDate&q=sequence%3A' + program_id, video_id)['match']['docs'][0] return { - 'id': shortened_video_id, - 'url': video_url, - 'title': title, - 'description': description, - 'duration': duration, + 'title': metadata['name'], + 'description': metadata.get('description'), + 'duration': int_or_none(metadata.get('runtime')), + 'thumbnail': metadata.get('image'), + 'timestamp': parse_iso8601(metadata.get('releaseDate')) } From adccf33632c51def397cdfb08c1271de6d6ec95e Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 2 Oct 2015 21:58:20 +0100 Subject: [PATCH 0029/1286] [ign] add support for pcmag and extract all formats and more metadata --- youtube_dl/extractor/__init__.py | 6 +- youtube_dl/extractor/ign.py | 117 +++++++++++++++++++++++-------- 2 files changed, 93 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a73a1317e..191661390 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -231,7 +231,11 @@ from .howstuffworks import HowStuffWorksIE from .huffpost import HuffPostIE from .hypem import HypemIE from .iconosquare import IconosquareIE -from .ign import IGNIE, OneUPIE +from .ign import ( + IGNIE, + OneUPIE, + PCMagIE, +) from .imdb import ( ImdbIE, ImdbListIE diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index bf2d2041b..fa4e67394 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -3,6 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) class IGNIE(InfoExtractor): @@ -11,25 +15,23 @@ class IGNIE(InfoExtractor): Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P<name_or_id>.+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?P<type>videos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P<name_or_id>.+)' IE_NAME = 'ign.com' - _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' - _DESCRIPTION_RE = [ - r'<span class="page-object-description">(.+?)</span>', - r'id="my_show_video">.*?<p>(.*?)</p>', - r'<meta name="description" content="(.*?)"', - ] + _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s' + _EMBED_RE = 
r'<iframe[^>]+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']' _TESTS = [ { 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - 'md5': 'eac8bdc1890980122c3b66f14bdd02e9', + 'md5': 'febda82c4bafecd2d44b6e1a18a595f8', 'info_dict': { 'id': '8f862beef863986b2785559b9e1aa599', 'ext': 'mp4', 'title': 'The Last of Us Review', 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', + 'timestamp': 1370440800, + 'upload_date': '20130605', } }, { @@ -44,6 +46,8 @@ class IGNIE(InfoExtractor): 'ext': 'mp4', 'title': 'GTA 5 Video Review', 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + 'timestamp': 1379339880, + 'upload_date': '20130916', }, }, { @@ -52,6 +56,8 @@ class IGNIE(InfoExtractor): 'ext': 'mp4', 'title': '26 Twisted Moments from GTA 5 in Slow Motion', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', + 'timestamp': 1386878820, + 'upload_date': '20131212', }, }, ], @@ -66,10 +72,9 @@ class IGNIE(InfoExtractor): 'id': '078fdd005f6d3c02f63d795faa1b984f', 'ext': 'mp4', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', - 'description': ( - 'Giant skeletons, bloody hunts, and captivating' - ' natural beauty take our breath away.' - ), + 'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.', + 'timestamp': 1408047180, + 'upload_date': '20140814', }, }, ] @@ -82,7 +87,7 @@ class IGNIE(InfoExtractor): r'<object id="vid_(.+?)"', r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"', ] - return self._search_regex(res_id, webpage, 'video id') + return self._search_regex(res_id, webpage, 'video id', default=None) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -102,22 +107,45 @@ class IGNIE(InfoExtractor): } video_id = self._find_video_id(webpage) - result = self._get_video_info(video_id) - description = self._html_search_regex(self._DESCRIPTION_RE, - webpage, 'video description', flags=re.DOTALL) - result['description'] = description - return result + if not video_id: + return self.url_result(self._search_regex(self._EMBED_RE, webpage, 'embed url')) + return self._get_video_info(video_id) def _get_video_info(self, video_id): - config_url = self._CONFIG_URL_TEMPLATE % video_id - config = self._download_json(config_url, video_id) - media = config['playlist']['media'] + api_data = self._download_json(self._API_URL_TEMPLATE % video_id, video_id) + + formats = [] + m3u8_url = api_data['refs'].get('m3uUrl') + if m3u8_url: + formats.extend(self._extract_m3u8_formats(m3u8_url, video_id)) + f4m_url = api_data['refs'].get('f4mUrl') + if f4m_url: + formats.extend(self._extract_f4m_formats(f4m_url, video_id)) + for asset in api_data['assets']: + formats.append({ + 'url': asset['url'], + 'tbr': asset.get('actual_bitrate_kbps'), + 'fps': asset.get('frame_rate'), + 'height': int_or_none(asset.get('height')), + 'width': int_or_none(asset.get('width')), + }) + self._sort_formats(formats) + + thumbnails = [] + for thumbnail in api_data['thumbnails']: + thumbnails.append({'url': thumbnail['url']}) + + metadata = api_data['metadata'] return { - 'id': media['metadata']['videoId'], - 'url': media['url'], - 'title': media['metadata']['title'], - 'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'), + 'id': api_data.get('videoId') or video_id, + 'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'], + 'description': metadata.get('description'), + 'timestamp': parse_iso8601(metadata.get('publishDate')), + 
'duration': int_or_none(metadata.get('duration')), + 'display_id': metadata.get('slug') or video_id, + 'thumbnails': thumbnails, + 'formats': formats, } @@ -125,16 +153,16 @@ class OneUPIE(IGNIE): _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)\.html' IE_NAME = '1up.com' - _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>' - _TESTS = [{ 'url': 'http://gamevideos.1up.com/video/id/34976.html', - 'md5': '68a54ce4ebc772e4b71e3123d413163d', + 'md5': 'c9cc69e07acb675c31a16719f909e347', 'info_dict': { 'id': '34976', 'ext': 'mp4', 'title': 'Sniper Elite V2 - Trailer', - 'description': 'md5:5d289b722f5a6d940ca3136e9dae89cf', + 'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826', + 'timestamp': 1313099220, + 'upload_date': '20110811', } }] @@ -143,3 +171,34 @@ class OneUPIE(IGNIE): result = super(OneUPIE, self)._real_extract(url) result['id'] = mobj.group('name_or_id') return result + + +class PCMagIE(IGNIE): + _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?P<type>videos|article2)(/.+)?/(?P<name_or_id>.+)' + IE_NAME = 'pcmag' + + _EMBED_RE = r'iframe.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content.html?[^"]*url=([^"]+)["&]' + + _TESTS = [{ + 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', + 'md5': '212d6154fd0361a2781075f1febbe9ad', + 'info_dict': { + 'id': 'ee10d774b508c9b8ec07e763b9125b91', + 'ext': 'mp4', + 'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?', + 'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3', + 'timestamp': 1420571160, + 'upload_date': '20150106', + } + },{ + 'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp', + 'md5': '94130c1ca07ba0adb6088350681f16c1', + 'info_dict': { + 'id': '042e560ba94823d43afcb12ddf7142ca', + 'ext': 'mp4', + 'title': 'HTC\'s Weird New Re Camera - What\'s New Now', + 'description': 'md5:53433c45df96d2ea5d0fda18be2ca908', + 'timestamp': 1412953920, + 'upload_date': '20141010', + } + }] From 28809ab07a8d10f9cafc3d712414c7b355c27166 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 3 Oct 2015 09:47:19 +0100 Subject: [PATCH 0030/1286] [nba] extract more formats --- youtube_dl/extractor/nba.py | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 36ece5b64..8844b61a5 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -10,28 +10,60 @@ from ..utils import ( class NBABaseIE(InfoExtractor): def _get_formats(self, video_id): + formats = self._extract_m3u8_formats( + 'http://nbavod-f.akamaihd.net/i/nba/big%s_,640x360_664m,768x432_996,768x432_1404,960x540_2104,1280x720,.mp4.csmil/master.m3u8' % video_id, + video_id, + m3u8_id='hls') + formats.extend(self._extract_f4m_formats( + 'http://nbavod-f.akamaihd.net/z/nba/big%s_,640x360_664m,768x432_996,768x432_1404,960x540_2104,1280x720,.mp4.csmil/manifest.f4m?hdcore=3.4.1.1' % video_id, + video_id, + f4m_id='hds')) base_url = 'http://nba.cdn.turner.com/nba/big%s' % video_id - return [{ + formats.extend([{ + 'url': base_url + '_nba_ipad.mp4', + 'width': 400, + 'height': 224, + 'format_id': '224p', + 'preference': 1, + },{ 'url': base_url + '_nba_android_high.mp4', 'width': 480, 'height': 320, 'format_id': '320p', + 'preference': 2, + },{ + 'url': base_url + '_nba_576x324.mp4', + 'width': 576, + 'height': 324, + 'format_id': '324p', + 'preference': 3, },{ 'url': base_url + '_640x360_664b.mp4', 
'width': 640, 'height': 360, 'format_id': '360p', + 'preference': 4, },{ 'url': base_url + '_768x432_1404.mp4', 'width': 768, 'height': 432, 'format_id': '432p', + 'preference': 5, + },{ + 'url': base_url + '_960x540_2104.mp4', + 'width': 960, + 'height': 540, + 'format_id': '540p', + 'preference': 6, },{ 'url': base_url + '_1280x720.mp4', 'width': 1280, 'height': 720, 'format_id': '720p', - }] + 'preference': 7, + }]) + self._sort_formats(formats) + return formats def _real_extract(self, url): video_id = self._match_id(url) From c233e6bcc398f9734d7138854978c1cb00fe757f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 3 Oct 2015 12:30:05 +0100 Subject: [PATCH 0031/1286] [nba] extract video info from xml feed --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/nba.py | 224 +++++++++++++++++-------------- 2 files changed, 126 insertions(+), 103 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 78478b38b..a73a1317e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -351,10 +351,7 @@ from .myvideo import MyVideoIE from .myvidster import MyVidsterIE from .nationalgeographic import NationalGeographicIE from .naver import NaverIE -from .nba import ( - NBAIE, - NBAWatchIE, -) +from .nba import NBAIE from .nbc import ( NBCIE, NBCNewsIE, diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 8844b61a5..3d38d080e 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -3,131 +3,157 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( parse_duration, - parse_iso8601, int_or_none, ) -class NBABaseIE(InfoExtractor): - def _get_formats(self, video_id): - formats = self._extract_m3u8_formats( - 'http://nbavod-f.akamaihd.net/i/nba/big%s_,640x360_664m,768x432_996,768x432_1404,960x540_2104,1280x720,.mp4.csmil/master.m3u8' % video_id, - video_id, - m3u8_id='hls') - formats.extend(self._extract_f4m_formats( - 'http://nbavod-f.akamaihd.net/z/nba/big%s_,640x360_664m,768x432_996,768x432_1404,960x540_2104,1280x720,.mp4.csmil/manifest.f4m?hdcore=3.4.1.1' % video_id, - video_id, - f4m_id='hds')) - base_url = 'http://nba.cdn.turner.com/nba/big%s' % video_id - formats.extend([{ - 'url': base_url + '_nba_ipad.mp4', - 'width': 400, - 'height': 224, - 'format_id': '224p', - 'preference': 1, - },{ - 'url': base_url + '_nba_android_high.mp4', - 'width': 480, - 'height': 320, - 'format_id': '320p', - 'preference': 2, - },{ - 'url': base_url + '_nba_576x324.mp4', - 'width': 576, - 'height': 324, - 'format_id': '324p', - 'preference': 3, - },{ - 'url': base_url + '_640x360_664b.mp4', - 'width': 640, - 'height': 360, - 'format_id': '360p', - 'preference': 4, - },{ - 'url': base_url + '_768x432_1404.mp4', - 'width': 768, - 'height': 432, - 'format_id': '432p', - 'preference': 5, - },{ - 'url': base_url + '_960x540_2104.mp4', - 'width': 960, - 'height': 540, - 'format_id': '540p', - 'preference': 6, - },{ - 'url': base_url + '_1280x720.mp4', - 'width': 1280, - 'height': 720, - 'format_id': '720p', - 'preference': 7, - }]) - self._sort_formats(formats) - return formats - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - ret = self._extract_metadata(webpage, video_id) - ret['id'] = video_id.rpartition('/')[2] - ret['formats'] = self._get_formats(video_id) - return ret - - -class NBAIE(NBABaseIE): - IE_NAME = 'nba' - _VALID_URL = 
r'https?://(?:www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' +class NBAIE(InfoExtractor): + _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video/(?P<id>[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', 'md5': '9d902940d2a127af3f7f9d2f3dc79c96', 'info_dict': { - 'id': '0021200253-okc-bkn-recap.nba', + 'id': '0021200253-okc-bkn-recap', 'ext': 'mp4', 'title': 'Thunder vs. Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, - 'timestamp': 1354680189, - 'upload_date': '20121205', + 'timestamp': 1354638466, + 'upload_date': '20121204', }, }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, - }] - - def _extract_metadata(self, webpage, video_id): - return { - 'title': self._html_search_meta('name', webpage), - 'description': self._html_search_meta('description', webpage), - 'duration': parse_duration(self._html_search_meta('duration', webpage)), - 'thumbnail': self._html_search_meta('thumbnailUrl', webpage), - 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)) - } - -class NBAWatchIE(NBABaseIE): - IE_NAME = 'nba:watch' - _VALID_URL = r'https?://watch.nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' - _TESTS = [{ + },{ 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { - 'id': '0041400301-cle-atl-recap.nba', + 'id': '0041400301-cle-atl-recap', 'ext': 'mp4', 'title': 'Hawks vs. Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, - 'timestamp': 1432094400, + 'timestamp': 1432134543, 'upload_date': '20150520', } }] - def _extract_metadata(self, webpage, video_id): - program_id = self._search_regex(r'var\s+programId\s*=\s*(\d+);', webpage, 'program id') - metadata = self._download_json( - 'http://smbsolr.cdnak.neulion.com/solr_nbav6/nba/nba/mlt/?wt=json&fl=name,description,image,runtime,releaseDate&q=sequence%3A' + program_id, video_id)['match']['docs'][0] + _BASE_PATHS = { + 'turner': 'http://nba.cdn.turner.com/nba/big', + 'akamai': 'http://nbavod-f.akamaihd.net', + } + + _QUALITIES = { + '420mp4': { + 'width': 400, + 'height': 224, + 'preference': 1, + }, + '416x234': { + 'width': 416, + 'height': 234, + 'preference': 2, + }, + '556': { + 'width': 416, + 'height': 234, + 'preference': 3, + }, + '480x320_910': { + 'width': 480, + 'height': 320, + 'preference': 4, + }, + 'nba_576x324': { + 'width': 576, + 'height': 324, + 'preference': 5, + }, + 'nba_640x360': { + 'width': 640, + 'height': 360, + 'preference': 6, + }, + '640x360_664b': { + 'width': 640, + 'height': 360, + 'preference': 7, + }, + '640x360_664m': { + 'width': 640, + 'height': 360, + 'preference': 8, + }, + '768x432_996': { + 'width': 768, + 'height': 432, + 'preference': 9, + }, + '768x432_1404': { + 'width': 768, + 'height': 432, + 'preference': 10, + }, + '960x540_2104': { + 'width': 960, + 'height': 540, + 'preference': 11, + }, + '1280x720_3072': { + 'width': 1280, + 'height': 720, + 'preference': 12, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = self._download_xml('http://www.nba.com/video/%s.xml' % video_id, video_id) + video_id = video_info.find('slug').text + title = video_info.find('headline').text + 
description = video_info.find('description').text + duration = parse_duration(video_info.find('length').text) + timestamp = int_or_none(video_info.find('dateCreated').attrib.get('uts')) + + thumbnails = [] + for image in video_info.find('images'): + thumbnails.append({ + 'id': image.attrib.get('cut'), + 'url': image.text, + 'width': int_or_none(image.attrib.get('width')), + 'height': int_or_none(image.attrib.get('height')), + }) + + formats = [] + for video_file in video_info.find('files').iter('file'): + video_url = video_file.text + if not video_url.startswith('http://'): + if video_url.endswith('.m3u8') or video_url.endswith('.f4m'): + video_url = self._BASE_PATHS['akamai'] + video_url + else: + video_url = self._BASE_PATHS['turner'] + video_url + if video_url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(video_url, video_id)) + elif video_url.endswith('.f4m'): + formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id)) + else: + key = video_file.attrib.get('bitrate') + quality = self._QUALITIES[key] + formats.append({ + 'format_id': key, + 'url': video_url, + 'width': quality['width'], + 'height': quality['height'], + 'preference': quality['preference'], + }) + self._sort_formats(formats) + return { - 'title': metadata['name'], - 'description': metadata.get('description'), - 'duration': int_or_none(metadata.get('runtime')), - 'thumbnail': metadata.get('image'), - 'timestamp': parse_iso8601(metadata.get('releaseDate')) + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'formats': formats, } From 30787f7259c4e6a08f691cc691f14fa0c8fe4b87 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 3 Oct 2015 19:28:48 +0100 Subject: [PATCH 0032/1286] [cspan] correct the clip info extraction --- youtube_dl/extractor/cspan.py | 58 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index fbefd37d0..994e080d5 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -18,22 +18,21 @@ class CSpanIE(InfoExtractor): IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '8e44ce11f0f725527daccc453f553eb0', + 'md5': '067803f994e049b455a58b16e5aab442', 'info_dict': { 'id': '315139', 'ext': 'mp4', 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', - 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', + 'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. 
Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.', }, 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - # For whatever reason, the served video alternates between - # two different ones + 'md5': '4eafd1e91a75d2b1e6a3cbd0995816a2', 'info_dict': { - 'id': '340723', + 'id': 'c4486943', 'ext': 'mp4', - 'title': 'International Health Care Models', + 'title': 'CSPAN - International Health Care Models', 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967', } }, { @@ -44,7 +43,7 @@ class CSpanIE(InfoExtractor): 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', 'duration': 14848, - 'description': 'md5:70c7c3b8fa63fa60d42772440596034c' + 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3' }, }, { # Video from senate.gov @@ -57,36 +56,33 @@ class CSpanIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - webpage = self._download_webpage(url, page_id) - video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage) + if matches: + video_type, video_id = matches.groups() + if video_type == 'prog': + video_type = 'program' + else: + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + title = self._og_search_title(webpage) + surl = smuggle_url(senate_isvp_url, {'force_title': title}) + return self.url_result(surl, 'SenateISVP', video_id, title) - description = self._html_search_regex( - [ - # The full description - r'<div class=\'expandable\'>(.*?)<a href=\'#\'', - # If the description is small enough the other div is not - # present, otherwise this is a stripped version - r'<p class=\'initial\'>(.*?)</p>' - ], - webpage, 'description', flags=re.DOTALL, default=None) - - info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id - data = self._download_json(info_url, video_id) + data = self._download_json( + 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id) doc = self._download_xml( - 'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id, + 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), video_id) + description = self._html_search_meta('description', webpage) + title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - surl = smuggle_url(senate_isvp_url, {'force_title': title}) - return self.url_result(surl, 'SenateISVP', video_id, title) - files = data['video']['files'] try: capfile = data['video']['capfile']['#text'] @@ -112,12 +108,12 @@ class CSpanIE(InfoExtractor): if len(entries) == 1: entry = dict(entries[0]) - entry['id'] = video_id + entry['id'] = 'c' + video_id if video_type == 'clip' else video_id return entry else: return { '_type': 'playlist', 'entries': entries, 'title': title, - 'id': video_id, + 'id': 'c' + video_id if video_type == 'clip' else video_id, } From e759a00119768862e63dbda33522f2399f1f43a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 
5 Oct 2015 20:21:53 +0600 Subject: [PATCH 0033/1286] [appletrailers] Quotes consistency --- youtube_dl/extractor/appletrailers.py | 68 +++++++++++++-------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 576f03b5b..f68dc3236 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -13,53 +13,53 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)' _TESTS = [{ - "url": "http://trailers.apple.com/trailers/wb/manofsteel/", + 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', 'info_dict': { 'id': 'manofsteel', }, - "playlist": [ + 'playlist': [ { - "md5": "d97a8e575432dbcb81b7c3acb741f8a8", - "info_dict": { - "id": "manofsteel-trailer4", - "ext": "mov", - "duration": 111, - "title": "Trailer 4", - "upload_date": "20130523", - "uploader_id": "wb", + 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8', + 'info_dict': { + 'id': 'manofsteel-trailer4', + 'ext': 'mov', + 'duration': 111, + 'title': 'Trailer 4', + 'upload_date': '20130523', + 'uploader_id': 'wb', }, }, { - "md5": "b8017b7131b721fb4e8d6f49e1df908c", - "info_dict": { - "id": "manofsteel-trailer3", - "ext": "mov", - "duration": 182, - "title": "Trailer 3", - "upload_date": "20130417", - "uploader_id": "wb", + 'md5': 'b8017b7131b721fb4e8d6f49e1df908c', + 'info_dict': { + 'id': 'manofsteel-trailer3', + 'ext': 'mov', + 'duration': 182, + 'title': 'Trailer 3', + 'upload_date': '20130417', + 'uploader_id': 'wb', }, }, { - "md5": "d0f1e1150989b9924679b441f3404d48", - "info_dict": { - "id": "manofsteel-trailer", - "ext": "mov", - "duration": 148, - "title": "Trailer", - "upload_date": "20121212", - "uploader_id": "wb", + 'md5': 'd0f1e1150989b9924679b441f3404d48', + 'info_dict': { + 'id': 'manofsteel-trailer', + 'ext': 'mov', + 'duration': 148, + 'title': 'Trailer', + 'upload_date': '20121212', + 'uploader_id': 'wb', }, }, { - "md5": "5fe08795b943eb2e757fa95cb6def1cb", - "info_dict": { - "id": "manofsteel-teaser", - "ext": "mov", - "duration": 93, - "title": "Teaser", - "upload_date": "20120721", - "uploader_id": "wb", + 'md5': '5fe08795b943eb2e757fa95cb6def1cb', + 'info_dict': { + 'id': 'manofsteel-teaser', + 'ext': 'mov', + 'duration': 93, + 'title': 'Teaser', + 'upload_date': '20120721', + 'uploader_id': 'wb', }, }, ] From 2a27e66234ec0030d2ca6f1ddf229db7f7eb8ded Mon Sep 17 00:00:00 2001 From: Oli Allen <oli@oliallen.com> Date: Thu, 1 Oct 2015 22:50:12 +0100 Subject: [PATCH 0034/1286] [tumblr] Added support for HD video where available (#7036) [tumblr] Replaced test URL for HD video as old one lead to 404 [tumblr] Don't make assumptions about video resolution, cleaner handling of no HD version available [tumblr] Removed extraneous resolution key in HD video tests --- youtube_dl/extractor/tumblr.py | 58 ++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 3d3b635e4..71cced562 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -28,6 +28,32 @@ class TumblrIE(InfoExtractor): 'description': 'md5:dba62ac8639482759c8eb10ce474586a', 'thumbnail': 're:http://.*\.jpg', } + }, { + 'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video', + 'md5': '99a84522f60972bf064a0b80f87bcbb5', + 'info_dict': { + 'id': 
'130323439814', + 'ext': 'mp4', + 'title': 'HD Video Testing \u2014 Test description for my HD video', + 'description': 'md5:97cc3ab5fcd27ee4af6356701541319c', + 'thumbnail': 're:http://.*\.jpg', + }, + 'params': { + 'format': 'sd', + }, + }, { + 'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video', + 'md5': '7ae503065ad150122dc3089f8cf1546c', + 'info_dict': { + 'id': '130323439814', + 'ext': 'mp4', + 'title': 'HD Video Testing \u2014 Test description for my HD video', + 'description': 'md5:97cc3ab5fcd27ee4af6356701541319c', + 'thumbnail': 're:http://.*\.jpg', + }, + 'params': { + 'format': 'hd', + }, }, { 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching', 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab', @@ -57,6 +83,8 @@ class TumblrIE(InfoExtractor): video_id = m_url.group('id') blog = m_url.group('blog_name') + video_urls = [] + url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage, urlh = self._download_webpage_handle(url, video_id) @@ -68,8 +96,32 @@ class TumblrIE(InfoExtractor): iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page') - video_url = self._search_regex(r'<source src="([^"]+)"', - iframe, 'video url') + + sd_video_url = self._search_regex(r'<source src="([^"]+)"', + iframe, 'sd video url') + resolution_id = sd_video_url.split("/")[-1] + 'p' + if len(resolution_id) != 4: + resolution_id = None + video_urls.append({ + 'ext': 'mp4', + 'format_id': 'sd', + 'url': sd_video_url, + 'resolution': resolution_id, + }) + + hd_video_url = self._search_regex(r'hdUrl":"([^"]+)"', iframe, + 'hd video url', default=None) + if hd_video_url: + hd_video_url = hd_video_url.replace("\\", "") + resolution_id = hd_video_url.split("/")[-1] + 'p' + if len(resolution_id) != 4: + resolution_id = None + video_urls.append({ + 'ext': 'mp4', + 'format_id': 'hd', + 'url': hd_video_url, + 'resolution': resolution_id, + }) # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos @@ -79,7 +131,7 @@ class TumblrIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, + 'formats': video_urls, 'ext': 'mp4', 'title': video_title, 'description': self._og_search_description(webpage, default=None), From 140ac7396542d92a8ddd53be6c35c7a79db16180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Oct 2015 21:53:01 +0600 Subject: [PATCH 0035/1286] [tumblr] Simplify and extract duration --- youtube_dl/extractor/tumblr.py | 60 ++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 71cced562..449ba29fa 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import int_or_none class TumblrIE(InfoExtractor): @@ -83,8 +84,6 @@ class TumblrIE(InfoExtractor): video_id = m_url.group('id') blog = m_url.group('blog_name') - video_urls = [] - url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage, urlh = self._download_webpage_handle(url, video_id) @@ -94,34 +93,38 @@ class TumblrIE(InfoExtractor): if iframe_url is None: return self.url_result(urlh.geturl(), 'Generic') - iframe = self._download_webpage(iframe_url, video_id, - 'Downloading iframe page') + iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page') - sd_video_url = 
self._search_regex(r'<source src="([^"]+)"', - iframe, 'sd video url') - resolution_id = sd_video_url.split("/")[-1] + 'p' - if len(resolution_id) != 4: - resolution_id = None - video_urls.append({ + duration = None + sources = [] + + sd_url = self._search_regex( + r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe, + 'sd video url', default=None, group='url') + if sd_url: + sources.append((sd_url, 'sd')) + + options = self._parse_json( + self._search_regex( + r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe, + 'hd video url', default='', group='options'), + video_id, fatal=False) + if options: + duration = int_or_none(options.get('duration')) + hd_url = options.get('hdUrl') + if hd_url: + sources.append((hd_url, 'hd')) + + formats = [{ + 'url': video_url, 'ext': 'mp4', - 'format_id': 'sd', - 'url': sd_video_url, - 'resolution': resolution_id, - }) + 'format_id': format_id, + 'height': int_or_none(self._search_regex( + r'/(\d{3,4})$', video_url, 'height', default=None)), + 'quality': quality, + } for quality, (video_url, format_id) in enumerate(sources)] - hd_video_url = self._search_regex(r'hdUrl":"([^"]+)"', iframe, - 'hd video url', default=None) - if hd_video_url: - hd_video_url = hd_video_url.replace("\\", "") - resolution_id = hd_video_url.split("/")[-1] + 'p' - if len(resolution_id) != 4: - resolution_id = None - video_urls.append({ - 'ext': 'mp4', - 'format_id': 'hd', - 'url': hd_video_url, - 'resolution': resolution_id, - }) + self._sort_formats(formats) # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos @@ -131,9 +134,10 @@ class TumblrIE(InfoExtractor): return { 'id': video_id, - 'formats': video_urls, 'ext': 'mp4', 'title': video_title, 'description': self._og_search_description(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + 'formats': formats, } From 5d84b79a3002925084690e17669738bf1ea711f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Oct 2015 21:53:59 +0600 Subject: [PATCH 0036/1286] [tumblr] Remove redundant test --- youtube_dl/extractor/tumblr.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 449ba29fa..9f270318b 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -29,19 +29,6 @@ class TumblrIE(InfoExtractor): 'description': 'md5:dba62ac8639482759c8eb10ce474586a', 'thumbnail': 're:http://.*\.jpg', } - }, { - 'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video', - 'md5': '99a84522f60972bf064a0b80f87bcbb5', - 'info_dict': { - 'id': '130323439814', - 'ext': 'mp4', - 'title': 'HD Video Testing \u2014 Test description for my HD video', - 'description': 'md5:97cc3ab5fcd27ee4af6356701541319c', - 'thumbnail': 're:http://.*\.jpg', - }, - 'params': { - 'format': 'sd', - }, }, { 'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video', 'md5': '7ae503065ad150122dc3089f8cf1546c', From 88c86d211bd2f542625d3b91ac4b15e62995345d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Oct 2015 21:54:54 +0600 Subject: [PATCH 0037/1286] [tumblr] Add missing fields for vidme test --- youtube_dl/extractor/tumblr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 9f270318b..cb91d08eb 100644 --- 
a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -51,6 +51,9 @@ class TumblrIE(InfoExtractor): 'title': 'naked smoking & stretching', 'upload_date': '20150506', 'timestamp': 1430931613, + 'age_limit': 18, + 'uploader_id': '1638622', + 'uploader': 'naked-yogi', }, 'add_ie': ['Vidme'], }, { From 7fd4ed9939d7c467c61663cad7189ee8ad27c89b Mon Sep 17 00:00:00 2001 From: David Rabinowitz <drabinowitz@agtinternational.com> Date: Thu, 1 Oct 2015 12:07:27 +0300 Subject: [PATCH 0038/1286] Fixed the ustream extractor to use the current ustream API --- youtube_dl/extractor/ustream.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index c39c278ab..18add908e 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -52,17 +52,12 @@ class UstreamIE(InfoExtractor): desktop_url = 'http://www.ustream.tv/recorded/' + desktop_video_id return self.url_result(desktop_url, 'Ustream') - params = self._download_json( - 'http://cdngw.ustream.tv/rgwjson/Viewer.getVideo/' + json.dumps({ - 'brandId': 1, - 'videoId': int(video_id), - 'autoplay': False, - }), video_id) + params = self._download_json('https://api.ustream.tv/videos/' + video_id + '.json', video_id) if 'error' in params: raise ExtractorError(params['error']['message'], expected=True) - video_url = params['flv'] + video_url = params['video']['media_urls']['flv'] webpage = self._download_webpage(url, video_id) From 5820c4a29ea3781c47f6249c179a081e28eeeb64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Oct 2015 22:30:38 +0600 Subject: [PATCH 0039/1286] [ustream] Switch extraction to api --- youtube_dl/extractor/ustream.py | 69 ++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 18add908e..3065c9f31 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -7,7 +7,11 @@ from .common import InfoExtractor from ..compat import ( compat_urlparse, ) -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, +) class UstreamIE(InfoExtractor): @@ -54,46 +58,47 @@ class UstreamIE(InfoExtractor): params = self._download_json('https://api.ustream.tv/videos/' + video_id + '.json', video_id) - if 'error' in params: - raise ExtractorError(params['error']['message'], expected=True) + error = params.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error), expected=True) - video_url = params['video']['media_urls']['flv'] + video = params['video'] - webpage = self._download_webpage(url, video_id) + formats = [{ + 'id': format_id, + 'url': video_url, + 'ext': format_id, + } for format_id, video_url in video['media_urls'].items()] + self._sort_formats(formats) - self.report_extraction(video_id) + title = video['title'] + description = video.get('description') + timestamp = int_or_none(video.get('created_at')) + duration = float_or_none(video.get('length')) + filesize = float_or_none(video.get('file_size')) + view_count = int_or_none(video.get('views')) - video_title = self._html_search_regex(r'data-title="(?P<title>.+)"', - webpage, 'title', default=None) + uploader = video.get('owner', {}).get('username') + uploader_id = video.get('owner', {}).get('id') - if not video_title: - try: - video_title = params['moduleConfig']['meta']['title'] - except KeyError: 
- pass - - if not video_title: - video_title = 'Ustream video ' + video_id - - uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', - webpage, 'uploader', fatal=False, flags=re.DOTALL, default=None) - - if not uploader: - try: - uploader = params['moduleConfig']['meta']['userName'] - except KeyError: - uploader = None - - thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', - webpage, 'thumbnail', fatal=False) + thumbnails = [{ + 'id': thumbnail_id, + 'url': thumbnail_url, + } for thumbnail_id, thumbnail_url in video.get('thumbnail', {}).items()] return { 'id': video_id, - 'url': video_url, - 'ext': 'flv', - 'title': video_title, + 'title': title, + 'description': description, + 'thumbnails': thumbnails, + 'timestamp': timestamp, + 'duration': duration, + 'filesize': filesize, + 'view_count': view_count, 'uploader': uploader, - 'thumbnail': thumbnail, + 'uploader_id': uploader_id, + 'formats': formats, } From 4853eb63fec766c7154e9353e91e531a3f3b1d1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Oct 2015 22:40:20 +0600 Subject: [PATCH 0040/1286] [ustream] Modernize --- youtube_dl/extractor/ustream.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 3065c9f31..9fdeb064d 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -15,7 +15,7 @@ from ..utils import ( class UstreamIE(InfoExtractor): - _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<videoID>\d+)' + _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)' IE_NAME = 'ustream' _TESTS = [{ 'url': 'http://www.ustream.tv/recorded/20274954', @@ -41,22 +41,23 @@ class UstreamIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url) - video_id = m.group('videoID') + video_id = m.group('id') # some sites use this embed format (see: http://github.com/rg3/youtube-dl/issues/2990) if m.group('type') == 'embed/recorded': - video_id = m.group('videoID') + video_id = m.group('id') desktop_url = 'http://www.ustream.tv/recorded/' + video_id return self.url_result(desktop_url, 'Ustream') if m.group('type') == 'embed': - video_id = m.group('videoID') + video_id = m.group('id') webpage = self._download_webpage(url, video_id) desktop_video_id = self._html_search_regex( r'ContentVideoIds=\["([^"]*?)"\]', webpage, 'desktop_video_id') desktop_url = 'http://www.ustream.tv/recorded/' + desktop_video_id return self.url_result(desktop_url, 'Ustream') - params = self._download_json('https://api.ustream.tv/videos/' + video_id + '.json', video_id) + params = self._download_json( + 'https://api.ustream.tv/videos/%s.json' % video_id, video_id) error = params.get('error') if error: From f2a7ed77ef76b8d564326f564afc880473fba7ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Oct 2015 22:44:36 +0600 Subject: [PATCH 0041/1286] [tumblr] Remove redundant field --- youtube_dl/extractor/tumblr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index cb91d08eb..4f844706d 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -124,7 +124,6 @@ class TumblrIE(InfoExtractor): return { 'id': video_id, - 'ext': 'mp4', 'title': video_title, 'description': self._og_search_description(webpage, 
default=None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), From 0bf219889e39d4d7e75fdb59d7452b8e09f4ddab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Oct 2015 22:44:59 +0600 Subject: [PATCH 0042/1286] [ustream] Remove unused import --- youtube_dl/extractor/ustream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 9fdeb064d..7243d0eca 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor From 41db73330835d6e8fe29ce18f869247a739467ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Oct 2015 22:48:47 +0600 Subject: [PATCH 0043/1286] [ustream] Move filesize --- youtube_dl/extractor/ustream.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 7243d0eca..888f39f7a 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -65,18 +65,20 @@ class UstreamIE(InfoExtractor): video = params['video'] + title = video['title'] + filesize = float_or_none(video.get('file_size')) + formats = [{ 'id': format_id, 'url': video_url, 'ext': format_id, + 'filesize': filesize, } for format_id, video_url in video['media_urls'].items()] self._sort_formats(formats) - title = video['title'] description = video.get('description') timestamp = int_or_none(video.get('created_at')) duration = float_or_none(video.get('length')) - filesize = float_or_none(video.get('file_size')) view_count = int_or_none(video.get('views')) uploader = video.get('owner', {}).get('username') @@ -94,7 +96,6 @@ class UstreamIE(InfoExtractor): 'thumbnails': thumbnails, 'timestamp': timestamp, 'duration': duration, - 'filesize': filesize, 'view_count': view_count, 'uploader': uploader, 'uploader_id': uploader_id, From dc5756fd7729f616185508fba296b330f0892c24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Oct 2015 22:51:04 +0600 Subject: [PATCH 0044/1286] [ustream] Fix typo --- youtube_dl/extractor/ustream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 888f39f7a..a29d67e9f 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -69,7 +69,7 @@ class UstreamIE(InfoExtractor): filesize = float_or_none(video.get('file_size')) formats = [{ - 'id': format_id, + 'id': video_id, 'url': video_url, 'ext': format_id, 'filesize': filesize, From 40fbb05e1c58625349160ec134343af789b803ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 5 Oct 2015 22:52:51 +0600 Subject: [PATCH 0045/1286] [ustream] Fix tests --- youtube_dl/extractor/ustream.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index a29d67e9f..73b05ecab 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -22,8 +22,12 @@ class UstreamIE(InfoExtractor): 'info_dict': { 'id': '20274954', 'ext': 'flv', - 'uploader': 'Young Americans for Liberty', 'title': 'Young Americans for Liberty February 7, 2012 2:28 AM', + 'description': 'Young Americans for Liberty February 7, 2012 2:28 AM', + 'timestamp': 1328577035, + 'upload_date': 
'20120207', + 'uploader': 'yaliberty', + 'uploader_id': '6780869', }, }, { # From http://sportscanada.tv/canadagames/index.php/week2/figure-skating/444 @@ -35,7 +39,8 @@ class UstreamIE(InfoExtractor): 'ext': 'flv', 'title': '-CG11- Canada Games Figure Skating', 'uploader': 'sportscanadatv', - } + }, + 'skip': 'This Pro Broadcaster has chosen to remove this video from the ustream.tv site.', }] def _real_extract(self, url): From fcc25462693fb0468d66b28664ba622f8f3bdb3d Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Tue, 6 Oct 2015 02:30:05 +0600 Subject: [PATCH 0046/1286] [README.md] Markdown improvements --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 80152071d..e7004c9fc 100644 --- a/README.md +++ b/README.md @@ -359,7 +359,7 @@ If you have installed youtube-dl with a package manager, pip, setup.py or a tarb By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, the only option out of `-citw` that is regularly useful is `-i`. -### Can you please put the -b option back? +### Can you please put the `-b` option back? Most people asking this question are not aware that youtube-dl now defaults to downloading the highest available quality as reported by YouTube, which will be 1080p or 720p in some cases, so you no longer need the `-b` option. For some specific videos, maybe YouTube does not report them to be available in a specific high quality format you're interested in. In that case, simply request it with the `-f` option and youtube-dl will try to download it. @@ -371,13 +371,13 @@ Apparently YouTube requires you to pass a CAPTCHA test if you download too much. Once the video is fully downloaded, use any video player, such as [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). -### I extracted a video URL with -g, but it does not play on another machine / in my webbrowser. +### I extracted a video URL with `-g`, but it does not play on another machine / in my webbrowser. It depends a lot on the service. In many cases, requests for the video (to download/play it) must come from the same IP address and with the same cookies. Use the `--cookies` option to write the required cookies into a file, and advise your downloader to read cookies from that file. Some sites also require a common user agent to be used, use `--dump-user-agent` to see the one in use by youtube-dl. It may be beneficial to use IPv6; in some cases, the restrictions are only applied to IPv4. Some services (sometimes only for a subset of videos) do not restrict the video URL by IP address, cookie, or user-agent, but these are the exception rather than the rule. -Please bear in mind that some URL protocols are **not** supported by browsers out of the box, including RTMP. If you are using -g, your own downloader must support these as well. +Please bear in mind that some URL protocols are **not** supported by browsers out of the box, including RTMP. If you are using `-g`, your own downloader must support these as well. If you want to play the video on a machine that is not running youtube-dl, you can relay the video content from the machine that runs youtube-dl. 
You can use `-o -` to let youtube-dl stream a video to stdout, or simply allow the player to download the files written by youtube-dl in turn. @@ -643,15 +643,15 @@ So please elaborate on what feature you are requesting, or what bug you want to If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a commiter myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. -For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. +For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -If your server has multiple IPs or you suspect censorship, adding --call-home may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). +If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). **Site support requests must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. ### Are you using the latest version? -Before reporting any issue, type youtube-dl -U. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. +Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. ### Is the issue already documented? 
From c4af7684d85a17441c5f6f0b6e9f8c470644fec0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 6 Oct 2015 09:08:10 +0200 Subject: [PATCH 0047/1286] release 2015.10.06 --- CONTRIBUTING.md | 6 +- README.md | 395 +++++++++++++++++++++++++++++------------ docs/supportedsites.md | 4 + youtube_dl/version.py | 2 +- 4 files changed, 289 insertions(+), 118 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f8ab29631..57a94231d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,15 +16,15 @@ So please elaborate on what feature you are requesting, or what bug you want to If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a commiter myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. -For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. +For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -If your server has multiple IPs or you suspect censorship, adding --call-home may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). +If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). **Site support requests must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. ### Are you using the latest version? -Before reporting any issue, type youtube-dl -U. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. +Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. ### Is the issue already documented? 
diff --git a/README.md b/README.md index e7004c9fc..5ff8ca85e 100644 --- a/README.md +++ b/README.md @@ -49,110 +49,220 @@ which means you can modify it, redistribute it or use it however you like. # OPTIONS -h, --help Print this help text and exit --version Print program version and exit - -U, --update Update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed) - -i, --ignore-errors Continue on download errors, for example to skip unavailable videos in a playlist - --abort-on-error Abort downloading of further videos (in the playlist or the command line) if an error occurs + -U, --update Update this program to latest version. Make + sure that you have sufficient permissions + (run with sudo if needed) + -i, --ignore-errors Continue on download errors, for example to + skip unavailable videos in a playlist + --abort-on-error Abort downloading of further videos (in the + playlist or the command line) if an error + occurs --dump-user-agent Display the current browser identification --list-extractors List all supported extractors - --extractor-descriptions Output descriptions of all supported extractors - --force-generic-extractor Force extraction to use the generic extractor - --default-search PREFIX Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". - Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The - default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching. - --ignore-config Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: Do not read the user configuration - in ~/.config/youtube-dl/config (%APPDATA%/youtube-dl/config.txt on Windows) - --flat-playlist Do not extract the videos of a playlist, only list them. + --extractor-descriptions Output descriptions of all supported + extractors + --force-generic-extractor Force extraction to use the generic + extractor + --default-search PREFIX Use this prefix for unqualified URLs. For + example "gvsearch2:" downloads two videos + from google videos for youtube-dl "large + apple". Use the value "auto" to let + youtube-dl guess ("auto_warning" to emit a + warning when guessing). "error" just throws + an error. The default value "fixup_error" + repairs broken URLs, but emits an error if + this is not possible instead of searching. + --ignore-config Do not read configuration files. When given + in the global configuration file /etc + /youtube-dl.conf: Do not read the user + configuration in ~/.config/youtube- + dl/config (%APPDATA%/youtube-dl/config.txt + on Windows) + --flat-playlist Do not extract the videos of a playlist, + only list them. --no-color Do not emit color codes in output ## Network Options: - --proxy URL Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection + --proxy URL Use the specified HTTP/HTTPS proxy. Pass in + an empty string (--proxy "") for direct + connection --socket-timeout SECONDS Time to wait before giving up, in seconds - --source-address IP Client-side IP address to bind to (experimental) - -4, --force-ipv4 Make all connections via IPv4 (experimental) - -6, --force-ipv6 Make all connections via IPv6 (experimental) - --cn-verification-proxy URL Use this proxy to verify the IP address for some Chinese sites. 
The default proxy specified by --proxy (or none, if the options is - not present) is used for the actual downloading. (experimental) + --source-address IP Client-side IP address to bind to + (experimental) + -4, --force-ipv4 Make all connections via IPv4 + (experimental) + -6, --force-ipv6 Make all connections via IPv6 + (experimental) + --cn-verification-proxy URL Use this proxy to verify the IP address for + some Chinese sites. The default proxy + specified by --proxy (or none, if the + options is not present) is used for the + actual downloading. (experimental) ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) --playlist-end NUMBER Playlist video to end at (default is last) - --playlist-items ITEM_SPEC Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" - if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will - download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13. - --match-title REGEX Download only matching titles (regex or caseless sub-string) - --reject-title REGEX Skip download for matching titles (regex or caseless sub-string) + --playlist-items ITEM_SPEC Playlist video items to download. Specify + indices of the videos in the playlist + separated by commas like: "--playlist-items + 1,2,5,8" if you want to download videos + indexed 1, 2, 5, 8 in the playlist. You can + specify range: "--playlist-items + 1-3,7,10-13", it will download the videos + at index 1, 2, 3, 7, 10, 11, 12 and 13. + --match-title REGEX Download only matching titles (regex or + caseless sub-string) + --reject-title REGEX Skip download for matching titles (regex or + caseless sub-string) --max-downloads NUMBER Abort after downloading NUMBER files - --min-filesize SIZE Do not download any videos smaller than SIZE (e.g. 50k or 44.6m) - --max-filesize SIZE Do not download any videos larger than SIZE (e.g. 50k or 44.6m) + --min-filesize SIZE Do not download any videos smaller than + SIZE (e.g. 50k or 44.6m) + --max-filesize SIZE Do not download any videos larger than SIZE + (e.g. 50k or 44.6m) --date DATE Download only videos uploaded in this date - --datebefore DATE Download only videos uploaded on or before this date (i.e. inclusive) - --dateafter DATE Download only videos uploaded on or after this date (i.e. inclusive) - --min-views COUNT Do not download any videos with less than COUNT views - --max-views COUNT Do not download any videos with more than COUNT views - --match-filter FILTER Generic video filter (experimental). Specify any key (see help for -o for a list of available keys) to match if the key is present, - !key to check if the key is not present,key > NUMBER (like "comment_count > 12", also works with >=, <, <=, !=, =) to compare against - a number, and & to require multiple matches. Values which are not known are excluded unless you put a question mark (?) after the - operator.For example, to only match videos that have been liked more than 100 times and disliked less than 50 times (or the dislike - functionality is not available at the given service), but who also have a description, use --match-filter "like_count > 100 & + --datebefore DATE Download only videos uploaded on or before + this date (i.e. inclusive) + --dateafter DATE Download only videos uploaded on or after + this date (i.e. 
inclusive) + --min-views COUNT Do not download any videos with less than + COUNT views + --max-views COUNT Do not download any videos with more than + COUNT views + --match-filter FILTER Generic video filter (experimental). + Specify any key (see help for -o for a list + of available keys) to match if the key is + present, !key to check if the key is not + present,key > NUMBER (like "comment_count > + 12", also works with >=, <, <=, !=, =) to + compare against a number, and & to require + multiple matches. Values which are not + known are excluded unless you put a + question mark (?) after the operator.For + example, to only match videos that have + been liked more than 100 times and disliked + less than 50 times (or the dislike + functionality is not available at the given + service), but who also have a description, + use --match-filter "like_count > 100 & dislike_count <? 50 & description" . - --no-playlist Download only the video, if the URL refers to a video and a playlist. - --yes-playlist Download the playlist, if the URL refers to a video and a playlist. - --age-limit YEARS Download only videos suitable for the given age - --download-archive FILE Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. - --include-ads Download advertisements as well (experimental) + --no-playlist Download only the video, if the URL refers + to a video and a playlist. + --yes-playlist Download the playlist, if the URL refers to + a video and a playlist. + --age-limit YEARS Download only videos suitable for the given + age + --download-archive FILE Download only videos not listed in the + archive file. Record the IDs of all + downloaded videos in it. + --include-ads Download advertisements as well + (experimental) ## Download Options: - -r, --rate-limit LIMIT Maximum download rate in bytes per second (e.g. 50K or 4.2M) - -R, --retries RETRIES Number of retries (default is 10), or "infinite". - --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) (default is 1024) - --no-resize-buffer Do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE. + -r, --rate-limit LIMIT Maximum download rate in bytes per second + (e.g. 50K or 4.2M) + -R, --retries RETRIES Number of retries (default is 10), or + "infinite". + --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) + (default is 1024) + --no-resize-buffer Do not automatically adjust the buffer + size. By default, the buffer size is + automatically resized from an initial value + of SIZE. --playlist-reverse Download playlist videos in reverse order - --xattr-set-filesize Set file xattribute ytdl.filesize with expected filesize (experimental) - --hls-prefer-native Use the native HLS downloader instead of ffmpeg (experimental) - --external-downloader COMMAND Use the specified external downloader. Currently supports aria2c,axel,curl,httpie,wget - --external-downloader-args ARGS Give these arguments to the external downloader + --xattr-set-filesize Set file xattribute ytdl.filesize with + expected filesize (experimental) + --hls-prefer-native Use the native HLS downloader instead of + ffmpeg (experimental) + --external-downloader COMMAND Use the specified external downloader. 
+ Currently supports + aria2c,axel,curl,httpie,wget + --external-downloader-args ARGS Give these arguments to the external + downloader ## Filesystem Options: - -a, --batch-file FILE File containing URLs to download ('-' for stdin) + -a, --batch-file FILE File containing URLs to download ('-' for + stdin) --id Use only video ID in file name - -o, --output TEMPLATE Output filename template. Use %(title)s to get the title, %(uploader)s for the uploader name, %(uploader_id)s for the uploader - nickname if different, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(format)s for - the format description (like "22 - 1280x720" or "HD"), %(format_id)s for the unique id of the format (like YouTube's itags: "137"), - %(upload_date)s for the upload date (YYYYMMDD), %(extractor)s for the provider (youtube, metacafe, etc), %(id)s for the video id, - %(playlist_title)s, %(playlist_id)s, or %(playlist)s (=title if present, ID otherwise) for the playlist the video is in, - %(playlist_index)s for the position in the playlist. %(height)s and %(width)s for the width and height of the video format. - %(resolution)s for a textual description of the resolution of the video format. %% for a literal percent. Use - to output to stdout. - Can also be used to download to a different directory, for example with -o '/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s' . - --autonumber-size NUMBER Specify the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given - --restrict-filenames Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames - -A, --auto-number [deprecated; use -o "%(autonumber)s-%(title)s.%(ext)s" ] Number downloaded files starting from 00000 - -t, --title [deprecated] Use title in file name (default) + -o, --output TEMPLATE Output filename template. Use %(title)s to + get the title, %(uploader)s for the + uploader name, %(uploader_id)s for the + uploader nickname if different, + %(autonumber)s to get an automatically + incremented number, %(ext)s for the + filename extension, %(format)s for the + format description (like "22 - 1280x720" or + "HD"), %(format_id)s for the unique id of + the format (like YouTube's itags: "137"), + %(upload_date)s for the upload date + (YYYYMMDD), %(extractor)s for the provider + (youtube, metacafe, etc), %(id)s for the + video id, %(playlist_title)s, + %(playlist_id)s, or %(playlist)s (=title if + present, ID otherwise) for the playlist the + video is in, %(playlist_index)s for the + position in the playlist. %(height)s and + %(width)s for the width and height of the + video format. %(resolution)s for a textual + description of the resolution of the video + format. %% for a literal percent. Use - to + output to stdout. Can also be used to + download to a different directory, for + example with -o '/my/downloads/%(uploader)s + /%(title)s-%(id)s.%(ext)s' . 
+ --autonumber-size NUMBER Specify the number of digits in + %(autonumber)s when it is present in output + filename template or --auto-number option + is given + --restrict-filenames Restrict filenames to only ASCII + characters, and avoid "&" and spaces in + filenames + -A, --auto-number [deprecated; use -o + "%(autonumber)s-%(title)s.%(ext)s" ] Number + downloaded files starting from 00000 + -t, --title [deprecated] Use title in file name + (default) -l, --literal [deprecated] Alias of --title -w, --no-overwrites Do not overwrite files - -c, --continue Force resume of partially downloaded files. By default, youtube-dl will resume downloads if possible. - --no-continue Do not resume partially downloaded files (restart from beginning) - --no-part Do not use .part files - write directly into output file - --no-mtime Do not use the Last-modified header to set the file modification time - --write-description Write video description to a .description file + -c, --continue Force resume of partially downloaded files. + By default, youtube-dl will resume + downloads if possible. + --no-continue Do not resume partially downloaded files + (restart from beginning) + --no-part Do not use .part files - write directly + into output file + --no-mtime Do not use the Last-modified header to set + the file modification time + --write-description Write video description to a .description + file --write-info-json Write video metadata to a .info.json file - --write-annotations Write video annotations to a .annotations.xml file - --load-info FILE JSON file containing the video information (created with the "--write-info-json" option) - --cookies FILE File to read cookies from and dump cookie jar in - --cache-dir DIR Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl - or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may - change. + --write-annotations Write video annotations to a + .annotations.xml file + --load-info FILE JSON file containing the video information + (created with the "--write-info-json" + option) + --cookies FILE File to read cookies from and dump cookie + jar in + --cache-dir DIR Location in the filesystem where youtube-dl + can store some downloaded information + permanently. By default $XDG_CACHE_HOME + /youtube-dl or ~/.cache/youtube-dl . At the + moment, only YouTube player files (for + videos with obfuscated signatures) are + cached, but that may change. --no-cache-dir Disable filesystem caching --rm-cache-dir Delete all filesystem cache files ## Thumbnail images: --write-thumbnail Write thumbnail image to disk --write-all-thumbnails Write all thumbnail image formats to disk - --list-thumbnails Simulate and list all available thumbnail formats + --list-thumbnails Simulate and list all available thumbnail + formats ## Verbosity / Simulation Options: -q, --quiet Activate quiet mode --no-warnings Ignore warnings - -s, --simulate Do not download the video and do not write anything to disk + -s, --simulate Do not download the video and do not write + anything to disk --skip-download Do not download the video -g, --get-url Simulate, quiet but print URL -e, --get-title Simulate, quiet but print title @@ -162,78 +272,135 @@ which means you can modify it, redistribute it or use it however you like. 
--get-duration Simulate, quiet but print video length --get-filename Simulate, quiet but print output filename --get-format Simulate, quiet but print output format - -j, --dump-json Simulate, quiet but print JSON information. See --output for a description of available keys. - -J, --dump-single-json Simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole playlist - information in a single line. - --print-json Be quiet and print the video information as JSON (video is still being downloaded). + -j, --dump-json Simulate, quiet but print JSON information. + See --output for a description of available + keys. + -J, --dump-single-json Simulate, quiet but print JSON information + for each command-line argument. If the URL + refers to a playlist, dump the whole + playlist information in a single line. + --print-json Be quiet and print the video information as + JSON (video is still being downloaded). --newline Output progress bar as new lines --no-progress Do not print progress bar --console-title Display progress in console titlebar -v, --verbose Print various debugging information - --dump-pages Print downloaded pages encoded using base64 to debug problems (very verbose) - --write-pages Write downloaded intermediary pages to files in the current directory to debug problems + --dump-pages Print downloaded pages encoded using base64 + to debug problems (very verbose) + --write-pages Write downloaded intermediary pages to + files in the current directory to debug + problems --print-traffic Display sent and read HTTP traffic -C, --call-home Contact the youtube-dl server for debugging - --no-call-home Do NOT contact the youtube-dl server for debugging + --no-call-home Do NOT contact the youtube-dl server for + debugging ## Workarounds: --encoding ENCODING Force the specified encoding (experimental) --no-check-certificate Suppress HTTPS certificate validation - --prefer-insecure Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube) + --prefer-insecure Use an unencrypted connection to retrieve + information about the video. (Currently + supported only for YouTube) --user-agent UA Specify a custom user agent - --referer URL Specify a custom referer, use if the video access is restricted to one domain - --add-header FIELD:VALUE Specify a custom HTTP header and its value, separated by a colon ':'. You can use this option multiple times - --bidi-workaround Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH - --sleep-interval SECONDS Number of seconds to sleep before each download. + --referer URL Specify a custom referer, use if the video + access is restricted to one domain + --add-header FIELD:VALUE Specify a custom HTTP header and its value, + separated by a colon ':'. You can use this + option multiple times + --bidi-workaround Work around terminals that lack + bidirectional text support. Requires bidiv + or fribidi executable in PATH + --sleep-interval SECONDS Number of seconds to sleep before each + download. 
## Video Format Options: - -f, --format FORMAT Video format code, see the "FORMAT SELECTION" for all the info + -f, --format FORMAT Video format code, see the "FORMAT + SELECTION" for all the info --all-formats Download all available video formats - --prefer-free-formats Prefer free video formats unless a specific one is requested + --prefer-free-formats Prefer free video formats unless a specific + one is requested -F, --list-formats List all available formats - --youtube-skip-dash-manifest Do not download the DASH manifests and related data on YouTube videos - --merge-output-format FORMAT If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv. Ignored if no - merge is required + --youtube-skip-dash-manifest Do not download the DASH manifests and + related data on YouTube videos + --merge-output-format FORMAT If a merge is required (e.g. + bestvideo+bestaudio), output to given + container format. One of mkv, mp4, ogg, + webm, flv. Ignored if no merge is required ## Subtitle Options: --write-sub Write subtitle file - --write-auto-sub Write automatic subtitle file (YouTube only) - --all-subs Download all the available subtitles of the video + --write-auto-sub Write automatic subtitle file (YouTube + only) + --all-subs Download all the available subtitles of the + video --list-subs List all available subtitles for the video - --sub-format FORMAT Subtitle format, accepts formats preference, for example: "srt" or "ass/srt/best" - --sub-lang LANGS Languages of the subtitles to download (optional) separated by commas, use IETF language tags like 'en,pt' + --sub-format FORMAT Subtitle format, accepts formats + preference, for example: "srt" or + "ass/srt/best" + --sub-lang LANGS Languages of the subtitles to download + (optional) separated by commas, use IETF + language tags like 'en,pt' ## Authentication Options: -u, --username USERNAME Login with this account ID - -p, --password PASSWORD Account password. If this option is left out, youtube-dl will ask interactively. + -p, --password PASSWORD Account password. If this option is left + out, youtube-dl will ask interactively. 
-2, --twofactor TWOFACTOR Two-factor auth code -n, --netrc Use .netrc authentication data --video-password PASSWORD Video password (vimeo, smotri, youku) ## Post-processing Options: - -x, --extract-audio Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe) - --audio-format FORMAT Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "best" by default - --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default - 5) - --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi) + -x, --extract-audio Convert video files to audio-only files + (requires ffmpeg or avconv and ffprobe or + avprobe) + --audio-format FORMAT Specify audio format: "best", "aac", + "vorbis", "mp3", "m4a", "opus", or "wav"; + "best" by default + --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert + a value between 0 (better) and 9 (worse) + for VBR or a specific bitrate like 128K + (default 5) + --recode-video FORMAT Encode the video to another format if + necessary (currently supported: + mp4|flv|ogg|webm|mkv|avi) --postprocessor-args ARGS Give these arguments to the postprocessor - -k, --keep-video Keep the video file on disk after the post-processing; the video is erased by default - --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default - --embed-subs Embed subtitles in the video (only for mkv and mp4 videos) + -k, --keep-video Keep the video file on disk after the post- + processing; the video is erased by default + --no-post-overwrites Do not overwrite post-processed files; the + post-processed files are overwritten by + default + --embed-subs Embed subtitles in the video (only for mkv + and mp4 videos) --embed-thumbnail Embed thumbnail in the audio as cover art --add-metadata Write metadata to the video file - --metadata-from-title FORMAT Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed - parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s - - %(title)s" matches a title like "Coldplay - Paradise" - --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) - --fixup POLICY Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn (the default; - fix file if we can, warn otherwise) - --prefer-avconv Prefer avconv over ffmpeg for running the postprocessors (default) - --prefer-ffmpeg Prefer ffmpeg over avconv for running the postprocessors - --ffmpeg-location PATH Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory. - --exec CMD Execute a command on the file after downloading, similar to find's -exec syntax. Example: --exec 'adb push {} /sdcard/Music/ && rm - {}' - --convert-subtitles FORMAT Convert the subtitles to other format (currently supported: srt|ass|vtt) + --metadata-from-title FORMAT Parse additional metadata like song title / + artist from the video title. The format + syntax is the same as --output, the parsed + parameters replace existing values. + Additional templates: %(album)s, + %(artist)s. 
Example: --metadata-from-title + "%(artist)s - %(title)s" matches a title + like "Coldplay - Paradise" + --xattrs Write metadata to the video file's xattrs + (using dublin core and xdg standards) + --fixup POLICY Automatically correct known faults of the + file. One of never (do nothing), warn (only + emit a warning), detect_or_warn (the + default; fix file if we can, warn + otherwise) + --prefer-avconv Prefer avconv over ffmpeg for running the + postprocessors (default) + --prefer-ffmpeg Prefer ffmpeg over avconv for running the + postprocessors + --ffmpeg-location PATH Location of the ffmpeg/avconv binary; + either the path to the binary or its + containing directory. + --exec CMD Execute a command on the file after + downloading, similar to find's -exec + syntax. Example: --exec 'adb push {} + /sdcard/Music/ && rm {}' + --convert-subtitles FORMAT Convert the subtitles to other format + (currently supported: srt|ass|vtt) # CONFIGURATION diff --git a/docs/supportedsites.md b/docs/supportedsites.md index fa83b68ad..5beb2ecd4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -150,6 +150,7 @@ - **Escapist** - **ESPN** (Currently broken) - **EsriVideo** + - **Europa** - **EveryonesMixtape** - **exfm**: ex.fm - **ExpoTV** @@ -263,6 +264,9 @@ - **Libsyn** - **life:embed** - **lifenews**: LIFE | NEWS + - **limelight** + - **limelight:channel** + - **limelight:channel_list** - **LiveLeak** - **livestream** - **livestream:original** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8f0977849..46c3356e9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.09.28' +__version__ = '2015.10.06' From 4810c48d6d7e950c2cb1203f4d07bea1ba02c1e1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 6 Oct 2015 14:28:14 +0200 Subject: [PATCH 0048/1286] [compat] Do not compare None <= 0 The result is meaningless (and it emits a warning in cpython2 when called with -3), so handle None before making integer comparisons. 
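A standalone sketch of the pattern this change adopts — rule out the None case explicitly before any numeric comparison. Plain `os.environ` and a fallback of 80 stand in here for the real `compat_getenv` call and fallback tuple; the diff below is the authoritative version.

```
import os

# Read the terminal width from the environment; the variable may be unset.
columns = os.environ.get('COLUMNS')
columns = int(columns) if columns else None

# Compare against 0 only once None has been ruled out; comparing None with
# an int is an error on Python 3 and a warning under `python2 -3`.
if columns is None or columns <= 0:
    columns = 80  # hypothetical fallback value

print(columns)
```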
--- youtube_dl/compat.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index c36c9c23f..1ba4ab78c 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -417,18 +417,18 @@ else: _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines']) def compat_get_terminal_size(fallback=(80, 24)): - columns = compat_getenv('COLUMNS', None) + columns = compat_getenv('COLUMNS') if columns: columns = int(columns) else: columns = None - lines = compat_getenv('LINES', None) + lines = compat_getenv('LINES') if lines: lines = int(lines) else: lines = None - if columns <= 0 or lines <= 0: + if columns is None or lines is None or columns <= 0 or lines <= 0: try: sp = subprocess.Popen( ['stty', 'size'], @@ -438,9 +438,9 @@ else: except Exception: _columns, _lines = _terminal_size(*fallback) - if columns <= 0: + if columns is None or columns <= 0: columns = _columns - if lines <= 0: + if lines is None or lines <= 0: lines = _lines return _terminal_size(columns, lines) From 86be82610c35a684bee97b22c8d9a2a83bab1bba Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 6 Oct 2015 17:43:50 +0200 Subject: [PATCH 0049/1286] release 2015.10.06.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 46c3356e9..4fce70fa6 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.06' +__version__ = '2015.10.06.1' From f2dbc54066f56a98c689099b920e6a596d4ffdfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 6 Oct 2015 22:02:28 +0600 Subject: [PATCH 0050/1286] [compat] Fix wrong lines/columns order stty size is rows x columns --- youtube_dl/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 1ba4ab78c..192e1c515 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -434,7 +434,7 @@ else: ['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = sp.communicate() - _columns, _lines = map(int, out.split()) + _lines, _columns = map(int, out.split()) except Exception: _columns, _lines = _terminal_size(*fallback) From f648e682a7a82f71a278ec92159ac5d343a4b3eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 6 Oct 2015 22:58:18 +0600 Subject: [PATCH 0051/1286] [bandcamp] Prepend download URL with scheme when necessary (Closes #7077) --- youtube_dl/extractor/bandcamp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 505877b77..a27f3e748 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -93,8 +93,8 @@ class BandcampIE(InfoExtractor): final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url') # If we could correctly generate the .rand field the url would be # in the "download_url" key - final_url = self._search_regex( - r'"retry_url":"(.*?)"', final_url_webpage, 'final video URL') + final_url = self._proto_relative_url(self._search_regex( + r'"retry_url":"(.+?)"', final_url_webpage, 'final video URL'), 'http:') return { 'id': video_id, From 83a56686944225137ce646748b8e940a21584941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 6 Oct 2015 
23:08:28 +0600 Subject: [PATCH 0052/1286] [canalplus] Extend video id regex (Closes #7076) --- youtube_dl/extractor/canalplus.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 57e0cda2c..c0ca3a0fa 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -78,7 +78,8 @@ class CanalplusIE(InfoExtractor): if video_id is None: webpage = self._download_webpage(url, display_id) video_id = self._search_regex( - r'<canal:player[^>]+?videoId="(\d+)"', webpage, 'video id') + [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)', r'id=["\']canal_video_player(?P<id>\d+)'], + webpage, 'video id', group='id') info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) doc = self._download_xml(info_url, video_id, 'Downloading video XML') From fc10824cb67feb837ea57d60decc293b7b719cfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 7 Oct 2015 02:43:12 +0600 Subject: [PATCH 0053/1286] [canalplus] PEP 8 --- youtube_dl/extractor/canalplus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index c0ca3a0fa..004372f8d 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -79,7 +79,7 @@ class CanalplusIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_id = self._search_regex( [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)', r'id=["\']canal_video_player(?P<id>\d+)'], - webpage, 'video id', group='id') + webpage, 'video id', group='id') info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) doc = self._download_xml(info_url, video_id, 'Downloading video XML') From 945e5c56e3ccd45cdcc3ee45b9ffcbad7e614f90 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 6 Oct 2015 23:46:16 +0200 Subject: [PATCH 0054/1286] release 2015.10.06.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4fce70fa6..112c78835 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.06.1' +__version__ = '2015.10.06.2' From 139f27827e1d771aba5cf7f1473129073686f5ab Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 7 Oct 2015 06:53:19 +0100 Subject: [PATCH 0055/1286] [nba] skip Legacy Video Files --- youtube_dl/extractor/nba.py | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 3d38d080e..73116c7c6 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -38,11 +38,6 @@ class NBAIE(InfoExtractor): } }] - _BASE_PATHS = { - 'turner': 'http://nba.cdn.turner.com/nba/big', - 'akamai': 'http://nbavod-f.akamaihd.net', - } - _QUALITIES = { '420mp4': { 'width': 400, @@ -54,55 +49,50 @@ class NBAIE(InfoExtractor): 'height': 234, 'preference': 2, }, - '556': { - 'width': 416, - 'height': 234, - 'preference': 3, - }, '480x320_910': { 'width': 480, 'height': 320, - 'preference': 4, + 'preference': 3, }, 'nba_576x324': { 'width': 576, 'height': 324, - 'preference': 5, + 'preference': 4, }, 'nba_640x360': { 'width': 640, 'height': 360, - 'preference': 6, + 'preference': 5, }, '640x360_664b': { 'width': 640, 'height': 360, - 'preference': 7, + 'preference': 6, }, '640x360_664m': { 
'width': 640, 'height': 360, - 'preference': 8, + 'preference': 7, }, '768x432_996': { 'width': 768, 'height': 432, - 'preference': 9, + 'preference': 8, }, '768x432_1404': { 'width': 768, 'height': 432, - 'preference': 10, + 'preference': 9, }, '960x540_2104': { 'width': 960, 'height': 540, - 'preference': 11, + 'preference': 10, }, '1280x720_3072': { 'width': 1280, 'height': 720, - 'preference': 12, + 'preference': 11, }, } @@ -127,11 +117,8 @@ class NBAIE(InfoExtractor): formats = [] for video_file in video_info.find('files').iter('file'): video_url = video_file.text - if not video_url.startswith('http://'): - if video_url.endswith('.m3u8') or video_url.endswith('.f4m'): - video_url = self._BASE_PATHS['akamai'] + video_url - else: - video_url = self._BASE_PATHS['turner'] + video_url + if video_url.startswith('/'): + continue if video_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats(video_url, video_id)) elif video_url.endswith('.f4m'): From ecf6de5b02ad3996f770efd33f9b400d04ac8a85 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 7 Oct 2015 07:09:45 +0100 Subject: [PATCH 0056/1286] [nba] extract width,height and bitrate from format key --- youtube_dl/extractor/nba.py | 68 ++++--------------------------------- 1 file changed, 6 insertions(+), 62 deletions(-) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 73116c7c6..ea1482fc8 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( parse_duration, @@ -38,64 +40,6 @@ class NBAIE(InfoExtractor): } }] - _QUALITIES = { - '420mp4': { - 'width': 400, - 'height': 224, - 'preference': 1, - }, - '416x234': { - 'width': 416, - 'height': 234, - 'preference': 2, - }, - '480x320_910': { - 'width': 480, - 'height': 320, - 'preference': 3, - }, - 'nba_576x324': { - 'width': 576, - 'height': 324, - 'preference': 4, - }, - 'nba_640x360': { - 'width': 640, - 'height': 360, - 'preference': 5, - }, - '640x360_664b': { - 'width': 640, - 'height': 360, - 'preference': 6, - }, - '640x360_664m': { - 'width': 640, - 'height': 360, - 'preference': 7, - }, - '768x432_996': { - 'width': 768, - 'height': 432, - 'preference': 8, - }, - '768x432_1404': { - 'width': 768, - 'height': 432, - 'preference': 9, - }, - '960x540_2104': { - 'width': 960, - 'height': 540, - 'preference': 10, - }, - '1280x720_3072': { - 'width': 1280, - 'height': 720, - 'preference': 11, - }, - } - def _real_extract(self, url): video_id = self._match_id(url) video_info = self._download_xml('http://www.nba.com/video/%s.xml' % video_id, video_id) @@ -125,13 +69,13 @@ class NBAIE(InfoExtractor): formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id)) else: key = video_file.attrib.get('bitrate') - quality = self._QUALITIES[key] + width, height, bitrate = re.search(r'(\d+)x(\d+)(?:_(\d+))?', key).groups() formats.append({ 'format_id': key, 'url': video_url, - 'width': quality['width'], - 'height': quality['height'], - 'preference': quality['preference'], + 'width': int_or_none(width), + 'height': int_or_none(height), + 'tbr': int_or_none(bitrate), }) self._sort_formats(formats) From bd5376c182bd5ee2103aec21144286dbefbeb797 Mon Sep 17 00:00:00 2001 From: Tom Gijselinck <tomgijselinck@gmail.com> Date: Wed, 7 Oct 2015 10:32:44 +0200 Subject: [PATCH 0057/1286] Fix typos and improve grammar and spelling --- README.md | 32 ++++++++++++++++---------------- 1 file 
changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 5ff8ca85e..cf4aebf3d 100644 --- a/README.md +++ b/README.md @@ -404,18 +404,18 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, system wide configuration file is located at `/etc/youtube-dl.conf` and user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. For example, with the following configration file youtube-dl will always extract the audio, not copy the mtime and use proxy: +You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime and use a proxy: ``` --extract-audio --no-mtime --proxy 127.0.0.1:3128 ``` -You can use `--ignore-config` if you want to disable configuration file for a particular youtube-dl run. +You can use `--ignore-config` if you want to disable the configuration file for a particular youtube-dl run. ### Authentication with `.netrc` file ### -You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in shell command history. You can achieve this using [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on per extractor basis. For that you will need to create `.netrc` file in your `$HOME` and restrict permissions to read/write by you only: +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on per extractor basis. For that you will need to create a`.netrc` file in your `$HOME` and restrict permissions to read/write by you only: ``` touch $HOME/.netrc chmod a-rwx,u+rw $HOME/.netrc @@ -429,13 +429,13 @@ For example: machine youtube login myaccount@gmail.com password my_youtube_password machine twitch login my_twitch_account_name password my_twitch_password ``` -To activate authentication with `.netrc` file you should pass `--netrc` to youtube-dl or place it in [configuration file](#configuration). +To activate authentication with the `.netrc` file you should pass `--netrc` to youtube-dl or place it in the [configuration file](#configuration). -On Windows you may also need to setup `%HOME%` environment variable manually. +On Windows you may also need to setup the `%HOME%` environment variable manually. # OUTPUT TEMPLATE -The `-o` option allows users to indicate a template for the output file names. 
The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parenthesis, followed by a lowercase S. Allowed names are: +The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a lowercase S. Allowed names are: - `id`: The sequence will be replaced by the video identifier. - `url`: The sequence will be replaced by the video URL. @@ -463,18 +463,18 @@ youtube-dl_test_video_.mp4 # A simple file name # FORMAT SELECTION -By default youtube-dl tries to download the best quality, but sometimes you may want to download other format. +By default youtube-dl tries to download the best quality, but sometimes you may want to download in a different format. The simplest case is requesting a specific format, for example `-f 22`. You can get the list of available formats using `--list-formats`, you can also use a file extension (currently it supports aac, m4a, mp3, mp4, ogg, wav, webm) or the special names `best`, `bestvideo`, `bestaudio` and `worst`. If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`. You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f <video-format>+<audio-format>` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`. Format selectors can also be grouped using parentheses, for example if you want to download the best mp4 and webm formats with a height lower than 480 you can use `-f '(mp4,webm)[height<480]'`. -Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. 
If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. +Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading the best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. -If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl. +If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download the best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl. # VIDEO SELECTION -Videos can be filtered by their upload date using the options `--date`, `--datebefore` or `--dateafter`, they accept dates in two formats: +Videos can be filtered by their upload date using the options `--date`, `--datebefore` or `--dateafter`. They accept dates in two formats: - Absolute dates: Dates in the format `YYYYMMDD`. - Relative dates: Dates in the format `(now|today)[+-][0-9](day|week|month|year)(s)?` @@ -488,7 +488,7 @@ $ youtube-dl --dateafter now-6months # Download only the videos uploaded on January 1, 1970 $ youtube-dl --date 19700101 -$ # will only download the videos uploaded in the 200x decade +$ # Download only the videos uploaded in the 200x decade $ youtube-dl --dateafter 20000101 --datebefore 20091231 ``` @@ -500,7 +500,7 @@ If you've followed [our manual installation instructions](http://rg3.github.io/y If you have used pip, a simple `sudo pip install -U youtube-dl` is sufficient to update. -If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. 
Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distributions serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. +If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. As a last resort, you can also uninstall the version installed by your package manager and follow our manual installation instructions. For that, remove the distribution's package, with a line like @@ -558,7 +558,7 @@ YouTube requires an additional signature since September 2012 which is not suppo ### Video URL contains an ampersand and I'm getting some strange output `[1] 2839` or `'v' is not recognized as an internal or external command` ### -That's actually the output from your shell. Since ampersand is one of the special shell characters it's interpreted by shell preventing you from passing the whole URL to youtube-dl. To disable your shell from interpreting the ampersands (or any other special characters) you have to either put the whole URL in quotes or escape them with a backslash (which approach will work depends on your shell). +That's actually the output from your shell. Since ampersand is one of the special shell characters it's interpreted by the shell preventing you from passing the whole URL to youtube-dl. To disable your shell from interpreting the ampersands (or any other special characters) you have to either put the whole URL in quotes or escape them with a backslash (which approach will work depends on your shell). For example if your URL is https://www.youtube.com/watch?t=4&v=BaW_jenozKc you should end up with following command: @@ -618,9 +618,9 @@ Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the opt ### How do I pass cookies to youtube-dl? -Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. Note that cookies file must be in Mozilla/Netscape format and the first line of cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in cookies file and convert newlines if necessary to correspond your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. +Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. 
Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. -Passing cookies to youtube-dl is a good way to workaround login when particular extractor does not implement it explicitly. +Passing cookies to youtube-dl is a good way to workaround login when a particular extractor does not implement it explicitly. ### Can you add support for this anime video site, or site which shows current movies for free? @@ -720,7 +720,7 @@ If you want to add support for a new site, you can follow this quick list (assum } ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. 8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: From 6a11bb77baf9f70da76f2595b74061b31223d4ff Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 7 Oct 2015 12:17:32 +0100 Subject: [PATCH 0058/1286] [nba] add support for team subsites --- youtube_dl/extractor/nba.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index ea1482fc8..a0cc58c12 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -10,13 +10,13 @@ from ..utils import ( class NBAIE(InfoExtractor): - _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video/(?P<id>[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' + _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)?video/(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - 'md5': '9d902940d2a127af3f7f9d2f3dc79c96', + 'md5': '9e7729d3010a9c71506fd1248f74e4f4', 'info_dict': { 'id': '0021200253-okc-bkn-recap', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Thunder vs. 
Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, @@ -27,7 +27,7 @@ class NBAIE(InfoExtractor): 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, },{ - 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { 'id': '0041400301-cle-atl-recap', @@ -41,8 +41,8 @@ class NBAIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) - video_info = self._download_xml('http://www.nba.com/video/%s.xml' % video_id, video_id) + path, video_id = re.match(self._VALID_URL, url).groups() + video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id) video_id = video_info.find('slug').text title = video_info.find('headline').text description = video_info.find('description').text @@ -64,9 +64,9 @@ class NBAIE(InfoExtractor): if video_url.startswith('/'): continue if video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id)) + formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls')) elif video_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id)) + formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id, f4m_id='hds')) else: key = video_file.attrib.get('bitrate') width, height, bitrate = re.search(r'(\d+)x(\d+)(?:_(\d+))?', key).groups() From db7a28eccb743db4e546a12fb78ae5dc40ef44d9 Mon Sep 17 00:00:00 2001 From: AndroKev <AndroKev@users.noreply.github.com> Date: Thu, 8 Oct 2015 09:56:39 +0200 Subject: [PATCH 0059/1286] FIX: nowtv now the download works for me thx to http://board.gulli.com/thread/1251646-videos-von-webseiten-runterladen-sammelthread-alle-fragen-hier-rein-/?p=14980081#post14980081 --- youtube_dl/extractor/nowtv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index c8257719f..b0bdffc4e 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -167,8 +167,8 @@ class NowTVIE(InfoExtractor): 'app': app, 'play_path': 'mp4:%s' % play_path, 'ext': 'flv', - 'page_url': url, - 'player_url': 'http://rtl-now.rtl.de/includes/nc_player.swf', + 'page_url': 'http://rtlnow.rtl.de', + 'player_url': 'http://cdn.static-fra.de/now/vodplayer.swf', 'tbr': int_or_none(item.get('bitrate')), }) self._sort_formats(formats) From b90b0c4ffa6f84a2ea5556d4df99de5f8ef2c7dd Mon Sep 17 00:00:00 2001 From: kitty <magicvidyakitty@gmail.com> Date: Thu, 8 Oct 2015 06:31:23 -0700 Subject: [PATCH 0060/1286] Fixed 4tube.com extractor to pull metadata from associated Javascript and not the HTML of the desired page. 
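Roughly, the new flow is: read the URL of the external player script out of the page, download that script, and parse the initialization parameters from it instead of from the page HTML. A minimal standalone sketch of that idea (the regexes mirror the diff below; the plain re/urllib code and the helper name are illustrative only, not part of the extractor):

    import json
    import re

    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib2 import urlopen  # Python 2


    def extract_4tube_player_params(page_html):
        # The page itself no longer carries the AJAX parameters; it only
        # references an external player script via <script id="playerembed">.
        player_match = re.search(
            r'<script id="playerembed" src="([^"]+)">', page_html)
        if player_match is None:
            return None
        player_url = player_match.group(1)  # assumed to be an absolute URL

        player_js = urlopen(player_url).read().decode('utf-8')

        # The player script ends in an immediately-invoked function whose
        # numeric arguments carry the media id and the available quality sources.
        params_js = re.search(
            r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)',
            player_js).group(1)
        params = json.loads('[%s]' % params_js)
        media_id, sources = params[0], params[2]
        return media_id, sources
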
--- youtube_dl/extractor/fourtube.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index 3bb4f6239..226ee67f0 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -45,11 +45,9 @@ class FourTubeIE(InfoExtractor): 'uploadDate', webpage)) thumbnail = self._html_search_meta('thumbnailUrl', webpage) uploader_id = self._html_search_regex( - r'<a class="img-avatar" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">', - webpage, 'uploader id') + r'<a class="img-avatar" href="[^"]+/users/([^/"]+)" title="Go to [^"]+ page">', webpage, 'uploader id') uploader = self._html_search_regex( - r'<a class="img-avatar" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">', - webpage, 'uploader') + r'<a class="img-avatar" href="[^"]+/users/[^/"]+" title="Go to ([^"]+) page">', webpage, 'uploader') categories_html = self._search_regex( r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="list">(.*?)</ul>', @@ -68,9 +66,12 @@ class FourTubeIE(InfoExtractor): webpage, 'like count', fatal=False)) duration = parse_duration(self._html_search_meta('duration', webpage)) + player_url = self._search_regex(r'<script id="playerembed" src="([^"]+)">',webpage,'player javascript') + player_js = self._download_webpage(player_url,video_id,'Downloading player Javascript') + params_js = self._search_regex( r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)', - webpage, 'initialization parameters' + player_js, 'initialization parameters' ) params = self._parse_json('[%s]' % params_js, video_id) media_id = params[0] From 96c48553317cf28ea92d7feaa5701e3ebd93ea25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 8 Oct 2015 20:21:07 +0600 Subject: [PATCH 0061/1286] [4tube] Style and make more robust --- youtube_dl/extractor/fourtube.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index 226ee67f0..406387e57 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -66,8 +66,11 @@ class FourTubeIE(InfoExtractor): webpage, 'like count', fatal=False)) duration = parse_duration(self._html_search_meta('duration', webpage)) - player_url = self._search_regex(r'<script id="playerembed" src="([^"]+)">',webpage,'player javascript') - player_js = self._download_webpage(player_url,video_id,'Downloading player Javascript') + player_js = self._download_webpage( + self._search_regex( + r'<script[^>]id=(["\'])playerembed\1[^>]+src=(["\'])(?P<url>.+?)\2', + webpage, 'player JS', group='url'), + video_id, 'Downloading player JS') params_js = self._search_regex( r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)', From c677e49bd11c3dfada05de9fc1d4de04a577d70e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 8 Oct 2015 20:22:08 +0600 Subject: [PATCH 0062/1286] [4tube] Revert uploader regexes and make non fatal --- youtube_dl/extractor/fourtube.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index 406387e57..cd7668f18 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -45,9 +45,11 @@ class FourTubeIE(InfoExtractor): 'uploadDate', webpage)) thumbnail = self._html_search_meta('thumbnailUrl', webpage) uploader_id = self._html_search_regex( - r'<a 
class="img-avatar" href="[^"]+/users/([^/"]+)" title="Go to [^"]+ page">', webpage, 'uploader id') + r'<a class="img-avatar" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">', + webpage, 'uploader id', fatal=False) uploader = self._html_search_regex( - r'<a class="img-avatar" href="[^"]+/users/[^/"]+" title="Go to ([^"]+) page">', webpage, 'uploader') + r'<a class="img-avatar" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">', + webpage, 'uploader', fatal=False) categories_html = self._search_regex( r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="list">(.*?)</ul>', From 4356d907c1afae09ff67a17c90b53be466b13623 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 8 Oct 2015 20:49:56 +0600 Subject: [PATCH 0063/1286] [4tube] Try extracting sources from the webpage before fetching player.js (Closes #7103) --- youtube_dl/extractor/fourtube.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index cd7668f18..fb6d108c0 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -68,19 +68,24 @@ class FourTubeIE(InfoExtractor): webpage, 'like count', fatal=False)) duration = parse_duration(self._html_search_meta('duration', webpage)) - player_js = self._download_webpage( - self._search_regex( - r'<script[^>]id=(["\'])playerembed\1[^>]+src=(["\'])(?P<url>.+?)\2', - webpage, 'player JS', group='url'), - video_id, 'Downloading player JS') - - params_js = self._search_regex( - r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)', - player_js, 'initialization parameters' - ) - params = self._parse_json('[%s]' % params_js, video_id) - media_id = params[0] - sources = ['%s' % p for p in params[2]] + media_id = self._search_regex( + r'<button[^>]+data-id=(["\'])(?P<id>\d+)\1[^>]+data-quality=', webpage, + 'media id', default=None, group='id') + sources = [ + quality + for _, quality in re.findall(r'<button[^>]+data-quality=(["\'])(.+?)\1', webpage)] + if not (media_id and sources): + player_js = self._download_webpage( + self._search_regex( + r'<script[^>]id=(["\'])playerembed\1[^>]+src=(["\'])(?P<url>.+?)\2', + webpage, 'player JS', group='url'), + video_id, 'Downloading player JS') + params_js = self._search_regex( + r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)', + player_js, 'initialization parameters') + params = self._parse_json('[%s]' % params_js, video_id) + media_id = params[0] + sources = ['%s' % p for p in params[2]] token_url = 'http://tkn.4tube.com/{0}/desktop/{1}'.format( media_id, '+'.join(sources)) From 7faf7e752342b21772c8647902bbf4f13cc57956 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 8 Oct 2015 18:39:24 +0100 Subject: [PATCH 0064/1286] [pbs] detect errors from http error code --- youtube_dl/extractor/pbs.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 6923c6094..82218b933 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -39,6 +39,7 @@ class PBSIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + 'skip': 'Expired', }, { 'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', @@ -108,12 +109,12 @@ class PBSIE(InfoExtractor): { 'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/', 'info_dict': { - 'id': '2280706814', + 'id': 
'2276541483', 'display_id': 'player', 'ext': 'mp4', - 'title': 'American Experience - Death and the Civil War', + 'title': 'American Experience - Death and the Civil War, Chapter 1', 'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.', - 'duration': 6705, + 'duration': 682, 'thumbnail': 're:^https?://.*\.jpg$', }, 'params': { @@ -134,6 +135,7 @@ class PBSIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + 'skip': 'Expired', }, { # Video embedded in iframe containing angle brackets as attribute's value (e.g. @@ -231,13 +233,18 @@ class PBSIE(InfoExtractor): 'Downloading %s video url info' % encoding_name) if redirect_info['status'] == 'error': - if redirect_info['http_code'] == 403: - message = ( - 'The video is not available in your region due to ' - 'right restrictions') + http_code = redirect_info['http_code'] + if http_code == 403: + message = 'We\'re sorry, but this video is not available in your region due to right restrictions.' + elif http_code == 101: + message = 'We\'re sorry, but this video is not yet available.' + elif http_code == 404: + message = 'We are experiencing technical difficulties that are preventing us from playing the video at this time. Please check back again soon.' + elif http_code == 410: + message = 'This video has expired and is no longer available for online streaming.' else: message = redirect_info['message'] - raise ExtractorError(message, expected=True) + raise ExtractorError('PBS said: %s' % message, expected=True) format_url = redirect_info.get('url') if not format_url: From 0553d0ee40b6ecd62ed25ba8999aa2e4f7641590 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 8 Oct 2015 18:57:57 +0100 Subject: [PATCH 0065/1286] [pbs] place errors into a dict --- youtube_dl/extractor/pbs.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 82218b933..814e97f48 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -39,7 +39,6 @@ class PBSIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, - 'skip': 'Expired', }, { 'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', @@ -156,6 +155,12 @@ class PBSIE(InfoExtractor): }, } ] + _ERRORS = { + 101: 'We\'re sorry, but this video is not yet available.', + 403: 'We\'re sorry, but this video is not available in your region due to right restrictions.', + 404: 'We are experiencing technical difficulties that are preventing us from playing the video at this time. Please check back again soon.', + 410: 'This video has expired and is no longer available for online streaming.', + } def _extract_webpage(self, url): mobj = re.match(self._VALID_URL, url) @@ -233,18 +238,7 @@ class PBSIE(InfoExtractor): 'Downloading %s video url info' % encoding_name) if redirect_info['status'] == 'error': - http_code = redirect_info['http_code'] - if http_code == 403: - message = 'We\'re sorry, but this video is not available in your region due to right restrictions.' - elif http_code == 101: - message = 'We\'re sorry, but this video is not yet available.' - elif http_code == 404: - message = 'We are experiencing technical difficulties that are preventing us from playing the video at this time. Please check back again soon.' 
- elif http_code == 410: - message = 'This video has expired and is no longer available for online streaming.' - else: - message = redirect_info['message'] - raise ExtractorError('PBS said: %s' % message, expected=True) + raise ExtractorError('PBS said: %s' % self._ERRORS.get(redirect_info['http_code'], redirect_info['message']), expected=True) format_url = redirect_info.get('url') if not format_url: From 9d5fb3b58d95fbd8c28d9c0eaf4a652660324d9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 9 Oct 2015 00:09:10 +0600 Subject: [PATCH 0066/1286] [pbs] Carry long line --- youtube_dl/extractor/pbs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 814e97f48..3448736a2 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -238,7 +238,11 @@ class PBSIE(InfoExtractor): 'Downloading %s video url info' % encoding_name) if redirect_info['status'] == 'error': - raise ExtractorError('PBS said: %s' % self._ERRORS.get(redirect_info['http_code'], redirect_info['message']), expected=True) + raise ExtractorError( + '%s said: %s' % ( + self.IE_NAME, + self._ERRORS.get(redirect_info['http_code'], redirect_info['message'])), + expected=True) format_url = redirect_info.get('url') if not format_url: From 54a5428518adc4ddca085bdac471bc4f286024e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 9 Oct 2015 03:54:49 +0600 Subject: [PATCH 0067/1286] [dailymotion] Update player v5 regex (Closes #7107) --- youtube_dl/extractor/dailymotion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 2d90b2224..80a05cfee 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -119,7 +119,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): webpage, 'comment count', fatal=False)) player_v5 = self._search_regex( - r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', + [r'buildPlayer\(({.+?})\);', r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);'], webpage, 'player v5', default=None) if player_v5: player = self._parse_json(player_v5, video_id) From 57935b2564c082b90a60468d4c844b219118886a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 9 Oct 2015 14:11:00 +0800 Subject: [PATCH 0068/1286] [extractor/common] Allow HTML5 unquoted attribute values Fixes #7108 HTML5 allows unquoted attribute values. See the "Unquoted attribute value syntax" section [1] for more information [1] http://www.w3.org/TR/html5/syntax.html --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dbae75406..242618c58 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -646,7 +646,7 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' - property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) + property_re = r'(?:name|property)=[\'"]?og:%s[\'"]?' 
% re.escape(prop) template = r'<meta[^>]+?%s[^>]+?%s' return [ template % (property_re, content_re), From ef47b2c15f3fa9d9d491090f2ea46d2bd9967d21 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 9 Oct 2015 09:09:22 +0200 Subject: [PATCH 0069/1286] release 2015.10.09 --- CONTRIBUTING.md | 2 +- youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 57a94231d..32c2fd84c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -124,7 +124,7 @@ If you want to add support for a new site, you can follow this quick list (assum } ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. 8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. 
When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 112c78835..faae8a2d7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.06.2' +__version__ = '2015.10.09' From 1ef1563649374568870e9334cce7055f7c83a817 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 9 Oct 2015 20:08:37 +0100 Subject: [PATCH 0070/1286] [srgssr] Add generic extractor for SRGSSR Group sites --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/srf.py | 104 ------------------------- youtube_dl/extractor/srgssr.py | 130 +++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 105 deletions(-) delete mode 100644 youtube_dl/extractor/srf.py create mode 100644 youtube_dl/extractor/srgssr.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3ace1cc2c..042ad3678 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -591,7 +591,10 @@ from .sportbox import ( SportBoxEmbedIE, ) from .sportdeutschland import SportDeutschlandIE -from .srf import SrfIE +from .srgssr import ( + SRGSSRIE, + SRGSSRPlayIE, +) from .srmediathek import SRMediathekIE from .ssa import SSAIE from .stanfordoc import StanfordOpenClassroomIE diff --git a/youtube_dl/extractor/srf.py b/youtube_dl/extractor/srf.py deleted file mode 100644 index 77eec0bc7..000000000 --- a/youtube_dl/extractor/srf.py +++ /dev/null @@ -1,104 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -from .common import InfoExtractor -from ..utils import ( - determine_ext, - parse_iso8601, - xpath_text, -) - - -class SrfIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.srf\.ch/play(?:er)?/tv/[^/]+/video/(?P<display_id>[^?]+)\?id=|tp\.srgssr\.ch/p/flash\?urn=urn:srf:ais:video:)(?P<id>[0-9a-f\-]{36})' - _TESTS = [{ - 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'md5': '4cd93523723beff51bb4bee974ee238d', - 'info_dict': { - 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'display_id': 'snowden-beantragt-asyl-in-russland', - 'ext': 'm4v', - 'upload_date': '20130701', - 'title': 'Snowden beantragt Asyl in Russland', - 'timestamp': 1372713995, - } - }, { - # No Speichern (Save) button - 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', - 'md5': 'd97e236e80d1d24729e5d0953d276a4f', - 'info_dict': { - 'id': '677f5829-e473-4823-ac83-a1087fe97faa', - 'display_id': 'jaguar-xk120-shadow-und-tornado-dampflokomotive', - 'ext': 'flv', - 'upload_date': '20130710', - 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', - 'timestamp': 1373493600, - }, - }, { - 'url': 'http://www.srf.ch/player/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'only_matching': True, - }, { - 'url': 'https://tp.srgssr.ch/p/flash?urn=urn:srf:ais:video:28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - display_id = re.match(self._VALID_URL, url).group('display_id') or video_id - - video_data = self._download_xml( - 'http://il.srgssr.ch/integrationlayer/1.0/ue/srf/video/play/%s.xml' % 
video_id, - display_id) - - title = xpath_text( - video_data, './AssetMetadatas/AssetMetadata/title', fatal=True) - thumbnails = [{ - 'url': s.text - } for s in video_data.findall('.//ImageRepresentation/url')] - timestamp = parse_iso8601(xpath_text(video_data, './createdDate')) - # The <duration> field in XML is different from the exact duration, skipping - - formats = [] - for item in video_data.findall('./Playlists/Playlist') + video_data.findall('./Downloads/Download'): - for url_node in item.findall('url'): - quality = url_node.attrib['quality'] - full_url = url_node.text - original_ext = determine_ext(full_url) - format_id = '%s-%s' % (quality, item.attrib['protocol']) - if original_ext == 'f4m': - formats.extend(self._extract_f4m_formats( - full_url + '?hdcore=3.4.0', display_id, f4m_id=format_id)) - elif original_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - full_url, display_id, 'mp4', m3u8_id=format_id)) - else: - formats.append({ - 'url': full_url, - 'ext': original_ext, - 'format_id': format_id, - 'quality': 0 if 'HD' in quality else -1, - 'preference': 1, - }) - - self._sort_formats(formats) - - subtitles = {} - subtitles_data = video_data.find('Subtitles') - if subtitles_data is not None: - subtitles_list = [{ - 'url': sub.text, - 'ext': determine_ext(sub.text), - } for sub in subtitles_data] - if subtitles_list: - subtitles['de'] = subtitles_list - - return { - 'id': video_id, - 'display_id': display_id, - 'formats': formats, - 'title': title, - 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py new file mode 100644 index 000000000..addf4d26e --- /dev/null +++ b/youtube_dl/extractor/srgssr.py @@ -0,0 +1,130 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, + qualities, +) + + +class SRGSSRIE(InfoExtractor): + _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=)?urn:(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' + + _ERRORS = { + 'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.', + 'AGERATING18': 'To protect children under the age of 18, this video is only available between 11 p.m. and 5 a.m.', +# 'ENDDATE': 'For legal reasons, this video was only available for a specified period of time.', + 'GEOBLOCK': 'For legal reasons, this video is only available in Switzerland.', + 'LEGAL': 'The video cannot be transmitted for legal reasons.', + 'STARTDATE': 'This video is not yet available. 
Please try again later.', + } + + def _real_extract(self, url): + bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + + media_data = self._download_json( + 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), + media_id)[media_type.capitalize()] + + if media_data.get('block') and media_data['block'] in self._ERRORS: + raise ExtractorError( + '%s said: %s' % ( + self.IE_NAME, + self._ERRORS[media_data['block']]), + expected=True) + + metadata = media_data['AssetMetadatas']['AssetMetadata'][0] + title = metadata['title'] + description = metadata.get('description') + created_date = media_data.get('createdDate') or metadata.get('createdDate') + timestamp = parse_iso8601(created_date) + + thumbnails = [] + for image in media_data['Image']['ImageRepresentations']['ImageRepresentation']: + thumbnails.append({ + 'id': image.get('id'), + 'url': image['url'], + }) + + preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) + formats = [] + for source in media_data['Playlists']['Playlist']: + assets = {} + protocol = source.get('@protocol') + if protocol in ('HTTP-HDS', 'HTTP-HLS'): + for quality in source['url']: + assets[quality['@quality']] = quality['text'] + asset_url = assets.get('HD') or assets.get('HQ') or assets.get('SD') or assets.get('MQ') or assets.get('LQ') + if '.f4m' in asset_url: + formats.extend(self._extract_f4m_formats(asset_url + '?hdcore=3.4.0', media_id, f4m_id='hds')) + elif '.m3u8' in asset_url: + formats.extend(self._extract_m3u8_formats(asset_url, media_id, m3u8_id='hls')) + else: + for asset in source['url']: + asset_url = asset['text'] + ext = None + if asset_url.startswith('rtmp'): + ext = self._search_regex(r'([a-z0-9]+):[^/]+', asset_url, 'ext') + formats.append({ + 'url': asset_url, + 'preference': preference(asset['@quality']), + 'ext': ext, + }) + + downloads = media_data.get('Downloads') + if downloads: + for source in downloads['Download']: + for asset in source['url']: + formats.append({ + 'url': asset['text'], + 'preference': preference(asset['@quality']) + }) + self._sort_formats(formats) + + return { + 'id': media_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'formats': formats, + } + + +class SRGSSRPlayIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swi)\.ch/play/(?:tv|radio)/[^/]+/(?P<type>video|audio)/[^?]+\?id=(?P<id>[0-9a-f\-]{36}|\d+)' + + _TESTS = [{ + 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'md5': '4cd93523723beff51bb4bee974ee238d', + 'info_dict': { + 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'ext': 'm4v', + 'upload_date': '20130701', + 'title': 'Snowden beantragt Asyl in Russland', + 'timestamp': 1372713995, + } + }, { + # No Speichern (Save) button + 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', + 'md5': '0a274ce38fda48c53c01890651985bc6', + 'info_dict': { + 'id': '677f5829-e473-4823-ac83-a1087fe97faa', + 'ext': 'flv', + 'upload_date': '20130710', + 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', + 'description': 'md5:88604432b60d5a38787f152dec89cd56', + 'timestamp': 1373493600, + }, + }, { + 'url': 'http://www.srf.ch/player/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'only_matching': True, + }] + + def _real_extract(self, url): + bu, media_type, 
media_id = re.match(self._VALID_URL, url).groups() + return self.url_result('urn:%s:%s:%s' % (bu, media_type, media_id), 'SRGSSR') From 05ad5409b4fd044169ea0f67b9ae92d555564c4e Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 9 Oct 2015 20:34:03 +0100 Subject: [PATCH 0071/1286] [srgssr] fix regex for swissinfo.ch --- youtube_dl/extractor/srgssr.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index addf4d26e..3b5dcc503 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -96,7 +96,7 @@ class SRGSSRIE(InfoExtractor): class SRGSSRPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swi)\.ch/play/(?:tv|radio)/[^/]+/(?P<type>video|audio)/[^?]+\?id=(?P<id>[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/[^/]+/(?P<type>video|audio)/[^?]+\?id=(?P<id>[0-9a-f\-]{36}|\d+)' _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', @@ -120,11 +120,8 @@ class SRGSSRPlayIE(InfoExtractor): 'description': 'md5:88604432b60d5a38787f152dec89cd56', 'timestamp': 1373493600, }, - }, { - 'url': 'http://www.srf.ch/player/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'only_matching': True, }] def _real_extract(self, url): bu, media_type, media_id = re.match(self._VALID_URL, url).groups() - return self.url_result('urn:%s:%s:%s' % (bu, media_type, media_id), 'SRGSSR') + return self.url_result('urn:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') From 4180a3d8b7c40792b5371ca8804c1dad8fabd56d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 01:44:33 +0600 Subject: [PATCH 0072/1286] [extractor/common] Allow quoteless content attribute in og regexes (Closes #7115) --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 242618c58..0082a4c84 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -645,7 +645,7 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' + content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))' property_re = r'(?:name|property)=[\'"]?og:%s[\'"]?' 
% re.escape(prop) template = r'<meta[^>]+?%s[^>]+?%s' return [ From 47c165c3a9f40dd9a175b11f29f9e2002fdda8a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 05:56:01 +0600 Subject: [PATCH 0073/1286] [vimeo] Fix authentication (Closes #7110) --- youtube_dl/extractor/vimeo.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 50df79ca1..7dd52627d 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -40,6 +40,9 @@ class VimeoBaseInfoExtractor(InfoExtractor): self.report_login() webpage = self._download_webpage(self._LOGIN_URL, None, False) token = self._extract_xsrft(webpage) + vuid = self._search_regex( + r'["\']vuid["\']\s*:\s*(["\'])(?P<vuid>.+?)\1', + webpage, 'vuid', group='vuid') data = urlencode_postdata({ 'action': 'login', 'email': username, @@ -49,6 +52,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): }) login_request = compat_urllib_request.Request(self._LOGIN_URL, data) login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + login_request.add_header('Cookie', 'vuid=%s' % vuid) login_request.add_header('Referer', self._LOGIN_URL) self._download_webpage(login_request, None, False, 'Wrong login info') From 6a959f2e5266dcc6037a33589eb34fad9190c63e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 10 Oct 2015 15:03:01 +0800 Subject: [PATCH 0074/1286] [iqiyi] Update enc_key --- youtube_dl/extractor/iqiyi.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ce1ab3820..0e53cb154 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -191,7 +191,7 @@ class IqiyiIE(InfoExtractor): 'vid': video_id, 'vinfo': 1, 'tm': tm, - 'enc': self.md5_text((enc_key + tail)[1:64:2] + tail), + 'enc': self.md5_text(enc_key + tail), 'qyid': _uuid, 'tn': random.random(), 'um': 0, @@ -205,7 +205,9 @@ class IqiyiIE(InfoExtractor): def get_enc_key(self, swf_url, video_id): # TODO: automatic key extraction - enc_key = 'eac64f22daf001da6ba9aa8da4d501508bbe90a4d4091fea3b0582a85b38c2cc' # last update at 2015-09-23-23 for Zombie::bite + # last update at 2015-10-10 for Zombie::bite + # '7239670519b6ac209a0bee4ef0446a6b24894b8ac2751506e42116212a0d0272e505'[2:66][1::2] + enc_key = '97596c0abee04ab49ba25564161ad225' return enc_key def _real_extract(self, url): From dab062fb6ecd48e0c243a6d030d89b44cd44bd84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 20:34:06 +0600 Subject: [PATCH 0075/1286] [bbc] Add support for videos in news articles embedded with data-playable --- youtube_dl/extractor/bbc.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index cc2f6fed2..b2b39ff21 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -11,6 +11,7 @@ from ..utils import ( int_or_none, parse_duration, parse_iso8601, + unescapeHTML, ) from ..compat import compat_HTTPError @@ -682,6 +683,21 @@ class BBCIE(BBCCoUkIE): [r'data-video-player-vpid="([\da-z]{8})"', r'<param[^>]+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], webpage, 'vpid', default=None) + + duration = None + if not programme_id: + # single video in news article embedded with data-playable (e.g. 
+ # http://www.bbc.com/news/world-us-canada-34473351) + data_playable = self._parse_json( + unescapeHTML(self._search_regex( + r'data-playable="({.+?})"', webpage, 'data playable', default='{}')), + programme_id, fatal=False) + if data_playable: + items = data_playable.get('settings', {}).get('playlistObject', {}).get('items') + if items and isinstance(items, list): + duration = int_or_none(items[0].get('duration')) + programme_id = items[0].get('vpid') + if programme_id: formats, subtitles = self._download_media_selector(programme_id) self._sort_formats(formats) @@ -699,6 +715,7 @@ class BBCIE(BBCCoUkIE): 'title': title, 'description': description, 'timestamp': timestamp, + 'duration': duration, 'formats': formats, 'subtitles': subtitles, } From de66571371e2a9705ecd3aed903880c4846d04ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 20:40:56 +0600 Subject: [PATCH 0076/1286] [bbc] Support multiple videos in articles embedded with playlist.sxml --- youtube_dl/extractor/bbc.py | 38 ++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index b2b39ff21..930637cd7 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -660,23 +660,27 @@ class BBCIE(BBCCoUkIE): r'itemprop="datePublished"[^>]+datetime="([^"]+)"'], webpage, 'date', default=None)) - # single video with playlist.sxml URL (e.g. http://www.bbc.com/sport/0/football/3365340ng) - playlist = self._search_regex( - r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', - webpage, 'playlist', default=None) - if playlist: - programme_id, title, description, duration, formats, subtitles = \ - self._process_legacy_playlist_url(playlist, playlist_id) - self._sort_formats(formats) - return { - 'id': programme_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'subtitles': subtitles, - } + # article with multiple videos embedded with playlist.sxml (e.g. + # http://www.bbc.com/sport/0/football/34475836) + playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage) + if playlists: + entries = [] + for playlist in playlists: + programme_id, title, description, duration, formats, subtitles = \ + self._process_legacy_playlist_url(playlist, playlist_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + playlist_title = self._og_search_title(webpage) + playlist_description = self._og_search_description(webpage) + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) # single video story (e.g. 
http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( From b5d48cb1ef741ef999d038a3bc80943b1ae37a50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 20:55:46 +0600 Subject: [PATCH 0077/1286] [bbc] Add test for article with multiple videos embedded with playlist.sxml --- youtube_dl/extractor/bbc.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 930637cd7..ffe2afa96 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -608,6 +608,14 @@ class BBCIE(BBCCoUkIE): # rtmp download 'skip_download': True, } + }, { + # article with multiple videos embedded with playlist.sxml + 'url': 'http://www.bbc.com/sport/0/football/34475836', + 'info_dict': { + 'id': '34475836', + 'title': 'What Liverpool can expect from Klopp', + }, + 'playlist_count': 3, }, { # single video with playlist URL from weather section 'url': 'http://www.bbc.com/weather/features/33601775', From 975977860d1880bbf5e9996ad77fb7f67622ccbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 20:56:07 +0600 Subject: [PATCH 0078/1286] [bbc] Make playlist title optional --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index ffe2afa96..f943b5fd1 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -687,7 +687,7 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, }) playlist_title = self._og_search_title(webpage) - playlist_description = self._og_search_description(webpage) + playlist_description = self._og_search_description(webpage, default=None) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) # single video story (e.g.
http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) From baf39a1aa8af38ca295e5efdad277ddee5b11eb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 21:32:27 +0600 Subject: [PATCH 0079/1286] [bbc] Add one more scenario for data-playable embeds --- youtube_dl/extractor/bbc.py | 53 ++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index f943b5fd1..df61f0157 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -657,6 +657,20 @@ class BBCIE(BBCCoUkIE): return [], [] + def _extract_from_playlist_sxml(self, url, playlist_id, timestamp): + programme_id, title, description, duration, formats, subtitles = \ + self._process_legacy_playlist_url(url, playlist_id) + self._sort_formats(formats) + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + def _real_extract(self, url): playlist_id = self._match_id(url) @@ -672,20 +686,9 @@ class BBCIE(BBCCoUkIE): # http://www.bbc.com/sport/0/football/34475836) playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage) if playlists: - entries = [] - for playlist in playlists: - programme_id, title, description, duration, formats, subtitles = \ - self._process_legacy_playlist_url(playlist, playlist_id) - self._sort_formats(formats) - entries.append({ - 'id': programme_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'subtitles': subtitles, - }) + entries = [ + self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) + for playlist_url in playlists] playlist_title = self._og_search_title(webpage) playlist_description = self._og_search_description(webpage, default=None) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) @@ -705,10 +708,24 @@ class BBCIE(BBCCoUkIE): r'data-playable="({.+?})"', webpage, 'data playable', default='{}')), programme_id, fatal=False) if data_playable: - items = data_playable.get('settings', {}).get('playlistObject', {}).get('items') - if items and isinstance(items, list): - duration = int_or_none(items[0].get('duration')) - programme_id = items[0].get('vpid') + # data-playable has video vpid in settings.playlistObject.items (e.g. + # http://www.bbc.com/news/world-us-canada-34473351) + settings = data_playable.get('settings', {}) + if settings: + playlist_object = settings.get('playlistObject', {}) + if playlist_object: + items = playlist_object.get('items') + if items and isinstance(items, list): + duration = int_or_none(items[0].get('duration')) + programme_id = items[0].get('vpid') + if not programme_id: + # data-playable has no vpid but has a playlist.sxml URLs + # in otherSettings.playlist (e.g. 
+ # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) + playlist = data_playable.get('otherSettings', {}).get('playlist', {}) + if playlist: + return self._extract_from_playlist_sxml( + playlist.get('progressiveDownloadUrl'), playlist_id, timestamp) if programme_id: formats, subtitles = self._download_media_selector(programme_id) From 87cc0fbd18345117926d27db7312cba663d0f5ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 21:35:20 +0600 Subject: [PATCH 0080/1286] [bbc] Make summary optional in legacy playlist --- youtube_dl/extractor/bbc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index df61f0157..911ccd757 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -401,7 +401,8 @@ class BBCCoUkIE(InfoExtractor): if kind != 'programme' and kind != 'radioProgramme': continue title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text - description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text + description_el = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary') + description = description_el.text if description_el else None def get_programme_id(item): def get_from_attributes(item): From 97ae4d166c43b7e5068a7563bc5fbce385d86c2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ph=E1=BA=A1m=20Ng=E1=BB=8Dc=20Quang=20Nam?= <nampnq@gmail.com> Date: Sat, 10 Oct 2015 22:07:06 +0700 Subject: [PATCH 0081/1286] [zingmp3:album] Add support for playlists Update to also handle playlists: a playlist is the same as an album, just served under a different URL, e.g.: http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html http://mp3.zing.vn/album/Duong-Hong-Loan-apollobee/IWCAACCB.html --- youtube_dl/extractor/zingmp3.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index 7dc1e2f2b..42ac124f0 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -85,16 +85,22 @@ class ZingMp3SongIE(ZingMp3BaseInfoExtractor): class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/album/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html' - _TESTS = [{ - 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', - 'info_dict': { - '_type': 'playlist', - 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái - Bằng Kiều ft. Minh Tuyết | Album 320 lossless', + _VALID_URL = r'https?://mp3\.zing\.vn/(?:album|playlist)/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html' + _TESTS = [ + { + 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', + 'info_dict': { + '_type': 'playlist', + 'id': 'ZWZBWDAF', + 'title': 'Lâu Đài Tình Ái - Bằng Kiều ft.
Minh Tuyết | Album 320 lossless', + }, + 'playlist_count': 10, }, - 'playlist_count': 10, - }] + { + 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', + 'only_matching': True, + } + ] IE_NAME = 'zingmp3:album' IE_DESC = 'mp3.zing.vn albums' From 43abd79950b7ba0c3b6e8a26b624e439036b1c7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 22:05:01 +0600 Subject: [PATCH 0082/1286] [zingmp3:album] Style --- youtube_dl/extractor/zingmp3.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index 42ac124f0..1059dd75f 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -86,21 +86,18 @@ class ZingMp3SongIE(ZingMp3BaseInfoExtractor): class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor): _VALID_URL = r'https?://mp3\.zing\.vn/(?:album|playlist)/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html' - _TESTS = [ - { - 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', - 'info_dict': { - '_type': 'playlist', - 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái - Bằng Kiều ft. Minh Tuyết | Album 320 lossless', - }, - 'playlist_count': 10, + _TESTS = [{ + 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', + 'info_dict': { + '_type': 'playlist', + 'id': 'ZWZBWDAF', + 'title': 'Lâu Đài Tình Ái - Bằng Kiều ft. Minh Tuyết | Album 320 lossless', }, - { - 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', - 'only_matching': True, - } - ] + 'playlist_count': 10, + }, { + 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', + 'only_matching': True, + }] IE_NAME = 'zingmp3:album' IE_DESC = 'mp3.zing.vn albums' From 8119597d6ff52a671fa359378dfbdee13f24f882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 22:08:38 +0600 Subject: [PATCH 0083/1286] [zingmp3] Add fatal flag --- youtube_dl/extractor/zingmp3.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index 1059dd75f..3f46f9049 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -9,9 +9,11 @@ from ..utils import ExtractorError class ZingMp3BaseInfoExtractor(InfoExtractor): - def _extract_item(self, item): + def _extract_item(self, item, fatal=True): error_message = item.find('./errormessage').text if error_message: + if not fatal: + return raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) From f790c43f6e33782c6a6eaaff68bbd30aad875efe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 22:09:21 +0600 Subject: [PATCH 0084/1286] [zingmp3:album] Skip broken items --- youtube_dl/extractor/zingmp3.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index 3f46f9049..437eecb67 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -45,7 +45,9 @@ class ZingMp3BaseInfoExtractor(InfoExtractor): entries = [] for i, item in enumerate(items, 1): - entry = self._extract_item(item) + entry = self._extract_item(item, fatal=False) + if not entry: + continue entry['id'] = '%s-%d' % (id, i) entries.append(entry) From e6174ee9753e044bdeac22c653f5762376cadbac Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 23:01:20 +0600 Subject: [PATCH 0085/1286] [bbc] Extract legacy playlist embedded media --- youtube_dl/extractor/bbc.py | 48 ++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 911ccd757..7bbbc0b90 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -29,6 +29,14 @@ class BBCCoUkIE(InfoExtractor): 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', ] + _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection' + _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' + + _NAMESPACES = ( + _MEDIASELECTION_NS, + _EMP_PLAYLIST_NS, + ) + _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -194,6 +202,7 @@ class BBCCoUkIE(InfoExtractor): def _extract_connection(self, connection, programme_id): formats = [] + kind = connection.get('kind') protocol = connection.get('protocol') supplier = connection.get('supplier') if protocol == 'http': @@ -219,7 +228,7 @@ class BBCCoUkIE(InfoExtractor): else: formats.append({ 'url': href, - 'format_id': supplier, + 'format_id': supplier or kind or protocol, }) elif protocol == 'rtmp': application = connection.get('application', 'ondemand') @@ -239,16 +248,24 @@ class BBCCoUkIE(InfoExtractor): return formats def _extract_items(self, playlist): - return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item') + return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) + + def _findall_ns(self, element, xpath): + elements = [] + for ns in self._NAMESPACES: + elements.extend(element.findall(xpath % ns)) + return elements def _extract_medias(self, media_selection): - error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error') + error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS) + if error is None: + media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS) if error is not None: raise BBCCoUkIE.MediaSelectionError(error.get('id')) - return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media') + return self._findall_ns(media_selection, './{%s}media') def _extract_connections(self, media): - return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection') + return self._findall_ns(media, './{%s}connection') def _extract_video(self, media, programme_id): formats = [] @@ -262,13 +279,14 @@ class BBCCoUkIE(InfoExtractor): conn_formats = self._extract_connection(connection, programme_id) for format in conn_formats: format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), 'width': width, 'height': height, 'vbr': vbr, 'vcodec': vcodec, 'filesize': file_size, }) + if service: + format['format_id'] = '%s_%s' % (service, format['format_id']) formats.extend(conn_formats) return formats @@ -383,7 +401,7 @@ class BBCCoUkIE(InfoExtractor): url, playlist_id, 'Downloading legacy playlist XML') def _extract_from_legacy_playlist(self, playlist, playlist_id): - no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') + no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS) if no_items is not None: reason = no_items.get('reason') if reason == 'preAvailability': @@ -400,8 +418,8 @@ class BBCCoUkIE(InfoExtractor): kind = item.get('kind') if kind != 'programme' and kind != 'radioProgramme': continue - title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text - description_el = 
playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary') + title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text + description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) description = description_el.text if description_el else None def get_programme_id(item): @@ -411,16 +429,18 @@ class BBCCoUkIE(InfoExtractor): if value and re.match(r'^[pb][\da-z]{7}$', value): return value get_from_attributes(item) - mediator = item.find('./{http://bbc.co.uk/2008/emp/playlist}mediator') + mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS) if mediator is not None: return get_from_attributes(mediator) programme_id = get_programme_id(item) duration = int_or_none(item.get('duration')) - # TODO: programme_id can be None and media items can be incorporated right inside - # playlist's item (e.g. http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) - # as f4m and m3u8 - formats, subtitles = self._download_media_selector(programme_id) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + else: + formats, subtitles = self._process_media_selector(item, playlist_id) + programme_id = playlist_id return programme_id, title, description, duration, formats, subtitles From c936d8cc7b247f79dce2f5749dea5c18a08858fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 23:14:25 +0600 Subject: [PATCH 0086/1286] [bbc] Add another test --- youtube_dl/extractor/bbc.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 7bbbc0b90..0a08a57bd 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -559,6 +559,19 @@ class BBCIE(BBCCoUkIE): 'params': { 'skip_download': True, } + }, { + # single video embedded with playlist.sxml in data-playable + 'url': 'http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani', + 'info_dict': { + 'id': '151010_vid_ankara_patlama_ani', + 'ext': 'mp4', + 'title': "Ankara'da patlama anı", + 'timestamp': 1444480325, + 'upload_date': '20151010', + }, + 'params': { + 'skip_download': True, + } }, { # single video embedded with mediaAssetPage.init() (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', From b1ec70e4a9fc53b0ec583f48a5262c9f864db40b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 23:20:30 +0600 Subject: [PATCH 0087/1286] [bbc] Improve data-playable regex --- youtube_dl/extractor/bbc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 0a08a57bd..972abd0d4 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -739,7 +739,8 @@ class BBCIE(BBCCoUkIE): # http://www.bbc.com/news/world-us-canada-34473351) data_playable = self._parse_json( unescapeHTML(self._search_regex( - r'data-playable="({.+?})"', webpage, 'data playable', default='{}')), + r'data-playable=(["\'])(?P<json>{.+?})\1', webpage, + 'data playable', default='{}', group='json')), programme_id, fatal=False) if data_playable: # data-playable has video vpid in settings.playlistObject.items (e.g. 
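A note on the pattern introduced in the patch above: capturing the opening quote in group 1 and requiring the same character again via the \1 backreference lets one regex accept both data-playable="{...}" and data-playable='{...}', while the named group keeps only the JSON payload. A minimal self-contained sketch of the idea follows; it is illustrative only, not the extractor's code, and the sample markup, helper name and vpid value are made up.

    import json
    import re
    from xml.sax.saxutils import unescape  # stand-in for youtube-dl's unescapeHTML

    SAMPLE_PAGE = (
        '<div data-playable=\'{"settings": {"playlistObject": '
        '{"items": [{"vpid": "p0123456", "duration": 47}]}}}\'></div>'
    )

    def parse_data_playable(webpage):
        # Group 1 grabs the opening quote; \1 forces the closing quote to match
        # it, and the named group isolates the JSON between the two quotes.
        mobj = re.search(r'data-playable=(["\'])(?P<json>{.+?})\1', webpage)
        if not mobj:
            return None
        return json.loads(unescape(mobj.group('json')))

    data = parse_data_playable(SAMPLE_PAGE)
    print(data['settings']['playlistObject']['items'][0]['vpid'])  # p0123456
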
From 78f9d843186977c614c5a0f6004732f5d410cd0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 23:40:20 +0600 Subject: [PATCH 0088/1286] [bbc] Support playlists of data-playable --- youtube_dl/extractor/bbc.py | 73 ++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 972abd0d4..a15e67114 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -716,6 +716,8 @@ class BBCIE(BBCCoUkIE): r'itemprop="datePublished"[^>]+datetime="([^"]+)"'], webpage, 'date', default=None)) + entries = [] + # article with multiple videos embedded with playlist.sxml (e.g. # http://www.bbc.com/sport/0/football/34475836) playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage) @@ -723,6 +725,48 @@ class BBCIE(BBCCoUkIE): entries = [ self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) for playlist_url in playlists] + + # news article with multiple videos embedded with data-playable + data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage) + if data_playables: + for _, data_playable_json in data_playables: + data_playable = self._parse_json( + unescapeHTML(data_playable_json), playlist_id, fatal=False) + if not data_playable: + continue + settings = data_playable.get('settings', {}) + if settings: + # data-playable with video vpid in settings.playlistObject.items (e.g. + # http://www.bbc.com/news/world-us-canada-34473351) + playlist_object = settings.get('playlistObject', {}) + if playlist_object: + items = playlist_object.get('items') + if items and isinstance(items, list): + title = playlist_object['title'] + description = playlist_object.get('summary') + duration = int_or_none(items[0].get('duration')) + programme_id = items[0].get('vpid') + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + }) + else: + # data-playable without vpid but with a playlist.sxml URLs + # in otherSettings.playlist (e.g. + # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) + playlist = data_playable.get('otherSettings', {}).get('playlist', {}) + if playlist: + entries.append(self._extract_from_playlist_sxml( + playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) + + if entries: playlist_title = self._og_search_title(webpage) playlist_description = self._og_search_description(webpage, default=None) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) @@ -733,35 +777,6 @@ class BBCIE(BBCCoUkIE): r'<param[^>]+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], webpage, 'vpid', default=None) - duration = None - if not programme_id: - # single video in news article embedded with data-playable (e.g. - # http://www.bbc.com/news/world-us-canada-34473351) - data_playable = self._parse_json( - unescapeHTML(self._search_regex( - r'data-playable=(["\'])(?P<json>{.+?})\1', webpage, - 'data playable', default='{}', group='json')), - programme_id, fatal=False) - if data_playable: - # data-playable has video vpid in settings.playlistObject.items (e.g. 
- # http://www.bbc.com/news/world-us-canada-34473351) - settings = data_playable.get('settings', {}) - if settings: - playlist_object = settings.get('playlistObject', {}) - if playlist_object: - items = playlist_object.get('items') - if items and isinstance(items, list): - duration = int_or_none(items[0].get('duration')) - programme_id = items[0].get('vpid') - if not programme_id: - # data-playable has no vpid but has a playlist.sxml URLs - # in otherSettings.playlist (e.g. - # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) - playlist = data_playable.get('otherSettings', {}).get('playlist', {}) - if playlist: - return self._extract_from_playlist_sxml( - playlist.get('progressiveDownloadUrl'), playlist_id, timestamp) - if programme_id: formats, subtitles = self._download_media_selector(programme_id) self._sort_formats(formats) From f6295bcb04c0e518369c31a44b691ad276fcc22f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 23:45:15 +0600 Subject: [PATCH 0089/1286] [bbc] Remove duration --- youtube_dl/extractor/bbc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index a15e67114..fad1f9ca7 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -794,7 +794,6 @@ class BBCIE(BBCCoUkIE): 'title': title, 'description': description, 'timestamp': timestamp, - 'duration': duration, 'formats': formats, 'subtitles': subtitles, } From 6a747190605229e9cfba5450cf0ecaf435b7a85e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 10 Oct 2015 23:56:55 +0600 Subject: [PATCH 0090/1286] [bbc] Update tests --- youtube_dl/extractor/bbc.py | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index fad1f9ca7..4eae4f52e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -501,8 +501,7 @@ class BBCIE(BBCCoUkIE): ] _TESTS = [{ - # article with multiple videos embedded with data-media-meta containing - # playlist.sxml, externalId and no direct video links + # article with multiple videos embedded with data-playable containing vpids 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', @@ -511,7 +510,7 @@ class BBCIE(BBCCoUkIE): }, 'playlist_count': 2, }, { - # article with multiple videos embedded with data-media-meta (more videos) + # article with multiple videos embedded with data-playable (more videos) 'url': 'http://www.bbc.com/news/business-28299555', 'info_dict': { 'id': 'business-28299555', @@ -522,6 +521,7 @@ class BBCIE(BBCCoUkIE): 'skip': 'Save time', }, { # article with multiple videos embedded with `new SMP()` + # broken 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', 'info_dict': { 'id': '3662a707-0af9-3149-963f-47bea720b460', @@ -529,7 +529,7 @@ class BBCIE(BBCCoUkIE): }, 'playlist_count': 18, }, { - # single video embedded with mediaAssetPage.init() + # single video embedded with data-playable containing vpid 'url': 'http://www.bbc.com/news/world-europe-32041533', 'info_dict': { 'id': 'p02mprgb', @@ -544,9 +544,9 @@ class BBCIE(BBCCoUkIE): 'skip_download': True, } }, { - # article with single video embedded with data-media-meta containing - # direct video links (for now these are extracted) and playlist.xml (with - # media items as f4m and m3u8 - currently unsupported) + # 
article with single video embedded with data-playable containing XML playlist + # with direct video links as progressiveDownloadUrl (for now these are extracted) + # and playlist with f4m and m3u8 as streamingUrl 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', 'info_dict': { 'id': '150615_telabyad_kentin_cogu', @@ -560,20 +560,7 @@ class BBCIE(BBCCoUkIE): 'skip_download': True, } }, { - # single video embedded with playlist.sxml in data-playable - 'url': 'http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani', - 'info_dict': { - 'id': '151010_vid_ankara_patlama_ani', - 'ext': 'mp4', - 'title': "Ankara'da patlama anı", - 'timestamp': 1444480325, - 'upload_date': '20151010', - }, - 'params': { - 'skip_download': True, - } - }, { - # single video embedded with mediaAssetPage.init() (regional section) + # single video embedded with data-playable containing XML playlists (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'info_dict': { 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', @@ -629,7 +616,7 @@ class BBCIE(BBCCoUkIE): 'skip_download': True, } }, { - # single video with playlist.sxml URL + # single video with playlist.sxml URL in playlist param 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', @@ -643,7 +630,7 @@ class BBCIE(BBCCoUkIE): 'skip_download': True, } }, { - # article with multiple videos embedded with playlist.sxml + # article with multiple videos embedded with playlist.sxml in playlist param 'url': 'http://www.bbc.com/sport/0/football/34475836', 'info_dict': { 'id': '34475836', From ae8bdfd1a1548c83ab7df378096da927b5374a29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 11 Oct 2015 00:25:09 +0600 Subject: [PATCH 0091/1286] [bbc] Extract article JSON and actualize tests --- youtube_dl/extractor/bbc.py | 45 ++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 4eae4f52e..b98db95b9 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -11,6 +11,7 @@ from ..utils import ( int_or_none, parse_duration, parse_iso8601, + remove_end, unescapeHTML, ) from ..compat import compat_HTTPError @@ -533,7 +534,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/news/world-europe-32041533', 'info_dict': { 'id': 'p02mprgb', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'duration': 47, 'timestamp': 1427219242, @@ -552,7 +553,6 @@ class BBCIE(BBCCoUkIE): 'id': '150615_telabyad_kentin_cogu', 'ext': 'mp4', 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", - 'duration': 47, 'timestamp': 1434397334, 'upload_date': '20150615', }, @@ -566,7 +566,6 @@ class BBCIE(BBCCoUkIE): 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', - 'duration': 87, 'timestamp': 1434713142, 'upload_date': '20150619', }, @@ -578,7 +577,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', 'info_dict': { 'id': 'p02w6qjc', - 'ext': 'mp4', + 'ext': 'flv', 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... 
I always wondered what happened to you"''', 'duration': 56, }, @@ -605,11 +604,11 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', 'info_dict': { 'id': 'p018zqqg', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Hyundai Santa Fe Sport: Rock star', 'description': 'md5:b042a26142c4154a6e472933cf20793d', - 'timestamp': 1368473503, - 'upload_date': '20130513', + 'timestamp': 1415867444, + 'upload_date': '20141113', }, 'params': { # rtmp download @@ -620,9 +619,8 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:398fca0e2e701c609d726e034fa1fc89', 'duration': 140, }, 'params': { @@ -697,11 +695,26 @@ class BBCIE(BBCCoUkIE): webpage = self._download_webpage(url, playlist_id) - timestamp = parse_iso8601(self._search_regex( - [r'"datePublished":\s*"([^"]+)', - r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"', - r'itemprop="datePublished"[^>]+datetime="([^"]+)"'], - webpage, 'date', default=None)) + timestamp = None + playlist_title = None + playlist_description = None + + ld = self._parse_json( + self._search_regex( + r'(?s)<script type="application/ld\+json">(.+?)</script>', + webpage, 'ld json', default='{}'), + playlist_id, fatal=False) + if ld: + timestamp = parse_iso8601(ld.get('datePublished')) + playlist_title = ld.get('headline') + playlist_description = ld.get('articleBody') + + if not timestamp: + timestamp = parse_iso8601(self._search_regex( + [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"', + r'itemprop="datePublished"[^>]+datetime="([^"]+)"', + r'"datePublished":\s*"([^"]+)',], + webpage, 'date', default=None)) entries = [] @@ -754,8 +767,8 @@ class BBCIE(BBCCoUkIE): playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) if entries: - playlist_title = self._og_search_title(webpage) - playlist_description = self._og_search_description(webpage, default=None) + playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News') + playlist_description = playlist_description or self._og_search_description(webpage, default=None) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) From 55ebae26f937a6a3a1fcb78d2f797972080f8b88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 11 Oct 2015 00:37:39 +0600 Subject: [PATCH 0092/1286] [bbc] Prefer iptv-all mediaset --- youtube_dl/extractor/bbc.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index b98db95b9..f3ded3f9c 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -493,6 +493,9 @@ class BBCIE(BBCCoUkIE): _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)' _MEDIASELECTOR_URLS = [ + # Provides HQ HLS streams but fails with geolocation in some cases when it's + # even not geo restricted at all + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', # Provides more formats, namely direct mp4 links, but fails on some videos with # notukerror for non UK (?) users (e.g. 
# http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) @@ -534,8 +537,9 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/news/world-europe-32041533', 'info_dict': { 'id': 'p02mprgb', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'md5:2868290467291b37feda7863f7a83f54', 'duration': 47, 'timestamp': 1427219242, 'upload_date': '20150324', @@ -577,7 +581,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', 'info_dict': { 'id': 'p02w6qjc', - 'ext': 'flv', + 'ext': 'mp4', 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', 'duration': 56, }, @@ -604,7 +608,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', 'info_dict': { 'id': 'p018zqqg', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Hyundai Santa Fe Sport: Rock star', 'description': 'md5:b042a26142c4154a6e472933cf20793d', 'timestamp': 1415867444, @@ -619,7 +623,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', 'duration': 140, }, From 6f7893653c86c620099d7bf0e3bd4951be8b4ad1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 11 Oct 2015 00:39:28 +0600 Subject: [PATCH 0093/1286] [bbc] PEP 8 --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index f3ded3f9c..68995f81e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -717,7 +717,7 @@ class BBCIE(BBCCoUkIE): timestamp = parse_iso8601(self._search_regex( [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"', r'itemprop="datePublished"[^>]+datetime="([^"]+)"', - r'"datePublished":\s*"([^"]+)',], + r'"datePublished":\s*"([^"]+)'], webpage, 'date', default=None)) entries = [] From f4076bb73686484310a3a2b7c0c4c20db6e186bc Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 10 Oct 2015 19:56:55 +0100 Subject: [PATCH 0094/1286] [vimeo] extract m3u8 manifest and bitrate --- youtube_dl/extractor/vimeo.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 7dd52627d..93638d6b2 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -397,8 +397,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'format_id': format_id, 'width': file_info.get('width'), 'height': file_info.get('height'), + 'tbr': file_info.get('bitrate'), }) formats = [] + hls = config_files.get("hls") + if hls: + formats = self._extract_m3u8_formats(hls['all'], video_id, m3u8_id='hls') for key in ('other', 'sd', 'hd'): formats += files[key] if len(formats) == 0: From 58cd7e173e70ae40a79bb10e08b2c2ea02bc8248 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 10 Oct 2015 12:28:12 +0100 Subject: [PATCH 0095/1286] [adultswim] detect when video needs authentication --- youtube_dl/extractor/adultswim.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 27de07587..0eb21b16d 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -41,7 +41,8 @@ class AdultSwimIE(InfoExtractor): 
'id': 'rQxZvXQ4ROaSOqq-or2Mow', 'title': 'Rick and Morty - Pilot', 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " - } + }, + 'skip': 'This video is only available for registered users', }, { 'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/', 'playlist': [ @@ -84,7 +85,10 @@ class AdultSwimIE(InfoExtractor): def find_video_info(collection, slug): for video in collection.get('videos'): if video.get('slug') == slug: - return video + if video.get('auth'): + raise ExtractorError('This video is only available for registered users', expected=True) + else: + return video @staticmethod def find_collection_by_linkURL(collections, linkURL): @@ -97,7 +101,10 @@ class AdultSwimIE(InfoExtractor): for collection in collections: for video in collection.get('videos'): if video.get('slug') == slug: - return collection, video + if video.get('auth'): + raise ExtractorError('This video is only available for registered users', expected=True) + else: + return collection, video return None, None def _real_extract(self, url): @@ -128,6 +135,8 @@ class AdultSwimIE(InfoExtractor): if video_info is None: if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: video_info = bootstrapped_data['slugged_video'] + if video_info.get('auth'): + raise ExtractorError('This video is only available for registered users', expected=True) else: raise ExtractorError('Unable to find video info') From 00cde0b8dc3a534a929b88da086cc540dc212ea4 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 10 Oct 2015 17:57:05 +0100 Subject: [PATCH 0096/1286] [adultswim] raise ExtractorError if no clips in video_info --- youtube_dl/extractor/adultswim.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 0eb21b16d..35606878d 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -85,10 +85,7 @@ class AdultSwimIE(InfoExtractor): def find_video_info(collection, slug): for video in collection.get('videos'): if video.get('slug') == slug: - if video.get('auth'): - raise ExtractorError('This video is only available for registered users', expected=True) - else: - return video + return video @staticmethod def find_collection_by_linkURL(collections, linkURL): @@ -101,10 +98,7 @@ class AdultSwimIE(InfoExtractor): for collection in collections: for video in collection.get('videos'): if video.get('slug') == slug: - if video.get('auth'): - raise ExtractorError('This video is only available for registered users', expected=True) - else: - return collection, video + return collection, video return None, None def _real_extract(self, url): @@ -135,15 +129,18 @@ class AdultSwimIE(InfoExtractor): if video_info is None: if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: video_info = bootstrapped_data['slugged_video'] - if video_info.get('auth'): - raise ExtractorError('This video is only available for registered users', expected=True) else: raise ExtractorError('Unable to find video info') show = bootstrapped_data['show'] show_title = show['title'] stream = video_info.get('stream') - clips = [stream] if stream else video_info['clips'] + clips = [stream] if stream else video_info.get('clips') + if not clips: + if video_info.get('auth'): + raise ExtractorError('This video is only available for registered users', expected=True) + 
else: + raise ExtractorError('Unable to find clips') segment_ids = [clip['videoPlaybackID'] for clip in clips] episode_id = video_info['id'] From 75f105d455bd0a2226ce4fa78b56c7a344ad6bd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 11 Oct 2015 01:04:41 +0600 Subject: [PATCH 0097/1286] [adultswim] Clarify no media message --- youtube_dl/extractor/adultswim.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 35606878d..130afe791 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -137,10 +137,11 @@ class AdultSwimIE(InfoExtractor): stream = video_info.get('stream') clips = [stream] if stream else video_info.get('clips') if not clips: - if video_info.get('auth'): - raise ExtractorError('This video is only available for registered users', expected=True) - else: - raise ExtractorError('Unable to find clips') + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. You may want to use --cookies.' + if video_info.get('auth') is True else 'Unable to find stream or clips', + expected=True) segment_ids = [clip['videoPlaybackID'] for clip in clips] episode_id = video_info['id'] From e5c209a1bcea206bee684914599c84acf886487c Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 10 Oct 2015 20:34:10 +0100 Subject: [PATCH 0098/1286] [vimeo] add parameters to _extract_m3u8_formats and sort formats --- youtube_dl/extractor/vimeo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 93638d6b2..2ea5f0b79 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -402,11 +402,12 @@ class VimeoIE(VimeoBaseInfoExtractor): formats = [] hls = config_files.get("hls") if hls: - formats = self._extract_m3u8_formats(hls['all'], video_id, m3u8_id='hls') + formats = self._extract_m3u8_formats(hls['all'], video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False) for key in ('other', 'sd', 'hd'): formats += files[key] if len(formats) == 0: raise ExtractorError('No known codec found') + self._sort_formats(formats) subtitles = {} text_tracks = config['request'].get('text_tracks') From fff496c689e44ac96909cf55c9ae746fb6b14e07 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 10 Oct 2015 20:45:34 +0100 Subject: [PATCH 0099/1286] [vimeo] remove check for empty formats --- youtube_dl/extractor/vimeo.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 2ea5f0b79..2051ac9de 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -405,8 +405,6 @@ class VimeoIE(VimeoBaseInfoExtractor): formats = self._extract_m3u8_formats(hls['all'], video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False) for key in ('other', 'sd', 'hd'): formats += files[key] - if len(formats) == 0: - raise ExtractorError('No known codec found') self._sort_formats(formats) subtitles = {} From 35a3ff1d337edd527c73db133d87ed23ca4469f5 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 10 Oct 2015 21:05:29 +0100 Subject: [PATCH 0100/1286] [vimeo] always convert width, height and bitrate to int --- youtube_dl/extractor/vimeo.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py 
b/youtube_dl/extractor/vimeo.py index 2051ac9de..97590d220 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -395,14 +395,14 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': codec_extension, 'url': video_url, 'format_id': format_id, - 'width': file_info.get('width'), - 'height': file_info.get('height'), - 'tbr': file_info.get('bitrate'), + 'width': int_or_none(file_info.get('width')), + 'height': int_or_none(file_info.get('height')), + 'tbr': int_or_none(file_info.get('bitrate')), }) formats = [] - hls = config_files.get("hls") - if hls: - formats = self._extract_m3u8_formats(hls['all'], video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False) + m3u8_url = config_files.get('hls', {}).get('all') + if m3u8_url: + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False) for key in ('other', 'sd', 'hd'): formats += files[key] self._sort_formats(formats) From 68f3b61f0e278891b5a0d6557d297dcfd3fb53cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 11 Oct 2015 02:22:42 +0600 Subject: [PATCH 0101/1286] [vimeo] Update tests --- youtube_dl/extractor/vimeo.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 97590d220..9c173ce07 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -84,12 +84,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'info_dict': { 'id': '56015672', 'ext': 'mp4', - "upload_date": "20121220", - "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", - "uploader_id": "user7108434", - "uploader": "Filippo Valsorda", - "title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", - "duration": 10, + 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", + 'description': 'md5:2d3305bad981a06ff79f027f19865021', + 'upload_date': '20121220', + 'uploader_id': 'user7108434', + 'uploader': 'Filippo Valsorda', + 'duration': 10, }, }, { @@ -102,7 +102,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', - 'description': 'md5:380943ec71b89736ff4bf27183233d09', + 'description': 'md5:fd69a7b8d8c34a4e1d2ec2e4afd6ec30', 'duration': 1595, }, }, @@ -156,7 +156,6 @@ class VimeoIE(VimeoBaseInfoExtractor): }, { 'url': 'http://vimeo.com/76979871', - 'md5': '3363dd6ffebe3784d56f4132317fd446', 'note': 'Video with subtitles', 'info_dict': { 'id': '76979871', From 7c84562945cc5c37979b0d8d786118b1fe31f731 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 11 Oct 2015 02:31:37 +0600 Subject: [PATCH 0102/1286] [vimeo] Fix password protected videos --- youtube_dl/extractor/vimeo.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 9c173ce07..346eb60b4 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -39,10 +39,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): return self.report_login() webpage = self._download_webpage(self._LOGIN_URL, None, False) - token = self._extract_xsrft(webpage) - vuid = self._search_regex( - r'["\']vuid["\']\s*:\s*(["\'])(?P<vuid>.+?)\1', - webpage, 'vuid', group='vuid') + 
token, vuid = self._extract_xsrft_and_vuid(webpage) data = urlencode_postdata({ 'action': 'login', 'email': username, @@ -56,10 +53,14 @@ class VimeoBaseInfoExtractor(InfoExtractor): login_request.add_header('Referer', self._LOGIN_URL) self._download_webpage(login_request, None, False, 'Wrong login info') - def _extract_xsrft(self, webpage): - return self._search_regex( + def _extract_xsrft_and_vuid(self, webpage): + xsrft = self._search_regex( r'xsrft\s*[=:]\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)', webpage, 'login token', group='xsrft') + vuid = self._search_regex( + r'["\']vuid["\']\s*:\s*(["\'])(?P<vuid>.+?)\1', + webpage, 'vuid', group='vuid') + return xsrft, vuid class VimeoIE(VimeoBaseInfoExtractor): @@ -201,7 +202,7 @@ class VimeoIE(VimeoBaseInfoExtractor): password = self._downloader.params.get('videopassword', None) if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token = self._extract_xsrft(webpage) + token, vuid = self._extract_xsrft_and_vuid(webpage) data = urlencode_postdata({ 'password': password, 'token': token, @@ -211,6 +212,7 @@ class VimeoIE(VimeoBaseInfoExtractor): url = url.replace('http://', 'https://') password_request = compat_urllib_request.Request(url + '/password', data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + password_request.add_header('Cookie', 'clip_v=1; vuid=%s' % vuid) password_request.add_header('Referer', url) return self._download_webpage( password_request, video_id, @@ -465,7 +467,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): if password is None: raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True) fields = self._hidden_inputs(login_form) - token = self._extract_xsrft(webpage) + token, vuid = self._extract_xsrft_and_vuid(webpage) fields['token'] = token fields['password'] = password post = urlencode_postdata(fields) @@ -474,6 +476,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): password_url = compat_urlparse.urljoin(page_url, password_path) password_request = compat_urllib_request.Request(password_url, post) password_request.add_header('Content-type', 'application/x-www-form-urlencoded') + password_request.add_header('Cookie', 'vuid=%s' % vuid) self._set_cookie('vimeo.com', 'xsrft', token) return self._download_webpage( From 70cb4d51c91c5b6c72b8564258cb161bcc68626c Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 10 Oct 2015 00:45:23 +0100 Subject: [PATCH 0103/1286] [bild] extract info from json request --- youtube_dl/extractor/bild.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py index 4d8cce1ef..ea84f20f2 100644 --- a/youtube_dl/extractor/bild.py +++ b/youtube_dl/extractor/bild.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, - fix_xml_ampersands, + unescapeHTML, ) @@ -17,7 +17,7 @@ class BildIE(InfoExtractor): 'info_dict': { 'id': '38184146', 'ext': 'mp4', - 'title': 'BILD hat sie getestet', + 'title': 'Das können die neuen iPads ', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 196, 'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. 
', @@ -27,16 +27,13 @@ class BildIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml" - doc = self._download_xml(xml_url, video_id, transform_source=fix_xml_ampersands) - - duration = int_or_none(doc.attrib.get('duration'), scale=1000) + video_data = self._download_json(url.split(".bild.html")[0] + ",view=json.bild.html", video_id) return { 'id': video_id, - 'title': doc.attrib['ueberschrift'], - 'description': doc.attrib.get('text'), - 'url': doc.attrib['src'], - 'thumbnail': doc.attrib.get('img'), - 'duration': duration, + 'title': unescapeHTML(video_data['title']), + 'description': unescapeHTML(video_data.get('description')), + 'url': video_data['clipList'][0]['srces'][0]['src'], + 'thumbnail': video_data.get('poster'), + 'duration': int_or_none(video_data.get('durationSec')), } From d8348c351d5ed4bb820b190e42b9a02e2dfc27a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 11 Oct 2015 19:16:51 +0600 Subject: [PATCH 0104/1286] [bild] Strip title --- youtube_dl/extractor/bild.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py index ea84f20f2..1a0184861 100644 --- a/youtube_dl/extractor/bild.py +++ b/youtube_dl/extractor/bild.py @@ -17,21 +17,22 @@ class BildIE(InfoExtractor): 'info_dict': { 'id': '38184146', 'ext': 'mp4', - 'title': 'Das können die neuen iPads ', + 'title': 'Das können die neuen iPads', + 'description': 'md5:a4058c4fa2a804ab59c00d7244bbf62f', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 196, - 'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. 
', } } def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json(url.split(".bild.html")[0] + ",view=json.bild.html", video_id) + video_data = self._download_json( + url.split('.bild.html')[0] + ',view=json.bild.html', video_id) return { 'id': video_id, - 'title': unescapeHTML(video_data['title']), + 'title': unescapeHTML(video_data['title']).strip(), 'description': unescapeHTML(video_data.get('description')), 'url': video_data['clipList'][0]['srces'][0]['src'], 'thumbnail': video_data.get('poster'), From 03e3b4e1198631b30914e8669b01bf1825a4385c Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 9 Oct 2015 23:36:31 +0100 Subject: [PATCH 0105/1286] [expotv] parse m3u8 manifest --- youtube_dl/extractor/expotv.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py index a38b773e8..23a38c7c1 100644 --- a/youtube_dl/extractor/expotv.py +++ b/youtube_dl/extractor/expotv.py @@ -33,20 +33,24 @@ class ExpoTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) player_key = self._search_regex( r'<param name="playerKey" value="([^"]+)"', webpage, 'player key') - config_url = 'http://client.expotv.com/video/config/%s/%s' % ( - video_id, player_key) config = self._download_json( - config_url, video_id, - note='Downloading video configuration') + 'http://client.expotv.com/video/config/%s/%s' % (video_id, player_key), + video_id, + note='Downloading video configuration') - formats = [{ - 'url': fcfg['file'], - 'height': int_or_none(fcfg.get('height')), - 'format_note': fcfg.get('label'), - 'ext': self._search_regex( - r'filename=.*\.([a-z0-9_A-Z]+)&', fcfg['file'], - 'file extension', default=None), - } for fcfg in config['sources']] + formats = [] + for fcfg in config['sources']: + if fcfg['type'] == 'm3u8': + formats.extend(self._extract_m3u8_formats(fcfg['file'], video_id)) + else: + formats.append({ + 'url': fcfg['file'], + 'height': int_or_none(fcfg.get('height')), + 'format_id': fcfg.get('label'), + 'ext': self._search_regex( + r'filename=.*\.([a-z0-9_A-Z]+)&', fcfg['file'], + 'file extension', default=None), + }) self._sort_formats(formats) title = self._og_search_title(webpage) From 7d49502ab0990972bdb479e6bf32dbc8ffdb3e1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 11 Oct 2015 19:28:00 +0600 Subject: [PATCH 0106/1286] [bild] Make more robust and improve hls extraction --- youtube_dl/extractor/expotv.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py index 23a38c7c1..1585a03bb 100644 --- a/youtube_dl/extractor/expotv.py +++ b/youtube_dl/extractor/expotv.py @@ -34,22 +34,25 @@ class ExpoTVIE(InfoExtractor): player_key = self._search_regex( r'<param name="playerKey" value="([^"]+)"', webpage, 'player key') config = self._download_json( - 'http://client.expotv.com/video/config/%s/%s' % (video_id, player_key), - video_id, - note='Downloading video configuration') + 'http://client.expotv.com/video/config/%s/%s' % (video_id, player_key), + video_id, 'Downloading video configuration') formats = [] for fcfg in config['sources']: - if fcfg['type'] == 'm3u8': - formats.extend(self._extract_m3u8_formats(fcfg['file'], video_id)) + media_url = fcfg.get('file') + if not media_url: + continue + if fcfg.get('type') == 'm3u8': + formats.extend(self._extract_m3u8_formats( + 
media_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')) else: formats.append({ - 'url': fcfg['file'], + 'url': media_url, 'height': int_or_none(fcfg.get('height')), 'format_id': fcfg.get('label'), 'ext': self._search_regex( - r'filename=.*\.([a-z0-9_A-Z]+)&', fcfg['file'], - 'file extension', default=None), + r'filename=.*\.([a-z0-9_A-Z]+)&', media_url, + 'file extension', default=None) or fcfg.get('type'), }) self._sort_formats(formats) From da4daed5ef77b4a7219b1786978065a3606b85bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 11 Oct 2015 19:36:23 +0600 Subject: [PATCH 0107/1286] [vimeo] Do not fail when no hls formats --- youtube_dl/extractor/vimeo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 346eb60b4..6ee3069a8 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -403,7 +403,10 @@ class VimeoIE(VimeoBaseInfoExtractor): formats = [] m3u8_url = config_files.get('hls', {}).get('all') if m3u8_url: - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False) + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False) + if m3u8_formats: + formats.append(m3u8_formats) for key in ('other', 'sd', 'hd'): formats += files[key] self._sort_formats(formats) From 1bd390358205728823ff38a51b12ae35f9468929 Mon Sep 17 00:00:00 2001 From: PC <tioocnt@yandex.com> Date: Tue, 6 Oct 2015 22:28:58 +0100 Subject: [PATCH 0108/1286] chaturbate streams --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/chaturbate.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 youtube_dl/extractor/chaturbate.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3ace1cc2c..75720843c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -76,6 +76,7 @@ from .cbssports import CBSSportsIE from .ccc import CCCIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE +from .chaturbate import ChaturbateIE from .chilloutzone import ChilloutzoneIE from .chirbit import ( ChirbitIE, diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py new file mode 100644 index 000000000..5e24e1e4f --- /dev/null +++ b/youtube_dl/extractor/chaturbate.py @@ -0,0 +1,24 @@ +# encoding: utf-8 + +from .common import InfoExtractor + + +class ChaturbateIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chaturbate\.com/(?P<id>[^/]+)/?$' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8_url = self._search_regex(r"'(https?://.*?\.m3u8)'", webpage, 'playlist') + + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + + return { + 'id': video_id, + 'title': self._live_title(video_id), + 'description': self._html_search_meta('description', webpage, 'description'), + 'is_live': True, + 'thumbnail': 'https://cdn-s.highwebmedia.com/uHK3McUtGCG3SMFcd4ZJsRv8/roomimage/%s.jpg' % (video_id,), + 'formats': formats, + } From 0f61db4469db4c17712b8a3f9e527e845ec96ed4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 11 Oct 2015 20:35:22 +0600 Subject: [PATCH 0109/1286] [chaturbate] Improve and capture error message --- youtube_dl/extractor/chaturbate.py | 36 +++++++++++++++++++++++++----- 1 file 
changed, 31 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index 5e24e1e4f..0b67ba67d 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -1,24 +1,50 @@ -# encoding: utf-8 +from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ExtractorError class ChaturbateIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chaturbate\.com/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.chaturbate.com/siswet19/', + 'info_dict': { + 'id': 'siswet19', + 'ext': 'mp4', + 'title': 're:^siswet19 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'age_limit': 18, + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://en.chaturbate.com/siswet19/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - m3u8_url = self._search_regex(r"'(https?://.*?\.m3u8)'", webpage, 'playlist') + m3u8_url = self._search_regex( + r'src=(["\'])(?P<url>http.+?\.m3u8.*?)\1', webpage, + 'playlist', default=None, group='url') + + if not m3u8_url: + error = self._search_regex( + r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>', + webpage, 'error', group='error') + raise ExtractorError(error, expected=True) formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') return { 'id': video_id, 'title': self._live_title(video_id), - 'description': self._html_search_meta('description', webpage, 'description'), + 'thumbnail': 'https://cdn-s.highwebmedia.com/uHK3McUtGCG3SMFcd4ZJsRv8/roomimage/%s.jpg' % video_id, + 'age_limit': self._rta_search(webpage), 'is_live': True, - 'thumbnail': 'https://cdn-s.highwebmedia.com/uHK3McUtGCG3SMFcd4ZJsRv8/roomimage/%s.jpg' % (video_id,), 'formats': formats, } From f101079ae02560fee2df1ea0cacb1989eefc8e6c Mon Sep 17 00:00:00 2001 From: AndroKev <AndroKev@users.noreply.github.com> Date: Sun, 11 Oct 2015 19:17:24 +0200 Subject: [PATCH 0110/1286] [downloader/rtmp] Respect --no-continue option now when downloading a "rtmp-file" the --no-continue option works! 
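The underlying issue: whether to resume a partial download is a user option, so it lives in the params dict the downloader is constructed with (populated from the command line), while info_dict only describes the media and does not normally carry a 'continuedl' key, so the old info_dict.get() call effectively ignored --no-continue. A rough sketch of the distinction, using made-up class and variable names rather than youtube-dl's real downloader classes:

    class ToyDownloader(object):
        # Illustrative stand-in for the FileDownloader/RtmpFD split, not the
        # real implementation.
        def __init__(self, params):
            # params is built from parsed command-line options; --no-continue
            # ends up here as {'continuedl': False}.
            self.params = params

        def should_resume(self, info_dict):
            # The fix reads the user's option instead of the per-video
            # metadata, which has no say in resume behaviour.
            return self.params.get('continuedl', True)

    fd = ToyDownloader({'continuedl': False})  # i.e. youtube-dl --no-continue URL
    print(fd.should_resume({'url': 'rtmp://example.com/live/stream'}))  # False
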
--- youtube_dl/downloader/rtmp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 7d19bb808..f1d219ba9 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -105,7 +105,7 @@ class RtmpFD(FileDownloader): protocol = info_dict.get('rtmp_protocol', None) real_time = info_dict.get('rtmp_real_time', False) no_resume = info_dict.get('no_resume', False) - continue_dl = info_dict.get('continuedl', True) + continue_dl = self.params.get('continuedl', True) self.report_destination(filename) tmpfilename = self.temp_name(filename) From 964e7b2dd09099bd021a770522684716f341db43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 12 Oct 2015 00:43:54 +0600 Subject: [PATCH 0111/1286] [downloader/common] Always skip "already downloaded" check when outputting to stdout --- youtube_dl/downloader/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 97e755d4b..29a4500d3 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -325,7 +325,7 @@ class FileDownloader(object): ) # Check file already present - if filename != '-' and nooverwrites_and_exists or continuedl_and_exists: + if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists): self.report_file_already_downloaded(filename) self._hook_progress({ 'filename': filename, From 57d1db8dd00b833c08b3ac74fbd589a0216775f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 11 Oct 2015 22:45:13 +0200 Subject: [PATCH 0112/1286] [rtbf] Fix extraction (closes #7133) --- youtube_dl/extractor/rtbf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index e4215d546..04a66df90 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -36,7 +36,7 @@ class RTBFIE(InfoExtractor): data = self._parse_json( unescapeHTML(self._search_regex( - r'data-video="([^"]+)"', webpage, 'data video')), + r'data-media="([^"]+)"', webpage, 'data video')), video_id) if data.get('provider').lower() == 'youtube': From cd7364a89c69c4c19e4b9d4e54d08fd1eff9c51d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 12 Oct 2015 06:37:20 +0200 Subject: [PATCH 0113/1286] release 2015.10.12 --- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 5beb2ecd4..dc0354095 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -81,6 +81,7 @@ - **CBSSports** - **CeskaTelevize** - **channel9**: Channel 9 + - **Chaturbate** - **Chilloutzone** - **chirbit** - **chirbit:profile** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index faae8a2d7..0908e963d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.09' +__version__ = '2015.10.12' From 73e732eb6b5fb2edd61701ae110cc52fe6836364 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 12 Oct 2015 16:34:55 +0800 Subject: [PATCH 0114/1286] [vimeo] Fix m3u8 formats Reported at https://github.com/rg3/youtube-dl/pull/7126#issuecomment-147327584 --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 6ee3069a8..027f47ee3 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -406,7 +406,7 @@ class VimeoIE(VimeoBaseInfoExtractor): m3u8_formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False) if m3u8_formats: - formats.append(m3u8_formats) + formats.extend(m3u8_formats) for key in ('other', 'sd', 'hd'): formats += files[key] self._sort_formats(formats) From ee2edd838a1e8770488e695c380943ded44d0983 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 13 Oct 2015 00:53:05 +0200 Subject: [PATCH 0115/1286] release 2015.10.13 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0908e963d..aaa43d315 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.12' +__version__ = '2015.10.13' From 5946cda7c6f2e4a7eb90fff6f10c66af0ff2a0d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 13 Oct 2015 21:04:39 +0600 Subject: [PATCH 0116/1286] [beeg] Fix extraction (Closes #7155) --- youtube_dl/extractor/beeg.py | 68 +++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index b38057f2f..e6c928699 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -1,65 +1,67 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) class BeegIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)' _TEST = { 'url': 'http://beeg.com/5416503', - 'md5': '1bff67111adb785c51d1b42959ec10e5', + 'md5': '46c384def73b33dbc581262e5ee67cef', 'info_dict': { 'id': '5416503', 'ext': 'mp4', 'title': 'Sultry Striptease', - 'description': 'md5:6db3c6177972822aaba18652ff59c773', - 'categories': list, # NSFW - 'thumbnail': 're:https?://.*\.jpg$', + 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2', + 'timestamp': 1391813355, + 'upload_date': '20140207', + 'duration': 383, + 'tags': list, 'age_limit': 18, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - quality_arr = self._search_regex( - r'(?s)var\s+qualityArr\s*=\s*{\s*(.+?)\s*}', webpage, 'quality formats') - - formats = [{ - 'url': fmt[1], - 'format_id': fmt[0], - 'height': int(fmt[0][:-1]), - } for fmt in re.findall(r"'([^']+)'\s*:\s*'([^']+)'", quality_arr)] + video = self._download_json( + 'http://beeg.com/api/v1/video/%s' % video_id, video_id) + formats = [] + for format_id, video_url in video.items(): + height = self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None) + if not height: + continue + formats.append({ + 'url': self._proto_relative_url(video_url.replace('{DATA_MARKERS}', ''), 'http:'), + 'format_id': format_id, + 'height': int(height), + }) self._sort_formats(formats) - title = self._html_search_regex( - r'<title>([^<]+)\s*-\s*beeg\.?', webpage, 'title') + title = video['title'] + video_id = video.get('id') or video_id + display_id = video.get('code') + description = video.get('desc') - description = self._html_search_regex( - r' Date: Tue, 13 Oct 2015 16:29:16 +0700 
Subject: [PATCH 0117/1286] Extract thumbnail url --- youtube_dl/extractor/yandexmusic.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 4098e4629..6842f834f 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -46,6 +46,14 @@ class YandexMusicTrackIE(InfoExtractor): % (data['host'], key, data['ts'] + data['path'], storage[1])) def _get_track_info(self, track): + album = track['albums'][0] + a_thumb = None + + if 'coverUri' in album: + a_thumb = album['coverUri'] + if a_thumb: + a_thumb = 'http://' + a_thumb.replace('%%', '1000x1000') + return { 'id': track['id'], 'ext': 'mp3', @@ -53,6 +61,7 @@ class YandexMusicTrackIE(InfoExtractor): 'title': '%s - %s' % (track['artists'][0]['name'], track['title']), 'filesize': int_or_none(track.get('fileSize')), 'duration': float_or_none(track.get('durationMs'), 1000), + 'thumbnail': a_thumb, } def _real_extract(self, url): From b30c4992a93d411f4f89faf2af153fc580138a90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 13 Oct 2015 21:14:33 +0200 Subject: [PATCH 0118/1286] [channel9] Return a single dictionary for single videos (closes #7086) Returning a list is deprecated. --- youtube_dl/extractor/channel9.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 3dfc24f5b..79fd0a30e 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -224,12 +224,12 @@ class Channel9IE(InfoExtractor): if contents is None: return contents - authors = self._extract_authors(html) + if len(contents) > 1: + raise ExtractorError('Got more than one entry') + result = contents[0] + result['authors'] = self._extract_authors(html) - for content in contents: - content['authors'] = authors - - return contents + return result def _extract_session(self, html, content_path): contents = self._extract_content(html, content_path) From 506e261d2073d8c00d5b43d272e8173cb0d63728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 13 Oct 2015 21:18:30 +0200 Subject: [PATCH 0119/1286] [channel9] strip 'session_day' --- youtube_dl/extractor/channel9.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 79fd0a30e..1ce004932 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -158,7 +158,7 @@ class Channel9IE(InfoExtractor): def _extract_session_day(self, html): m = re.search(r'
<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
- return m.group('day') if m is not None else None
+ return m.group('day').strip() if m is not None else None
def _extract_session_room(self, html):
m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>
  • ', html) From 3dc582e5ea69af4ad7f51d30c1d87cf93aa6b72b Mon Sep 17 00:00:00 2001 From: kaspi Date: Mon, 12 Oct 2015 01:25:57 -0400 Subject: [PATCH 0120/1286] [fczenit] Add extractor Closes #7143. --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/fczenit.py | 41 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 youtube_dl/extractor/fczenit.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 75720843c..f6d185818 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -167,6 +167,7 @@ from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE +from .fczenit import FczenitIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py new file mode 100644 index 000000000..f1f150ef2 --- /dev/null +++ b/youtube_dl/extractor/fczenit.py @@ -0,0 +1,41 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class FczenitIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P[0-9]+)' + _TEST = { + 'url': 'http://fc-zenit.ru/video/gl6785/', + 'md5': '458bacc24549173fe5a5aa29174a5606', + 'info_dict': { + 'id': '6785', + 'ext': 'mp4', + 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_title = self._html_search_regex(r'
    ([^<]+)', webpage, 'title') + + bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL') + bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw) + + formats = [{ + 'url': furl, + 'tbr': tbr, + } for furl, tbr in bitrates] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_title, + 'formats': formats, + } From e09f58b3bc3af6ce1e541fb7d034fe869fba6e82 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 14 Oct 2015 10:40:54 +0100 Subject: [PATCH 0121/1286] [srgssr] change the url chortcut, fix image extraction ,add a test and extract format id --- youtube_dl/extractor/srgssr.py | 35 ++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index 3b5dcc503..f759e5600 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -12,7 +12,7 @@ from ..utils import ( class SRGSSRIE(InfoExtractor): - _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=)?urn:(?Psrf|rts|rsi|rtr|swi):(?:[^:]+:)?(?Pvideo|audio):(?P[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?Psrf|rts|rsi|rtr|swi):(?:[^:]+:)?(?Pvideo|audio):(?P[0-9a-f\-]{36}|\d+)' _ERRORS = { 'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.', @@ -44,11 +44,12 @@ class SRGSSRIE(InfoExtractor): timestamp = parse_iso8601(created_date) thumbnails = [] - for image in media_data['Image']['ImageRepresentations']['ImageRepresentation']: - thumbnails.append({ - 'id': image.get('id'), - 'url': image['url'], - }) + if 'Image' in media_data: + for image in media_data['Image']['ImageRepresentations']['ImageRepresentation']: + thumbnails.append({ + 'id': image.get('id'), + 'url': image['url'], + }) preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) formats = [] @@ -70,16 +71,17 @@ class SRGSSRIE(InfoExtractor): if asset_url.startswith('rtmp'): ext = self._search_regex(r'([a-z0-9]+):[^/]+', asset_url, 'ext') formats.append({ + 'format_id': asset['@quality'], 'url': asset_url, 'preference': preference(asset['@quality']), 'ext': ext, }) - downloads = media_data.get('Downloads') - if downloads: - for source in downloads['Download']: + if 'Downloads' in media_data: + for source in media_data['Downloads']['Download']: for asset in source['url']: formats.append({ + 'format_id': asset['@quality'], 'url': asset['text'], 'preference': preference(asset['@quality']) }) @@ -120,8 +122,21 @@ class SRGSSRPlayIE(InfoExtractor): 'description': 'md5:88604432b60d5a38787f152dec89cd56', 'timestamp': 1373493600, }, + },{ + 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', + 'info_dict': { + 'id': '63cb0778-27f8-49af-9284-8c7a8c6d15fc', + 'ext': 'mp3', + 'upload_date': '20151013', + 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', + 'timestamp': 1444750398, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, }] def _real_extract(self, url): bu, media_type, media_id = re.match(self._VALID_URL, url).groups() - return self.url_result('urn:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') + return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') From db8e38b8cff2e67a9ff51104c4a7b33c20650204 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 14 Oct 2015 11:55:03 +0100 Subject: 
[PATCH 0122/1286] [ign] add tests for me.ign specific language urls --- youtube_dl/extractor/ign.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index fa4e67394..fb2753738 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -77,6 +77,14 @@ class IGNIE(InfoExtractor): 'upload_date': '20140814', }, }, + { + 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', + 'only_matching': True, + }, + { + 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', + 'only_matching': True, + }, ] def _find_video_id(self, webpage): From 26669ea3cf596f2ea4bce9e21ce73c1d8fc3ff72 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 10 Oct 2015 16:51:37 +0100 Subject: [PATCH 0123/1286] [5min] extract more video info and formats Closes #7124. --- youtube_dl/extractor/fivemin.py | 84 ++++++++++++++++++++++++++++----- 1 file changed, 71 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 157094e8c..2955965d9 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -2,11 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( - compat_str, compat_urllib_parse, + compat_parse_qs, + compat_urllib_parse_urlparse, + compat_urlparse, ) from ..utils import ( ExtractorError, + parse_duration, + replace_extension, ) @@ -28,6 +32,7 @@ class FiveMinIE(InfoExtractor): 'id': '518013791', 'ext': 'mp4', 'title': 'iPad Mini with Retina Display Review', + 'duration': 177, }, }, { @@ -38,9 +43,52 @@ class FiveMinIE(InfoExtractor): 'id': '518086247', 'ext': 'mp4', 'title': 'How to Make a Next-Level Fruit Salad', + 'duration': 184, }, }, ] + _ERRORS = { + 'ErrorVideoNotExist': 'We\'re sorry, but the video you are trying to watch does not exist.', + 'ErrorVideoNoLongerAvailable': 'We\'re sorry, but the video you are trying to watch is no longer available.', + 'ErrorVideoRejected': 'We\'re sorry, but the video you are trying to watch has been removed.', + 'ErrorVideoUserNotGeo': 'We\'re sorry, but the video you are trying to watch cannot be viewed from your current location.', + 'ErrorVideoLibraryRestriction': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', + 'ErrorExposurePermission': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', + } + _QUALITIES = { + 1: { + 'width': 640, + 'height': 360, + }, + 2: { + 'width': 854, + 'height': 480, + }, + 4: { + 'width': 1280, + 'height': 720, + }, + 8: { + 'width': 1920, + 'height': 1080, + }, + 16: { + 'width': 640, + 'height': 360, + }, + 32: { + 'width': 854, + 'height': 480, + }, + 64: { + 'width': 1280, + 'height': 720, + }, + 128: { + 'width': 640, + 'height': 360, + }, + } def _real_extract(self, url): video_id = self._match_id(url) @@ -59,26 +107,36 @@ class FiveMinIE(InfoExtractor): 'https://syn.5min.com/handlers/SenseHandler.ashx?' 
+ query, video_id) if not response['success']: - err_msg = response['errorMessage'] - if err_msg == 'ErrorVideoUserNotGeo': - msg = 'Video not available from your location' - else: - msg = 'Aol said: %s' % err_msg - raise ExtractorError(msg, expected=True, video_id=video_id) + raise ExtractorError( + '%s said: %s' % ( + self.IE_NAME, + self._ERRORS.get(response['errorMessage'], response['errorMessage'])), + expected=True) info = response['binding'][0] - second_id = compat_str(int(video_id[:-2]) + 1) formats = [] - for quality, height in [(1, 320), (2, 480), (4, 720), (8, 1080)]: - if any(r['ID'] == quality for r in info['Renditions']): + parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs( + compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0]) + for rendition in info['Renditions']: + if rendition['RenditionType'] == 'm3u8': + formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls')) + elif rendition['RenditionType'] == 'aac': + continue + else: + rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) + quality = self._QUALITIES.get(rendition['ID'], {}) formats.append({ - 'format_id': compat_str(quality), - 'url': 'http://avideos.5min.com/%s/%s/%s_%s.mp4' % (second_id[-3:], second_id, video_id, quality), - 'height': height, + 'format_id': '%s-%d' % (rendition['RenditionType'], rendition['ID']), + 'url': rendition_url, + 'width': quality.get('width'), + 'height': quality.get('height'), }) + self._sort_formats(formats) return { 'id': video_id, 'title': info['Title'], + 'thumbnail': info.get('ThumbURL'), + 'duration': parse_duration(info.get('Duration')), 'formats': formats, } From 1f36085df94c2addd1175e7e299f6235aca3ac68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 14 Oct 2015 13:41:39 +0200 Subject: [PATCH 0124/1286] [vimeo] Fix extraction of password protected videos (fixes #7169) --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 027f47ee3..fa1b22049 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -212,7 +212,7 @@ class VimeoIE(VimeoBaseInfoExtractor): url = url.replace('http://', 'https://') password_request = compat_urllib_request.Request(url + '/password', data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Cookie', 'clip_v=1; vuid=%s' % vuid) + password_request.add_header('Cookie', 'clip_test2=1; vuid=%s' % vuid) password_request.add_header('Referer', url) return self._download_webpage( password_request, video_id, From 36bb63fad19df5ee419979f875e2265936511644 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 14 Oct 2015 14:13:53 +0100 Subject: [PATCH 0125/1286] [criterion] fix description extraction --- youtube_dl/extractor/criterion.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py index 4fb178165..dedb810a0 100644 --- a/youtube_dl/extractor/criterion.py +++ b/youtube_dl/extractor/criterion.py @@ -27,9 +27,7 @@ class CriterionIE(InfoExtractor): final_url = self._search_regex( r'so.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url') title = self._og_search_title(webpage) - description = self._html_search_regex( - r'', - webpage, 'video 
description') + description = self._html_search_meta('description', webpage) thumbnail = self._search_regex( r'so.addVariable\("thumbnailURL", "(.+?)"\)\;', webpage, 'thumbnail url') From 7a6d76a64d8a89a08bb79791506fc18b993c4580 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 20:49:39 +0600 Subject: [PATCH 0126/1286] [extractor/common] Require closing quote in _og_regexes (Closes #7174) E.g. do not match `property='og:video:type'` when `og:video` is requested. --- youtube_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0082a4c84..a0c4af92f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -646,7 +646,8 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = r'(?:name|property)=[\'"]?og:%s[\'"]?' % re.escape(prop) + property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), From 1c29e81e620241b9013b23e7acd9d6ab06587fb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 20:58:52 +0600 Subject: [PATCH 0127/1286] [test_InfoExtractor] Add test for 7a6d76a64d8a89a08bb79791506fc18b993c4580 --- test/test_InfoExtractor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index be8d12997..4ce5b5a35 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -35,10 +35,12 @@ class TestInfoExtractor(unittest.TestCase): + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') + self.assertEqual(ie._og_search_video_url(html, default=None), None) def test_html_search_meta(self): ie = self.ie From db0a8ad97993cb3f0c398d3a5dc55389565e0ffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 21:11:06 +0600 Subject: [PATCH 0128/1286] [test_InfoExtractor] Add test for unquoted attribute --- test/test_InfoExtractor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 4ce5b5a35..2a00d09a5 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -36,11 +36,13 @@ class TestInfoExtractor(unittest.TestCase): + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') self.assertEqual(ie._og_search_video_url(html, default=None), None) + self.assertEqual(ie._og_search_property('foobar', html), 'Foo') def test_html_search_meta(self): ie = self.ie From ab953c64a0e8b8558e95d0318110c0885a4eec3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 21:15:29 +0600 Subject: [PATCH 0129/1286] [yandexmusic:track] Extract original size thumbnail (Closes #7160) --- youtube_dl/extractor/yandexmusic.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 6842f834f..08dc81f3a 100644 --- 
a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -46,14 +46,12 @@ class YandexMusicTrackIE(InfoExtractor): % (data['host'], key, data['ts'] + data['path'], storage[1])) def _get_track_info(self, track): - album = track['albums'][0] - a_thumb = None - - if 'coverUri' in album: - a_thumb = album['coverUri'] - if a_thumb: - a_thumb = 'http://' + a_thumb.replace('%%', '1000x1000') - + thumbnail = None + cover_uri = track.get('albums', [{}])[0].get('coverUri') + if cover_uri: + thumbnail = cover_uri.replace('%%', 'orig') + if not thumbnail.startswith('http'): + thumbnail = 'http://' + thumbnail return { 'id': track['id'], 'ext': 'mp3', @@ -61,7 +59,7 @@ class YandexMusicTrackIE(InfoExtractor): 'title': '%s - %s' % (track['artists'][0]['name'], track['title']), 'filesize': int_or_none(track.get('fileSize')), 'duration': float_or_none(track.get('durationMs'), 1000), - 'thumbnail': a_thumb, + 'thumbnail': thumbnail, } def _real_extract(self, url): From 9fb66c780cee8668b1bb07f70e70ae1161e13320 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 14 Oct 2015 17:25:07 +0100 Subject: [PATCH 0130/1286] [megavideoz] remove extractor --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/megavideoz.py | 56 ------------------------------ 2 files changed, 57 deletions(-) delete mode 100644 youtube_dl/extractor/megavideoz.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f6d185818..462717b1e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -319,7 +319,6 @@ from .macgamestore import MacGameStoreIE from .mailru import MailRuIE from .malemotion import MalemotionIE from .mdr import MDRIE -from .megavideoz import MegaVideozIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE diff --git a/youtube_dl/extractor/megavideoz.py b/youtube_dl/extractor/megavideoz.py deleted file mode 100644 index af7ff07ea..000000000 --- a/youtube_dl/extractor/megavideoz.py +++ /dev/null @@ -1,56 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - xpath_text, -) - - -class MegaVideozIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?megavideoz\.eu/video/(?P[^/]+)(?:/(?P[^/]+))?' 
- _TEST = { - 'url': 'http://megavideoz.eu/video/WM6UB919XMXH/SMPTE-Universal-Film-Leader', - 'info_dict': { - 'id': '48723', - 'display_id': 'SMPTE-Universal-Film-Leader', - 'ext': 'mp4', - 'title': 'SMPTE Universal Film Leader', - 'thumbnail': 're:https?://.*?\.jpg', - 'duration': 10.93, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - webpage = self._download_webpage(url, display_id) - - if any(p in webpage for p in ('>Video Not Found<', '>404 Error<')): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - config = self._download_xml( - self._search_regex( - r"var\s+cnf\s*=\s*'([^']+)'", webpage, 'cnf url'), - display_id) - - video_url = xpath_text(config, './file', 'video url', fatal=True) - title = xpath_text(config, './title', 'title', fatal=True) - thumbnail = xpath_text(config, './image', 'thumbnail') - duration = float_or_none(xpath_text(config, './duration', 'duration')) - video_id = xpath_text(config, './mediaid', 'video id') or video_id - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration - } From 1812afb7b396f4954d5d1ca1cec1c3f2d67550c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 22:35:01 +0600 Subject: [PATCH 0131/1286] [utils] Do not fail in int_or_none on non-numeric data (Closes #7175) --- youtube_dl/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1dc3153fd..86c693358 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1371,7 +1371,12 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): v = getattr(v, get_attr, None) if v == '': v = None - return default if v is None else (int(v) * invscale // scale) + if v is None: + return default + try: + return int(v) * invscale // scale + except ValueError: + pass def str_or_none(v, default=None): From caf80631f0c57b29187e2aa909fa1a3a6325d6e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 22:36:37 +0600 Subject: [PATCH 0132/1286] [utils] Do not fail in float_or_none on non-numeric data --- youtube_dl/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 86c693358..83b44caaa 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1392,7 +1392,12 @@ def str_to_int(int_str): def float_or_none(v, scale=1, invscale=1, default=None): - return default if v is None else (float(v) * invscale / scale) + if v is None: + return default + try: + return float(v) * invscale / scale + except ValueError: + return default def parse_duration(s): From af98f8ff37b3a0d9d1f743f4fc6c646333501eb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 22:37:03 +0600 Subject: [PATCH 0133/1286] [utils] Return default on fail in int_or_none --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 83b44caaa..7dbe25661 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1376,7 +1376,7 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): try: return int(v) * invscale // scale except ValueError: - pass + return default def str_or_none(v, default=None): From 1db82381e38181aafbd78c65c58f005ad84cc08a Mon Sep 17 00:00:00 2001 
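The three utils changes above make int_or_none and float_or_none return their default on non-numeric input instead of raising ValueError. A quick standalone illustration of the patched behaviour (a sketch only, assuming the patched youtube_dl.utils from the diffs above is importable):

    from youtube_dl.utils import int_or_none, float_or_none

    assert int_or_none('1234') == 1234
    assert int_or_none('n/a') is None                 # previously raised ValueError
    assert int_or_none('n/a', default=0) == 0
    assert float_or_none('12.5', invscale=1000) == 12500.0
    assert float_or_none('abc', default=1.0) == 1.0   # previously raised ValueError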
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 15 Oct 2015 01:52:25 +0600 Subject: [PATCH 0134/1286] [channel9] Add low quality formats and modernize --- youtube_dl/extractor/channel9.py | 35 ++++++++++++-------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 1ce004932..3a88181d8 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -3,7 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + parse_filesize, + qualities, +) class Channel9IE(InfoExtractor): @@ -52,23 +56,6 @@ class Channel9IE(InfoExtractor): _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - # Sorted by quality - _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] - - def _restore_bytes(self, formatted_size): - if not formatted_size: - return 0 - m = re.match(r'^(?P\d+(?:\.\d+)?)\s+(?P[a-zA-Z]+)', formatted_size) - if not m: - return 0 - units = m.group('units') - try: - exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper()) - except ValueError: - return 0 - size = float(m.group('size')) - return int(size * (1024 ** exponent)) - def _formats_from_html(self, html): FORMAT_REGEX = r''' (?x) @@ -78,16 +65,20 @@ class Channel9IE(InfoExtractor):

<h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
</div>
    )? # File size part may be missing ''' - # Extract known formats + quality = qualities(( + 'MP3', 'MP4', + 'Low Quality WMV', 'Low Quality MP4', + 'Mid Quality WMV', 'Mid Quality MP4', + 'High Quality WMV', 'High Quality MP4')) formats = [{ 'url': x.group('url'), 'format_id': x.group('quality'), 'format_note': x.group('note'), 'format': '%s (%s)' % (x.group('quality'), x.group('note')), - 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate - 'preference': self._known_formats.index(x.group('quality')), + 'filesize_approx': parse_filesize(x.group('filesize')), + 'quality': quality(x.group('quality')), 'vcodec': 'none' if x.group('note') == 'Audio only' else None, - } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] + } for x in list(re.finditer(FORMAT_REGEX, html))] self._sort_formats(formats) From a13d06de420f6968425d48030c37e1150ff9ed6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 15 Oct 2015 01:57:59 +0600 Subject: [PATCH 0135/1286] [channel9] Add test for low quality mp4 --- youtube_dl/extractor/channel9.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 3a88181d8..554399787 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -51,6 +51,21 @@ class Channel9IE(InfoExtractor): 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', 'authors': ['Mike Wilmot'], }, + }, + { + # low quality mp4 is best + 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'info_dict': { + 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'ext': 'mp4', + 'title': 'Ranges for the Standard Library', + 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', + 'duration': 5646, + 'thumbnail': 're:http://.*\.jpg', + }, + 'params': { + 'skip_download': True, + }, } ] From fafc7950e2230bf25ac7c7563f1704cf8f134f64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 15 Oct 2015 01:59:11 +0600 Subject: [PATCH 0136/1286] [channel9] Update tests' thumbnails --- youtube_dl/extractor/channel9.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 554399787..c74553dcf 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -32,7 +32,7 @@ class Channel9IE(InfoExtractor): 'title': 'Developer Kick-Off Session: Stuff We Love', 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 'duration': 4576, - 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', + 'thumbnail': 're:http://.*\.jpg', 'session_code': 'KOS002', 'session_day': 'Day 1', 'session_room': 'Arena 1A', @@ -48,7 +48,7 @@ class Channel9IE(InfoExtractor): 'title': 'Self-service BI with Power BI - nuclear testing', 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 'duration': 1540, - 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', + 'thumbnail': 're:http://.*\.jpg', 'authors': ['Mike Wilmot'], }, }, From 90bddb6cdd59107d137c13970dc50a6193d204a7 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 15 Oct 2015 14:28:56 +0100 Subject: [PATCH 0137/1286] [ooyala] extract more formats and metadata --- youtube_dl/extractor/ooyala.py | 151 ++++++++++++--------------------- 1 file changed, 53 
insertions(+), 98 deletions(-) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index a262a9f6d..592cdc564 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -1,108 +1,64 @@ from __future__ import unicode_literals import re -import json import base64 from .common import InfoExtractor from ..utils import ( - unescapeHTML, - ExtractorError, - determine_ext, int_or_none, + float_or_none, ) class OoyalaBaseIE(InfoExtractor): - def _extract_result(self, info, more_info): - embedCode = info['embedCode'] - video_url = info.get('ipad_url') or info['url'] - - if determine_ext(video_url) == 'm3u8': - formats = self._extract_m3u8_formats(video_url, embedCode, ext='mp4') - else: - formats = [{ - 'url': video_url, - 'ext': 'mp4', - }] - - return { - 'id': embedCode, - 'title': unescapeHTML(info['title']), - 'formats': formats, - 'description': unescapeHTML(more_info['description']), - 'thumbnail': more_info['promo'], + def _extract(self, player_url, video_id): + print(player_url) + content_tree = self._download_json(player_url, video_id)['content_tree'] + metadata = content_tree[list(content_tree)[0]] + embed_code = metadata['embed_code'] + pcode = metadata.get('asset_pcode') or embed_code + video_info = { + 'id': embed_code, + 'title': metadata['title'], + 'description': metadata.get('description'), + 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), + 'duration': int_or_none(metadata.get('duration')), } - def _extract(self, player_url, video_id): - player = self._download_webpage(player_url, video_id) - mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', - player, 'mobile player url') - # Looks like some videos are only available for particular devices - # (e.g. http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0 - # is only available for ipad) - # Working around with fetching URLs for all the devices found starting with 'unknown' - # until we succeed or eventually fail for each device. 
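The removed lines around here implement the old device-probing workaround described in the comment above; the replacement added by this patch instead reads the content_tree for the embed code and asks the SAS authorization endpoint for streams whose URLs come back base64-encoded. Roughly, as a plain-Python sketch (the endpoints and field names are the ones visible in this diff; this is an illustration, not youtube-dl's own helper code):

    import base64
    import json
    from urllib.request import urlopen

    def ooyala_stream_urls(embed_code, supported_format='mp4'):
        tree_url = ('http://player.ooyala.com/player_api/v1/content_tree/'
                    'embed_code/%s/%s' % (embed_code, embed_code))
        content_tree = json.loads(urlopen(tree_url).read().decode('utf-8'))['content_tree']
        metadata = content_tree[list(content_tree)[0]]
        pcode = metadata.get('asset_pcode') or embed_code

        auth_url = ('http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/'
                    '%s/%s?domain=www.example.org&supportedFormats=%s'
                    % (pcode, embed_code, supported_format))
        auth_data = json.loads(urlopen(auth_url).read().decode('utf-8'))

        # every authorized stream carries a base64-encoded URL in url.data
        return [
            base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8')
            for stream in auth_data['authorization_data'][embed_code]['streams']
        ]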
- devices = re.findall(r'device\s*=\s*"([^"]+)";', player) - devices.remove('unknown') - devices.insert(0, 'unknown') - for device in devices: - mobile_player = self._download_webpage( - '%s&device=%s' % (mobile_url, device), video_id, - 'Downloading mobile player JS for %s device' % device) - videos_info = self._search_regex( - r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', - mobile_player, 'info', fatal=False, default=None) - if videos_info: - break - - if not videos_info: - formats = [] + formats = [] + for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'): auth_data = self._download_json( - 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (video_id, video_id), - video_id) + 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=%s' % (pcode, embed_code, supported_format), + video_id, 'Downloading %s JSON' % supported_format) - cur_auth_data = auth_data['authorization_data'][video_id] + cur_auth_data = auth_data['authorization_data'][embed_code] for stream in cur_auth_data['streams']: - formats.append({ - 'url': base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8'), - 'ext': stream.get('delivery_type'), - 'format': stream.get('video_codec'), - 'format_id': stream.get('profile'), - 'width': int_or_none(stream.get('width')), - 'height': int_or_none(stream.get('height')), - 'abr': int_or_none(stream.get('audio_bitrate')), - 'vbr': int_or_none(stream.get('video_bitrate')), - }) - if formats: - return { - 'id': video_id, - 'formats': formats, - 'title': 'Ooyala video', - } + url = base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8') + delivery_type = stream['delivery_type'] + if delivery_type == 'remote_asset': + video_info['url'] = url + return video_info + if delivery_type == 'hls': + formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', 0, m3u8_id='hls', fatal=False)) + elif delivery_type == 'hds': + formats.extend(self._extract_f4m_formats(url, embed_code, f4m_id='hds', fatal=False)) + else: + formats.append({ + 'url': url, + 'ext': stream.get('delivery_type'), + 'vcodec': stream.get('video_codec'), + 'format_id': stream.get('profile'), + 'width': int_or_none(stream.get('width')), + 'height': int_or_none(stream.get('height')), + 'abr': int_or_none(stream.get('audio_bitrate')), + 'vbr': int_or_none(stream.get('video_bitrate')), + 'fps': float_or_none(stream.get('framerate')), + }) + self._sort_formats(formats) - if not cur_auth_data['authorized']: - raise ExtractorError(cur_auth_data['message'], expected=True) - - if not videos_info: - raise ExtractorError('Unable to extract info') - videos_info = videos_info.replace('\\"', '"') - videos_more_info = self._search_regex( - r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"', '"') - videos_info = json.loads(videos_info) - videos_more_info = json.loads(videos_more_info) - - if videos_more_info.get('lineup'): - videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] - return { - '_type': 'playlist', - 'id': video_id, - 'title': unescapeHTML(videos_more_info['title']), - 'entries': videos, - } - else: - return self._extract_result(videos_info[0], videos_more_info) + video_info['formats'] = formats + return video_info class OoyalaIE(OoyalaBaseIE): @@ -117,6 +73,7 @@ class OoyalaIE(OoyalaBaseIE): 'ext': 'mp4', 'title': 'Explaining Data 
Recovery from Hard Drives and SSDs', 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', + 'duration': 853386, }, }, { # Only available for ipad @@ -125,7 +82,7 @@ class OoyalaIE(OoyalaBaseIE): 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', 'ext': 'mp4', 'title': 'Simulation Overview - Levels of Simulation', - 'description': '', + 'duration': 194948, }, }, { @@ -136,7 +93,8 @@ class OoyalaIE(OoyalaBaseIE): 'info_dict': { 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx', 'ext': 'mp4', - 'title': 'Ooyala video', + 'title': 'Divide Tool Path.mp4', + 'duration': 204405, } } ] @@ -152,8 +110,8 @@ class OoyalaIE(OoyalaBaseIE): def _real_extract(self, url): embed_code = self._match_id(url) - player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code - return self._extract(player_url, embed_code) + content_tree_url = 'http://player.ooyala.com/player_api/v1/content_tree/embed_code/%s/%s' % (embed_code, embed_code) + return self._extract(content_tree_url, embed_code) class OoyalaExternalIE(OoyalaBaseIE): @@ -170,7 +128,7 @@ class OoyalaExternalIE(OoyalaBaseIE): .*?&pcode= ) (?P.+?) - (&|$) + (?:&|$) ''' _TEST = { @@ -179,7 +137,7 @@ class OoyalaExternalIE(OoyalaBaseIE): 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', 'ext': 'mp4', 'title': 'dm_140128_30for30Shorts___JudgingJewellv2', - 'description': '', + 'duration': 1302000, }, 'params': { # m3u8 download @@ -188,9 +146,6 @@ class OoyalaExternalIE(OoyalaBaseIE): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - partner_id = mobj.group('partner_id') - video_id = mobj.group('id') - pcode = mobj.group('pcode') - player_url = 'http://player.ooyala.com/player.js?externalId=%s:%s&pcode=%s' % (partner_id, video_id, pcode) - return self._extract(player_url, video_id) + partner_id, video_id, pcode = re.match(self._VALID_URL, url).groups() + content_tree_url = 'http://player.ooyala.com/player_api/v1/content_tree/external_id/%s/%s:%s' % (pcode, partner_id, video_id) + return self._extract(content_tree_url, video_id) From 497ca088a60fdd0a98f16e22a9d4fec135a26ab0 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 15 Oct 2015 14:37:05 +0100 Subject: [PATCH 0138/1286] [ooyala] remove print statment --- youtube_dl/extractor/ooyala.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 592cdc564..df99a39f4 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -12,7 +12,6 @@ from ..utils import ( class OoyalaBaseIE(InfoExtractor): def _extract(self, player_url, video_id): - print(player_url) content_tree = self._download_json(player_url, video_id)['content_tree'] metadata = content_tree[list(content_tree)[0]] embed_code = metadata['embed_code'] From 77302fe5c989b9cafcb675c0a03642b80fa557ff Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 15 Oct 2015 23:27:46 +0100 Subject: [PATCH 0139/1286] [bliptv] remove extractor and add support for site replacement(makertv) --- youtube_dl/extractor/__init__.py | 3 +- youtube_dl/extractor/bliptv.py | 292 --------------------------- youtube_dl/extractor/cinemassacre.py | 18 +- youtube_dl/extractor/generic.py | 6 - youtube_dl/extractor/jwplatform.py | 67 ++++++ youtube_dl/extractor/makertv.py | 27 +++ 6 files changed, 103 insertions(+), 310 deletions(-) delete mode 100644 youtube_dl/extractor/bliptv.py create mode 100644 youtube_dl/extractor/jwplatform.py create mode 100644 youtube_dl/extractor/makertv.py diff --git 
a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 462717b1e..f9c40e6cd 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -54,7 +54,6 @@ from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE from .blinkx import BlinkxIE -from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE from .bpb import BpbIE from .br import BRIE @@ -263,6 +262,7 @@ from .jadorecettepub import JadoreCettePubIE from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .jukebox import JukeboxIE +from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE from .kanalplay import KanalPlayIE @@ -317,6 +317,7 @@ from .lynda import ( from .m6 import M6IE from .macgamestore import MacGameStoreIE from .mailru import MailRuIE +from .makertv import MakerTVIE from .malemotion import MalemotionIE from .mdr import MDRIE from .metacafe import MetacafeIE diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py deleted file mode 100644 index c3296283d..000000000 --- a/youtube_dl/extractor/bliptv.py +++ /dev/null @@ -1,292 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - -from ..compat import ( - compat_urllib_request, - compat_urlparse, -) -from ..utils import ( - clean_html, - int_or_none, - parse_iso8601, - unescapeHTML, - xpath_text, - xpath_with_ns, -) - - -class BlipTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P\d+)|((?:play/|api\.swf#)(?P[\da-zA-Z+_]+)))' - - _TESTS = [ - { - 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', - 'md5': '80baf1ec5c3d2019037c1c707d676b9f', - 'info_dict': { - 'id': '5779306', - 'ext': 'm4v', - 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', - 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', - 'timestamp': 1323138843, - 'upload_date': '20111206', - 'uploader': 'cbr', - 'uploader_id': '679425', - 'duration': 81, - } - }, - { - # https://github.com/rg3/youtube-dl/pull/2274 - 'note': 'Video with subtitles', - 'url': 'http://blip.tv/play/h6Uag5OEVgI.html', - 'md5': '309f9d25b820b086ca163ffac8031806', - 'info_dict': { - 'id': '6586561', - 'ext': 'mp4', - 'title': 'Red vs. Blue Season 11 Episode 1', - 'description': 'One-Zero-One', - 'timestamp': 1371261608, - 'upload_date': '20130615', - 'uploader': 'redvsblue', - 'uploader_id': '792887', - 'duration': 279, - } - }, - { - # https://bugzilla.redhat.com/show_bug.cgi?id=967465 - 'url': 'http://a.blip.tv/api.swf#h6Uag5KbVwI', - 'md5': '314e87b1ebe7a48fcbfdd51b791ce5a6', - 'info_dict': { - 'id': '6573122', - 'ext': 'mov', - 'upload_date': '20130520', - 'description': 'Two hapless space marines argue over what to do when they realize they have an astronomically huge problem on their hands.', - 'title': 'Red vs. 
Blue Season 11 Trailer', - 'timestamp': 1369029609, - 'uploader': 'redvsblue', - 'uploader_id': '792887', - } - }, - { - 'url': 'http://blip.tv/play/gbk766dkj4Yn', - 'md5': 'fe0a33f022d49399a241e84a8ea8b8e3', - 'info_dict': { - 'id': '1749452', - 'ext': 'mp4', - 'upload_date': '20090208', - 'description': 'Witness the first appearance of the Nostalgia Critic character, as Doug reviews the movie Transformers.', - 'title': 'Nostalgia Critic: Transformers', - 'timestamp': 1234068723, - 'uploader': 'NostalgiaCritic', - 'uploader_id': '246467', - } - }, - { - # https://github.com/rg3/youtube-dl/pull/4404 - 'note': 'Audio only', - 'url': 'http://blip.tv/hilarios-productions/weekly-manga-recap-kingdom-7119982', - 'md5': '76c0a56f24e769ceaab21fbb6416a351', - 'info_dict': { - 'id': '7103299', - 'ext': 'flv', - 'title': 'Weekly Manga Recap: Kingdom', - 'description': 'And then Shin breaks the enemy line, and he's all like HWAH! And then he slices a guy and it's all like FWASHING! And... it's really hard to describe the best parts of this series without breaking down into sound effects, okay?', - 'timestamp': 1417660321, - 'upload_date': '20141204', - 'uploader': 'The Rollo T', - 'uploader_id': '407429', - 'duration': 7251, - 'vcodec': 'none', - } - }, - { - # missing duration - 'url': 'http://blip.tv/rss/flash/6700880', - 'info_dict': { - 'id': '6684191', - 'ext': 'm4v', - 'title': 'Cowboy Bebop: Gateway Shuffle Review', - 'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8', - 'timestamp': 1386639757, - 'upload_date': '20131210', - 'uploader': 'sfdebris', - 'uploader_id': '706520', - } - } - ] - - @staticmethod - def _extract_url(webpage): - mobj = re.search(r']*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) - if mobj: - return 'http://blip.tv/a/a-' + mobj.group(1) - mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage) - if mobj: - return mobj.group(1) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - lookup_id = mobj.group('lookup_id') - - # See https://github.com/rg3/youtube-dl/issues/857 and - # https://github.com/rg3/youtube-dl/issues/4197 - if lookup_id: - urlh = self._request_webpage( - 'http://blip.tv/play/%s' % lookup_id, lookup_id, 'Resolving lookup id') - url = compat_urlparse.urlparse(urlh.geturl()) - qs = compat_urlparse.parse_qs(url.query) - mobj = re.match(self._VALID_URL, qs['file'][0]) - - video_id = mobj.group('id') - - rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS') - - def _x(p): - return xpath_with_ns(p, { - 'blip': 'http://blip.tv/dtd/blip/1.0', - 'media': 'http://search.yahoo.com/mrss/', - 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', - }) - - item = rss.find('channel/item') - - video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id - title = xpath_text(item, 'title', 'title', fatal=True) - description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description')) - timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp')) - uploader = xpath_text(item, _x('blip:user'), 'uploader') - uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id') - duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration')) - media_thumbnail = item.find(_x('media:thumbnail')) - thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None - else xpath_text(item, 'image', 'thumbnail')) - categories = [category.text for category in item.findall('category') 
if category is not None] - - formats = [] - subtitles_urls = {} - - media_group = item.find(_x('media:group')) - for media_content in media_group.findall(_x('media:content')): - url = media_content.get('url') - role = media_content.get(_x('blip:role')) - msg = self._download_webpage( - url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url', - video_id, 'Resolving URL for %s' % role) - real_url = compat_urlparse.parse_qs(msg.strip())['message'][0] - - media_type = media_content.get('type') - if media_type == 'text/srt' or url.endswith('.srt'): - LANGS = { - 'english': 'en', - } - lang = role.rpartition('-')[-1].strip().lower() - langcode = LANGS.get(lang, lang) - subtitles_urls[langcode] = url - elif media_type.startswith('video/'): - formats.append({ - 'url': real_url, - 'format_id': role, - 'format_note': media_type, - 'vcodec': media_content.get(_x('blip:vcodec')) or 'none', - 'acodec': media_content.get(_x('blip:acodec')), - 'filesize': media_content.get('filesize'), - 'width': int_or_none(media_content.get('width')), - 'height': int_or_none(media_content.get('height')), - }) - self._check_formats(formats, video_id) - self._sort_formats(formats) - - subtitles = self.extract_subtitles(video_id, subtitles_urls) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'duration': duration, - 'thumbnail': thumbnail, - 'categories': categories, - 'formats': formats, - 'subtitles': subtitles, - } - - def _get_subtitles(self, video_id, subtitles_urls): - subtitles = {} - for lang, url in subtitles_urls.items(): - # For some weird reason, blip.tv serves a video instead of subtitles - # when we request with a common UA - req = compat_urllib_request.Request(url) - req.add_header('User-Agent', 'youtube-dl') - subtitles[lang] = [{ - # The extension is 'srt' but it's actually an 'ass' file - 'ext': 'ass', - 'data': self._download_webpage(req, None, note=False), - }] - return subtitles - - -class BlipTVUserIE(InfoExtractor): - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$' - _PAGE_SIZE = 12 - IE_NAME = 'blip.tv:user' - _TEST = { - 'url': 'http://blip.tv/actone', - 'info_dict': { - 'id': 'actone', - 'title': 'Act One: The Series', - }, - 'playlist_count': 5, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - username = mobj.group(1) - - page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' - - page = self._download_webpage(url, username, 'Downloading user page') - mobj = re.search(r'data-users-id="([^"]+)"', page) - page_base = page_base % mobj.group(1) - title = self._og_search_title(page) - - # Download video ids using BlipTV Ajax calls. Result size per - # query is limited (currently to 12 videos) so we need to query - # page by page until there are no video ids - it means we got - # all of them. - - video_ids = [] - pagenum = 1 - - while True: - url = page_base + "&page=" + str(pagenum) - page = self._download_webpage( - url, username, 'Downloading video ids from page %d' % pagenum) - - # Extract video identifiers - ids_in_page = [] - - for mobj in re.finditer(r'href="/([^"]+)"', page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(unescapeHTML(mobj.group(1))) - - video_ids.extend(ids_in_page) - - # A little optimization - if current page is not - # "full", ie. 
does not contain PAGE_SIZE video ids then - # we can assume that this page is the last one - there - # are no more ids on further pages - no need to query - # again. - - if len(ids_in_page) < self._PAGE_SIZE: - break - - pagenum += 1 - - urls = ['http://blip.tv/%s' % video_id for video_id in video_ids] - url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls] - return self.playlist_result( - url_entries, playlist_title=title, playlist_id=username) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index fd1770dac..6d9cd8abd 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ExtractorError -from .bliptv import BlipTVIE from .screenwavemedia import ScreenwaveMediaIE @@ -34,18 +33,17 @@ class CinemassacreIE(InfoExtractor): }, }, { - # blip.tv embedded video + # Youtube embedded video 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', - 'md5': 'ca9b3c8dd5a66f9375daeb5135f5a3de', + 'md5': 'df4cf8a1dcedaec79a73d96d83b99023', 'info_dict': { - 'id': '4065369', - 'ext': 'flv', + 'id': 'OEVzPCY2T-g', + 'ext': 'mp4', 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', 'upload_date': '20061207', - 'uploader': 'cinemassacre', - 'uploader_id': '250778', - 'timestamp': 1283233867, - 'description': 'md5:0a108c78d130676b207d0f6d029ecffd', + 'uploader': 'Cinemassacre', + 'uploader_id': 'JamesNintendoNerd', + 'description': 'md5:784734696c2b8b7f4b8625cc799e07f6', } }, { @@ -88,8 +86,6 @@ class CinemassacreIE(InfoExtractor): r']+src="(?P(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', ], webpage, 'player data URL', default=None, group='url') - if not playerdata_url: - playerdata_url = BlipTVIE._extract_url(webpage) if not playerdata_url: raise ExtractorError('Unable to find player data') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..285c0ff66 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -41,7 +41,6 @@ from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE -from .bliptv import BlipTVIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE @@ -1389,11 +1388,6 @@ class GenericIE(InfoExtractor): 'id': match.group('id') } - # Look for embedded blip.tv player - bliptv_url = BlipTVIE._extract_url(webpage) - if bliptv_url: - return self.url_result(bliptv_url, 'BlipTV') - # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py new file mode 100644 index 000000000..3a3dc439a --- /dev/null +++ b/youtube_dl/extractor/jwplatform.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class JWPlatformIE(InfoExtractor): + _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P[a-zA-Z0-9]{8})' + _TEST = { + 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', + 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', + 'info_dict': { + 'id': 'nPripu9l', + 'ext': 'mov', + 'title': 'Big Buck Bunny Trailer', + 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. 
It is made using free and open source software.', + 'upload_date': '20081127', + 'timestamp': 1227796140, + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+?src=["\'](?P(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8}', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) + video_data = json_data['playlist'][0] + subtitles = {} + for track in video_data['tracks']: + if track['kind'] == 'captions': + subtitles[track['label']] = [{'url': self._proto_relative_url(track['file'])}] + + formats = [] + for source in video_data['sources']: + source_url = self._proto_relative_url(source['file']) + source_type = source.get('type') or '' + if source_type == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats(source_url, video_id, 'mp4', 'm3u8_native', fatal=None)) + elif source_type.startswith('audio'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + }) + else: + formats.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + self._sort_formats(formats) + + return { + 'id': video_data['mediaid'], + 'title': video_data['title'], + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/makertv.py b/youtube_dl/extractor/makertv.py new file mode 100644 index 000000000..0256e4e24 --- /dev/null +++ b/youtube_dl/extractor/makertv.py @@ -0,0 +1,27 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MakerTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)?video|http://makerplayer.com/embed/maker)/(?P[a-zA-Z0-9]{12})' + _TEST = { + 'url': 'http://www.maker.tv/video/Fh3QgymL9gsc', + 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', + 'info_dict': { + 'id': 'brOEcGut', + 'ext': 'mp4', + 'title': 'Maze Runner: The Scorch Trials Official Movie Review', + 'description': 'md5:11ff3362d7ef1d679fdb649f6413975a', + 'upload_date': '20150918', + 'timestamp': 1442549540, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + jwplatform_id = self._search_regex([r'jwid="([^"]+)"', r'Maker.jw_id\s*=\s*"([^"]+)";'], webpage, 'jwplatform id') + + return self.url_result('jwplatform:%s' % jwplatform_id, 'JWPlatform') From 6744f36db710eebe2ccc633e7f4f6132b968b0ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 16 Oct 2015 08:44:19 +0600 Subject: [PATCH 0140/1286] [jeuxvideo] Fallback on og:title (Closes #7186, closes #7190) --- youtube_dl/extractor/jeuxvideo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 1df084d87..eef7daa29 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -28,7 +28,7 @@ class JeuxVideoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) title = mobj.group(1) webpage = self._download_webpage(url, title) - title = self._html_search_meta('name', webpage) + title = self._html_search_meta('name', webpage) or self._og_search_title(webpage) config_url = self._html_search_regex( 
r'data-src="(/contenu/medias/video.php.*?)"', webpage, 'config URL') From dd414c970bcc493358ff6a76f6544a0417125594 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 16 Oct 2015 10:12:42 +0100 Subject: [PATCH 0141/1286] [ooyala] fix sorting and format id --- youtube_dl/extractor/ooyala.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index df99a39f4..075b594ce 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -39,15 +39,15 @@ class OoyalaBaseIE(InfoExtractor): video_info['url'] = url return video_info if delivery_type == 'hls': - formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', 0, m3u8_id='hls', fatal=False)) + formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) elif delivery_type == 'hds': - formats.extend(self._extract_f4m_formats(url, embed_code, f4m_id='hds', fatal=False)) + formats.extend(self._extract_f4m_formats(url, embed_code, -1, 'hds', fatal=False)) else: formats.append({ 'url': url, 'ext': stream.get('delivery_type'), 'vcodec': stream.get('video_codec'), - 'format_id': stream.get('profile'), + 'format_id': '%s-%s-%sp' % (stream.get('profile'), delivery_type, stream.get('height')), 'width': int_or_none(stream.get('width')), 'height': int_or_none(stream.get('height')), 'abr': int_or_none(stream.get('audio_bitrate')), From cce9d15d0115e8b4cd1f6e2a327b5e9dbdf0ee54 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 16 Oct 2015 16:02:40 +0100 Subject: [PATCH 0142/1286] [ooyala] extract domain,handle errors and change related tests --- youtube_dl/extractor/byutv.py | 5 ++- youtube_dl/extractor/generic.py | 9 ++-- youtube_dl/extractor/groupon.py | 2 + youtube_dl/extractor/howcast.py | 1 + youtube_dl/extractor/ooyala.py | 60 ++++++++++++++----------- youtube_dl/extractor/teachingchannel.py | 1 + youtube_dl/extractor/vice.py | 1 + 7 files changed, 48 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 3b2de517e..ce25816f0 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -14,9 +14,10 @@ class BYUtvIE(InfoExtractor): 'info_dict': { 'id': 'studio-c-season-5-episode-5', 'ext': 'mp4', - 'description': 'md5:5438d33774b6bdc662f9485a340401cc', + 'description': 'md5:e07269172baff037f8e8bf9956bc9747', 'title': 'Season 5 Episode 5', - 'thumbnail': 're:^https?://.*\.jpg$' + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 1486486, }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..805677364 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -335,6 +335,7 @@ class GenericIE(InfoExtractor): 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', 'ext': 'mp4', 'title': '2cc213299525360.mov', # that's what we get + 'duration': 238231, }, 'add_ie': ['Ooyala'], }, @@ -346,6 +347,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': '"Steve Jobs: Man in the Machine" trailer', 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', + 'duration': 135427, }, 'params': { 'skip_download': True, @@ -943,8 +945,9 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', 'ext': 'mp4', - 'description': 'VIDEO: Index/Match versus VLOOKUP.', + 'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.', 'title': 'This is what 
separates the Excel masters from the wannabes', + 'duration': 191933, }, 'params': { # m3u8 downloads @@ -1454,7 +1457,7 @@ class GenericIE(InfoExtractor): re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P.{32})[\'"]', webpage)) if mobj is not None: - return OoyalaIE._build_url_result(mobj.group('ec')) + return OoyalaIE._build_url_result(smuggle_url(mobj.group('ec'), {'domain': url})) # Look for multiple Ooyala embeds on SBN network websites mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) @@ -1462,7 +1465,7 @@ class GenericIE(InfoExtractor): embeds = self._parse_json(mobj.group(1), video_id, fatal=False) if embeds: return _playlist_from_matches( - embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala') + embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') # Look for Aparat videos mobj = re.search(r'', webpage): + url = self._search_regex( + r'src=(["\'])(?P.+?partnerplayer.+?)\1', iframe, + 'player URL', default=None, group='url') + if url: + break + mobj = re.match(self._VALID_URL, url) player_id = mobj.group('player_id') From 686f98816ecbbcb224d1336682688b05cdb051a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 7 Nov 2015 00:39:16 +0600 Subject: [PATCH 0352/1286] [pbs] Add support for flp frontlines (Closes #7369) --- youtube_dl/extractor/pbs.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 7b868d057..3169e9c3f 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -8,6 +8,7 @@ from ..utils import ( ExtractorError, determine_ext, int_or_none, + strip_jsonp, unified_strdate, US_RATINGS, ) @@ -191,6 +192,23 @@ class PBSIE(InfoExtractor): if media_id: return media_id, presumptive_id, upload_date + # Fronline video embedded via flp + video_id = self._search_regex( + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid') + if video_id: + # pkg_id calculation is reverse engineered from + # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js + prg_id = self._search_regex( + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid')[7:] + if 'q' in prg_id: + prg_id = prg_id.split('q')[1] + prg_id = int(prg_id, 16) + getdir = self._download_json( + 'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id, + presumptive_id, 'Downloading getdir JSON', + transform_source=strip_jsonp) + return getdir['mid'], presumptive_id, upload_date + for iframe in re.findall(r'(?s)', webpage): url = self._search_regex( r'src=(["\'])(?P.+?partnerplayer.+?)\1', iframe, From 8b6d9406db1d3361b006016e6aace54b05cb6fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 7 Nov 2015 00:42:30 +0600 Subject: [PATCH 0353/1286] [pbs] Add test for flp frontline embeds --- youtube_dl/extractor/pbs.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 3169e9c3f..a690f9c29 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -154,6 +154,22 @@ class PBSIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, + { + # Frontline video embedded via flp2012.js + 'url': 'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists', + 'info_dict': { + 'id': '2070868960', + 'display_id': 'the-atomic-artists', + 'ext': 'mp4', + 'title': 
'FRONTLINE - The Atomic Artists', + 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e', + 'duration': 723, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, } ] _ERRORS = { From 21d0c33ecde573db961b97f5f0c37ba9d3c02ff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 7 Nov 2015 01:08:40 +0600 Subject: [PATCH 0354/1286] [pbs] Make flp embed lookup non fatal --- youtube_dl/extractor/pbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index a690f9c29..8fb9b1849 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -210,7 +210,7 @@ class PBSIE(InfoExtractor): # Fronline video embedded via flp video_id = self._search_regex( - r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid') + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None) if video_id: # pkg_id calculation is reverse engineered from # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js From ee223abb88263bdda2d92c4b2139d1dca60ba3ae Mon Sep 17 00:00:00 2001 From: Mister Hat Date: Tue, 3 Nov 2015 19:13:27 -0600 Subject: [PATCH 0355/1286] [vidzi] fixed. finds url from hash and host in script Closes #7386. --- youtube_dl/extractor/vidzi.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 08a5a7b8d..2ba9f31df 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -20,8 +20,14 @@ class VidziIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url') + video_host = self._html_search_regex( + r'id=\'vplayer\'>(.*?)', webpage, 'title') From 5d0f84d32cc038dd71673987cb6efaa85e953474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 7 Nov 2015 06:23:00 +0600 Subject: [PATCH 0356/1286] [beeg] Skip empty URLs (Closes #7392) --- youtube_dl/extractor/beeg.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index e6c928699..61bc2f744 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -33,6 +33,8 @@ class BeegIE(InfoExtractor): formats = [] for format_id, video_url in video.items(): + if not video_url: + continue height = self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None) if not height: From 5214f1e31d5e5ba692fb1ed4803ff71ef4e480e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 7 Nov 2015 19:25:59 +0600 Subject: [PATCH 0357/1286] [crunchyroll] Fix title extraction (Closes #7396) --- youtube_dl/extractor/crunchyroll.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 0c9b8ca02..4243f3e2e 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -287,7 +287,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if 'To view this, please log in to verify you are 18 or older.' 
in webpage: self.raise_login_required() - video_title = self._html_search_regex(r']*>(.+?)', webpage, 'video_title', flags=re.DOTALL) + video_title = self._html_search_regex( + r'(?s)]*>((?:(?!]+itemprop=["\']title["\'][^>]*>(?:(?!', + webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') if not video_description: From 2c740cf28d257d2a915195e7cc60f83e6690d2cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 7 Nov 2015 19:29:09 +0600 Subject: [PATCH 0358/1286] [crunchyroll] Simplify description extraction --- youtube_dl/extractor/crunchyroll.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4243f3e2e..9aa5d58b4 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -291,9 +291,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r'(?s)]*>((?:(?!]+itemprop=["\']title["\'][^>]*>(?:(?!', webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) - video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') - if not video_description: - video_description = None + video_description = self._html_search_regex( + r'"description":"([^"]+)', webpage, 'video_description', default=None) video_upload_date = self._html_search_regex( [r'
    <div>Availability for free users:(.+?)</div>', r'<div>[^<>]+\s*(.+?\d{4})\s*</div>
    '], webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) From 6d02b9a392d39c114d3fb58bf7965f62196ccecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 7 Nov 2015 20:02:39 +0600 Subject: [PATCH 0359/1286] [crunchyroll] Fix description extraction --- youtube_dl/extractor/crunchyroll.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 9aa5d58b4..6e5999c72 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -21,6 +21,7 @@ from ..utils import ( bytes_to_intlist, intlist_to_bytes, int_or_none, + lowercase_escape, remove_end, unified_strdate, urlencode_postdata, @@ -104,7 +105,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'id': '589804', 'ext': 'flv', 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', - 'description': 'md5:fe2743efedb49d279552926d0bd0cd9e', + 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'Danny Choo Network', 'upload_date': '20120213', @@ -292,7 +293,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex( - r'"description":"([^"]+)', webpage, 'video_description', default=None) + r']*>\s*.+?\[media_id=%s\].+?"description"\s*:\s*"([^"]+)' % video_id, + webpage, 'description', default=None) + if video_description: + video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_upload_date = self._html_search_regex( [r'
    <div>Availability for free users:(.+?)</div>', r'<div>[^<>]+\s*(.+?\d{4})\s*</div>
    '], webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) From 3793090b1b1c1e3462b80dd3045a3573545cfb29 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 7 Nov 2015 16:54:35 +0100 Subject: [PATCH 0360/1286] [amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors --- youtube_dl/extractor/amp.py | 84 ++++++++++++++++++++++++++++++ youtube_dl/extractor/dramafever.py | 65 ++++------------------- youtube_dl/extractor/foxnews.py | 64 ++++------------------- 3 files changed, 105 insertions(+), 108 deletions(-) create mode 100644 youtube_dl/extractor/amp.py diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py new file mode 100644 index 000000000..b573b9280 --- /dev/null +++ b/youtube_dl/extractor/amp.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class AMPIE(InfoExtractor): + def _get_media_node(self, item, name, default=None): + media_name = 'media-%s' % name + media_group = item.get('media-group') or item + return media_group.get(media_name) or item.get(media_name) or item.get(name, default) + + # parse Akamai Adaptive Media Player feed + def _extract_feed_info(self, url): + item = self._download_json( + url, None, + 'Downloading Akamai AMP feed', + 'Unable to download Akamai AMP feed' + )['channel']['item'] + + video_id = item['guid'] + + thumbnails = [] + media_thumbnail = self._get_media_node(item, 'thumbnail') + if media_thumbnail: + if isinstance(media_thumbnail, dict): + media_thumbnail = [media_thumbnail] + for thumbnail_data in media_thumbnail: + thumbnail = thumbnail_data['@attributes'] + thumbnails.append({ + 'url': self._proto_relative_url(thumbnail['url'], 'http:'), + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + subtitles = {} + media_subtitle = self._get_media_node(item, 'subTitle') + if media_subtitle: + if isinstance(media_subtitle, dict): + media_subtitle = [media_subtitle] + for subtitle_data in media_subtitle: + subtitle = subtitle_data['@attributes'] + lang = subtitle.get('lang') or 'en' + subtitles[lang] = [{'url': subtitle['href']}] + + formats = [] + media_content = self._get_media_node(item, 'content') + if isinstance(media_content, dict): + media_content = [media_content] + for media_data in media_content: + media = media_data['@attributes'] + media_type = media['type'] + if media_type == 'video/f4m': + f4m_formats = self._extract_f4m_formats(media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + elif media_type == 'application/x-mpegURL': + m3u8_formats = self._extract_m3u8_formats(media['url'], video_id, m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + else: + formats.append({ + 'format_id': media_data['media-category']['@attributes']['label'], + 'url': media['url'], + 'preference': 1, + 'vbr': int_or_none(media.get('bitrate')), + 'filesize': int_or_none(media.get('fileSize')), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._get_media_node(item, 'title'), + 'description': self._get_media_node(item, 'description'), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(item.get('pubDate'), ' '), + 'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/dramafever.py 
b/youtube_dl/extractor/dramafever.py index 38e6597c8..80a928827 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import itertools -from .common import InfoExtractor +from .amp import AMPIE from ..compat import ( compat_HTTPError, compat_urllib_parse, @@ -19,7 +19,7 @@ from ..utils import ( ) -class DramaFeverBaseIE(InfoExtractor): +class DramaFeverBaseIE(AMPIE): _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' @@ -80,60 +80,24 @@ class DramaFeverIE(DramaFeverBaseIE): 'timestamp': 1404336058, 'upload_date': '20140702', 'duration': 343, - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url).replace('/', '.') try: - feed = self._download_json( - 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id, - video_id, 'Downloading episode JSON')['channel']['item'] + info = self._extract_feed_info('http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): raise ExtractorError( 'Currently unavailable in your country.', expected=True) raise - media_group = feed.get('media-group', {}) - - formats = [] - for media_content in media_group['media-content']: - src = media_content.get('@attributes', {}).get('url') - if not src: - continue - ext = determine_ext(src) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src, video_id, f4m_id='hds')) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls')) - else: - formats.append({ - 'url': src, - }) - self._sort_formats(formats) - - title = media_group.get('media-title') - description = media_group.get('media-description') - duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration')) - thumbnail = self._proto_relative_url( - media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url')) - timestamp = parse_iso8601(feed.get('pubDate'), ' ') - - subtitles = {} - for media_subtitle in media_group.get('media-subTitle', []): - lang = media_subtitle.get('@attributes', {}).get('lang') - href = media_subtitle.get('@attributes', {}).get('href') - if not lang or not href: - continue - subtitles[lang] = [{ - 'ext': 'ttml', - 'url': href, - }] - series_id, episode_number = video_id.split('.') episode_info = self._download_json( # We only need a single episode info, so restricting page size to one episode @@ -146,21 +110,12 @@ class DramaFeverIE(DramaFeverBaseIE): if value: subfile = value[0].get('subfile') or value[0].get('new_subfile') if subfile and subfile != 'http://www.dramafever.com/st/': - subtitles.setdefault('English', []).append({ + info['subtitiles'].setdefault('English', []).append({ 'ext': 'srt', 'url': subfile, }) - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } + return info class DramaFeverSeriesIE(DramaFeverBaseIE): diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 3a4a59135..0cd0f9fa8 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -2,14 +2,14 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .amp import AMPIE from ..utils import ( parse_iso8601, int_or_none, ) -class 
FoxNewsIE(InfoExtractor): +class FoxNewsIE(AMPIE): IE_DESC = 'Fox News and Fox Business Video' _VALID_URL = r'https?://(?Pvideo\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ @@ -20,10 +20,10 @@ class FoxNewsIE(InfoExtractor): 'id': '3937480', 'ext': 'flv', 'title': 'Frozen in Time', - 'description': 'Doctors baffled by 16-year-old girl that is the size of a toddler', + 'description': '16-year-old girl is size of toddler', 'duration': 265, - 'timestamp': 1304411491, - 'upload_date': '20110503', + #'timestamp': 1304411491, + #'upload_date': '20110503', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -34,10 +34,10 @@ class FoxNewsIE(InfoExtractor): 'id': '3922535568001', 'ext': 'mp4', 'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal", - 'description': "Congressman discusses the president's executive action", + 'description': "Congressman discusses president's plan", 'duration': 292, - 'timestamp': 1417662047, - 'upload_date': '20141204', + #'timestamp': 1417662047, + #'upload_date': '20141204', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -56,48 +56,6 @@ class FoxNewsIE(InfoExtractor): video_id = mobj.group('id') host = mobj.group('host') - video = self._download_json( - 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id) - - item = video['channel']['item'] - title = item['title'] - description = item['description'] - timestamp = parse_iso8601(item['dc-date']) - - media_group = item['media-group'] - duration = None - formats = [] - for media in media_group['media-content']: - attributes = media['@attributes'] - video_url = attributes['url'] - if video_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id)) - elif video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'flv')) - elif not video_url.endswith('.smil'): - duration = int_or_none(attributes.get('duration')) - formats.append({ - 'url': video_url, - 'format_id': media['media-category']['@attributes']['label'], - 'preference': 1, - 'vbr': int_or_none(attributes.get('bitrate')), - 'filesize': int_or_none(attributes.get('fileSize')) - }) - self._sort_formats(formats) - - media_thumbnail = media_group['media-thumbnail']['@attributes'] - thumbnails = [{ - 'url': media_thumbnail['url'], - 'width': int_or_none(media_thumbnail.get('width')), - 'height': int_or_none(media_thumbnail.get('height')), - }] if media_thumbnail else [] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } + info = self._extract_feed_info('http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + info['id'] = video_id + return info From 63b728f06f00c2f1a45a67eddebd18bcdc36a753 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 7 Nov 2015 16:56:21 +0100 Subject: [PATCH 0361/1286] [bleacherreport] Add new Extractor --- youtube_dl/extractor/__init__.py | 4 + youtube_dl/extractor/bleacherreport.py | 121 +++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 youtube_dl/extractor/bleacherreport.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 94150a28f..4d65ece94 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -54,6 +54,10 @@ from .beatportpro import BeatportProIE from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE 
+from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py new file mode 100644 index 000000000..a55e696d2 --- /dev/null +++ b/youtube_dl/extractor/bleacherreport.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .amp import AMPIE +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) + + +class BleacherReportIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P\d+)' + _TESTS = [{ + 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', + 'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', + 'info_dict': { + 'id': '2496438', + 'ext': 'mp4', + 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', + 'uploader_id': 3992341, + 'description': 'CFB, ACC, Florida State', + 'timestamp': 1434380212, + 'upload_date': '20150615', + 'uploader': 'Team Stream Now ', + }, + 'add_ie': ['Ooyala'], + },{ + 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', + 'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', + 'info_dict': { + 'id': '2586817', + 'ext': 'mp4', + 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', + 'timestamp': 1446839961, + 'uploader': 'Sean Fay', + 'description': 'md5:e95afafa43619816552723878b3b0a84', + 'uploader_id': 6466954, + 'upload_date': '20151011', + }, + 'add_ie': ['Youtube'], + },{ + 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', + 'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', + 'info_dict': { + 'id': '2496438', + 'ext': 'mp4', + 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', + 'upload_date': '20150615', + 'uploader': 'Team Stream Now ', + 'timestamp': 1434380212, + 'description': 'CFB, ACC, Florida State', + 'uploader_id': 3992341, + }, + 'add_ie': ['Vine'], + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article'] + + thumbnails = [] + primary_photo = article_data.get('primaryPhoto') + if primary_photo: + thumbnails = [{ + 'url': primary_photo['url'], + 'width': primary_photo.get('width'), + 'height': primary_photo.get('height'), + }] + + info = { + '_type': 'url_transparent', + 'id': article_id, + 'title': article_data['title'], + 'uploader': article_data.get('author', {}).get('name'), + 'uploader_id': article_data.get('authorId'), + 'timestamp': parse_iso8601(article_data.get('createdAt')), + 'thumbnails': thumbnails, + 'comment_count': int_or_none(article_data.get('commentsCount')), + 'view_count': int_or_none(article_data.get('hitCount')), + } + + video = article_data.get('video') + if video: + video_type = video['type'] + if video_type == 'cms.bleacherreport.com': + info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] + elif video_type == 'ooyala.com': + info['url'] = 'ooyala:%s' % video['id'] + elif video_type == 'youtube.com': + info['url'] = video['id'] + elif video_type == 'vine.co': + info['url'] = 'https://vine.co/v/%s' % 
video['id'] + else: + info['url'] = video_type + video['id'] + return info + else: + raise ExtractorError('no video in the article', expected=True) + + +class BleacherReportCMSIE(AMPIE): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P[0-9a-f-]{36})' + _TESTS = [{ + 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'md5': 'f0ca220af012d4df857b54f792c586bb', + 'info_dict': { + 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'ext': 'flv', + 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', + 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._extract_feed_info('http://cms.bleacherreport.com/media/items/%s/akamai.json' % video_id) + info['id'] = video_id + return info From cff551c0b0ed8eb55c1ab63ec669c07a51aa4998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 7 Nov 2015 18:43:22 +0100 Subject: [PATCH 0362/1286] [googleplus] Fix extraction of formats --- youtube_dl/extractor/googleplus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index fcefe54cd..731bacd67 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -61,7 +61,7 @@ class GooglePlusIE(InfoExtractor): 'width': int(width), 'height': int(height), } for width, height, video_url in re.findall( - r'\d+,(\d+),(\d+),"(https?://redirector\.googlevideo\.com.*?)"', webpage)] + r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent.com.*?)"', webpage)] self._sort_formats(formats) return { From ee4337d100f68bbb2ae795101d4c391b522ec753 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 6 Nov 2015 20:16:14 +0100 Subject: [PATCH 0363/1286] [videolecture] add support for multi part videos --- youtube_dl/extractor/videolecturesnet.py | 95 +++++++++++++++++------- 1 file changed, 70 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index 649ac9433..351706362 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -10,17 +10,19 @@ from ..compat import ( from ..utils import ( ExtractorError, parse_duration, + js_to_json, + parse_iso8601, ) class VideoLecturesNetIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P[^/#?]+)/*(?:[#?].*)?$' + _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P[^/]+)(?:/video/(?P\d+))?' 
IE_NAME = 'videolectures.net' _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', 'info_dict': { - 'id': 'promogram_igor_mekjavic_eng', + 'id': '20171_part1', 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', @@ -32,7 +34,7 @@ class VideoLecturesNetIE(InfoExtractor): # video with invalid direct format links (HTTP 403) 'url': 'http://videolectures.net/russir2010_filippova_nlp/', 'info_dict': { - 'id': 'russir2010_filippova_nlp', + 'id': '14891_part1', 'ext': 'flv', 'title': 'NLP at Google', 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', @@ -46,37 +48,80 @@ class VideoLecturesNetIE(InfoExtractor): }, { 'url': 'http://videolectures.net/deeplearning2015_montreal/', 'info_dict': { - 'id': 'deeplearning2015_montreal', + 'id': '23181', 'title': 'Deep Learning Summer School, Montreal 2015', - 'description': 'md5:90121a40cc6926df1bf04dcd8563ed3b', + 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', + 'timestamp': 1438560000, }, 'playlist_count': 30, + }, { + # multi part lecture + 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', + 'info_dict': { + 'id': '9737', + 'title': 'Introduction To Bayesian Inference', + 'timestamp': 1251622800, + }, + 'playlist': [{ + 'info_dict': { + 'id': '9737_part1', + 'ext': 'wmv', + 'title': 'Introduction To Bayesian Inference', + }, + }, { + 'info_dict': { + 'id': '9737_part2', + 'ext': 'wmv', + 'title': 'Introduction To Bayesian Inference', + }, + }], + 'playlist_count': 2, }] def _real_extract(self, url): - video_id = self._match_id(url) + lecture_slug, part = re.match(self._VALID_URL, url).groups() - smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id + webpage = self._download_webpage(url, lecture_slug) - try: - smil = self._download_smil(smil_url, video_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - # Probably a playlist - webpage = self._download_webpage(url, video_id) - entries = [ - self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') - for _, video_url in re.findall(r']+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] - playlist_title = self._html_search_meta('title', webpage, 'title', fatal=True) - playlist_description = self._html_search_meta('description', webpage, 'description') - return self.playlist_result(entries, video_id, playlist_title, playlist_description) + cfg = self._parse_json(self._search_regex(r'cfg\s*:\s*({[^}]+})', webpage, 'cfg'), lecture_slug, js_to_json) - info = self._parse_smil(smil, smil_url, video_id) + lecture_id = str(cfg['obj_id']) - info['id'] = video_id + lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (self._proto_relative_url(cfg['livepipe'], 'http:'), lecture_id), lecture_id)['lecture'][0] - switch = smil.find('.//switch') - if switch is not None: - info['duration'] = parse_duration(switch.attrib.get('dur')) + lecture_info = { + 'id': lecture_id, + 'display_id': lecture_slug, + 'title': lecture_data['title'], + 'timestamp': parse_iso8601(lecture_data.get('time')), + 'description': lecture_data.get('description_wiki'), + 'thumbnail': lecture_data.get('thumb'), + } - return info + entries = [] + parts = cfg.get('videos') + if parts: + if len(parts) == 1: + part = str(parts[0]) + if part: + smil_url = 'http://videolectures.net/%s/video/%s/smil.xml' % (lecture_slug, part) + smil = self._download_smil(smil_url, lecture_id) + info = self._parse_smil(smil, smil_url, lecture_id) + info['id'] = 
'%s_part%s' % (lecture_id, part) + switch = smil.find('.//switch') + if switch is not None: + info['duration'] = parse_duration(switch.attrib.get('dur')) + return info + else: + for part in parts: + entries.append(self.url_result('http://videolectures.net/%s/video/%s' % (lecture_slug, part), 'VideoLecturesNet')) + lecture_info['_type'] = 'multi_video' + else: + # Probably a playlist + entries = [ + self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') + for _, video_url in re.findall(r']+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] + lecture_info['_type'] = 'playlist' + + lecture_info['entries'] = entries + return lecture_info From a06bf87a2c6009d82ec28afe566f653b3deb11bf Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 6 Nov 2015 21:23:41 +0100 Subject: [PATCH 0364/1286] [viidea] add support for sites using viidea service --- youtube_dl/extractor/__init__.py | 2 +- .../{videolecturesnet.py => viidea.py} | 33 ++++++++++++++----- 2 files changed, 26 insertions(+), 9 deletions(-) rename youtube_dl/extractor/{videolecturesnet.py => viidea.py} (77%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 94150a28f..0a90da73c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -724,7 +724,6 @@ from .vh1 import VH1IE from .vice import ViceIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE -from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videomega import VideoMegaIE from .videopremium import VideoPremiumIE @@ -734,6 +733,7 @@ from .vidme import VidmeIE from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewster import ViewsterIE +from .viidea import ViideaIE from .vimeo import ( VimeoIE, VimeoAlbumIE, diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/viidea.py similarity index 77% rename from youtube_dl/extractor/videolecturesnet.py rename to youtube_dl/extractor/viidea.py index 351706362..71fb298e6 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/viidea.py @@ -15,9 +15,23 @@ from ..utils import ( ) -class VideoLecturesNetIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P[^/]+)(?:/video/(?P\d+))?' 
- IE_NAME = 'videolectures.net' +class ViideaIE(InfoExtractor): + _VALID_URL = r'''(?x)http://(?:www\.)?(?: + videolectures\.net| + flexilearn\.viidea\.net| + presentations\.ocwconsortium\.org| + video\.travel-zoom\.si| + video\.pomp-forum\.si| + tv\.nil\.si| + video\.hekovnik.com| + video\.szko\.si| + kpk\.viidea\.com| + inside\.viidea\.net| + video\.kiberpipa\.org| + bvvideo\.si| + kongres\.viidea\.net| + edemokracija\.viidea\.com + )(?:/lecture)?/(?P[^/]+)(?:/video/(?P\d+))?''' _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', @@ -87,7 +101,9 @@ class VideoLecturesNetIE(InfoExtractor): lecture_id = str(cfg['obj_id']) - lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (self._proto_relative_url(cfg['livepipe'], 'http:'), lecture_id), lecture_id)['lecture'][0] + base_url = self._proto_relative_url(cfg['livepipe'], 'http:') + + lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), lecture_id)['lecture'][0] lecture_info = { 'id': lecture_id, @@ -104,7 +120,7 @@ class VideoLecturesNetIE(InfoExtractor): if len(parts) == 1: part = str(parts[0]) if part: - smil_url = 'http://videolectures.net/%s/video/%s/smil.xml' % (lecture_slug, part) + smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part) smil = self._download_smil(smil_url, lecture_id) info = self._parse_smil(smil, smil_url, lecture_id) info['id'] = '%s_part%s' % (lecture_id, part) @@ -114,13 +130,14 @@ class VideoLecturesNetIE(InfoExtractor): return info else: for part in parts: - entries.append(self.url_result('http://videolectures.net/%s/video/%s' % (lecture_slug, part), 'VideoLecturesNet')) + entries.append(self.url_result('%s/video/%s' % (base_url, lecture_id, part), 'Viidea')) lecture_info['_type'] = 'multi_video' else: # Probably a playlist + playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) entries = [ - self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') - for _, video_url in re.findall(r']+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] + self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea') + for _, video_url in re.findall(r']+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] lecture_info['_type'] = 'playlist' lecture_info['entries'] = entries From 8e3a2bd6200660f9fb9d485b1c924fa5462bd566 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 7 Nov 2015 17:43:23 +0100 Subject: [PATCH 0365/1286] [viidea] fix _VALID_URL regex and tests --- youtube_dl/extractor/viidea.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 71fb298e6..ae9a42737 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -31,7 +31,7 @@ class ViideaIE(InfoExtractor): bvvideo\.si| kongres\.viidea\.net| edemokracija\.viidea\.com - )(?:/lecture)?/(?P[^/]+)(?:/video/(?P\d+))?''' + )(?:/lecture)?/(?P[^/]+)(?:/video/(?P\d+))?/*(?:[#?].*)?$''' _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', @@ -130,7 +130,7 @@ class ViideaIE(InfoExtractor): return info else: for part in parts: - entries.append(self.url_result('%s/video/%s' % (base_url, lecture_id, part), 'Viidea')) + entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea')) lecture_info['_type'] = 'multi_video' else: # Probably a playlist From 6fdb39ded15c6276b49fa67cb517bf1fed63af35 Mon Sep 17 00:00:00 2001 From: 
remitamine Date: Sat, 7 Nov 2015 20:38:33 +0100 Subject: [PATCH 0366/1286] [viidia] Cleaup [viidea] extract playlist if lecture is an event [viidia] use compat_str --- youtube_dl/extractor/viidea.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index ae9a42737..2541a36ed 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -4,11 +4,10 @@ import re from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_urlparse, + compat_str, ) from ..utils import ( - ExtractorError, parse_duration, js_to_json, parse_iso8601, @@ -97,9 +96,9 @@ class ViideaIE(InfoExtractor): webpage = self._download_webpage(url, lecture_slug) - cfg = self._parse_json(self._search_regex(r'cfg\s*:\s*({[^}]+})', webpage, 'cfg'), lecture_slug, js_to_json) + cfg = self._parse_json(self._search_regex([r'cfg\s*:\s*({.+?}),[\da-zA-Z_]:\(?function', r'cfg\s*:\s*({[^}]+})'], webpage, 'cfg'), lecture_slug, js_to_json) - lecture_id = str(cfg['obj_id']) + lecture_id = compat_str(cfg['obj_id']) base_url = self._proto_relative_url(cfg['livepipe'], 'http:') @@ -118,7 +117,7 @@ class ViideaIE(InfoExtractor): parts = cfg.get('videos') if parts: if len(parts) == 1: - part = str(parts[0]) + part = compat_str(parts[0]) if part: smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part) smil = self._download_smil(smil_url, lecture_id) @@ -132,7 +131,7 @@ class ViideaIE(InfoExtractor): for part in parts: entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea')) lecture_info['_type'] = 'multi_video' - else: + if not parts or lecture_data.get('type') == 'evt': # Probably a playlist playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) entries = [ From e8ce2375e0851e65c4882002297404825fe1045e Mon Sep 17 00:00:00 2001 From: Sergey M? 
Date: Sun, 8 Nov 2015 06:54:27 +0600 Subject: [PATCH 0367/1286] [viidea] Improve and cleanup (Closes #7390) * Optimize requests for multipart videos * Fix cfg regex * Improve titles and identifiers --- youtube_dl/extractor/viidea.py | 99 ++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 2541a36ed..525e303d4 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -35,35 +35,42 @@ class ViideaIE(InfoExtractor): _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', 'info_dict': { - 'id': '20171_part1', + 'id': '20171', + 'display_id': 'promogram_igor_mekjavic_eng', 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'thumbnail': 're:http://.*\.jpg', + 'timestamp': 1372349289, 'upload_date': '20130627', 'duration': 565, - 'thumbnail': 're:http://.*\.jpg', }, }, { # video with invalid direct format links (HTTP 403) 'url': 'http://videolectures.net/russir2010_filippova_nlp/', 'info_dict': { - 'id': '14891_part1', + 'id': '14891', + 'display_id': 'russir2010_filippova_nlp', 'ext': 'flv', 'title': 'NLP at Google', 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', - 'duration': 5352, 'thumbnail': 're:http://.*\.jpg', + 'timestamp': 1284375600, + 'upload_date': '20100913', + 'duration': 5352, }, 'params': { # rtmp download 'skip_download': True, }, }, { + # event playlist 'url': 'http://videolectures.net/deeplearning2015_montreal/', 'info_dict': { 'id': '23181', 'title': 'Deep Learning Summer School, Montreal 2015', 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', + 'thumbnail': 're:http://.*\.jpg', 'timestamp': 1438560000, }, 'playlist_count': 30, @@ -72,37 +79,54 @@ class ViideaIE(InfoExtractor): 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', 'info_dict': { 'id': '9737', + 'display_id': 'mlss09uk_bishop_ibi', 'title': 'Introduction To Bayesian Inference', + 'thumbnail': 're:http://.*\.jpg', 'timestamp': 1251622800, }, 'playlist': [{ 'info_dict': { 'id': '9737_part1', + 'display_id': 'mlss09uk_bishop_ibi_part1', 'ext': 'wmv', - 'title': 'Introduction To Bayesian Inference', + 'title': 'Introduction To Bayesian Inference (Part 1)', + 'thumbnail': 're:http://.*\.jpg', + 'duration': 4622, + 'timestamp': 1251622800, + 'upload_date': '20090830', }, }, { 'info_dict': { 'id': '9737_part2', + 'display_id': 'mlss09uk_bishop_ibi_part2', 'ext': 'wmv', - 'title': 'Introduction To Bayesian Inference', + 'title': 'Introduction To Bayesian Inference (Part 2)', + 'thumbnail': 're:http://.*\.jpg', + 'duration': 5641, + 'timestamp': 1251622800, + 'upload_date': '20090830', }, }], 'playlist_count': 2, }] def _real_extract(self, url): - lecture_slug, part = re.match(self._VALID_URL, url).groups() + lecture_slug, explicit_part_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, lecture_slug) - cfg = self._parse_json(self._search_regex([r'cfg\s*:\s*({.+?}),[\da-zA-Z_]:\(?function', r'cfg\s*:\s*({[^}]+})'], webpage, 'cfg'), lecture_slug, js_to_json) + cfg = self._parse_json(self._search_regex( + [r'cfg\s*:\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*:\s*\(?\s*function', + r'cfg\s*:\s*({[^}]+})'], + webpage, 'cfg'), lecture_slug, js_to_json) lecture_id = compat_str(cfg['obj_id']) base_url = self._proto_relative_url(cfg['livepipe'], 'http:') - lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), 
lecture_id)['lecture'][0] + lecture_data = self._download_json( + '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), + lecture_id)['lecture'][0] lecture_info = { 'id': lecture_id, @@ -113,31 +137,52 @@ class ViideaIE(InfoExtractor): 'thumbnail': lecture_data.get('thumb'), } - entries = [] - parts = cfg.get('videos') + playlist_entries = [] + lecture_type = lecture_data.get('type') + parts = [compat_str(video) for video in cfg.get('videos', [])] if parts: - if len(parts) == 1: - part = compat_str(parts[0]) - if part: - smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part) + multipart = len(parts) > 1 + + def extract_part(part_id): + smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id) smil = self._download_smil(smil_url, lecture_id) info = self._parse_smil(smil, smil_url, lecture_id) - info['id'] = '%s_part%s' % (lecture_id, part) + info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id) + info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id) + if multipart: + info['title'] += ' (Part %s)' % part_id switch = smil.find('.//switch') if switch is not None: info['duration'] = parse_duration(switch.attrib.get('dur')) - return info + item_info = lecture_info.copy() + item_info.update(info) + return item_info + + if explicit_part_id or not multipart: + result = extract_part(explicit_part_id or parts[0]) else: - for part in parts: - entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea')) - lecture_info['_type'] = 'multi_video' - if not parts or lecture_data.get('type') == 'evt': - # Probably a playlist - playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) + result = { + '_type': 'multi_video', + 'entries': [extract_part(part) for part in parts], + } + result.update(lecture_info) + + # Immediately return explicitly requested part or non event item + if explicit_part_id or lecture_type != 'evt': + return result + + playlist_entries.append(result) + + # It's probably a playlist + if not parts or lecture_type == 'evt': + playlist_webpage = self._download_webpage( + '%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) entries = [ self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea') - for _, video_url in re.findall(r']+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] - lecture_info['_type'] = 'playlist' + for _, video_url in re.findall( + r']+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] + playlist_entries.extend(entries) - lecture_info['entries'] = entries - return lecture_info + playlist = self.playlist_result(playlist_entries, lecture_id) + playlist.update(lecture_info) + return playlist From d5c181a14e08198e400932d591b47683a630c8c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 8 Nov 2015 11:49:51 +0100 Subject: [PATCH 0368/1286] [movieclips] Fix extraction (fixes #7404) They use theplatform now. Changed the test, because the old one seems to be georestricted. 
--- youtube_dl/extractor/movieclips.py | 77 ++++++++---------------------- 1 file changed, 19 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py index 04e17d055..e06828b55 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/youtube_dl/extractor/movieclips.py @@ -1,80 +1,41 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( - compat_str, -) -from ..utils import ( - ExtractorError, - clean_html, + compat_urllib_request, ) class MovieClipsIE(InfoExtractor): - _VALID_URL = r'https?://movieclips\.com/(?P[\da-zA-Z]+)(?:-(?P[\da-z-]+))?' + _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/(?P[^/?#]+)' _TEST = { - 'url': 'http://movieclips.com/Wy7ZU-my-week-with-marilyn-movie-do-you-love-me/', + 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597?autoPlay=true&playlistId=5', 'info_dict': { - 'id': 'Wy7ZU', - 'display_id': 'my-week-with-marilyn-movie-do-you-love-me', + 'id': 'pKIGmG83AqD9', + 'display_id': 'warcraft-trailer-1-561180739597', 'ext': 'mp4', - 'title': 'My Week with Marilyn - Do You Love Me?', - 'description': 'md5:e86795bd332fe3cff461e7c8dc542acb', + 'title': 'Warcraft Trailer 1', + 'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.', 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - # rtmp download - 'skip_download': True, - } + 'add_ie': ['ThePlatform'], } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - show_id = display_id or video_id + display_id = self._match_id(url) - config = self._download_xml( - 'http://config.movieclips.com/player/config/%s' % video_id, - show_id, 'Downloading player config') - - if config.find('./country-region').text == 'false': - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, config.find('./region_alert').text), expected=True) - - properties = config.find('./video/properties') - smil_file = properties.attrib['smil_file'] - - smil = self._download_xml(smil_file, show_id, 'Downloading SMIL') - base_url = smil.find('./head/meta').attrib['base'] - - formats = [] - for video in smil.findall('./body/switch/video'): - vbr = int(video.attrib['system-bitrate']) / 1000 - src = video.attrib['src'] - formats.append({ - 'url': base_url, - 'play_path': src, - 'ext': src.split(':')[0], - 'vbr': vbr, - 'format_id': '%dk' % vbr, - }) - - self._sort_formats(formats) - - title = '%s - %s' % (properties.attrib['clip_movie_title'], properties.attrib['clip_title']) - description = clean_html(compat_str(properties.attrib['clip_description'])) - thumbnail = properties.attrib['image'] - categories = properties.attrib['clip_categories'].split(',') + req = compat_urllib_request.Request(url) + # it doesn't work if it thinks the browser it's too old + req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/43.0 (Chrome)') + webpage = self._download_webpage(req, display_id) + theplatform_link = self._html_search_regex(r'src="(http://player.theplatform.com/p/.*?)"', webpage, 'theplatform link') + title = self._html_search_regex(r']*>([^>]+)-\s*\d+\s*|\s*Movieclips.com', webpage, 'title') + description = self._html_search_meta('description', webpage) return { - 'id': video_id, - 'display_id': display_id, + '_type': 'url_transparent', + 'url': theplatform_link, 'title': title, + 'display_id': display_id, 
'description': description, - 'thumbnail': thumbnail, - 'categories': categories, - 'formats': formats, } From 937511dfc01c3d00c35a00f78c2b6f989b4d46e3 Mon Sep 17 00:00:00 2001 From: Frans de Jonge Date: Sat, 7 Nov 2015 22:55:02 +0100 Subject: [PATCH 0369/1286] Added support for the RTBF OUFtivi subpage --- youtube_dl/extractor/rtbf.py | 41 ++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 04a66df90..e75b45112 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -9,17 +9,36 @@ from ..utils import ( class RTBFIE(InfoExtractor): - _VALID_URL = r'https?://www.rtbf.be/video/[^\?]+\?id=(?P\d+)' - _TEST = { - 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', - 'info_dict': { - 'id': '1921274', - 'ext': 'mp4', - 'title': 'Les Diables au coeur (épisode 2)', - 'duration': 3099, - } - } + _VALID_URL = r'''(?x) + https?://www\.rtbf\.be/ + (?: + video/[^\?]+\?id=| + ouftivi/heros/[^&]+&videoId= + ) + (?P\d+) + ''' + _TESTS = [ + { + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '799f334ddf2c0a582ba80c44655be570', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'duration': 3099, + } + }, + { + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'md5': '25aea17e949e1e0c7c41270d60d25f22', + 'info_dict': { + 'id': '2057442', + 'ext': 'mp4', + 'title': 'Scooby-Doo, myst\xe8res associ\xe9s', + 'duration': 1279, + } + }, + ] _QUALITIES = [ ('mobile', 'mobile'), From fda2717ef9d429358d5816582590d15d18f9109f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 16:56:20 +0600 Subject: [PATCH 0370/1286] [movieclips] Add coding cookie --- youtube_dl/extractor/movieclips.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py index e06828b55..b8c43a163 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/youtube_dl/extractor/movieclips.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor From 114e6025b09e12bd01b5ce22bd2c43a3ef0ba460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 17:01:45 +0600 Subject: [PATCH 0371/1286] [rtbf] Expand _VALID_URL (Closes #7402) --- youtube_dl/extractor/rtbf.py | 48 ++++++++++++++---------------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index e75b45112..acf10e253 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -9,36 +9,24 @@ from ..utils import ( class RTBFIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?://www\.rtbf\.be/ - (?: - video/[^\?]+\?id=| - ouftivi/heros/[^&]+&videoId= - ) - (?P\d+) - ''' - _TESTS = [ - { - 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', - 'info_dict': { - 'id': '1921274', - 'ext': 'mp4', - 'title': 'Les Diables au coeur (épisode 2)', - 'duration': 3099, - } - }, - { - 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', - 'md5': '25aea17e949e1e0c7c41270d60d25f22', - 'info_dict': { - 'id': '2057442', - 'ext': 'mp4', - 'title': 'Scooby-Doo, 
myst\xe8res associ\xe9s', - 'duration': 1279, - } - }, - ] + _VALID_URL = r'https?://www\.rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P\d+)' + _TESTS = [{ + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '799f334ddf2c0a582ba80c44655be570', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'duration': 3099, + } + }, { + # geo restricted + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', + 'only_matching': True, + }] _QUALITIES = [ ('mobile', 'mobile'), From aa8d2d5be6a99542b85a85af3310fab1bf641e86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 17:03:21 +0600 Subject: [PATCH 0372/1286] [rtbf] Make www optional in _VALID_URL --- youtube_dl/extractor/rtbf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index acf10e253..e42b319a3 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -9,7 +9,7 @@ from ..utils import ( class RTBFIE(InfoExtractor): - _VALID_URL = r'https?://www\.rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P\d+)' _TESTS = [{ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', 'md5': '799f334ddf2c0a582ba80c44655be570', From 50506cb60798fe4d2ebb9603798b3fb1cb81e55f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 19:01:37 +0600 Subject: [PATCH 0373/1286] [extremetube] Fix extraction (Closes #7163) --- youtube_dl/extractor/extremetube.py | 45 +++++++++++++++++++---------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index c826a5404..3e11e3299 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -3,12 +3,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_request, -) +from ..compat import compat_urllib_request from ..utils import ( - qualities, + int_or_none, str_to_int, ) @@ -49,20 +46,36 @@ class ExtremeTubeIE(InfoExtractor): r'Views:\s*\s*([\d,\.]+)', webpage, 'view count', fatal=False)) - flash_vars = compat_parse_qs(self._search_regex( - r']+?name="flashvars"[^>]+?value="([^"]+)"', webpage, 'flash vars')) + flash_vars = self._parse_json( + self._search_regex( + r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flash vars'), + video_id) formats = [] - quality = qualities(['180p', '240p', '360p', '480p', '720p', '1080p']) - for k, vals in flash_vars.items(): - m = re.match(r'quality_(?P[0-9]+p)$', k) - if m is not None: - formats.append({ - 'format_id': m.group('quality'), - 'quality': quality(m.group('quality')), - 'url': vals[0], + for quality_key, video_url in flash_vars.items(): + height = int_or_none(self._search_regex( + r'quality_(\d+)[pP]$', quality_key, 'height', default=None)) + if not height: + continue + f = { + 'url': video_url, + } + mobj = re.search( + r'/(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+', video_url) + if mobj: + height = int(mobj.group('height')) + bitrate = int(mobj.group('bitrate')) + f.update({ + 'format_id': '%dp-%dk' % 
(height, bitrate), + 'height': height, + 'tbr': bitrate, }) - + else: + f.update({ + 'format_id': '%dp' % height, + 'height': height, + }) + formats.append(f) self._sort_formats(formats) return { From cc8034cc4c52fcbfaf9f8edf34d562c481860193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 19:14:39 +0600 Subject: [PATCH 0374/1286] [extremetube] Modernize --- youtube_dl/extractor/extremetube.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 3e11e3299..c5677c82b 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -11,12 +11,12 @@ from ..utils import ( class ExtremeTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pextremetube\.com/.*?video/.+?(?P[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P[^/#?&]+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'md5': '344d0c6d50e2f16b06e49ca011d8ac69', 'info_dict': { - 'id': '652431', + 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', 'uploader': 'unknown', @@ -26,12 +26,16 @@ class ExtremeTubeIE(InfoExtractor): }, { 'url': 'http://www.extremetube.com/gay/video/abcde-1234', 'only_matching': True, + }, { + 'url': 'http://www.extremetube.com/video/latina-slut-fucked-by-fat-black-dick', + 'only_matching': True, + }, { + 'url': 'http://www.extremetube.com/video/652431', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - url = 'http://www.' + mobj.group('url') + video_id = self._match_id(url) req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') From f09a767d3198823e5c0ac187a91284c8d2736eb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 19:19:13 +0600 Subject: [PATCH 0375/1286] [mit] Allow external embeds (Closes #7406) --- youtube_dl/extractor/mit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index f088ab9e2..29ca45778 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -86,7 +86,7 @@ class MITIE(TechTVMITIE): webpage = self._download_webpage(url, page_title) embed_url = self._search_regex( r'