From ccff2c404d7ea9f5b21ede8ae57bb79feec7eb94 Mon Sep 17 00:00:00 2001 From: Wang Jun Tham Date: Sun, 24 Apr 2016 00:08:02 +0800 Subject: [PATCH 001/387] [ffmpeg] Fix embedding subtitles (#9063) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changed command line parameters for ffmpeg when embedding subtitles. Changed to ‘-map 0:v -c:v copy -map 0:a -c:a copy’ --- youtube_dl/postprocessor/ffmpeg.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 1793a878c..ca2d401f8 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -363,8 +363,10 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): input_files = [filename] + sub_filenames opts = [ - '-map', '0', - '-c', 'copy', + '-map', '0:v', + '-c:v', 'copy', + '-map', '0:a', + '-c:a', 'copy', # Don't copy the existing subtitles, we may be running the # postprocessor a second time '-map', '-0:s', From ded7511a700cb6962f8a5922b1e3b4ef480a4c81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jun 2016 23:42:52 +0700 Subject: [PATCH 002/387] [bbccouk] Add support for playlists (Closes #9812) --- youtube_dl/extractor/bbc.py | 75 +++++++++++++++++++++++++++++- youtube_dl/extractor/extractors.py | 2 + 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 74c4510f9..de236fbde 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -31,7 +31,7 @@ class BBCCoUkIE(InfoExtractor): music/clips[/#]| radio/player/ ) - (?P%s) + (?P%s)(?!/(?:episodes|broadcasts|clips)) ''' % _ID_REGEX _MEDIASELECTOR_URLS = [ @@ -698,7 +698,9 @@ class BBCIE(BBCCoUkIE): @classmethod def suitable(cls, url): - return False if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url) + EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE) + return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) + else super(BBCIE, cls).suitable(url)) def _extract_from_media_meta(self, media_meta, video_id): # Direct links to media in media metadata (e.g. 
@@ -975,3 +977,72 @@ class BBCCoUkArticleIE(InfoExtractor): r']+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] return self.playlist_result(entries, playlist_id, title, description) + + +class BBCCoUkPlaylistBaseIE(InfoExtractor): + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)] + + title, description = self._extract_title_and_description(webpage) + + return self.playlist_result(entries, playlist_id, title, description) + + +class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:playlist' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/episodes/(?P%s)' % BBCCoUkIE._ID_REGEX + _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' + _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' + _TEST = { + 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', + 'info_dict': { + 'id': 'b05rcz9v', + 'title': 'The Disappearance', + 'description': 'French thriller serial about a missing teenager.', + }, + 'playlist_mincount': 6, + } + + def _extract_title_and_description(self, webpage): + title = self._search_regex(r'
<h1>([^<]+)</h1>
', webpage, 'title', fatal=False)
+        description = self._search_regex(
+            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>
', + webpage, 'description', fatal=False, group='value') + return title, description + + +class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:playlist' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX + _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s' + _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)' + _TESTS = [{ + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', + 'info_dict': { + 'id': 'b05rcz9v', + 'title': 'The Disappearance - Clips - BBC Four', + 'description': 'French thriller serial about a missing teenager.', + }, + 'playlist_mincount': 7, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player', + 'only_matching': True, + }] + + def _extract_title_and_description(self, webpage): + title = self._og_search_title(webpage, fatal=False) + description = self._og_search_description(webpage) + return title, description diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5fce9f47a..ed51dfdaa 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -71,6 +71,8 @@ from .bandcamp import BandcampIE, BandcampAlbumIE from .bbc import ( BBCCoUkIE, BBCCoUkArticleIE, + BBCCoUkIPlayerPlaylistIE, + BBCCoUkPlaylistIE, BBCIE, ) from .beeg import BeegIE From eb451890da79e686a218e42c9761df2a6f5b6ee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jun 2016 03:04:14 +0700 Subject: [PATCH 003/387] [carambatv] Add extractor (Closes #9815) --- youtube_dl/extractor/carambatv.py | 88 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 4 ++ 2 files changed, 92 insertions(+) create mode 100644 youtube_dl/extractor/carambatv.py diff --git a/youtube_dl/extractor/carambatv.py b/youtube_dl/extractor/carambatv.py new file mode 100644 index 000000000..5797fb951 --- /dev/null +++ b/youtube_dl/extractor/carambatv.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + try_get, +) + + +class CarambaTVIE(InfoExtractor): + _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P\d+)' + _TESTS = [{ + 'url': 'http://video1.carambatv.ru/v/191910501', + 'md5': '2f4a81b7cfd5ab866ee2d7270cb34a2a', + 'info_dict': { + 'id': '191910501', + 'ext': 'mp4', + 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 2678.31, + }, + }, { + 'url': 'carambatv:191910501', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://video1.carambatv.ru/v/%s/videoinfo.js' % video_id, + video_id) + + title = video['title'] + + base_url = video.get('video') or 'http://video1.carambatv.ru/v/%s/' % video_id + + formats = [{ + 'url': base_url + f['fn'], + 'height': int_or_none(f.get('height')), + 'format_id': '%sp' % f['height'] if f.get('height') else None, + } for f in video['qualities'] if f.get('fn')] + self._sort_formats(formats) + + thumbnail = video.get('splash') + duration = float_or_none(try_get( + video, lambda x: x['annotations'][0]['end_time'], compat_str)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': 
thumbnail, + 'duration': duration, + 'formats': formats, + } + + +class CarambaTVPageIE(InfoExtractor): + _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P[^/?#&]+)' + _TEST = { + 'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/', + 'md5': '', + 'info_dict': { + 'id': '191910501', + 'ext': 'mp4', + 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 2678.31, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._og_search_property('video:iframe', webpage, default=None) + + if not video_url: + video_id = self._search_regex( + r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)', + webpage, 'video id') + video_url = 'carambatv:%s' % video_id + + return self.url_result(video_url, CarambaTVIE.ie_key()) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ed51dfdaa..23320229b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -110,6 +110,10 @@ from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .canvas import CanvasIE +from .carambatv import ( + CarambaTVIE, + CarambaTVPageIE, +) from .cbc import ( CBCIE, CBCPlayerIE, From 6929569403aeade9aced5c4103db652e9c16bdca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jun 2016 04:06:19 +0700 Subject: [PATCH 004/387] [mitele] Extract series metadata and make title more robust (Closes #9758) --- youtube_dl/extractor/mitele.py | 49 +++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 3589c223d..5a00cd397 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,5 +1,8 @@ +# coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_urllib_parse_urlencode, @@ -8,6 +11,7 @@ from ..compat import ( from ..utils import ( get_element_by_attribute, int_or_none, + remove_start, ) @@ -15,7 +19,7 @@ class MiTeleIE(InfoExtractor): IE_DESC = 'mitele.es' _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P[^/]+)/' - _TEST = { + _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', # MD5 is unstable 'info_dict': { @@ -24,10 +28,31 @@ class MiTeleIE(InfoExtractor): 'ext': 'flv', 'title': 'Tor, la web invisible', 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', + 'series': 'Diario de', + 'season': 'La redacción', + 'episode': 'Programa 144', 'thumbnail': 're:(?i)^https?://.*\.jpg$', 'duration': 2913, }, - } + }, { + # no explicit title + 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/temporada-6/programa-226/', + 'info_dict': { + 'id': 'eLZSwoEd1S3pVyUm8lc6F', + 'display_id': 'programa-226', + 'ext': 'flv', + 'title': 'Cuarto Milenio - Temporada 6 - Programa 226', + 'description': 'md5:50daf9fadefa4e62d9fc866d0c015701', + 'series': 'Cuarto Milenio', + 'season': 'Temporada 6', + 'episode': 'Programa 226', + 'thumbnail': 're:(?i)^https?://.*\.jpg$', + 'duration': 7312, + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -70,7 +95,22 @@ class MiTeleIE(InfoExtractor): self._sort_formats(formats) title = self._search_regex( - r'class="Destacado-text"[^>]*>\s*([^<]+)', webpage, 'title') + 
r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>',
+            webpage, 'title', default=None)
+
+        mobj = re.search(r'''(?sx)
+                            class="Destacado-text"[^>]*>.*?<h1>
\s* + (?P[^<]+)\s* + (?P[^<]+)\s* + (?P[^<]+)''', webpage) + series, season, episode = mobj.groups() if mobj else [None] * 3 + + if not title: + if mobj: + title = '%s - %s - %s' % (series, season, episode) + else: + title = remove_start(self._search_regex( + r'([^<]+)', webpage, 'title'), 'Ver online ') video_id = self._search_regex( r'data-media-id\s*=\s*"([^"]+)"', webpage, @@ -83,6 +123,9 @@ class MiTeleIE(InfoExtractor): 'display_id': display_id, 'title': title, 'description': get_element_by_attribute('class', 'text', webpage), + 'series': series, + 'season': season, + 'episode': episode, 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, From f011876076a9fc4ee3fcb8b17f8bc2bcf5c8b8b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jun 2016 04:40:48 +0700 Subject: [PATCH 005/387] [nickde] Add extractor (Closes #9778) --- youtube_dl/extractor/extractors.py | 5 ++++- youtube_dl/extractor/nick.py | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 23320229b..efa58f70f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -518,7 +518,10 @@ from .nhl import ( NHLVideocenterCategoryIE, NHLIE, ) -from .nick import NickIE +from .nick import ( + NickIE, + NickDeIE, +) from .niconico import NiconicoIE, NiconicoPlaylistIE from .ninegag import NineGagIE from .noco import NocoIE diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index ce065f2b0..e96013791 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor from ..compat import compat_urllib_parse_urlencode +from ..utils import update_url_query class NickIE(MTVServicesInfoExtractor): @@ -61,3 +62,26 @@ class NickIE(MTVServicesInfoExtractor): def _extract_mgid(self, webpage): return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid') + + +class NickDeIE(MTVServicesInfoExtractor): + IE_NAME = 'nick.de' + _VALID_URL = r'https?://(?:www\.)?nick\.de/(?:playlist|shows)/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse', + 'only_matching': True, + }, { + 'url': 'http://www.nick.de/shows/342-icarly', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + mrss_url = update_url_query(self._search_regex( + r'data-mrss=(["\'])(?Phttp.+?)\1', webpage, 'mrss url', group='url'), + {'siteKey': 'nick.de'}) + + return self._get_videos_info_from_url(mrss_url, video_id) From 20a6a154fe8e6a5d246ad1326c4082ddd6091718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jun 2016 04:46:26 +0700 Subject: [PATCH 006/387] [mtv] Use compat_xpath and fix FutureWarning --- youtube_dl/extractor/mtv.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 640ee3d93..8a638a47c 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_parse_urlencode, compat_str, + compat_xpath, ) from ..utils import ( ExtractorError, @@ -139,9 +140,9 @@ class MTVServicesInfoExtractor(InfoExtractor): itemdoc, 
'.//{http://search.yahoo.com/mrss/}category', 'scheme', 'urn:mtvn:video_title') if title_el is None: - title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title') + title_el = itemdoc.find(compat_xpath('.//{http://search.yahoo.com/mrss/}title')) if title_el is None: - title_el = itemdoc.find('.//title') or itemdoc.find('./title') + title_el = itemdoc.find(compat_xpath('.//title')) if title_el.text is None: title_el = None From d0d93f76ea0dd1dae15bdba6059815d9cc467b05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jun 2016 05:30:46 +0700 Subject: [PATCH 007/387] [pornhd] Fix metadata extraction --- youtube_dl/extractor/pornhd.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 39b53ecf6..33faf5e58 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -39,9 +39,10 @@ class PornHdIE(InfoExtractor): [r']+class=["\']video-name["\'][^>]*>([^<]+)', r'(.+?) - .*?[Pp]ornHD.*?'], webpage, 'title') description = self._html_search_regex( - r'
<div class="description">([^<]+)</div>
', webpage, 'description', fatal=False) + r'<(div|p)[^>]+class="description"[^>]*>(?P[^<]+)', webpage, 'view count', fatal=False)) + r'(\d+) views\s*<', webpage, 'view count', fatal=False)) thumbnail = self._search_regex( r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) From e6fe993c318738fee5a4a2ce7a86c4512e42653a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jun 2016 05:37:53 +0700 Subject: [PATCH 008/387] [pornhd] Improve formats extraction --- youtube_dl/extractor/pornhd.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 33faf5e58..e7721b013 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( int_or_none, js_to_json, - qualities, ) @@ -46,18 +45,19 @@ class PornHdIE(InfoExtractor): thumbnail = self._search_regex( r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) - quality = qualities(['sd', 'hd']) sources = json.loads(js_to_json(self._search_regex( r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]", webpage, 'sources'))) formats = [] - for qname, video_url in sources.items(): + for format_id, video_url in sources.items(): if not video_url: continue + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) formats.append({ 'url': video_url, - 'format_id': qname, - 'quality': quality(qname), + 'format_id': format_id, + 'height': height, }) self._sort_formats(formats) From 667d96480b4f9c78ceace063415c3424d4d562ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jun 2016 05:42:20 +0700 Subject: [PATCH 009/387] [pornhd] Detect removed videos and modernize --- youtube_dl/extractor/pornhd.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index e7721b013..7a5f00fe0 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, js_to_json, ) @@ -37,17 +37,17 @@ class PornHdIE(InfoExtractor): title = self._html_search_regex( [r']+class=["\']video-name["\'][^>]*>([^<]+)', r'(.+?) - .*?[Pp]ornHD.*?'], webpage, 'title') - description = self._html_search_regex( - r'<(div|p)[^>]+class="description"[^>]*>(?P[^<]+)]+class="no-video"[^>]*>(?P.+?)]+class="description"[^>]*>(?P[^<]+) Date: Sat, 18 Jun 2016 05:50:17 +0700 Subject: [PATCH 010/387] [pornhd] Add working test --- youtube_dl/extractor/pornhd.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 7a5f00fe0..8df12eec0 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -12,7 +12,21 @@ from ..utils import ( class PornHdIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P\d+)(?:/(?P.+))?' 
- _TEST = { + _TESTS = [{ + 'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', + 'md5': 'c8b964b1f0a4b5f7f28ae3a5c9f86ad5', + 'info_dict': { + 'id': '9864', + 'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', + 'ext': 'mp4', + 'title': 'Restroom selfie masturbation', + 'description': 'md5:3748420395e03e31ac96857a8f125b2b', + 'thumbnail': 're:^https?://.*\.jpg', + 'view_count': int, + 'age_limit': 18, + } + }, { + # removed video 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'md5': '956b8ca569f7f4d8ec563e2c41598441', 'info_dict': { @@ -24,8 +38,9 @@ class PornHdIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg', 'view_count': int, 'age_limit': 18, - } - } + }, + 'skip': 'Not available anymore', + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 5fc2757682483b4b0277df2e2454dab882237882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jun 2016 06:00:05 +0700 Subject: [PATCH 011/387] release 2016.06.18 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 5 +++++ youtube_dl/version.py | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index eeac09d5d..4b281e649 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.16*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.16** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.18*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.16 +[debug] youtube-dl version 2016.06.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 13315f4f4..6a7a4bf2d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -74,6 +74,8 @@ - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - **bbc.co.uk:article**: BBC articles + - **bbc.co.uk:iplayer:playlist** + - **bbc.co.uk:playlist** - **BeatportPro** - **Beeg** - **BehindKink** @@ -104,6 +106,8 @@ - **canalc2.tv** - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv - **Canvas** + - **CarambaTV** + - **CarambaTVPage** - **CBC** - **CBCPlayer** - **CBS** @@ -432,6 +436,7 @@ - **nhl.com:videocenter** - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** + - **nick.de** - **niconico**: ニコニコ動画 - **NiconicoPlaylist** - **njoy**: N-JOY diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 52619cae8..92b7badc9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.16' +__version__ = '2016.06.18' From 0f47cc2e925014afef4339a8213d52797a710eb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jun 2016 06:20:34 +0700 Subject: [PATCH 012/387] release 2016.06.18.1 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 4b281e649..2736bb23b 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.18*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.18.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.18.1** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.18 +[debug] youtube-dl version 2016.06.18.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 6a7a4bf2d..c79798d86 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -44,8 +44,8 @@ - **appletrailers:section** - **archive.org**: archive.org videos - **ARD** - - **ARD:mediathek** - **ARD:mediathek**: Saarländischer Rundfunk + - **ARD:mediathek** - **arte.tv** - **arte.tv:+7** - **arte.tv:cinema** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 92b7badc9..f71bc8c2a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.18' +__version__ = '2016.06.18.1' From c878e635de1563ded54b2504104e94cd59c70b55 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 18 Jun 2016 12:17:24 +0800 Subject: [PATCH 013/387] [bet] Moved to MTVServices --- youtube_dl/extractor/bet.py | 96 +++++++++++++------------------------ 1 file changed, 34 insertions(+), 62 deletions(-) diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index 986245bf0..bd3ee2e2e 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -1,31 +1,27 @@ from __future__ import unicode_literals -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import ( - xpath_text, - xpath_with_ns, - int_or_none, - parse_iso8601, -) +from .mtv import MTVServicesInfoExtractor +from ..utils import unified_strdate +from ..compat import compat_urllib_parse_urlencode -class BetIE(InfoExtractor): +class BetIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P.+?)\.html' _TESTS = [ { 'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html', 'info_dict': { - 'id': 'news/national/2014/a-conversation-with-president-obama', + 'id': '07e96bd3-8850-3051-b856-271b457f0ab8', 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism', 'ext': 'flv', 'title': 'A Conversation With President Obama', - 'description': 'md5:699d0652a350cf3e491cd15cc745b5da', + 'description': 'President Obama urges persistence in confronting racism and bias.', 'duration': 1534, - 'timestamp': 1418075340, 'upload_date': '20141208', - 'uploader': 'admin', 'thumbnail': 're:(?i)^https?://.*\.jpg$', + 'subtitles': { + 'en': 'mincount:2', + } }, 'params': { # rtmp download @@ -35,16 +31,17 @@ class BetIE(InfoExtractor): { 'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html', 'info_dict': { - 'id': 'news/national/2014/justice-for-ferguson-a-community-reacts', + 'id': '9f516bf1-7543-39c4-8076-dd441b459ba9', 'display_id': 'justice-for-ferguson-a-community-reacts', 'ext': 'flv', 'title': 'Justice for 
Ferguson: A Community Reacts', 'description': 'A BET News special.', 'duration': 1696, - 'timestamp': 1416942360, 'upload_date': '20141125', - 'uploader': 'admin', 'thumbnail': 're:(?i)^https?://.*\.jpg$', + 'subtitles': { + 'en': 'mincount:2', + } }, 'params': { # rtmp download @@ -53,57 +50,32 @@ class BetIE(InfoExtractor): } ] + _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player" + + def _get_feed_query(self, uri): + return compat_urllib_parse_urlencode({ + 'uuid': uri, + }) + + def _extract_mgid(self, webpage): + return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid') + def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + mgid = self._extract_mgid(webpage) + videos_info = self._get_videos_info(mgid) - media_url = compat_urllib_parse_unquote(self._search_regex( - [r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"], - webpage, 'media URL')) + info_dict = videos_info['entries'][0] - video_id = self._search_regex( - r'/video/(.*)/_jcr_content/', media_url, 'video id') + upload_date = unified_strdate(self._html_search_meta('date', webpage)) + description = self._html_search_meta('description', webpage) - mrss = self._download_xml(media_url, display_id) - - item = mrss.find('./channel/item') - - NS_MAP = { - 'dc': 'http://purl.org/dc/elements/1.1/', - 'media': 'http://search.yahoo.com/mrss/', - 'ka': 'http://kickapps.com/karss', - } - - title = xpath_text(item, './title', 'title') - description = xpath_text( - item, './description', 'description', fatal=False) - - timestamp = parse_iso8601(xpath_text( - item, xpath_with_ns('./dc:date', NS_MAP), - 'upload date', fatal=False)) - uploader = xpath_text( - item, xpath_with_ns('./dc:creator', NS_MAP), - 'uploader', fatal=False) - - media_content = item.find( - xpath_with_ns('./media:content', NS_MAP)) - duration = int_or_none(media_content.get('duration')) - smil_url = media_content.get('url') - - thumbnail = media_content.find( - xpath_with_ns('./media:thumbnail', NS_MAP)).get('url') - - formats = self._extract_smil_formats(smil_url, display_id) - self._sort_formats(formats) - - return { - 'id': video_id, + info_dict.update({ 'display_id': display_id, - 'title': title, 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'uploader': uploader, - 'duration': duration, - 'formats': formats, - } + 'upload_date': upload_date, + }) + + return info_dict From a72df5f36feddaffbcfa35e3415562509a9f67b4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 18 Jun 2016 12:19:06 +0800 Subject: [PATCH 014/387] [mtvservices] Fix ext for RTMP streams --- youtube_dl/extractor/mtv.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 8a638a47c..dd0639589 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -85,9 +85,10 @@ class MTVServicesInfoExtractor(InfoExtractor): rtmp_video_url = rendition.find('./src').text if rtmp_video_url.endswith('siteunavail.png'): continue + new_url = self._transform_rtmp_url(rtmp_video_url) formats.append({ - 'ext': ext, - 'url': self._transform_rtmp_url(rtmp_video_url), + 'ext': 'flv' if new_url.startswith('rtmp') else ext, + 'url': new_url, 'format_id': rendition.get('bitrate'), 'width': int(rendition.get('width')), 'height': int(rendition.get('height')), From 26264cb0566309bcdc7f0f2a8ee376bf205fc675 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 18 Jun 2016 12:21:40 +0800 Subject: 
[PATCH 015/387] [adobetv] Use embedded data in the webpage Sometimes the HTML webpage is returned even with '?format=json' --- youtube_dl/extractor/adobetv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 8753ee2cf..5ae16fa16 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -156,7 +156,10 @@ class AdobeTVVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json(url + '?format=json', video_id) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json(self._search_regex( + r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id) formats = [{ 'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')), From 1b6cf16be7e8a771ca79e7eb161db9ff59238dab Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 18 Jun 2016 12:27:39 +0800 Subject: [PATCH 016/387] [aftonbladet] Fix extraction --- youtube_dl/extractor/aftonbladet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index d548592fe..5766b4fe8 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -24,10 +24,10 @@ class AftonbladetIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # find internal video meta data - meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' + meta_url = 'http://aftonbladet-play-metadata.cdn.drvideo.aptoma.no/video/%s.json' player_config = self._parse_json(self._html_search_regex( r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) - internal_meta_id = player_config['videoId'] + internal_meta_id = player_config['aptomaVideoId'] internal_meta_url = meta_url % internal_meta_id internal_meta_json = self._download_json( internal_meta_url, video_id, 'Downloading video meta data') From 09e3f91e408eb357929abad3710c799376004138 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 18 Jun 2016 12:34:58 +0800 Subject: [PATCH 017/387] [arte] Update _TESTS and fix for pages with multiple YouTube videos Some tests are from #6895 and #6613 --- youtube_dl/extractor/arte.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index f40532929..9e39faf47 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -240,10 +240,10 @@ class ArteTVPlus7IE(ArteTVBaseIE): return self._extract_from_json_url(json_url, video_id, lang, title=title) # Different kind of embed URL (e.g. 
# http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) - embed_url = self._search_regex( - r']+src=(["\'])(?P.+?)\1', - webpage, 'embed url', group='url') - return self.url_result(embed_url) + entries = [ + self.url_result(url) + for _, url in re.findall(r']+src=(["\'])(?P.+?)\1', webpage)] + return self.playlist_result(entries) # It also uses the arte_vp_url url from the webpage to extract the information @@ -252,22 +252,17 @@ class ArteTVCreativeIE(ArteTVPlus7IE): _VALID_URL = r'https?://creative\.arte\.tv/(?Pfr|de|en|es)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ - 'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', + 'url': 'http://creative.arte.tv/fr/episode/osmosis-episode-1', 'info_dict': { - 'id': '72176', + 'id': '057405-001-A', 'ext': 'mp4', - 'title': 'Folge 2 - Corporate Design', - 'upload_date': '20131004', + 'title': 'OSMOSIS - N\'AYEZ PLUS PEUR D\'AIMER (1)', + 'upload_date': '20150716', }, }, { 'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion', - 'info_dict': { - 'id': '160676', - 'ext': 'mp4', - 'title': 'Monty Python live (mostly)', - 'description': 'Événement ! Quarante-cinq ans après leurs premiers succès, les légendaires Monty Python remontent sur scène.\n', - 'upload_date': '20140805', - } + 'playlist_count': 11, + 'add_ie': ['Youtube'], }, { 'url': 'http://creative.arte.tv/de/episode/agentur-amateur-4-der-erste-kunde', 'only_matching': True, @@ -349,14 +344,13 @@ class ArteTVCinemaIE(ArteTVPlus7IE): _VALID_URL = r'https?://cinema\.arte\.tv/(?Pfr|de|en|es)/(?P.+)' _TESTS = [{ - 'url': 'http://cinema.arte.tv/de/node/38291', - 'md5': '6b275511a5107c60bacbeeda368c3aa1', + 'url': 'http://cinema.arte.tv/fr/article/les-ailes-du-desir-de-julia-reck', + 'md5': 'a5b9dd5575a11d93daf0e3f404f45438', 'info_dict': { - 'id': '055876-000_PWA12025-D', + 'id': '062494-000-A', 'ext': 'mp4', - 'title': 'Tod auf dem Nil', - 'upload_date': '20160122', - 'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', + 'title': 'Film lauréat du concours web - "Les ailes du désir" de Julia Reck', + 'upload_date': '20150807', }, }] From 573c35272f7a1973e44109614c8639e0d3e21fdd Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 18 Jun 2016 12:35:55 +0800 Subject: [PATCH 018/387] [bbc] Skip a geo-restricted test case --- youtube_dl/extractor/bbc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index de236fbde..4b3cd8c65 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -192,6 +192,7 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'Now it\'s really geo-restricted', }, { # compact player (https://github.com/rg3/youtube-dl/issues/8147) 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', From 1f3574575851eb34b6c6a983e276fa77a0dc3da1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 18 Jun 2016 12:39:08 +0800 Subject: [PATCH 019/387] [azubu] Don't fail on optional fields --- youtube_dl/extractor/azubu.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py index efa624de1..a813eb429 100644 --- a/youtube_dl/extractor/azubu.py +++ b/youtube_dl/extractor/azubu.py @@ -46,6 +46,7 @@ class AzubuIE(InfoExtractor): 'uploader_id': 272749, 'view_count': int, }, + 'skip': 'Channel offline', }, ] @@ -56,22 +57,26 @@ class AzubuIE(InfoExtractor): 'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data'] title = 
data['title'].strip() - description = data['description'] - thumbnail = data['thumbnail'] - view_count = data['view_count'] - uploader = data['user']['username'] - uploader_id = data['user']['id'] + description = data.get('description') + thumbnail = data.get('thumbnail') + view_count = data.get('view_count') + user = data.get('user', {}) + uploader = user.get('username') + uploader_id = user.get('id') stream_params = json.loads(data['stream_params']) - timestamp = float_or_none(stream_params['creationDate'], 1000) - duration = float_or_none(stream_params['length'], 1000) + timestamp = float_or_none(stream_params.get('creationDate'), 1000) + duration = float_or_none(stream_params.get('length'), 1000) renditions = stream_params.get('renditions') or [] video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength') if video: renditions.append(video) + if not renditions and not user.get('channel', {}).get('is_live', True): + raise ExtractorError('%s said: channel is offline.' % self.IE_NAME, expected=True) + formats = [{ 'url': fmt['url'], 'width': fmt['frameWidth'], From 0278aa443f7b4cc0886f40d6704925dc2488921d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 18 Jun 2016 12:53:48 +0800 Subject: [PATCH 020/387] [br] Skip invalid tests --- youtube_dl/extractor/br.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 11cf49851..ff0aa11b1 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -29,7 +29,8 @@ class BRIE(InfoExtractor): 'duration': 180, 'uploader': 'Reinhard Weber', 'upload_date': '20150422', - } + }, + 'skip': '404 not found', }, { 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', @@ -40,7 +41,8 @@ class BRIE(InfoExtractor): 'title': 'Manfred Schreiber ist tot', 'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97', 'duration': 26, - } + }, + 'skip': '404 not found', }, { 'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html', @@ -51,7 +53,8 @@ class BRIE(InfoExtractor): 'title': 'Kurzweilig und sehr bewegend', 'description': 'md5:0351996e3283d64adeb38ede91fac54e', 'duration': 296, - } + }, + 'skip': '404 not found', }, { 'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html', From 6d0d4fc26d45c55ef6e99b31892047b0bdfed0e0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 18 Jun 2016 13:40:55 +0800 Subject: [PATCH 021/387] [wdr] Add WDRBaseIE, for Sportschau (#9799) --- youtube_dl/extractor/wdr.py | 179 +++++++++++++++++++----------------- 1 file changed, 95 insertions(+), 84 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 6b83a2a04..390f9e830 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -15,7 +15,87 @@ from ..utils import ( ) -class WDRIE(InfoExtractor): +class WDRBaseIE(InfoExtractor): + def _extract_wdr_video(self, webpage, display_id): + # for wdr.de the data-extension is in a tag with the class "mediaLink" + # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" + # for wdrmaus its in a link to the page in a multiline "videoLink"-tag + json_metadata = self._html_search_regex( + r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', + webpage, 'media link', default=None, flags=re.MULTILINE) + + if not json_metadata: + return + + media_link_obj = 
self._parse_json(json_metadata, display_id, + transform_source=js_to_json) + jsonp_url = media_link_obj['mediaObj']['url'] + + metadata = self._download_json( + jsonp_url, 'metadata', transform_source=strip_jsonp) + + metadata_tracker_data = metadata['trackerData'] + metadata_media_resource = metadata['mediaResource'] + + formats = [] + + # check if the metadata contains a direct URL to a file + for kind, media_resource in metadata_media_resource.items(): + if kind not in ('dflt', 'alt'): + continue + + for tag_name, medium_url in media_resource.items(): + if tag_name not in ('videoURL', 'audioURL'): + continue + + ext = determine_ext(medium_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + medium_url, display_id, 'mp4', 'm3u8_native', + m3u8_id='hls')) + elif ext == 'f4m': + manifest_url = update_url_query( + medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) + formats.extend(self._extract_f4m_formats( + manifest_url, display_id, f4m_id='hds', fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + medium_url, 'stream', fatal=False)) + else: + a_format = { + 'url': medium_url + } + if ext == 'unknown_video': + urlh = self._request_webpage( + medium_url, display_id, note='Determining extension') + ext = urlhandle_detect_ext(urlh) + a_format['ext'] = ext + formats.append(a_format) + + self._sort_formats(formats) + + subtitles = {} + caption_url = metadata_media_resource.get('captionURL') + if caption_url: + subtitles['de'] = [{ + 'url': caption_url, + 'ext': 'ttml', + }] + + title = metadata_tracker_data['trackerClipTitle'] + + return { + 'id': metadata_tracker_data.get('trackerClipId', display_id), + 'display_id': display_id, + 'title': title, + 'alt_title': metadata_tracker_data.get('trackerClipSubcategory'), + 'formats': formats, + 'subtitles': subtitles, + 'upload_date': unified_strdate(metadata_tracker_data.get('trackerClipAirTime')), + } + + +class WDRIE(WDRBaseIE): _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P[^/]+)/(?P.+)\.html' _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL @@ -91,10 +171,10 @@ class WDRIE(InfoExtractor): }, { 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5', - # HDS download, MD5 is unstable + 'md5': '803138901f6368ee497b4d195bb164f2', 'info_dict': { 'id': 'mdb-186083', - 'ext': 'flv', + 'ext': 'mp4', 'upload_date': '20130919', 'title': 'Sachgeschichte - Achterbahn ', 'description': '- Die Sendung mit der Maus -', @@ -120,14 +200,9 @@ class WDRIE(InfoExtractor): display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - # for wdr.de the data-extension is in a tag with the class "mediaLink" - # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" - # for wdrmaus its in a link to the page in a multiline "videoLink"-tag - json_metadata = self._html_search_regex( - r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', - webpage, 'media link', default=None, flags=re.MULTILINE) + info_dict = self._extract_wdr_video(webpage, display_id) - if not json_metadata: + if not info_dict: entries = [ self.url_result(page_url + href[0], 'WDR') for href in re.findall( @@ -140,86 +215,22 @@ class WDRIE(InfoExtractor): raise ExtractorError('No downloadable streams found', expected=True) - media_link_obj = self._parse_json(json_metadata, display_id, - 
transform_source=js_to_json) - jsonp_url = media_link_obj['mediaObj']['url'] - - metadata = self._download_json( - jsonp_url, 'metadata', transform_source=strip_jsonp) - - metadata_tracker_data = metadata['trackerData'] - metadata_media_resource = metadata['mediaResource'] - - formats = [] - - # check if the metadata contains a direct URL to a file - for kind, media_resource in metadata_media_resource.items(): - if kind not in ('dflt', 'alt'): - continue - - for tag_name, medium_url in media_resource.items(): - if tag_name not in ('videoURL', 'audioURL'): - continue - - ext = determine_ext(medium_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - medium_url, display_id, 'mp4', 'm3u8_native', - m3u8_id='hls')) - elif ext == 'f4m': - manifest_url = update_url_query( - medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) - formats.extend(self._extract_f4m_formats( - manifest_url, display_id, f4m_id='hds', fatal=False)) - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - medium_url, 'stream', fatal=False)) - else: - a_format = { - 'url': medium_url - } - if ext == 'unknown_video': - urlh = self._request_webpage( - medium_url, display_id, note='Determining extension') - ext = urlhandle_detect_ext(urlh) - a_format['ext'] = ext - formats.append(a_format) - - self._sort_formats(formats) - - subtitles = {} - caption_url = metadata_media_resource.get('captionURL') - if caption_url: - subtitles['de'] = [{ - 'url': caption_url, - 'ext': 'ttml', - }] - - title = metadata_tracker_data.get('trackerClipTitle') is_live = url_type == 'live' if is_live: - title = self._live_title(title) - upload_date = None - elif 'trackerClipAirTime' in metadata_tracker_data: - upload_date = metadata_tracker_data['trackerClipAirTime'] - else: - upload_date = self._html_search_meta('DC.Date', webpage, 'upload date') + info_dict.update({ + 'title': self._live_title(info_dict['title']), + 'upload_date': None, + }) + elif 'upload_date' not in info_dict: + info_dict['upload_date'] = unified_strdate(self._html_search_meta('DC.Date', webpage, 'upload date')) - if upload_date: - upload_date = unified_strdate(upload_date) - - return { - 'id': metadata_tracker_data.get('trackerClipId', display_id), - 'display_id': display_id, - 'title': title, - 'alt_title': metadata_tracker_data.get('trackerClipSubcategory'), - 'formats': formats, - 'upload_date': upload_date, + info_dict.update({ 'description': self._html_search_meta('Description', webpage), 'is_live': is_live, - 'subtitles': subtitles, - } + }) + + return info_dict class WDRMobileIE(InfoExtractor): From b5aad37f6bdc72acaca198202dc9f7eaa3185e51 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 18 Jun 2016 13:41:50 +0800 Subject: [PATCH 022/387] [ard] Remove SportschauIE, which is now based on WDR (#9799) --- youtube_dl/extractor/ard.py | 39 ------------------------------------- 1 file changed, 39 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 26446c2fe..fd45b3e42 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,7 +8,6 @@ from .generic import GenericIE from ..utils import ( determine_ext, ExtractorError, - get_element_by_attribute, qualities, int_or_none, parse_duration, @@ -274,41 +273,3 @@ class ARDIE(InfoExtractor): 'upload_date': upload_date, 'thumbnail': thumbnail, } - - -class SportschauIE(ARDMediathekIE): - IE_NAME = 'Sportschau' - _VALID_URL = r'(?Phttps?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P[^/#?]+))\.html' - _TESTS = [{ - 'url': 
'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', - 'info_dict': { - 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', - 'ext': 'mp4', - 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - base_url = mobj.group('baseurl') - - webpage = self._download_webpage(url, video_id) - title = get_element_by_attribute('class', 'headline', webpage) - description = self._html_search_meta('description', webpage, 'description') - - info = self._extract_media_info( - base_url + '-mc_defaultQuality-h.json', webpage, video_id) - - info.update({ - 'title': title, - 'description': description, - }) - - return info From e8f13f2637fd33b20ac2682dbbdaef63b6288bf4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 18 Jun 2016 13:42:58 +0800 Subject: [PATCH 023/387] [sportschau.de] Fix extraction and moved to its own file (closes #9799) --- youtube_dl/extractor/sportschau.py | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/sportschau.py diff --git a/youtube_dl/extractor/sportschau.py b/youtube_dl/extractor/sportschau.py new file mode 100644 index 000000000..0d7925a08 --- /dev/null +++ b/youtube_dl/extractor/sportschau.py @@ -0,0 +1,38 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .wdr import WDRBaseIE +from ..utils import get_element_by_attribute + + +class SportschauIE(WDRBaseIE): + IE_NAME = 'Sportschau' + _VALID_URL = r'https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video-?(?P[^/#?]+)\.html' + _TEST = { + 'url': 'http://www.sportschau.de/uefaeuro2016/videos/video-dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100.html', + 'info_dict': { + 'id': 'mdb-1140188', + 'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100', + 'ext': 'mp4', + 'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen', + 'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. 
Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.', + 'upload_date': '20160615', + }, + 'skip': 'Geo-restricted to Germany', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = get_element_by_attribute('class', 'headline', webpage) + description = self._html_search_meta('description', webpage, 'description') + + info = self._extract_wdr_video(webpage, video_id) + + info.update({ + 'title': title, + 'description': description, + }) + + return info From b0b128049a9180e58698e74bdc8079d80086dbef Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 18 Jun 2016 13:43:47 +0800 Subject: [PATCH 024/387] [extractors] Update references to sportschau (#9799) --- youtube_dl/extractor/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index efa58f70f..6dc5904b3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -44,7 +44,6 @@ from .archiveorg import ArchiveOrgIE from .ard import ( ARDIE, ARDMediathekIE, - SportschauIE, ) from .arte import ( ArteTvIE, @@ -747,6 +746,7 @@ from .sportbox import ( SportBoxEmbedIE, ) from .sportdeutschland import SportDeutschlandIE +from .sportschau import SportschauIE from .srgssr import ( SRGSSRIE, SRGSSRPlayIE, From d7c6c656c5c2fa64a1f8a4365a5fe62861b1dceb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jun 2016 21:42:17 +0700 Subject: [PATCH 025/387] [arte:+7] Expand _VALID_URL (Closes #9820) --- youtube_dl/extractor/arte.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 9e39faf47..e602d3673 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -180,11 +180,14 @@ class ArteTVBaseIE(InfoExtractor): class ArteTVPlus7IE(ArteTVBaseIE): IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?Pfr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P[^/]+)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?Pfr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P[^/]+)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', 'only_matching': True, + }. { + 'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22', + 'only_matching': True, }] @classmethod From c1823c8ad9ca49f61ef15bbc126b0f95e4825d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jun 2016 22:08:48 +0700 Subject: [PATCH 026/387] [README.md] Remove 'small' from description (#9814) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f1e59542d..c6feef116 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Or with [MacPorts](https://www.macports.org/): Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://rg3.github.io/youtube-dl/download.html). # DESCRIPTION -**youtube-dl** is a small command-line program to download videos from +**youtube-dl** is a command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version 2.6, 2.7, or 3.2+, and it is not platform specific. 
It should work on your Unix box, on Windows or on Mac OS X. It is released to the public domain, From 90b6288cce3e5a433a521bc862d98d31be9624c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jun 2016 22:23:48 +0700 Subject: [PATCH 027/387] [arte:+7] Simplify _VALID_URL --- youtube_dl/extractor/arte.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index e602d3673..049f1fa9e 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -180,12 +180,12 @@ class ArteTVBaseIE(InfoExtractor): class ArteTVPlus7IE(ArteTVBaseIE): IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?Pfr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P[^/]+)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?Pfr|de|en|es)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', 'only_matching': True, - }. { + }, { 'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22', 'only_matching': True, }] From 41c1023300596f62dff93d9275f5e4d7a6762e66 Mon Sep 17 00:00:00 2001 From: Steven Gosseling Date: Fri, 26 Feb 2016 13:31:52 +0100 Subject: [PATCH 028/387] [closertotruth] Add extractor Removed print statement from code. Replaced two regex searches with the corret ones. Removed some unnecessary semicolumns fixed title extraction refactored everything to search_regex processed comments on commit 5650b0d, fixed feedback from flake8 Improved regexes and returns info dict now. Added support for closertotruth interview URL Added support for episodes page --- youtube_dl/extractor/closertotruth.py | 69 +++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 youtube_dl/extractor/closertotruth.py diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py new file mode 100644 index 000000000..d04ff5e4f --- /dev/null +++ b/youtube_dl/extractor/closertotruth.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CloserToTruthIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(episodes/|(series|interviews)/(?:[^#]+#video-)?(?P\d+))' + _TESTS = [ + { + 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', + 'md5': '5c548bde260a9247ddfdc07c7458ed29', + 'info_dict': { + 'id': '0_zof1ktre', + 'ext': 'mov', + 'title': 'Solutions to the Mind-Body Problem?', + 'upload_date': '20140221', + 'timestamp': 1392956007, + 'uploader_id': 'CTTXML' + } + }, + { + 'url': 'http://closertotruth.com/interviews/1725', + 'md5': 'b00598fd6a38372edb976408f72c5792', + 'info_dict': { + 'id': '0_19qv5rn1', + 'ext': 'mov', + 'title': 'AyaFr-002 - Francisco J. 
Ayala', + 'upload_date': '20140307', + 'timestamp': 1394236431, + 'uploader_id': 'CTTXML' + } + }, + { + 'url': 'http://closertotruth.com/episodes/how-do-brains-work', + 'md5': '4dd96aa0a5c296afa5c0bd24895c2f16', + 'info_dict': { + 'id': '0_iuxai6g6', + 'ext': 'mov', + 'title': 'How do Brains Work?', + 'upload_date': '20140221', + 'timestamp': 1392956024, + 'uploader_id': 'CTTXML' + } + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_title = self._search_regex(r'(.+) \|.+', webpage, 'video title') + + entry_id = self._search_regex(r']+id="(?:video-%s|embed-kaltura)"[^>]+data-kaltura="([^"]+)' % video_id, webpage, "video entry_id") + + interviewee_name = self._search_regex(r'
(.*)

.+', webpage, "video interviewee_name", False) + + if interviewee_name: + video_title = video_title + ' - ' + interviewee_name + + p_id = self._search_regex(r']+src=["\'].+?partner_id/(\d+)', webpage, "kaltura partner_id") + + return { + '_type': 'url_transparent', + 'id': entry_id, + 'url': 'kaltura:%s:%s' % (p_id, entry_id), + 'ie_key': 'Kaltura', + 'title': video_title + } From cb23192bc4c56d80229a7a5f70cb61d0879db6c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 19 Jun 2016 00:35:29 +0700 Subject: [PATCH 029/387] [closertotruth] Update and improve (Closes #8680) --- youtube_dl/extractor/closertotruth.py | 117 +++++++++++++++----------- youtube_dl/extractor/extractors.py | 1 + 2 files changed, 71 insertions(+), 47 deletions(-) diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py index d04ff5e4f..26243d52d 100644 --- a/youtube_dl/extractor/closertotruth.py +++ b/youtube_dl/extractor/closertotruth.py @@ -1,69 +1,92 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor class CloserToTruthIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(episodes/|(series|interviews)/(?:[^#]+#video-)?(?P\d+))' - _TESTS = [ - { - 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', - 'md5': '5c548bde260a9247ddfdc07c7458ed29', - 'info_dict': { - 'id': '0_zof1ktre', - 'ext': 'mov', - 'title': 'Solutions to the Mind-Body Problem?', - 'upload_date': '20140221', - 'timestamp': 1392956007, - 'uploader_id': 'CTTXML' - } + _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', + 'info_dict': { + 'id': '0_zof1ktre', + 'display_id': 'solutions-the-mind-body-problem', + 'ext': 'mov', + 'title': 'Solutions to the Mind-Body Problem?', + 'upload_date': '20140221', + 'timestamp': 1392956007, + 'uploader_id': 'CTTXML' }, - { - 'url': 'http://closertotruth.com/interviews/1725', - 'md5': 'b00598fd6a38372edb976408f72c5792', - 'info_dict': { - 'id': '0_19qv5rn1', - 'ext': 'mov', - 'title': 'AyaFr-002 - Francisco J. 
Ayala', - 'upload_date': '20140307', - 'timestamp': 1394236431, - 'uploader_id': 'CTTXML' - } + 'params': { + 'skip_download': True, }, - { - 'url': 'http://closertotruth.com/episodes/how-do-brains-work', - 'md5': '4dd96aa0a5c296afa5c0bd24895c2f16', - 'info_dict': { - 'id': '0_iuxai6g6', - 'ext': 'mov', - 'title': 'How do Brains Work?', - 'upload_date': '20140221', - 'timestamp': 1392956024, - 'uploader_id': 'CTTXML' - } + }, { + 'url': 'http://closertotruth.com/episodes/how-do-brains-work', + 'info_dict': { + 'id': '0_iuxai6g6', + 'display_id': 'how-do-brains-work', + 'ext': 'mov', + 'title': 'How do Brains Work?', + 'upload_date': '20140221', + 'timestamp': 1392956024, + 'uploader_id': 'CTTXML' }, - ] + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://closertotruth.com/interviews/1725', + 'info_dict': { + 'id': '1725', + 'title': 'AyaFr-002', + }, + 'playlist_mincount': 2, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) - video_title = self._search_regex(r'(.+) \|.+', webpage, 'video title') + webpage = self._download_webpage(url, display_id) - entry_id = self._search_regex(r']+id="(?:video-%s|embed-kaltura)"[^>]+data-kaltura="([^"]+)' % video_id, webpage, "video entry_id") + partner_id = self._search_regex( + r']+src=["\'].*?\b(?:partner_id|p)/(\d+)', + webpage, 'kaltura partner_id') - interviewee_name = self._search_regex(r'
(.*).+', webpage, "video interviewee_name", False) + title = self._search_regex( + r'(.+?)\s*\|\s*.+?', webpage, 'video title') - if interviewee_name: - video_title = video_title + ' - ' + interviewee_name + select = self._search_regex( + r'(?s)]+id="select-version"[^>]*>(.+?)', + webpage, 'select version', default=None) + if select: + entry_ids = set() + entries = [] + for mobj in re.finditer( + r']+value=(["\'])(?P[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P[^<]+)', + webpage): + entry_id = mobj.group('id') + if entry_id in entry_ids: + continue + entry_ids.add(entry_id) + entries.append({ + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': 'Kaltura', + 'title': mobj.group('title'), + }) + if entries: + return self.playlist_result(entries, display_id, title) - p_id = self._search_regex(r'<script[^>]+src=["\'].+?partner_id/(\d+)', webpage, "kaltura partner_id") + entry_id = self._search_regex( + r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2', + webpage, 'kaltura entry_id', group='id') return { '_type': 'url_transparent', - 'id': entry_id, - 'url': 'kaltura:%s:%s' % (p_id, entry_id), + 'display_id': display_id, + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), 'ie_key': 'Kaltura', - 'title': video_title + 'title': title } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6dc5904b3..2ff867651 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -140,6 +140,7 @@ from .cliprs import ClipRsIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE +from .closertotruth import CloserToTruthIE from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE From 7577d849a62ecdcc52ede6dcf73edf2a717fc646 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 19 Jun 2016 02:25:34 +0700 Subject: [PATCH 030/387] [r7] Fix extraction and add support for articles (Closes #9826) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/r7.py | 95 +++++++++++++++++++----------- 2 files changed, 64 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2ff867651..b1b04f2fc 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -631,7 +631,10 @@ from .qqmusic import ( QQMusicToplistIE, QQMusicPlaylistIE, ) -from .r7 import R7IE +from .r7 import ( + R7IE, + R7ArticleIE, +) from .radiocanada import ( RadioCanadaIE, RadioCanadaAudioVideoIE, diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py index 976c8feec..069dbfaed 100644 --- a/youtube_dl/extractor/r7.py +++ b/youtube_dl/extractor/r7.py @@ -2,22 +2,19 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - js_to_json, - unescapeHTML, - int_or_none, -) +from ..utils import int_or_none class R7IE(InfoExtractor): - _VALID_URL = r'''(?x)https?:// + _VALID_URL = r'''(?x) + https?:// (?: (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/| noticias\.r7\.com(?:/[^/]+)+/[^/]+-| player\.r7\.com/video/i/ ) (?P<id>[\da-f]{24}) - ''' + ''' _TESTS = [{ 'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html', 'md5': '403c4e393617e8e8ddc748978ee8efde', @@ -25,6 +22,7 @@ class R7IE(InfoExtractor): 'id': '54e7050b0cf2ff57e0279389', 'ext': 'mp4', 'title': 'Policiais humilham suspeito 
à beira da morte: "Morre com dignidade"', + 'description': 'md5:01812008664be76a6479aa58ec865b72', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 98, 'like_count': int, @@ -44,45 +42,72 @@ class R7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://player.r7.com/video/i/%s' % video_id, video_id) + video = self._download_json( + 'http://player-api.r7.com/video/i/%s' % video_id, video_id) - item = self._parse_json(js_to_json(self._search_regex( - r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id) - - title = unescapeHTML(item['title']) - thumbnail = item.get('init', {}).get('thumbUri') - duration = None - - statistics = item.get('statistics', {}) - like_count = int_or_none(statistics.get('likes')) - view_count = int_or_none(statistics.get('views')) + title = video['title'] formats = [] - for format_key, format_dict in item['playlist'][0].items(): - src = format_dict.get('src') - if not src: - continue - format_id = format_dict.get('format') or format_key - if duration is None: - duration = format_dict.get('duration') - if '.f4m' in src: - formats.extend(self._extract_f4m_formats(src, video_id, preference=-1)) - elif src.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2)) - else: - formats.append({ - 'url': src, - 'format_id': format_id, - }) + media_url_hls = video.get('media_url_hls') + if media_url_hls: + formats.extend(self._extract_m3u8_formats( + media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + media_url = video.get('media_url') + if media_url: + f = { + 'url': media_url, + 'format_id': 'http', + } + # m3u8 format always matches the http format, let's copy metadata from + # one to another + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + formats)) + if len(m3u8_formats) == 1: + f_copy = m3u8_formats[0].copy() + f_copy.update(f) + f_copy['protocol'] = 'http' + f = f_copy + formats.append(f) self._sort_formats(formats) + description = video.get('description') + thumbnail = video.get('thumb') + duration = int_or_none(video.get('media_duration')) + like_count = int_or_none(video.get('likes')) + view_count = int_or_none(video.get('views')) + return { 'id': video_id, 'title': title, + 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'like_count': like_count, 'view_count': view_count, 'formats': formats, } + + +class R7ArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)' + _TEST = { + 'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})', + webpage, 'video id') + + return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key()) From 589568789f500b7a515355a07efec4bcec0f3243 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 19 Jun 2016 02:30:29 +0700 Subject: [PATCH 031/387] release 2016.06.19 --- .github/ISSUE_TEMPLATE.md | 6 +++--- 
docs/supportedsites.md | 4 +++- youtube_dl/version.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 2736bb23b..52e04aa74 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.18.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.18.1** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.19*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.19** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.18.1 +[debug] youtube-dl version 2016.06.19 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c79798d86..7c90940c7 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -44,8 +44,8 @@ - **appletrailers:section** - **archive.org**: archive.org videos - **ARD** - - **ARD:mediathek**: Saarländischer Rundfunk - **ARD:mediathek** + - **ARD:mediathek**: Saarländischer Rundfunk - **arte.tv** - **arte.tv:+7** - **arte.tv:cinema** @@ -128,6 +128,7 @@ - **cliphunter** - **ClipRs** - **Clipsyndicate** + - **CloserToTruth** - **cloudtime**: CloudTime - **Cloudy** - **Clubic** @@ -521,6 +522,7 @@ - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 - **R7** + - **R7Article** - **radio.de** - **radiobremen** - **radiocanada** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f71bc8c2a..417e86ed6 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.18.1' +__version__ = '2016.06.19' From 7c05097633138459e9bdf7e10738e021b04689a7 Mon Sep 17 00:00:00 2001 From: Lucas Moura <lucas.moura128@gmail.com> Date: Sat, 18 Jun 2016 17:01:47 -0300 Subject: [PATCH 032/387] [jsinterp] Avoid double key lookup for setting new key In order to add a new key to both __objects and __functions dicts on jsinterp.py, it is necessary to first verify if a key was present and if not, create the key and assign it to a value. However, this can be done with a single step using dict setdefault method. 
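For context, the two patterns read roughly as follows; this is a self-contained sketch with placeholder names (cache, key, build), not the actual jsinterp identifiers:

    def build(key):            # stand-in for an expensive constructor
        return key.upper()

    cache = {}
    key = 'example'

    # before: explicit membership test plus a separate assignment
    if key not in cache:
        cache[key] = build(key)
    value = cache[key]

    # after: setdefault inserts build(key) only when key is absent and
    # always returns the value now stored under key
    value = cache.setdefault(key, build(key))

Note that setdefault still evaluates build(key) on every call; only the insertion is skipped when the key already exists.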
--- youtube_dl/jsinterp.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a7440c582..4a5a0dbc3 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -131,9 +131,8 @@ class JSInterpreter(object): if variable in local_vars: obj = local_vars[variable] else: - if variable not in self._objects: - self._objects[variable] = self.extract_object(variable) - obj = self._objects[variable] + obj = self._objects.setdefault( + variable, self.extract_object(variable)) if arg_str is None: # Member access @@ -204,8 +203,7 @@ class JSInterpreter(object): argvals = tuple([ int(v) if v.isdigit() else local_vars[v] for v in m.group('args').split(',')]) - if fname not in self._functions: - self._functions[fname] = self.extract_function(fname) + self._functions.setdefault(fname, self.extract_function(fname)) return self._functions[fname](argvals) raise ExtractorError('Unsupported JS expression %r' % expr) From 6a55bb66ee4367a8445c8df2d803090e68c42fe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 19 Jun 2016 03:56:01 +0700 Subject: [PATCH 033/387] [vimeo] Fix rented videos (Closes #9830) --- youtube_dl/extractor/vimeo.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 0fd2c18a0..c52986af6 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,6 +8,7 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_str, compat_urlparse, ) from ..utils import ( @@ -24,6 +25,7 @@ from ..utils import ( urlencode_postdata, unescapeHTML, parse_filesize, + try_get, ) @@ -445,7 +447,18 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password(url, video_id) - if '>You rented this title.<' in webpage: + def is_rented(): + if '>You rented this title.<' in webpage: + return True + if config.get('user', {}).get('purchased'): + return True + label = try_get( + config, lambda x: x['video']['vod']['purchase_options'][0]['label_string'], compat_str) + if label and label.startswith('You rented this'): + return True + return False + + if is_rented(): feature_id = config.get('video', {}).get('vod', {}).get('feature_id') if feature_id and not data.get('force_feature_id', False): return self.url_result(smuggle_url( From a50fd6e0263b6e5d97a13a0f781a2325c0ab7efa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 19 Jun 2016 03:57:14 +0700 Subject: [PATCH 034/387] release 2016.06.19.1 --- .github/ISSUE_TEMPLATE.md | 6 +++--- youtube_dl/version.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 52e04aa74..1b25628bf 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.19*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.19** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.19.1*. 
If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.19.1** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.19 +[debug] youtube-dl version 2016.06.19.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 417e86ed6..a3e2c3079 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.19' +__version__ = '2016.06.19.1' From e154c65128305fcaf6c6b2d5fd41ecd83e4ec154 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 10 Jan 2016 20:09:53 +0100 Subject: [PATCH 035/387] [downloader/hls] Add support for AES-128 encrypted segments in hlsnative downloader --- youtube_dl/downloader/hls.py | 78 ++++++++++++++++++++++++---------- youtube_dl/extractor/common.py | 17 ++------ youtube_dl/utils.py | 9 ++++ 3 files changed, 68 insertions(+), 36 deletions(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 54f2108e9..1d5f178a0 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -2,14 +2,24 @@ from __future__ import unicode_literals import os.path import re +import binascii +try: + from Crypto.Cipher import AES + can_decrypt_frag = True +except ImportError: + can_decrypt_frag = False from .fragment import FragmentFD from .external import FFmpegFD -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_struct_pack, +) from ..utils import ( encodeFilename, sanitize_open, + parse_m3u8_attributes, ) @@ -21,7 +31,7 @@ class HlsFD(FragmentFD): @staticmethod def can_download(manifest): UNSUPPORTED_FEATURES = ( - r'#EXT-X-KEY:METHOD=(?!NONE)', # encrypted streams [1] + r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1] r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] # Live streams heuristic does not always work (e.g. geo restricted to Germany @@ -39,7 +49,9 @@ class HlsFD(FragmentFD): # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 # 4. 
https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 ) - return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) + check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] + check_results.append(not (re.search(r'#EXT-X-KEY:METHOD=AES-128', manifest) and not can_decrypt_frag)) + return all(check_results) def real_download(self, filename, info_dict): man_url = info_dict['url'] @@ -57,36 +69,58 @@ class HlsFD(FragmentFD): fd.add_progress_hook(ph) return fd.real_download(filename, info_dict) - fragment_urls = [] + total_frags = 0 for line in s.splitlines(): line = line.strip() if line and not line.startswith('#'): - segment_url = ( - line - if re.match(r'^https?://', line) - else compat_urlparse.urljoin(man_url, line)) - fragment_urls.append(segment_url) - # We only download the first fragment during the test - if self.params.get('test', False): - break + total_frags += 1 ctx = { 'filename': filename, - 'total_frags': len(fragment_urls), + 'total_frags': total_frags, } self._prepare_and_start_frag_download(ctx) + i = 0 + media_sequence = 0 + decrypt_info = {'METHOD': 'NONE'} frags_filenames = [] - for i, frag_url in enumerate(fragment_urls): - frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) - success = ctx['dl'].download(frag_filename, {'url': frag_url}) - if not success: - return False - down, frag_sanitized = sanitize_open(frag_filename, 'rb') - ctx['dest_stream'].write(down.read()) - down.close() - frags_filenames.append(frag_sanitized) + for line in s.splitlines(): + line = line.strip() + if line: + if not line.startswith('#'): + frag_url = ( + line + if re.match(r'^https?://', line) + else compat_urlparse.urljoin(man_url, line)) + frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) + success = ctx['dl'].download(frag_filename, {'url': frag_url}) + if not success: + return False + down, frag_sanitized = sanitize_open(frag_filename, 'rb') + frag_content = down.read() + down.close() + if decrypt_info['METHOD'] == 'AES-128': + iv = decrypt_info.get('IV') or compat_struct_pack(">8xq", media_sequence) + frag_content = AES.new(decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) + ctx['dest_stream'].write(frag_content) + frags_filenames.append(frag_sanitized) + # We only download the first fragment during the test + if self.params.get('test', False): + break + i += 1 + media_sequence += 1 + elif line.startswith('#EXT-X-KEY'): + decrypt_info = parse_m3u8_attributes(line[11:]) + if decrypt_info['METHOD'] == 'AES-128': + if 'IV' in decrypt_info: + decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:]) + if not re.match(r'^https?://', decrypt_info['URI']): + decrypt_info['URI'] = compat_urlparse.urljoin(man_url, decrypt_info['URI']) + decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read() + elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): + media_sequence = int(line[22:]) self._finish_frag_download(ctx) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index bfd432160..5a2603b50 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -53,6 +53,7 @@ from ..utils import ( mimetype2ext, update_Request, update_url_query, + parse_m3u8_attributes, ) @@ -1150,23 +1151,11 @@ class InfoExtractor(object): }] last_info = None last_media = None - kv_rex = re.compile( - r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)') for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): - last_info = {} - for m in 
kv_rex.finditer(line): - v = m.group('val') - if v.startswith('"'): - v = v[1:-1] - last_info[m.group('key')] = v + last_info = parse_m3u8_attributes(line) elif line.startswith('#EXT-X-MEDIA:'): - last_media = {} - for m in kv_rex.finditer(line): - v = m.group('val') - if v.startswith('"'): - v = v[1:-1] - last_media[m.group('key')] = v + last_media = parse_m3u8_attributes(line) elif line.startswith('#') or not line.strip(): continue else: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 82f67f6cd..562031fe1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2852,3 +2852,12 @@ def decode_packed_codes(code): return re.sub( r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], obfucasted_code) + + +def parse_m3u8_attributes(attrib): + info = {} + for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib): + if val.startswith('"'): + val = val[1:-1] + info[key] = val + return info From 6cd64b6806e92b7246aebd89448189180d88db82 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 19 Jun 2016 05:45:48 +0100 Subject: [PATCH 036/387] [foxsports] extract http formats --- youtube_dl/extractor/foxsports.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index df7665176..a3bb98377 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + smuggle_url, + update_url_query, +) class FoxSportsIE(InfoExtractor): @@ -9,11 +12,15 @@ class FoxSportsIE(InfoExtractor): _TEST = { 'url': 'http://www.foxsports.com/video?vid=432609859715', + 'md5': 'b49050e955bebe32c301972e4012ac17', 'info_dict': { - 'id': 'gA0bHB3Ladz3', - 'ext': 'flv', + 'id': 'i0qKWsk3qJaM', + 'ext': 'mp4', 'title': 'Courtney Lee on going up 2-0 in series vs. 
Blazers', 'description': 'Courtney Lee talks about Memphis being focused.', + 'upload_date': '20150423', + 'timestamp': 1429761109, + 'uploader': 'NEWA-FNG-FOXSPORTS', }, 'add_ie': ['ThePlatform'], } @@ -28,5 +35,8 @@ class FoxSportsIE(InfoExtractor): r"data-player-config='([^']+)'", webpage, 'data player config'), video_id) - return self.url_result(smuggle_url( - config['releaseURL'] + '&manifest=f4m', {'force_smil_url': True})) + return self.url_result(smuggle_url(update_url_query( + config['releaseURL'], { + 'mbr': 'true', + 'switch': 'http', + }), {'force_smil_url': True})) From 6c83e583b334226965bdf45583c09dbe8bfe9dab Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 19 Jun 2016 13:32:08 +0800 Subject: [PATCH 037/387] [radiojavan] PEP8 E275 is added in pycodestyle 2.6 See https://github.com/PyCQA/pycodestyle/pull/491 --- youtube_dl/extractor/radiojavan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/radiojavan.py b/youtube_dl/extractor/radiojavan.py index 884c28420..ec4fa6e60 100644 --- a/youtube_dl/extractor/radiojavan.py +++ b/youtube_dl/extractor/radiojavan.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import( +from ..utils import ( unified_strdate, str_to_int, ) From 5839d556e4d491ce940324965eaeecfb843306cc Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 19 Jun 2016 23:37:05 +0100 Subject: [PATCH 038/387] [theplatform] reduce requests for theplatform feed info extraction --- youtube_dl/extractor/theplatform.py | 54 +++++++++++++++++++---------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 5793ec6ef..07d222ae3 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -277,9 +277,9 @@ class ThePlatformIE(ThePlatformBaseIE): class ThePlatformFeedIE(ThePlatformBaseIE): - _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&byGuid=%s' - _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*byGuid=(?P<id>[a-zA-Z0-9_]+)' - _TEST = { + _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s' + _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[\w-]+))' + _TESTS = [{ # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207', 'md5': '6e32495b5073ab414471b615c5ded394', @@ -295,32 +295,38 @@ class ThePlatformFeedIE(ThePlatformBaseIE): 'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'], 'uploader': 'NBCU-NEWS', }, - } + }] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - provider_id = mobj.group('provider_id') - feed_id = mobj.group('feed_id') - - real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, video_id) - feed = self._download_json(real_url, video_id) - entry = feed['entries'][0] + def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}): + real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query) + entry = self._download_json(real_url, video_id)['entries'][0] formats = [] subtitles 
= {} first_video_id = None duration = None + asset_types = [] for item in entry['media$content']: - smil_url = item['plfile$url'] + '&mbr=true' + smil_url = item['plfile$url'] cur_video_id = ThePlatformIE._match_id(smil_url) if first_video_id is None: first_video_id = cur_video_id duration = float_or_none(item.get('plfile$duration')) - cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id) - formats.extend(cur_formats) - subtitles = self._merge_subtitles(subtitles, cur_subtitles) + for asset_type in item['plfile$assetTypes']: + if asset_type in asset_types: + continue + asset_types.append(asset_type) + query = { + 'mbr': 'true', + 'formats': item['plfile$format'], + 'assetTypes': asset_type, + } + if asset_type in asset_types_query: + query.update(asset_types_query[asset_type]) + cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query( + smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type) + formats.extend(cur_formats) + subtitles = self._merge_subtitles(subtitles, cur_subtitles) self._sort_formats(formats) @@ -344,5 +350,17 @@ class ThePlatformFeedIE(ThePlatformBaseIE): 'timestamp': timestamp, 'categories': categories, }) + if custom_fields: + ret.update(custom_fields(entry)) return ret + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + provider_id = mobj.group('provider_id') + feed_id = mobj.group('feed_id') + filter_query = mobj.group('filter') + + return self._extract_feed_info(provider_id, feed_id, filter_query, video_id) From 43518503a66d670330a8406829fc30b431420c9c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 19 Jun 2016 23:40:00 +0100 Subject: [PATCH 039/387] [cbs,cbsnews,cbssports] reduce requests while extracting all formats --- youtube_dl/extractor/cbs.py | 84 +++++++------------------------ youtube_dl/extractor/cbsnews.py | 31 +++--------- youtube_dl/extractor/cbssports.py | 40 +++++++-------- 3 files changed, 44 insertions(+), 111 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index ac2c7dced..030eeaa65 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -2,16 +2,14 @@ from __future__ import unicode_literals import re -from .theplatform import ThePlatformIE +from .theplatform import ThePlatformFeedIE from ..utils import ( - xpath_text, - xpath_element, int_or_none, find_xpath_attr, ) -class CBSBaseIE(ThePlatformIE): +class CBSBaseIE(ThePlatformFeedIE): def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') return { @@ -21,9 +19,22 @@ class CBSBaseIE(ThePlatformIE): }] } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] + def _extract_video_info(self, filter_query, video_id): + return self._extract_feed_info( + 'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id, lambda entry: { + 'series': entry.get('cbs$SeriesTitle'), + 'season_number': int_or_none(entry.get('cbs$SeasonNumber')), + 'episode': entry.get('cbs$EpisodeTitle'), + 'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')), + }, { + 'StreamPack': { + 'manifest': 'm3u', + } + }) + class CBSIE(CBSBaseIE): - _VALID_URL = r'(?:cbs:(?P<content_id>\w+)|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<display_id>[^/]+))' + _VALID_URL = 
r'(?:cbs|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -38,25 +49,7 @@ class CBSIE(CBSBaseIE): 'upload_date': '20131127', 'uploader': 'CBSI-NEW', }, - 'params': { - # rtmp download - 'skip_download': True, - }, - '_skip': 'Blocked outside the US', - }, { - 'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/', - 'info_dict': { - 'id': 'WWF_5KqY3PK1', - 'display_id': 'st-vincent', - 'ext': 'flv', - 'title': 'Live on Letterman - St. Vincent', - 'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.', - 'duration': 3221, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, + 'expected_warnings': ['Failed to download m3u8 information'], '_skip': 'Blocked outside the US', }, { 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', @@ -68,44 +61,5 @@ class CBSIE(CBSBaseIE): TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' def _real_extract(self, url): - content_id, display_id = re.match(self._VALID_URL, url).groups() - if not content_id: - webpage = self._download_webpage(url, display_id) - content_id = self._search_regex( - [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"], - webpage, 'content id') - items_data = self._download_xml( - 'http://can.cbs.com/thunder/player/videoPlayerService.php', - content_id, query={'partner': 'cbs', 'contentId': content_id}) - video_data = xpath_element(items_data, './/item') - title = xpath_text(video_data, 'videoTitle', 'title', True) - - subtitles = {} - formats = [] - for item in items_data.findall('.//item'): - pid = xpath_text(item, 'pid') - if not pid: - continue - tp_release_url = self.TP_RELEASE_URL_TEMPLATE % pid - if '.m3u8' in xpath_text(item, 'contentUrl', default=''): - tp_release_url += '&manifest=m3u' - tp_formats, tp_subtitles = self._extract_theplatform_smil( - tp_release_url, content_id, 'Downloading %s SMIL data' % pid) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - self._sort_formats(formats) - - info = self.get_metadata('dJ5BDC/media/guid/2198311517/%s' % content_id, content_id) - info.update({ - 'id': content_id, - 'display_id': display_id, - 'title': title, - 'series': xpath_text(video_data, 'seriesTitle'), - 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), - 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), - 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), - 'thumbnail': xpath_text(video_data, 'previewImageURL'), - 'formats': formats, - 'subtitles': subtitles, - }) - return info + content_id = self._match_id(url) + return self._extract_video_info('byGuid=%s' % content_id, content_id) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 79ddc20a0..387537e76 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -30,9 +30,12 @@ class CBSNewsIE(CBSBaseIE): { 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', 'info_dict': { - 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack', + 'id': 'SNJBOYzXiWBOvaLsdzwH8fmtP1SCd91Y', 'ext': 'mp4', 'title': 'Fort Hood shooting: Army downplays mental illness as 
cause of attack', + 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', + 'upload_date': '19700101', + 'uploader': 'CBSI-NEW', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 205, 'subtitles': { @@ -58,30 +61,8 @@ class CBSNewsIE(CBSBaseIE): webpage, 'video JSON info'), video_id) item = video_info['item'] if 'item' in video_info else video_info - title = item.get('articleTitle') or item.get('hed') - duration = item.get('duration') - thumbnail = item.get('mediaImage') or item.get('thumbnail') - - subtitles = {} - formats = [] - for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']: - pid = item.get('media' + format_id) - if not pid: - continue - release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid - tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } + guid = item['mpxRefId'] + return self._extract_video_info('byGuid=%s' % guid, guid) class CBSNewsLiveVideoIE(InfoExtractor): diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 549ae32f3..78ca44b02 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -1,30 +1,28 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor +from .cbs import CBSBaseIE -class CBSSportsIE(InfoExtractor): - _VALID_URL = r'https?://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)' +class CBSSportsIE(CBSBaseIE): + _VALID_URL = r'https?://www\.cbssports\.com/video/player/[^/]+/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s', + _TESTS = [{ + 'url': 'http://www.cbssports.com/video/player/videos/708337219968/0/ben-simmons-the-next-lebron?-not-so-fast', 'info_dict': { - 'id': '_d5_GbO8p1sT', - 'ext': 'flv', - 'title': 'US Open flashbacks: 1990s', - 'description': 'Bill Macatee relives the best moments in US Open history from the 1990s.', + 'id': '708337219968', + 'ext': 'mp4', + 'title': 'Ben Simmons the next LeBron? 
Not so fast', + 'description': 'md5:854294f627921baba1f4b9a990d87197', + 'timestamp': 1466293740, + 'upload_date': '20160618', + 'uploader': 'CBSI-NEW', }, - } + 'params': { + # m3u8 download + 'skip_download': True, + } + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - section = mobj.group('section') - video_id = mobj.group('id') - all_videos = self._download_json( - 'http://www.cbssports.com/data/video/player/getVideos/%s?as=json' % section, - video_id) - # The json file contains the info of all the videos in the section - video_info = next(v for v in all_videos if v['pcid'] == video_id) - return self.url_result('theplatform:%s' % video_info['pid'], 'ThePlatform') + video_id = self._match_id(url) + return self._extract_video_info('byId=%s' % video_id, video_id) From 819707920a63946ea1e4f0ae2bf842425d22c2e9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 19 Jun 2016 23:55:19 +0100 Subject: [PATCH 040/387] [cbs] fix _VALID_URL --- youtube_dl/extractor/cbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 030eeaa65..21720f084 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -34,7 +34,7 @@ class CBSBaseIE(ThePlatformFeedIE): class CBSIE(CBSBaseIE): - _VALID_URL = r'(?:cbs|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)' + _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', From 1f749b6658439049b952fdb979acb6c4422a358a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 20 Jun 2016 13:29:13 +0200 Subject: [PATCH 041/387] Revert "[jsinterp] Avoid double key lookup for setting new key" This reverts commit 7c05097633138459e9bdf7e10738e021b04689a7. 
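The revert message states no rationale; a plausible reason, offered here only as an assumption, is that dict.setdefault always evaluates its second argument, so the memoized extract_object()/extract_function() calls would run even on a cache hit. A self-contained illustration with placeholder names:

    calls = []

    def expensive(name):
        calls.append(name)          # record every invocation
        return name.upper()

    cache = {'foo': 'FOO'}          # 'foo' is already cached

    # setdefault still builds the default value despite the cache hit
    cache.setdefault('foo', expensive('foo'))

    # the explicit membership test skips the expensive call entirely
    if 'foo' not in cache:
        cache['foo'] = expensive('foo')

    assert calls == ['foo']         # only the setdefault line called expensive()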
--- youtube_dl/jsinterp.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 4a5a0dbc3..a7440c582 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -131,8 +131,9 @@ class JSInterpreter(object): if variable in local_vars: obj = local_vars[variable] else: - obj = self._objects.setdefault( - variable, self.extract_object(variable)) + if variable not in self._objects: + self._objects[variable] = self.extract_object(variable) + obj = self._objects[variable] if arg_str is None: # Member access @@ -203,7 +204,8 @@ class JSInterpreter(object): argvals = tuple([ int(v) if v.isdigit() else local_vars[v] for v in m.group('args').split(',')]) - self._functions.setdefault(fname, self.extract_function(fname)) + if fname not in self._functions: + self._functions[fname] = self.extract_function(fname) return self._functions[fname](argvals) raise ExtractorError('Unsupported JS expression %r' % expr) From 8369a4fe768b1838f640ad984fbc923037b06c3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 20 Jun 2016 21:55:17 +0700 Subject: [PATCH 042/387] [downloader/hls] Simplify and carry long lines --- youtube_dl/downloader/hls.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 1d5f178a0..3b7bb3508 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -50,7 +50,7 @@ class HlsFD(FragmentFD): # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 ) check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] - check_results.append(not (re.search(r'#EXT-X-KEY:METHOD=AES-128', manifest) and not can_decrypt_frag)) + check_results.append(can_decrypt_frag or '#EXT-X-KEY:METHOD=AES-128' not in manifest) return all(check_results) def real_download(self, filename, info_dict): @@ -102,8 +102,9 @@ class HlsFD(FragmentFD): frag_content = down.read() down.close() if decrypt_info['METHOD'] == 'AES-128': - iv = decrypt_info.get('IV') or compat_struct_pack(">8xq", media_sequence) - frag_content = AES.new(decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) + iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) + frag_content = AES.new( + decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) ctx['dest_stream'].write(frag_content) frags_filenames.append(frag_sanitized) # We only download the first fragment during the test @@ -117,7 +118,8 @@ class HlsFD(FragmentFD): if 'IV' in decrypt_info: decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:]) if not re.match(r'^https?://', decrypt_info['URI']): - decrypt_info['URI'] = compat_urlparse.urljoin(man_url, decrypt_info['URI']) + decrypt_info['URI'] = compat_urlparse.urljoin( + man_url, decrypt_info['URI']) decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read() elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): media_sequence = int(line[22:]) From 19e2d1cdeaf36805d72206a6309a6f7421f3c9ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 20 Jun 2016 20:50:01 +0700 Subject: [PATCH 043/387] release 2016.06.20 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 1b25628bf..64ddb891e 100644 --- a/.github/ISSUE_TEMPLATE.md 
+++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.19.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.19.1** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.20*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.20** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.19.1 +[debug] youtube-dl version 2016.06.20 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 7c90940c7..5be8238c0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -44,8 +44,8 @@ - **appletrailers:section** - **archive.org**: archive.org videos - **ARD** - - **ARD:mediathek** - **ARD:mediathek**: Saarländischer Rundfunk + - **ARD:mediathek** - **arte.tv** - **arte.tv:+7** - **arte.tv:cinema** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a3e2c3079..4a9f162c1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.19.1' +__version__ = '2016.06.20' From feef925f49c80fc125ff24f61a144af902a648d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 20 Jun 2016 22:40:22 +0700 Subject: [PATCH 044/387] [streamcloud] Capture error message (#9840) --- youtube_dl/extractor/streamcloud.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py index 58560ec64..6a6bb90c4 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/youtube_dl/extractor/streamcloud.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, - sanitized_Request, urlencode_postdata, ) @@ -45,20 +44,26 @@ class StreamcloudIE(InfoExtractor): (?:id="[^"]+"\s+)? 
value="([^"]*)" ''', orig_webpage) - post = urlencode_postdata(fields) self._sleep(12, video_id) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - } - req = sanitized_Request(url, post, headers) webpage = self._download_webpage( - req, video_id, note='Downloading video page ...') - title = self._html_search_regex( - r'<h1[^>]*>([^<]+)<', webpage, 'title') - video_url = self._search_regex( - r'file:\s*"([^"]+)"', webpage, 'video URL') + url, video_id, data=urlencode_postdata(fields), headers={ + b'Content-Type': b'application/x-www-form-urlencoded', + }) + + try: + title = self._html_search_regex( + r'<h1[^>]*>([^<]+)<', webpage, 'title') + video_url = self._search_regex( + r'file:\s*"([^"]+)"', webpage, 'video URL') + except ExtractorError: + message = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?msgboxinfo.*?\1[^>]*>(?P<message>.+?)</div>', + webpage, 'message', default=None, group='message') + if message: + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + raise thumbnail = self._search_regex( r'image:\s*"([^"]+)"', webpage, 'thumbnail URL', fatal=False) From cdfc187cd5bd163e7e67ca0c02108380cc06c180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 20 Jun 2016 22:40:33 +0700 Subject: [PATCH 045/387] [cbs] Remove unused import --- youtube_dl/extractor/cbs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 21720f084..a23173d6f 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .theplatform import ThePlatformFeedIE from ..utils import ( int_or_none, From e4f90ea0a72711f6577d4cde1dd145f03ab34803 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 21 Jun 2016 17:55:53 +0800 Subject: [PATCH 046/387] [svt] Fix extraction for SVTPlay (closes #9809) --- youtube_dl/extractor/svt.py | 63 ++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 2ab30e45f..6526a6345 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -6,17 +6,14 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + dict_get, ) class SVTBaseIE(InfoExtractor): - def _extract_video(self, url, video_id): - info = self._download_json(url, video_id) + def _extract_video(self, info, video_id): + video_info = self._get_video_info(info) - title = info['context']['title'] - thumbnail = info['context'].get('thumbnailImage') - - video_info = info['video'] formats = [] for vr in video_info['videoReferences']: player_type = vr.get('playerType') @@ -43,22 +40,25 @@ class SVTBaseIE(InfoExtractor): self._sort_formats(formats) subtitles = {} - subtitle_references = video_info.get('subtitleReferences') + subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences')) if isinstance(subtitle_references, list): for sr in subtitle_references: subtitle_url = sr.get('url') + subtitle_lang = sr.get('language', 'sv') if subtitle_url: - subtitles.setdefault('sv', []).append({'url': subtitle_url}) + if determine_ext(subtitle_url) == 'm3u8': + # TODO(yan12125): handle WebVTT in m3u8 manifests + continue + + subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url}) duration = video_info.get('materialLength') age_limit = 18 if video_info.get('inappropriateForChildren') else 0 return 
{ 'id': video_id, - 'title': title, 'formats': formats, 'subtitles': subtitles, - 'thumbnail': thumbnail, 'duration': duration, 'age_limit': age_limit, } @@ -68,11 +68,11 @@ class SVTIE(SVTBaseIE): _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)' _TEST = { 'url': 'http://www.svt.se/wd?widgetId=23991§ionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false', - 'md5': '9648197555fc1b49e3dc22db4af51d46', + 'md5': '33e9a5d8f646523ce0868ecfb0eed77d', 'info_dict': { 'id': '2900353', - 'ext': 'flv', - 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)', + 'ext': 'mp4', + 'title': 'Stjärnorna skojar till det - under SVT-intervjun', 'duration': 27, 'age_limit': 0, }, @@ -85,18 +85,26 @@ class SVTIE(SVTBaseIE): if mobj: return mobj.group('url') + def _get_video_info(self, info): + return info['video'] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) widget_id = mobj.group('widget_id') article_id = mobj.group('id') - return self._extract_video( + + info = self._download_json( 'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id), article_id) + info_dict = self._extract_video(info, article_id) + info_dict['title'] = info['context']['title'] + return info_dict + class SVTPlayIE(SVTBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -115,10 +123,23 @@ class SVTPlayIE(SVTBaseIE): }, } + def _get_video_info(self, info): + return info['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'] + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - host = mobj.group('host') - return self._extract_video( - 'http://www.%s.se/video/%s?output=json' % (host, video_id), - video_id) + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + data = self._parse_json(self._search_regex( + r'root\["__svtplay"\]\s*=\s*([^;]+);', webpage, 'embedded data'), video_id) + + thumbnail = self._og_search_thumbnail(webpage) + + info_dict = self._extract_video(data, video_id) + info_dict.update({ + 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], + 'thumbnail': thumbnail, + }) + + return info_dict From 1ac5705f62aa3f6fdb6f2a97fbd24594010b7598 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 21 Jun 2016 13:36:56 +0100 Subject: [PATCH 047/387] [gamespot] extract all formats --- youtube_dl/extractor/gamespot.py | 91 ++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 4ffdd7515..621257c9f 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -1,19 +1,19 @@ from __future__ import unicode_literals import re -import json -from .common import InfoExtractor +from .once import OnceIE from ..compat import ( compat_urllib_parse_unquote, - compat_urlparse, ) from ..utils import ( unescapeHTML, + url_basename, + dict_get, ) -class GameSpotIE(InfoExtractor): +class GameSpotIE(OnceIE): _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' 
_TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', @@ -39,29 +39,73 @@ class GameSpotIE(InfoExtractor): webpage = self._download_webpage(url, page_id) data_video_json = self._search_regex( r'data-video=["\'](.*?)["\']', webpage, 'data video') - data_video = json.loads(unescapeHTML(data_video_json)) + data_video = self._parse_json(unescapeHTML(data_video_json), page_id) streams = data_video['videoStreams'] + manifest_url = None formats = [] f4m_url = streams.get('f4m_stream') - if f4m_url is not None: - # Transform the manifest url to a link to the mp4 files - # they are used in mobile devices. - f4m_path = compat_urlparse.urlparse(f4m_url).path - QUALITIES_RE = r'((,\d+)+,?)' - qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',') - http_path = f4m_path[1:].split('/', 1)[1] - http_template = re.sub(QUALITIES_RE, r'%s', http_path) - http_template = http_template.replace('.csmil/manifest.f4m', '') - http_template = compat_urlparse.urljoin( - 'http://video.gamespotcdn.com/', http_template) - for q in qualities: - formats.append({ - 'url': http_template % q, - 'ext': 'mp4', - 'format_id': q, - }) - else: + if f4m_url: + manifest_url = f4m_url + formats.extend(self._extract_f4m_formats( + f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False)) + m3u8_url = streams.get('m3u8_stream') + if m3u8_url: + manifest_url = m3u8_url + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, page_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + progressive_url = dict_get( + streams, ('progressive_hd', 'progressive_high', 'progressive_low')) + if progressive_url and manifest_url: + qualities_basename = self._search_regex( + '/([^/]+)\.csmil/', + manifest_url, 'qualities basename', default=None) + if qualities_basename: + QUALITIES_RE = r'((,\d+)+,?)' + qualities = self._search_regex( + QUALITIES_RE, qualities_basename, + 'qualities', default=None) + if qualities: + qualities = list(map(lambda q: int(q), qualities.strip(',').split(','))) + qualities.sort() + http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename) + http_url_basename = url_basename(progressive_url) + if m3u8_formats: + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) + if len(qualities) == len(m3u8_formats): + for q, m3u8_format in zip(qualities, m3u8_formats): + f = m3u8_format.copy() + f.update({ + 'url': progressive_url.replace( + http_url_basename, http_template % q), + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: + for q in qualities: + formats.append({ + 'url': progressive_url.replace( + http_url_basename, http_template % q), + 'ext': 'mp4', + 'format_id': 'http-%d' % q, + 'tbr': q, + }) + + onceux_json = self._search_regex( + r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None) + if onceux_json: + onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri') + if onceux_url: + formats.extend(self._extract_once_formats(re.sub( + r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url).replace('ads/vmap/', ''))) + + if not formats: for quality in ['sd', 'hd']: # It's actually a link to a flv file flv_url = streams.get('f4m_{0}'.format(quality)) @@ -71,6 +115,7 @@ class GameSpotIE(InfoExtractor): 'ext': 'flv', 'format_id': quality, }) + self._sort_formats(formats) return { 'id': 
data_video['guid'], From 7cfc1e2a104977c41f6008885b36b96bcb2b146e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 21 Jun 2016 22:31:41 +0700 Subject: [PATCH 048/387] [gametrailers] Remove extractor gametrailers closed (see http://www.polygon.com/2016/2/8/10944452/gametrailers-shuts-down-after-13-year-run) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/gametrailers.py | 62 ---------------------------- 2 files changed, 63 deletions(-) delete mode 100644 youtube_dl/extractor/gametrailers.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b1b04f2fc..4e2a2f2e9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -285,7 +285,6 @@ from .gameone import ( from .gamersyde import GamersydeIE from .gamespot import GameSpotIE from .gamestar import GameStarIE -from .gametrailers import GametrailersIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE from .generic import GenericIE diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py deleted file mode 100644 index 1e7948ab8..000000000 --- a/youtube_dl/extractor/gametrailers.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_age_limit, - url_basename, -) - - -class GametrailersIE(InfoExtractor): - _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' - - _TEST = { - 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', - 'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a', - 'info_dict': { - 'id': '2983958', - 'ext': 'mp4', - 'display_id': '116437-Just-Cause-3-Review', - 'title': 'Just Cause 3 - Review', - 'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?', - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - title = self._html_search_regex( - r'<title>(.+?)\|', webpage, 'title').strip() - embed_url = self._proto_relative_url( - self._search_regex( - r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage, - 'embed url'), - scheme='http:') - video_id = url_basename(embed_url) - embed_page = self._download_webpage(embed_url, video_id) - embed_vars_json = self._search_regex( - r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page, - 'embed vars') - info = self._parse_json(embed_vars_json, video_id) - - formats = [] - for media in info['media']: - if media['mediaPurpose'] == 'play': - formats.append({ - 'url': media['uri'], - 'height': media['height'], - 'width:': media['width'], - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': info.get('thumbUri'), - 'description': self._og_search_description(webpage), - 'duration': int_or_none(info.get('videoLengthInSeconds')), - 'age_limit': parse_age_limit(info.get('audienceRating')), - } From ca74c90bf55cfb16f5eadc3a63e9389202ee80f5 Mon Sep 17 00:00:00 2001 From: Shai Coleman <shai.coleman@storyful.com> Date: Wed, 22 Jun 2016 12:52:15 +0100 Subject: [PATCH 049/387] Fix issue downloading facebook videos youtube-dl expects the format items to be returned as a list, but when there's only one item Facebook returns a dict instead, this wraps the dict in a list if necessary --- 
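For reference, the normalization can be illustrated in isolation. The sample
data below is invented for the sketch (it is not Facebook's real response
layout); only the dict-vs-list handling mirrors the change:

    # hypothetical single-item payload: a dict where a list is expected
    video_data = {'progressive': {'src': 'http://example.com/video_sd.mp4'}}

    for format_id, f in video_data.items():
        if f and isinstance(f, dict):
            # wrap the lone dict so it goes through the same list code path
            f = [f]
        if not f or not isinstance(f, list):
            continue
        for item in f:
            print(format_id, item.get('src'))
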
youtube_dl/extractor/facebook.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f5bbd39d2..9b87b37ae 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -239,6 +239,8 @@ class FacebookIE(InfoExtractor): formats = [] for format_id, f in video_data.items(): + if f and isinstance(f, dict): + f = [f] if not f or not isinstance(f, list): continue for quality in ('sd', 'hd'): From 23bdae0955ae5e0adaf6212bb7aa6cec77ae4d1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Jun 2016 23:36:07 +0700 Subject: [PATCH 050/387] [svt] Various improvements + [svt:play] Add fallback path looking for video id and fix extraction for oppetarkiv * [svt:base] Detect geo restriction * [svt:base] Extract series related metadata --- youtube_dl/extractor/svt.py | 82 +++++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 6526a6345..67f56fab8 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -7,13 +7,13 @@ from .common import InfoExtractor from ..utils import ( determine_ext, dict_get, + int_or_none, + try_get, ) class SVTBaseIE(InfoExtractor): - def _extract_video(self, info, video_id): - video_info = self._get_video_info(info) - + def _extract_video(self, video_info, video_id): formats = [] for vr in video_info['videoReferences']: player_type = vr.get('playerType') @@ -37,6 +37,8 @@ class SVTBaseIE(InfoExtractor): 'format_id': player_type, 'url': vurl, }) + if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): + self.raise_geo_restricted('This video is only available in Sweden') self._sort_formats(formats) subtitles = {} @@ -52,15 +54,32 @@ class SVTBaseIE(InfoExtractor): subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url}) - duration = video_info.get('materialLength') - age_limit = 18 if video_info.get('inappropriateForChildren') else 0 + title = video_info.get('title') + + series = video_info.get('programTitle') + season_number = int_or_none(video_info.get('season')) + episode = video_info.get('episodeTitle') + episode_number = int_or_none(video_info.get('episodeNumber')) + + duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) + age_limit = None + adult = dict_get( + video_info, ('inappropriateForChildren', 'blockedForChildren'), + skip_false_values=False) + if adult is not None: + age_limit = 18 if adult else 0 return { 'id': video_id, + 'title': title, 'formats': formats, 'subtitles': subtitles, 'duration': duration, 'age_limit': age_limit, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, } @@ -85,9 +104,6 @@ class SVTIE(SVTBaseIE): if mobj: return mobj.group('url') - def _get_video_info(self, info): - return info['video'] - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) widget_id = mobj.group('widget_id') @@ -97,7 +113,7 @@ class SVTIE(SVTBaseIE): 'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id), article_id) - info_dict = self._extract_video(info, article_id) + info_dict = self._extract_video(info['video'], article_id) info_dict['title'] = info['context']['title'] return info_dict @@ -105,7 +121,7 @@ class SVTIE(SVTBaseIE): class SVTPlayIE(SVTBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = 
r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', 'info_dict': { @@ -121,25 +137,47 @@ class SVTPlayIE(SVTBaseIE): }] }, }, - } - - def _get_video_info(self, info): - return info['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'] + }, { + # geo restricted to Sweden + 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( - r'root\["__svtplay"\]\s*=\s*([^;]+);', webpage, 'embedded data'), video_id) + data = self._parse_json( + self._search_regex( + r'root\["__svtplay"\]\s*=\s*([^;]+);', + webpage, 'embedded data', default='{}'), + video_id, fatal=False) thumbnail = self._og_search_thumbnail(webpage) - info_dict = self._extract_video(data, video_id) - info_dict.update({ - 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], - 'thumbnail': thumbnail, - }) + if data: + video_info = try_get( + data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], + dict) + if video_info: + info_dict = self._extract_video(video_info, video_id) + info_dict.update({ + 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], + 'thumbnail': thumbnail, + }) + return info_dict - return info_dict + video_id = self._search_regex( + r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', + webpage, 'video id', default=None) + + if video_id: + data = self._download_json( + 'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id) + info_dict = self._extract_video(data, video_id) + if not info_dict.get('title'): + info_dict['title'] = re.sub( + r'\s*\|\s*.+?$', '', + info_dict.get('episode') or self._og_search_title(webpage)) + return info_dict From cf40fdf5c1da33180b5a1b333784c529bc504b6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Jun 2016 23:43:24 +0700 Subject: [PATCH 051/387] release 2016.06.22 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 1 - youtube_dl/version.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 64ddb891e..e17625f21 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.20*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.20** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.22*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.22** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.20 +[debug] youtube-dl version 2016.06.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 5be8238c0..96cc407db 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -248,7 +248,6 @@ - **Gamersyde** - **GameSpot** - **GameStar** - - **Gametrailers** - **Gazeta** - **GDCVault** - **generic**: Generic downloader that works on some sites diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4a9f162c1..d2152b2f1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.20' +__version__ = '2016.06.22' From 6ae938b295eaca06944f66faa7c6d668c6c5866c Mon Sep 17 00:00:00 2001 From: TRox1972 <TRox1972@users.noreply.github.com> Date: Tue, 21 Jun 2016 11:36:54 +0200 Subject: [PATCH 052/387] [Vine] Extract view count --- youtube_dl/extractor/vine.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index a6a6cc479..5b801849c 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -24,6 +24,7 @@ class VineIE(InfoExtractor): 'upload_date': '20130519', 'uploader': 'Jack Dorsey', 'uploader_id': '76', + 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, @@ -39,6 +40,7 @@ class VineIE(InfoExtractor): 'upload_date': '20140815', 'uploader': 'Mars Ruiz', 'uploader_id': '1102363502380728320', + 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, @@ -54,6 +56,7 @@ class VineIE(InfoExtractor): 'upload_date': '20130430', 'uploader': 'Z3k3', 'uploader_id': '936470460173008896', + 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, @@ -71,6 +74,7 @@ class VineIE(InfoExtractor): 'upload_date': '20150705', 'uploader': 'Pimry_zaa', 'uploader_id': '1135760698325307392', + 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, @@ -109,6 +113,7 @@ class VineIE(InfoExtractor): 'upload_date': unified_strdate(data.get('created')), 'uploader': username, 'uploader_id': data.get('userIdStr'), + 'view_count': int_or_none(data.get('loops', {}).get('count')), 'like_count': int_or_none(data.get('likes', {}).get('count')), 'comment_count': int_or_none(data.get('comments', {}).get('count')), 'repost_count': int_or_none(data.get('reposts', {}).get('count')), From 169d836feb9d796205a02713db33eafcbb49f1e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 22 Jun 2016 19:13:46 +0200 Subject: [PATCH 053/387] lazy-extractors: Fix after commit 6e6b9f600f2f447604f6108fb6486b73cc25def1 The problem was in the following code: class ArteTVPlus7IE(ArteTVBaseIE): 
... @classmethod def suitable(cls, url): return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url) And its sublcasses like ArteTVCinemaIE. Since in the lazy_extractors.py file ArteTVCinemaIE was not a subclass of ArteTVPlus7IE, super(ArteTVPlus7IE, cls) failed. To fix it we have to make it a subclass. Since the order of _ALL_CLASSES is arbitrary we must sort them so that the base classes are defined first. We also must add base classes like YoutubeBaseInfoExtractor. --- devscripts/make_lazy_extractors.py | 47 ++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index b5a8b9190..9a79c2bc5 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -14,15 +14,17 @@ if os.path.exists(lazy_extractors_filename): os.remove(lazy_extractors_filename) from youtube_dl.extractor import _ALL_CLASSES -from youtube_dl.extractor.common import InfoExtractor +from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor with open('devscripts/lazy_load_template.py', 'rt') as f: module_template = f.read() -module_contents = [module_template + '\n' + getsource(InfoExtractor.suitable)] +module_contents = [ + module_template + '\n' + getsource(InfoExtractor.suitable) + '\n', + 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n'] ie_template = ''' -class {name}(LazyLoadExtractor): +class {name}({bases}): _VALID_URL = {valid_url!r} _module = '{module}' ''' @@ -34,10 +36,20 @@ make_valid_template = ''' ''' +def get_base_name(base): + if base is InfoExtractor: + return 'LazyLoadExtractor' + elif base is SearchInfoExtractor: + return 'LazyLoadSearchExtractor' + else: + return base.__name__ + + def build_lazy_ie(ie, name): valid_url = getattr(ie, '_VALID_URL', None) s = ie_template.format( name=name, + bases=', '.join(map(get_base_name, ie.__bases__)), valid_url=valid_url, module=ie.__module__) if ie.suitable.__func__ is not InfoExtractor.suitable.__func__: @@ -47,12 +59,35 @@ def build_lazy_ie(ie, name): s += make_valid_template.format(valid_url=ie._make_valid_url()) return s +# find the correct sorting and add the required base classes so that sublcasses +# can be correctly created +classes = _ALL_CLASSES[:-1] +ordered_cls = [] +while classes: + for c in classes[:]: + bases = set(c.__bases__) - set((object, InfoExtractor, SearchInfoExtractor)) + stop = False + for b in bases: + if b not in classes and b not in ordered_cls: + if b.__name__ == 'GenericIE': + exit() + classes.insert(0, b) + stop = True + if stop: + break + if all(b in ordered_cls for b in bases): + ordered_cls.append(c) + classes.remove(c) + break +ordered_cls.append(_ALL_CLASSES[-1]) + names = [] -for ie in list(sorted(_ALL_CLASSES[:-1], key=lambda cls: cls.ie_key())) + _ALL_CLASSES[-1:]: - name = ie.ie_key() + 'IE' +for ie in ordered_cls: + name = ie.__name__ src = build_lazy_ie(ie, name) module_contents.append(src) - names.append(name) + if ie in _ALL_CLASSES: + names.append(name) module_contents.append( '_ALL_CLASSES = [{0}]'.format(', '.join(names))) From c143ddce5d1e24697f891292ea865e6ed499f162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Jun 2016 00:51:36 +0700 Subject: [PATCH 054/387] [vimeo] Override original URL only when necessary --- youtube_dl/extractor/vimeo.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py 
b/youtube_dl/extractor/vimeo.py index c52986af6..1f163d6a4 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -227,8 +227,6 @@ class VimeoIE(VimeoBaseInfoExtractor): { 'url': 'http://vimeo.com/channels/keypeele/75629013', 'md5': '2f86a05afe9d7abc0b9126d229bbe15d', - 'note': 'Video is freely available via original URL ' - 'and protected with password when accessed via http://vimeo.com/75629013', 'info_dict': { 'id': '75629013', 'ext': 'mp4', @@ -272,7 +270,7 @@ class VimeoIE(VimeoBaseInfoExtractor): { # contains original format 'url': 'https://vimeo.com/33951933', - 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + 'md5': '2d9f5475e0537f013d0073e812ab89e6', 'info_dict': { 'id': '33951933', 'ext': 'mp4', @@ -284,6 +282,29 @@ class VimeoIE(VimeoBaseInfoExtractor): 'description': 'md5:ae23671e82d05415868f7ad1aec21147', }, }, + { + # only available via https://vimeo.com/channels/tributes/6213729 and + # not via https://vimeo.com/6213729 + 'url': 'https://vimeo.com/channels/tributes/6213729', + 'info_dict': { + 'id': '6213729', + 'ext': 'mp4', + 'title': 'Vimeo Tribute: The Shining', + 'uploader': 'Casey Donahue', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/caseydonahue', + 'uploader_id': 'caseydonahue', + 'upload_date': '20090821', + 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, + { + 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', + 'only_matching': True, + }, { 'url': 'https://vimeo.com/109815029', 'note': 'Video not completely processed, "failed" seed status', @@ -369,7 +390,7 @@ class VimeoIE(VimeoBaseInfoExtractor): orig_url = url if mobj.group('pro') or mobj.group('player'): url = 'https://player.vimeo.com/video/' + video_id - else: + elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information From dfc8f46e1c0e47a3b080d2e38d7d6da279f18fd2 Mon Sep 17 00:00:00 2001 From: Purdea Andrei <andrei@purdea.ro> Date: Wed, 22 Jun 2016 00:34:57 +0300 Subject: [PATCH 055/387] [vimeo:channel] Add video id to url_result This will allow us to decide much faster that we don't want an already archived video, and will allow having to download webpages for each video that has already been downloaded, thus significantly speeding up the archival of channels that have no new content. 
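Roughly speaking, url_result() just returns a dict describing the entry, and
carrying the id lets the download-archive check match the entry before any
page for it is fetched. A sketch of the two shapes (the exact set of keys
url_result() fills in may differ; treat this as an approximation):

    # before: only the URL is known, so the video page must be fetched and
    # extracted before the archive can be consulted
    {'_type': 'url', 'url': 'https://vimeo.com/6213729', 'ie_key': 'Vimeo'}

    # after: the id travels with the entry, so already-archived videos can
    # be skipped without downloading their pages
    {'_type': 'url', 'url': 'https://vimeo.com/6213729', 'ie_key': 'Vimeo',
     'id': '6213729'}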
--- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 1f163d6a4..32490a8ed 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -652,7 +652,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): yield self._extract_list_title(webpage) for video_id in re.findall(r'id="clip_(\d+?)"', webpage): - yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo') + yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo', video_id=video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break From c8e3e0974b4ffb6792694336664f90eff38fc762 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Jun 2016 01:28:36 +0700 Subject: [PATCH 056/387] [vimeo:channel] Improve playlist extraction --- youtube_dl/extractor/vimeo.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 32490a8ed..26a3d9931 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -146,7 +146,7 @@ class VimeoIE(VimeoBaseInfoExtractor): \. )? vimeo(?P<pro>pro)?\.com/ - (?!channels/[^/?#]+/?(?:$|[?#])|[^/]+/review/|(?:album|ondemand)/) + (?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) (?:.*?/)? (?: (?: @@ -314,6 +314,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/groups/travelhd/videos/22439234', 'only_matching': True, }, + { + 'url': 'https://vimeo.com/album/2632481/video/79010983', + 'only_matching': True, + }, { # source file returns 403: Forbidden 'url': 'https://vimeo.com/7809605', @@ -651,8 +655,21 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): webpage = self._login_list_password(page_url, list_id, webpage) yield self._extract_list_title(webpage) - for video_id in re.findall(r'id="clip_(\d+?)"', webpage): - yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo', video_id=video_id) + # Try extracting href first since not all videos are available via + # short https://vimeo.com/id URL (e.g. 
https://vimeo.com/channels/tributes/6213729) + clips = re.findall( + r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)', webpage) + if clips: + for video_id, video_url in clips: + yield self.url_result( + compat_urlparse.urljoin(base_url, video_url), + VimeoIE.ie_key(), video_id=video_id) + # More relaxed fallback + else: + for video_id in re.findall(r'id=["\']clip_(\d+)', webpage): + yield self.url_result( + 'https://vimeo.com/%s' % video_id, + VimeoIE.ie_key(), video_id=video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break @@ -689,7 +706,7 @@ class VimeoUserIE(VimeoChannelIE): class VimeoAlbumIE(VimeoChannelIE): IE_NAME = 'vimeo:album' - _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)' + _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)/?(?:$|[?#])' _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>' _TESTS = [{ 'url': 'https://vimeo.com/album/2632481', From b5eab86c2424ec04d17fac5de9d15574320ea8f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Jun 2016 01:56:58 +0700 Subject: [PATCH 057/387] [vimeo:album] Impove _VALID_URL --- youtube_dl/extractor/vimeo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 26a3d9931..8ba3f55f4 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -706,7 +706,7 @@ class VimeoUserIE(VimeoChannelIE): class VimeoAlbumIE(VimeoChannelIE): IE_NAME = 'vimeo:album' - _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)/?(?:$|[?#])' + _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)(?:$|[?#]|/(?!video))' _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>' _TESTS = [{ 'url': 'https://vimeo.com/album/2632481', @@ -726,6 +726,9 @@ class VimeoAlbumIE(VimeoChannelIE): 'params': { 'videopassword': 'youtube-dl', } + }, { + 'url': 'https://vimeo.com/album/2632481/sort:plays/format:thumbnail', + 'only_matching': True, }] def _page_url(self, base_url, pagenum): From 089657ed1f6edcdb10a958a8cd7d91b4888e41eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Jun 2016 02:00:03 +0700 Subject: [PATCH 058/387] [vimeo:album] Add paged example URL --- youtube_dl/extractor/vimeo.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8ba3f55f4..4bdeb1187 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -729,6 +729,10 @@ class VimeoAlbumIE(VimeoChannelIE): }, { 'url': 'https://vimeo.com/album/2632481/sort:plays/format:thumbnail', 'only_matching': True, + }, { + # TODO: respect page number + 'url': 'https://vimeo.com/album/2632481/page:2/sort:plays/format:thumbnail', + 'only_matching': True, }] def _page_url(self, base_url, pagenum): From 75ca6bcee2466cb9ca3dc4d1ca35b56a59f6cc4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Jun 2016 04:17:13 +0700 Subject: [PATCH 059/387] [vk] Workaround buggy new.vk.com Set-Cookie headers --- youtube_dl/extractor/vk.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 79c819bc3..4e8ec0f86 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re import json +import sys from .common import InfoExtractor from ..compat import compat_str @@ -190,7 +191,7 @@ class 
VKIE(InfoExtractor): if username is None: return - login_page = self._download_webpage( + login_page, url_handle = self._download_webpage_handle( 'https://vk.com', None, 'Downloading login page') login_form = self._hidden_inputs(login_page) @@ -200,6 +201,22 @@ class VKIE(InfoExtractor): 'pass': password.encode('cp1251'), }) + # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header + # and expects the first one to be set rather than second (see + # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201). + # As of RFC6265 the newer one cookie should be set into cookie store + # what actually happens. + # We will workaround this VK issue by resetting the remixlhk cookie to + # the first one manually. + cookies = url_handle.headers.get('Set-Cookie') + if sys.version_info[0] >= 3: + cookies = cookies.encode('iso-8859-1') + cookies = cookies.decode('utf-8') + remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies) + if remixlhk: + value, domain = remixlhk.groups() + self._set_cookie(domain, 'remixlhk', value) + request = sanitized_Request( 'https://login.vk.com/?act=login', urlencode_postdata(login_form)) From f2bb8c036a0b1feab726321bd877544cb973d7cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Jun 2016 04:18:43 +0700 Subject: [PATCH 060/387] [vk] Modernize --- youtube_dl/extractor/vk.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 4e8ec0f86..f8d07beaf 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -217,11 +217,10 @@ class VKIE(InfoExtractor): value, domain = remixlhk.groups() self._set_cookie(domain, 'remixlhk', value) - request = sanitized_Request( - 'https://login.vk.com/?act=login', - urlencode_postdata(login_form)) login_page = self._download_webpage( - request, None, note='Logging in as %s' % username) + 'https://login.vk.com/?act=login', None, + note='Logging in as %s' % username, + data=urlencode_postdata(login_form)) if re.search(r'onLoginFailed', login_page): raise ExtractorError( From 73843ae8acb378e986ab4e7bb3a525b2f6b53cf2 Mon Sep 17 00:00:00 2001 From: rr- <rr-@sakuya.pl> Date: Wed, 22 Jun 2016 17:24:35 +0200 Subject: [PATCH 061/387] [xnxx] fix url regex The pattern has changed from "video123412" to "video-o8xa19". The changes maintain backwards compatibility with old-style URLs. 
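The relaxed pattern can be sanity-checked on its own against the two URL
styles used in the tests below (standalone sketch, standard library only):

    import re

    PATTERN = r'^https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/(.*)'

    for url in (
            'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
            'http://www.xnxx.com/video-6gqggeb/hd_star-581_sam'):
        # old-style numeric id and new-style alphanumeric id both match
        print(re.match(PATTERN, url).group('id'))  # 1135332, then 6gqggeb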
--- youtube_dl/extractor/xnxx.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index 5a41f8ffa..f0d4cb9bc 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -6,17 +6,20 @@ from ..compat import compat_urllib_parse_unquote class XNXXIE(InfoExtractor): - _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P<id>[0-9]+)/(.*)' - _TEST = { - 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', - 'md5': '0831677e2b4761795f68d417e0b7b445', + _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/(.*)' + _TESTS = [{ + 'url': 'http://www.xnxx.com/video-6gqggeb/hd_star-581_sam', + 'md5': '6a2a6aff3f10467d94e572edb7b7deb6', 'info_dict': { - 'id': '1135332', + 'id': '6gqggeb', 'ext': 'flv', - 'title': 'lida » Naked Funny Actress (5)', + 'title': 'HD STAR-581 sam', 'age_limit': 18, - } - } + }, + }, { + 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From 97674f041916860343d804b8b07b73017e1a517f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Jun 2016 04:24:00 +0700 Subject: [PATCH 062/387] [xnxx] Replace test --- youtube_dl/extractor/xnxx.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index f0d4cb9bc..1e677a63b 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -8,12 +8,12 @@ from ..compat import compat_urllib_parse_unquote class XNXXIE(InfoExtractor): _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/(.*)' _TESTS = [{ - 'url': 'http://www.xnxx.com/video-6gqggeb/hd_star-581_sam', - 'md5': '6a2a6aff3f10467d94e572edb7b7deb6', + 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video', + 'md5': 'ef7ecee5af78f8b03dca2cf31341d3a0', 'info_dict': { - 'id': '6gqggeb', + 'id': '55awb78', 'ext': 'flv', - 'title': 'HD STAR-581 sam', + 'title': 'Skyrim Test Video', 'age_limit': 18, }, }, { From adf1921dc157af23e8b317d6095b88c87a149e2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Jun 2016 04:26:49 +0700 Subject: [PATCH 063/387] [xnxx] Improve _VALID_URL (Closes #9858) --- youtube_dl/extractor/xnxx.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index 1e677a63b..bcb140305 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -6,7 +6,7 @@ from ..compat import compat_urllib_parse_unquote class XNXXIE(InfoExtractor): - _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/(.*)' + _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/' _TESTS = [{ 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video', 'md5': 'ef7ecee5af78f8b03dca2cf31341d3a0', @@ -19,6 +19,9 @@ class XNXXIE(InfoExtractor): }, { 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', 'only_matching': True, + }, { + 'url': 'http://www.xnxx.com/video-55awb78/', + 'only_matching': True, }] def _real_extract(self, url): From 3331a4644d141ba9163ecf08015ccb9e0a5b87f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Jun 2016 04:27:10 +0700 Subject: [PATCH 064/387] [vk] Remove unused import --- 
youtube_dl/extractor/vk.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index f8d07beaf..cfc5ffd8b 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -11,7 +11,6 @@ from ..utils import ( ExtractorError, int_or_none, orderedSet, - sanitized_Request, str_to_int, unescapeHTML, unified_strdate, From 96f88e91b7bac15b3a6f1eafb6a66964d2d11a7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Jun 2016 04:29:34 +0700 Subject: [PATCH 065/387] release 2016.06.23 --- .github/ISSUE_TEMPLATE.md | 6 +++--- youtube_dl/version.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index e17625f21..f7d1020d3 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.22*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.22** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.23*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.23** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.22 +[debug] youtube-dl version 2016.06.23 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d2152b2f1..bff747906 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.22' +__version__ = '2016.06.23' From 22b7ac13ef4e34654bcb3fb3dbb40d2fac9b4278 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 23 Jun 2016 00:13:52 +0100 Subject: [PATCH 066/387] [tf1] fix wat id extraction(closes #9862) --- youtube_dl/extractor/tf1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 6c848dc6f..e595c4a69 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -48,6 +48,6 @@ class TF1IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) wat_id = self._html_search_regex( - r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8}).*?\1', + r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', webpage, 'wat id', group='id') return self.url_result('wat:%s' % wat_id, 'Wat') From 
0437307a41dc97becc807abc9a86070ac9c847d8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 23 Jun 2016 01:36:19 +0100 Subject: [PATCH 067/387] [nbc:nbcnews] improve extraction and add msnbc to the extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/nbc.py | 157 +++++++++-------------------- 2 files changed, 50 insertions(+), 108 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4e2a2f2e9..6fc5a18f5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -480,7 +480,6 @@ from .nbc import ( NBCNewsIE, NBCSportsIE, NBCSportsVPlayerIE, - MSNBCIE, ) from .ndr import ( NDRIE, diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 6b7da1149..f694e210b 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,10 +9,6 @@ from ..utils import ( lowercase_escape, smuggle_url, unescapeHTML, - update_url_query, - int_or_none, - HEADRequest, - parse_iso8601, ) @@ -192,9 +188,9 @@ class CSNNEIE(InfoExtractor): class NBCNewsIE(ThePlatformIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today)\.com/ + _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/ (?:video/.+?/(?P<id>\d+)| - ([^/]+/)*(?P<display_id>[^/?]+)) + ([^/]+/)*(?:.*-)?(?P<mpx_id>[^/?]+)) ''' _TESTS = [ @@ -216,13 +212,16 @@ class NBCNewsIE(ThePlatformIE): 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', + 'uploader': 'NBCU-NEWS', + 'timestamp': 1401363060, + 'upload_date': '20140529', }, }, { 'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156', 'md5': 'fdbf39ab73a72df5896b6234ff98518a', 'info_dict': { - 'id': 'Wjf9EDR3A_60', + 'id': '529953347624', 'ext': 'mp4', 'title': 'FULL EPISODE: Family Business', 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', @@ -237,6 +236,9 @@ class NBCNewsIE(ThePlatformIE): 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', + 'timestamp': 1423104900, + 'uploader': 'NBCU-NEWS', + 'upload_date': '20150205', }, }, { @@ -245,10 +247,12 @@ class NBCNewsIE(ThePlatformIE): 'info_dict': { 'id': '529953347624', 'ext': 'mp4', - 'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'', - 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', + 'title': 'Volkswagen U.S. 
Chief:\xa0 We Have Totally Screwed Up', + 'description': 'md5:c8be487b2d80ff0594c005add88d8351', + 'upload_date': '20150922', + 'timestamp': 1442917800, + 'uploader': 'NBCU-NEWS', }, - 'expected_warnings': ['http-6000 is not available'] }, { 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', @@ -260,6 +264,22 @@ class NBCNewsIE(ThePlatformIE): 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', 'upload_date': '20160420', 'timestamp': 1461152093, + 'uploader': 'NBCU-NEWS', + }, + }, + { + 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', + 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', + 'info_dict': { + 'id': '314487875924', + 'ext': 'mp4', + 'title': 'The chaotic GOP immigration vote', + 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1406937606, + 'upload_date': '20140802', + 'uploader': 'NBCU-NEWS', + 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], }, }, { @@ -290,105 +310,28 @@ class NBCNewsIE(ThePlatformIE): } else: # "feature" and "nightly-news" pages use theplatform.com - display_id = mobj.group('display_id') - webpage = self._download_webpage(url, display_id) - info = None - bootstrap_json = self._search_regex( - [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', - r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'], - webpage, 'bootstrap json', default=None) - bootstrap = self._parse_json( - bootstrap_json, display_id, transform_source=unescapeHTML) - if 'results' in bootstrap: - info = bootstrap['results'][0]['video'] - elif 'video' in bootstrap: - info = bootstrap['video'] - else: - info = bootstrap - video_id = info['mpxId'] - title = info['title'] - - subtitles = {} - caption_links = info.get('captionLinks') - if caption_links: - for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')): - sub_url = caption_links.get(sub_key) - if sub_url: - subtitles.setdefault('en', []).append({ - 'url': sub_url, - 'ext': sub_ext, - }) - - formats = [] - for video_asset in info['videoAssets']: - video_url = video_asset.get('publicUrl') - if not video_url: - continue - container = video_asset.get('format') - asset_type = video_asset.get('assetType') or '' - if container == 'ISM' or asset_type == 'FireTV-Once': - continue - elif asset_type == 'OnceURL': - tp_formats, tp_subtitles = self._extract_theplatform_smil( - video_url, video_id) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) + video_id = mobj.group('mpx_id') + if not video_id.isdigit(): + webpage = self._download_webpage(url, video_id) + info = None + bootstrap_json = self._search_regex( + [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', + r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'], + webpage, 'bootstrap json', default=None) + bootstrap = self._parse_json( + bootstrap_json, video_id, transform_source=unescapeHTML) + if 'results' in bootstrap: + info = bootstrap['results'][0]['video'] + elif 'video' in bootstrap: + info = bootstrap['video'] else: - tbr = int_or_none(video_asset.get('bitRate') or video_asset.get('bitrate'), 1000) - format_id = 'http%s' % ('-%d' % tbr if tbr else '') - video_url = update_url_query( - video_url, {'format': 'redirect'}) - # 
resolve the url so that we can check availability and detect the correct extension - head = self._request_webpage( - HEADRequest(video_url), video_id, - 'Checking %s url' % format_id, - '%s is not available' % format_id, - fatal=False) - if head: - video_url = head.geturl() - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'width': int_or_none(video_asset.get('width')), - 'height': int_or_none(video_asset.get('height')), - 'tbr': tbr, - 'container': video_asset.get('format'), - }) - self._sort_formats(formats) + info = bootstrap + video_id = info['mpxId'] return { + '_type': 'url_transparent', 'id': video_id, - 'title': title, - 'description': info.get('description'), - 'thumbnail': info.get('thumbnail'), - 'duration': int_or_none(info.get('duration')), - 'timestamp': parse_iso8601(info.get('pubDate') or info.get('pub_date')), - 'formats': formats, - 'subtitles': subtitles, + # http://feed.theplatform.com/f/2E2eJC/nbcnews also works + 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % video_id, + 'ie_key': 'ThePlatformFeed', } - - -class MSNBCIE(InfoExtractor): - # https URLs redirect to corresponding http ones - _VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)' - _TEST = { - 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', - 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', - 'info_dict': { - 'id': 'n_hayes_Aimm_140801_272214', - 'ext': 'mp4', - 'title': 'The chaotic GOP immigration vote', - 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'timestamp': 1406937606, - 'upload_date': '20140802', - 'uploader': 'NBCU-NEWS', - 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - embed_url = self._html_search_meta('embedURL', webpage) - return self.url_result(embed_url) From b46eabecd3d6e8ea6dd3dc5a948ecbb65d818205 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Jun 2016 09:41:34 +0700 Subject: [PATCH 068/387] [jsinterp] Relax JS function regex (Closes #9863) --- youtube_dl/jsinterp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a7440c582..9737f7002 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -232,7 +232,7 @@ class JSInterpreter(object): def extract_function(self, funcname): func_m = re.search( r'''(?x) - (?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s* + (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s* \((?P<args>[^)]*)\)\s* \{(?P<code>[^}]+)\}''' % ( re.escape(funcname), re.escape(funcname), re.escape(funcname)), From 011bd3221b1541eaef9bb14786da37abe4d74ecb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Jun 2016 09:42:56 +0700 Subject: [PATCH 069/387] release 2016.06.23.1 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 3 +-- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f7d1020d3..62cb18d7d 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are 
using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.23*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.23** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.23.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.23.1** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.23 +[debug] youtube-dl version 2016.06.23.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 96cc407db..891499f59 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -44,8 +44,8 @@ - **appletrailers:section** - **archive.org**: archive.org videos - **ARD** - - **ARD:mediathek**: Saarländischer Rundfunk - **ARD:mediathek** + - **ARD:mediathek**: Saarländischer Rundfunk - **arte.tv** - **arte.tv:+7** - **arte.tv:cinema** @@ -385,7 +385,6 @@ - **MovieFap** - **Moviezine** - **MPORA** - - **MSNBC** - **MTV** - **mtv.de** - **mtviggy.com** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index bff747906..0238dc97c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.23' +__version__ = '2016.06.23.1' From 6e3c2047f8a51da3bac0d4d290d64b1b8bb8f1c2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 23 Jun 2016 04:34:07 +0100 Subject: [PATCH 070/387] [tvp] extract all formats and detect erros --- youtube_dl/extractor/tvp.py | 59 ++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index a4997cb89..5070082da 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -4,6 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + determine_ext, + clean_html, + get_element_by_attribute, + ExtractorError, +) class TVPIE(InfoExtractor): @@ -21,7 +27,7 @@ class TVPIE(InfoExtractor): }, }, { 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', - 'md5': 'c3b15ed1af288131115ff17a17c19dda', + 'md5': 'b0005b542e5b4de643a9690326ab1257', 'info_dict': { 'id': '17916176', 'ext': 'mp4', @@ -53,6 +59,11 @@ class TVPIE(InfoExtractor): webpage = self._download_webpage( 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) + error_massage = get_element_by_attribute('class', 'msg error', webpage) + if 
error_massage: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, clean_html(error_massage)), expected=True) + title = self._search_regex( r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', webpage, 'title', group='title') @@ -66,24 +77,50 @@ class TVPIE(InfoExtractor): r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None) video_url = self._search_regex( - r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None) - if not video_url: + r'0:{src:([\'"])(?P<url>.*?)\1', webpage, + 'formats', group='url', default=None) + if not video_url or 'material_niedostepny.mp4' in video_url: video_url = self._download_json( 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, video_id)['video_url'] - ext = video_url.rsplit('.', 1)[-1] - if ext != 'ism/manifest': - if '/' in ext: - ext = 'mp4' + formats = [] + video_url_base = self._search_regex( + r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', + video_url, 'video base url', default=None) + if video_url_base: + # TODO: Current DASH formats are broken - $Time$ pattern in + # <SegmentTemplate> not implemented yet + # formats.extend(self._extract_mpd_formats( + # video_url_base + '.ism/video.mpd', + # video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_f4m_formats( + video_url_base + '.ism/video.f4m', + video_id, f4m_id='hds', fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + video_url_base + '.ism/video.m3u8', video_id, + 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) + formats.extend(m3u8_formats) + for i, m3u8_format in enumerate(m3u8_formats, 2): + http_url = '%s-%d.mp4' % (video_url_base, i) + if self._is_valid_url(http_url, video_id): + f = m3u8_format.copy() + f.update({ + 'url': http_url, + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: formats = [{ 'format_id': 'direct', 'url': video_url, - 'ext': ext, + 'ext': determine_ext(video_url, 'mp4'), }] - else: - m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url) - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') self._sort_formats(formats) From 494172d2e5b2d5b6f309b42e1a2bd7108aed40de Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 23 Jun 2016 15:49:42 +0100 Subject: [PATCH 071/387] [appletrailers] extract info from an alternative source if available(closes #8422)(closes #8422) --- youtube_dl/extractor/appletrailers.py | 53 ++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index be40f85b4..babbd0265 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -7,6 +7,8 @@ from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( int_or_none, + parse_duration, + unified_strdate, ) @@ -16,7 +18,8 @@ class AppleTrailersIE(InfoExtractor): _TESTS = [{ 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', 'info_dict': { - 'id': 'manofsteel', + 'id': '5111', + 'title': 'Man of Steel', }, 'playlist': [ { @@ -70,6 +73,15 @@ class AppleTrailersIE(InfoExtractor): 'id': 'blackthorn', }, 'playlist_mincount': 2, + 'expected_warnings': ['Unable to download JSON metadata'], + }, { + # json data only available from 
http://trailers.apple.com/trailers/feeds/data/15881.json + 'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/', + 'info_dict': { + 'id': '15881', + 'title': 'Kung Fu Panda 3', + }, + 'playlist_mincount': 4, }, { 'url': 'http://trailers.apple.com/ca/metropole/autrui/', 'only_matching': True, @@ -85,6 +97,45 @@ class AppleTrailersIE(InfoExtractor): movie = mobj.group('movie') uploader_id = mobj.group('company') + webpage = self._download_webpage(url, movie) + film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id') + film_data = self._download_json( + 'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id, + film_id, fatal=False) + + if film_data: + entries = [] + for clip in film_data.get('clips', []): + clip_title = clip['title'] + + formats = [] + for version, version_data in clip.get('versions', {}).items(): + for size, size_data in version_data.get('sizes', {}).items(): + src = size_data.get('src') + if not src: + continue + formats.append({ + 'format_id': '%s-%s' % (version, size), + 'url': re.sub(r'_(\d+p.mov)', r'_h\1', src), + 'width': int_or_none(size_data.get('width')), + 'height': int_or_none(size_data.get('height')), + 'language': version[:2], + }) + self._sort_formats(formats) + + entries.append({ + 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(), + 'formats': formats, + 'title': clip_title, + 'thumbnail': clip.get('screen') or clip.get('runtime'), + 'duration': parse_duration(clip.get('runtime') or clip.get('faded')), + 'upload_date': unified_strdate(clip.get('posted')), + 'uploader_id': uploader_id, + }) + + page_data = film_data.get('page', {}) + return self.playlist_result(entries, film_id, page_data.get('movie_title')) + playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') def fix_html(s): From 8065d6c55f02c6f618e8495049f253d311cf347f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 23 Jun 2016 17:22:15 +0100 Subject: [PATCH 072/387] [dcn] extend _VALID_URL for awaan.ae and extract all available formats --- youtube_dl/extractor/dcn.py | 47 ++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 5deff5f30..efb8585e8 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -20,7 +20,7 @@ from ..utils import ( class DCNIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' 
def _real_extract(self, url): show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() @@ -55,30 +55,32 @@ class DCNBaseIE(InfoExtractor): 'is_live': is_live, } - def _extract_video_formats(self, webpage, video_id, entry_protocol): + def _extract_video_formats(self, webpage, video_id, m3u8_entry_protocol): formats = [] - m3u8_url = self._html_search_regex( - r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None)) - - rtsp_url = self._search_regex( - r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) - if rtsp_url: - formats.append({ - 'url': rtsp_url, - 'format_id': 'rtsp', - }) - + format_url_base = 'http' + self._html_search_regex( + [ + r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8', + r'<a[^>]+href="rtsp(://[^"]+)"' + ], webpage, 'format url') + # TODO: Current DASH formats are broken - $Time$ pattern in + # <SegmentTemplate> not implemented yet + # formats.extend(self._extract_mpd_formats( + # format_url_base + '/manifest.mpd', + # video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_m3u8_formats( + format_url_base + '/playlist.m3u8', video_id, 'mp4', + m3u8_entry_protocol, m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + format_url_base + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) return formats class DCNVideoIE(DCNBaseIE): IE_NAME = 'dcn:video' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', 'info_dict': { @@ -94,7 +96,10 @@ class DCNVideoIE(DCNBaseIE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -120,7 +125,7 @@ class DCNVideoIE(DCNBaseIE): class DCNLiveIE(DCNBaseIE): IE_NAME = 'dcn:live' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)' def _real_extract(self, url): channel_id = self._match_id(url) @@ -147,7 +152,7 @@ class DCNLiveIE(DCNBaseIE): class DCNSeasonIE(InfoExtractor): IE_NAME = 'dcn:season' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' _TEST = { 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', 'info_dict': From fee70322d76f416c3d68f58abdc73f9d3960083e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 23 Jun 2016 19:03:34 +0100 Subject: [PATCH 073/387] [appletrailers] correct thumbnail fallback --- youtube_dl/extractor/appletrailers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/appletrailers.py 
b/youtube_dl/extractor/appletrailers.py index babbd0265..a6801f3d4 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -127,7 +127,7 @@ class AppleTrailersIE(InfoExtractor): 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(), 'formats': formats, 'title': clip_title, - 'thumbnail': clip.get('screen') or clip.get('runtime'), + 'thumbnail': clip.get('screen') or clip.get('thumb'), 'duration': parse_duration(clip.get('runtime') or clip.get('faded')), 'upload_date': unified_strdate(clip.get('posted')), 'uploader_id': uploader_id, From c1ff6e1ad08c781ce1d486ddb7389fe90c79af35 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 24 Jun 2016 16:48:37 +0800 Subject: [PATCH 074/387] [vimeo:review] Fix extraction for password-protected videos Closes #9853 --- youtube_dl/extractor/vimeo.py | 72 ++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4bdeb1187..d9c9852d4 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -16,6 +16,7 @@ from ..utils import ( ExtractorError, InAdvancePagedList, int_or_none, + NO_DEFAULT, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -56,6 +57,26 @@ class VimeoBaseInfoExtractor(InfoExtractor): self._set_vimeo_cookie('vuid', vuid) self._download_webpage(login_request, None, False, 'Wrong login info') + def _verify_video_password(self, url, video_id, webpage): + password = self._downloader.params.get('videopassword') + if password is None: + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) + token, vuid = self._extract_xsrft_and_vuid(webpage) + data = urlencode_postdata({ + 'password': password, + 'token': token, + }) + if url.startswith('http://'): + # vimeo only supports https now, but the user can give an http url + url = url.replace('http://', 'https://') + password_request = sanitized_Request(url + '/password', data) + password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + password_request.add_header('Referer', url) + self._set_vimeo_cookie('vuid', vuid) + return self._download_webpage( + password_request, video_id, + 'Verifying the password', 'Wrong password') + def _extract_xsrft_and_vuid(self, webpage): xsrft = self._search_regex( r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)', @@ -344,26 +365,6 @@ class VimeoIE(VimeoBaseInfoExtractor): if mobj: return mobj.group(1) - def _verify_video_password(self, url, video_id, webpage): - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ - 'password': password, - 'token': token, - }) - if url.startswith('http://'): - # vimeo only supports https now, but the user can give an http url - url = url.replace('http://', 'https://') - password_request = sanitized_Request(url + '/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) - self._set_vimeo_cookie('vuid', vuid) - return self._download_webpage( - password_request, video_id, - 'Verifying the password', 'Wrong password') - def _verify_player_video_password(self, url, video_id): password = 
self._downloader.params.get('videopassword') if password is None: @@ -791,12 +792,39 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'uploader_id': 'user22258446', } + }, { + 'note': 'Password protected', + 'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde', + 'info_dict': { + 'id': '138823582', + 'ext': 'mp4', + 'title': 'EFFICIENT PICKUP MASTERCLASS MODULE 1', + 'uploader': 'TMB', + 'uploader_id': 'user37284429', + }, + 'params': { + 'videopassword': 'holygrail', + }, }] + def _real_initialize(self): + self._login() + + def _get_config_url(self, webpage_url, video_id, video_password_verified=False): + webpage = self._download_webpage(webpage_url, video_id) + config_url = self._html_search_regex( + r'data-config-url="([^"]+)"', webpage, 'config URL', + default=NO_DEFAULT if video_password_verified else None) + if config_url is None: + self._verify_video_password(webpage_url, video_id, webpage) + config_url = self._get_config_url( + webpage_url, video_id, video_password_verified=True) + return config_url + def _real_extract(self, url): video_id = self._match_id(url) - config = self._download_json( - 'https://player.vimeo.com/video/%s/config' % video_id, video_id) + config_url = self._get_config_url(url, video_id) + config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) self._vimeo_sort_formats(info_dict['formats']) info_dict['id'] = video_id From 896cc727508f1d1054d88405c64e731c4d5c4ce4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 24 Jun 2016 17:26:12 +0800 Subject: [PATCH 075/387] [mixcloud] View count and like count may be absent Closes #9874 --- youtube_dl/extractor/mixcloud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 483f6925f..560fe188b 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -102,11 +102,11 @@ class MixcloudIE(InfoExtractor): description = self._og_search_description(webpage) like_count = parse_count(self._search_regex( r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)', - webpage, 'like count', fatal=False)) + webpage, 'like count', default=None)) view_count = str_to_int(self._search_regex( [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', r'/listeners/?">([0-9,.]+)</a>'], - webpage, 'play count', fatal=False)) + webpage, 'play count', default=None)) return { 'id': track_id, From de3c7fe0d42fd6027b99ab87ee5b4a4b4054daf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 24 Jun 2016 22:27:55 +0700 Subject: [PATCH 076/387] [youtube] Fix 141 format tests --- youtube_dl/extractor/youtube.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 00dd602ff..54c6e45f8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -501,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'youtube_include_dash_manifest': True, 'format': '141', }, + 'skip': 'format 141 not served anymore', }, # DASH manifest with encrypted signature { @@ -517,7 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'params': { 'youtube_include_dash_manifest': True, - 'format': '141', + 'format': '141/bestaudio[ext=m4a]', }, }, # JS player signature function name containing $ @@ -537,7 +538,7 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): }, 'params': { 'youtube_include_dash_manifest': True, - 'format': '141', + 'format': '141/bestaudio[ext=m4a]', }, }, # Controversy video From 525cedb971c091793da5a0ef90670e6b5faded62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 24 Jun 2016 22:37:13 +0700 Subject: [PATCH 077/387] [youtube] Relax URL expansion in description --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 54c6e45f8..46b9dc66c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1332,7 +1332,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:[a-zA-Z-]+="[^"]*"\s+)*? (?:title|href)="([^"]+)"\s+ (?:[a-zA-Z-]+="[^"]*"\s+)*? - class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*> + class="[^"]*"[^>]*> [^<]+\.{3}\s* </a> ''', r'\1', video_description) From be49068d65ae39bef5797071f8a7cf1c733f033b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 24 Jun 2016 22:47:19 +0700 Subject: [PATCH 078/387] [youtube] Fix and skip some tests --- youtube_dl/extractor/youtube.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 46b9dc66c..c8d54f22a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -619,7 +619,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic', 'license': 'Standard YouTube License', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', - 'uploader': 'Olympics', + 'uploader': 'Olympic', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', }, 'params': { @@ -672,7 +672,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', 'license': 'Standard YouTube License', - 'formats': 'mincount:33', + 'formats': 'mincount:32', }, }, # DASH manifest with segment_list @@ -692,7 +692,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'youtube_include_dash_manifest': True, 'format': '135', # bestvideo - } + }, + 'skip': 'This live event has ended.', }, { # Multifeed videos (multiple cameras), URL is for Main Camera @@ -763,6 +764,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', }, 'playlist_count': 2, + 'skip': 'Not multifeed anymore', }, { 'url': 'http://vid.plus/FlRa-iH7PGw', @@ -815,6 +817,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video does not exist.', }, { # Video licensed under Creative Commons From 3d4b08dfc7a1cf49686b68d405053475a4c3c490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 25 Jun 2016 02:50:12 +0700 Subject: [PATCH 079/387] [setup.py] Add file version information and quotes consistency (Closes #9878) --- setup.py | 63 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/setup.py b/setup.py index c1e923f71..508b27f37 100644 --- a/setup.py +++ b/setup.py @@ -21,25 +21,37 @@ try: import py2exe except ImportError: if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe': - print("Cannot import py2exe", file=sys.stderr) + print('Cannot import py2exe', file=sys.stderr) exit(1) 
py2exe_options = { - "bundle_files": 1, - "compressed": 1, - "optimize": 2, - "dist_dir": '.', - "dll_excludes": ['w9xpopen.exe', 'crypt32.dll'], + 'bundle_files': 1, + 'compressed': 1, + 'optimize': 2, + 'dist_dir': '.', + 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], } +# Get the version from youtube_dl/version.py without importing the package +exec(compile(open('youtube_dl/version.py').read(), + 'youtube_dl/version.py', 'exec')) + +DESCRIPTION = 'YouTube video downloader' +LONG_DESCRIPTION = 'Command-line program to download videos from YouTube.com and other video sites' + py2exe_console = [{ - "script": "./youtube_dl/__main__.py", - "dest_base": "youtube-dl", + 'script': './youtube_dl/__main__.py', + 'dest_base': 'youtube-dl', + 'version': __version__, + 'description': DESCRIPTION, + 'comments': LONG_DESCRIPTION, + 'product_name': 'youtube-dl', + 'product_version': __version__, }] py2exe_params = { 'console': py2exe_console, - 'options': {"py2exe": py2exe_options}, + 'options': {'py2exe': py2exe_options}, 'zipfile': None } @@ -72,7 +84,7 @@ else: params['scripts'] = ['bin/youtube-dl'] class build_lazy_extractors(Command): - description = "Build the extractor lazy loading module" + description = 'Build the extractor lazy loading module' user_options = [] def initialize_options(self): @@ -87,16 +99,11 @@ class build_lazy_extractors(Command): dry_run=self.dry_run, ) -# Get the version from youtube_dl/version.py without importing the package -exec(compile(open('youtube_dl/version.py').read(), - 'youtube_dl/version.py', 'exec')) - setup( name='youtube_dl', version=__version__, - description='YouTube video downloader', - long_description='Small command-line program to download videos from' - ' YouTube.com and other video sites.', + description=DESCRIPTION, + long_description=LONG_DESCRIPTION, url='https://github.com/rg3/youtube-dl', author='Ricardo Garcia', author_email='ytdl@yt-dl.org', @@ -112,17 +119,17 @@ setup( # test_requires = ['nosetest'], classifiers=[ - "Topic :: Multimedia :: Video", - "Development Status :: 5 - Production/Stable", - "Environment :: Console", - "License :: Public Domain", - "Programming Language :: Python :: 2.6", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.2", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", + 'Topic :: Multimedia :: Video', + 'Development Status :: 5 - Production/Stable', + 'Environment :: Console', + 'License :: Public Domain', + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', ], cmdclass={'build_lazy_extractors': build_lazy_extractors}, From b4241e308e9b2d38d564833cb6c43c9fcc0fd280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 25 Jun 2016 03:03:20 +0700 Subject: [PATCH 080/387] release 2016.06.25 --- .github/ISSUE_TEMPLATE.md | 6 +++--- youtube_dl/version.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 62cb18d7d..c73f9a904 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version 
is *2016.06.23.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.23.1** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.25*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.25** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.23.1 +[debug] youtube-dl version 2016.06.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0238dc97c..2b7a4c98d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.23.1' +__version__ = '2016.06.25' From 46f59e89ea1e75bf2bd1657f0863a3e5e81f91ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 25 Jun 2016 22:30:35 +0700 Subject: [PATCH 081/387] [utils] Add unified_timestamp --- test/test_utils.py | 21 +++++++ youtube_dl/utils.py | 150 +++++++++++++++++++++++++++----------------- 2 files changed, 113 insertions(+), 58 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index b7ef51f8d..7f9385deb 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -60,6 +60,7 @@ from youtube_dl.utils import ( timeconvert, unescapeHTML, unified_strdate, + unified_timestamp, unsmuggle_url, uppercase_escape, lowercase_escape, @@ -283,8 +284,28 @@ class TestUtil(unittest.TestCase): '20150202') self.assertEqual(unified_strdate('Feb 14th 2016 5:45PM'), '20160214') self.assertEqual(unified_strdate('25-09-2014'), '20140925') + self.assertEqual(unified_strdate('27.02.2016 17:30'), '20160227') self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None) + def test_unified_timestamps(self): + self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600) + self.assertEqual(unified_timestamp('8/7/2009'), 1247011200) + self.assertEqual(unified_timestamp('Dec 14, 2012'), 1355443200) + self.assertEqual(unified_timestamp('2012/10/11 01:56:38 +0000'), 1349920598) + self.assertEqual(unified_timestamp('1968 12 10'), -33436800) + self.assertEqual(unified_timestamp('1968-12-10'), -33436800) + self.assertEqual(unified_timestamp('28/01/2014 21:00:00 +0100'), 1390939200) + self.assertEqual( + unified_timestamp('11/26/2014 11:30:00 AM PST', day_first=False), + 1417001400) + self.assertEqual( + unified_timestamp('2/2/2015 6:47:40 PM', day_first=False), + 1422902860) + self.assertEqual(unified_timestamp('Feb 14th 2016 5:45PM'), 1455471900) + self.assertEqual(unified_timestamp('25-09-2014'), 1411603200) + 
self.assertEqual(unified_timestamp('27.02.2016 17:30'), 1456594200) + self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None) + def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') self.assertEqual(determine_ext('http://example.com/foo/bar/?download', None), None) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 562031fe1..de66cb482 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -110,6 +110,49 @@ ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐ،٠itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'], 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy'))) +DATE_FORMATS = ( + '%d %B %Y', + '%d %b %Y', + '%B %d %Y', + '%b %d %Y', + '%b %dst %Y %I:%M', + '%b %dnd %Y %I:%M', + '%b %dth %Y %I:%M', + '%Y %m %d', + '%Y-%m-%d', + '%Y/%m/%d', + '%Y/%m/%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S.%f', + '%d.%m.%Y %H:%M', + '%d.%m.%Y %H.%M', + '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S.%fZ', + '%Y-%m-%dT%H:%M:%S.%f0Z', + '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S.%f', + '%Y-%m-%dT%H:%M', +) + +DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) +DATE_FORMATS_DAY_FIRST.extend([ + '%d-%m-%Y', + '%d.%m.%Y', + '%d.%m.%y', + '%d/%m/%Y', + '%d/%m/%y', + '%d/%m/%Y %H:%M:%S', +]) + +DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) +DATE_FORMATS_MONTH_FIRST.extend([ + '%m-%d-%Y', + '%m.%d.%Y', + '%m/%d/%Y', + '%m/%d/%y', + '%m/%d/%Y %H:%M:%S', +]) + def preferredencoding(): """Get preferred encoding. @@ -975,6 +1018,24 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): https_response = http_response +def extract_timezone(date_str): + m = re.search( + r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', + date_str) + if not m: + timezone = datetime.timedelta() + else: + date_str = date_str[:-len(m.group('tz'))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) + return timezone, date_str + + def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ @@ -984,20 +1045,8 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): date_str = re.sub(r'\.[0-9]+', '', date_str) if timezone is None: - m = re.search( - r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', - date_str) - if not m: - timezone = datetime.timedelta() - else: - date_str = date_str[:-len(m.group(0))] - if not m.group('sign'): - timezone = datetime.timedelta() - else: - sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( - hours=sign * int(m.group('hours')), - minutes=sign * int(m.group('minutes'))) + timezone, date_str = extract_timezone(date_str) + try: date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) dt = datetime.datetime.strptime(date_str, date_format) - timezone @@ -1006,6 +1055,10 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): pass +def date_formats(day_first=True): + return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST + + def unified_strdate(date_str, day_first=True): """Return a string with the date in the format YYYYMMDD""" @@ -1014,53 +1067,11 @@ def unified_strdate(date_str, day_first=True): upload_date = None # Replace commas date_str = date_str.replace(',', ' ') - # %z (UTC offset) is only supported in python>=3.2 - if not 
re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str): - date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) # Remove AM/PM + timezone date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + _, date_str = extract_timezone(date_str) - format_expressions = [ - '%d %B %Y', - '%d %b %Y', - '%B %d %Y', - '%b %d %Y', - '%b %dst %Y %I:%M', - '%b %dnd %Y %I:%M', - '%b %dth %Y %I:%M', - '%Y %m %d', - '%Y-%m-%d', - '%Y/%m/%d', - '%Y/%m/%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S.%f', - '%d.%m.%Y %H:%M', - '%d.%m.%Y %H.%M', - '%Y-%m-%dT%H:%M:%SZ', - '%Y-%m-%dT%H:%M:%S.%fZ', - '%Y-%m-%dT%H:%M:%S.%f0Z', - '%Y-%m-%dT%H:%M:%S', - '%Y-%m-%dT%H:%M:%S.%f', - '%Y-%m-%dT%H:%M', - ] - if day_first: - format_expressions.extend([ - '%d-%m-%Y', - '%d.%m.%Y', - '%d.%m.%y', - '%d/%m/%Y', - '%d/%m/%y', - '%d/%m/%Y %H:%M:%S', - ]) - else: - format_expressions.extend([ - '%m-%d-%Y', - '%m.%d.%Y', - '%m/%d/%Y', - '%m/%d/%y', - '%m/%d/%Y %H:%M:%S', - ]) - for expression in format_expressions: + for expression in date_formats(day_first): try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') except ValueError: @@ -1076,6 +1087,29 @@ def unified_strdate(date_str, day_first=True): return compat_str(upload_date) +def unified_timestamp(date_str, day_first=True): + if date_str is None: + return None + + date_str = date_str.replace(',', ' ') + + pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0) + timezone, date_str = extract_timezone(date_str) + + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + + for expression in date_formats(day_first): + try: + dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta + return calendar.timegm(dt.timetuple()) + except ValueError: + pass + timetuple = email.utils.parsedate_tz(date_str) + if timetuple: + return calendar.timegm(timetuple.timetuple()) + + def determine_ext(url, default_ext='unknown_video'): if url is None: return default_ext From b72b44318c0de52befe003c14297cae9f7a283c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 25 Jun 2016 22:32:02 +0700 Subject: [PATCH 082/387] [utils] Add strip_or_none --- youtube_dl/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index de66cb482..a375282f2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1660,6 +1660,10 @@ def float_or_none(v, scale=1, invscale=1, default=None): return default +def strip_or_none(v): + return None if v is None else v.strip() + + def parse_duration(s): if not isinstance(s, compat_basestring): return None From 2d185706ea0236a30bd1037a3ab97fbe5fe575a5 Mon Sep 17 00:00:00 2001 From: Jakub Adam Wieczorek <ja.wieczorek@student.uw.edu.pl> Date: Thu, 16 Jun 2016 21:00:27 +0200 Subject: [PATCH 083/387] [polskieradio] Add support for Polskie Radio. Polskie Radio is the main Polish state-funded radio broadcasting service. 
--- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/polskieradio.py | 74 ++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 youtube_dl/extractor/polskieradio.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6fc5a18f5..9f98a1490 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -606,6 +606,7 @@ from .pluralsight import ( PluralsightCourseIE, ) from .podomatic import PodomaticIE +from .polskieradio import PolskieRadioIE from .porn91 import Porn91IE from .pornhd import PornHdIE from .pornhub import ( diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py new file mode 100644 index 000000000..5d4b116d8 --- /dev/null +++ b/youtube_dl/extractor/polskieradio.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + +import calendar +from datetime import datetime + + +class PolskieRadioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/[0-9]+/[0-9]+/Artykul/(?P<id>[0-9]+),.+' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', + 'md5': '2984ee6ce9046d91fc233bc1a864a09a', + 'info_dict': { + 'id': '1587943', + 'ext': 'mp3', + 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', + 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', + 'release_date': '20160227', + 'upload_date': '20160227', + 'timestamp': 1456594200, + 'duration': 2364 + } + }, { + 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', + 'md5': '68a393e25b942c1a76872f56d303a31a', + 'info_dict': { + 'id': '1632955', + 'ext': 'mp3', + 'title': 'Bardzo popularne słowo: remis', + 'description': 'md5:3b58dfae614100abc0f175a0b26d5680', + 'release_date': '20160617', + 'upload_date': '20160617', + 'timestamp': 1466184900, + 'duration': 393 + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + metadata_string = self._html_search_regex(r'<span class="play pr-media-play" data-media=(\{.+\})>', webpage, 'metadata') + metadata = self._parse_json(metadata_string, video_id) + + title = self._og_search_title(webpage) + if title is not None: + title = title.strip() + + description = self._og_search_description(webpage) + if description is not None: + description = description.strip() + + release_date = self._html_search_regex(r'Data emisji:[^0-9]+([0-9]{1,2}\.[0-9]{2}\.[0-9]{4})', webpage, 'release date', fatal=False) + if release_date is not None: + release_date = datetime.strptime(release_date, '%d.%m.%Y').strftime('%Y%m%d') + + upload_datetime = self._html_search_regex(r'<span id="datetime2" class="time">\s+(.+)\s+</span>', webpage, 'release time', fatal=False) + if upload_datetime is not None: + timestamp = calendar.timegm(datetime.strptime(upload_datetime, '%d.%m.%Y %H:%M').timetuple()) + else: + timestamp = None + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'display_id': metadata.get('id'), + 'duration': int_or_none(metadata.get('length')), + 'url': self._proto_relative_url(metadata.get('file'), 'http:'), + 'release_date': release_date, + 'timestamp': timestamp + } From 0463b77a1f83f3f9239c6c5f5d1ca251afd267e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 25 Jun 2016 23:18:40 +0700 
Subject: [PATCH 084/387] [polskieradio] Improve extraction (Closes #9813) --- youtube_dl/extractor/polskieradio.py | 119 +++++++++++++++------------ 1 file changed, 68 insertions(+), 51 deletions(-) diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py index 5d4b116d8..f5adff08f 100644 --- a/youtube_dl/extractor/polskieradio.py +++ b/youtube_dl/extractor/polskieradio.py @@ -1,74 +1,91 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import int_or_none +import re -import calendar -from datetime import datetime +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) +from ..utils import ( + int_or_none, + strip_or_none, + unified_timestamp, +) class PolskieRadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/[0-9]+/[0-9]+/Artykul/(?P<id>[0-9]+),.+' + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', - 'md5': '2984ee6ce9046d91fc233bc1a864a09a', 'info_dict': { 'id': '1587943', - 'ext': 'mp3', 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', - 'release_date': '20160227', - 'upload_date': '20160227', - 'timestamp': 1456594200, - 'duration': 2364 - } + }, + 'playlist': [{ + 'md5': '2984ee6ce9046d91fc233bc1a864a09a', + 'info_dict': { + 'id': '1540576', + 'ext': 'mp3', + 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', + 'timestamp': 1456594200, + 'upload_date': '20160227', + 'duration': 2364, + }, + }], + }, { + 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', + 'info_dict': { + 'id': '1635803', + 'title': 'Euro 2016: nie ma miejsca na błąd. 
Polacy grają ze Szwajcarią o ćwierćfinał', + 'description': 'md5:01cb7d0cad58664095d72b51a1ebada2', + }, + 'playlist_mincount': 12, }, { 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', - 'md5': '68a393e25b942c1a76872f56d303a31a', - 'info_dict': { - 'id': '1632955', - 'ext': 'mp3', - 'title': 'Bardzo popularne słowo: remis', - 'description': 'md5:3b58dfae614100abc0f175a0b26d5680', - 'release_date': '20160617', - 'upload_date': '20160617', - 'timestamp': 1466184900, - 'duration': 393 - } + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - metadata_string = self._html_search_regex(r'<span class="play pr-media-play" data-media=(\{.+\})>', webpage, 'metadata') - metadata = self._parse_json(metadata_string, video_id) + playlist_id = self._match_id(url) - title = self._og_search_title(webpage) - if title is not None: - title = title.strip() + webpage = self._download_webpage(url, playlist_id) - description = self._og_search_description(webpage) - if description is not None: - description = description.strip() + content = self._search_regex( + r'(?s)<div[^>]+class="audio atarticle"[^>]*>(.+?)<script>', + webpage, 'content') - release_date = self._html_search_regex(r'Data emisji:[^0-9]+([0-9]{1,2}\.[0-9]{2}\.[0-9]{4})', webpage, 'release date', fatal=False) - if release_date is not None: - release_date = datetime.strptime(release_date, '%d.%m.%Y').strftime('%Y%m%d') + timestamp = unified_timestamp(self._html_search_regex( + r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', + webpage, 'timestamp', fatal=False)) - upload_datetime = self._html_search_regex(r'<span id="datetime2" class="time">\s+(.+)\s+</span>', webpage, 'release time', fatal=False) - if upload_datetime is not None: - timestamp = calendar.timegm(datetime.strptime(upload_datetime, '%d.%m.%Y %H:%M').timetuple()) - else: - timestamp = None + entries = [] - return { - 'id': video_id, - 'title': title, - 'description': description, - 'display_id': metadata.get('id'), - 'duration': int_or_none(metadata.get('length')), - 'url': self._proto_relative_url(metadata.get('file'), 'http:'), - 'release_date': release_date, - 'timestamp': timestamp - } + media_urls = set() + + for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content): + media = self._parse_json(data_media, playlist_id, fatal=False) + if not media.get('file') or not media.get('desc'): + continue + media_url = self._proto_relative_url(media['file'], 'http:') + if media_url in media_urls: + continue + media_urls.add(media_url) + entries.append({ + 'id': compat_str(media['id']), + 'url': media_url, + 'title': compat_urllib_parse_unquote(media['desc']), + 'duration': int_or_none(media.get('length')), + 'vcodec': 'none' if media.get('provider') == 'audio' else None, + 'timestamp': timestamp, + }) + + title = self._og_search_title(webpage).strip() + description = strip_or_none(self._og_search_description(webpage)) + + return self.playlist_result(entries, playlist_id, title, description) From ce96ed05f42d42f8a506a2a527c776054c44ad1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 25 Jun 2016 23:31:21 +0700 Subject: [PATCH 085/387] [polskieradio] Add test with video --- youtube_dl/extractor/polskieradio.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/polskieradio.py 
b/youtube_dl/extractor/polskieradio.py index f5adff08f..d3bebaea3 100644 --- a/youtube_dl/extractor/polskieradio.py +++ b/youtube_dl/extractor/polskieradio.py @@ -49,6 +49,10 @@ class PolskieRadioIE(InfoExtractor): }, { 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', 'only_matching': True, + }, { + # with mp4 video + 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', + 'only_matching': True, }] def _real_extract(self, url): From 0c00e889f3616cecd4de161681924c4cb12ce320 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 25 Jun 2016 23:35:57 +0700 Subject: [PATCH 086/387] Credit @JakubAdamWieczorek for #9813 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index cdf655c39..bdd29687d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -175,3 +175,4 @@ Tomáš Čech Déstin Reed Roman Tsiupa Artur Krysiak +Jakub Adam Wieczorek From ac782306f18430479e881ffd1ac749baff4dd9aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Jun 2016 00:25:41 +0700 Subject: [PATCH 087/387] [iqiyi] Mark broken --- youtube_dl/extractor/iqiyi.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ddcb3c916..5dd15e26f 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -165,6 +165,8 @@ class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' + _WORKING = False + _VALID_URL = r'https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html' _NETRC_MACHINE = 'iqiyi' From 3b34ab538c5bc67be18df8376c25f433ea1ff92b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Jun 2016 00:29:53 +0700 Subject: [PATCH 088/387] [svtplay] Extend _VALID_URL (#9900) --- youtube_dl/extractor/svt.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 67f56fab8..1c04dfb7b 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -120,7 +120,7 @@ class SVTIE(SVTBaseIE): class SVTPlayIE(SVTBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -141,6 +141,9 @@ class SVTPlayIE(SVTBaseIE): # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', 'only_matching': True, + }, { + 'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg', + 'only_matching': True, }] def _real_extract(self, url): From a2406fce3c657af116f95c5e9b965315aa23cd95 Mon Sep 17 00:00:00 2001 From: stepshal <nessento@openmailbox.org> Date: Sun, 26 Jun 2016 01:23:48 +0700 Subject: [PATCH 089/387] Fix misspelling --- youtube_dl/socks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py index fd49d7435..104807242 100644 --- a/youtube_dl/socks.py +++ b/youtube_dl/socks.py @@ -76,7 +76,7 @@ class Socks4Error(ProxyError): CODES = { 91: 'request rejected or failed', - 92: 'request rejected becasue SOCKS server cannot connect to identd on the client', + 92: 'request rejected because SOCKS server cannot connect to identd on the 
client', 93: 'request rejected because the client program and identd report different user-ids' } From 7d52c052efe7accf098bca84aef0ea70caa64889 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 26 Jun 2016 11:54:52 +0800 Subject: [PATCH 090/387] [generic] Fix test_Generic_76 Broken: https://travis-ci.org/rg3/youtube-dl/jobs/140251658 --- youtube_dl/extractor/generic.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4aa24061c..1592a8a3a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1091,12 +1091,17 @@ class GenericIE(InfoExtractor): # Dailymotion Cloud video { 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', - 'md5': '49444254273501a64675a7e68c502681', + 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38', 'info_dict': { - 'id': '5585de919473990de4bee11b', + 'id': 'x2uy8t3', 'ext': 'mp4', - 'title': 'Le débat', + 'title': 'Sauvons les abeilles ! - Le débat', + 'description': 'md5:d9082128b1c5277987825d684939ca26', 'thumbnail': 're:^https?://.*\.jpe?g$', + 'timestamp': 1434970506, + 'upload_date': '20150622', + 'uploader': 'Public Sénat', + 'uploader_id': 'xa9gza', } }, # OnionStudios embed From 1143535d762fc4260aacc108f2c41079867f9f00 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 26 Jun 2016 15:16:49 +0800 Subject: [PATCH 091/387] [utils] Add urshift() Used in IqiyiIE and LeIE --- test/test_utils.py | 5 +++++ youtube_dl/utils.py | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 7f9385deb..ed61e4c27 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -66,6 +66,7 @@ from youtube_dl.utils import ( lowercase_escape, url_basename, urlencode_postdata, + urshift, update_url_query, version_tuple, xpath_with_ns, @@ -980,5 +981,9 @@ The first line self.assertRaises(ValueError, encode_base_n, 0, 70) self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table) + def test_urshift(self): + self.assertEqual(urshift(3, 1), 1) + self.assertEqual(urshift(-3, 1), 2147483646) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a375282f2..a2cfb48a6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2899,3 +2899,7 @@ def parse_m3u8_attributes(attrib): val = val[1:-1] info[key] = val return info + + +def urshift(val, n): + return val >> n if val >= 0 else (val + 0x100000000) >> n From 30105f4ac0291bd3e1350a5bb383e88e260b9ad9 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 26 Jun 2016 15:17:26 +0800 Subject: [PATCH 092/387] [le] Move urshift() to utils.py --- youtube_dl/extractor/leeco.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 63f581cd9..959d71617 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -23,6 +23,7 @@ from ..utils import ( sanitized_Request, str_or_none, url_basename, + urshift, ) @@ -74,15 +75,11 @@ class LeIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def urshift(val, n): - return val >> n if val >= 0 else (val + 0x100000000) >> n - # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf def ror(self, param1, param2): _loc3_ = 0 while _loc3_ < param2: - param1 = self.urshift(param1, 1) + 
((param1 & 1) << 31) + param1 = urshift(param1, 1) + ((param1 & 1) << 31) _loc3_ += 1 return param1 From 5b6ad8630c4947f3695513c9707406b2d12ae7b8 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 26 Jun 2016 15:18:32 +0800 Subject: [PATCH 093/387] [iqiyi] Partially fix IqiyiIE Use the HTML5 API. Only low-resolution formats available Related: #9839 Thanks @zhangn1985 for the overall algorithm (soimort/you-get#1224) --- youtube_dl/extractor/iqiyi.py | 404 ++++++++++------------------------ 1 file changed, 118 insertions(+), 286 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 5dd15e26f..b717ca09c 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -1,30 +1,25 @@ # coding: utf-8 from __future__ import unicode_literals +import binascii import hashlib import itertools import math -import os -import random import re import time -import uuid from .common import InfoExtractor from ..compat import ( - compat_parse_qs, compat_str, compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, ) from ..utils import ( decode_packed_codes, ExtractorError, + intlist_to_bytes, ohdave_rsa_encrypt, remove_start, - sanitized_Request, - urlencode_postdata, - url_basename, + urshift, ) @@ -165,77 +160,28 @@ class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' - _WORKING = False - _VALID_URL = r'https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html' _NETRC_MACHINE = 'iqiyi' _TESTS = [{ 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', - 'md5': '2cb594dc2781e6c941a110d8f358118b', + 'md5': '470a6c160618577166db1a7aac5a3606', 'info_dict': { 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', + 'ext': 'mp4', 'title': '美国德州空中惊现奇异云团 酷似UFO', - 'ext': 'f4v', } }, { 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', + 'md5': 'f09f0a6a59b2da66a26bf4eda669a4cc', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb', - 'title': '名侦探柯南第752集', + 'ext': 'mp4', + 'title': '名侦探柯南 国语版', }, - 'playlist': [{ - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part5', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part6', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part7', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part8', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }], 'params': { - 'skip_download': True, + 'cn_verification_proxy': 'http://proxy.uku.im:443/', }, }, { 'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html', @@ -289,13 +235,6 @@ class IqiyiIE(InfoExtractor): ('10', 'h1'), ] - AUTH_API_ERRORS = { - # No preview available (不允许试看鉴权失败) - 'Q00505': 'This video requires a VIP account', - # End of preview time (试看结束鉴权失败) - 'Q00506': 'Needs a VIP account for full video', - } - def _real_initialize(self): self._login() @@ -354,177 +293,101 @@ class IqiyiIE(InfoExtractor): return True - def _authenticate_vip_video(self, api_video_url, video_id, 
tvid, _uuid, do_report_warning): - auth_params = { - # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as - 'version': '2.0', - 'platform': 'b6c13e26323c537d', - 'aid': tvid, + @staticmethod + def _gen_sc(tvid, timestamp): + M = [1732584193, -271733879] + M.extend([~M[0], ~M[1]]) + I_table = [7, 12, 17, 22, 5, 9, 14, 20, 4, 11, 16, 23, 6, 10, 15, 21] + C_base = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8388608, 432] + + def L(n, t): + if t is None: + t = 0 + return trunc(((n >> 1) + (t >> 1) << 1) + (n & 1) + (t & 1)) + + def trunc(n): + n = n % 0x100000000 + if n > 0x7fffffff: + n -= 0x100000000 + return n + + def transform(string, mod): + num = int(string, 16) + return (num >> 8 * (i % 4) & 255 ^ i % mod) << ((a & 3) << 3) + + C = list(C_base) + o = list(M) + k = str(timestamp - 7) + for i in range(13): + a = i + C[a >> 2] |= ord(k[a]) << 8 * (a % 4) + + for i in range(16): + a = i + 13 + start = (i >> 2) * 8 + r = '03967743b643f66763d623d637e30733' + C[a >> 2] |= transform(''.join(reversed(r[start:start + 8])), 7) + + for i in range(16): + a = i + 29 + start = (i >> 2) * 8 + r = '7038766939776a32776a32706b337139' + C[a >> 2] |= transform(r[start:start + 8], 1) + + for i in range(9): + a = i + 45 + if i < len(tvid): + C[a >> 2] |= ord(tvid[i]) << 8 * (a % 4) + + for a in range(64): + i = a + I = i >> 4 + C_index = [i, 5 * i + 1, 3 * i + 5, 7 * i][I] % 16 + urshift(a, 6) + m = L(L(o[0], [ + trunc(o[1] & o[2]) | trunc(~o[1] & o[3]), + trunc(o[3] & o[1]) | trunc(~o[3] & o[2]), + o[1] ^ o[2] ^ o[3], + o[2] ^ trunc(o[1] | ~o[3]) + ][I]), L( + trunc(int(abs(math.sin(i + 1)) * 4294967296)), + C[C_index] if C_index < len(C) else None)) + I = I_table[4 * I + i % 4] + o = [o[3], + L(o[1], trunc(trunc(m << I) | urshift(m, 32 - I))), + o[1], + o[2]] + + new_M = [L(o[0], M[0]), L(o[1], M[1]), L(o[2], M[2]), L(o[3], M[3])] + s = [new_M[a >> 3] >> (1 ^ a & 7) * 4 & 15 for a in range(32)] + return binascii.hexlify(intlist_to_bytes(s))[1::2].decode('ascii') + + def get_raw_data(self, tvid, video_id): + tm = int(time.time() * 1000) + + sc = self._gen_sc(tvid, tm) + params = { + 'platForm': 'h5', + 'rate': 1, 'tvid': tvid, - 'uid': '', - 'deviceId': _uuid, - 'playType': 'main', # XXX: always main? 
- 'filename': os.path.splitext(url_basename(api_video_url))[0], - } - - qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query) - for key, val in qd_items.items(): - auth_params[key] = val[0] - - auth_req = sanitized_Request( - 'http://api.vip.iqiyi.com/services/ckn.action', - urlencode_postdata(auth_params)) - # iQiyi server throws HTTP 405 error without the following header - auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - auth_result = self._download_json( - auth_req, video_id, - note='Downloading video authentication JSON', - errnote='Unable to download video authentication JSON') - - code = auth_result.get('code') - msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code - if code == 'Q00506': - if do_report_warning: - self.report_warning(msg) - return False - if 'data' not in auth_result: - if msg is not None: - raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True) - raise ExtractorError('Unexpected error from Iqiyi auth API') - - return auth_result['data'] - - def construct_video_urls(self, data, video_id, _uuid, tvid): - def do_xor(x, y): - a = y % 3 - if a == 1: - return x ^ 121 - if a == 2: - return x ^ 72 - return x ^ 103 - - def get_encode_code(l): - a = 0 - b = l.split('-') - c = len(b) - s = '' - for i in range(c - 1, -1, -1): - a = do_xor(int(b[c - i - 1], 16), i) - s += chr(a) - return s[::-1] - - def get_path_key(x, format_id, segment_index): - mg = ')(*&^flash@#$%a' - tm = self._download_json( - 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id, - note='Download path key of segment %d for format %s' % (segment_index + 1, format_id) - )['t'] - t = str(int(math.floor(int(tm) / (600.0)))) - return md5_text(t + mg + x) - - video_urls_dict = {} - need_vip_warning_report = True - for format_item in data['vp']['tkl'][0]['vs']: - if 0 < int(format_item['bid']) <= 10: - format_id = self.get_format(format_item['bid']) - else: - continue - - video_urls = [] - - video_urls_info = format_item['fs'] - if not format_item['fs'][0]['l'].startswith('/'): - t = get_encode_code(format_item['fs'][0]['l']) - if t.endswith('mp4'): - video_urls_info = format_item['flvs'] - - for segment_index, segment in enumerate(video_urls_info): - vl = segment['l'] - if not vl.startswith('/'): - vl = get_encode_code(vl) - is_vip_video = '/vip/' in vl - filesize = segment['b'] - base_url = data['vp']['du'].split('/') - if not is_vip_video: - key = get_path_key( - vl.split('/')[-1].split('.')[0], format_id, segment_index) - base_url.insert(-1, key) - base_url = '/'.join(base_url) - param = { - 'su': _uuid, - 'qyid': uuid.uuid4().hex, - 'client': '', - 'z': '', - 'bt': '', - 'ct': '', - 'tn': str(int(time.time())) - } - api_video_url = base_url + vl - if is_vip_video: - api_video_url = api_video_url.replace('.f4v', '.hml') - auth_result = self._authenticate_vip_video( - api_video_url, video_id, tvid, _uuid, need_vip_warning_report) - if auth_result is False: - need_vip_warning_report = False - break - param.update({ - 't': auth_result['t'], - # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as - 'cid': 'afbe8fd3d73448c9', - 'vid': video_id, - 'QY00001': auth_result['u'], - }) - api_video_url += '?' if '?' 
not in api_video_url else '&' - api_video_url += compat_urllib_parse_urlencode(param) - js = self._download_json( - api_video_url, video_id, - note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) - video_url = js['l'] - video_urls.append( - (video_url, filesize)) - - video_urls_dict[format_id] = video_urls - return video_urls_dict - - def get_format(self, bid): - matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)] - return matched_format_ids[0] if len(matched_format_ids) else None - - def get_bid(self, format_id): - matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id] - return matched_bids[0] if len(matched_bids) else None - - def get_raw_data(self, tvid, video_id, enc_key, _uuid): - tm = str(int(time.time())) - tail = tm + tvid - param = { - 'key': 'fvip', - 'src': md5_text('youtube-dl'), - 'tvId': tvid, 'vid': video_id, - 'vinfo': 1, - 'tm': tm, - 'enc': md5_text(enc_key + tail), - 'qyid': _uuid, - 'tn': random.random(), - # In iQiyi's flash player, um is set to 1 if there's a logged user - # Some 1080P formats are only available with a logged user. - # Here force um=1 to trick the iQiyi server - 'um': 1, - 'authkey': md5_text(md5_text('') + tail), - 'k_tag': 1, + 'cupid': 'qc_100001_100186', + 'type': 'mp4', + 'nolimit': 0, + 'agenttype': 13, + 'src': 'd846d0c32d664d32b6b54ea48997a589', + 'sc': sc, + 't': tm - 7, + '__jsT': None, } - api_url = 'http://cache.video.qiyi.com/vms' + '?' + \ - compat_urllib_parse_urlencode(param) - raw_data = self._download_json(api_url, video_id) - return raw_data - - def get_enc_key(self, video_id): - # TODO: automatic key extraction - # last update at 2016-01-22 for Zombie::bite - enc_key = '4a1caba4b4465345366f28da7c117d20' - return enc_key + headers = {} + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + headers['Ytdl-request-proxy'] = cn_verification_proxy + return self._download_json( + 'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id), + video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='), + query=params, headers=headers) def _extract_playlist(self, webpage): PAGE_SIZE = 50 @@ -573,58 +436,27 @@ class IqiyiIE(InfoExtractor): r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') video_id = self._search_regex( r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') - _uuid = uuid.uuid4().hex - enc_key = self.get_enc_key(video_id) + for _ in range(5): + raw_data = self.get_raw_data(tvid, video_id) - raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid) + if raw_data['code'] != 'A00000': + if raw_data['code'] == 'A00111': + self.raise_geo_restricted() + raise ExtractorError('Unable to load data. Error code: ' + raw_data['code']) - if raw_data['code'] != 'A000000': - raise ExtractorError('Unable to load data. 
Error code: ' + raw_data['code']) + data = raw_data['data'] - data = raw_data['data'] + # iQiYi sometimes returns Ads + if not isinstance(data['playInfo'], dict): + self._sleep(5, video_id) + continue - title = data['vi']['vn'] + title = data['playInfo']['an'] + break - # generate video_urls_dict - video_urls_dict = self.construct_video_urls( - data, video_id, _uuid, tvid) - - # construct info - entries = [] - for format_id in video_urls_dict: - video_urls = video_urls_dict[format_id] - for i, video_url_info in enumerate(video_urls): - if len(entries) < i + 1: - entries.append({'formats': []}) - entries[i]['formats'].append( - { - 'url': video_url_info[0], - 'filesize': video_url_info[-1], - 'format_id': format_id, - 'preference': int(self.get_bid(format_id)) - } - ) - - for i in range(len(entries)): - self._sort_formats(entries[i]['formats']) - entries[i].update( - { - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - } - ) - - if len(entries) > 1: - info = { - '_type': 'multi_video', - 'id': video_id, - 'title': title, - 'entries': entries, - } - else: - info = entries[0] - info['id'] = video_id - info['title'] = title - - return info + return { + 'id': video_id, + 'title': title, + 'url': data['m3u'], + } From fc3996bfe15deae02f4d8f1f4dc34a89fb8bfb03 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 26 Jun 2016 15:45:41 +0800 Subject: [PATCH 094/387] [iqiyi] Remove codes for debugging --- youtube_dl/extractor/iqiyi.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index b717ca09c..fea26685e 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -180,9 +180,7 @@ class IqiyiIE(InfoExtractor): 'ext': 'mp4', 'title': '名侦探柯南 国语版', }, - 'params': { - 'cn_verification_proxy': 'http://proxy.uku.im:443/', - }, + 'skip': 'Geo-restricted to China', }, { 'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html', 'only_matching': True, From 3c9c088f9c51cce86d3df878feba1884c0234df5 Mon Sep 17 00:00:00 2001 From: TRox1972 <TRox1972@users.noreply.github.com> Date: Sun, 12 Jun 2016 03:18:56 +0200 Subject: [PATCH 095/387] [Vidbit] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/vidbit.py | 36 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 youtube_dl/extractor/vidbit.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9f98a1490..5ccac7c0c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -916,6 +916,7 @@ from .vice import ( ViceIE, ViceShowIE, ) +from .vidbit import VidbitIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE diff --git a/youtube_dl/extractor/vidbit.py b/youtube_dl/extractor/vidbit.py new file mode 100644 index 000000000..39d508962 --- /dev/null +++ b/youtube_dl/extractor/vidbit.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import url_basename +from ..compat import compat_urlparse + + +class VidbitIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vidbit\.co/watch\?v=(?P<id>[\w-]+)' + _TEST = { + 'url': 'http://www.vidbit.co/watch?v=MrM7LeaMJq', + 'md5': 'f1a579a93282a78de7e1c53220ef0f12', + 'info_dict': { + 'id': 'MrM7LeaMJq', + 'ext': 'mp4', + 'title': 'RoboCop (1987) - Dick You\'re Fired', + 'thumbnail': 
'http://www.vidbit.co/thumbnails/MrM7LeaMJq.jpg', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + return { + 'id': video_id, + 'title': self._html_search_regex(r'<h1>(.+)</h1>', webpage, 'title'), + 'url': compat_urlparse.urljoin(url, self._html_search_regex(r'file:\s*(["\'])((?:(?!\1).)+)\1', + webpage, 'video URL', group=2)), + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._html_search_regex(r'description:(["\'])((?:(?!\1).)+)\1', + webpage, 'description', None, group=2), + } From 88d9f6c0c4c3d1d2179ee4fe0af560f500e62579 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Jun 2016 16:57:14 +0700 Subject: [PATCH 096/387] [utils] Add support for name list in _html_search_meta --- test/test_InfoExtractor.py | 7 ++++++- youtube_dl/extractor/common.py | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 6404ac89f..88e8ff904 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -11,7 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor -from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError +from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError class TestIE(InfoExtractor): @@ -66,6 +66,11 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(ie._html_search_meta('d', html), '4') self.assertEqual(ie._html_search_meta('e', html), '5') self.assertEqual(ie._html_search_meta('f', html), '6') + self.assertEqual(ie._html_search_meta(('a', 'b', 'c'), html), '1') + self.assertEqual(ie._html_search_meta(('c', 'b', 'a'), html), '3') + self.assertEqual(ie._html_search_meta(('z', 'x', 'c'), html), '3') + self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True) + self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) def test_download_json(self): uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5a2603b50..4eda4e2ea 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -749,10 +749,12 @@ class InfoExtractor(object): return self._og_search_property('url', html, **kargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): + if not isinstance(name, (list, tuple)): + name = [name] if display_name is None: - display_name = name + display_name = name[0] return self._html_search_regex( - self._meta_regex(name), + [self._meta_regex(n) for n in name], html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): From f484c5fa257420201768158aaec31af25f904f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Jun 2016 16:59:28 +0700 Subject: [PATCH 097/387] [vidbit] Improve (Closes #9759) --- youtube_dl/extractor/vidbit.py | 84 ++++++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/vidbit.py b/youtube_dl/extractor/vidbit.py index 39d508962..e7ac5a842 100644 --- a/youtube_dl/extractor/vidbit.py +++ b/youtube_dl/extractor/vidbit.py @@ -1,36 +1,84 @@ -# 
coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import url_basename from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + remove_end, + unified_strdate, +) class VidbitIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidbit\.co/watch\?v=(?P<id>[\w-]+)' - _TEST = { - 'url': 'http://www.vidbit.co/watch?v=MrM7LeaMJq', - 'md5': 'f1a579a93282a78de7e1c53220ef0f12', + _VALID_URL = r'https?://(?:www\.)?vidbit\.co/(?:watch|embed)\?.*?\bv=(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'http://www.vidbit.co/watch?v=jkL2yDOEq2', + 'md5': '1a34b7f14defe3b8fafca9796892924d', 'info_dict': { - 'id': 'MrM7LeaMJq', + 'id': 'jkL2yDOEq2', 'ext': 'mp4', - 'title': 'RoboCop (1987) - Dick You\'re Fired', - 'thumbnail': 'http://www.vidbit.co/thumbnails/MrM7LeaMJq.jpg', + 'title': 'Intro to VidBit', + 'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7', + 'thumbnail': 're:https?://.*\.jpg$', + 'upload_date': '20160618', + 'view_count': int, + 'comment_count': int, } - } + }, { + 'url': 'http://www.vidbit.co/embed?v=jkL2yDOEq2&auto=0&water=0', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + webpage = self._download_webpage( + compat_urlparse.urljoin(url, '/watch?v=%s' % video_id), video_id) + + video_url, title = [None] * 2 + + config = self._parse_json(self._search_regex( + r'(?s)\.setup\(({.+?})\);', webpage, 'setup', default='{}'), + video_id, transform_source=js_to_json) + if config: + if config.get('file'): + video_url = compat_urlparse.urljoin(url, config['file']) + title = config.get('title') + + if not video_url: + video_url = compat_urlparse.urljoin(url, self._search_regex( + r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'video URL', group='url')) + + if not title: + title = remove_end( + self._html_search_regex( + (r'<h1>(.+?)</h1>', r'<title>(.+?)'), + webpage, 'title', default=None) or self._og_search_title(webpage), + ' - VidBit') + + description = self._html_search_meta( + ('description', 'og:description', 'twitter:description'), + webpage, 'description') + + upload_date = unified_strdate(self._html_search_meta( + 'datePublished', webpage, 'upload date')) + + view_count = int_or_none(self._search_regex( + r'(\d+) views', + webpage, 'view count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r'id=["\']cmt_num["\'][^>]*>\((\d+)\)', + webpage, 'comment count', fatal=False)) return { 'id': video_id, - 'title': self._html_search_regex(r'
<h1>(.+)</h1>
', webpage, 'title'), - 'url': compat_urlparse.urljoin(url, self._html_search_regex(r'file:\s*(["\'])((?:(?!\1).)+)\1', - webpage, 'video URL', group=2)), + 'url': video_url, + 'title': title, + 'description': description, 'thumbnail': self._og_search_thumbnail(webpage), - 'description': self._html_search_regex(r'description:(["\'])((?:(?!\1).)+)\1', - webpage, 'description', None, group=2), + 'upload_date': upload_date, + 'view_count': view_count, + 'comment_count': comment_count, } From 4f3c5e062715bb8c2084bda139ddcd9a2036f267 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 16 Mar 2016 18:48:06 +0100 Subject: [PATCH 098/387] [utils] add helper function for parsing codecs --- test/test_utils.py | 24 ++++++++++++++++++++++++ youtube_dl/utils.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index b7ef51f8d..d84eb438f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -78,6 +78,7 @@ from youtube_dl.utils import ( cli_option, cli_valueless_option, cli_bool_option, + parse_codecs, ) from youtube_dl.compat import ( compat_chr, @@ -579,6 +580,29 @@ class TestUtil(unittest.TestCase): limit_length('foo bar baz asd', 12).startswith('foo bar')) self.assertTrue('...' in limit_length('foo bar baz asd', 12)) + def test_parse_codecs(self): + self.assertEqual(parse_codecs(''), {}) + self.assertEqual(parse_codecs('avc1.77.30, mp4a.40.2'), { + 'vcodec': 'avc1.77.30', + 'acodec': 'mp4a.40.2', + }) + self.assertEqual(parse_codecs('mp4a.40.2'), { + 'vcodec': 'none', + 'acodec': 'mp4a.40.2', + }) + self.assertEqual(parse_codecs('mp4a.40.5,avc1.42001e'), { + 'vcodec': 'avc1.42001e', + 'acodec': 'mp4a.40.5', + }) + self.assertEqual(parse_codecs('avc3.640028'), { + 'vcodec': 'avc3.640028', + 'acodec': 'none', + }) + self.assertEqual(parse_codecs(', h264,,newcodec,aac'), { + 'vcodec': 'h264', + 'acodec': 'aac', + }) + def test_escape_rfc3986(self): reserved = "!*'();:@&=+$,/?#[]" unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 562031fe1..fe175e82c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2060,6 +2060,42 @@ def mimetype2ext(mt): }.get(res, res) +def parse_codecs(codecs_str): + # http://tools.ietf.org/html/rfc6381 + if not codecs_str: + return {} + splited_codecs = list(filter(None, map( + lambda str: str.strip(), codecs_str.strip().strip(',').split(',')))) + vcodec, acodec = None, None + for full_codec in splited_codecs: + codec = full_codec.split('.')[0] + if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'): + if not vcodec: + vcodec = full_codec + elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'): + if not acodec: + acodec = full_codec + else: + write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr) + if not vcodec and not acodec: + if len(splited_codecs) == 2: + return { + 'vcodec': vcodec, + 'acodec': acodec, + } + elif len(splited_codecs) == 1: + return { + 'vcodec': 'none', + 'acodec': vcodec, + } + else: + return { + 'vcodec': vcodec or 'none', + 'acodec': acodec or 'none', + } + return {} + + def urlhandle_detect_ext(url_handle): getheader = url_handle.headers.get From 59bbe4911acd4493bf407925bfdeb1ad03db6ef3 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 16 Mar 2016 18:50:45 +0100 Subject: [PATCH 099/387] [extractor/common] add helper method to extract html5 media entries --- youtube_dl/extractor/common.py | 58 
++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5a2603b50..661889593 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -54,6 +54,8 @@ from ..utils import ( update_Request, update_url_query, parse_m3u8_attributes, + extract_attributes, + parse_codecs, ) @@ -1610,6 +1612,62 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats + def _parse_html5_media_entries(self, base_url, webpage): + def absolute_url(video_url): + return compat_urlparse.urljoin(base_url, video_url) + + def parse_content_type(content_type): + if not content_type: + return {} + ctr = re.search(r'(?P[^/]+/[^;]+)(?:;\s*codecs="?(?P[^"]+))?', content_type) + if ctr: + mimetype, codecs = ctr.groups() + f = parse_codecs(codecs) + f['ext'] = mimetype2ext(mimetype) + return f + return {} + + entries = [] + for media_tag, media_type, media_content in re.findall(r'(?s)(<(?Pvideo|audio)[^>]*>)(.*?)', webpage): + media_info = { + 'formats': [], + 'subtitles': {}, + } + media_attributes = extract_attributes(media_tag) + src = media_attributes.get('src') + if src: + media_info['formats'].append({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['thumbnail'] = media_attributes.get('poster') + if media_content: + for source_tag in re.findall(r']+>', media_content): + source_attributes = extract_attributes(source_tag) + src = source_attributes.get('src') + if not src: + continue + f = parse_content_type(source_attributes.get('type')) + f.update({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['formats'].append(f) + for track_tag in re.findall(r']+>', media_content): + track_attributes = extract_attributes(track_tag) + kind = track_attributes.get('kind') + if not kind or kind == 'subtitles': + src = track_attributes.get('src') + if not src: + continue + lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') + media_info['subtitles'].setdefault(lang, []).append({ + 'url': absolute_url(src), + }) + if media_info['formats']: + entries.append(media_info) + return entries + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() From c6781156aa023c1131db6c5b1f575e1833649b33 Mon Sep 17 00:00:00 2001 From: TRox1972 Date: Thu, 19 May 2016 20:59:59 +0200 Subject: [PATCH 100/387] [MSN] add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/msn.py | 90 ++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 youtube_dl/extractor/msn.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5ccac7c0c..2f9ee1596 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -454,6 +454,7 @@ from .motherless import MotherlessIE from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE +from .msn import MSNIE from .mtv import ( MTVIE, MTVServicesEmbeddedIE, diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py new file mode 100644 index 000000000..4dd57fca0 --- /dev/null +++ b/youtube_dl/extractor/msn.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from .common import InfoExtractor + +from ..utils import ( + unescapeHTML, + 
int_or_none, +) + +class MSNIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?msn\.com/[a-z-]{2,5}(?:/[a-z]+)+/(?P[a-z-]+)/[a-z]{2}-(?P[a-zA-Z]+)' + _TESTS = [{ + 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE', + 'info_dict': { + 'id': 'BBqQYNE', + 'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message', + 'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25', + 'duration': 104, + 'ext': 'mp4', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf', + 'info_dict': { + 'id': 'BBt6ZKf', + 'title': 'All That Bling: Self-Made Millionaire Child Builds Fashion & Jewellery Empire', + 'description': 'md5:8e683bd5c729d5fb16d96539a582aa5e', + 'duration': 350, + 'ext': 'mp4', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + self.report_extraction(display_id) + video_data = self._parse_json(self._html_search_regex(r'data-metadata\s*=\s*["\'](.+)["\']', + webpage, 'video data'), display_id) + + formats = [] + for video_file in video_data.get('videoFiles', []): + if not '.ism' in video_file.get('url', '.ism'): + formats.append({ + 'url': unescapeHTML(video_file.get('url')), + 'ext': 'mp4', + 'width': int_or_none(video_file.get('width')), + 'height': int_or_none(video_file.get('height')), + }) + elif 'm3u8' in video_file.get('url'): + formats.extend(self._extract_m3u8_formats( + video_file.get('url'), display_id, 'mp4')) + # There (often) exists an Microsoft Smooth Streaming manifest + # (.ism) which is not yet supported + # (https://github.com/rg3/youtube-dl/issues/8118) + + self._sort_formats(formats) + + subtitles = {} + for f in video_data.get('files', []): + if f.get('formatCode', '') == '3100': + lang = f.get('culture', '') + if not lang: + continue + subtitles.setdefault(lang, []).append({ + 'ext': 'ttml', + 'url': unescapeHTML(f.get('url')), + }) + + return { + 'id': video_id, + 'title': video_data['title'], + 'formats': formats, + 'thumbnail': video_data.get('headlineImage', {}).get('url'), + 'description': video_data.get('description'), + 'creator': video_data.get('creator'), + 'subtitles': subtitles, + 'duration': int_or_none(video_data.get('durationSecs')), + } From bf8dd790456acc4a96d2961e61e96c4771e4d787 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 26 Jun 2016 21:09:07 +0700 Subject: [PATCH 101/387] [extractor/common] Fix sorting with custom field preference --- youtube_dl/extractor/common.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4eda4e2ea..e6c15de42 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -878,7 +878,11 @@ class InfoExtractor(object): f['ext'] = determine_ext(f['url']) if isinstance(field_preference, (list, tuple)): - return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) + return tuple( + f.get(field) + if f.get(field) is not None + else ('' if field == 'format_id' else -1) + for field in field_preference) preference = f.get('preference') if preference is None: From f1f336322da6e719cf4298b08680c3e903e956c4 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 26 Jun 2016 21:10:05 +0700 Subject: [PATCH 102/387] [msn] Fix extraction (Closes #8960, closes #9542) --- youtube_dl/extractor/msn.py | 125 ++++++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 48 deletions(-) diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py index 4dd57fca0..d4569e325 100644 --- a/youtube_dl/extractor/msn.py +++ b/youtube_dl/extractor/msn.py @@ -2,41 +2,42 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - unescapeHTML, + determine_ext, + ExtractorError, int_or_none, + unescapeHTML, ) + class MSNIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?msn\.com/[a-z-]{2,5}(?:/[a-z]+)+/(?P[a-z-]+)/[a-z]{2}-(?P[a-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?msn\.com/(?:[^/]+/)+(?P[^/]+)/[a-z]{2}-(?P[\da-zA-Z]+)' _TESTS = [{ 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE', + 'md5': '8442f66c116cbab1ff7098f986983458', 'info_dict': { 'id': 'BBqQYNE', + 'display_id': 'criminal-minds-shemar-moore-shares-a-touching-goodbye-message', + 'ext': 'mp4', 'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message', 'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25', 'duration': 104, - 'ext': 'mp4', + 'uploader': 'CBS Entertainment', + 'uploader_id': 'IT0X5aoJ6bJgYerJXSDCgFmYPB1__54v', }, - 'params': { - # m3u8 download - 'skip_download': True, - } }, { 'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf', - 'info_dict': { - 'id': 'BBt6ZKf', - 'title': 'All That Bling: Self-Made Millionaire Child Builds Fashion & Jewellery Empire', - 'description': 'md5:8e683bd5c729d5fb16d96539a582aa5e', - 'duration': 350, - 'ext': 'mp4', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } + 'only_matching': True, + }, { + 'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH', + 'only_matching': True, + }, { + # geo restricted + 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU', + 'only_matching': True, }] def _real_extract(self, url): @@ -45,46 +46,74 @@ class MSNIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - self.report_extraction(display_id) - video_data = self._parse_json(self._html_search_regex(r'data-metadata\s*=\s*["\'](.+)["\']', - webpage, 'video data'), display_id) + video = self._parse_json( + self._search_regex( + r'data-metadata\s*=\s*(["\'])(?P.+?)\1', + webpage, 'video data', default='{}', group='data'), + display_id, transform_source=unescapeHTML) + + if not video: + error = unescapeHTML(self._search_regex( + r'data-error=(["\'])(?P.+?)\1', + webpage, 'error', group='error')) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + title = video['title'] formats = [] - for video_file in video_data.get('videoFiles', []): - if not '.ism' in video_file.get('url', '.ism'): + for file_ in video.get('videoFiles', []): + format_url = file_.get('url') + if not format_url: + continue + ext = determine_ext(format_url) + # .ism is not yet supported (see + # https://github.com/rg3/youtube-dl/issues/8118) + if ext == 'ism': + continue + if 'm3u8' in format_url: + # m3u8_native should not be used here until + # https://github.com/rg3/youtube-dl/issues/9913 is fixed 
+ m3u8_formats = self._extract_m3u8_formats( + format_url, display_id, 'mp4', + m3u8_id='hls', fatal=False) + # Despite metadata in m3u8 all video+audio formats are + # actually video-only (no audio) + for f in m3u8_formats: + if f.get('acodec') != 'none' and f.get('vcodec') != 'none': + f['acodec'] = 'none' + formats.extend(m3u8_formats) + else: formats.append({ - 'url': unescapeHTML(video_file.get('url')), + 'url': format_url, 'ext': 'mp4', - 'width': int_or_none(video_file.get('width')), - 'height': int_or_none(video_file.get('height')), + 'format_id': 'http', + 'width': int_or_none(file_.get('width')), + 'height': int_or_none(file_.get('height')), }) - elif 'm3u8' in video_file.get('url'): - formats.extend(self._extract_m3u8_formats( - video_file.get('url'), display_id, 'mp4')) - # There (often) exists an Microsoft Smooth Streaming manifest - # (.ism) which is not yet supported - # (https://github.com/rg3/youtube-dl/issues/8118) - self._sort_formats(formats) subtitles = {} - for f in video_data.get('files', []): - if f.get('formatCode', '') == '3100': - lang = f.get('culture', '') - if not lang: - continue - subtitles.setdefault(lang, []).append({ - 'ext': 'ttml', - 'url': unescapeHTML(f.get('url')), + for file_ in video.get('files', []): + format_url = file_.get('url') + format_code = file_.get('formatCode') + if not format_url or not format_code: + continue + if compat_str(format_code) == '3100': + subtitles.setdefault(file_.get('culture', 'en'), []).append({ + 'ext': determine_ext(format_url, 'ttml'), + 'url': format_url, }) return { 'id': video_id, - 'title': video_data['title'], - 'formats': formats, - 'thumbnail': video_data.get('headlineImage', {}).get('url'), - 'description': video_data.get('description'), - 'creator': video_data.get('creator'), + 'display_id': display_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('headlineImage', {}).get('url'), + 'duration': int_or_none(video.get('durationSecs')), + 'uploader': video.get('sourceFriendly'), + 'uploader_id': video.get('providerId'), + 'creator': video.get('creator'), 'subtitles': subtitles, - 'duration': int_or_none(video_data.get('durationSecs')), + 'formats': formats, } From 92747e664a70e6739644a9c2b3abfbdcc68fd136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 26 Jun 2016 21:15:24 +0700 Subject: [PATCH 103/387] release 2016.06.26 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c73f9a904..63b687fef 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.25*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.25** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.26*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.26** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.25 +[debug] youtube-dl version 2016.06.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 891499f59..a725e8c6b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -385,6 +385,7 @@ - **MovieFap** - **Moviezine** - **MPORA** + - **MSN** - **MTV** - **mtv.de** - **mtviggy.com** @@ -501,6 +502,7 @@ - **plus.google**: Google Plus - **pluzz.francetv.fr** - **podomatic** + - **PolskieRadio** - **PornHd** - **PornHub** - **PornHubPlaylist** @@ -736,6 +738,7 @@ - **vh1.com** - **Vice** - **ViceShow** + - **Vidbit** - **Viddler** - **video.google:search**: Google Video search - **video.mit.edu** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2b7a4c98d..52de19517 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.25' +__version__ = '2016.06.26' From b0c200f1ec594b7c7d5a5023853970ff789a3470 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 26 Jun 2016 22:02:46 +0700 Subject: [PATCH 104/387] [msn] Add test URL with non-alphanumeric characters --- youtube_dl/extractor/msn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py index d4569e325..1ec8e0f50 100644 --- a/youtube_dl/extractor/msn.py +++ b/youtube_dl/extractor/msn.py @@ -38,6 +38,9 @@ class MSNIE(InfoExtractor): # geo restricted 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU', 'only_matching': True, + }, { + 'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6', + 'only_matching': True, }] def _real_extract(self, url): From 427cd050a3b64319c19e4596d8885378604e388e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 27 Jun 2016 04:11:53 +0700 Subject: [PATCH 105/387] [extractor/generic] Improve kaltura embed detection (Closes #9911) --- youtube_dl/extractor/generic.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1592a8a3a..26a7d10be 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -920,6 +920,24 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # Kaltura embedded via quoted entry_id + 'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures', + 'info_dict': { + 'id': '0_utuok90b', + 'ext': 'mp4', + 'title': '06_matthew_brender_raj_dutt', + 'timestamp': 1466638791, + 'upload_date': '20160622', + }, + 'add_ie': ['Kaltura'], + 'expected_warnings': [ + 'Could not send HEAD 
request' + ], + 'params': { + 'skip_download': True, + } + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -1909,7 +1927,7 @@ class GenericIE(InfoExtractor): # Look for Kaltura embeds mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P['\"])wid(?P=q1)\s*:\s*(?P['\"])_?(?P[^'\"]+)(?P=q2),.*?(?P['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P['\"])(?P[^'\"]+)(?P=q4),", webpage) or - re.search(r'(?s)(?P["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P["\'])(?P.+?)(?P=q2)', webpage)) + re.search(r'(?s)(?P["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P\d+).*?(?P=q1).*?(?P["\'])?entry_?[Ii]d(?P=q2)\s*:\s*(?P["\'])(?P.+?)(?P=q3)', webpage)) if mobj is not None: return self.url_result(smuggle_url( 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), From 81fda1536924db0ec4f583ae83bc77cb91ca6835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 27 Jun 2016 05:07:12 +0700 Subject: [PATCH 106/387] [sr:mediathek] Clarify IE_NAME --- youtube_dl/extractor/srmediathek.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index 74d01183f..a2569dfba 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -9,6 +9,7 @@ from ..utils import ( class SRMediathekIE(ARDMediathekIE): + IE_NAME = 'sr:mediathek' IE_DESC = 'Saarländischer Rundfunk' _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P[0-9]+)' From f41ffc00d15697c6d4c8975d261ffd5b0c5e971f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 27 Jun 2016 05:08:09 +0700 Subject: [PATCH 107/387] [skynewsarabia:article] Clarify IE_NAME --- youtube_dl/extractor/skynewsarabia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/skynewsarabia.py b/youtube_dl/extractor/skynewsarabia.py index 05e1b02ad..fffc9aa22 100644 --- a/youtube_dl/extractor/skynewsarabia.py +++ b/youtube_dl/extractor/skynewsarabia.py @@ -67,7 +67,7 @@ class SkyNewsArabiaIE(SkyNewsArabiaBaseIE): class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE): - IE_NAME = 'skynewsarabia:video' + IE_NAME = 'skynewsarabia:article' _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P[0-9]+)' _TESTS = [{ 'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9', From e3a6747d8f19ad0ba8aee7c3214cdb64903beba0 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 26 Jun 2016 23:31:55 +0100 Subject: [PATCH 108/387] New test-case: extractor names are supposed to be unique @dstftw explained in https://github.com/rg3/youtube-dl/pull/9918#issuecomment-228625878 that extractor names are supposed to be unique. @dstftw has fixed the two offending extractors, and here I add a test to ensure this does not happen in the future. 
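For reference, the check amounts to grouping extractor classes by their
lowercased IE_NAME and flagging any group with more than one member. A minimal
standalone sketch of the same idea (outside the unittest harness; the helper
name find_duplicate_ie_names is only illustrative, not part of the codebase):

    import collections

    from youtube_dl.extractor import gen_extractors

    def find_duplicate_ie_names(extractors):
        # Group extractor class names by case-insensitive IE_NAME.
        buckets = collections.defaultdict(list)
        for ie in extractors:
            buckets[ie.IE_NAME.lower()].append(type(ie).__name__)
        # Any bucket holding two or more classes is a naming collision.
        return {name: classes for name, classes in buckets.items()
                if len(classes) > 1}

    duplicates = find_duplicate_ie_names(gen_extractors())
    assert not duplicates, 'Duplicated IE_NAMEs: %r' % duplicates
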
--- test/test_all_urls.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index f5af184e6..133d438eb 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -6,6 +6,7 @@ from __future__ import unicode_literals import os import sys import unittest +import collections sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -130,6 +131,13 @@ class TestAllURLsMatching(unittest.TestCase): 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', ['Yahoo']) + def test_no_duplicated_ie_names(self): + name_accu = collections.defaultdict(list) + for ie in self.ies: + name_accu[ie.IE_NAME.lower()].append(ie) + for (ie_name,ie_list) in name_accu.items(): + self.assertEqual(len(ie_list), 1, 'Only 1 extractor with IE_NAME "%s" (%s)' % (ie_name, ie_list)) + if __name__ == '__main__': unittest.main() From fd7a7498a47c5d79663ec8d86a87325aa634c652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 27 Jun 2016 22:11:45 +0700 Subject: [PATCH 109/387] [test_all_urls] PEP 8 and change wording --- test/test_all_urls.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 133d438eb..1f6079c29 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -134,9 +134,11 @@ class TestAllURLsMatching(unittest.TestCase): def test_no_duplicated_ie_names(self): name_accu = collections.defaultdict(list) for ie in self.ies: - name_accu[ie.IE_NAME.lower()].append(ie) - for (ie_name,ie_list) in name_accu.items(): - self.assertEqual(len(ie_list), 1, 'Only 1 extractor with IE_NAME "%s" (%s)' % (ie_name, ie_list)) + name_accu[ie.IE_NAME.lower()].append(type(ie).__name__) + for (ie_name, ie_list) in name_accu.items(): + self.assertEqual( + len(ie_list), 1, + 'Multiple extractors with the same IE_NAME "%s" (%s)' % (ie_name, ', '.join(ie_list))) if __name__ == '__main__': From 9ea5c04c0d16f5519079ae04fdad62fc28c884b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 27 Jun 2016 22:44:17 +0700 Subject: [PATCH 110/387] [kaltura] Add _extract_url with fixed regex --- youtube_dl/extractor/kaltura.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index a65697ff5..c75a958ba 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -64,6 +64,32 @@ class KalturaIE(InfoExtractor): } ] + @staticmethod + def _extract_url(webpage): + mobj = ( + re.search( + r"""(?xs) + kWidget\.(?:thumb)?[Ee]mbed\( + \{.*? + (?P['\"])wid(?P=q1)\s*:\s* + (?P['\"])_?(?P[^'\"]+)(?P=q2),.*? + (?P['\"])entry_?[Ii]d(?P=q3)\s*:\s* + (?P['\"])(?P[^'\"]+)(?P=q4), + """, webpage) or + re.search( + r'''(?xs) + (?P["\']) + (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P\d+).*? + (?P=q1).*? 
+ (?: + entry_?[Ii]d| + (?P["\'])entry_?[Ii]d(?P=q2) + )\s*:\s* + (?P["\'])(?P.+?)(?P=q3) + ''', webpage)) + if mobj: + return 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict() + def _kaltura_api_call(self, video_id, actions, *args, **kwargs): params = actions[0] if len(actions) > 1: From c287f2bc6073182323aada26309539d724943fa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 27 Jun 2016 22:45:26 +0700 Subject: [PATCH 111/387] [extractor/generic] Use _extract_url for kaltura embeds (Closes #9922) --- youtube_dl/extractor/generic.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 26a7d10be..2188f8bb2 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -64,6 +64,7 @@ from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE from .theplatform import ThePlatformIE from .vessel import VesselIE +from .kaltura import KalturaIE class GenericIE(InfoExtractor): @@ -1926,12 +1927,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'Zapiks') # Look for Kaltura embeds - mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P['\"])wid(?P=q1)\s*:\s*(?P['\"])_?(?P[^'\"]+)(?P=q2),.*?(?P['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P['\"])(?P[^'\"]+)(?P=q4),", webpage) or - re.search(r'(?s)(?P["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P\d+).*?(?P=q1).*?(?P["\'])?entry_?[Ii]d(?P=q2)\s*:\s*(?P["\'])(?P.+?)(?P=q3)', webpage)) - if mobj is not None: - return self.url_result(smuggle_url( - 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), - {'source_url': url}), 'Kaltura') + kaltura_url = KalturaIE._extract_url(webpage) + if kaltura_url: + return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) # Look for Eagle.Platform embeds mobj = re.search( From 8704778d95d2abef021757c85efd75664c6a424a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 27 Jun 2016 23:06:42 +0700 Subject: [PATCH 112/387] [pbs] Check manually constructed http links (Closes #9921) --- youtube_dl/extractor/pbs.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 81918ac6e..f6f423597 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -516,9 +516,14 @@ class PBSIE(InfoExtractor): # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'): continue + f_url = re.sub(r'\d+k|baseline', bitrate, http_url) + # This may produce invalid links sometimes (e.g. 
+ # http://www.pbs.org/wgbh/frontline/film/suicide-plan) + if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate): + continue f = m3u8_format.copy() f.update({ - 'url': re.sub(r'\d+k|baseline', bitrate, http_url), + 'url': f_url, 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) From 8174d0fe95db736f0fc53fd5e2d25c0c471a97fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 27 Jun 2016 23:09:39 +0700 Subject: [PATCH 113/387] release 2016.06.27 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 4 ++-- youtube_dl/version.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 63b687fef..f9a1aa990 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.26*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.26** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.27*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.27** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.26 +[debug] youtube-dl version 2016.06.27 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a725e8c6b..2a94f4feb 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -45,7 +45,6 @@ - **archive.org**: archive.org videos - **ARD** - **ARD:mediathek** - - **ARD:mediathek**: Saarländischer Rundfunk - **arte.tv** - **arte.tv:+7** - **arte.tv:cinema** @@ -588,7 +587,7 @@ - **Shared**: shared.sx and vivo.sx - **ShareSix** - **Sina** - - **skynewsarabia:video** + - **skynewsarabia:article** - **skynewsarabia:video** - **Slideshare** - **Slutload** @@ -621,6 +620,7 @@ - **SportBoxEmbed** - **SportDeutschland** - **Sportschau** + - **sr:mediathek**: Saarländischer Rundfunk - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - **SSA** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 52de19517..2dd24dec1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.26' +__version__ = '2016.06.27' From 32616c14ccf5051484e9236c54bdbeccb6e85ee4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 28 Jun 2016 14:02:03 +0100 Subject: [PATCH 114/387] [vrt] extract all 
formats --- youtube_dl/extractor/vrt.py | 61 ++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index 8e35f24e8..bec7ab327 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -25,7 +25,8 @@ class VRTIE(InfoExtractor): 'timestamp': 1414271750.949, 'upload_date': '20141025', 'duration': 929, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, # sporza.be { @@ -39,7 +40,8 @@ class VRTIE(InfoExtractor): 'timestamp': 1413835980.560, 'upload_date': '20141020', 'duration': 3238, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, # cobra.be { @@ -53,16 +55,39 @@ class VRTIE(InfoExtractor): 'timestamp': 1413967500.494, 'upload_date': '20141022', 'duration': 661, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { # YouTube video 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957', - 'only_matching': True, + 'md5': 'b8b93da1df1cea6c8556255a796b7d61', + 'info_dict': { + 'id': 'Wji-BZ0oCwg', + 'ext': 'mp4', + 'title': 'ROGUE ONE: A STAR WARS STORY Official Teaser Trailer', + 'description': 'md5:8e468944dce15567a786a67f74262583', + 'uploader': 'Star Wars', + 'uploader_id': 'starwars', + 'upload_date': '20160407', + }, + 'add_ie': ['Youtube'], }, { 'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055', - 'only_matching': True, + 'md5': '', + 'info_dict': { + 'id': '2377055', + 'ext': 'mp4', + 'title': 'Cafe Derby', + 'description': 'Lenny Van Wesemael debuteert met de langspeelfilm Café Derby. Een waar gebeurd maar ook verzonnen verhaal.', + 'upload_date': '20150626', + 'timestamp': 1435305240.769, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } } ] @@ -98,6 +123,32 @@ class VRTIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( src, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + src.replace('playlist.m3u8', 'manifest.f4m'), + video_id, f4m_id='hds', fatal=False)) + if 'data-video-geoblocking="true"' not in webpage: + rtmp_formats = self._extract_smil_formats( + src.replace('playlist.m3u8', 'jwplayer.smil'), + video_id, fatal=False) + formats.extend(rtmp_formats) + for rtmp_format in rtmp_formats: + rtmp_format_c = rtmp_format.copy() + rtmp_format_c['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + del rtmp_format_c['play_path'] + del rtmp_format_c['ext'] + http_format = rtmp_format_c.copy() + http_format.update({ + 'url': rtmp_format_c['url'].replace('rtmp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'http'), + 'protocol': 'http', + }) + rtsp_format = rtmp_format_c.copy() + rtsp_format.update({ + 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'protocol': 'rtsp', + }) + formats.extend([http_format, rtsp_format]) else: formats.extend(self._extract_f4m_formats( '%s/manifest.f4m' % src, video_id, f4m_id='hds', fatal=False)) From 97124e572db5f5d5d11ee630aeb18a4c5585d087 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 28 Jun 2016 22:39:53 +0700 Subject: [PATCH 115/387] [arte:playlist] Fix test --- youtube_dl/extractor/arte.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 049f1fa9e..e0c5c1804 100644 --- 
a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -419,6 +419,7 @@ class ArteTVPlaylistIE(ArteTVBaseIE): 'info_dict': { 'id': 'PL-013263', 'title': 'Areva & Uramin', + 'description': 'md5:a1dc0312ce357c262259139cfd48c9bf', }, 'playlist_mincount': 6, }, { From 42362fdb5e780611b7054e52eb28621f5a9fd7ba Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 29 Jun 2016 15:49:17 +0100 Subject: [PATCH 116/387] [aenetworks] add support for show and season for A&E Network sites and History topics(closes #9816) --- youtube_dl/extractor/aenetworks.py | 181 +++++++++++++++++++++-------- youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 135 insertions(+), 51 deletions(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 1bbfe2641..cbde8b46e 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -7,18 +7,118 @@ from ..utils import ( smuggle_url, update_url_query, unescapeHTML, + extract_attributes, +) +from ..compat import ( + compat_urlparse, ) -class AENetworksIE(InfoExtractor): +class AENetworksBaseIE(InfoExtractor): + def theplatform_url_result(self, theplatform_url, video_id, query): + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': smuggle_url( + update_url_query(theplatform_url, query), + { + 'sig': { + 'key': 'crazyjava', + 'secret': 's3cr3t' + }, + 'force_smil_url': True + }), + 'ie_key': 'ThePlatform', + } + + +class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' - _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P[^/]+)/(?:[^/]+/)+(?P[^/]+?)(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/shows/(?P[^/]+(?:/[^/]+){0,2})' + _TESTS = [{ + 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', + 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', + 'info_dict': { + 'id': '22253814', + 'ext': 'mp4', + 'title': 'Winter Is Coming', + 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', + 'timestamp': 1338306241, + 'upload_date': '20120529', + 'uploader': 'AENE-NEW', + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'http://www.history.com/shows/ancient-aliens/season-1', + 'info_dict': { + 'id': '71889446852', + }, + 'playlist_mincount': 5, + }, { + 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', + 'info_dict': { + 'id': 'SERIES4317', + 'title': 'Atlanta Plastic', + }, + 'playlist_mincount': 2, + }, { + 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', + 'only_matching': True + }, { + 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', + 'only_matching': True + }, { + 'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6', + 'only_matching': True + }] + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + url_parts = display_id.split('/') + url_parts_len = len(url_parts) + if url_parts_len == 1: + entries = [] + for season_url_path in re.findall(r'(?s)]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): + entries.append(self.url_result( + compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) + return self.playlist_result( + entries, self._html_search_meta('aetn:SeriesId', webpage), + self._html_search_meta('aetn:SeriesTitle', webpage)) + elif url_parts_len == 2: + entries = [] + for episode_item in re.findall(r'(?s)]+class="[^"]*episode-item[^"]*"[^>]*>', webpage): 
+ episode_attributes = extract_attributes(episode_item) + episode_url = compat_urlparse.urljoin( + url, episode_attributes['data-canonical']) + entries.append(self.url_result( + episode_url, 'AENetworks', + episode_attributes['data-videoid'])) + return self.playlist_result( + entries, self._html_search_meta('aetn:SeasonId', webpage)) + else: + video_id = self._html_search_meta('aetn:VideoID', webpage) + media_url = self._search_regex( + r"media_url\s*=\s*'([^']+)'", webpage, 'video url') + + info = self._search_json_ld(webpage, video_id, fatal=False) + info.update(self.theplatform_url_result( + media_url, video_id, { + 'mbr': 'true', + 'assetTypes': 'medium_video_s3' + })) + return info + + +class HistoryTopicIE(AENetworksBaseIE): + IE_NAME = 'history:topic' + IE_DESC = 'History.com Topic' + _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P[^/]+)/videos(?:/(?P[^/?#]+))?' _TESTS = [{ 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', 'info_dict': { - 'id': 'g12m5Gyt3fdR', + 'id': '40700995724', 'ext': 'mp4', 'title': "Bet You Didn't Know: Valentine's Day", 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', @@ -31,57 +131,38 @@ class AENetworksIE(InfoExtractor): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - 'expected_warnings': ['JSON-LD'], }, { - 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', - 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', - 'info_dict': { - 'id': 'eg47EERs_JsZ', - 'ext': 'mp4', - 'title': 'Winter Is Coming', - 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', - 'timestamp': 1338306241, - 'upload_date': '20120529', - 'uploader': 'AENE-NEW', + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/videos', + 'info_dict': + { + 'id': 'world-war-i-history', }, - 'add_ie': ['ThePlatform'], + 'playlist_mincount': 24, }, { - 'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry', - 'only_matching': True - }, { - 'url': 'http://www.fyi.tv/shows/tiny-house-nation/videos/207-sq-ft-minnesota-prairie-cottage', - 'only_matching': True - }, { - 'url': 'http://www.mylifetime.com/shows/project-runway-junior/video/season-1/episode-6/superstar-clients', - 'only_matching': True + 'url': 'http://www.history.com/topics/world-war-i-history/videos', + 'only_matching': True, }] def _real_extract(self, url): - page_type, video_id = re.match(self._VALID_URL, url).groups() + topic_id, display_id = re.match(self._VALID_URL, url).groups() + if display_id: + webpage = self._download_webpage(url, display_id) + release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups() + release_url = unescapeHTML(release_url) - webpage = self._download_webpage(url, video_id) - - video_url_re = [ - r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id, - r"media_url\s*=\s*'([^']+)'" - ] - video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url')) - query = {'mbr': 'true'} - if page_type == 'shows': - query['assetTypes'] = 'medium_video_s3' - if 'switch=hds' in video_url: - query['switch'] = 'hls' - - info = self._search_json_ld(webpage, video_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'url': smuggle_url( - update_url_query(video_url, query), - { - 'sig': { - 'key': 'crazyjava', - 'secret': 's3cr3t'}, - 'force_smil_url': True - }), - }) - return info + return self.theplatform_url_result( + release_url, video_id, { 
+ 'mbr': 'true', + 'switch': 'hls' + }) + else: + webpage = self._download_webpage(url, topic_id) + entries = [] + for episode_item in re.findall(r']*>', webpage): + video_attributes = extract_attributes(episode_item) + entries.append(self.theplatform_url_result( + video_attributes['data-href'], video_attributes['data-id'], { + 'mbr': 'true', + 'switch': 'hls' + })) + return self.playlist_result(entries, topic_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2f9ee1596..bba88e9eb 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -20,7 +20,10 @@ from .adobetv import ( AdobeTVVideoIE, ) from .adultswim import AdultSwimIE -from .aenetworks import AENetworksIE +from .aenetworks import ( + AENetworksIE, + HistoryTopicIE, +) from .afreecatv import AfreecaTVIE from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE From 4c7821227c54836a17d9c02d4f8d3dcbd97105fc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 29 Jun 2016 16:03:32 +0100 Subject: [PATCH 117/387] [aenetworks:historytopic] fix topic video url --- youtube_dl/extractor/aenetworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index cbde8b46e..1376dd70f 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -161,7 +161,7 @@ class HistoryTopicIE(AENetworksBaseIE): for episode_item in re.findall(r']*>', webpage): video_attributes = extract_attributes(episode_item) entries.append(self.theplatform_url_result( - video_attributes['data-href'], video_attributes['data-id'], { + video_attributes['data-release-url'], video_attributes['data-id'], { 'mbr': 'true', 'switch': 'hls' })) From c58ed8563d37e39235332b35e7feafe32711c623 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 29 Jun 2016 16:18:16 +0100 Subject: [PATCH 118/387] [aenetworks] extract history topic playlist title --- youtube_dl/extractor/aenetworks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 1376dd70f..2536f75d6 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -8,6 +8,7 @@ from ..utils import ( update_url_query, unescapeHTML, extract_attributes, + get_element_by_attribute, ) from ..compat import ( compat_urlparse, @@ -136,6 +137,7 @@ class HistoryTopicIE(AENetworksBaseIE): 'info_dict': { 'id': 'world-war-i-history', + 'title': 'World War I History', }, 'playlist_mincount': 24, }, { @@ -165,4 +167,4 @@ class HistoryTopicIE(AENetworksBaseIE): 'mbr': 'true', 'switch': 'hls' })) - return self.playlist_result(entries, topic_id) + return self.playlist_result(entries, topic_id, get_element_by_attribute('class', 'show-title', webpage)) From 70157c2c43068b23007a6d71e16967ba85b274d5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 29 Jun 2016 16:55:17 +0100 Subject: [PATCH 119/387] [aenetworks] add support for movie pages --- youtube_dl/extractor/aenetworks.py | 80 ++++++++++++++++-------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 2536f75d6..8b60e2ab6 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -36,7 +36,7 @@ class AENetworksBaseIE(InfoExtractor): class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, 
History.com, FYI Network' - _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/shows/(?P[^/]+(?:/[^/]+){0,2})' + _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P[^/]+(?:/[^/]+){0,2})|movies/(?P[^/]+)/full-movie)' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', @@ -72,50 +72,54 @@ class AENetworksIE(AENetworksBaseIE): }, { 'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6', 'only_matching': True + }, { + 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', + 'only_matching': True }] def _real_extract(self, url): - display_id = self._match_id(url) + show_path, movie_display_id = re.match(self._VALID_URL, url).groups() + display_id = show_path or movie_display_id webpage = self._download_webpage(url, display_id) - url_parts = display_id.split('/') - url_parts_len = len(url_parts) - if url_parts_len == 1: - entries = [] - for season_url_path in re.findall(r'(?s)]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): - entries.append(self.url_result( - compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) - return self.playlist_result( - entries, self._html_search_meta('aetn:SeriesId', webpage), - self._html_search_meta('aetn:SeriesTitle', webpage)) - elif url_parts_len == 2: - entries = [] - for episode_item in re.findall(r'(?s)]+class="[^"]*episode-item[^"]*"[^>]*>', webpage): - episode_attributes = extract_attributes(episode_item) - episode_url = compat_urlparse.urljoin( - url, episode_attributes['data-canonical']) - entries.append(self.url_result( - episode_url, 'AENetworks', - episode_attributes['data-videoid'])) - return self.playlist_result( - entries, self._html_search_meta('aetn:SeasonId', webpage)) - else: - video_id = self._html_search_meta('aetn:VideoID', webpage) - media_url = self._search_regex( - r"media_url\s*=\s*'([^']+)'", webpage, 'video url') + if show_path: + url_parts = show_path.split('/') + url_parts_len = len(url_parts) + if url_parts_len == 1: + entries = [] + for season_url_path in re.findall(r'(?s)]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): + entries.append(self.url_result( + compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) + return self.playlist_result( + entries, self._html_search_meta('aetn:SeriesId', webpage), + self._html_search_meta('aetn:SeriesTitle', webpage)) + elif url_parts_len == 2: + entries = [] + for episode_item in re.findall(r'(?s)]+class="[^"]*episode-item[^"]*"[^>]*>', webpage): + episode_attributes = extract_attributes(episode_item) + episode_url = compat_urlparse.urljoin( + url, episode_attributes['data-canonical']) + entries.append(self.url_result( + episode_url, 'AENetworks', + episode_attributes['data-videoid'])) + return self.playlist_result( + entries, self._html_search_meta('aetn:SeasonId', webpage)) + video_id = self._html_search_meta('aetn:VideoID', webpage) + media_url = self._search_regex( + r"media_url\s*=\s*'([^']+)'", webpage, 'video url') - info = self._search_json_ld(webpage, video_id, fatal=False) - info.update(self.theplatform_url_result( - media_url, video_id, { - 'mbr': 'true', - 'assetTypes': 'medium_video_s3' - })) - return info + info = self._search_json_ld(webpage, video_id, fatal=False) + info.update(self.theplatform_url_result( + media_url, video_id, { + 'mbr': 'true', + 'assetTypes': 'medium_video_s3' + })) + return info class HistoryTopicIE(AENetworksBaseIE): IE_NAME = 
'history:topic' IE_DESC = 'History.com Topic' - _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P[^/]+)/videos(?:/(?P[^/?#]+))?' + _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P[^/]+)/videos(?:/(?P[^/?#]+))?' _TESTS = [{ 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', 'info_dict': { @@ -146,9 +150,9 @@ class HistoryTopicIE(AENetworksBaseIE): }] def _real_extract(self, url): - topic_id, display_id = re.match(self._VALID_URL, url).groups() - if display_id: - webpage = self._download_webpage(url, display_id) + topic_id, video_display_id = re.match(self._VALID_URL, url).groups() + if video_display_id: + webpage = self._download_webpage(url, video_display_id) release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups() release_url = unescapeHTML(release_url) From 06a96da15bfde93a2f2aa17cdaa10e1bf11dde0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 29 Jun 2016 23:01:34 +0700 Subject: [PATCH 120/387] [eagleplatform] Improve embed detection and extract in separate routine (Closes #9926) --- youtube_dl/extractor/eagleplatform.py | 8 ++++++++ youtube_dl/extractor/generic.py | 8 ++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 113a4966f..12d28d3b9 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -50,6 +50,14 @@ class EaglePlatformIE(InfoExtractor): 'skip': 'Georestricted', }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+src=(["\'])(?P(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', + webpage) + if mobj is not None: + return mobj.group('url') + @staticmethod def _handle_error(response): status = int_or_none(response.get('status', 200)) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 2188f8bb2..712dd8a94 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -65,6 +65,7 @@ from .threeqsdn import ThreeQSDNIE from .theplatform import ThePlatformIE from .vessel import VesselIE from .kaltura import KalturaIE +from .eagleplatform import EaglePlatformIE class GenericIE(InfoExtractor): @@ -1932,10 +1933,9 @@ class GenericIE(InfoExtractor): return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) # Look for Eagle.Platform embeds - mobj = re.search( - r']+src="(?Phttps?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'EaglePlatform') + eagleplatform_url = EaglePlatformIE._extract_url(webpage) + if eagleplatform_url: + return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key()) # Look for ClipYou (uses Eagle.Platform) embeds mobj = re.search( From e496fa50cd82877d8daeda8e29056c5d7fce2de0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 29 Jun 2016 20:19:31 +0100 Subject: [PATCH 121/387] [urplay] Add new extractor(closes #9332) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/urplay.py | 67 ++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 youtube_dl/extractor/urplay.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index bba88e9eb..eeedc675b 100644 --- a/youtube_dl/extractor/extractors.py +++ 
b/youtube_dl/extractor/extractors.py @@ -894,6 +894,7 @@ from .udn import UDNEmbedIE from .digiteka import DigitekaIE from .unistra import UnistraIE from .urort import UrortIE +from .urplay import URPlayIE from .usatoday import USATodayIE from .ustream import UstreamIE, UstreamChannelIE from .ustudio import ( diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py new file mode 100644 index 000000000..24ecdd2b1 --- /dev/null +++ b/youtube_dl/extractor/urplay.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class URPlayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?urplay\.se/program/(?P[0-9]+)' + _TEST = { + 'url': 'http://urplay.se/program/190031-tripp-trapp-trad-sovkudde', + 'md5': '15ca67b63fd8fb320ac2bcd854bad7b6', + 'info_dict': { + 'id': '190031', + 'ext': 'mp4', + 'title': 'Tripp, Trapp, Träd : Sovkudde', + 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + urplayer_data = self._parse_json(self._search_regex( + r'urPlayer.init\(({.+?})\);', webpage, 'urplayer data'), video_id) + host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + + formats = [] + for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): + file_rtmp = urplayer_data.get('file_rtmp' + quality_attr) + if file_rtmp: + formats.append({ + 'url': 'rtmp://%s/urplay/mp4:%s' % (host, file_rtmp), + 'format_id': quality + '-rtmp', + 'ext': 'flv', + 'preference': preference, + }) + file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr) + if file_http: + file_http_base_url = 'http://%s/%s' % (host, file_http) + formats.extend(self._extract_f4m_formats( + file_http_base_url + 'manifest.f4m', video_id, + preference, '%s-hds' % quality, fatal=False)) + formats.extend(self._extract_m3u8_formats( + file_http_base_url + 'playlist.m3u8', video_id, 'mp4', + 'm3u8_native', preference, '%s-hls' % quality, fatal=False)) + self._sort_formats(formats) + + subtitles = {} + for subtitle in urplayer_data.get('subtitles', []): + subtitle_url = subtitle.get('file') + kind = subtitle.get('kind') + if subtitle_url or kind and kind != 'captions': + continue + subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ + 'url': subtitle_url, + }) + + return { + 'id': video_id, + 'title': urplayer_data['title'], + 'description': self._og_search_description(webpage), + 'thumbnail': urplayer_data.get('image'), + 'series': urplayer_data.get('series_title'), + 'subtitles': subtitles, + 'formats': formats, + } \ No newline at end of file From 397b305cfe1a7ec2957331602727edb009c71e99 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 00:21:03 +0100 Subject: [PATCH 122/387] [meta] Add new extractor(closes #8789) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/meta.py | 56 ++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 youtube_dl/extractor/meta.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index eeedc675b..84c39ab48 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -425,6 +425,7 @@ from .makerschannel import MakersChannelIE from .makertv import MakerTVIE from .matchtv import MatchTVIE from .mdr import MDRIE +from .meta import METAIE from 
.metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py new file mode 100644 index 000000000..674b8d264 --- /dev/null +++ b/youtube_dl/extractor/meta.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + unescapeHTML, + int_or_none, + ExtractorError, +) + + +class METAIE(InfoExtractor): + _VALID_URL = r'https?://video\.meta\.ua/(?P[0-9]+)' + _TEST = { + 'url': 'http://video.meta.ua/5502115.video', + 'md5': '71b6f3ee274bef16f1ab410f7f56b476', + 'info_dict': { + 'id': '5502115', + 'ext': 'mp4', + 'title': 'Sony Xperia Z camera test [HQ]', + 'description': 'Xperia Z shoots video in FullHD HDR.', + 'uploader_id': 'nomobile', + 'uploader': 'CHЁZA.TV', + 'upload_date': '20130211', + }, + 'add_ie': ['Youtube'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + st_html5 = self._search_regex(r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st') + json_str = '' + for i in range(0, len(st_html5), 3): + json_str += '�%s;' % st_html5[i:i + 3] + uppod_data = self._parse_json(unescapeHTML(json_str), video_id) + error = uppod_data.get('customnotfound') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + video_url = uppod_data['file'] + info = { + 'id': video_id, + 'url': video_url, + 'title': uppod_data.get('comment') or self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage), + 'duration': int_or_none(self._og_search_property('video:duration', webpage)), + } + if 'youtube.com/' in video_url: + info.update({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + }) + return info From df43389ade6e7a6394521ae91c0640508dceb4dc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 02:54:21 +0100 Subject: [PATCH 123/387] [skysports] Add new extractor(closes #7066) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/skysports.py | 33 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 youtube_dl/extractor/skysports.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 84c39ab48..80d1bbe20 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -714,6 +714,7 @@ from .skynewsarabia import ( SkyNewsArabiaIE, SkyNewsArabiaArticleIE, ) +from .skysports import SkySportsIE from .slideshare import SlideshareIE from .slutload import SlutloadIE from .smotri import ( diff --git a/youtube_dl/extractor/skysports.py b/youtube_dl/extractor/skysports.py new file mode 100644 index 000000000..9dc78c7d2 --- /dev/null +++ b/youtube_dl/extractor/skysports.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SkySportsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P[0-9]+)' + _TEST = { + 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', + 'md5': 'c44a1db29f27daf9a0003e010af82100', + 'info_dict': { + 'id': '10328419', + 'ext': 'flv', + 'title': 'Bale: Its our time to shine', + 'description': 'md5:9fd1de3614d525f5addda32ac3c482c9', + }, + 'add_ie': ['Ooyala'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = 
self._download_webpage(url, video_id) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'ooyala:%s' % self._search_regex( + r'data-video-id="([^"]+)"', webpage, 'ooyala id'), + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'ie_key': 'Ooyala', + } From ab47b6e881269a0329b78a294318745a54e9e7c7 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 04:08:24 +0100 Subject: [PATCH 124/387] [theatlantic] Add new extractor(closes #6611) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/theatlantic.py | 40 +++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 youtube_dl/extractor/theatlantic.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 80d1bbe20..d9ffde449 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -799,6 +799,7 @@ from .teletask import TeleTaskIE from .telewebion import TelewebionIE from .testurl import TestURLIE from .tf1 import TF1IE +from .theatlantic import TheAtlanticIE from .theintercept import TheInterceptIE from .theplatform import ( ThePlatformIE, diff --git a/youtube_dl/extractor/theatlantic.py b/youtube_dl/extractor/theatlantic.py new file mode 100644 index 000000000..df4254fea --- /dev/null +++ b/youtube_dl/extractor/theatlantic.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TheAtlanticIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?theatlantic\.com/video/index/(?P\d+)' + _TEST = { + 'url': 'http://www.theatlantic.com/video/index/477918/capture-a-unified-theory-on-mental-health/', + 'md5': '', + 'info_dict': { + 'id': '477918', + 'ext': 'mp4', + 'title': 'Are All Mental Illnesses Related?', + 'description': 'Depression, anxiety, overeating, addiction, and all other mental disorders share a common mechanism.', + 'timestamp': 1460490952, + 'uploader': 'TheAtlantic', + 'upload_date': '20160412', + 'uploader_id': '29913724001', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['BrightcoveLegacy'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + return { + '_type': 'url_transparent', + 'url': self._html_search_meta('twitter:player', webpage), + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'ie_key': 'BrightcoveLegacy', + } From 4d86d2008eeae5d4e75d8f688a666e7b9504bbeb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 11:30:42 +0100 Subject: [PATCH 125/387] [urplay] fix typo and check with flake8 --- youtube_dl/extractor/urplay.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index 24ecdd2b1..ce3bf6b02 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -22,7 +22,7 @@ class URPlayIE(InfoExtractor): webpage = self._download_webpage(url, video_id) urplayer_data = self._parse_json(self._search_regex( - r'urPlayer.init\(({.+?})\);', webpage, 'urplayer data'), video_id) + r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id) host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] formats = [] @@ -64,4 +64,4 @@ class URPlayIE(InfoExtractor): 'series': 
urplayer_data.get('series_title'), 'subtitles': subtitles, 'formats': formats, - } \ No newline at end of file + } From 329179073b93e37ab76e759d1fe96d8f984367f3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 12:01:30 +0100 Subject: [PATCH 126/387] [generic] add generic support for twitter:player embeds --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/generic.py | 21 +++++++++++++++ youtube_dl/extractor/theatlantic.py | 40 ----------------------------- 3 files changed, 21 insertions(+), 41 deletions(-) delete mode 100644 youtube_dl/extractor/theatlantic.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d9ffde449..80d1bbe20 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -799,7 +799,6 @@ from .teletask import TeleTaskIE from .telewebion import TelewebionIE from .testurl import TestURLIE from .tf1 import TF1IE -from .theatlantic import TheAtlanticIE from .theintercept import TheInterceptIE from .theplatform import ( ThePlatformIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 712dd8a94..c2a7f9202 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1245,6 +1245,22 @@ class GenericIE(InfoExtractor): 'uploader': 'www.hudl.com', }, }, + # twitter:player embed + { + 'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/', + 'md5': 'a3e0df96369831de324f0778e126653c', + 'info_dict': { + 'id': '4909620399001', + 'ext': 'mp4', + 'title': 'What Do Black Holes Sound Like?', + 'description': 'what do black holes sound like', + 'upload_date': '20160524', + 'uploader_id': '29913724001', + 'timestamp': 1464107587, + 'uploader': 'TheAtlantic', + }, + 'add_ie': ['BrightcoveLegacy'], + } ] def report_following_redirect(self, new_url): @@ -2081,6 +2097,11 @@ class GenericIE(InfoExtractor): 'uploader': video_uploader, } + # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser + embed_url = self._twitter_search_player(webpage) + if embed_url: + return self.url_result(embed_url) + def check_video(vurl): if YoutubeIE.suitable(vurl): return True diff --git a/youtube_dl/extractor/theatlantic.py b/youtube_dl/extractor/theatlantic.py deleted file mode 100644 index df4254fea..000000000 --- a/youtube_dl/extractor/theatlantic.py +++ /dev/null @@ -1,40 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class TheAtlanticIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?theatlantic\.com/video/index/(?P\d+)' - _TEST = { - 'url': 'http://www.theatlantic.com/video/index/477918/capture-a-unified-theory-on-mental-health/', - 'md5': '', - 'info_dict': { - 'id': '477918', - 'ext': 'mp4', - 'title': 'Are All Mental Illnesses Related?', - 'description': 'Depression, anxiety, overeating, addiction, and all other mental disorders share a common mechanism.', - 'timestamp': 1460490952, - 'uploader': 'TheAtlantic', - 'upload_date': '20160412', - 'uploader_id': '29913724001', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['BrightcoveLegacy'], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - return { - '_type': 'url_transparent', - 'url': self._html_search_meta('twitter:player', webpage), - 'id': video_id, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': 
self._og_search_thumbnail(webpage), - 'ie_key': 'BrightcoveLegacy', - } From 93ad6c6bfaae8f1ce87a832ece92fa099f0e2095 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 13:50:49 +0100 Subject: [PATCH 127/387] [sixplay] Add new extractor(closes #2183) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/m6.py | 35 +---------------- youtube_dl/extractor/sixplay.py | 60 ++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 33 deletions(-) create mode 100644 youtube_dl/extractor/sixplay.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 80d1bbe20..a7b110450 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -710,6 +710,7 @@ from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE from .sina import SinaIE +from .sixplay import SixPlayIE from .skynewsarabia import ( SkyNewsArabiaIE, SkyNewsArabiaArticleIE, diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py index d5945ad66..39d2742c8 100644 --- a/youtube_dl/extractor/m6.py +++ b/youtube_dl/extractor/m6.py @@ -1,8 +1,6 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor @@ -23,34 +21,5 @@ class M6IE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - rss = self._download_xml('http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id, - 'Downloading video RSS') - - title = rss.find('./channel/item/title').text - description = rss.find('./channel/item/description').text - thumbnail = rss.find('./channel/item/visuel_clip_big').text - duration = int(rss.find('./channel/item/duration').text) - view_count = int(rss.find('./channel/item/nombre_vues').text) - - formats = [] - for format_id in ['lq', 'sd', 'hq', 'hd']: - video_url = rss.find('./channel/item/url_video_%s' % format_id) - if video_url is None: - continue - formats.append({ - 'url': video_url.text, - 'format_id': format_id, - }) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, - 'formats': formats, - } + video_id = self._match_id(url) + return self.url_result('6play:%s' % video_id, 'SixPlay', video_id) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py new file mode 100644 index 000000000..f855a1a00 --- /dev/null +++ b/youtube_dl/extractor/sixplay.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + qualities, + int_or_none, +) + + +class SixPlayIE(InfoExtractor): + _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P[0-9]+)' + _TEST = { + 'url': 'http://www.6play.fr/jamel-et-ses-amis-au-marrakech-du-rire-p_1316/jamel-et-ses-amis-au-marrakech-du-rire-2015-c_11495320', + 'md5': '42310bffe4ba3982db112b9cd3467328', + 'info_dict': { + 'id': '11495320', + 'ext': 'mp4', + 'title': 'Jamel et ses amis au Marrakech du rire 2015', + 'description': 'md5:ba2149d5c321d5201b78070ee839d872', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + clip_data = self._download_json( + 'https://player.m6web.fr/v2/video/config/6play-auth/FR/%s.json' % video_id, + video_id) + video_data = clip_data['videoInfo'] + + preference = qualities(['lq', 'sd', 'hq', 'hd']) + formats = [] + for source in clip_data['sources']: + source_type, source_url = 
source.get('type'), source.get('src') + if not source_url or source_type == 'hls/primetime': + continue + if source_type == 'application/vnd.apple.mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + source_url.replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + elif source_type == 'video/mp4': + quality = source.get('quality') + formats.append({ + 'url': source_url, + 'format_id': quality, + 'preference': preference(quality), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['title'].strip(), + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'series': video_data.get('titlePgm'), + 'formats': formats, + } From 7dbeee7e229a357cfc8acf0a908b10f3f326cd96 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 14:11:55 +0100 Subject: [PATCH 128/387] [generic] make twitter:player extraction non fatal --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c2a7f9202..9315b9e21 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2098,7 +2098,7 @@ class GenericIE(InfoExtractor): } # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser - embed_url = self._twitter_search_player(webpage) + embed_url = self._html_search_meta('twitter:player', webpage, default=None) if embed_url: return self.url_result(embed_url) From 049da7cb6cc7d6b47020480fa780907be265b9cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 30 Jun 2016 23:04:18 +0700 Subject: [PATCH 129/387] [meta] Extend _VALID_URL --- youtube_dl/extractor/meta.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py index 674b8d264..2ca7092e5 100644 --- a/youtube_dl/extractor/meta.py +++ b/youtube_dl/extractor/meta.py @@ -10,8 +10,8 @@ from ..utils import ( class METAIE(InfoExtractor): - _VALID_URL = r'https?://video\.meta\.ua/(?P[0-9]+)' - _TEST = { + _VALID_URL = r'https?://video\.meta\.ua/(?:iframe/)?(?P[0-9]+)' + _TESTS = [{ 'url': 'http://video.meta.ua/5502115.video', 'md5': '71b6f3ee274bef16f1ab410f7f56b476', 'info_dict': { @@ -24,7 +24,10 @@ class METAIE(InfoExtractor): 'upload_date': '20130211', }, 'add_ie': ['Youtube'], - } + }, { + 'url': 'http://video.meta.ua/iframe/5502115', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From eafa643715c0989dff927c9a44e837ca62247b4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 30 Jun 2016 23:06:13 +0700 Subject: [PATCH 130/387] [meta] Make duration and description optional For iframe URLs --- youtube_dl/extractor/meta.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py index 2ca7092e5..2e2db5620 100644 --- a/youtube_dl/extractor/meta.py +++ b/youtube_dl/extractor/meta.py @@ -47,9 +47,10 @@ class METAIE(InfoExtractor): 'id': video_id, 'url': video_url, 'title': uppod_data.get('comment') or self._og_search_title(webpage), - 'description': self._og_search_description(webpage), + 'description': self._og_search_description(webpage, default=None), 'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage), - 'duration': 
int_or_none(self._og_search_property('video:duration', webpage)), + 'duration': int_or_none(self._og_search_property( + 'video:duration', webpage, default=None)), } if 'youtube.com/' in video_url: info.update({ From 8ff6697861b918ff9221c7dd46a6e1109ad0ef34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 30 Jun 2016 23:19:29 +0700 Subject: [PATCH 131/387] [pladform] Improve embed detection --- youtube_dl/extractor/pladform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py index bc559d1df..77e1211d6 100644 --- a/youtube_dl/extractor/pladform.py +++ b/youtube_dl/extractor/pladform.py @@ -49,7 +49,7 @@ class PladformIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r']+src="(?P(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage) + r']+src=(["\'])(?P(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage) if mobj: return mobj.group('url') From fd94e2671a8b2174d38f9e81b0956e31f90df326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 30 Jun 2016 23:20:44 +0700 Subject: [PATCH 132/387] [meta] Add support for pladform embeds --- youtube_dl/extractor/meta.py | 60 +++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py index 2e2db5620..42bedc48f 100644 --- a/youtube_dl/extractor/meta.py +++ b/youtube_dl/extractor/meta.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .pladform import PladformIE from ..utils import ( unescapeHTML, int_or_none, @@ -27,34 +28,45 @@ class METAIE(InfoExtractor): }, { 'url': 'http://video.meta.ua/iframe/5502115', 'only_matching': True, + }, { + # pladform embed + 'url': 'http://video.meta.ua/7121015.video', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - st_html5 = self._search_regex(r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st') - json_str = '' - for i in range(0, len(st_html5), 3): - json_str += '�%s;' % st_html5[i:i + 3] - uppod_data = self._parse_json(unescapeHTML(json_str), video_id) - error = uppod_data.get('customnotfound') - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + st_html5 = self._search_regex( + r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None) - video_url = uppod_data['file'] - info = { - 'id': video_id, - 'url': video_url, - 'title': uppod_data.get('comment') or self._og_search_title(webpage), - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage), - 'duration': int_or_none(self._og_search_property( - 'video:duration', webpage, default=None)), - } - if 'youtube.com/' in video_url: - info.update({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - }) - return info + if st_html5: + json_str = '' + for i in range(0, len(st_html5), 3): + json_str += '�%s;' % st_html5[i:i + 3] + uppod_data = self._parse_json(unescapeHTML(json_str), video_id) + error = uppod_data.get('customnotfound') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + video_url = uppod_data['file'] + info = { + 'id': video_id, + 'url': video_url, + 'title': uppod_data.get('comment') or self._og_search_title(webpage), + 'description': self._og_search_description(webpage, 
default=None), + 'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage), + 'duration': int_or_none(self._og_search_property( + 'video:duration', webpage, default=None)), + } + if 'youtube.com/' in video_url: + info.update({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + }) + return info + + pladform_url = PladformIE._extract_url(webpage) + if pladform_url: + return self.url_result(pladform_url) From 66a42309fa235af2bb92b7fb73d90d8b79d6bf5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 30 Jun 2016 23:56:55 +0700 Subject: [PATCH 133/387] release 2016.06.30 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 5 +++++ youtube_dl/version.py | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f9a1aa990..8d24c14f3 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.27*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.27** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.30*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.30** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.27 +[debug] youtube-dl version 2016.06.30 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2a94f4feb..c05cda6ab 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -272,6 +272,7 @@ - **Helsinki**: helsinki.fi - **HentaiStigma** - **HistoricFilms** + - **history:topic**: History.com Topic - **hitbox** - **hitbox:live** - **HornBunny** @@ -358,6 +359,7 @@ - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** + - **META** - **metacafe** - **Metacritic** - **Mgoon** @@ -587,8 +589,10 @@ - **Shared**: shared.sx and vivo.sx - **ShareSix** - **Sina** + - **SixPlay** - **skynewsarabia:article** - **skynewsarabia:video** + - **SkySports** - **Slideshare** - **Slutload** - **smotri**: Smotri.com @@ -721,6 +725,7 @@ - **UDNEmbed**: 聯合影音 - **Unistra** - **Urort**: NRK P3 Urørt + - **URPlay** - **USAToday** - **ustream** - **ustream:channel** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2dd24dec1..cf725db9b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.27' +__version__ = '2016.06.30' 
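For context, the META extractor patches above decode the uppod player's "st" blob by turning every three digits into a decimal HTML character reference and unescaping the result into JSON. Below is a minimal standalone sketch of that decoding step; the sample string, the helper name decode_uppod_st and the use of Python 3's html.unescape are illustrative assumptions, not part of youtube-dl itself (the extractor uses its own unescapeHTML helper and _parse_json).

import json
from html import unescape  # Python 3 stdlib; stands in for youtube-dl's unescapeHTML


def decode_uppod_st(st):
    # '123034...' -> '&#123;&#034;...' -> '{"...' (three digits form one decimal character code)
    refs = ''.join('&#%s;' % st[i:i + 3] for i in range(0, len(st), 3))
    return unescape(refs)


if __name__ == '__main__':
    # Hypothetical sample that encodes the JSON object {"file": "x"}
    sample = ''.join('%03d' % ord(c) for c in '{"file": "x"}')
    print(json.loads(decode_uppod_st(sample)))
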
From 05a0068a7680d41f56545a22ab34b004cd6d72e9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 18:13:49 +0100 Subject: [PATCH 134/387] [9c9media] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/ninecninemedia.py | 55 ++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/ninecninemedia.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a7b110450..1b2854cb9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -526,6 +526,7 @@ from .nick import ( NickDeIE, ) from .niconico import NiconicoIE, NiconicoPlaylistIE +from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .noco import NocoIE from .normalboots import NormalbootsIE diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py new file mode 100644 index 000000000..d889245ad --- /dev/null +++ b/youtube_dl/extractor/ninecninemedia.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + parse_duration, + ExtractorError +) + + +class NineCNineMediaIE(InfoExtractor): + _VALID_URL = r'9c9media:(?P[^:]+):(?P\d+)' + + def _real_extract(self, url): + destination_code, video_id = re.match(self._VALID_URL, url).groups() + api_base_url = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' % (destination_code, video_id) + content = self._download_json(api_base_url, video_id, query={ + '$include': '[contentpackages]', + }) + title = content['Name'] + if len(content['ContentPackages']) > 1: + raise ExtractorError('multiple content packages') + content_package = content['ContentPackages'][0] + stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % content_package['Id'] + stacks = self._download_json(stacks_base_url, video_id)['Items'] + if len(stacks) > 1: + raise ExtractorError('multiple stacks') + stack = stacks[0] + stack_base_url = '%s%s/manifest.' 
% (stacks_base_url, stack['Id']) + formats = [] + formats.extend(self._extract_m3u8_formats( + stack_base_url + 'm3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + stack_base_url + 'f4m', video_id, + f4m_id='hds', fatal=False)) + mp4_url = self._download_webpage(stack_base_url + 'pd', video_id, fatal=False) + if mp4_url: + formats.append({ + 'url': mp4_url, + 'format_id': 'mp4', + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': content.get('Desc') or content.get('ShortDesc'), + 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), + 'duration': parse_duration(content.get('BroadcastTime')), + 'formats': formats, + } From 20361b4f2511a4395ae489c04a68c6098ffab7a4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 18:14:23 +0100 Subject: [PATCH 135/387] [rds] extract 9c9media formats --- youtube_dl/extractor/rds.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index 796adfdf9..bf200ea4d 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -1,23 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( parse_duration, parse_iso8601, + js_to_json, ) +from ..compat import compat_str class RDSIE(InfoExtractor): IE_DESC = 'RDS.ca' - _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P[^/]+)-(?P\d+\.\d+)' + _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P[^/]+)-\d+\.\d+' _TESTS = [{ 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', 'info_dict': { - 'id': '3.1132799', + 'id': '604333', 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', 'ext': 'mp4', 'title': 'Fowler Jr. 
prend la direction de Jacksonville', @@ -33,22 +33,17 @@ class RDSIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - # TODO: extract f4m from 9c9media.com - video_url = self._search_regex( - r']+itemprop="contentURL"[^>]+content="([^"]+)"', - webpage, 'video url') - - title = self._og_search_title(webpage) or self._html_search_meta( + item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json) + video_id = compat_str(item['id']) + title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta( 'title', webpage, 'title', fatal=True) description = self._og_search_description(webpage) or self._html_search_meta( 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) or self._search_regex( + thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex( [r']+itemprop="thumbnailUrl"[^>]+href="([^"]+)"', r']+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'], webpage, 'thumbnail', fatal=False) @@ -61,13 +56,15 @@ class RDSIE(InfoExtractor): age_limit = self._family_friendly_search(webpage) return { + '_type': 'url_transparent', 'id': video_id, 'display_id': display_id, - 'url': video_url, + 'url': '9c9media:rds_web:%s' % video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, 'age_limit': age_limit, + 'ie_key': 'NineCNineMedia', } From bf4fa24414d2f4f4418b17ed379eb60df5726c4f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 18:14:59 +0100 Subject: [PATCH 136/387] [ctvnews] Add new extractor(closes #2156) --- youtube_dl/extractor/ctvnews.py | 64 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 65 insertions(+) create mode 100644 youtube_dl/extractor/ctvnews.py diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py new file mode 100644 index 000000000..e14b30085 --- /dev/null +++ b/youtube_dl/extractor/ctvnews.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class CTVNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P[0-9.]+)' + _TESTS = [{ + 'url': 'http://www.ctvnews.ca/video?clipId=901995', + 'md5': '10deb320dc0ccb8d01d34d12fc2ea672', + 'info_dict': { + 'id': '901995', + 'ext': 'mp4', + 'title': 'Extended: \'That person cannot be me\' Johnson says', + 'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285', + 'timestamp': 1467286284, + 'upload_date': '20160630', + } + }, { + 'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224', + 'info_dict': + { + 'id': '1.2966224', + }, + 'playlist_mincount': 19, + }, { + 'url': 'http://www.ctvnews.ca/video?binId=1.810401', + 'info_dict': + { + 'id': '1.810401', + }, + 'playlist_mincount': 91, + }, { + 'url': 'http://www.ctvnews.ca/1.810401', + 'only_matching': True, + }, { + 'url': 'http://www.ctvnews.ca/canadiens-send-p-k-subban-to-nashville-in-blockbuster-trade-1.2967231', + 'only_matching': True, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + def ninecninemedia_url_result(clip_id): + return { + '_type': 'url_transparent', + 'id': clip_id, + 'url': '9c9media:ctvnews_web:%s' % clip_id, + 'ie_key': 
'NineCNineMedia', + } + + if page_id.isdigit(): + return ninecninemedia_url_result(page_id) + else: + webpage = self._download_webpage('http://www.ctvnews.ca/%s' % page_id, page_id, query={ + 'ot': 'example.AjaxPageLayout.ot', + 'maxItemsPerPage': 20, + }) + entries = [ninecninemedia_url_result(clip_id) for clip_id in set( + re.findall(r'clip\.id\s*=\s*(\d+);', webpage))] + return self.playlist_result(entries, page_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1b2854cb9..4765fbc77 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -171,6 +171,7 @@ from .crunchyroll import ( ) from .cspan import CSpanIE from .ctsnews import CtsNewsIE +from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .cwtv import CWTVIE from .dailymail import DailyMailIE From 9617b557aa2a96840026a9c915bc57e335a76272 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 18:21:47 +0100 Subject: [PATCH 137/387] [ctv] Add new extractor(closes #4077) --- youtube_dl/extractor/ctv.py | 30 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 31 insertions(+) create mode 100644 youtube_dl/extractor/ctv.py diff --git a/youtube_dl/extractor/ctv.py b/youtube_dl/extractor/ctv.py new file mode 100644 index 000000000..5807fbac9 --- /dev/null +++ b/youtube_dl/extractor/ctv.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ctv\.ca/video/player\?vid=(?P[0-9.]+)' + _TESTS = [{ + 'url': 'http://www.ctv.ca/video/player?vid=706966', + 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', + 'info_dict': { + 'id': '706966', + 'ext': 'mp4', + 'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'', + 'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.', + 'upload_date': '20150919', + 'timestamp': 1442624700, + }, + 'expected_warnings': ['HTTP Error 404'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': '9c9media:ctv_web:%s' % video_id, + 'ie_key': 'NineCNineMedia', + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4765fbc77..62b5fed18 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -171,6 +171,7 @@ from .crunchyroll import ( ) from .cspan import CSpanIE from .ctsnews import CtsNewsIE +from .ctv import CTVIE from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .cwtv import CWTVIE From 76dad392f5bd82493777d8efc35bcfccf70fafec Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 18:27:57 +0100 Subject: [PATCH 138/387] [meta] Clarify the source of uppod st decryption algorithm --- youtube_dl/extractor/meta.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py index 42bedc48f..cdb46e163 100644 --- a/youtube_dl/extractor/meta.py +++ b/youtube_dl/extractor/meta.py @@ -42,6 +42,7 @@ class METAIE(InfoExtractor): r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None) if st_html5: + # uppod st decryption algorithm is reverse engineered from function un(s) at uppod.js json_str = '' for i in range(0, len(st_html5), 3): json_str += '�%s;' % st_html5[i:i + 3] From 
c9e538a3b1cde6ce140323a029c7b6f7386eb004 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 30 Jun 2016 19:52:32 +0100 Subject: [PATCH 139/387] [ctvnews] use orderedSet, increase the number of items for playlists and use smaller bin list for test --- youtube_dl/extractor/ctvnews.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py index e14b30085..1023b6130 100644 --- a/youtube_dl/extractor/ctvnews.py +++ b/youtube_dl/extractor/ctvnews.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import orderedSet class CTVNewsIE(InfoExtractor): @@ -27,12 +28,12 @@ class CTVNewsIE(InfoExtractor): }, 'playlist_mincount': 19, }, { - 'url': 'http://www.ctvnews.ca/video?binId=1.810401', + 'url': 'http://www.ctvnews.ca/video?binId=1.2876780', 'info_dict': { - 'id': '1.810401', + 'id': '1.2876780', }, - 'playlist_mincount': 91, + 'playlist_mincount': 100, }, { 'url': 'http://www.ctvnews.ca/1.810401', 'only_matching': True, @@ -57,8 +58,8 @@ class CTVNewsIE(InfoExtractor): else: webpage = self._download_webpage('http://www.ctvnews.ca/%s' % page_id, page_id, query={ 'ot': 'example.AjaxPageLayout.ot', - 'maxItemsPerPage': 20, + 'maxItemsPerPage': 1000000, }) - entries = [ninecninemedia_url_result(clip_id) for clip_id in set( + entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet( re.findall(r'clip\.id\s*=\s*(\d+);', webpage))] return self.playlist_result(entries, page_id) From 044e3d91b5715f7aa63c578097b77fd510ed0f73 Mon Sep 17 00:00:00 2001 From: kidol Date: Thu, 30 Jun 2016 21:06:22 +0200 Subject: [PATCH 140/387] [Pornhub] Fix error detection --- youtube_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 6d57e1d35..4bbf1ec3b 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -87,7 +87,7 @@ class PornHubIE(InfoExtractor): webpage = self._download_webpage(req, video_id) error_msg = self._html_search_regex( - r'(?s)
(.*?)
', + r'<div[^>]+class="removed">\s*]*>\s*

\s*([^<]*)', webpage, 'error message', default=None) if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) From 3cb3b60064fc8d99a8175b751000892b141e8de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Jul 2016 03:14:23 +0700 Subject: [PATCH 141/387] [pornhub] Relax removed message regex (Closes #9964) --- youtube_dl/extractor/pornhub.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 4bbf1ec3b..c1694893c 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -87,8 +87,8 @@ class PornHubIE(InfoExtractor): webpage = self._download_webpage(req, video_id) error_msg = self._html_search_regex( - r']+class="removed">\s*]*>\s*

\s*([^<]*)', - webpage, 'error message', default=None) + r'(?s)<div[^>]+class=(["\']).*?\bremoved\b.*?\1[^>]*>(?P<error>.+?)</div>
', + webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) raise ExtractorError( From eaaaaec042f8e6afa8f8ec6a2a8b137943f802df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Jul 2016 03:18:27 +0700 Subject: [PATCH 142/387] [pornhub] Add more tests with removed videos --- youtube_dl/extractor/pornhub.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index c1694893c..77182bf07 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -63,8 +63,17 @@ class PornHubIE(InfoExtractor): 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, }, { + # removed at the request of cam4.com 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', 'only_matching': True, + }, { + # removed at the request of the copyright owner + 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859', + 'only_matching': True, + }, { + # removed by uploader + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', + 'only_matching': True, }] @classmethod From 9e29ef13a378769c19ccec200aba377ad504fe8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 29 Jun 2016 14:56:05 +0200 Subject: [PATCH 143/387] [options] Accept quoted string across multiple lines (#9940) Like: -f " bestvideo+bestaudio/ best " --- youtube_dl/options.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 99ce4131f..c9033e3cb 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -26,9 +26,7 @@ def parseOpts(overrideArguments=None): except IOError: return default # silently skip if file is not present try: - res = [] - for l in optionf: - res += compat_shlex_split(l, comments=True) + res = compat_shlex_split(optionf.read(), comments=True) finally: optionf.close() return res From a9eede3913a9e9c7e094907f36a22bc6719ce73d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 29 Jun 2016 18:54:30 +0200 Subject: [PATCH 144/387] [test/compat] compat_shlex_split: test with newlines --- test/test_compat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_compat.py b/test/test_compat.py index f5317ac3e..1d7ac9f16 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -87,6 +87,7 @@ class TestCompat(unittest.TestCase): def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) + self.assertEqual(compat_shlex_split('-option "one\ntwo" \n -flag'), ['-option', 'one\ntwo', '-flag']) def test_compat_etree_fromstring(self): xml = ''' From 0c2ac64bb81462bed0c31be5a2a549601f95f166 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Jul 2016 03:57:59 +0700 Subject: [PATCH 145/387] [sixplay] Rename preference key to quality in format dict --- youtube_dl/extractor/sixplay.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index f855a1a00..759a332d2 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -28,7 +28,7 @@ class SixPlayIE(InfoExtractor): video_id) video_data = clip_data['videoInfo'] - preference = qualities(['lq', 'sd', 'hq', 'hd']) + quality_key = qualities(['lq', 'sd', 'hq', 'hd']) formats = [] for source in 
clip_data['sources']: source_type, source_url = source.get('type'), source.get('src') @@ -46,7 +46,7 @@ class SixPlayIE(InfoExtractor): formats.append({ 'url': source_url, 'format_id': quality, - 'preference': preference(quality), + 'quality': quality_key(quality), }) self._sort_formats(formats) From f11315e8d4239611c42d8fe438c7ded58293d54d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Jul 2016 03:59:57 +0700 Subject: [PATCH 146/387] release 2016.07.01 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 8d24c14f3..d9b196e52 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.30*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.30** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.30 +[debug] youtube-dl version 2016.07.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c05cda6ab..6e4041614 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -152,6 +152,8 @@ - **CSNNE** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 + - **CTV** + - **CTVNews** - **culturebox.francetvinfo.fr** - **CultureUnplugged** - **CWTV** @@ -440,6 +442,7 @@ - **nick.de** - **niconico**: ニコニコ動画 - **NiconicoPlaylist** + - **NineCNineMedia** - **njoy**: N-JOY - **njoy:embed** - **Noco** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cf725db9b..e5f10da39 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.30' +__version__ = '2016.07.01' From 9f4576a7ebbc4af4971796325f0799f894daaa1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Jul 2016 23:16:43 +0700 Subject: [PATCH 147/387] [twitch] Update usher URL (Closes #9975) --- youtube_dl/extractor/twitch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 20919774d..67b1277cc 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -29,7 +29,7 @@ 
class TwitchBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv' _API_BASE = 'https://api.twitch.tv' - _USHER_BASE = 'http://usher.twitch.tv' + _USHER_BASE = 'https://usher.ttvnw.net' _LOGIN_URL = 'http://www.twitch.tv/login' _NETRC_MACHINE = 'twitch' From 564dc3c6e8ad235160cfea01e41fc01fefc39be8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jul 2016 01:24:57 +0700 Subject: [PATCH 148/387] [vine] Fix extraction (Closes #9970) --- youtube_dl/extractor/vine.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 5b801849c..5a2c53b36 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -90,9 +90,11 @@ class VineIE(InfoExtractor): data = self._parse_json( self._search_regex( - r'window\.POST_DATA\s*=\s*{\s*%s\s*:\s*({.+?})\s*};\s*' % video_id, + r'window\.POST_DATA\s*=\s*({.+?});\s*', webpage, 'vine data'), video_id) + + data = data[list(data.keys())[0]] formats = [{ 'format_id': '%(format)s-%(rate)s' % f, From 347227237b5a101c3bed260f8efbdbfe65c5f196 Mon Sep 17 00:00:00 2001 From: cant-think-of-a-name Date: Thu, 30 Jun 2016 20:19:17 -0500 Subject: [PATCH 149/387] [periscope] fix playlist extraction (#9967) The JSON response changed and the extractor needed to be updated in order to gather the video IDs. --- youtube_dl/extractor/periscope.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index c23b314e7..34e0d3d30 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -122,7 +122,7 @@ class PeriscopeUserIE(InfoExtractor): entries = [ self.url_result( - 'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id'])) - for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcasts', [])] + 'https://www.periscope.tv/%s/%s' % (user_id, broadcast)) + for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcastIds', [])] return self.playlist_result(entries, user_id, title, description) From 35fc3021ba6e1e0d7b7d400fdaccc709546a4bc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jul 2016 01:35:57 +0700 Subject: [PATCH 150/387] [periscope] Add another fallback source --- youtube_dl/extractor/periscope.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 34e0d3d30..75f5884a9 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -120,9 +120,12 @@ class PeriscopeUserIE(InfoExtractor): title = user.get('display_name') or user.get('username') description = user.get('description') + broadcast_ids = (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or + data_store.get('BroadcastCache', {}).get('broadcastIds', [])) + entries = [ self.url_result( - 'https://www.periscope.tv/%s/%s' % (user_id, broadcast)) - for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcastIds', [])] + 'https://www.periscope.tv/%s/%s' % (user_id, broadcast_id)) + for broadcast_id in broadcast_ids] return self.playlist_result(entries, user_id, title, description) From bc4b2d75ba5b8fbd2ac9d42f0fb7a4fcd2ea3038 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jul 2016 02:11:07 +0700 Subject: [PATCH 151/387] [pornhub] Add support for thumbzilla (Closes #8696) --- youtube_dl/extractor/pornhub.py | 13 ++++++++++++- 1 
file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 77182bf07..c76afe1c4 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -25,7 +25,15 @@ from ..aes import ( class PornHubIE(InfoExtractor): - _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P[0-9a-z]+)' + IE_DESC = 'PornHub and Thumbzilla' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)| + (?:www\.)?thumbzilla\.com/video/ + ) + (?P[0-9a-z]+) + ''' _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': '1e19b41231a02eba417839222ac9d58e', @@ -74,6 +82,9 @@ class PornHubIE(InfoExtractor): # removed by uploader 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', 'only_matching': True, + }, { + 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', + 'only_matching': True, }] @classmethod From bb08101ec4a8728677ee23466608ab6aa65cbb4f Mon Sep 17 00:00:00 2001 From: TRox1972 Date: Thu, 30 Jun 2016 14:57:42 +0200 Subject: [PATCH 152/387] [Fusion] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/fusion.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 youtube_dl/extractor/fusion.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 62b5fed18..16fa4d35c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -281,6 +281,7 @@ from .freespeech import FreespeechIE from .freevideo import FreeVideoIE from .funimation import FunimationIE from .funnyordie import FunnyOrDieIE +from .fusion import FusionIE from .gameinformer import GameInformerIE from .gamekings import GamekingsIE from .gameone import ( diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py new file mode 100644 index 000000000..771abcdb1 --- /dev/null +++ b/youtube_dl/extractor/fusion.py @@ -0,0 +1,29 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class FusionIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/\d+/(?P[\w-]+)' + _TEST = { + 'url': 'http://fusion.net/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/', + 'md5': '55c3dd61d2b96dc17c4ab6711d02a39e', + 'info_dict': { + 'id': 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P', + 'ext': 'mp4', + 'title': 'U.S. 
and Panamanian forces work together to stop a vessel smuggling drugs', + 'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7', + 'duration': 140.0, + }, + 'add_ie': ['Ooyala'], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + ooyala_code = self._search_regex(r'data-video-id="([^"]{32})"', + webpage, 'ooyala code') + + return OoyalaIE._build_url_result(ooyala_code) From 14ff6baa0ee3fa0ead2e2b460017abe5e853647c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jul 2016 02:44:37 +0700 Subject: [PATCH 153/387] [fusion] Improve --- youtube_dl/extractor/fusion.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py index 771abcdb1..b4ab4cbb7 100644 --- a/youtube_dl/extractor/fusion.py +++ b/youtube_dl/extractor/fusion.py @@ -5,10 +5,9 @@ from .ooyala import OoyalaIE class FusionIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/\d+/(?P[\w-]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/(?P\d+)' + _TESTS = [{ 'url': 'http://fusion.net/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/', - 'md5': '55c3dd61d2b96dc17c4ab6711d02a39e', 'info_dict': { 'id': 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P', 'ext': 'mp4', @@ -16,14 +15,21 @@ class FusionIE(InfoExtractor): 'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7', 'duration': 140.0, }, + 'params': { + 'skip_download': True, + }, 'add_ie': ['Ooyala'], - } + }, { + 'url': 'http://fusion.net/video/201781', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - ooyala_code = self._search_regex(r'data-video-id="([^"]{32})"', - webpage, 'ooyala code') + ooyala_code = self._search_regex( + r'data-video-id=(["\'])(?P.+?)\1', + webpage, 'ooyala code', group='code') return OoyalaIE._build_url_result(ooyala_code) From ac2d8f54d1f95ff54cae6808602d5ddb39bc978b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jul 2016 02:45:00 +0700 Subject: [PATCH 154/387] [vine] Remove superfluous whitespace --- youtube_dl/extractor/vine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 5a2c53b36..0183f052a 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -93,7 +93,7 @@ class VineIE(InfoExtractor): r'window\.POST_DATA\s*=\s*({.+?});\s*', webpage, 'vine data'), video_id) - + data = data[list(data.keys())[0]] formats = [{ From 7a1e71575e8bf6918ece07cb72a58e7425692fd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jul 2016 02:47:42 +0700 Subject: [PATCH 155/387] release 2016.07.02 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index d9b196e52..637103b6b 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.01**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.02*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.02**
 
 ### Before submitting an *issue* make sure you have:
 - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2016.07.01
+[debug] youtube-dl version 2016.07.02
 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
 [debug] Proxy map: {}
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 6e4041614..8fd1ab5af 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -242,6 +242,7 @@
  - **FreeVideo**
  - **Funimation**
  - **FunnyOrDie**
+ - **Fusion**
  - **GameInformer**
  - **Gamekings**
  - **GameOne**
@@ -508,7 +509,7 @@
  - **podomatic**
  - **PolskieRadio**
  - **PornHd**
- - **PornHub**
+ - **PornHub**: PornHub and Thumbzilla
  - **PornHubPlaylist**
  - **PornHubUserVideos**
  - **Pornotube**
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index e5f10da39..d0483f83b 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2016.07.01'
+__version__ = '2016.07.02'

From bdafd88da07046f91e0585f083dea7795096e5d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sat, 2 Jul 2016 16:43:19 +0700
Subject: [PATCH 156/387] [vk] Extend _VALID_URLs to support new domain (Closes #9981)

---
 youtube_dl/extractor/vk.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index cfc5ffd8b..758d9c86b 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -27,12 +27,12 @@ class VKIE(InfoExtractor):
                     https?://
                         (?:
                             (?:
-                                (?:m\.)?vk\.com/video_|
+                                (?:(?:m|new)\.)?vk\.com/video_|
                                 (?:www\.)?daxab.com/
                             )
                             ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
                             (?:
-                                (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
+                                (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video|
                                 (?:www\.)?daxab.com/embed/
                             )
                             (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))? 
@@ -182,6 +182,10 @@ class VKIE(InfoExtractor):
             # pladform embed
             'url': 'https://vk.com/video-76116461_171554880',
             'only_matching': True,
+        },
+        {
+            'url': 'http://new.vk.com/video205387401_165548505',
+            'only_matching': True,
         }
     ]
 
@@ -354,7 +358,7 @@ class VKIE(InfoExtractor):
 class VKUserVideosIE(InfoExtractor):
     IE_NAME = 'vk:uservideos'
     IE_DESC = "VK - User's Videos"
-    _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
+    _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
     _TEMPLATE_URL = 'https://vk.com/videos'
     _TESTS = [{
         'url': 'http://vk.com/videos205387401',
@@ -369,6 +373,12 @@ class VKUserVideosIE(InfoExtractor):
     }, {
         'url': 'http://vk.com/videos-97664626?section=all',
         'only_matching': True,
+    }, {
+        'url': 'http://m.vk.com/videos205387401',
+        'only_matching': True,
+    }, {
+        'url': 'http://new.vk.com/videos205387401',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):

From fd6ca382628afbc4a229a15cd26552e226ac4536 Mon Sep 17 00:00:00 2001
From: Yen Chi Hsuan
Date: Sat, 2 Jul 2016 21:33:23 +0800
Subject: [PATCH 157/387] [facebook] Improve Facebook embedded detection

Related to #9938.
Another example comes from 9834872bf63b4e03b66c5e3b8f306556e735d8c5.
---
 youtube_dl/extractor/facebook.py | 15 +++++++++++++++
 youtube_dl/extractor/generic.py  | 26 ++++++++++++++++++++++----
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 9b87b37ae..6eaa22d89 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -129,6 +129,21 @@ class FacebookIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
+        if mobj is not None:
+            return mobj.group('url')
+
+        # Facebook API embed
+        # see https://developers.facebook.com/docs/plugins/embedded-video-player
+        mobj = re.search(r'''(?x)<div[^>]+
+                class=(?P<q1>[\'"])[^\'"]*\bfb-video\b[^\'"]*(?P=q1)[^>]+
+                data-href=(?P<q2>[\'"])(?P<url>[^\'"]+)(?P=q2)''', webpage)
+        if mobj is not None:
+            return mobj.group('url')
+
     def _login(self):
         (useremail, password) = self._get_login_info()
         if useremail is None:
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 9315b9e21..7212e0edd 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -66,6 +66,7 @@ from .theplatform import ThePlatformIE
 from .vessel import VesselIE
 from .kaltura import KalturaIE
 from .eagleplatform import EaglePlatformIE
+from .facebook import FacebookIE
 
 
 class GenericIE(InfoExtractor):
@@ -1260,6 +1261,24 @@ class GenericIE(InfoExtractor):
             'uploader': 'TheAtlantic',
         },
         'add_ie': ['BrightcoveLegacy'],
+        },
+        # Facebook