From 29ac31afaf627363fbc1f757aa50078d343acf1f Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Mon, 11 Dec 2017 12:25:13 +0800 Subject: [PATCH 001/137] simply get the correct webpage, but not parsed to extract information --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/weibo.py | 97 ++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 youtube_dl/extractor/weibo.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2cc3bc463..12dc2e7e8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1286,6 +1286,7 @@ from .webofstories import ( WebOfStoriesIE, WebOfStoriesPlaylistIE, ) +from .weibo import WeiboIE from .weiqitv import WeiqiTVIE from .wimp import WimpIE from .wistia import WistiaIE diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py new file mode 100644 index 000000000..195508e99 --- /dev/null +++ b/youtube_dl/extractor/weibo.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from urllib.request import Request +from urllib.parse import urlencode +import json +import random as rnd + +class WeiboIE(InfoExtractor): + _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' + _TEST = { + 'url': 'https://weibo.com/6275294458/Fp6RGfbff?from=page_1005056275294458_profile&wvr=6&mod=weibotime&type=comment', + 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'info_dict': { + 'id': '42', + 'ext': 'mp4', + 'title': 'Video title goes here', + 'thumbnail': r're:^https?://.*\.jpg$', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + headers = { + 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36', + 'Upgrade-Insecure-Requests': '1', + } + # to get Referer url for genvisitor + webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="first visit the page") + + visitor_url = urlh.geturl() + + data = urlencode({ + "cb": "gen_callback", + "fp": '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', + }).encode() + headers = { + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': '*/*', + 'Referer': visitor_url, + } + + r_genvisitor = Request( + 'https://passport.weibo.com/visitor/genvisitor', + data = data, + headers = headers, + method = 'POST' + ) + webpage,urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") + print("webpage", webpage) + + p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" + i1 = p.find('{') + i2 = p.rfind('}') + j = p[i1:i2+1] # get JSON object + d = json.loads(j) + tid = d["data"]["tid"] + cnfd = "%03d" % d["data"]["confidence"] + + param = urlencode({ + 'a': 'incarnate', + 't': tid, + 'w': 2, + 'c': cnfd, + 'cb': 'cross_domain', + 'from': 'weibo', + '_rand': rnd.random() + }) + gencallback_url = "https://passport.weibo.com/visitor/visitor?" + param + webpage,urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") + print("webpage", webpage) + + webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") + print("webpage", webpage) + + # TODO more code goes here, for example ... 
+ title = self._html_search_regex(r'(.+?)', webpage, 'title') + + video_sources = self._search_regex(r'video-sources=(.+?)', webpage, 'video_sources') + print("video_sources:", video_sources) + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), + # TODO more properties (see youtube_dl/extractor/common.py) + } From 3281af3464a910cb88f22ef0ece4a8323c2a4d38 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Mon, 11 Dec 2017 15:56:54 +0800 Subject: [PATCH 002/137] a working version --- youtube_dl/extractor/weibo.py | 41 +++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 195508e99..9b398e931 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -5,24 +5,19 @@ from .common import InfoExtractor from urllib.request import Request from urllib.parse import urlencode +from urllib import parse import json import random as rnd +from os import path class WeiboIE(InfoExtractor): _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' _TEST = { - 'url': 'https://weibo.com/6275294458/Fp6RGfbff?from=page_1005056275294458_profile&wvr=6&mod=weibotime&type=comment', - 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', 'info_dict': { - 'id': '42', + 'id': 'Fp6RGfbff', 'ext': 'mp4', - 'title': 'Video title goes here', - 'thumbnail': r're:^https?://.*\.jpg$', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) + 'title': 'You should have servants to massage you,... 
来自Hosico_猫 - 微博', } } @@ -78,20 +73,34 @@ class WeiboIE(InfoExtractor): }) gencallback_url = "https://passport.weibo.com/visitor/visitor?" + param webpage,urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") - print("webpage", webpage) webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") - print("webpage", webpage) # TODO more code goes here, for example ... title = self._html_search_regex(r'(.+?)', webpage, 'title') - video_sources = self._search_regex(r'video-sources=(.+?)', webpage, 'video_sources') - print("video_sources:", video_sources) + video_sources_text = self._search_regex("video-sources=\\\\\"(.+?)\"", webpage, 'video_sources') + + video_formats = parse.parse_qs(video_sources_text) + + formats = [] + supported_resolutions = ['720', '480'] + for res in supported_resolutions: + f = video_formats.get(res) + if isinstance(f, list): + if len(f) > 0: + vid_url = f[0] + print("%s:%s" % (res, vid_url)) + formats.append({ + 'url': vid_url + }) + self._sort_formats(formats) + uploader = self._og_search_property('nick-name', webpage, 'uploader', default = None) + print(title, uploader) return { 'id': video_id, 'title': title, - 'description': self._og_search_description(webpage), - 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), + 'uploader': uploader, + 'formats': formats # TODO more properties (see youtube_dl/extractor/common.py) } From 0c69958844a446bc3373f45f8f750cbc3202d14e Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Mon, 11 Dec 2017 16:02:14 +0800 Subject: [PATCH 003/137] add other properties; remove print verbose --- youtube_dl/extractor/weibo.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 9b398e931..b835f8975 100644 --- a/youtube_dl/extractor/weibo.py +++ 
b/youtube_dl/extractor/weibo.py @@ -52,7 +52,6 @@ class WeiboIE(InfoExtractor): method = 'POST' ) webpage,urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") - print("webpage", webpage) p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" i1 = p.find('{') @@ -90,13 +89,13 @@ class WeiboIE(InfoExtractor): if isinstance(f, list): if len(f) > 0: vid_url = f[0] - print("%s:%s" % (res, vid_url)) formats.append({ - 'url': vid_url + 'url': vid_url, + 'format': 'mp4', + 'height': int(res), }) self._sort_formats(formats) uploader = self._og_search_property('nick-name', webpage, 'uploader', default = None) - print(title, uploader) return { 'id': video_id, 'title': title, From 17c3aced5d0d2cf7df41e9978500260756ee8ad9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 19 Dec 2017 22:53:04 +0700 Subject: [PATCH 004/137] [animeondemand] Relax login error regex --- youtube_dl/extractor/animeondemand.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 34c2b363e..be032d5b4 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -85,8 +85,8 @@ class AnimeOnDemandIE(InfoExtractor): if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): error = self._search_regex( - r'

(.+?)

', - response, 'error', default=None) + r']+\bclass=(["\'])(?:(?!\1).)*\balert\s(?:(?!\1).)*\1[^>]*>(?P.+?)

', + response, 'error', default=None, group='error') if error: raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') From d2d766bc6d6f976c28fad8b69a1de060b55f5b17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 20 Dec 2017 23:17:36 +0700 Subject: [PATCH 005/137] [animeondemand] Fix typo --- youtube_dl/extractor/animeondemand.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index be032d5b4..e4fa72f46 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -85,7 +85,7 @@ class AnimeOnDemandIE(InfoExtractor): if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): error = self._search_regex( - r']+\bclass=(["\'])(?:(?!\1).)*\balert\s(?:(?!\1).)*\1[^>]*>(?P.+?)

', + r']+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P.+?)

', response, 'error', default=None, group='error') if error: raise ExtractorError('Unable to login: %s' % error, expected=True) From 963d237d26c7e6da7b6f514c1d240a7046501b05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 Dec 2017 23:38:16 +0700 Subject: [PATCH 006/137] Add LICENSE, AUTHORS and ChangeLog to PyPI package (closes #15054) --- MANIFEST.in | 3 +++ setup.py | 1 + 2 files changed, 4 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index 5743f605a..af7518e0d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,7 @@ include README.md +include LICENSE +include AUTHORS +include ChangeLog include test/*.py include test/*.json include youtube-dl.bash-completion diff --git a/setup.py b/setup.py index 67d6633ed..7dbb5805f 100644 --- a/setup.py +++ b/setup.py @@ -109,6 +109,7 @@ setup( author_email='ytdl@yt-dl.org', maintainer='Sergey M.', maintainer_email='dstftw@gmail.com', + license='Unlicense', packages=[ 'youtube_dl', 'youtube_dl.extractor', 'youtube_dl.downloader', From 3e191da6d9d1cbe62d8f638ed68a93a46348b38b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 Dec 2017 23:46:08 +0700 Subject: [PATCH 007/137] [Makefile] Add AUTHORS to youtube-dl.tar.gz --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 1c760bef8..fe247810f 100644 --- a/Makefile +++ b/Makefile @@ -110,7 +110,7 @@ _EXTRACTOR_FILES = $(shell find youtube_dl/extractor -iname '*.py' -and -not -in youtube_dl/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES) $(PYTHON) devscripts/make_lazy_extractors.py $@ -youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish ChangeLog +youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish ChangeLog AUTHORS @tar -czf youtube-dl.tar.gz --transform 
"s|^|youtube-dl/|" --owner 0 --group 0 \ --exclude '*.DS_Store' \ --exclude '*.kate-swp' \ @@ -122,7 +122,7 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash- --exclude 'docs/_build' \ -- \ bin devscripts test youtube_dl docs \ - ChangeLog LICENSE README.md README.txt \ + ChangeLog AUTHORS LICENSE README.md README.txt \ Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion \ youtube-dl.zsh youtube-dl.fish setup.py setup.cfg \ youtube-dl From 9e3682d555d431514d9583170ae8be1b6fc12839 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 Dec 2017 23:53:27 +0700 Subject: [PATCH 008/137] [MANIFEST.in] Include all test data in PyPI package --- MANIFEST.in | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index af7518e0d..4e43e99f3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,9 +2,8 @@ include README.md include LICENSE include AUTHORS include ChangeLog -include test/*.py -include test/*.json include youtube-dl.bash-completion include youtube-dl.fish include youtube-dl.1 recursive-include docs Makefile conf.py *.rst +recursive-include test * From 4b7dd1705a7c16c1426ed7ed39e51e275124b4f3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 23 Dec 2017 13:21:33 +0100 Subject: [PATCH 009/137] [7plus] Add new extractor(closes #15043) --- youtube_dl/extractor/brightcove.py | 116 +++++++++++++++-------------- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/sevenplus.py | 67 +++++++++++++++++ 3 files changed, 128 insertions(+), 56 deletions(-) create mode 100644 youtube_dl/extractor/sevenplus.py diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0ed59bcbc..f04505011 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -464,7 +464,7 @@ class BrightcoveNewIE(AdobePassIE): 'timestamp': 1441391203, 'upload_date': '20150904', 'uploader_id': '929656772001', - 'formats': 
'mincount:22', + 'formats': 'mincount:20', }, }, { # with rtmp streams @@ -478,7 +478,7 @@ class BrightcoveNewIE(AdobePassIE): 'timestamp': 1433556729, 'upload_date': '20150606', 'uploader_id': '4036320279001', - 'formats': 'mincount:41', + 'formats': 'mincount:39', }, 'params': { # m3u8 download @@ -564,59 +564,7 @@ class BrightcoveNewIE(AdobePassIE): return entries - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) - - account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage( - 'http://players.brightcove.net/%s/%s_%s/index.min.js' - % (account_id, player_id, embed), video_id) - - policy_key = None - - catalog = self._search_regex( - r'catalog\(({.+?})\);', webpage, 'catalog', default=None) - if catalog: - catalog = self._parse_json( - js_to_json(catalog), video_id, fatal=False) - if catalog: - policy_key = catalog.get('policyKey') - - if not policy_key: - policy_key = self._search_regex( - r'policyKey\s*:\s*(["\'])(?P.+?)\1', - webpage, 'policy key', group='pk') - - api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) - try: - json_data = self._download_json(api_url, video_id, headers={ - 'Accept': 'application/json;pk=%s' % policy_key - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - json_data = self._parse_json(e.cause.read().decode(), video_id)[0] - message = json_data.get('message') or json_data['error_code'] - if json_data.get('error_subcode') == 'CLIENT_GEO': - self.raise_geo_restricted(msg=message) - raise ExtractorError(message, expected=True) - raise - - errors = json_data.get('errors') - if errors and errors[0].get('error_subcode') == 'TVE_AUTH': - custom_fields = json_data['custom_fields'] - tve_token = self._extract_mvpd_auth( - smuggled_data['source_url'], video_id, - 
custom_fields['bcadobepassrequestorid'], - custom_fields['bcadobepassresourceid']) - json_data = self._download_json( - api_url, video_id, headers={ - 'Accept': 'application/json;pk=%s' % policy_key - }, query={ - 'tveToken': tve_token, - }) - + def _parse_brightcove_metadata(self, json_data, video_id): title = json_data['name'].strip() formats = [] @@ -682,6 +630,7 @@ class BrightcoveNewIE(AdobePassIE): }) formats.append(f) + errors = json_data.get('errors') if not formats and errors: error = errors[0] raise ExtractorError( @@ -708,9 +657,64 @@ class BrightcoveNewIE(AdobePassIE): 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), 'duration': duration, 'timestamp': parse_iso8601(json_data.get('published_at')), - 'uploader_id': account_id, + 'uploader_id': json_data.get('account_id'), 'formats': formats, 'subtitles': subtitles, 'tags': json_data.get('tags', []), 'is_live': is_live, } + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, embed), video_id) + + policy_key = None + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P.+?)\1', + webpage, 'policy key', group='pk') + + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) + try: + json_data = self._download_json(api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code 
== 403: + json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + message = json_data.get('message') or json_data['error_code'] + if json_data.get('error_subcode') == 'CLIENT_GEO': + self.raise_geo_restricted(msg=message) + raise ExtractorError(message, expected=True) + raise + + errors = json_data.get('errors') + if errors and errors[0].get('error_subcode') == 'TVE_AUTH': + custom_fields = json_data['custom_fields'] + tve_token = self._extract_mvpd_auth( + smuggled_data['source_url'], video_id, + custom_fields['bcadobepassrequestorid'], + custom_fields['bcadobepassresourceid']) + json_data = self._download_json( + api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }, query={ + 'tveToken': tve_token, + }) + + return self._parse_brightcove_metadata(json_data, video_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 513074801..9ba1be2cd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -926,6 +926,7 @@ from .senateisvp import SenateISVPIE from .sendtonews import SendtoNewsIE from .servingsys import ServingSysIE from .servus import ServusIE +from .sevenplus import SevenPlusIE from .sexu import SexuIE from .shahid import ( ShahidIE, diff --git a/youtube_dl/extractor/sevenplus.py b/youtube_dl/extractor/sevenplus.py new file mode 100644 index 000000000..9792f820a --- /dev/null +++ b/youtube_dl/extractor/sevenplus.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .brightcove import BrightcoveNewIE +from ..utils import update_url_query + + +class SevenPlusIE(BrightcoveNewIE): + IE_NAME = '7plus' + _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P[^?]+\?.*?\bepisode-id=(?P[^&#]+))' + _TESTS = [{ + 'url': 'https://7plus.com.au/BEAT?episode-id=BEAT-001', + 'info_dict': { + 'id': 'BEAT-001', + 'ext': 'mp4', + 'title': 'S1 E1 - Help / Lucy In The Sky With Diamonds', + 'description': 
'md5:37718bea20a8eedaca7f7361af566131', + 'uploader_id': '5303576322001', + 'upload_date': '20171031', + 'timestamp': 1509440068, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + } + }, { + 'url': 'https://7plus.com.au/UUUU?episode-id=AUMS43-001', + 'only_matching': True, + }] + + def _real_extract(self, url): + path, episode_id = re.match(self._VALID_URL, url).groups() + + media = self._download_json( + 'https://videoservice.swm.digital/playback', episode_id, query={ + 'appId': '7plus', + 'deviceType': 'web', + 'platformType': 'web', + 'accountId': 5303576322001, + 'referenceId': 'ref:' + episode_id, + 'deliveryId': 'csai', + 'videoType': 'vod', + })['media'] + + for source in media.get('sources', {}): + src = source.get('src') + if not src: + continue + source['src'] = update_url_query(src, {'rule': ''}) + + info = self._parse_brightcove_metadata(media, episode_id) + + content = self._download_json( + 'https://component-cdn.swm.digital/content/' + path, + episode_id, headers={ + 'market-id': 4, + }, fatal=False) or {} + for item in content.get('items', {}): + if item.get('componentData', {}).get('componentType') == 'infoPanel': + for src_key, dst_key in [('title', 'title'), ('shortSynopsis', 'description')]: + value = item.get(src_key) + if value: + info[dst_key] = value + + return info From 2132edaa03857085821b6a1214ce1410e0c2e463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Dec 2017 20:57:35 +0700 Subject: [PATCH 010/137] [extractor/common] Move X-Forwarded-For setup code into _request_webpage --- youtube_dl/extractor/common.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e5ef5e490..3b79b8cb4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -495,6 +495,16 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, 
note)) + + # Some sites check X-Forwarded-For HTTP header in order to figure out + # the origin of the client behind proxy. This allows bypassing geo + # restriction by faking this header's value to IP that belongs to some + # geo unrestricted country. We will do so once we encounter any + # geo restriction error. + if self._x_forwarded_for_ip: + if 'X-Forwarded-For' not in headers: + headers['X-Forwarded-For'] = self._x_forwarded_for_ip + if isinstance(url_or_request, compat_urllib_request.Request): url_or_request = update_Request( url_or_request, data=data, headers=headers, query=query) @@ -524,15 +534,6 @@ class InfoExtractor(object): if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - # Some sites check X-Forwarded-For HTTP header in order to figure out - # the origin of the client behind proxy. This allows bypassing geo - # restriction by faking this header's value to IP that belongs to some - # geo unrestricted country. We will do so once we encounter any - # geo restriction error. 
- if self._x_forwarded_for_ip: - if 'X-Forwarded-For' not in headers: - headers['X-Forwarded-For'] = self._x_forwarded_for_ip - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) if urlh is False: assert not fatal From 5c5e60cff894e5372f89e6ba45d7ab6575c0a0b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Dec 2017 20:59:14 +0700 Subject: [PATCH 011/137] [voot] Fix video identification --- youtube_dl/extractor/voot.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py index 426754489..751b21ee5 100644 --- a/youtube_dl/extractor/voot.py +++ b/youtube_dl/extractor/voot.py @@ -16,7 +16,7 @@ class VootIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', 'info_dict': { - 'id': '441353', + 'id': '0_8ledb18o', 'ext': 'mp4', 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', @@ -59,9 +59,10 @@ class VootIE(InfoExtractor): media = media_info['assets'] + entry_id = media['EntryId'] title = media['MediaName'] formats = self._extract_m3u8_formats( - 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + media['EntryId'], + 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id, video_id, 'mp4', m3u8_id='hls') self._sort_formats(formats) @@ -83,7 +84,8 @@ class VootIE(InfoExtractor): episode_number = int_or_none(value) return { - 'id': video_id, + 'extractor_key': 'Kaltura', + 'id': entry_id, 'title': title, 'description': description, 'series': series, From 69d69da98aa093c05776371beac1f2ffb4f5eea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Dec 2017 21:16:32 +0700 Subject: [PATCH 012/137] [kaltura] Add another embed pattern for entry_id For cases when player 
configuration map is setup via indexing operator, e.g. kalturaPlayerConfiguration_1_lre6rg3i_10[entry_id] = 1_lre6rg3i (see https://www.heise.de/video/artikel/odcast-c-t-uplink-20-1-Apple-CarPlay-vs-Android-Auto-Galileo-3D-Sound-erklaert-3919694.html) --- youtube_dl/extractor/kaltura.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index bdac2df3e..e369959e3 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -125,9 +125,12 @@ class KalturaIE(InfoExtractor): (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+)(?:(?!(?P=q1)).)* (?P=q1).*? (?: - entry_?[Ii]d| - (?P["'])entry_?[Ii]d(?P=q2) - )\s*:\s* + (?: + entry_?[Ii]d| + (?P["'])entry_?[Ii]d(?P=q2) + )\s*:\s*| + \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]?\s*=\s* + ) (?P["'])(?P(?:(?!(?P=q3)).)+)(?P=q3) ''', webpage) or re.search( From f5a6321107db17ec8efaccaa2a4febc64b5aa5ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Dec 2017 21:17:28 +0700 Subject: [PATCH 013/137] [ChangeLog] Actualize --- ChangeLog | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 03d2defb7..2d62f2fde 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +version + +Core +* [extractor/common] Move X-Forwarded-For setup code into _request_webpage ++ [YoutubeDL] Add support for playlist_uploader and playlist_uploader_id in + output template (#11427, #15018) ++ [extractor/common] Introduce uploader, uploader_id and uploader_url + meta fields for playlists (#11427, #15018) +* [downloader/fragment] Encode filename of fragment being removed (#15020) ++ [utils] Add another date format pattern (#14999) + +Extractors ++ [kaltura] Add another embed pattern for entry_id ++ [7plus] Add support for 7plus.com.au (#15043) +* [animeondemand] Relax login error regular expression ++ 
[shahid] Add support for show pages (#7401) ++ [youtube] Extract uploader, uploader_id and uploader_url for playlists + (#11427, #15018) +* [afreecatv] Improve format extraction (#15019) ++ [cspan] Add support for audio only pages and catch page errors (#14995) ++ [mailru] Add support for embed URLs (#14904) +* [crunchyroll] Future-proof XML element checks (#15013) +* [cbslocal] Fix timestamp extraction (#14999, #15000) +* [discoverygo] Correct TTML subtitle extension +* [vk] Make view count optional (#14979) +* [disney] Skip Apple FairPlay formats (#14982) +* [voot] Fix format extraction (#14758) + + version 2017.12.14 Core @@ -148,8 +177,8 @@ Extractors + [fxnetworks] Extract series metadata (#14603) + [younow] Add support for younow.com (#9255, #9432, #12436) * [dctptv] Fix extraction (#14599) -* [youtube] Restrict embed regex (#14600) -* [vimeo] Restrict iframe embed regex (#14600) +* [youtube] Restrict embed regular expression (#14600) +* [vimeo] Restrict iframe embed regular expression (#14600) * [soundgasm] Improve extraction (#14588) - [myvideo] Remove extractor (#8557) + [nbc] Add support for classic-tv videos (#14575) From c2f2f8b120628f7e0e4b0a6f7184884fa976d9c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Dec 2017 21:22:41 +0700 Subject: [PATCH 014/137] [kaltura] Fix typo --- youtube_dl/extractor/kaltura.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index e369959e3..562e25f6d 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -129,7 +129,7 @@ class KalturaIE(InfoExtractor): entry_?[Ii]d| (?P["'])entry_?[Ii]d(?P=q2) )\s*:\s*| - \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]?\s*=\s* + \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* ) (?P["'])(?P(?:(?!(?P=q3)).)+)(?P=q3) ''', webpage) or From 307a7588b0a9205688e8ebc2539c1b0e19f68a6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 
Dec 2017 21:24:18 +0700 Subject: [PATCH 015/137] release 2017.12.23 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f37d8aa42..d7a91239f 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.14** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.23** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.12.14 +[debug] youtube-dl version 2017.12.23 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 2d62f2fde..ba64f3e02 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.12.23 Core * [extractor/common] Move X-Forwarded-For setup code into _request_webpage diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ebddd5b9d..eac35e390 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -10,6 +10,7 @@ - **56.com** - **5min** - **6play** + - **7plus** - **8tracks** - **91porn** - **9c9media** @@ -728,6 +729,7 @@ - **Servus** - **Sexu** - **Shahid** + - **ShahidShow** - **Shared**: shared.sx - **ShowRoomLive** - **Sina** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2b5a63464..f999584d7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.12.14' +__version__ = '2017.12.23' From 0e25a1a27875369a4fdf11b6a7fcfc969b1f482d Mon Sep 17 00:00:00 2001 From: JianxinLi Date: Mon, 4 Dec 2017 10:33:56 +0800 Subject: [PATCH 016/137] [youku] Update ccode Change-Id: Id397e814e81ff560506d68563b7409eebbe5943d --- 
youtube_dl/extractor/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index f0ba01197..9d0caee93 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0501', + 'ccode': '0507', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, From 116561697d605f22f749e3d092e8e4795ca0573d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 23 Dec 2017 23:41:24 +0800 Subject: [PATCH 017/137] [ChangeLog] Update after #14903 --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index ba64f3e02..658c00c5b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors +* [youku] Update ccode + + version 2017.12.23 Core From b954e72c8731e65b9b0548a537cd0e3275b54e4d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 23 Dec 2017 23:42:02 +0800 Subject: [PATCH 018/137] [ChangeLog] typo --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 658c00c5b..cb750e270 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version Extractors * [youku] Update ccode From 273c23d960cbd2da18fadaef002473db41b5f56b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 24 Dec 2017 13:53:27 +0700 Subject: [PATCH 019/137] [openload] Add support for oload.stream (closes #15070) --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index a99af12a4..aed579f36 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -242,7 +242,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.tv)/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -289,6 +289,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'http://www.openload.link/f/KnG-kKZdcfY', 'only_matching': True, + }, { + 'url': 'https://oload.stream/f/KnG-kKZdcfY', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From a75419586bb900df711de49adf5047afa9f083ef Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 24 Dec 2017 20:47:42 +0800 Subject: [PATCH 020/137] [openload] Remove a confusing exception If phantomjs is not installed, there's an error besides the missing phantomjs exception: Exception ignored in: > Traceback (most recent call last): File "/home/yen/Projects/youtube-dl/youtube_dl/extractor/openload.py", line 142, in __del__ os.remove(self._TMP_FILES[name].name) AttributeError: 'PhantomJSwrapper' object has no attribute '_TMP_FILES' --- youtube_dl/extractor/openload.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index aed579f36..d1eb3be25 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -112,6 +112,8 @@ class PhantomJSwrapper(object): return get_exe_version('phantomjs', version_re=r'([0-9.]+)') def __init__(self, extractor, required_version=None, timeout=10000): + self._TMP_FILES = {} + self.exe = check_executable('phantomjs', ['-v']) if not self.exe: raise ExtractorError('PhantomJS executable not found in PATH, ' @@ -130,7 +132,6 @@ class PhantomJSwrapper(object): self.options = { 'timeout': timeout, } - self._TMP_FILES = {} for name in self._TMP_FILE_NAMES: tmp = tempfile.NamedTemporaryFile(delete=False) tmp.close() @@ -140,7 
+141,7 @@ class PhantomJSwrapper(object): for name in self._TMP_FILE_NAMES: try: os.remove(self._TMP_FILES[name].name) - except (IOError, OSError): + except (IOError, OSError, KeyError): pass def _save_cookies(self, url): From d99a1000c7522cb37910afe772d7317687521eb0 Mon Sep 17 00:00:00 2001 From: JianxinLi Date: Sun, 24 Dec 2017 00:30:27 +0800 Subject: [PATCH 021/137] [youku] Fix list extraction.(close #15065) Change-Id: I578fdc5b69509bdcd8d3191e3917afe47c234ff6 --- youtube_dl/extractor/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 9d0caee93..3e64cce38 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -276,9 +276,9 @@ class YoukuShowIE(InfoExtractor): r']+id="(reload_\d+)', first_page, 'first page reload id') # The first reload_id has the same items as first_page reload_ids = re.findall(']+data-id="([^"]+)">', first_page) + entries.extend(initial_entries) for idx, reload_id in enumerate(reload_ids): if reload_id == first_page_reload_id: - entries.extend(initial_entries) continue _, new_entries = self._extract_entries( 'http://list.youku.com/show/episode', show_id, From d3ca28323545a36819d9c32797907bc190095b5c Mon Sep 17 00:00:00 2001 From: JianxinLi Date: Mon, 25 Dec 2017 21:39:10 +0800 Subject: [PATCH 022/137] [youku] Add test case. Some playlist has no data-id value. Change-Id: I97455f2907f08bda03b538cdc13ec827e2f8ce26 --- youtube_dl/extractor/youku.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 3e64cce38..c7947d4a1 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -241,6 +241,10 @@ class YoukuShowIE(InfoExtractor): # Ongoing playlist. The initial page is the last one 'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html', 'only_matching': True, + }, { + # No data-id value. 
+ 'url': 'http://list.youku.com/show/id_zefbfbd61237fefbfbdef.html', + 'only_matching': True, }] def _extract_entries(self, playlist_data_url, show_id, note, query): From 173558ce9620bf1b22ba2d4c67288e2a45c715fc Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 25 Dec 2017 22:06:18 +0800 Subject: [PATCH 023/137] [ChangeLog] Update after #15065 --- ChangeLog | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index cb750e270..420a1bd11 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,8 @@ version Extractors -* [youku] Update ccode +* [youku] Update ccode (#14880) +* [youku] Fix list extraction (#15065) version 2017.12.23 From 0f897e0929b2a3ebcae616f8b1bbdac8cd9c6f75 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 25 Dec 2017 23:28:51 +0100 Subject: [PATCH 024/137] [espn] add support for espnfc and extract more formats(closes #8053) --- youtube_dl/extractor/espn.py | 70 +++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 7a7436068..0e135b8bc 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -1,6 +1,9 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from .once import OnceIE from ..compat import compat_str from ..utils import ( determine_ext, @@ -9,22 +12,27 @@ from ..utils import ( ) -class ESPNIE(InfoExtractor): +class ESPNIE(OnceIE): _VALID_URL = r'''(?x) https?:// - (?: - (?:(?:\w+\.)+)?espn\.go| - (?:www\.)?espn - )\.com/ (?: (?: - video/clip| - watch/player - ) - (?: - \?.*?\bid=| - /_/id/ - ) + (?: + (?:(?:\w+\.)+)?espn\.go| + (?:www\.)?espn + )\.com/ + (?: + (?: + video/(?:clip|iframe/twitter)| + watch/player + ) + (?: + .*?\?.*?\bid=| + /_/id/ + ) + ) + )| + (?:www\.)espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/ ) (?P\d+) ''' @@ -77,6 +85,15 @@ class ESPNIE(InfoExtractor): }, { 'url': 
'http://www.espn.com/video/clip/_/id/17989860', 'only_matching': True, + }, { + 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', + 'only_matching': True, + }, { + 'url': 'http://www.espnfc.us/video/espn-fc-tv/86/video/3319154/nashville-unveiled-as-the-newest-club-in-mls', + 'only_matching': True, + }, { + 'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets', + 'only_matching': True, }] def _real_extract(self, url): @@ -93,7 +110,9 @@ class ESPNIE(InfoExtractor): def traverse_source(source, base_source_id=None): for source_id, source in source.items(): - if isinstance(source, compat_str): + if source_id == 'alert': + continue + elif isinstance(source, compat_str): extract_source(source, base_source_id) elif isinstance(source, dict): traverse_source( @@ -106,7 +125,9 @@ class ESPNIE(InfoExtractor): return format_urls.add(source_url) ext = determine_ext(source_url) - if ext == 'smil': + if OnceIE.suitable(source_url): + formats.extend(self._extract_once_formats(source_url)) + elif ext == 'smil': formats.extend(self._extract_smil_formats( source_url, video_id, fatal=False)) elif ext == 'f4m': @@ -117,12 +138,24 @@ class ESPNIE(InfoExtractor): source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=source_id, fatal=False)) else: - formats.append({ + f = { 'url': source_url, 'format_id': source_id, - }) + } + mobj = re.search(r'(\d+)p(\d+)_(\d+)k\.', source_url) + if mobj: + f.update({ + 'height': int(mobj.group(1)), + 'fps': int(mobj.group(2)), + 'tbr': int(mobj.group(3)), + }) + if source_id == 'mezzanine': + f['preference'] = 1 + formats.append(f) - traverse_source(clip['links']['source']) + links = clip.get('links', {}) + traverse_source(links.get('source', {})) + traverse_source(links.get('mobile', {})) self._sort_formats(formats) description = clip.get('caption') or clip.get('description') @@ -144,9 +177,6 @@ class ESPNIE(InfoExtractor): class 
ESPNArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/(?:[^/]+/)*(?P[^/]+)' _TESTS = [{ - 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', - 'only_matching': True, - }, { 'url': 'http://espn.go.com/nba/recap?gameId=400793786', 'only_matching': True, }, { From 447a5a710dcd05741ea8cefa2fe98b333534e07d Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 16:18:35 +0800 Subject: [PATCH 025/137] added weibo mobile site support --- youtube_dl/extractor/extractors.py | 5 +++- youtube_dl/extractor/weibo.py | 46 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 12dc2e7e8..f1ea735b5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1286,7 +1286,10 @@ from .webofstories import ( WebOfStoriesIE, WebOfStoriesPlaylistIE, ) -from .weibo import WeiboIE +from .weibo import ( + WeiboIE, + WeiboMobileIE +) from .weiqitv import WeiqiTVIE from .wimp import WimpIE from .wistia import WistiaIE diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index b835f8975..eda0fa63d 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -9,6 +9,11 @@ from urllib import parse import json import random as rnd from os import path +import re + +from ..utils import ( + js_to_json, +) class WeiboIE(InfoExtractor): _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' @@ -103,3 +108,44 @@ class WeiboIE(InfoExtractor): 'formats': formats # TODO more properties (see youtube_dl/extractor/common.py) } + +class WeiboMobileIE(InfoExtractor): + _VALID_URL = r'https?://m.weibo.cn/status/(?P[0-9]+)(\?.+)?' 
+ _TEST = { + 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', + 'info_dict': { + 'id': '4189191225395228', + 'ext': 'mp4', + 'title': '午睡当然是要甜甜蜜蜜的啦', + 'uploader': '柴犬柴犬' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + headers = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8', + 'Upgrade-Insecure-Requests': '1', + } + # to get Referer url for genvisitor + webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="visit the page") + js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags = re.DOTALL) + weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) + page_info = weibo_info['status']['page_info'] + title = weibo_info['status']['status_title'] + format = { + 'url': page_info['media_info']['stream_url'], + 'format': 'mp4', + } + formats = [format] + uploader = weibo_info['status']['user']['screen_name'] + + return { + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'formats': formats + # TODO more properties (see youtube_dl/extractor/common.py) + } From d2be5bb5af7a1d7108b272315265e103a4358b28 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 16:28:47 +0800 Subject: [PATCH 026/137] change to use compat urllib --- youtube_dl/extractor/weibo.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index eda0fa63d..6a4e0a4cb 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -3,14 +3,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from urllib.request import Request -from urllib.parse import 
urlencode -from urllib import parse import json import random as rnd from os import path import re +from ..compat import ( + compat_urllib_parse_urlencode as urlencode, + compat_urllib_request as Request, + compat_urlparse as parse, +) from ..utils import ( js_to_json, ) From 951043724f91b3cfce60cf62cc3228a91a04ae81 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 16:38:51 +0800 Subject: [PATCH 027/137] re-format code to pass flake8 --- youtube_dl/extractor/weibo.py | 103 +++++++++++++++++----------------- 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 6a4e0a4cb..b4ac7b9fa 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -5,7 +5,6 @@ from .common import InfoExtractor import json import random as rnd -from os import path import re from ..compat import ( @@ -17,16 +16,17 @@ from ..utils import ( js_to_json, ) + class WeiboIE(InfoExtractor): _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' _TEST = { - 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', - 'info_dict': { - 'id': 'Fp6RGfbff', - 'ext': 'mp4', - 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', - } - } + 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', + 'info_dict': { + 'id': 'Fp6RGfbff', + 'ext': 'mp4', + 'title': 'You should have servants to massage you,... 
来自Hosico_猫 - 微博', + } + } def _real_extract(self, url): video_id = self._match_id(url) @@ -38,32 +38,32 @@ class WeiboIE(InfoExtractor): 'Upgrade-Insecure-Requests': '1', } # to get Referer url for genvisitor - webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="first visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="first visit the page") visitor_url = urlh.geturl() data = urlencode({ "cb": "gen_callback", "fp": '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', - }).encode() + }).encode() headers = { - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept': '*/*', - 'Referer': visitor_url, - } + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': '*/*', + 'Referer': visitor_url, + } r_genvisitor = Request( 'https://passport.weibo.com/visitor/genvisitor', - data = data, - headers = headers, - method = 'POST' - ) - webpage,urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") + data=data, + headers=headers, + method='POST' + ) + webpage, urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") - p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" + p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" i1 = p.find('{') i2 = p.rfind('}') - j = p[i1:i2+1] # get JSON object + j = p[i1:i2 + 1] # get JSON object d = json.loads(j) tid = d["data"]["tid"] cnfd = "%03d" % d["data"]["confidence"] @@ -76,17 +76,17 @@ class WeiboIE(InfoExtractor): 'cb': 'cross_domain', 'from': 'weibo', '_rand': rnd.random() - }) + }) gencallback_url = "https://passport.weibo.com/visitor/visitor?" 
+ param - webpage,urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") + webpage, urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") - webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") # TODO more code goes here, for example ... title = self._html_search_regex(r'(.+?)', webpage, 'title') video_sources_text = self._search_regex("video-sources=\\\\\"(.+?)\"", webpage, 'video_sources') - + video_formats = parse.parse_qs(video_sources_text) formats = [] @@ -100,28 +100,29 @@ class WeiboIE(InfoExtractor): 'url': vid_url, 'format': 'mp4', 'height': int(res), - }) + }) self._sort_formats(formats) - uploader = self._og_search_property('nick-name', webpage, 'uploader', default = None) + uploader = self._og_search_property('nick-name', webpage, 'uploader', default=None) return { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'formats': formats - # TODO more properties (see youtube_dl/extractor/common.py) - } + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'formats': formats + # TODO more properties (see youtube_dl/extractor/common.py) + } + class WeiboMobileIE(InfoExtractor): _VALID_URL = r'https?://m.weibo.cn/status/(?P[0-9]+)(\?.+)?' 
_TEST = { - 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', - 'info_dict': { - 'id': '4189191225395228', - 'ext': 'mp4', - 'title': '午睡当然是要甜甜蜜蜜的啦', - 'uploader': '柴犬柴犬' - } - } + 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', + 'info_dict': { + 'id': '4189191225395228', + 'ext': 'mp4', + 'title': '午睡当然是要甜甜蜜蜜的啦', + 'uploader': '柴犬柴犬' + } + } def _real_extract(self, url): video_id = self._match_id(url) @@ -132,22 +133,22 @@ class WeiboMobileIE(InfoExtractor): 'Upgrade-Insecure-Requests': '1', } # to get Referer url for genvisitor - webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="visit the page") - js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags = re.DOTALL) + webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="visit the page") + js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) page_info = weibo_info['status']['page_info'] title = weibo_info['status']['status_title'] format = { 'url': page_info['media_info']['stream_url'], - 'format': 'mp4', - } + 'format': 'mp4', + } formats = [format] uploader = weibo_info['status']['user']['screen_name'] return { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'formats': formats - # TODO more properties (see youtube_dl/extractor/common.py) - } + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'formats': formats + # TODO more properties (see youtube_dl/extractor/common.py) + } From 25936512245fc571ab716d59e2d73c50d8cad6ce Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 16:46:01 +0800 Subject: 
[PATCH 028/137] fix compat_urllib_request for python2.7 --- youtube_dl/extractor/weibo.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index b4ac7b9fa..f8a5ee71c 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -9,7 +9,7 @@ import re from ..compat import ( compat_urllib_parse_urlencode as urlencode, - compat_urllib_request as Request, + compat_urllib_request as request, compat_urlparse as parse, ) from ..utils import ( @@ -52,11 +52,10 @@ class WeiboIE(InfoExtractor): 'Referer': visitor_url, } - r_genvisitor = Request( + r_genvisitor = request.Request( 'https://passport.weibo.com/visitor/genvisitor', data=data, headers=headers, - method='POST' ) webpage, urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") @@ -85,7 +84,7 @@ class WeiboIE(InfoExtractor): # TODO more code goes here, for example ... title = self._html_search_regex(r'(.+?)', webpage, 'title') - video_sources_text = self._search_regex("video-sources=\\\\\"(.+?)\"", webpage, 'video_sources') + video_sources_text = self._search_regex(r'video-sources=\\\"(.+?)\"', webpage, 'video_sources') video_formats = parse.parse_qs(video_sources_text) From 45d20488f188b680daa39c5b9fa88d0bba102ab5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 26 Dec 2017 12:32:04 +0100 Subject: [PATCH 029/137] [umg:de] Add new extractor(closes #11582)(closes #11584) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/umg.py | 103 +++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 youtube_dl/extractor/umg.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9ba1be2cd..3269ed743 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1143,6 +1143,7 @@ from .udemy import ( from .udn import UDNEmbedIE from .uktvplay import UKTVPlayIE from .digiteka import 
DigitekaIE +from .umg import UMGDeIE from .unistra import UnistraIE from .unity import UnityIE from .uol import UOLIE diff --git a/youtube_dl/extractor/umg.py b/youtube_dl/extractor/umg.py new file mode 100644 index 000000000..d815cd9a6 --- /dev/null +++ b/youtube_dl/extractor/umg.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_filesize, + parse_iso8601, +) + + +class UMGDeIE(InfoExtractor): + IE_NAME = 'umg:de' + IE_DESC = 'Universal Music Deutschland' + _VALID_URL = r'https?://(?:www\.)?universal-music\.de/[^/]+/videos/[^/?#]+-(?P\d+)' + _TEST = { + 'url': 'https://www.universal-music.de/sido/videos/jedes-wort-ist-gold-wert-457803', + 'md5': 'ebd90f48c80dcc82f77251eb1902634f', + 'info_dict': { + 'id': '457803', + 'ext': 'mp4', + 'title': 'Jedes Wort ist Gold wert', + 'timestamp': 1513591800, + 'upload_date': '20171218', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'https://api.universal-music.de/graphql', + video_id, query={ + 'query': '''{ + universalMusic(channel:16) { + video(id:%s) { + headline + formats { + formatId + url + type + width + height + mimeType + fileSize + } + duration + createdDate + } + } +}''' % video_id})['data']['universalMusic']['video'] + + title = video_data['headline'] + hls_url_template = 'http://mediadelivery.universal-music-services.de/vod/mp4:autofill/storage/' + '/'.join(list(video_id)) + '/content/%s/file/playlist.m3u8' + + thumbnails = [] + formats = [] + + def add_m3u8_format(format_id): + m3u8_formats = self._extract_m3u8_formats( + hls_url_template % format_id, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal='False') + if m3u8_formats and m3u8_formats[0].get('height'): + formats.extend(m3u8_formats) + + for f in video_data.get('formats', []): + f_url = f.get('url') + mime_type = f.get('mimeType') + if not f_url or mime_type == 
'application/mxf': + continue + fmt = { + 'url': f_url, + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'filesize': parse_filesize(f.get('fileSize')), + } + f_type = f.get('type') + if f_type == 'Image': + thumbnails.append(fmt) + elif f_type == 'Video': + format_id = f.get('formatId') + if format_id: + fmt['format_id'] = format_id + if mime_type == 'video/mp4': + add_m3u8_format(format_id) + urlh = self._request_webpage(f_url, video_id, fatal=False) + if urlh: + first_byte = urlh.read(1) + if first_byte not in (b'F', b'\x00'): + continue + formats.append(fmt) + if not formats: + for format_id in (867, 836, 940): + add_m3u8_format(format_id) + self._sort_formats(formats, ('width', 'height', 'filesize', 'tbr')) + + return { + 'id': video_id, + 'title': title, + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': parse_iso8601(video_data.get('createdDate'), ' '), + 'thumbnails': thumbnails, + 'formats': formats, + } From db145ee54a57f5ccc89639de8c589eb111a91b19 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 26 Dec 2017 14:20:21 +0100 Subject: [PATCH 030/137] [espn] Add new extractor for http://fivethirtyeight.com(closes #6864) --- youtube_dl/extractor/espn.py | 31 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 32 insertions(+) diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 0e135b8bc..127c69b2e 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -205,3 +205,34 @@ class ESPNArticleIE(InfoExtractor): return self.url_result( 'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key()) + + +class FiveThirtyEightIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fivethirtyeight\.com/features/(?P[^/?#]+)' + _TEST = { + 'url': 'http://fivethirtyeight.com/features/how-the-6-8-raiders-can-still-make-the-playoffs/', + 'info_dict': { + 'id': '21846851', + 'ext': 'mp4', + 'title': 'FiveThirtyEight: The Raiders 
can still make the playoffs', + 'description': 'Neil Paine breaks down the simplest scenario that will put the Raiders into the playoffs at 8-8.', + 'timestamp': 1513960621, + 'upload_date': '20171222', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'data-video-id=["\'](?P\d+)', + webpage, 'video id', group='id') + + return self.url_result( + 'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key()) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3269ed743..91bd3287c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -322,6 +322,7 @@ from .escapist import EscapistIE from .espn import ( ESPNIE, ESPNArticleIE, + FiveThirtyEightIE, ) from .esri import EsriVideoIE from .etonline import ETOnlineIE From a14001a5a13b1639dc98b75b0775d251487aad1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A9stin=20Reed?= Date: Wed, 17 Aug 2016 16:02:59 +0200 Subject: [PATCH 031/137] [Filmweb] Add extractor --- youtube_dl/extractor/extractors.py | 2 ++ youtube_dl/extractor/filmweb.py | 45 ++++++++++++++++++++++++ youtube_dl/extractor/twentythreevideo.py | 45 ++++++++++++++++++++++++ 3 files changed, 92 insertions(+) create mode 100644 youtube_dl/extractor/filmweb.py create mode 100644 youtube_dl/extractor/twentythreevideo.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 91bd3287c..04125d9f2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -345,6 +345,7 @@ from .filmon import ( FilmOnIE, FilmOnChannelIE, ) +from .filmweb import FilmwebIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE from .fivetv import FiveTVIE @@ -1120,6 +1121,7 @@ from .tvplayer import TVPlayerIE from 
.tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE +from .twentythreevideo import TwentyThreeVideoIE from .twitch import ( TwitchVideoIE, TwitchChapterIE, diff --git a/youtube_dl/extractor/filmweb.py b/youtube_dl/extractor/filmweb.py new file mode 100644 index 000000000..a3d9f872e --- /dev/null +++ b/youtube_dl/extractor/filmweb.py @@ -0,0 +1,45 @@ +from __future__ import unicode_literals + +from .twentythreevideo import TwentyThreeVideoIE + + +class FilmwebIE(TwentyThreeVideoIE): + IE_NAME = 'Filmweb' + _VALID_URL = r'https?://(?:www\.)?filmweb\.no/trailere/article(?P\d+).ece' + _TEST = { + 'url': 'http://www.filmweb.no/trailere/article1264921.ece', + 'md5': 'e353f47df98e557d67edaceda9dece89', + 'info_dict': { + 'id': '1264921', + 'title': 'Det som en gang var', + 'ext': 'mp4', + 'description': 'Trailer: Scener fra et vennskap', + } + } + + _CLIENT_NAME = 'filmweb' + _CLIENT_ID = '12732917' + _EMBED_BASE_URL = 'http://www.filmweb.no/template/ajax/json_trailerEmbed.jsp?articleId=%s&autoplay=true' + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + + title = self._search_regex(r'var\s+jsTitle\s*=\s*escape\("([^"]+)"\);', + webpage, 'title', fatal=True) + + format_url = self._proto_relative_url( + self._html_search_regex(r'"(//filmweb\.23video\.com/[^"]+)"', + self._download_json(self._EMBED_BASE_URL % article_id, + article_id)['embedCode'], 'format url')) + + formats = self._extract_formats(format_url, self._CLIENT_ID) + self._sort_formats(formats) + + return { + 'id': article_id, + 'title': title, + 'alt_title': self._og_search_title(webpage), + 'formats': formats, + 'description': self._og_search_description(webpage), + } diff --git a/youtube_dl/extractor/twentythreevideo.py b/youtube_dl/extractor/twentythreevideo.py new file mode 100644 index 000000000..2bad2dbd6 --- /dev/null +++ b/youtube_dl/extractor/twentythreevideo.py @@ -0,0 
+1,45 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TwentyThreeVideoIE(InfoExtractor): + IE_NAME = '23video' + _VALID_URL = r'https?://(?:www\.)?(?P[\w-]+)\.23video\.com/v.ihtml/player.html.*photo_id=(?P\d+)' + _TEST = {} + + _URL_TEMPLATE = 'https://%s.23video.com/%s/%s/%s/%s/download-video.mp4' + _FORMATS = { + 'video_hd': { + 'width': 1280, + 'height': 720, + }, + 'video_medium': { + 'width': 640, + 'height': 360, + }, + 'video_mobile_high': { + 'width': 320, + 'height': 180, + } + } + + def _extract_formats(self, url, client_id): + client_name = self._search_regex(r'([a-z]+)\.23video\.com', url, 'client name') + video_id = self._search_regex(r'photo%5fid=([^?&]+)', url, 'video id') + token = self._search_regex(r'token=([^?&]+)', url, 'token') + + formats = [] + for format_key in self._FORMATS.keys(): + formats.append({ + 'url': self._URL_TEMPLATE % (client_name, client_id, video_id, + token, format_key), + 'width': self._FORMATS.get(format_key, {}).get('width'), + 'height': self._FORMATS.get(format_key, {}).get('height'), + }) + + return formats + + def _real_extract(self, url): + # TODO: Find out how to extract client_id + raise NotImplementedError('Not able to extract the `client_id`') From 42a1012c7767306626c5358a18ad3e86417bd7b7 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 22:20:43 +0800 Subject: [PATCH 032/137] fix according to "https://github.com/rg3/youtube-dl/pull/15079#discussion_r158688607" --- youtube_dl/extractor/weibo.py | 85 +++++++++++++---------------------- 1 file changed, 32 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index f8a5ee71c..2be31fe77 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -4,13 +4,12 @@ from __future__ import unicode_literals from .common import InfoExtractor import json -import random as rnd +import random import re from 
..compat import ( - compat_urllib_parse_urlencode as urlencode, - compat_urllib_request as request, - compat_urlparse as parse, + compat_urllib_parse_urlencode, + compat_urlparse, ) from ..utils import ( js_to_json, @@ -30,34 +29,28 @@ class WeiboIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - headers = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36', - 'Upgrade-Insecure-Requests': '1', - } # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="first visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id, note="first visit the page") visitor_url = urlh.geturl() - - data = urlencode({ - "cb": "gen_callback", - "fp": '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', - }).encode() headers = { - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept': '*/*', - 'Referer': visitor_url, + 'Referer': visitor_url } - r_genvisitor = request.Request( - 'https://passport.weibo.com/visitor/genvisitor', - data=data, - headers=headers, - ) - webpage, urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") + fp = { + "os": "2", + "browser": "Gecko57,0,0,0", + "fonts": "undefined", + "screenInfo": "1440*900*24", + "plugins": "" + } + data = compat_urllib_parse_urlencode({ + "cb": "gen_callback", + "fp": json.dumps(fp), + }).encode() + + genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' + webpage, _ = self._download_webpage_handle(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" i1 = p.find('{') @@ 
-67,29 +60,28 @@ class WeiboIE(InfoExtractor): tid = d["data"]["tid"] cnfd = "%03d" % d["data"]["confidence"] - param = urlencode({ + query = { 'a': 'incarnate', 't': tid, 'w': 2, 'c': cnfd, 'cb': 'cross_domain', 'from': 'weibo', - '_rand': rnd.random() - }) - gencallback_url = "https://passport.weibo.com/visitor/visitor?" + param - webpage, urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") + '_rand': random.random() + } + gencallback_url = "https://passport.weibo.com/visitor/visitor" + self._download_webpage_handle(gencallback_url, video_id, note="gen callback", query=query) - webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") + webpage, _ = self._download_webpage_handle(url, video_id, note="retry to visit the page") - # TODO more code goes here, for example ... title = self._html_search_regex(r'(.+?)', webpage, 'title') video_sources_text = self._search_regex(r'video-sources=\\\"(.+?)\"', webpage, 'video_sources') - video_formats = parse.parse_qs(video_sources_text) + video_formats = compat_urlparse.parse_qs(video_sources_text) formats = [] - supported_resolutions = ['720', '480'] + supported_resolutions = ('720', '480') for res in supported_resolutions: f = video_formats.get(res) if isinstance(f, list): @@ -107,12 +99,11 @@ class WeiboIE(InfoExtractor): 'title': title, 'uploader': uploader, 'formats': formats - # TODO more properties (see youtube_dl/extractor/common.py) } class WeiboMobileIE(InfoExtractor): - _VALID_URL = r'https?://m.weibo.cn/status/(?P[0-9]+)(\?.+)?' + _VALID_URL = r'https?://m\.weibo\.cn/status/(?P[0-9]+)(\?.+)?' 
_TEST = { 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', 'info_dict': { @@ -125,29 +116,17 @@ class WeiboMobileIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - headers = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8', - 'Upgrade-Insecure-Requests': '1', - } # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="visit the page") + webpage, _ = self._download_webpage_handle(url, video_id, note="visit the page") js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) page_info = weibo_info['status']['page_info'] - title = weibo_info['status']['status_title'] - format = { - 'url': page_info['media_info']['stream_url'], - 'format': 'mp4', - } - formats = [format] - uploader = weibo_info['status']['user']['screen_name'] + title = weibo_info.get('status').get('status_title') + uploader = weibo_info.get('status').get('user').get('screen_name') return { 'id': video_id, 'title': title, 'uploader': uploader, - 'formats': formats - # TODO more properties (see youtube_dl/extractor/common.py) + 'url': page_info['media_info']['stream_url'] } From be069839b4acb645799f7b216d14c046fb4a3400 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 26 Dec 2017 19:41:08 +0100 Subject: [PATCH 033/137] [filmweb] improve extraction --- youtube_dl/extractor/filmweb.py | 53 +++++++------- youtube_dl/extractor/twentythreevideo.py | 90 ++++++++++++++++-------- 2 files changed, 86 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/filmweb.py b/youtube_dl/extractor/filmweb.py index a3d9f872e..56000bc5b 100644 
--- a/youtube_dl/extractor/filmweb.py +++ b/youtube_dl/extractor/filmweb.py @@ -1,45 +1,42 @@ from __future__ import unicode_literals -from .twentythreevideo import TwentyThreeVideoIE +import re + +from .common import InfoExtractor -class FilmwebIE(TwentyThreeVideoIE): - IE_NAME = 'Filmweb' - _VALID_URL = r'https?://(?:www\.)?filmweb\.no/trailere/article(?P\d+).ece' +class FilmwebIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?filmweb\.no/(?Ptrailere|filmnytt)/article(?P\d+)\.ece' _TEST = { 'url': 'http://www.filmweb.no/trailere/article1264921.ece', 'md5': 'e353f47df98e557d67edaceda9dece89', 'info_dict': { - 'id': '1264921', - 'title': 'Det som en gang var', + 'id': '13033574', 'ext': 'mp4', - 'description': 'Trailer: Scener fra et vennskap', + 'title': 'Det som en gang var', + 'upload_date': '20160316', + 'timestamp': 1458140101, + 'uploader_id': '12639966', + 'uploader': 'Live Roaldset', } } - _CLIENT_NAME = 'filmweb' - _CLIENT_ID = '12732917' - _EMBED_BASE_URL = 'http://www.filmweb.no/template/ajax/json_trailerEmbed.jsp?articleId=%s&autoplay=true' - def _real_extract(self, url): - article_id = self._match_id(url) - webpage = self._download_webpage(url, article_id) - - title = self._search_regex(r'var\s+jsTitle\s*=\s*escape\("([^"]+)"\);', - webpage, 'title', fatal=True) - - format_url = self._proto_relative_url( - self._html_search_regex(r'"(//filmweb\.23video\.com/[^"]+)"', - self._download_json(self._EMBED_BASE_URL % article_id, - article_id)['embedCode'], 'format url')) - - formats = self._extract_formats(format_url, self._CLIENT_ID) - self._sort_formats(formats) + article_type, article_id = re.match(self._VALID_URL, url).groups() + if article_type == 'filmnytt': + webpage = self._download_webpage(url, article_id) + article_id = self._search_regex(r'data-videoid="(\d+)"', webpage, 'article id') + embed_code = self._download_json( + 'https://www.filmweb.no/template_v2/ajax/json_trailerEmbed.jsp', + article_id, query={ + 'articleId': article_id, + 
})['embedCode'] + iframe_url = self._proto_relative_url(self._search_regex( + r']+src="([^"]+)', embed_code, 'iframe url')) return { + '_type': 'url_transparent', 'id': article_id, - 'title': title, - 'alt_title': self._og_search_title(webpage), - 'formats': formats, - 'description': self._og_search_description(webpage), + 'url': iframe_url, + 'ie_key': 'TwentyThreeVideo', } diff --git a/youtube_dl/extractor/twentythreevideo.py b/youtube_dl/extractor/twentythreevideo.py index 2bad2dbd6..aa0c6e90f 100644 --- a/youtube_dl/extractor/twentythreevideo.py +++ b/youtube_dl/extractor/twentythreevideo.py @@ -1,45 +1,77 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import int_or_none class TwentyThreeVideoIE(InfoExtractor): IE_NAME = '23video' - _VALID_URL = r'https?://(?:www\.)?(?P[\w-]+)\.23video\.com/v.ihtml/player.html.*photo_id=(?P\d+)' - _TEST = {} - - _URL_TEMPLATE = 'https://%s.23video.com/%s/%s/%s/%s/download-video.mp4' - _FORMATS = { - 'video_hd': { - 'width': 1280, - 'height': 720, - }, - 'video_medium': { - 'width': 640, - 'height': 360, - }, - 'video_mobile_high': { - 'width': 320, - 'height': 180, + _VALID_URL = r'https?://video\.(?Ptwentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P.*?\bphoto(?:_|%5f)id=(?P\d+).*)' + _TEST = { + 'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1', + 'md5': '75fcf216303eb1dae9920d651f85ced4', + 'info_dict': { + 'id': '20448876', + 'ext': 'mp4', + 'title': 'Video Marketing Minute: Personalized Video', + 'timestamp': 1513855354, + 'upload_date': '20171221', + 'uploader_id': '12258964', + 'uploader': 'Rasmus Bysted', } } - def _extract_formats(self, url, client_id): - client_name = self._search_regex(r'([a-z]+)\.23video\.com', url, 'client name') - video_id = self._search_regex(r'photo%5fid=([^?&]+)', url, 'video id') - token = self._search_regex(r'token=([^?&]+)', url, 'token') + def 
_real_extract(self, url): + domain, query, photo_id = re.match(self._VALID_URL, url).groups() + base_url = 'https://video.%s' % domain + photo_data = self._download_json( + base_url + '/api/photo/list?' + query, photo_id, query={ + 'format': 'json', + }, transform_source=lambda s: self._search_regex(r'(?s)({.+})', s, 'photo data'))['photo'] + title = photo_data['title'] formats = [] - for format_key in self._FORMATS.keys(): + + audio_path = photo_data.get('audio_download') + if audio_path: formats.append({ - 'url': self._URL_TEMPLATE % (client_name, client_id, video_id, - token, format_key), - 'width': self._FORMATS.get(format_key, {}).get('width'), - 'height': self._FORMATS.get(format_key, {}).get('height'), + 'format_id': 'audio', + 'url': base_url + audio_path, + 'filesize': int_or_none(photo_data.get('audio_size')), + 'vcodec': 'none', }) - return formats + def add_common_info_to_list(l, template, id_field, id_value): + f_base = template % id_value + f_path = photo_data.get(f_base + 'download') + if not f_path: + return + l.append({ + id_field: id_value, + 'url': base_url + f_path, + 'width': int_or_none(photo_data.get(f_base + 'width')), + 'height': int_or_none(photo_data.get(f_base + 'height')), + 'filesize': int_or_none(photo_data.get(f_base + 'size')), + }) - def _real_extract(self, url): - # TODO: Find out how to extract client_id - raise NotImplementedError('Not able to extract the `client_id`') + for f in ('mobile_high', 'medium', 'hd', '1080p', '4k'): + add_common_info_to_list(formats, 'video_%s_', 'format_id', f) + + thumbnails = [] + for t in ('quad16', 'quad50', 'quad75', 'quad100', 'small', 'portrait', 'standard', 'medium', 'large', 'original'): + add_common_info_to_list(thumbnails, '%s_', 'id', t) + + return { + 'id': photo_id, + 'title': title, + 'timestamp': int_or_none(photo_data.get('creation_date_epoch')), + 'duration': int_or_none(photo_data.get('video_length')), + 'view_count': int_or_none(photo_data.get('view_count')), + 'comment_count': 
int_or_none(photo_data.get('number_of_comments')), + 'uploader_id': photo_data.get('user_id'), + 'uploader': photo_data.get('display_name'), + 'thumbnails': thumbnails, + 'formats': formats, + } From 616bb95b280e74b1a5048e95128259e2a85ca962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Caletka?= Date: Wed, 27 Dec 2017 16:57:26 +0100 Subject: [PATCH 034/137] [playtvak] Relax video regex and make description optional --- youtube_dl/extractor/playtvak.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py index 391e1bd09..4c5f57919 100644 --- a/youtube_dl/extractor/playtvak.py +++ b/youtube_dl/extractor/playtvak.py @@ -24,7 +24,7 @@ class PlaytvakIE(InfoExtractor): 'id': 'A150730_150323_hodinovy-manzel_kuko', 'ext': 'mp4', 'title': 'Vyžeňte vosy a sršně ze zahrady', - 'description': 'md5:f93d398691044d303bc4a3de62f3e976', + 'description': 'md5:4436e61b7df227a093778efb7e373571', 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', 'duration': 279, 'timestamp': 1438732860, @@ -36,9 +36,19 @@ class PlaytvakIE(InfoExtractor): 'info_dict': { 'id': 'A150624_164934_planespotting_cat', 'ext': 'flv', - 'title': 're:^Přímý přenos iDNES.cz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^Planespotting [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', - 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # another live stream, this one without Misc.videoFLV + 'url': 'https://slowtv.playtvak.cz/zive-sledujte-vlaky-v-primem-prenosu-dwi-/hlavni-nadrazi.aspx?c=A151218_145728_hlavni-nadrazi_plap', + 'info_dict': { + 'id': 'A151218_145728_hlavni-nadrazi_plap', + 'ext': 'flv', + 'title': 're:^Hlavní nádraží [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, }, 'params': { @@ -95,7 
+105,7 @@ class PlaytvakIE(InfoExtractor): webpage = self._download_webpage(url, video_id) info_url = self._html_search_regex( - r'Misc\.videoFLV\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') + r'Misc\.video(?:FLV)?\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') parsed_url = compat_urlparse.urlparse(info_url) @@ -160,7 +170,7 @@ class PlaytvakIE(InfoExtractor): if is_live: title = self._live_title(title) description = self._og_search_description(webpage, default=None) or self._html_search_meta( - 'description', webpage, 'description') + 'description', webpage, 'description', default=None) timestamp = None duration = None if not is_live: From 1ae0f0a21dec0cfce85b6f8285cc64cd702846b5 Mon Sep 17 00:00:00 2001 From: Leonardo Taccari Date: Wed, 13 Dec 2017 12:24:07 +0100 Subject: [PATCH 035/137] [internazionale] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/internazionale.py | 46 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 youtube_dl/extractor/internazionale.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 04125d9f2..e7b93a699 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -466,6 +466,7 @@ from .indavideo import ( ) from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE +from .internazionale import InternazionaleIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .iqiyi import IqiyiIE diff --git a/youtube_dl/extractor/internazionale.py b/youtube_dl/extractor/internazionale.py new file mode 100644 index 000000000..2ebf05ded --- /dev/null +++ b/youtube_dl/extractor/internazionale.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class InternazionaleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?internazionale\.it/video/(?P.*)' + _TESTS = [{ + 'url': 
'https://www.internazionale.it/video/2015/02/19/richard-linklater-racconta-una-scena-di-boyhood', + 'md5': '11b54a3d3333e455c00684e50a65c58e', + 'info_dict': { + 'id': '265968', + 'ext': 'mp4', + 'description': 'md5:efb7e5bbfb1a54ae2ed5a4a015f0e665', + 'title': 'Richard Linklater racconta una scena di Boyhood', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + data_job_id = self._html_search_regex(r'data-job-id="([^"]+)"', webpage, 'data-job-id') + data_video_path = self._html_search_regex(r'data-video-path="([^"]+)"', webpage, 'data-video-path') + + formats = [] + + formats.extend(self._extract_m3u8_formats( + 'https://video.internazionale.it/%s/%s.m3u8' % (data_video_path, data_job_id), + video_id)) + + formats.extend(self._extract_mpd_formats( + 'https://video.internazionale.it/%s/%s.mpd' % (data_video_path, data_job_id), + video_id)) + + self._sort_formats(formats) + + return { + 'id': data_job_id, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + 'formats': formats, + } From 640788f6f476f4344c13ed7c369c73575d3e705c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 27 Dec 2017 23:26:48 +0700 Subject: [PATCH 036/137] [internazionale] Improve extraction (closes #14973) --- youtube_dl/extractor/internazionale.py | 56 +++++++++++++++++--------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/internazionale.py b/youtube_dl/extractor/internazionale.py index 2ebf05ded..10ba1f6cf 100644 --- a/youtube_dl/extractor/internazionale.py +++ b/youtube_dl/extractor/internazionale.py @@ -2,45 +2,63 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import unified_timestamp class InternazionaleIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?internazionale\.it/video/(?P.*)' - _TESTS = [{ + _VALID_URL = r'https?://(?:www\.)?internazionale\.it/video/(?:[^/]+/)*(?P[^/?#&]+)' + _TEST = { 'url': 'https://www.internazionale.it/video/2015/02/19/richard-linklater-racconta-una-scena-di-boyhood', - 'md5': '11b54a3d3333e455c00684e50a65c58e', + 'md5': '3e39d32b66882c1218e305acbf8348ca', 'info_dict': { 'id': '265968', + 'display_id': 'richard-linklater-racconta-una-scena-di-boyhood', 'ext': 'mp4', - 'description': 'md5:efb7e5bbfb1a54ae2ed5a4a015f0e665', 'title': 'Richard Linklater racconta una scena di Boyhood', + 'description': 'md5:efb7e5bbfb1a54ae2ed5a4a015f0e665', + 'timestamp': 1424354635, + 'upload_date': '20150219', 'thumbnail': r're:^https?://.*\.jpg$', - } - }] + }, + 'params': { + 'format': 'bestvideo', + }, + } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) - data_job_id = self._html_search_regex(r'data-job-id="([^"]+)"', webpage, 'data-job-id') - data_video_path = self._html_search_regex(r'data-video-path="([^"]+)"', webpage, 'data-video-path') + webpage = self._download_webpage(url, display_id) - formats = [] + DATA_RE = r'data-%s=(["\'])(?P(?:(?!\1).)+)\1' - formats.extend(self._extract_m3u8_formats( - 'https://video.internazionale.it/%s/%s.m3u8' % (data_video_path, data_job_id), - video_id)) + title = self._search_regex( + DATA_RE % 'video-title', webpage, 'title', default=None, + group='value') or self._og_search_title(webpage) + video_id = self._search_regex( + DATA_RE % 'job-id', webpage, 'video id', group='value') + video_path = self._search_regex( + DATA_RE % 'video-path', webpage, 'video path', group='value') + + video_base = 'https://video.internazionale.it/%s/%s.' 
% (video_path, video_id) + + formats = self._extract_m3u8_formats( + video_base + 'm3u8', display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) formats.extend(self._extract_mpd_formats( - 'https://video.internazionale.it/%s/%s.mpd' % (data_video_path, data_job_id), - video_id)) - + video_base + 'mpd', display_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) + timestamp = unified_timestamp(self._html_search_meta( + 'article:published_time', webpage, 'timestamp')) + return { - 'id': data_job_id, - 'title': self._og_search_title(webpage), + 'id': video_id, + 'display_id': display_id, + 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'description': self._og_search_description(webpage), + 'timestamp': timestamp, 'formats': formats, } From 99277daaac5b2cb6990f1d592b78ca8839635c72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 28 Dec 2017 23:10:42 +0700 Subject: [PATCH 037/137] [ChangeLog] Actualize --- ChangeLog | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ChangeLog b/ChangeLog index 420a1bd11..794b00223 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,16 @@ version Extractors ++ [internazionale] Add support for internazionale.it (#14973) +* [playtvak] Relax video regular expression and make description optional + (#15037) ++ [filmweb] Add support for filmweb.no (#8773, #10368) ++ [23video] Add support for 23video.com ++ [espn] Add support for fivethirtyeight.com (#6864) ++ [umg:de] Add support for universal-music.de (#11582, #11584) ++ [espn] Add support for espnfc and extract more formats (#8053) * [youku] Update ccode (#14880) ++ [openload] Add support for oload.stream (#15070) * [youku] Fix list extraction (#15065) From a491fd0c6f5abd68bae128cacbcc9926a017cee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 28 Dec 2017 23:12:56 +0700 Subject: [PATCH 038/137] release 2017.12.28 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- 
docs/supportedsites.md | 5 +++++ youtube_dl/version.py | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index d7a91239f..f40927d36 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.23** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.28*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.28** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.12.23 +[debug] youtube-dl version 2017.12.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 794b00223..4323cfc5c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 
2017.12.28 Extractors + [internazionale] Add support for internazionale.it (#14973) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index eac35e390..6b935e572 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -3,6 +3,7 @@ - **1up.com** - **20min** - **220.ro** + - **23video** - **24video** - **3qsdn**: 3Q SDN - **3sat** @@ -269,6 +270,8 @@ - **Fczenit** - **filmon** - **filmon:channel** + - **Filmweb** + - **FiveThirtyEight** - **FiveTV** - **Flickr** - **Flipagram** @@ -359,6 +362,7 @@ - **InfoQ** - **Instagram** - **instagram:user**: Instagram user profile + - **Internazionale** - **InternetVideoArchive** - **IPrima** - **iqiyi**: 爱奇艺 @@ -889,6 +893,7 @@ - **udemy:course** - **UDNEmbed**: 聯合影音 - **UKTVPlay** + - **umg:de**: Universal Music Deutschland - **Unistra** - **Unity** - **uol.com.br** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f999584d7..f4a1b67a2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.12.23' +__version__ = '2017.12.28' From 84f085d4bdb66ee025fb337bcd571eab7469da97 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 29 Dec 2017 00:13:23 +0100 Subject: [PATCH 039/137] [aws] fix canonical/signed headers generation in python 2(closes #15102) --- youtube_dl/extractor/aws.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/aws.py b/youtube_dl/extractor/aws.py index 670abce0c..dccfeaf73 100644 --- a/youtube_dl/extractor/aws.py +++ b/youtube_dl/extractor/aws.py @@ -21,11 +21,11 @@ class AWSIE(InfoExtractor): 'Accept': 'application/json', 'Host': self._AWS_PROXY_HOST, 'X-Amz-Date': amz_date, + 'X-Api-Key': self._AWS_API_KEY } session_token = aws_dict.get('session_token') if session_token: headers['X-Amz-Security-Token'] = session_token - headers['X-Api-Key'] = self._AWS_API_KEY def aws_hash(s): return hashlib.sha256(s.encode('utf-8')).hexdigest() @@ -33,9 +33,9 @@ 
class AWSIE(InfoExtractor): # Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html canonical_querystring = compat_urllib_parse_urlencode(query) canonical_headers = '' - for header_name, header_value in headers.items(): + for header_name, header_value in sorted(headers.items()): canonical_headers += '%s:%s\n' % (header_name.lower(), header_value) - signed_headers = ';'.join([header.lower() for header in headers.keys()]) + signed_headers = ';'.join([header.lower() for header in sorted(headers.keys())]) canonical_request = '\n'.join([ 'GET', aws_dict['uri'], From 9d6ac71c27b1dfb662c795ef598dbfd0286682da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 29 Dec 2017 23:14:15 +0700 Subject: [PATCH 040/137] [extractor/common] Fix extraction of DASH formats with the same representation id (closes #15111) --- test/test_InfoExtractor.py | 11 +++++++++++ youtube_dl/extractor/common.py | 18 ++++++++---------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 8a372d2c9..7b31d5198 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -493,9 +493,20 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ _TEST_CASES = [ ( # https://github.com/rg3/youtube-dl/issues/13919 + # Also tests duplicate representation ids, see + # https://github.com/rg3/youtube-dl/issues/15111 'float_duration', 'http://unknown/manifest.mpd', [{ + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'm4a', + 'format_id': '318597', + 'format_note': 'DASH audio', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'none', + 'tbr': 61.587, + }, { 'manifest_url': 'http://unknown/manifest.mpd', 'ext': 'mp4', 'format_id': '318597', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3b79b8cb4..35d427eec 100644 --- a/youtube_dl/extractor/common.py +++ 
b/youtube_dl/extractor/common.py @@ -2007,16 +2007,14 @@ class InfoExtractor(object): f['url'] = initialization_url f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) - try: - existing_format = next( - fo for fo in formats - if fo['format_id'] == representation_id) - except StopIteration: - full_info = formats_dict.get(representation_id, {}).copy() - full_info.update(f) - formats.append(full_info) - else: - existing_format.update(f) + # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation + # is not necessarily unique within a Period thus formats with + # the same `format_id` are quite possible. There are numerous examples + # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111, + # https://github.com/rg3/youtube-dl/issues/13919) + full_info = formats_dict.get(representation_id, {}).copy() + full_info.update(f) + formats.append(full_info) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats From 5c97ec5ff5fd77a7975e1e946d53a76ccd5ef0de Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Sat, 30 Dec 2017 01:08:56 +0800 Subject: [PATCH 041/137] replace urlencode.encode with urlencode_postdata --- youtube_dl/extractor/weibo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 2be31fe77..0b28952c9 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -8,11 +8,11 @@ import random import re from ..compat import ( - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( js_to_json, + urlencode_postdata, ) @@ -44,10 +44,10 @@ class WeiboIE(InfoExtractor): "screenInfo": "1440*900*24", "plugins": "" } - data = compat_urllib_parse_urlencode({ + data = urlencode_postdata({ "cb": "gen_callback", "fp": json.dumps(fp), - }).encode() + }) genvisitor_url = 
'https://passport.weibo.com/visitor/genvisitor' webpage, _ = self._download_webpage_handle(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") From 6a41a12d2960efb7b32d3b6ef74cf6237766b569 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Sat, 30 Dec 2017 01:11:30 +0800 Subject: [PATCH 042/137] replace split with strip_jsonp --- youtube_dl/extractor/weibo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 0b28952c9..71e7123e4 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -12,6 +12,7 @@ from ..compat import ( ) from ..utils import ( js_to_json, + strip_jsonp, urlencode_postdata, ) @@ -52,7 +53,7 @@ class WeiboIE(InfoExtractor): genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' webpage, _ = self._download_webpage_handle(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") - p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" + p = strip_jsonp(webpage) i1 = p.find('{') i2 = p.rfind('}') j = p[i1:i2 + 1] # get JSON object From 48058d82dc3b448a72fd5ac1e7fa5492cd11f640 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Sat, 30 Dec 2017 01:14:21 +0800 Subject: [PATCH 043/137] replace unused _download_webpage_handle with _download_webpage --- youtube_dl/extractor/weibo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 71e7123e4..34809bdb2 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -51,7 +51,7 @@ class WeiboIE(InfoExtractor): }) genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' - webpage, _ = self._download_webpage_handle(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") + webpage = self._download_webpage(genvisitor_url, video_id, 
data=data, headers=headers, note="gen visitor") p = strip_jsonp(webpage) i1 = p.find('{') @@ -71,9 +71,9 @@ class WeiboIE(InfoExtractor): '_rand': random.random() } gencallback_url = "https://passport.weibo.com/visitor/visitor" - self._download_webpage_handle(gencallback_url, video_id, note="gen callback", query=query) + self._download_webpage(gencallback_url, video_id, note="gen callback", query=query) - webpage, _ = self._download_webpage_handle(url, video_id, note="retry to visit the page") + webpage = self._download_webpage(url, video_id, note="retry to visit the page") title = self._html_search_regex(r'(.+?)', webpage, 'title') @@ -118,7 +118,7 @@ class WeiboMobileIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) # to get Referer url for genvisitor - webpage, _ = self._download_webpage_handle(url, video_id, note="visit the page") + webpage = self._download_webpage(url, video_id, note="visit the page") js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) page_info = weibo_info['status']['page_info'] From 580f3c79d52ae3f880b4ca64e22665bf094ae65f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Dec 2017 03:54:14 +0700 Subject: [PATCH 044/137] [vimeo] Improve password protected videos extraction (closes #15114) --- youtube_dl/extractor/vimeo.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cedb54876..6af705657 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -468,11 +468,12 @@ class VimeoIE(VimeoBaseInfoExtractor): request = sanitized_Request(url, headers=headers) try: webpage, urlh = self._download_webpage_handle(request, video_id) + redirect_url = compat_str(urlh.geturl()) # Some URLs redirect to ondemand can't be extracted with # 
this extractor right away thus should be passed through # ondemand extractor (e.g. https://vimeo.com/73445910) - if VimeoOndemandIE.suitable(urlh.geturl()): - return self.url_result(urlh.geturl(), VimeoOndemandIE.ie_key()) + if VimeoOndemandIE.suitable(redirect_url): + return self.url_result(redirect_url, VimeoOndemandIE.ie_key()) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: errmsg = ee.cause.read() @@ -541,15 +542,15 @@ class VimeoIE(VimeoBaseInfoExtractor): if re.search(r']+?id="pw_form"', webpage) is not None: if '_video_password_verified' in data: raise ExtractorError('video password verification failed!') - self._verify_video_password(url, video_id, webpage) + self._verify_video_password(redirect_url, video_id, webpage) return self._real_extract( - smuggle_url(url, {'_video_password_verified': 'verified'})) + smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) else: raise ExtractorError('Unable to extract info section', cause=e) else: if config.get('view') == 4: - config = self._verify_player_video_password(url, video_id) + config = self._verify_player_video_password(redirect_url, video_id) def is_rented(): if '>You rented this title.<' in webpage: From d2c5b5a951868ae974bc3af6659ab39b8abd2157 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Dec 2017 05:52:35 +0700 Subject: [PATCH 045/137] [openload] Fallback on f-page extraction (closes #14665, closes #14879) --- youtube_dl/extractor/openload.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d1eb3be25..81c1317b6 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -284,6 +284,11 @@ class OpenloadIE(InfoExtractor): # for title and ext 'url': 'https://openload.co/embed/Sxz5sADo82g/', 'only_matching': True, + }, { + # unavailable via 
https://openload.co/embed/e-Ixz9ZR5L0/ but available + # via https://openload.co/f/e-Ixz9ZR5L0/ + 'url': 'https://openload.co/f/e-Ixz9ZR5L0/', + 'only_matching': True, }, { 'url': 'https://oload.tv/embed/KnG-kKZdcfY/', 'only_matching': True, @@ -305,18 +310,27 @@ class OpenloadIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'https://openload.co/embed/%s/' % video_id + url_pattern = 'https://openload.co/%%s/%s/' % video_id headers = { 'User-Agent': self._USER_AGENT, } - webpage = self._download_webpage(url, video_id, headers=headers) - - if 'File not found' in webpage or 'deleted by the owner' in webpage: - raise ExtractorError('File not found', expected=True, video_id=video_id) + for path in ('embed', 'f'): + page_url = url_pattern % path + last = path == 'f' + webpage = self._download_webpage( + page_url, video_id, 'Downloading %s webpage' % path, + headers=headers, fatal=last) + if not webpage: + continue + if 'File not found' in webpage or 'deleted by the owner' in webpage: + if not last: + continue + raise ExtractorError('File not found', expected=True, video_id=video_id) + break phantom = PhantomJSwrapper(self, required_version='2.0') - webpage, _ = phantom.get(url, html=webpage, video_id=video_id, headers=headers) + webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id, headers=headers) decoded_id = get_element_by_id('streamurl', webpage) @@ -327,7 +341,7 @@ class OpenloadIE(InfoExtractor): 'title', default=None) or self._html_search_meta( 'description', webpage, 'title', fatal=True) - entries = self._parse_html5_media_entries(url, webpage, video_id) + entries = self._parse_html5_media_entries(page_url, webpage, video_id) entry = entries[0] if entries else {} subtitles = entry.get('subtitles') From 2c8e11b4af253fec9096a800ccd26b4daef521a1 Mon Sep 17 00:00:00 2001 From: 50csent <32077344+50csent@users.noreply.github.com> Date: Fri, 29 Dec 2017 22:59:49 +0200 Subject: [PATCH 046/137] [pluralsight] Fix missing 
first line of subtitles (closes #11118) --- youtube_dl/extractor/pluralsight.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 597b11218..aacc5d4bb 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -171,12 +171,12 @@ class PluralsightIE(PluralsightBaseIE): for num, current in enumerate(subs): current = subs[num] start, text = ( - float_or_none(dict_get(current, TIME_OFFSET_KEYS)), + float_or_none(dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False)), dict_get(current, TEXT_KEYS)) if start is None or text is None: continue end = duration if num == len(subs) - 1 else float_or_none( - dict_get(subs[num + 1], TIME_OFFSET_KEYS)) + dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False)) if end is None: continue srt += os.linesep.join( From d97cb84b313f94da76bff0520392334e753e2238 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 30 Dec 2017 00:30:15 +0100 Subject: [PATCH 047/137] [ufctv] Add new extractor(closes #14520) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/ufctv.py | 55 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/ufctv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e7b93a699..5ed031833 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1145,6 +1145,7 @@ from .udemy import ( UdemyCourseIE ) from .udn import UDNEmbedIE +from .ufctv import UFCTVIE from .uktvplay import UKTVPlayIE from .digiteka import DigitekaIE from .umg import UMGDeIE diff --git a/youtube_dl/extractor/ufctv.py b/youtube_dl/extractor/ufctv.py new file mode 100644 index 000000000..ab823814b --- /dev/null +++ b/youtube_dl/extractor/ufctv.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils 
import ( + parse_duration, + parse_iso8601, +) + + +class UFCTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ufc\.tv/video/(?P[^/]+)' + _TEST = { + 'url': 'https://www.ufc.tv/video/ufc-219-countdown-full-episode', + 'info_dict': { + 'id': '34167', + 'ext': 'mp4', + 'title': 'UFC 219 Countdown: Full Episode', + 'description': 'md5:26d4e8bf4665ae5878842d7050c3c646', + 'timestamp': 1513962360, + 'upload_date': '20171222', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + video_data = self._download_json(url, display_id, query={ + 'format': 'json', + }) + video_id = str(video_data['id']) + title = video_data['name'] + m3u8_url = self._download_json( + 'https://www.ufc.tv/service/publishpoint', video_id, query={ + 'type': 'video', + 'format': 'json', + 'id': video_id, + }, headers={ + 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1', + })['path'] + m3u8_url = m3u8_url.replace('_iphone.', '.') + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': parse_duration(video_data.get('runtime')), + 'timestamp': parse_iso8601(video_data.get('releaseDate')), + 'formats': formats, + } From 2501d41ef4b9ed0349cf4f9838e12873350e60d5 Mon Sep 17 00:00:00 2001 From: felix Date: Sat, 12 Nov 2016 22:15:51 +0100 Subject: [PATCH 048/137] [common] use AACL as the default fourcc when AudioTag is 255 --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 35d427eec..5e7e7a3f7 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2054,7 +2054,7 @@ class InfoExtractor(object): stream_timescale = 
int_or_none(stream.get('TimeScale')) or timescale stream_name = stream.get('Name') for track in stream.findall('QualityLevel'): - fourcc = track.get('FourCC') + fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None) # TODO: add support for WVC1 and WMAP if fourcc not in ('H264', 'AVC1', 'AACL'): self.report_warning('%s is not a supported codec' % fourcc) From 8056c8542d008ac2f103b4fcb862dc1ad78dc8fc Mon Sep 17 00:00:00 2001 From: felix Date: Sat, 12 Nov 2016 21:52:02 +0100 Subject: [PATCH 049/137] [mediasite] Add extractor, subsume sandia and collegerama extractors --- youtube_dl/extractor/collegerama.py | 93 -------------- youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/generic.py | 10 ++ youtube_dl/extractor/mediasite.py | 184 ++++++++++++++++++++++++++++ youtube_dl/extractor/sandia.py | 65 ---------- 5 files changed, 195 insertions(+), 160 deletions(-) delete mode 100644 youtube_dl/extractor/collegerama.py create mode 100644 youtube_dl/extractor/mediasite.py delete mode 100644 youtube_dl/extractor/sandia.py diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py deleted file mode 100644 index 6a41db87c..000000000 --- a/youtube_dl/extractor/collegerama.py +++ /dev/null @@ -1,93 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - sanitized_Request, -) - - -class CollegeRamaIE(InfoExtractor): - _VALID_URL = r'https?://collegerama\.tudelft\.nl/Mediasite/Play/(?P[\da-f]+)' - _TESTS = [ - { - 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d', - 'md5': '481fda1c11f67588c0d9d8fbdced4e39', - 'info_dict': { - 'id': '585a43626e544bdd97aeb71a0ec907a01d', - 'ext': 'mp4', - 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', - 'description': '', - 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', - 'duration': 7713.088, - 
'timestamp': 1413309600, - 'upload_date': '20141014', - }, - }, - { - 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4', - 'md5': 'ef1fdded95bdf19b12c5999949419c92', - 'info_dict': { - 'id': '86a9ea9f53e149079fbdb4202b521ed21d', - 'ext': 'wmv', - 'title': '64ste Vakantiecursus: Afvalwater', - 'description': 'md5:7fd774865cc69d972f542b157c328305', - 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', - 'duration': 10853, - 'timestamp': 1326446400, - 'upload_date': '20120113', - }, - }, - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - - player_options_request = { - 'getPlayerOptionsRequest': { - 'ResourceId': video_id, - 'QueryString': '', - } - } - - request = sanitized_Request( - 'http://collegerama.tudelft.nl/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', - json.dumps(player_options_request)) - request.add_header('Content-Type', 'application/json') - - player_options = self._download_json(request, video_id) - - presentation = player_options['d']['Presentation'] - title = presentation['Title'] - description = presentation.get('Description') - thumbnail = None - duration = float_or_none(presentation.get('Duration'), 1000) - timestamp = int_or_none(presentation.get('UnixTime'), 1000) - - formats = [] - for stream in presentation['Streams']: - for video in stream['VideoUrls']: - thumbnail_url = stream.get('ThumbnailUrl') - if thumbnail_url: - thumbnail = 'http://collegerama.tudelft.nl' + thumbnail_url - format_id = video['MediaType'] - if format_id == 'SS': - continue - formats.append({ - 'url': video['Location'], - 'format_id': format_id, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 
5ed031833..e64defe62 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -205,7 +205,6 @@ from .cnn import ( CNNArticleIE, ) from .coub import CoubIE -from .collegerama import CollegeRamaIE from .comedycentral import ( ComedyCentralFullEpisodesIE, ComedyCentralIE, @@ -576,6 +575,7 @@ from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE from .mediaset import MediasetIE +from .mediasite import MediasiteIE from .medici import MediciIE from .megaphone import MegaphoneIE from .meipai import MeipaiIE @@ -912,7 +912,6 @@ from .rutube import ( from .rutv import RUTVIE from .ruutu import RuutuIE from .ruv import RuvIE -from .sandia import SandiaIE from .safari import ( SafariIE, SafariApiIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c7b609215..d5622c823 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2883,6 +2883,16 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) + # Look for Mediasite embeds + mobj = re.search(r'''(?xi) + ]+src="((?:https?://[a-z0-9\-\.:\[\]]+)? 
+ /Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)" + ''', webpage) + if mobj is not None: + return self.url_result(smuggle_url( + compat_urlparse.urljoin(url, unescapeHTML(mobj.group(1))), + { 'UrlReferrer': url }), 'Livestream') + def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py new file mode 100644 index 000000000..5d281684e --- /dev/null +++ b/youtube_dl/extractor/mediasite.py @@ -0,0 +1,184 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + ExtractorError, + unsmuggle_url, + mimetype2ext, + float_or_none, +) + + +class MediasiteIE(InfoExtractor): + _VALID_URL = r'''(?xi) + https?://[a-z0-9\-\.:\[\]]+/Mediasite/Play/ + (?P[0-9a-f]{32,34}) + (?P\?[^#]+|) + ''' + _TESTS = [ + { + 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', + 'info_dict': { + 'id': '2db6c271681e4f199af3c60d1f82869b1d', + 'ext': 'mp4', + 'title': 'Lecture: Tuesday, September 20, 2016 - Sir Andrew Wiles', + 'description': 'Sir Andrew Wiles: “Equations in arithmetic”\\n\\nI will describe some of the interactions between modern number theory and the problem of solving equations in rational numbers or integers\\u0027.', + 'timestamp': 1474268400.0, + 'upload_date': '20160919', + }, + }, + { + 'url': 'http://mediasite.uib.no/Mediasite/Play/90bb363295d945d6b548c867d01181361d?catalog=a452b7df-9ae1-46b7-a3ba-aceeb285f3eb', + 'info_dict': { + 'id': '90bb363295d945d6b548c867d01181361d', + 'ext': 'mp4', + 'upload_date': '20150429', + 'title': '5) IT-forum 2015-Dag 1 - Dungbeetle - How and why Rain created a tiny bug tracker for Unity', + 'timestamp': 1430311380.0, + }, + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d', + 'md5': '481fda1c11f67588c0d9d8fbdced4e39', + 'info_dict': { + 
'id': '585a43626e544bdd97aeb71a0ec907a01d', + 'ext': 'mp4', + 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*)?$', + 'duration': 7713.088, + 'timestamp': 1413309600, + 'upload_date': '20141014', + }, + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4', + 'md5': 'ef1fdded95bdf19b12c5999949419c92', + 'info_dict': { + 'id': '86a9ea9f53e149079fbdb4202b521ed21d', + 'ext': 'wmv', + 'title': '64ste Vakantiecursus: Afvalwater', + 'description': 'md5:7fd774865cc69d972f542b157c328305', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', + 'duration': 10853, + 'timestamp': 1326446400, + 'upload_date': '20120113', + }, + }, + { + 'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d', + 'md5': '9422edc9b9a60151727e4b6d8bef393d', + 'info_dict': { + 'id': '24aace4429fc450fb5b38cdbf424a66e1d', + 'ext': 'mp4', + 'title': 'Xyce Software Training - Section 1', + 'description': r're:(?s)SAND Number: SAND 2013-7800.{200,}', + 'upload_date': '20120409', + 'timestamp': 1333983600, + 'duration': 7794, + } + } + ] + + # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) + _STREAM_TYPES = { + 0: 'video1', # the main video + 2: 'slide', + 3: 'presentation', + 4: 'video2', # screencast? + 5: 'video3', + } + + def _real_extract(self, url): + url, data = unsmuggle_url(url, {}) + mobj = re.match(self._VALID_URL, url) + ResourceId = mobj.group('id') + QueryString = mobj.group('QueryString') + + webpage = self._download_webpage(url, ResourceId) # XXX: add UrlReferrer? + + # XXX: might have also extracted UrlReferrer and QueryString from the html + ServicePath = compat_urlparse.urljoin(url, self._html_search_regex( + r'
(.+?)
', webpage, ResourceId, + default='/Mediasite/PlayerService/PlayerService.svc/json')) + + PlayerOptions = self._download_json( + '%s/GetPlayerOptions' % (ServicePath), ResourceId, + headers={ + 'Content-type': 'application/json; charset=utf-8', + 'X-Requested-With': 'XMLHttpRequest', + }, + data=json.dumps({ + 'getPlayerOptionsRequest': { + 'ResourceId': ResourceId, + 'QueryString': QueryString, + 'UrlReferrer': data.get('UrlReferrer', ''), + 'UseScreenReader': False, + } + }).encode('utf-8')) + Presentation = PlayerOptions['d']['Presentation'] + if Presentation is None: + raise ExtractorError('Mediasite says: %s' % + (PlayerOptions['d']['PlayerPresentationStatusMessage'],), + expected=True) + + thumbnails = [] + formats = [] + for snum, Stream in enumerate(Presentation['Streams']): + stream_type = self._STREAM_TYPES.get( + Stream['StreamType'], 'type%u' % Stream['StreamType']) + + stream_formats = [] + for unum, VideoUrl in enumerate(Stream['VideoUrls']): + url = VideoUrl['Location'] + # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS + + if VideoUrl['MediaType'] == 'SS': + stream_formats.extend(self._extract_ism_formats( + url, ResourceId, ism_id='%s-%u.%u' % (stream_type, snum, unum))) + continue + + stream_formats.append({ + 'format_id': '%s-%u.%u' % (stream_type, snum, unum), + 'url': url, + 'ext': mimetype2ext(VideoUrl['MimeType']), + }) + + # TODO: if Stream['HasSlideContent']: + # synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum) + # from Stream['Slides'] + # this will require writing a custom downloader... 
+ + # disprefer 'secondary' streams + if Stream['StreamType'] != 0: + for fmt in stream_formats: + fmt['preference'] = -1 + + ThumbnailUrl = Stream.get('ThumbnailUrl') + if ThumbnailUrl: + thumbnails.append({ + 'id': '%s-%u' % (stream_type, snum), + 'url': compat_urlparse.urljoin(url, ThumbnailUrl), + 'preference': -1 if Stream['StreamType'] != 0 else 0, + }) + formats.extend(stream_formats) + + self._sort_formats(formats) + + # XXX: Presentation['Presenters'] + # XXX: Presentation['Transcript'] + + return { + 'id': ResourceId, + 'title': Presentation['Title'], + 'description': Presentation.get('Description'), + 'duration': float_or_none(Presentation.get('Duration'), 1000), + 'timestamp': float_or_none(Presentation.get('UnixTime'), 1000), + 'formats': formats, + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py deleted file mode 100644 index 96e43af84..000000000 --- a/youtube_dl/extractor/sandia.py +++ /dev/null @@ -1,65 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - mimetype2ext, -) - - -class SandiaIE(InfoExtractor): - IE_DESC = 'Sandia National Laboratories' - _VALID_URL = r'https?://digitalops\.sandia\.gov/Mediasite/Play/(?P[0-9a-f]+)' - _TEST = { - 'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d', - 'md5': '9422edc9b9a60151727e4b6d8bef393d', - 'info_dict': { - 'id': '24aace4429fc450fb5b38cdbf424a66e1d', - 'ext': 'mp4', - 'title': 'Xyce Software Training - Section 1', - 'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}', - 'upload_date': '20120409', - 'timestamp': 1333983600, - 'duration': 7794, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - presentation_data = self._download_json( - 'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', - video_id, data=json.dumps({ - 
'getPlayerOptionsRequest': { - 'ResourceId': video_id, - 'QueryString': '', - } - }), headers={ - 'Content-Type': 'application/json; charset=utf-8', - })['d']['Presentation'] - - title = presentation_data['Title'] - - formats = [] - for stream in presentation_data.get('Streams', []): - for fd in stream.get('VideoUrls', []): - formats.append({ - 'format_id': fd['MediaType'], - 'format_note': fd['MimeType'].partition('/')[2], - 'ext': mimetype2ext(fd['MimeType']), - 'url': fd['Location'], - 'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': presentation_data.get('Description'), - 'formats': formats, - 'timestamp': int_or_none(presentation_data.get('UnixTime'), 1000), - 'duration': int_or_none(presentation_data.get('Duration'), 1000), - } From 2ca7ed41fed73cf37581b07d0c67d3bad8a6acc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Dec 2017 07:28:18 +0700 Subject: [PATCH 050/137] [mediasite] Improve extraction and code style, add support for DASH (closes #11185, closes #14343, refs #5428) --- youtube_dl/extractor/generic.py | 29 +++++-- youtube_dl/extractor/mediasite.py | 128 ++++++++++++++++++------------ 2 files changed, 100 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d5622c823..cc4c90b8c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -100,6 +100,7 @@ from .megaphone import MegaphoneIE from .vzaar import VzaarIE from .channel9 import Channel9IE from .vshare import VShareIE +from .mediasite import MediasiteIE class GenericIE(InfoExtractor): @@ -1925,6 +1926,18 @@ class GenericIE(InfoExtractor): 'title': 'vl14062007715967', 'ext': 'mp4', } + }, + { + 'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/', + 'md5': 'aecd089f55b1cb5a59032cb049d3a356', 
+ 'info_dict': { + 'id': '90227f51a80c4d8f86c345a7fa62bd9a1d', + 'ext': 'mp4', + 'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare', + 'description': 'md5:5a51db84a62def7b7054df2ade403c6c', + 'timestamp': 1474354800, + 'upload_date': '20160920', + } } # { # # TODO: find another test @@ -2884,14 +2897,14 @@ class GenericIE(InfoExtractor): vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) # Look for Mediasite embeds - mobj = re.search(r'''(?xi) - ]+src="((?:https?://[a-z0-9\-\.:\[\]]+)? - /Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)" - ''', webpage) - if mobj is not None: - return self.url_result(smuggle_url( - compat_urlparse.urljoin(url, unescapeHTML(mobj.group(1))), - { 'UrlReferrer': url }), 'Livestream') + mediasite_urls = MediasiteIE._extract_urls(webpage) + if mediasite_urls: + entries = [ + self.url_result(smuggle_url( + compat_urlparse.urljoin(url, mediasite_url), + {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) + for mediasite_url in mediasite_urls] + return self.playlist_result(entries, video_id, video_title) def merge_dicts(dict1, dict2): merged = {} diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index 5d281684e..0e2645c55 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -5,21 +5,22 @@ import re import json from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, - unsmuggle_url, - mimetype2ext, float_or_none, + mimetype2ext, + unescapeHTML, + unsmuggle_url, + urljoin, ) class MediasiteIE(InfoExtractor): - _VALID_URL = r'''(?xi) - https?://[a-z0-9\-\.:\[\]]+/Mediasite/Play/ - (?P[0-9a-f]{32,34}) - (?P\?[^#]+|) - ''' + _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/Play/(?P[0-9a-f]{32,34})(?P\?[^#]+|)' _TESTS = [ { 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -87,67 +88,96 @@ class 
MediasiteIE(InfoExtractor): # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) _STREAM_TYPES = { - 0: 'video1', # the main video + 0: 'video1', # the main video 2: 'slide', 3: 'presentation', - 4: 'video2', # screencast? + 4: 'video2', # screencast? 5: 'video3', } + @staticmethod + def _extract_urls(webpage): + return [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer( + r'(?xi)]+\bsrc=(["\'])(?P(?:(?:https?:)?//[^/]+)?/Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)\1', + webpage)] + def _real_extract(self, url): url, data = unsmuggle_url(url, {}) mobj = re.match(self._VALID_URL, url) - ResourceId = mobj.group('id') - QueryString = mobj.group('QueryString') + resource_id = mobj.group('id') + query = mobj.group('query') - webpage = self._download_webpage(url, ResourceId) # XXX: add UrlReferrer? + webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? + redirect_url = compat_str(urlh.geturl()) # XXX: might have also extracted UrlReferrer and QueryString from the html - ServicePath = compat_urlparse.urljoin(url, self._html_search_regex( - r'
(.+?)
', webpage, ResourceId, + service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex( + r']+\bid=["\']ServicePath[^>]+>(.+?)', webpage, resource_id, default='/Mediasite/PlayerService/PlayerService.svc/json')) - PlayerOptions = self._download_json( - '%s/GetPlayerOptions' % (ServicePath), ResourceId, + player_options = self._download_json( + '%s/GetPlayerOptions' % service_path, resource_id, headers={ 'Content-type': 'application/json; charset=utf-8', 'X-Requested-With': 'XMLHttpRequest', }, data=json.dumps({ 'getPlayerOptionsRequest': { - 'ResourceId': ResourceId, - 'QueryString': QueryString, + 'ResourceId': resource_id, + 'QueryString': query, 'UrlReferrer': data.get('UrlReferrer', ''), 'UseScreenReader': False, } - }).encode('utf-8')) - Presentation = PlayerOptions['d']['Presentation'] - if Presentation is None: - raise ExtractorError('Mediasite says: %s' % - (PlayerOptions['d']['PlayerPresentationStatusMessage'],), + }).encode('utf-8'))['d'] + + presentation = player_options['Presentation'] + title = presentation['Title'] + + if presentation is None: + raise ExtractorError( + 'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'], expected=True) thumbnails = [] formats = [] - for snum, Stream in enumerate(Presentation['Streams']): - stream_type = self._STREAM_TYPES.get( - Stream['StreamType'], 'type%u' % Stream['StreamType']) + for snum, Stream in enumerate(presentation['Streams']): + stream_type = Stream.get('StreamType') + if stream_type is None: + continue + + video_urls = Stream.get('VideoUrls') + if not isinstance(video_urls, list): + video_urls = [] + + stream_id = self._STREAM_TYPES.get( + stream_type, 'type%u' % stream_type) stream_formats = [] - for unum, VideoUrl in enumerate(Stream['VideoUrls']): - url = VideoUrl['Location'] + for unum, VideoUrl in enumerate(video_urls): + video_url = VideoUrl.get('Location') + if not video_url or not isinstance(video_url, compat_str): + continue # XXX: if 
Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS - if VideoUrl['MediaType'] == 'SS': + media_type = VideoUrl.get('MediaType') + if media_type == 'SS': stream_formats.extend(self._extract_ism_formats( - url, ResourceId, ism_id='%s-%u.%u' % (stream_type, snum, unum))) - continue - - stream_formats.append({ - 'format_id': '%s-%u.%u' % (stream_type, snum, unum), - 'url': url, - 'ext': mimetype2ext(VideoUrl['MimeType']), - }) + video_url, resource_id, + ism_id='%s-%u.%u' % (stream_id, snum, unum), + fatal=False)) + elif media_type == 'Dash': + stream_formats.extend(self._extract_mpd_formats( + video_url, resource_id, + mpd_id='%s-%u.%u' % (stream_id, snum, unum), + fatal=False)) + else: + stream_formats.append({ + 'format_id': '%s-%u.%u' % (stream_id, snum, unum), + 'url': video_url, + 'ext': mimetype2ext(VideoUrl.get('MimeType')), + }) # TODO: if Stream['HasSlideContent']: # synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum) @@ -155,16 +185,16 @@ class MediasiteIE(InfoExtractor): # this will require writing a custom downloader... 
# disprefer 'secondary' streams - if Stream['StreamType'] != 0: + if stream_type != 0: for fmt in stream_formats: fmt['preference'] = -1 - ThumbnailUrl = Stream.get('ThumbnailUrl') - if ThumbnailUrl: + thumbnail_url = Stream.get('ThumbnailUrl') + if thumbnail_url: thumbnails.append({ - 'id': '%s-%u' % (stream_type, snum), - 'url': compat_urlparse.urljoin(url, ThumbnailUrl), - 'preference': -1 if Stream['StreamType'] != 0 else 0, + 'id': '%s-%u' % (stream_id, snum), + 'url': urljoin(redirect_url, thumbnail_url), + 'preference': -1 if stream_type != 0 else 0, }) formats.extend(stream_formats) @@ -174,11 +204,11 @@ class MediasiteIE(InfoExtractor): # XXX: Presentation['Transcript'] return { - 'id': ResourceId, - 'title': Presentation['Title'], - 'description': Presentation.get('Description'), - 'duration': float_or_none(Presentation.get('Duration'), 1000), - 'timestamp': float_or_none(Presentation.get('UnixTime'), 1000), + 'id': resource_id, + 'title': title, + 'description': presentation.get('Description'), + 'duration': float_or_none(presentation.get('Duration'), 1000), + 'timestamp': float_or_none(presentation.get('UnixTime'), 1000), 'formats': formats, 'thumbnails': thumbnails, } From 620ee8712e264c71b444748c86a51b258818b4a1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 30 Dec 2017 15:03:13 +0800 Subject: [PATCH 051/137] [openload] Fix extraction (closes #15118) --- ChangeLog | 6 ++++++ youtube_dl/extractor/openload.py | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 4323cfc5c..08aed1ba7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors +* [openload] Fix extraction (#15118) + + version 2017.12.28 Extractors diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 81c1317b6..b282bcfd9 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -332,7 +332,8 @@ class OpenloadIE(InfoExtractor): phantom = 
PhantomJSwrapper(self, required_version='2.0') webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id, headers=headers) - decoded_id = get_element_by_id('streamurl', webpage) + decoded_id = (get_element_by_id('streamurl', webpage) or + get_element_by_id('streamuri', webpage)) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id From 538d4f8681535507838d19942d2581478ebdc28b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 31 Dec 2017 01:15:35 +0700 Subject: [PATCH 052/137] [downloader/hls] Use HTTP headers for key request --- youtube_dl/downloader/hls.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 1a6e226c8..4dc3ab46a 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -163,7 +163,8 @@ class HlsFD(FragmentFD): return False if decrypt_info['METHOD'] == 'AES-128': iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) - decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(decrypt_info['URI']).read() + decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( + self._prepare_url(info_dict, decrypt_info['URI'])).read() frag_content = AES.new( decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) self._append_fragment(ctx, frag_content) From 2e65e7db9e39f22a8d20ea950906ee39fbf2618c Mon Sep 17 00:00:00 2001 From: d2au Date: Sat, 18 Nov 2017 05:28:58 +1030 Subject: [PATCH 053/137] [abc:iview] Fix extraction (closes #14711) ABC dropped unmetering, so change to metered hls urls which require auth. 
--- youtube_dl/extractor/abc.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 60f753b95..9610d0cb7 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -1,6 +1,9 @@ from __future__ import unicode_literals +import hashlib +import hmac import re +import time from .common import InfoExtractor from ..compat import compat_str @@ -126,20 +129,35 @@ class ABCIViewIE(InfoExtractor): title = video_params.get('title') or video_params['seriesTitle'] stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') - format_urls = [ - try_get(stream, lambda x: x['hds-unmetered'], compat_str)] + key = 'android.content.res.Resources'.encode('utf-8') + time_str = str(int(time.time())) + house_number = video_params.get('episodeHouseNumber') + path = '/auth/hls/sign?ts={0}&hn={1}&d=android-mobile'.format( + time_str, house_number) + sig = hmac.new(key, path.encode('utf-8'), hashlib.sha256).hexdigest() + auth_url = 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig) + token = self._download_webpage(auth_url, video_id) + + format_urls = [] + + def tokenize_url(url, token): + return ''.join([url, '?hdnea=', token]) # May have higher quality video sd_url = try_get( - stream, lambda x: x['streams']['hds']['sd'], compat_str) + stream, lambda x: x['streams']['hls']['sd'], compat_str) if sd_url: - format_urls.append(sd_url.replace('metered', 'um')) + format_urls.append(tokenize_url(sd_url, token)) + else: + sd_low_url = try_get( + stream, lambda x: x['streams']['hls']['sd-low'], compat_str) + format_urls.append(tokenize_url(sd_low_url, token)) formats = [] for format_url in format_urls: if format_url: formats.extend( - self._extract_akamai_formats(format_url, video_id)) + self._extract_m3u8_formats(format_url, video_id, 'mp4')) self._sort_formats(formats) subtitles = {} From 77341dae1418fa1442bdc7689e428450ff83e5d9 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 31 Dec 2017 01:22:15 +0700 Subject: [PATCH 054/137] [abc:iview] Improve extraction and bypass geo restriction (closes #14782) --- youtube_dl/extractor/abc.py | 63 ++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 9610d0cb7..87017ed39 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -13,6 +13,7 @@ from ..utils import ( int_or_none, parse_iso8601, try_get, + update_url_query, ) @@ -104,21 +105,24 @@ class ABCIE(InfoExtractor): class ABCIViewIE(InfoExtractor): IE_NAME = 'abc.net.au:iview' _VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P[^/?#]+)' + _GEO_COUNTRIES = ['AU'] # ABC iview programs are normally available for 14 days only. _TESTS = [{ - 'url': 'http://iview.abc.net.au/programs/diaries-of-a-broken-mind/ZX9735A001S00', + 'url': 'http://iview.abc.net.au/programs/call-the-midwife/ZW0898A003S00', 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', 'info_dict': { - 'id': 'ZX9735A001S00', + 'id': 'ZW0898A003S00', 'ext': 'mp4', - 'title': 'Diaries Of A Broken Mind', - 'description': 'md5:7de3903874b7a1be279fe6b68718fc9e', - 'upload_date': '20161010', - 'uploader_id': 'abc2', - 'timestamp': 1476064920, + 'title': 'Series 5 Ep 3', + 'description': 'md5:e0ef7d4f92055b86c4f33611f180ed79', + 'upload_date': '20171228', + 'uploader_id': 'abc1', + 'timestamp': 1514499187, + }, + 'params': { + 'skip_download': True, }, - 'skip': 'Video gone', }] def _real_extract(self, url): @@ -129,35 +133,30 @@ class ABCIViewIE(InfoExtractor): title = video_params.get('title') or video_params['seriesTitle'] stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') - key = 'android.content.res.Resources'.encode('utf-8') - time_str = str(int(time.time())) house_number = video_params.get('episodeHouseNumber') path = '/auth/hls/sign?ts={0}&hn={1}&d=android-mobile'.format( 
- time_str, house_number) - sig = hmac.new(key, path.encode('utf-8'), hashlib.sha256).hexdigest() - auth_url = 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig) - token = self._download_webpage(auth_url, video_id) - - format_urls = [] + int(time.time()), house_number) + sig = hmac.new( + 'android.content.res.Resources'.encode('utf-8'), + path.encode('utf-8'), hashlib.sha256).hexdigest() + token = self._download_webpage( + 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id) def tokenize_url(url, token): - return ''.join([url, '?hdnea=', token]) + return update_url_query(url, { + 'hdnea': token, + }) - # May have higher quality video - sd_url = try_get( - stream, lambda x: x['streams']['hls']['sd'], compat_str) - if sd_url: - format_urls.append(tokenize_url(sd_url, token)) - else: - sd_low_url = try_get( - stream, lambda x: x['streams']['hls']['sd-low'], compat_str) - format_urls.append(tokenize_url(sd_low_url, token)) - - formats = [] - for format_url in format_urls: - if format_url: - formats.extend( - self._extract_m3u8_formats(format_url, video_id, 'mp4')) + for sd in ('sd', 'sd-low'): + sd_url = try_get( + stream, lambda x: x['streams']['hls'][sd], compat_str) + if not sd_url: + continue + formats = self._extract_m3u8_formats( + tokenize_url(sd_url, token), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + if formats: + break self._sort_formats(formats) subtitles = {} From 4f5cf319772b693c7781de47dbc5451c3c18a81c Mon Sep 17 00:00:00 2001 From: Windom Date: Sat, 30 Dec 2017 20:41:07 +0200 Subject: [PATCH 055/137] [slutload] Add support for mobile URLs --- youtube_dl/extractor/slutload.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py index 7145d285a..6fc2ff60d 100644 --- a/youtube_dl/extractor/slutload.py +++ b/youtube_dl/extractor/slutload.py @@ -1,11 +1,13 @@ from __future__ import unicode_literals +import re + 
from .common import InfoExtractor class SlutloadIE(InfoExtractor): _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$' - _TEST = { + _TESTS = [{ 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/', 'md5': '868309628ba00fd488cf516a113fd717', 'info_dict': { @@ -15,11 +17,17 @@ class SlutloadIE(InfoExtractor): 'age_limit': 18, 'thumbnail': r're:https?://.*?\.jpg' } - } + }, { + # mobile site + 'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + desktop_url = re.sub(r'^(https?://)mobile\.', r'\1', url) + webpage = self._download_webpage(desktop_url, video_id) video_title = self._html_search_regex(r'

([^<]+)', webpage, 'title').strip() From 126f225bcffbdc55725fe2272daf22489abacf54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Caletka?= Date: Sat, 30 Dec 2017 22:02:46 +0100 Subject: [PATCH 056/137] [extractor/common] Add container meta field for formats extracted in _parse_mpd_formats --- youtube_dl/extractor/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5e7e7a3f7..5b6a09c0b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1880,6 +1880,7 @@ class InfoExtractor(object): 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 'format_note': 'DASH %s' % content_type, 'filesize': filesize, + 'container': mimetype2ext(mime_type) + '_dash', } f.update(parse_codecs(representation_attrib.get('codecs'))) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) From c95c08a856af2499729e2141f5b683b266324fd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 31 Dec 2017 04:28:01 +0700 Subject: [PATCH 057/137] [ChangeLog] Actualize --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index 08aed1ba7..eff1fe8f7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,28 @@ version +Core ++ [extractor/common] Add container meta field for formats extracted + in _parse_mpd_formats (#13616) ++ [downloader/hls] Use HTTP headers for key request +* [common] Use AACL as the default fourcc when AudioTag is 255 +* [extractor/common] Fix extraction of DASH formats with the same + representation id (#15111) + Extractors ++ [slutload] Add support for mobile URLs (#14806) +* [abc:iview] Bypass geo restriction +* [abc:iview] Fix extraction (#14711, #14782, #14838, #14917, #14963, #14985, + #15035, #15057, #15061, #15071, #15095, #15106) * [openload] Fix extraction (#15118) +- [sandia] Remove extractor +- [collegerama] Remove extractor ++ 
[mediasite] Add support for sites based on Mediasite Video Platform (#5428, + #11185, #14343) ++ [ufctv] Add support for ufc.tv (#14520) +* [pluralsight] Fix missing first line of subtitles (#11118) +* [openload] Fallback on f-page extraction (#14665, #14879) +* [vimeo] Improve password protected videos extraction (#15114) +* [aws] Fix canonical/signed headers generation on python 2 (#15102) version 2017.12.28 From 04cf1a191a6b47bac93364ed8eb237a25058ee5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 31 Dec 2017 04:30:49 +0700 Subject: [PATCH 058/137] release 2017.12.31 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 4 ++-- youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f40927d36..3f8984943 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.28*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.28** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.31*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.31** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.12.28 +[debug] youtube-dl version 2017.12.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index eff1fe8f7..bfffb1f5f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.12.31 Core + [extractor/common] Add container meta field for formats extracted diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 6b935e572..75bd5c922 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -171,7 +171,6 @@ - **CNN** - **CNNArticle** - **CNNBlogs** - - **CollegeRama** - **ComCarCoff** - **ComedyCentral** - **ComedyCentralFullEpisodes** @@ -449,6 +448,7 @@ - **media.ccc.de** - **Medialaan** - **Mediaset** + - **Mediasite** - **Medici** - **megaphone.fm**: megaphone.fm embedded players - **Meipai**: 美拍 @@ -717,7 +717,6 @@ - **safari**: safaribooksonline.com online video - **safari:api** - **safari:course**: safaribooksonline.com online courses - - **Sandia**: Sandia National Laboratories - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au @@ -892,6 +891,7 @@ - **udemy** - **udemy:course** - **UDNEmbed**: 聯合影音 + - **UFCTV** - **UKTVPlay** - **umg:de**: Universal Music Deutschland - **Unistra** diff --git 
a/youtube_dl/version.py b/youtube_dl/version.py index f4a1b67a2..a3f84b9ea 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.12.28' +__version__ = '2017.12.31' From 6648fd8ad6e581354f46c840465cff4c92d2c6f3 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Mon, 1 Jan 2018 18:33:14 +0800 Subject: [PATCH 059/137] changed to use .get to get field from json object --- youtube_dl/extractor/weibo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 34809bdb2..cbe0c3228 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -121,7 +121,7 @@ class WeiboMobileIE(InfoExtractor): webpage = self._download_webpage(url, video_id, note="visit the page") js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) - page_info = weibo_info['status']['page_info'] + page_info = weibo_info.get('status').get('page_info') title = weibo_info.get('status').get('status_title') uploader = weibo_info.get('status').get('user').get('screen_name') From b300cda4769a987c501db298f3f107326521d45c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 1 Jan 2018 21:52:24 +0700 Subject: [PATCH 060/137] [YoutubeDL] Output python implementation in debug header --- youtube_dl/YoutubeDL.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ace80f14b..97bd9c526 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -2233,8 +2233,16 @@ class YoutubeDL(object): sys.exc_clear() except Exception: pass - self._write_string('[debug] Python version %s - %s\n' % ( - platform.python_version(), platform_name())) + + def 
python_implementation(): + impl_name = platform.python_implementation() + if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'): + return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] + return impl_name + + self._write_string('[debug] Python version %s (%s) - %s\n' % ( + platform.python_version(), python_implementation(), + platform_name())) exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version() From 54009c246e8eed38cfa8dc3eecb5619c1c81a1f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 1 Jan 2018 21:54:28 +0700 Subject: [PATCH 061/137] [travis] Add PyPy builds --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 5f4f3922b..7b175e9c9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,8 @@ python: - "3.4" - "3.5" - "3.6" + - "pypy" + - "pypy3" sudo: false env: - YTDL_TEST_SET=core From d7cd9a9e847fab3ac3f0fb5b4ad2e4788aeea775 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 12 Dec 2014 04:01:08 +0100 Subject: [PATCH 062/137] [utils] Fix youtube-dl under PyPy3 on Windows --- youtube_dl/compat.py | 21 +++++++++++++++++++++ youtube_dl/utils.py | 15 ++++++++------- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 2a62248ef..41ca9adf1 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -3,12 +3,14 @@ from __future__ import unicode_literals import binascii import collections +import ctypes import email import getpass import io import itertools import optparse import os +import platform import re import shlex import shutil @@ -2906,6 +2908,24 @@ except ImportError: # not 2.6+ or is 3.x except ImportError: compat_zip = zip +if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): + # PyPy2 prior to version 5.4.0 expects byte strings as Windows function + # names, see the original PyPy issue [1] and the youtube-dl one 
[2]. + # 1. https://bitbucket.org/pypy/pypy/issues/2360/windows-ctypescdll-typeerror-function-name + # 2. https://github.com/rg3/youtube-dl/pull/4392 + def compat_ctypes_WINFUNCTYPE(*args, **kwargs): + real = ctypes.WINFUNCTYPE(*args, **kwargs) + + def resf(tpl, *args, **kwargs): + funcname, dll = tpl + return real((str(funcname), dll), *args, **kwargs) + + return resf +else: + def compat_ctypes_WINFUNCTYPE(*args, **kwargs): + return ctypes.WINFUNCTYPE(*args, **kwargs) + + __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', @@ -2914,6 +2934,7 @@ __all__ = [ 'compat_chr', 'compat_cookiejar', 'compat_cookies', + 'compat_ctypes_WINFUNCTYPE', 'compat_etree_fromstring', 'compat_etree_register_namespace', 'compat_expanduser', diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2843a3dc0..386897a85 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ from .compat import ( compat_HTMLParser, compat_basestring, compat_chr, + compat_ctypes_WINFUNCTYPE, compat_etree_fromstring, compat_expanduser, compat_html_entities, @@ -1330,24 +1331,24 @@ def _windows_write_string(s, out): if fileno not in WIN_OUTPUT_IDS: return False - GetStdHandle = ctypes.WINFUNCTYPE( + GetStdHandle = compat_ctypes_WINFUNCTYPE( ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)( - (b'GetStdHandle', ctypes.windll.kernel32)) + ('GetStdHandle', ctypes.windll.kernel32)) h = GetStdHandle(WIN_OUTPUT_IDS[fileno]) - WriteConsoleW = ctypes.WINFUNCTYPE( + WriteConsoleW = compat_ctypes_WINFUNCTYPE( ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR, ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD), - ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32)) + ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32)) written = ctypes.wintypes.DWORD(0) - GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32)) + GetFileType = 
compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32)) FILE_TYPE_CHAR = 0x0002 FILE_TYPE_REMOTE = 0x8000 - GetConsoleMode = ctypes.WINFUNCTYPE( + GetConsoleMode = compat_ctypes_WINFUNCTYPE( ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.POINTER(ctypes.wintypes.DWORD))( - (b'GetConsoleMode', ctypes.windll.kernel32)) + ('GetConsoleMode', ctypes.windll.kernel32)) INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value def not_a_console(handle): From 7a6c204fcb6ba5a1a5149ea7a3c186eab87fc7e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 1 Jan 2018 23:21:39 +0700 Subject: [PATCH 063/137] [travis] Add Jython build --- .travis.yml | 11 +++++++---- devscripts/install_jython.sh | 5 +++++ 2 files changed, 12 insertions(+), 4 deletions(-) create mode 100755 devscripts/install_jython.sh diff --git a/.travis.yml b/.travis.yml index 7b175e9c9..92f326860 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,11 +14,14 @@ env: - YTDL_TEST_SET=core - YTDL_TEST_SET=download matrix: + include: + - env: JYTHON=true; YTDL_TEST_SET=core + - env: JYTHON=true; YTDL_TEST_SET=download fast_finish: true allow_failures: - env: YTDL_TEST_SET=download + - env: JYTHON=true; YTDL_TEST_SET=core + - env: JYTHON=true; YTDL_TEST_SET=download +before_install: + - if [ "$JYTHON" == "true" ]; then ./devscripts/install_jython.sh; export PATH="$HOME/jython/bin:$PATH"; fi script: ./devscripts/run_tests.sh -notifications: - email: - - filippo.valsorda@gmail.com - - yasoob.khld@gmail.com diff --git a/devscripts/install_jython.sh b/devscripts/install_jython.sh new file mode 100755 index 000000000..bafca4da4 --- /dev/null +++ b/devscripts/install_jython.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +wget http://central.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar +java -jar jython-installer-2.7.1.jar -s -d "$HOME/jython" +$HOME/jython/bin/jython -m pip install nose From 
b5e531f31a57d2cdc9f0edd77e0cbef426e016b9 Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Tue, 2 Jan 2018 17:32:17 +0100 Subject: [PATCH 064/137] [acast] Fix extraction --- youtube_dl/extractor/acast.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 6dace3051..5871e72dc 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -8,7 +8,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, - parse_iso8601, + unified_timestamp, OnDemandPagedList, ) @@ -32,7 +32,7 @@ class ACastIE(InfoExtractor): }, { # test with multiple blings 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': '55c0097badd7095f494c99a172f86501', + 'md5': 'e87d5b8516cd04c0d81b6ee1caca28d0', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', @@ -40,23 +40,24 @@ class ACastIE(InfoExtractor): 'timestamp': 1477346700, 'upload_date': '20161024', 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', - 'duration': 2797, + 'duration': 2766, } }] def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() cast_data = self._download_json( - 'https://embed.acast.com/api/acasts/%s/%s' % (channel, display_id), display_id) + 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), display_id) + e = cast_data['result']['episode'] return { - 'id': compat_str(cast_data['id']), + 'id': compat_str(e['id']), 'display_id': display_id, - 'url': [b['audio'] for b in cast_data['blings'] if b['type'] == 'BlingAudio'][0], - 'title': cast_data['name'], - 'description': cast_data.get('description'), - 'thumbnail': cast_data.get('image'), - 'timestamp': parse_iso8601(cast_data.get('publishingDate')), - 'duration': int_or_none(cast_data.get('duration')), + 'url': e['mediaUrl'], + 'title': e['name'], + 'description': e.get('description'), + 
'thumbnail': e.get('image'), + 'timestamp': unified_timestamp(e.get('publishingDate')), + 'duration': int_or_none(e.get('duration')), } From 9650c3e91d96fc054e37485d2bfd86a6a12417e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 2 Jan 2018 21:12:39 +0100 Subject: [PATCH 065/137] [rtve.es:alacarta] Fix extraction of some new URLs --- ChangeLog | 6 ++++++ youtube_dl/extractor/rtve.py | 3 +++ 2 files changed, 9 insertions(+) diff --git a/ChangeLog b/ChangeLog index bfffb1f5f..3e6afca92 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors +* [rtve.es:alacarta] Fix extraction of some new URLs + + version 2017.12.31 Core diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index d9edf9da2..fa60ffd5e 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -31,6 +31,9 @@ def _decrypt_url(png): hash_index = data.index('#') alphabet_data = data[:hash_index] url_data = data[hash_index + 1:] + if url_data[0] == 'H' and url_data[3] == '%': + # remove useless HQ%% at the start + url_data = url_data[4:] alphabet = [] e = 0 From f0c6c2bce29281d37d2bbd589143b35323e38e3d Mon Sep 17 00:00:00 2001 From: Luc Ritchie Date: Wed, 3 Jan 2018 04:22:55 -0500 Subject: [PATCH 066/137] [twitch] Pass video id to url_result when extracting playlist --- youtube_dl/extractor/twitch.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index bf57eac01..f9164af09 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -358,9 +358,16 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): break offset += limit return self.playlist_result( - [self.url_result(entry) for entry in orderedSet(entries)], + [self._make_url_result(entry) for entry in orderedSet(entries)], channel_id, channel_name) + def _make_url_result(self, url): + try: + video_id = 'v%s' % TwitchVodIE._match_id(url) + 
return self.url_result(url, TwitchVodIE.ie_key(), video_id=video_id) + except AssertionError: + return self.url_result(url) + def _extract_playlist_page(self, response): videos = response.get('videos') return [video['url'] for video in videos] if videos else [] From 75ba0efb52c898601ef9febcba3e7a84ff2f311e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 3 Jan 2018 16:41:28 +0700 Subject: [PATCH 067/137] [lynda] Skip invalid subtitles (closes #15159) --- youtube_dl/extractor/lynda.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 1b6f5091d..0d6026aad 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -244,8 +244,9 @@ class LyndaIE(LyndaBaseIE): def _get_subtitles(self, video_id): url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id subs = self._download_json(url, None, False) - if subs: - return {'en': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} + fixed_subs = self._fix_subtitles(subs) + if fixed_subs: + return {'en': [{'ext': 'srt', 'data': fixed_subs}]} else: return {} From de329f64abd920e148701436d6a20bfc5b2d3ef3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 4 Jan 2018 13:26:08 +0800 Subject: [PATCH 068/137] [openload] Fix extraction (closes #15166) --- ChangeLog | 1 + youtube_dl/extractor/openload.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 3e6afca92..94b27b6a1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Extractors +* [openload] Fix extraction (#15166) * [rtve.es:alacarta] Fix extraction of some new URLs diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index b282bcfd9..eaaaf8a08 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -333,7 +333,11 @@ class OpenloadIE(InfoExtractor): webpage, _ = phantom.get(page_url, html=webpage, 
video_id=video_id, headers=headers) decoded_id = (get_element_by_id('streamurl', webpage) or - get_element_by_id('streamuri', webpage)) + get_element_by_id('streamuri', webpage) or + get_element_by_id('streamurj', webpage)) + + if not decoded_id: + raise ExtractorError('Can\'t find stream URL', video_id=video_id) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id From da35331c6c23f90fc29ca3315f7e00481aff1d5c Mon Sep 17 00:00:00 2001 From: JianxinLi Date: Thu, 4 Jan 2018 22:25:28 +0800 Subject: [PATCH 069/137] [youku] Fix list extraction.(close #15135) (#15137) * [youku] Fix list extraction.(close #15135) Change-Id: I2e9c920143f4f16012252625943a8f18b8ff40eb * [youku] Remove KeyError try-except Change-Id: Ic46327905cbef1356b7b12d5eb3db5d9746ca338 --- youtube_dl/extractor/youku.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index c7947d4a1..5b0b248cd 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -245,13 +245,19 @@ class YoukuShowIE(InfoExtractor): # No data-id value. 'url': 'http://list.youku.com/show/id_zefbfbd61237fefbfbdef.html', 'only_matching': True, + }, { + # Wrong number of reload_id. 
+ 'url': 'http://list.youku.com/show/id_z20eb4acaf5c211e3b2ad.html', + 'only_matching': True, }] def _extract_entries(self, playlist_data_url, show_id, note, query): query['callback'] = 'cb' playlist_data = self._download_json( playlist_data_url, show_id, query=query, note=note, - transform_source=lambda s: js_to_json(strip_jsonp(s)))['html'] + transform_source=lambda s: js_to_json(strip_jsonp(s))).get('html') + if playlist_data is None: + return [None, None] drama_list = (get_element_by_class('p-drama-grid', playlist_data) or get_element_by_class('p-drama-half-row', playlist_data)) if drama_list is None: @@ -291,8 +297,8 @@ class YoukuShowIE(InfoExtractor): 'id': page_config['showid'], 'stage': reload_id, }) - entries.extend(new_entries) - + if new_entries is not None: + entries.extend(new_entries) desc = self._html_search_meta('description', webpage, fatal=False) playlist_title = desc.split(',')[0] if desc else None detail_li = get_element_by_class('p-intro', webpage) From c2f18e1c4915d43b37c12337dc4fbae68d43a304 Mon Sep 17 00:00:00 2001 From: Chih-Hsuan Yen Date: Thu, 4 Jan 2018 22:28:00 +0800 Subject: [PATCH 070/137] [ChangeLog] Update after #15137 [skip ci] --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index 94b27b6a1..96bc471f3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Extractors +* [youku] Fix list extraction (#15135) * [openload] Fix extraction (#15166) * [rtve.es:alacarta] Fix extraction of some new URLs From 0b0870f9d0dd3e72be3ff6be6bfa9fa43b693b50 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Thu, 4 Jan 2018 19:25:42 -0600 Subject: [PATCH 071/137] [soundcloud] Fallback to avatar picture for thumbnail (closes #12878) --- youtube_dl/extractor/soundcloud.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 8894f4b0c..6c9816eef 100644 --- a/youtube_dl/extractor/soundcloud.py +++ 
b/youtube_dl/extractor/soundcloud.py @@ -136,6 +136,25 @@ class SoundcloudIE(InfoExtractor): 'license': 'all-rights-reserved', }, }, + # no album art, use avatar pic for thumbnail + { + 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real', + 'md5': '59c7872bc44e5d99b7211891664760c2', + 'info_dict': { + 'id': '309699954', + 'ext': 'mp3', + 'title': 'Sideways (Prod. Mad Real)', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'uploader': 'garyvee', + 'upload_date': '20170226', + 'duration': 207, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + }, + 'params': { + 'skip_download': True, + }, + }, ] _CLIENT_ID = 'c6CU49JDMapyrQo06UxU9xouB9ZVzqCn' @@ -160,7 +179,7 @@ class SoundcloudIE(InfoExtractor): name = full_title or track_id if quiet: self.report_extraction(name) - thumbnail = info.get('artwork_url') + thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') ext = 'mp3' From b7c74c04036c07f8a81d3048b482afd6ef384b40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 6 Jan 2018 23:12:30 +0700 Subject: [PATCH 072/137] [lynda] Relax _VALID_URL (closes #15185) --- youtube_dl/extractor/lynda.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 0d6026aad..f5c7abc13 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -94,7 +94,15 @@ class LyndaBaseIE(InfoExtractor): class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' - _VALID_URL = r'https?://(?:www\.)?(?:lynda\.com|educourse\.ga)/(?:[^/]+/[^/]+/(?P\d+)|player/embed)/(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?:lynda\.com|educourse\.ga)/ + (?: + (?:[^/]+/){2,3}(?P\d+)| + player/embed + )/ + (?P\d+) + ''' _TIMECODE_REGEX = r'\[(?P\d+:\d+:\d+[\.,]\d+)\]' @@ -113,6 +121,9 
@@ class LyndaIE(LyndaBaseIE): }, { 'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', 'only_matching': True, + }, { + 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html', + 'only_matching': True, }] def _raise_unavailable(self, video_id): @@ -257,7 +268,15 @@ class LyndaCourseIE(LyndaBaseIE): # Course link equals to welcome/introduction video link of same course # We will recognize it as course link - _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P[^/]+/[^/]+/(?P\d+))-\d\.html' + _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P(?:[^/]+/){2,3}(?P\d+))-2\.html' + + _TESTS = [{ + 'url': 'https://www.lynda.com/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', + 'only_matching': True, + }, { + 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 45283afdec81af21ba50ff3aca3d86fb6d2584b0 Mon Sep 17 00:00:00 2001 From: Martin Weinelt Date: Sat, 6 Jan 2018 17:33:40 +0100 Subject: [PATCH 073/137] [motherless] Add support for groups --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/motherless.py | 73 ++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e64defe62..fb0997d39 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -609,7 +609,10 @@ from .mofosex import MofosexIE from .mojvideo import MojvideoIE from .moniker import MonikerIE from .morningstar import MorningstarIE -from .motherless import MotherlessIE +from .motherless import ( + MotherlessIE, + MotherlessGroupIE +) from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE diff 
--git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 6fe3b6049..90ed91ba6 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -4,8 +4,11 @@ import datetime import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( ExtractorError, + InAdvancePagedList, + orderedSet, str_to_int, unified_strdate, ) @@ -114,3 +117,73 @@ class MotherlessIE(InfoExtractor): 'age_limit': age_limit, 'url': video_url, } + + +class MotherlessGroupIE(InfoExtractor): + _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P[a-z0-9_]+)' + _TESTS = [{ + 'url': 'http://motherless.com/g/movie_scenes', + 'info_dict': { + 'id': 'movie_scenes', + 'title': 'Movie Scenes', + 'description': 'Hot and sexy scenes from "regular" movies... ' + 'Beautiful actresses fully nude... A looot of ' + 'skin! :)Enjoy!', + }, + 'playlist_mincount': 662, + }, { + 'url': 'http://motherless.com/gv/sex_must_be_funny', + 'info_dict': { + 'id': 'sex_must_be_funny', + 'title': 'Sex must be funny', + 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' + 'any kind!' 
+ }, + 'playlist_mincount': 9, + }] + + @classmethod + def suitable(cls, url): + return (False if MotherlessIE.suitable(url) + else super(MotherlessGroupIE, cls).suitable(url)) + + def _extract_entries(self, webpage, base): + return [ + self.url_result( + compat_urlparse.urljoin(base, video_path), + MotherlessIE.ie_key(), video_title=title) + for video_path, title in orderedSet(re.findall( + r'href="/([^"]+)"[^>]+>\s+]+alt="[^-]+-\s([^"]+)"', + webpage)) + ] + + def _real_extract(self, url): + group_id = self._match_id(url) + page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id) + webpage = self._download_webpage(page_url, group_id) + title = self._search_regex( + r'([\w\s]+\w)\s+-', webpage, 'title', fatal=False) + description = self._html_search_meta( + 'description', webpage, fatal=False) + page_count = self._int(self._search_regex( + r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT', + webpage, 'page_count'), 'page_count') + PAGE_SIZE = 80 + + def _get_page(idx): + webpage = self._download_webpage( + page_url, group_id, query={'page': idx + 1}, + note='Downloading page %d/%d' % (idx + 1, page_count) + ) + for entry in self._extract_entries(webpage, url): + yield entry + + playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': group_id, + 'title': title, + 'description': description, + 'entries': playlist + } From f12628f934ff50cc8e6441c4e64fe61019ebae5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 6 Jan 2018 23:58:00 +0700 Subject: [PATCH 074/137] [mitele] Fix extraction (closes #15186) --- youtube_dl/extractor/mitele.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 964dc542c..42759eae8 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,13 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals 
+import json import uuid from .common import InfoExtractor from .ooyala import OoyalaIE from ..compat import ( compat_str, - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -42,31 +42,33 @@ class MiTeleBaseIE(InfoExtractor): duration = int_or_none(mmc.get('duration')) for location in mmc['locations']: gat = self._proto_relative_url(location.get('gat'), 'http:') - bas = location.get('bas') - loc = location.get('loc') + gcp = location.get('gcp') ogn = location.get('ogn') - if None in (gat, bas, loc, ogn): + if None in (gat, gcp, ogn): continue token_data = { - 'bas': bas, - 'icd': loc, + 'gcp': gcp, 'ogn': ogn, - 'sta': '0', + 'sta': 0, } media = self._download_json( - '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), - video_id, 'Downloading %s JSON' % location['loc']) - file_ = media.get('file') - if not file_: + gat, video_id, data=json.dumps(token_data).encode('utf-8'), + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }) + stream = media.get('stream') or media.get('file') + if not stream: continue - ext = determine_ext(file_) + ext = determine_ext(stream) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + stream, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) self._sort_formats(formats) return { From a133eb7764594b830cb975e3925972214e932704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Jan 2018 00:02:41 +0700 Subject: [PATCH 075/137] [motherless:group] Capture leading slash of video path --- youtube_dl/extractor/motherless.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/motherless.py 
b/youtube_dl/extractor/motherless.py index 90ed91ba6..4adac691c 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -153,7 +153,7 @@ class MotherlessGroupIE(InfoExtractor): compat_urlparse.urljoin(base, video_path), MotherlessIE.ie_key(), video_title=title) for video_path, title in orderedSet(re.findall( - r'href="/([^"]+)"[^>]+>\s+<img[^>]+alt="[^-]+-\s([^"]+)"', + r'href="(/[^"]+)"[^>]+>\s+<img[^>]+alt="[^-]+-\s([^"]+)"', webpage)) ] From 0a5b1295b7c1aa6395b65ee137087c540b37b32b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Jan 2018 00:31:53 +0700 Subject: [PATCH 076/137] [motherless:group] Relax entry extraction and add a fallback scenario --- youtube_dl/extractor/motherless.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 4adac691c..e24396e79 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -148,14 +148,27 @@ class MotherlessGroupIE(InfoExtractor): else super(MotherlessGroupIE, cls).suitable(url)) def _extract_entries(self, webpage, base): - return [ - self.url_result( - compat_urlparse.urljoin(base, video_path), - MotherlessIE.ie_key(), video_title=title) - for video_path, title in orderedSet(re.findall( - r'href="(/[^"]+)"[^>]+>\s+<img[^>]+alt="[^-]+-\s([^"]+)"', - webpage)) - ] + entries = [] + for mobj in re.finditer( + r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?', + webpage): + video_url = compat_urlparse.urljoin(base, mobj.group('href')) + if not MotherlessIE.suitable(video_url): + continue + video_id = MotherlessIE._match_id(video_url) + title = mobj.group('title') + entries.append(self.url_result( + video_url, ie=MotherlessIE.ie_key(), video_id=video_id, + video_title=title)) + # Alternative fallback + if not entries: + entries = [ + self.url_result( + 
compat_urlparse.urljoin(base, '/' + video_id), + ie=MotherlessIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'data-codename=["\']([A-Z0-9]+)', webpage))] + return entries def _real_extract(self, url): group_id = self._match_id(url) From b0ead0e09aae6de6026a018cda7019eb7eade919 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Jan 2018 21:49:23 +0700 Subject: [PATCH 077/137] [jwplatform] Add support for multiple embeds (closes #15192) --- youtube_dl/extractor/generic.py | 6 +++--- youtube_dl/extractor/jwplatform.py | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index cc4c90b8c..9b0cd004f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2708,9 +2708,9 @@ class GenericIE(InfoExtractor): return self.url_result(viewlift_url) # Look for JWPlatform embeds - jwplatform_url = JWPlatformIE._extract_url(webpage) - if jwplatform_url: - return self.url_result(jwplatform_url, 'JWPlatform') + jwplatform_urls = JWPlatformIE._extract_urls(webpage) + if jwplatform_urls: + return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) # Look for Digiteka embeds digiteka_url = DigitekaIE._extract_url(webpage) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index c9bcbb08f..63d0dc998 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -23,11 +23,14 @@ class JWPlatformIE(InfoExtractor): @staticmethod def _extract_url(webpage): - mobj = re.search( - r'<(?:script|iframe)[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', + urls = JWPlatformIE._extract_urls(webpage) + return urls[0] if urls else None + + @staticmethod + def _extract_urls(webpage): + return re.findall( + 
r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//content\.jwplatform\.com/players/[a-zA-Z0-9]{8})', webpage) - if mobj: - return mobj.group('url') def _real_extract(self, url): video_id = self._match_id(url) From 8faa9576bb4599dc3e77b8d3339122aa4f1230b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Jan 2018 23:48:56 +0700 Subject: [PATCH 078/137] [ChangeLog] Actualize --- ChangeLog | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ChangeLog b/ChangeLog index 96bc471f3..67de65355 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,9 +1,21 @@ version <unreleased> +Core +* [utils] Fix youtube-dl under PyPy3 on Windows +* [YoutubeDL] Output python implementation in debug header + Extractors ++ [jwplatform] Add support for multiple embeds (#15192) +* [mitele] Fix extraction (#15186) ++ [motherless] Add support for groups (#15124) +* [lynda] Relax URL regular expression (#15185) +* [soundcloud] Fallback to avatar picture for thumbnail (#12878) * [youku] Fix list extraction (#15135) * [openload] Fix extraction (#15166) +* [lynda] Skip invalid subtitles (#15159) +* [twitch] Pass video id to url_result when extracting playlist (#15139) * [rtve.es:alacarta] Fix extraction of some new URLs +* [acast] Fix extraction (#15147) version 2017.12.31 From 950b5f296986ed0a2dd9feeb69dbb950592b6047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Jan 2018 23:52:16 +0700 Subject: [PATCH 079/137] release 2018.01.07 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 3f8984943..ad52c8900 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.31*. 
If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.31** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.07*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.07** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.12.31 +[debug] youtube-dl version 2018.01.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 67de65355..9c45ae000 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.01.07 Core * [utils] Fix youtube-dl under PyPy3 on Windows diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 75bd5c922..79b343048 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -478,6 +478,7 @@ - **Moniker**: allmyvideos.net and vidspot.net - **Morningstar**: morningstar.com - **Motherless** + - **MotherlessGroup** - **Motorsport**: motorsport.com - 
**MovieClips** - **MovieFap** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a3f84b9ea..9030e2415 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.12.31' +__version__ = '2018.01.07' From 3a513f29adc42fc46fd8b754806d38444bcee151 Mon Sep 17 00:00:00 2001 From: Luca Steeb <contact@luca-steeb.com> Date: Sat, 6 Jan 2018 20:27:26 +0100 Subject: [PATCH 080/137] fix bilibili extraction (closes #15171) --- youtube_dl/extractor/bilibili.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 1e57310d6..beffcecd0 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -102,6 +102,7 @@ class BiliBiliIE(InfoExtractor): video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) headers = { 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': url } headers.update(self.geo_verification_headers()) @@ -116,10 +117,15 @@ class BiliBiliIE(InfoExtractor): payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() + headers = { + 'Referer': url + } + headers.update(self.geo_verification_headers()) + video_info = self._download_json( 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), video_id, note='Downloading video info page', - headers=self.geo_verification_headers()) + headers=headers) if 'durl' not in video_info: self._report_error(video_info) From 7643916a3794f52169d16df093bd4a2b3abbb323 Mon Sep 17 00:00:00 2001 From: Chih-Hsuan Yen <yan12125@gmail.com> Date: Mon, 8 Jan 2018 01:32:13 +0800 Subject: [PATCH 081/137] [ChangeLog] update after #15188 [ci skip] --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index 
9c45ae000..9d37cdcef 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors +* [bilibili] fix extraction (#15188) + + version 2018.01.07 Core From a39e15c516865259735bd8f4f5629de5b0e77847 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 7 Jan 2018 22:15:44 +0100 Subject: [PATCH 082/137] [canalplus] fix extraction(closes #15072) --- youtube_dl/extractor/canalplus.py | 99 ++++--------------------------- 1 file changed, 12 insertions(+), 87 deletions(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index d8bf073f4..51c11cb7e 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -4,59 +4,36 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( - dict_get, # ExtractorError, # HEADRequest, int_or_none, qualities, - remove_end, unified_strdate, ) class CanalplusIE(InfoExtractor): - IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv' - _VALID_URL = r'''(?x) - https?:// - (?: - (?: - (?:(?:www|m)\.)?canalplus\.fr| - (?:www\.)?piwiplus\.fr| - (?:www\.)?d8\.tv| - (?:www\.)?c8\.fr| - (?:www\.)?d17\.tv| - (?:(?:football|www)\.)?cstar\.fr| - (?:www\.)?itele\.fr - )/(?:(?:[^/]+/)*(?P<display_id>[^/?#&]+))?(?:\?.*\bvid=(?P<vid>\d+))?| - player\.canalplus\.fr/#/(?P<id>\d+) - ) - - ''' + IE_DESC = 'mycanal.fr and piwiplus.fr' + _VALID_URL = r'https?://(?:www\.)?(?P<site>mycanal|piwiplus)\.fr/(?:[^/]+/)*(?P<display_id>[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P<id>\d+)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' _SITE_ID_MAP = { - 'canalplus': 'cplus', + 'mycanal': 'cplus', 'piwiplus': 'teletoon', - 'd8': 'd8', - 'c8': 'd8', - 'd17': 'd17', - 'cstar': 'd17', - 'itele': 'itele', } # Only works for direct mp4 URLs _GEO_COUNTRIES = ['FR'] _TESTS = [{ - 'url': 
'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814', + 'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061', 'info_dict': { - 'id': '1405510', - 'display_id': 'pid1830-c-zapping', + 'id': '1397061', + 'display_id': 'lolywood', 'ext': 'mp4', - 'title': 'Zapping - 02/07/2016', - 'description': 'Le meilleur de toutes les chaînes, tous les jours', - 'upload_date': '20160702', + 'title': 'Euro 2016 : Je préfère te prévenir - Lolywood - Episode 34', + 'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e', + 'upload_date': '20160602', }, }, { # geo restricted, bypassed @@ -70,64 +47,12 @@ class CanalplusIE(InfoExtractor): 'upload_date': '20140724', }, 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, { - # geo restricted, bypassed - 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html?vid=1443684', - 'md5': 'bb6f9f343296ab7ebd88c97b660ecf8d', - 'info_dict': { - 'id': '1443684', - 'display_id': 'pid6318-videos-integrales', - 'ext': 'mp4', - 'title': 'Guess my iep ! 
- TPMP - 07/04/2017', - 'description': 'md5:6f005933f6e06760a9236d9b3b5f17fa', - 'upload_date': '20170407', - }, - 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, { - 'url': 'http://www.itele.fr/chroniques/invite-michael-darmon/rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', - 'info_dict': { - 'id': '1420176', - 'display_id': 'rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', - 'ext': 'mp4', - 'title': 'L\'invité de Michaël Darmon du 14/10/2016 - ', - 'description': 'Chaque matin du lundi au vendredi, Michaël Darmon reçoit un invité politique à 8h25.', - 'upload_date': '20161014', - }, - }, { - 'url': 'http://football.cstar.fr/cstar-minisite-foot/pid7566-feminines-videos.html?vid=1416769', - 'info_dict': { - 'id': '1416769', - 'display_id': 'pid7566-feminines-videos', - 'ext': 'mp4', - 'title': 'France - Albanie : les temps forts de la soirée - 20/09/2016', - 'description': 'md5:c3f30f2aaac294c1c969b3294de6904e', - 'upload_date': '20160921', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://m.canalplus.fr/?vid=1398231', - 'only_matching': True, - }, { - 'url': 'http://www.d17.tv/emissions/pid8303-lolywood.html?vid=1397061', - 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + site, display_id, video_id = re.match(self._VALID_URL, url).groups() - site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]] - - # Beware, some subclasses do not define an id group - display_id = remove_end(dict_get(mobj.groupdict(), ('display_id', 'id', 'vid')), '.html') - - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)', - r'id=["\']canal_video_player(?P<id>\d+)', - r'data-video=["\'](?P<id>\d+)'], - webpage, 'video id', default=mobj.group('vid'), group='id') + site_id = self._SITE_ID_MAP[site] info_url = 
self._VIDEO_INFO_TEMPLATE % (site_id, video_id) video_data = self._download_json(info_url, video_id, 'Downloading video JSON') @@ -161,7 +86,7 @@ class CanalplusIE(InfoExtractor): format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False)) else: formats.append({ - # the secret extracted ya function in http://player.canalplus.fr/common/js/canalPlayer.js + # the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js 'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes', 'format_id': format_id, 'preference': preference(format_id), From 8005dc68cbdfc15b6353a071ef87d7e57d69ff59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 8 Jan 2018 21:53:03 +0700 Subject: [PATCH 083/137] [ok] Add support for live streams --- youtube_dl/extractor/odnoklassniki.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 8e13bcf1f..5c8b37e18 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -19,11 +19,11 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P<id>[\d-]+)' + _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer|live)/(?P<id>[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', - 'md5': '6ba728d85d60aa2e6dd37c9e70fdc6bc', + 'md5': '0b62089b479e06681abaaca9d204f152', 'info_dict': { 'id': '20079905452', 'ext': 'mp4', @@ -35,7 +35,6 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': int, 'age_limit': 0, }, - 'skip': 'Video has been blocked', }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', @@ -99,6 +98,9 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 
'http://mobile.ok.ru/video/20079905452', 'only_matching': True, + }, { + 'url': 'https://www.ok.ru/live/484531969818', + 'only_matching': True, }] def _real_extract(self, url): @@ -184,6 +186,10 @@ class OdnoklassnikiIE(InfoExtractor): }) return info + assert title + if provider == 'LIVE_TV_APP': + info['title'] = self._live_title(title) + quality = qualities(('4', '0', '1', '2', '3', '5')) formats = [{ @@ -210,6 +216,20 @@ class OdnoklassnikiIE(InfoExtractor): if fmt_type: fmt['quality'] = quality(fmt_type) + # Live formats + m3u8_url = metadata.get('hlsMasterPlaylistUrl') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8', + m3u8_id='hls', fatal=False)) + rtmp_url = metadata.get('rtmpUrl') + if rtmp_url: + formats.append({ + 'url': rtmp_url, + 'format_id': 'rtmp', + 'ext': 'flv', + }) + self._sort_formats(formats) info['formats'] = formats From 5eca00a2e33a6ca26a7f52589e5d77bab7e5edf4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 9 Jan 2018 18:12:55 +0800 Subject: [PATCH 084/137] [weibo] Misc improvements --- youtube_dl/extractor/weibo.py | 125 ++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index cbe0c3228..3cb4d71a6 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -8,7 +8,8 @@ import random import re from ..compat import ( - compat_urlparse, + compat_parse_qs, + compat_str, ) from ..utils import ( js_to_json, @@ -31,70 +32,71 @@ class WeiboIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id, note="first visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id) visitor_url = urlh.geturl() - headers = { - 'Referer': visitor_url - } - fp = { - "os": "2", - "browser": "Gecko57,0,0,0", - 
"fonts": "undefined", - "screenInfo": "1440*900*24", - "plugins": "" - } - data = urlencode_postdata({ - "cb": "gen_callback", - "fp": json.dumps(fp), - }) + if 'passport.weibo.com' in visitor_url: + # first visit + visitor_data = self._download_json( + 'https://passport.weibo.com/visitor/genvisitor', video_id, + note='Generating first-visit data', + transform_source=strip_jsonp, + headers={'Referer': visitor_url}, + data=urlencode_postdata({ + 'cb': 'gen_callback', + 'fp': json.dumps({ + 'os': '2', + 'browser': 'Gecko57,0,0,0', + 'fonts': 'undefined', + 'screenInfo': '1440*900*24', + 'plugins': '', + }), + })) - genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' - webpage = self._download_webpage(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") + tid = visitor_data['data']['tid'] + cnfd = '%03d' % visitor_data['data']['confidence'] - p = strip_jsonp(webpage) - i1 = p.find('{') - i2 = p.rfind('}') - j = p[i1:i2 + 1] # get JSON object - d = json.loads(j) - tid = d["data"]["tid"] - cnfd = "%03d" % d["data"]["confidence"] + self._download_webpage( + 'https://passport.weibo.com/visitor/visitor', video_id, + note='Running first-visit callback', + query={ + 'a': 'incarnate', + 't': tid, + 'w': 2, + 'c': cnfd, + 'cb': 'cross_domain', + 'from': 'weibo', + '_rand': random.random(), + }) - query = { - 'a': 'incarnate', - 't': tid, - 'w': 2, - 'c': cnfd, - 'cb': 'cross_domain', - 'from': 'weibo', - '_rand': random.random() - } - gencallback_url = "https://passport.weibo.com/visitor/visitor" - self._download_webpage(gencallback_url, video_id, note="gen callback", query=query) + webpage = self._download_webpage( + url, video_id, note='Revisiting webpage') - webpage = self._download_webpage(url, video_id, note="retry to visit the page") + title = self._html_search_regex( + r'<title>(.+?)', webpage, 'title') - title = self._html_search_regex(r'(.+?)', webpage, 'title') - - video_sources_text = self._search_regex(r'video-sources=\\\"(.+?)\"', 
webpage, 'video_sources') - - video_formats = compat_urlparse.parse_qs(video_sources_text) + video_formats = compat_parse_qs(self._search_regex( + r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) formats = [] - supported_resolutions = ('720', '480') + supported_resolutions = (480, 720) for res in supported_resolutions: - f = video_formats.get(res) - if isinstance(f, list): - if len(f) > 0: - vid_url = f[0] - formats.append({ - 'url': vid_url, - 'format': 'mp4', - 'height': int(res), - }) + vid_urls = video_formats.get(compat_str(res)) + if not vid_urls or not isinstance(vid_urls, list): + continue + + vid_url = vid_urls[0] + formats.append({ + 'url': vid_url, + 'height': res, + }) + self._sort_formats(formats) - uploader = self._og_search_property('nick-name', webpage, 'uploader', default=None) + + uploader = self._og_search_property( + 'nick-name', webpage, 'uploader', default=None) + return { 'id': video_id, 'title': title, @@ -118,12 +120,17 @@ class WeiboMobileIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) # to get Referer url for genvisitor - webpage = self._download_webpage(url, video_id, note="visit the page") - js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) - weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) - page_info = weibo_info.get('status').get('page_info') - title = weibo_info.get('status').get('status_title') - uploader = weibo_info.get('status').get('user').get('screen_name') + webpage = self._download_webpage(url, video_id, note='visit the page') + + weibo_info = self._parse_json(self._search_regex( + r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};', + webpage, 'js_code', flags=re.DOTALL), + video_id, transform_source=js_to_json) + + status_data = weibo_info.get('status', {}) + page_info = status_data.get('page_info') + title = status_data['status_title'] + uploader = status_data.get('user', 
{}).get('screen_name') return { 'id': video_id, From 0f71de076144f59fae0b3b7e9a5251f44449cd9b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 9 Jan 2018 18:13:49 +0800 Subject: [PATCH 085/137] [ChangeLog] Update after #15079 --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index 9d37cdcef..51825ccfe 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Extractors ++ [weibo] Add extractor (#15079) * [bilibili] fix extraction (#15188) From 5b23845125ba20b83ab3a41fb8ff4b34e460a5dd Mon Sep 17 00:00:00 2001 From: Chih-Hsuan Yen Date: Tue, 9 Jan 2018 19:35:39 +0800 Subject: [PATCH 086/137] Credit @sprhawk for the Weibo extractor (#15079) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 7e012247c..5a090a3ef 100644 --- a/AUTHORS +++ b/AUTHORS @@ -231,3 +231,4 @@ John Dong Tatsuyuki Ishi Daniel Weber Kay Bouché +Yang Hongbo From 310ea4661ddaea002c86d0ebbf4663b6c943b8f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 9 Jan 2018 22:04:50 +0700 Subject: [PATCH 087/137] [ndr:embed:base] Make separate formats extraction non fatal (closes #15203) --- youtube_dl/extractor/ndr.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 07528d140..aec2ea133 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -190,10 +190,12 @@ class NDREmbedBaseIE(InfoExtractor): ext = determine_ext(src, None) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id='hds')) + src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, + f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls', entry_protocol='m3u8_native')) + src, video_id, 'mp4', m3u8_id='hls', + entry_protocol='m3u8_native', fatal=False)) else: quality = f.get('quality') ff = { 
From 2b4e1ace4ac422acbe63be2f8cc23429de6812b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Jan 2018 05:36:03 +0700 Subject: [PATCH 088/137] [limelight] Tolerate empty pc formats (closes #15150, closes #15151, closes #15207) --- youtube_dl/extractor/limelight.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index ad65b2759..2803d7e8d 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -10,6 +10,7 @@ from ..utils import ( float_or_none, int_or_none, smuggle_url, + try_get, unsmuggle_url, ExtractorError, ) @@ -220,6 +221,12 @@ class LimelightBaseIE(InfoExtractor): 'subtitles': subtitles, } + def _extract_info_helper(self, pc, mobile, i, metadata): + return self._extract_info( + try_get(pc, lambda x: x['playlistItems'][i]['streams'], list) or [], + try_get(mobile, lambda x: x['mediaList'][i]['mobileUrls'], list) or [], + metadata) + class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' @@ -282,10 +289,7 @@ class LimelightMediaIE(LimelightBaseIE): 'getMobilePlaylistByMediaId', 'properties', smuggled_data.get('source_url')) - return self._extract_info( - pc['playlistItems'][0].get('streams', []), - mobile['mediaList'][0].get('mobileUrls', []) if mobile else [], - metadata) + return self._extract_info_helper(pc, mobile, 0, metadata) class LimelightChannelIE(LimelightBaseIE): @@ -326,10 +330,7 @@ class LimelightChannelIE(LimelightBaseIE): 'media', smuggled_data.get('source_url')) entries = [ - self._extract_info( - pc['playlistItems'][i].get('streams', []), - mobile['mediaList'][i].get('mobileUrls', []) if mobile else [], - medias['media_list'][i]) + self._extract_info_helper(pc, mobile, i, medias['media_list'][i]) for i in range(len(medias['media_list']))] return self.playlist_result(entries, channel_id, pc['title']) From e654829b4c4b8ebd4efb4554dd02cc1418c6fc23 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Jan 2018 21:24:22 +0700 Subject: [PATCH 089/137] [digg] Add extractor (closes #15214) --- youtube_dl/extractor/digg.py | 41 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 42 insertions(+) create mode 100644 youtube_dl/extractor/digg.py diff --git a/youtube_dl/extractor/digg.py b/youtube_dl/extractor/digg.py new file mode 100644 index 000000000..611134ac0 --- /dev/null +++ b/youtube_dl/extractor/digg.py @@ -0,0 +1,41 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class DiggIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?digg\.com/video/(?P[^/?#&]+)' + _TEST = { + 'url': 'http://digg.com/video/sci-fi-short-jonah-daniel-kaluuya-get-out', + 'info_dict': { + 'id': 'LcqvmS0b', + 'ext': 'mp4', + 'title': "'Get Out' Star Daniel Kaluuya Goes On 'Moby Dick'-Like Journey In Sci-Fi Short 'Jonah'", + 'description': 'md5:541bb847648b6ee3d6514bc84b82efda', + 'upload_date': '20180109', + 'timestamp': 1515530551, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + jwplatform_id = self._search_regex( + r'video_id\s*:\s*["\']([a-zA-Z0-9]{8})', webpage, 'jwplatform id', + default=None) + + if not jwplatform_id: + return self.url_result(url, 'Generic') + + return { + '_type': 'url_transparent', + 'ie_key': 'JWPlatform', + 'url': 'jwplatform:%s' % jwplatform_id, + 'id': jwplatform_id, + 'display_id': display_id, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a3ad4df1f..3bfd1b7ed 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -259,6 +259,7 @@ from .deezer import DeezerPlaylistIE from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE +from .digg import DiggIE from .dotsub import DotsubIE from .douyutv 
import ( DouyuShowIE, From 1b79daffd965fcb3776e8304bd393db6573b50ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Jan 2018 22:19:51 +0700 Subject: [PATCH 090/137] [digg] Improve extraction --- youtube_dl/extractor/digg.py | 43 ++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/digg.py b/youtube_dl/extractor/digg.py index 611134ac0..913c1750f 100644 --- a/youtube_dl/extractor/digg.py +++ b/youtube_dl/extractor/digg.py @@ -1,11 +1,13 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import js_to_json class DiggIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?digg\.com/video/(?P[^/?#&]+)' - _TEST = { + _TESTS = [{ + # JWPlatform via provider 'url': 'http://digg.com/video/sci-fi-short-jonah-daniel-kaluuya-get-out', 'info_dict': { 'id': 'LcqvmS0b', @@ -18,24 +20,37 @@ class DiggIE(InfoExtractor): 'params': { 'skip_download': True, }, - } + }, { + # Youtube via provider + 'url': 'http://digg.com/video/dog-boat-seal-play', + 'only_matching': True, + }, { + # vimeo as regular embed + 'url': 'http://digg.com/video/dream-girl-short-film', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - jwplatform_id = self._search_regex( - r'video_id\s*:\s*["\']([a-zA-Z0-9]{8})', webpage, 'jwplatform id', - default=None) + info = self._parse_json( + self._search_regex( + r'(?s)video_info\s*=\s*({.+?});\n', webpage, 'video info', + default='{}'), display_id, transform_source=js_to_json, + fatal=False) - if not jwplatform_id: - return self.url_result(url, 'Generic') + video_id = info.get('video_id') - return { - '_type': 'url_transparent', - 'ie_key': 'JWPlatform', - 'url': 'jwplatform:%s' % jwplatform_id, - 'id': jwplatform_id, - 'display_id': display_id, - } + if video_id: + provider = info.get('provider_name') + if provider == 'youtube': + return 
self.url_result( + video_id, ie='Youtube', video_id=video_id) + elif provider == 'jwplayer': + return self.url_result( + 'jwplatform:%s' % video_id, ie='JWPlatform', + video_id=video_id) + + return self.url_result(url, 'Generic') From a90641fe87d62936b717b9c2fdbe453578b441ed Mon Sep 17 00:00:00 2001 From: scil Date: Thu, 11 Jan 2018 20:35:09 +0800 Subject: [PATCH 091/137] [ximalaya_extractor] Add new extractor ximalaya (#14687) * [ximalaya_extractor] Add new extractor * format change according by flake8 * changes accoring to review by @yan12125 at github pull #14687 * change %d to %s in a temp str * seond changes accoring to review by @yan12125 at github pull #1468 * improve TESTS about contains * changes accoring to third review by @yan12125 at github pull #1468 * forth changes accoring to forth review by @yan12125 at github pull #1468 --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/ximalaya.py | 233 +++++++++++++++++++++++++++++ 2 files changed, 237 insertions(+) create mode 100644 youtube_dl/extractor/ximalaya.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3bfd1b7ed..37624d37a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1328,6 +1328,10 @@ from .xiami import ( XiamiArtistIE, XiamiCollectionIE ) +from .ximalaya import ( + XimalayaIE, + XimalayaAlbumIE +) from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py new file mode 100644 index 000000000..a912e54b8 --- /dev/null +++ b/youtube_dl/extractor/ximalaya.py @@ -0,0 +1,233 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor + + +class XimalayaBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['CN'] + + +class XimalayaIE(XimalayaBaseIE): + IE_NAME = 'ximalaya' + IE_DESC = '喜马拉雅FM' + _VALID_URL = 
r'https?://(?:www\.|m\.)?ximalaya\.com/(?P[0-9]+)/sound/(?P[0-9]+)' + _USER_URL_FORMAT = '%s://www.ximalaya.com/zhubo/%i/' + _TESTS = [ + { + 'url': 'http://www.ximalaya.com/61425525/sound/47740352/', + 'info_dict': { + 'id': '47740352', + 'ext': 'm4a', + 'uploader': '小彬彬爱听书', + 'uploader_id': 61425525, + 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/', + 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白', + 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['renwen', '人文'], + 'duration': 93, + 'view_count': int, + 'like_count': int, + } + }, + { + 'url': 'http://m.ximalaya.com/61425525/sound/47740352/', + 'info_dict': { + 'id': '47740352', + 'ext': 'm4a', + 'uploader': '小彬彬爱听书', + 'uploader_id': 61425525, + 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/', + 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白', + 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['renwen', '人文'], + 'duration': 93, + 'view_count': int, + 'like_count': int, + } + }, + { + 'url': 'https://www.ximalaya.com/11045267/sound/15705996/', + 'info_dict': { + 'id': '15705996', + 'ext': 'm4a', + 'uploader': '李延隆老师', + 'uploader_id': 11045267, + 'uploader_url': 'https://www.ximalaya.com/zhubo/11045267/', + 'title': 'Lesson 1 Excuse me!', + 'description': "contains:Listen to the tape then answer\xa0this question. 
Whose handbag is it?\n" + "听录音,然后回答问题,这是谁的手袋?", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['train', '外语'], + 'duration': 40, + 'view_count': int, + 'like_count': int, + } + }, + ] + + def _real_extract(self, url): + + is_m = 'm.ximalaya' in url + scheme = 'https' if url.startswith('https') else 'http' + + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id, + note='Download sound page for %s' % audio_id, + errnote='Unable to get sound page') + + audio_info_file = '%s://m.ximalaya.com/tracks/%s.json' % (scheme, audio_id) + audio_info = self._download_json(audio_info_file, audio_id, + 'Downloading info json %s' % audio_info_file, + 'Unable to download info file') + + formats = [] + for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64')): + if audio_info.get(k): + formats.append({ + 'format_id': bps, + 'url': audio_info[k], + }) + + thumbnails = [] + for k in audio_info.keys(): + # cover pics kyes like: cover_url', 'cover_url_142' + if k.startswith('cover_url'): + thumbnail = {'name': k, 'url': audio_info[k]} + if k == 'cover_url_142': + thumbnail['width'] = 180 + thumbnail['height'] = 180 + thumbnails.append(thumbnail) + + audio_uploader_id = audio_info.get('uid') + + if is_m: + audio_description = self._html_search_regex(r'(?s)]+>(.+?)', + webpage, 'audio_description', fatal=False) + else: + audio_description = self._html_search_regex(r'(?s)]*>(.+?)', + webpage, 'audio_description', fatal=False) + + if not audio_description: + audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id) + audio_description = self._download_webpage(audio_description_file, audio_id, + note='Downloading description file %s' % audio_description_file, + errnote='Unable to download descrip file', + fatal=False) + audio_description = audio_description.strip() 
if audio_description else None + + return { + 'id': audio_id, + 'uploader': audio_info.get('nickname'), + 'uploader_id': audio_uploader_id, + 'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id) if audio_uploader_id else None, + 'title': audio_info['title'], + 'thumbnails': thumbnails, + 'description': audio_description, + 'categories': list(filter(None, (audio_info.get('category_name'), audio_info.get('category_title')))), + 'duration': audio_info.get('duration'), + 'view_count': audio_info.get('play_count'), + 'like_count': audio_info.get('favorites_count'), + 'formats': formats, + } + + +class XimalayaAlbumIE(XimalayaBaseIE): + IE_NAME = 'ximalaya:album' + IE_DESC = '喜马拉雅FM 专辑' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P[0-9]+)/album/(?P[0-9]+)' + _TEMPLATE_URL = '%s://www.ximalaya.com/%s/album/%s/' + _BASE_URL_TEMPL = '%s://www.ximalaya.com%s' + _LIST_VIDEO_RE = r']+?href="(?P/%s/sound/(?P\d+)/?)"[^>]+?title="(?P[^>]+)">' + _TESTS = [{ + 'url': 'http://www.ximalaya.com/61425525/album/5534601/', + 'info_dict': { + 'title': '唐诗三百首(含赏析)', + 'id': '5534601', + }, + 'playlist_count': 312, + }, { + 'url': 'http://m.ximalaya.com/61425525/album/5534601', + 'info_dict': { + 'title': '唐诗三百首(含赏析)', + 'id': '5534601', + }, + 'playlist_count': 312, + }, + ] + + def _real_extract(self, url): + self.scheme = scheme = 'https' if url.startswith('https') else 'http' + + mobj = re.match(self._VALID_URL, url) + uid, playlist_id = mobj.group('uid'), mobj.group('id') + + webpage = self._download_webpage(self._TEMPLATE_URL % (scheme, uid, playlist_id), playlist_id, + note='Download album page for %s' % playlist_id, + errnote='Unable to get album info') + + title = self._html_search_regex(r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>', + webpage, 'title', fatal=False) + + return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title) + + def _entries(self, page, playlist_id, uid): + html = page + for page_num in 
itertools.count(1): + for entry in self._process_page(html, uid): + yield entry + + next_url = self._search_regex(r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3', + html, 'list_next_url', default=None, group='more') + if not next_url: + break + + next_full_url = self._BASE_URL_TEMPL % (self.scheme, next_url) + html = self._download_webpage(next_full_url, playlist_id) + + def _process_page(self, html, uid): + find_from = html.index('album_soundlist') + for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]): + yield self.url_result(self._BASE_URL_TEMPL % (self.scheme, mobj.group('url')), + XimalayaIE.ie_key(), + mobj.group('id'), + mobj.group('title')) From 37941fe204ef855590af64584b75d7fc95997fc6 Mon Sep 17 00:00:00 2001 From: Chih-Hsuan Yen <yan12125@gmail.com> Date: Thu, 11 Jan 2018 20:36:06 +0800 Subject: [PATCH 092/137] [ChangeLog] Update after #14687 [ci skip] --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index 51825ccfe..ea1be934c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors ++ [ximalaya] Add extractor (#14687) + [weibo] Add extractor (#15079) * [bilibili] fix extraction (#15188) From 64287560e4a7af9401e84318d9d04783c1b289af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 11 Jan 2018 23:06:56 +0700 Subject: [PATCH 093/137] [pandoratv] Add support for new URL format (closes #15131) --- youtube_dl/extractor/pandoratv.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py index fc7bd3411..0c27a61d7 100644 --- a/youtube_dl/extractor/pandoratv.py +++ b/youtube_dl/extractor/pandoratv.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_str, @@ -18,7 +20,13 @@ from ..utils import ( class 
PandoraTVIE(InfoExtractor): IE_NAME = 'pandora.tv' IE_DESC = '판도라TV' - _VALID_URL = r'https?://(?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?pandora\.tv/view/(?P<user_id>[^/]+)/(?P<id>\d+)| # new format + (?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\? # old format + ) + ''' _TESTS = [{ 'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2', 'info_dict': { @@ -53,14 +61,22 @@ class PandoraTVIE(InfoExtractor): # Test metadata only 'skip_download': True, }, + }, { + 'url': 'http://www.pandora.tv/view/mikakim/53294230#36797454_new', + 'only_matching': True, }] def _real_extract(self, url): - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = qs.get('prgid', [None])[0] - user_id = qs.get('ch_userid', [None])[0] - if any(not f for f in (video_id, user_id,)): - raise ExtractorError('Invalid URL', expected=True) + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('user_id') + video_id = mobj.group('id') + + if not user_id or not video_id: + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = qs.get('prgid', [None])[0] + user_id = qs.get('ch_userid', [None])[0] + if any(not f for f in (video_id, user_id,)): + raise ExtractorError('Invalid URL', expected=True) data = self._download_json( 'http://m.pandora.tv/?c=view&m=viewJsonApi&ch_userid=%s&prgid=%s' From 609850acfb6c03dcdfa4d9cdba77df3b0a259968 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 11 Jan 2018 23:10:18 +0700 Subject: [PATCH 094/137] [pandoratv] Add support for mobile URLs (closes #12441) --- youtube_dl/extractor/pandoratv.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py index 0c27a61d7..538738c09 100644 --- a/youtube_dl/extractor/pandoratv.py +++ b/youtube_dl/extractor/pandoratv.py 
@@ -24,7 +24,8 @@ class PandoraTVIE(InfoExtractor): https?:// (?: (?:www\.)?pandora\.tv/view/(?P<user_id>[^/]+)/(?P<id>\d+)| # new format - (?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\? # old format + (?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?| # old format + m\.pandora\.tv/?\? # mobile ) ''' _TESTS = [{ @@ -64,6 +65,9 @@ class PandoraTVIE(InfoExtractor): }, { 'url': 'http://www.pandora.tv/view/mikakim/53294230#36797454_new', 'only_matching': True, + }, { + 'url': 'http://m.pandora.tv/?c=view&ch_userid=mikakim&prgid=54600346', + 'only_matching': True, }] def _real_extract(self, url): From e565a6386e61f7741a5386520f1b36efe2cb3310 Mon Sep 17 00:00:00 2001 From: Chih-Hsuan Yen <yan12125@gmail.com> Date: Fri, 12 Jan 2018 15:36:01 +0800 Subject: [PATCH 095/137] Credit @scil for ximalaya extractor (#14687) [ci skip] --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 5a090a3ef..40215a5cf 100644 --- a/AUTHORS +++ b/AUTHORS @@ -232,3 +232,4 @@ Tatsuyuki Ishi Daniel Weber Kay Bouché Yang Hongbo +Lei Wang From 47e2a9bc53c1f4a10dda62e473ec553108f7ee89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 13 Jan 2018 18:47:47 +0700 Subject: [PATCH 096/137] [viafree] Skip rtmp formats (closes #15232) --- youtube_dl/extractor/tvplay.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 46132eda1..84597b55e 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -273,6 +273,8 @@ class TVPlayIE(InfoExtractor): 'ext': ext, } if video_url.startswith('rtmp'): + if smuggled_data.get('skip_rtmp'): + continue m = re.search( r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url) if not m: @@ -434,6 +436,10 @@ class ViafreeIE(InfoExtractor): return self.url_result( smuggle_url( 'mtg:%s' % video_id, - {'geo_countries': [ - 
compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]]}), + { + 'geo_countries': [ + compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]], + # rtmp host mtgfs.fplive.net for viafree is unresolvable + 'skip_rtmp': True, + }), ie=TVPlayIE.ie_key(), video_id=video_id) From d4aedca3bd82288c802d4d766d5542bfcec4a91a Mon Sep 17 00:00:00 2001 From: "Hendrik v. Raven" <hendrik@consetetur.de> Date: Sat, 6 Jan 2018 15:09:53 +0100 Subject: [PATCH 097/137] [gamestar] Add support for gamepro.de (closes #3384) --- youtube_dl/extractor/gamestar.py | 43 +++++++++++++++++++------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index e607d6ab8..7ce2f15de 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -9,21 +9,27 @@ from ..utils import ( class GameStarIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' - _TEST = { - 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', - 'md5': '96974ecbb7fd8d0d20fca5a00810cea7', - 'info_dict': { - 'id': '76110', - 'ext': 'mp4', - 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', - 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1406542020, - 'upload_date': '20140728', - 'duration': 17 - } - } + _VALID_URL = r'https?://(?:www\.)?game(?:pro|star)\.de/videos/.*,(?P<id>[0-9]+)\.html' + _TESTS = [ + { + 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', + 'md5': 'ee782f1f8050448c95c5cacd63bc851c', + 'info_dict': { + 'id': '76110', + 'ext': 'mp4', + 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', + 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der 
Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1406542380, + 'upload_date': '20140728', + 'duration': 17, + } + }, + { + 'url': 'http://www.gamepro.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', + 'only_matching': True, + }, + ] def _real_extract(self, url): video_id = self._match_id(url) @@ -38,11 +44,12 @@ class GameStarIE(InfoExtractor): webpage, 'JSON-LD', group='json_ld'), video_id) info_dict = self._json_ld(json_ld, video_id) info_dict['title'] = remove_end(info_dict['title'], ' - GameStar') + info_dict['title'] = remove_end(info_dict['title'], ' - GamePro') - view_count = json_ld.get('interactionCount') + view_count = int_or_none(json_ld.get('interactionCount')) comment_count = int_or_none(self._html_search_regex( - r'([0-9]+) Kommentare</span>', webpage, 'comment_count', - fatal=False)) + r'<span>Kommentare</span><span class="count">\(([0-9]+)\)</span>', + webpage, 'comment_count', fatal=False)) info_dict.update({ 'id': video_id, From df16e645f60def2b5e1cf88d74164d6ced0d5651 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 13 Jan 2018 19:36:26 +0700 Subject: [PATCH 098/137] [gamestar] Fix issues (closes #15179) --- youtube_dl/extractor/gamestar.py | 61 +++++++++++++++++--------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index 7ce2f15de..f00dab2f3 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( int_or_none, @@ -9,33 +11,34 @@ from ..utils import ( class GameStarIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?game(?:pro|star)\.de/videos/.*,(?P<id>[0-9]+)\.html' - _TESTS = [ - { - 'url': 
'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', - 'md5': 'ee782f1f8050448c95c5cacd63bc851c', - 'info_dict': { - 'id': '76110', - 'ext': 'mp4', - 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', - 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1406542380, - 'upload_date': '20140728', - 'duration': 17, - } - }, - { - 'url': 'http://www.gamepro.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', - 'only_matching': True, - }, - ] + _VALID_URL = r'https?://(?:www\.)?game(?P<site>pro|star)\.de/videos/.*,(?P<id>[0-9]+)\.html' + _TESTS = [{ + 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', + 'md5': 'ee782f1f8050448c95c5cacd63bc851c', + 'info_dict': { + 'id': '76110', + 'ext': 'mp4', + 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', + 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1406542380, + 'upload_date': '20140728', + 'duration': 17, + } + }, { + 'url': 'http://www.gamepro.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', + 'only_matching': True, + }, { + 'url': 'http://www.gamestar.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + mobj = re.match(self._VALID_URL, url) + site = mobj.group('site') + video_id = mobj.group('id') - url = 
'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id + webpage = self._download_webpage(url, video_id) # TODO: there are multiple ld+json objects in the webpage, # while _search_json_ld finds only the first one @@ -43,17 +46,17 @@ class GameStarIE(InfoExtractor): r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>[^<]+VideoObject[^<]+)</script>', webpage, 'JSON-LD', group='json_ld'), video_id) info_dict = self._json_ld(json_ld, video_id) - info_dict['title'] = remove_end(info_dict['title'], ' - GameStar') - info_dict['title'] = remove_end(info_dict['title'], ' - GamePro') + info_dict['title'] = remove_end( + info_dict['title'], ' - Game%s' % site.title()) view_count = int_or_none(json_ld.get('interactionCount')) comment_count = int_or_none(self._html_search_regex( - r'<span>Kommentare</span><span class="count">\(([0-9]+)\)</span>', - webpage, 'comment_count', fatal=False)) + r'<span>Kommentare</span>\s*<span[^>]+class=["\']count[^>]+>\s*\(\s*([0-9]+)', + webpage, 'comment count', fatal=False)) info_dict.update({ 'id': video_id, - 'url': url, + 'url': 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id, 'ext': 'mp4', 'view_count': view_count, 'comment_count': comment_count From 2d8bb80c60289868d479e36a90cf1b73e9221893 Mon Sep 17 00:00:00 2001 From: Sebastian Leske <sebastian.leske@sleske.name> Date: Wed, 25 Oct 2017 14:59:57 +0200 Subject: [PATCH 099/137] [wdr:elefant] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/wdr.py | 54 ++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 37624d37a..255df75fe 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1289,6 +1289,7 @@ from .watchbox import WatchBoxIE from .watchindianporn import WatchIndianPornIE from .wdr import ( WDRIE, + WDRElefantIE, 
WDRMobileIE, ) from .webcaster import ( diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 621de1e1e..4871ae92b 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -16,7 +16,7 @@ from ..utils import ( class WDRBaseIE(InfoExtractor): - def _extract_wdr_video(self, webpage, display_id): + def _extract_jsonp_url(self, webpage, display_id): # for wdr.de the data-extension is in a tag with the class "mediaLink" # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" # for wdrmaus, in a tag with the class "videoButton" (previously a link @@ -35,8 +35,9 @@ class WDRBaseIE(InfoExtractor): media_link_obj = self._parse_json(json_metadata, display_id, transform_source=js_to_json) - jsonp_url = media_link_obj['mediaObj']['url'] + return media_link_obj['mediaObj']['url'] + def _extract_wdr_video(self, jsonp_url, display_id): metadata = self._download_json( jsonp_url, display_id, transform_source=strip_jsonp) @@ -206,7 +207,8 @@ class WDRIE(WDRBaseIE): display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - info_dict = self._extract_wdr_video(webpage, display_id) + jsonp_url = self._extract_jsonp_url(webpage, display_id) + info_dict = self._extract_wdr_video(jsonp_url, display_id) if not info_dict: entries = [ @@ -239,6 +241,52 @@ class WDRIE(WDRBaseIE): return info_dict +class WDRElefantIE(WDRBaseIE): + _VALID_URL = r'https?://(?:www\.)wdrmaus.de/elefantenseite/#(?P<display_id>.+)' + IE_NAME = 'wdr:elefant' + + _TESTS = [ + { + 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', + 'info_dict': { + 'title': 'Folge Oster-Spezial 2015', + 'id': 'mdb-1088195', + 'ext': 'mp4', + 'age_limit': None, + 'upload_date': '20150406' + }, + 'params': { + 'skip_download' : True, + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') + + # Table of Contents seems to always be at this address, so fetch it 
directly. + # The website fetches configurationJS.php5, which links to tableOfContentsJS.php5. + table_of_contents = self._download_json( + 'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5', display_id) + if display_id not in table_of_contents: + raise ExtractorError( + 'No entry in site\'s table of contents for this URL. ' + 'Is the fragment part of the URL (after the #) correct?', + expected=True) + xml_metadata_path = table_of_contents[display_id]['xmlPath'] + xml_metadata = self._download_xml( + 'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path, display_id) + zmdb_url_element = xml_metadata.find('./movie/zmdb_url') + if zmdb_url_element is None: + raise ExtractorError( + 'The URL looks valid, but no video was found. Note that download only works ' + 'on pages showing a single video, not on video selection pages.', + expected=True) + info_dict = self._extract_wdr_video(zmdb_url_element.text, display_id) + return info_dict + + class WDRMobileIE(InfoExtractor): _VALID_URL = r'''(?x) https?://mobile-ondemand\.wdr\.de/ From 54e8f62e01b54eeccd8313349f86ae541082704f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 13 Jan 2018 23:28:08 +0700 Subject: [PATCH 100/137] [wdr] Rework extractors (closes #14598) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/sportschau.py | 38 ----- youtube_dl/extractor/wdr.py | 232 +++++++++++++++-------------- 3 files changed, 124 insertions(+), 148 deletions(-) delete mode 100644 youtube_dl/extractor/sportschau.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 255df75fe..c82614bf9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -991,7 +991,6 @@ from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import SportBoxEmbedIE from .sportdeutschland import SportDeutschlandIE -from .sportschau import SportschauIE from .sprout import SproutIE from 
.srgssr import ( SRGSSRIE, @@ -1289,6 +1288,7 @@ from .watchbox import WatchBoxIE from .watchindianporn import WatchIndianPornIE from .wdr import ( WDRIE, + WDRPageIE, WDRElefantIE, WDRMobileIE, ) diff --git a/youtube_dl/extractor/sportschau.py b/youtube_dl/extractor/sportschau.py deleted file mode 100644 index 0d7925a08..000000000 --- a/youtube_dl/extractor/sportschau.py +++ /dev/null @@ -1,38 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .wdr import WDRBaseIE -from ..utils import get_element_by_attribute - - -class SportschauIE(WDRBaseIE): - IE_NAME = 'Sportschau' - _VALID_URL = r'https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video-?(?P<id>[^/#?]+)\.html' - _TEST = { - 'url': 'http://www.sportschau.de/uefaeuro2016/videos/video-dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100.html', - 'info_dict': { - 'id': 'mdb-1140188', - 'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100', - 'ext': 'mp4', - 'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen', - 'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. 
Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.', - 'upload_date': '20160615', - }, - 'skip': 'Geo-restricted to Germany', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - title = get_element_by_attribute('class', 'headline', webpage) - description = self._html_search_meta('description', webpage, 'description') - - info = self._extract_wdr_video(webpage, video_id) - - info.update({ - 'title': title, - 'description': description, - }) - - return info diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 4871ae92b..6bf5aeaed 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -4,50 +4,52 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( determine_ext, ExtractorError, js_to_json, strip_jsonp, + try_get, unified_strdate, update_url_query, urlhandle_detect_ext, ) -class WDRBaseIE(InfoExtractor): - def _extract_jsonp_url(self, webpage, display_id): - # for wdr.de the data-extension is in a tag with the class "mediaLink" - # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" - # for wdrmaus, in a tag with the class "videoButton" (previously a link - # to the page in a multiline "videoLink"-tag) - json_metadata = self._html_search_regex( - r'''(?sx)class= - (?: - (["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+| - (["\'])videoLink\b.*?\2[\s]*>\n[^\n]* - )data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3 - ''', - webpage, 'media link', default=None, group='data') +class WDRIE(InfoExtractor): + _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js' + _TEST = { + 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', + 'info_dict': { + 'id': 'mdb-1140188', + 'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100', + 
'ext': 'mp4', + 'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen', + 'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.', + 'upload_date': '20160615', + }, + 'skip': 'Geo-restricted to Germany', + } - if not json_metadata: - return + def _real_extract(self, url): + video_id = self._match_id(url) - media_link_obj = self._parse_json(json_metadata, display_id, - transform_source=js_to_json) - return media_link_obj['mediaObj']['url'] - - def _extract_wdr_video(self, jsonp_url, display_id): metadata = self._download_json( - jsonp_url, display_id, transform_source=strip_jsonp) + url, video_id, transform_source=strip_jsonp) - metadata_tracker_data = metadata['trackerData'] - metadata_media_resource = metadata['mediaResource'] + is_live = metadata.get('mediaType') == 'live' + + tracker_data = metadata['trackerData'] + media_resource = metadata['mediaResource'] formats = [] # check if the metadata contains a direct URL to a file - for kind, media_resource in metadata_media_resource.items(): + for kind, media_resource in media_resource.items(): if kind not in ('dflt', 'alt'): continue @@ -58,13 +60,13 @@ class WDRBaseIE(InfoExtractor): ext = determine_ext(medium_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - medium_url, display_id, 'mp4', 'm3u8_native', + medium_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')) elif ext == 'f4m': manifest_url = update_url_query( medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) formats.extend(self._extract_f4m_formats( - manifest_url, display_id, f4m_id='hds', fatal=False)) + manifest_url, video_id, f4m_id='hds', fatal=False)) elif ext == 'smil': formats.extend(self._extract_smil_formats( medium_url, 'stream', fatal=False)) @@ -74,7 +76,7 @@ class WDRBaseIE(InfoExtractor): } if ext == 'unknown_video': urlh = self._request_webpage( - medium_url, display_id, 
note='Determining extension') + medium_url, video_id, note='Determining extension') ext = urlhandle_detect_ext(urlh) a_format['ext'] = ext formats.append(a_format) @@ -82,30 +84,30 @@ class WDRBaseIE(InfoExtractor): self._sort_formats(formats) subtitles = {} - caption_url = metadata_media_resource.get('captionURL') + caption_url = media_resource.get('captionURL') if caption_url: subtitles['de'] = [{ 'url': caption_url, 'ext': 'ttml', }] - title = metadata_tracker_data['trackerClipTitle'] + title = tracker_data['trackerClipTitle'] return { - 'id': metadata_tracker_data.get('trackerClipId', display_id), - 'display_id': display_id, - 'title': title, - 'alt_title': metadata_tracker_data.get('trackerClipSubcategory'), + 'id': tracker_data.get('trackerClipId', video_id), + 'title': self._live_title(title) if is_live else title, + 'alt_title': tracker_data.get('trackerClipSubcategory'), 'formats': formats, 'subtitles': subtitles, - 'upload_date': unified_strdate(metadata_tracker_data.get('trackerClipAirTime')), + 'upload_date': unified_strdate(tracker_data.get('trackerClipAirTime')), + 'is_live': is_live, } -class WDRIE(WDRBaseIE): +class WDRPageIE(InfoExtractor): _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' - _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P<type>[^/]+)/(?P<display_id>.+)\.html' - _VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL + _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL _TESTS = [ { @@ -125,6 +127,7 @@ class WDRIE(WDRBaseIE): 'ext': 'ttml', }]}, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', @@ -140,19 +143,17 @@ class WDRIE(WDRBaseIE): 'is_live': False, 'subtitles': {} }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 
'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { - 'id': 'mdb-103364', + 'id': 'mdb-1406149', 'ext': 'mp4', - 'display_id': 'index', - 'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'alt_title': 'WDR Fernsehen Live', - 'upload_date': None, - 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', + 'upload_date': '20150101', 'is_live': True, - 'subtitles': {} }, 'params': { 'skip_download': True, # m3u8 download @@ -160,19 +161,18 @@ class WDRIE(WDRBaseIE): }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', - 'playlist_mincount': 8, + 'playlist_mincount': 7, 'info_dict': { - 'id': 'aktuelle-stunde/aktuelle-stunde-120', + 'id': 'aktuelle-stunde-120', }, }, { 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', 'info_dict': { - 'id': 'mdb-1323501', + 'id': 'mdb-1552552', 'ext': 'mp4', 'upload_date': 're:^[0-9]{8}$', 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', - 'description': 'Die Seite mit der Maus -', }, 'skip': 'The id changes from week to week because of the new episode' }, @@ -184,7 +184,6 @@ class WDRIE(WDRBaseIE): 'ext': 'mp4', 'upload_date': '20130919', 'title': 'Sachgeschichte - Achterbahn ', - 'description': 'Die Seite mit der Maus -', }, }, { @@ -192,83 +191,100 @@ class WDRIE(WDRBaseIE): # Live stream, MD5 unstable 'info_dict': { 'id': 'mdb-869971', - 'ext': 'flv', - 'title': 'COSMO Livestream', - 'description': 'md5:2309992a6716c347891c045be50992e4', + 'ext': 'mp4', + 'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'upload_date': '20160101', }, + 'params': { + 'skip_download': True, # m3u8 download + } + }, + { + 'url': 'http://www.sportschau.de/handballem2018/handball-nationalmannschaft-em-stolperstein-vorrunde-100.html', + 'info_dict': { + 'id': 'mdb-1556012', + 
'ext': 'mp4', + 'title': 'DHB-Vizepräsident Bob Hanning - "Die Weltspitze ist extrem breit"', + 'upload_date': '20180111', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', + 'only_matching': True, } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - url_type = mobj.group('type') - page_url = mobj.group('page_url') display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - jsonp_url = self._extract_jsonp_url(webpage, display_id) - info_dict = self._extract_wdr_video(jsonp_url, display_id) + entries = [] - if not info_dict: + # Article with several videos + + # for wdr.de the data-extension is in a tag with the class "mediaLink" + # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" + # for wdrmaus, in a tag with the class "videoButton" (previously a link + # to the page in a multiline "videoLink"-tag) + for mobj in re.finditer( + r'''(?sx)class= + (?: + (["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+| + (["\'])videoLink\b.*?\2[\s]*>\n[^\n]* + )data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3 + ''', webpage): + media_link_obj = self._parse_json( + mobj.group('data'), display_id, transform_source=js_to_json, + fatal=False) + if not media_link_obj: + continue + jsonp_url = try_get( + media_link_obj, lambda x: x['mediaObj']['url'], compat_str) + if jsonp_url: + entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key())) + + # Playlist (e.g. 
https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html) + if not entries: entries = [ - self.url_result(page_url + href[0], 'WDR') - for href in re.findall( - r'<a href="(%s)"[^>]+data-extension=' % self._PAGE_REGEX, - webpage) + self.url_result( + compat_urlparse.urljoin(url, mobj.group('href')), + ie=WDRPageIE.ie_key()) + for mobj in re.finditer( + r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension=', + webpage) if re.match(self._PAGE_REGEX, mobj.group('href')) ] - if entries: # Playlist page - return self.playlist_result(entries, playlist_id=display_id) - - raise ExtractorError('No downloadable streams found', expected=True) - - is_live = url_type == 'live' - - if is_live: - info_dict.update({ - 'title': self._live_title(info_dict['title']), - 'upload_date': None, - }) - elif 'upload_date' not in info_dict: - info_dict['upload_date'] = unified_strdate(self._html_search_meta('DC.Date', webpage, 'upload date')) - - info_dict.update({ - 'description': self._html_search_meta('Description', webpage), - 'is_live': is_live, - }) - - return info_dict + return self.playlist_result(entries, playlist_id=display_id) -class WDRElefantIE(WDRBaseIE): - _VALID_URL = r'https?://(?:www\.)wdrmaus.de/elefantenseite/#(?P<display_id>.+)' - IE_NAME = 'wdr:elefant' - - _TESTS = [ - { - 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', - 'info_dict': { - 'title': 'Folge Oster-Spezial 2015', - 'id': 'mdb-1088195', - 'ext': 'mp4', - 'age_limit': None, - 'upload_date': '20150406' - }, - 'params': { - 'skip_download' : True, - }, +class WDRElefantIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P<id>.+)' + _TEST = { + 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', + 'info_dict': { + 'title': 'Folge Oster-Spezial 2015', + 'id': 'mdb-1088195', + 'ext': 'mp4', + 'age_limit': None, + 'upload_date': '20150406' }, - ] + 'params': { + 'skip_download': True, + }, + } def 
_real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') + display_id = self._match_id(url) # Table of Contents seems to always be at this address, so fetch it directly. # The website fetches configurationJS.php5, which links to tableOfContentsJS.php5. table_of_contents = self._download_json( - 'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5', display_id) + 'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5', + display_id) if display_id not in table_of_contents: raise ExtractorError( 'No entry in site\'s table of contents for this URL. ' @@ -276,15 +292,13 @@ class WDRElefantIE(WDRBaseIE): expected=True) xml_metadata_path = table_of_contents[display_id]['xmlPath'] xml_metadata = self._download_xml( - 'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path, display_id) + 'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path, + display_id) zmdb_url_element = xml_metadata.find('./movie/zmdb_url') if zmdb_url_element is None: raise ExtractorError( - 'The URL looks valid, but no video was found. 
Note that download only works ' - 'on pages showing a single video, not on video selection pages.', - expected=True) - info_dict = self._extract_wdr_video(zmdb_url_element.text, display_id) - return info_dict + '%s is not a video' % display_id, expected=True) + return self.url_result(zmdb_url_element.text, ie=WDRIE.ie_key()) class WDRMobileIE(InfoExtractor): From 1915662d4fe09120d3f28db55c7be90b4d12a9f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 13 Jan 2018 23:30:56 +0700 Subject: [PATCH 101/137] [wdr] Bypass geo restriction --- youtube_dl/extractor/wdr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 6bf5aeaed..d6ba254f5 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -22,6 +22,7 @@ from ..utils import ( class WDRIE(InfoExtractor): _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js' + _GEO_COUNTRIES = ['DE'] _TEST = { 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', 'info_dict': { From 0ce39bc542813462c2d95dc21f1b363c4ae7a1e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 13 Jan 2018 23:33:52 +0700 Subject: [PATCH 102/137] [wdr] Fix test --- youtube_dl/extractor/wdr.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index d6ba254f5..cf6f7c7ed 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -26,14 +26,11 @@ class WDRIE(InfoExtractor): _TEST = { 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', 'info_dict': { - 'id': 'mdb-1140188', - 'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100', + 'id': 'mdb-1557833', 'ext': 'mp4', - 'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen', - 'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. 
Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.', - 'upload_date': '20160615', + 'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe', + 'upload_date': '20180112', }, - 'skip': 'Geo-restricted to Germany', } def _real_extract(self, url): From 391dd6f0946e2dd499147c5f3c6bb13642314515 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 14 Jan 2018 00:03:22 +0700 Subject: [PATCH 103/137] [youtube] Fix live streams extraction (closes #15202) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0919bef0e..a01ec1436 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1810,7 +1810,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': video_info['conn'][0], 'player_url': player_url, }] - elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1: + elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) From dd896a6a07939a343776570770b5c8f69e8c0988 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 14 Jan 2018 00:10:04 +0700 Subject: [PATCH 104/137] [ChangeLog] Actualize --- ChangeLog | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index ea1be934c..7c63d4bac 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,9 +1,22 @@ version <unreleased> Extractors -+ [ximalaya] Add extractor 
(#14687) +* [youtube] Fix live streams extraction (#15202) +* [wdr] Bypass geo restriction +* [wdr] Rework extractors (#14598) ++ [wdr] Add support for wdrmaus.de/elefantenseite (#14598) ++ [gamestar] Add support for gamepro.de (#3384) +* [viafree] Skip rtmp formats (#15232) ++ [pandoratv] Add support for mobile URLs (#12441) ++ [pandoratv] Add support for new URL format (#15131) ++ [ximalaya] Add support for ximalaya.com (#14687) ++ [digg] Add support for digg.com (#15214) +* [limelight] Tolerate empty pc formats (#15150, #15151, #15207) +* [ndr:embed:base] Make separate formats extraction non fatal (#15203) + [weibo] Add extractor (#15079) -* [bilibili] fix extraction (#15188) ++ [ok] Add support for live streams +* [canalplus] Fix extraction (#15072) +* [bilibili] Fix extraction (#15188) version 2018.01.07 From e11ccd76c6dc8438c01d30445627eab5203cb1dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 14 Jan 2018 00:13:56 +0700 Subject: [PATCH 105/137] release 2018.01.14 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 10 ++++++++-- youtube_dl/version.py | 2 +- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index ad52c8900..6bc7d0366 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.07*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.07** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.14*. 
If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.14** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.01.07 +[debug] youtube-dl version 2018.01.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 7c63d4bac..bfafaca6a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.01.14 Extractors * [youtube] Fix live streams extraction (#15202) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 79b343048..c04a75b88 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -128,7 +128,7 @@ - **CamdemyFolder** - **CamWithHer** - **canalc2.tv** - - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv + - **Canalplus**: mycanal.fr and piwiplus.fr - **Canvas** - **CanvasEen**: canvas.be and een.be - **CarambaTV** @@ -210,6 +210,7 @@ - **defense.gouv.fr** - **democracynow** - **DHM**: Filmarchiv - Deutsches Historisches Museum + - **Digg** - **DigitallySpeaking** - **Digiteka** - **Discovery** @@ -773,7 +774,6 @@ - **Sport5** - **SportBoxEmbed** - **SportDeutschland** - - **Sportschau** - **Sprout** - **sr:mediathek**: 
Saarländischer Rundfunk - **SRGSSR** @@ -1002,10 +1002,14 @@ - **WatchIndianPorn**: Watch Indian Porn - **WDR** - **wdr:mobile** + - **WDRElefant** + - **WDRPage** - **Webcaster** - **WebcasterFeed** - **WebOfStories** - **WebOfStoriesPlaylist** + - **Weibo** + - **WeiboMobile** - **WeiqiTV**: WQTV - **wholecloud**: WholeCloud - **Wimp** @@ -1025,6 +1029,8 @@ - **xiami:artist**: 虾米音乐 - 歌手 - **xiami:collection**: 虾米音乐 - 精选集 - **xiami:song**: 虾米音乐 + - **ximalaya**: 喜马拉雅FM + - **ximalaya:album**: 喜马拉雅FM 专辑 - **XMinus** - **XNXX** - **Xstream** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9030e2415..498149110 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.01.07' +__version__ = '2018.01.14' From a86922c4702e2c8538337124c5bf02a4b5f9aa4a Mon Sep 17 00:00:00 2001 From: Reto Kromer <retokromer@users.noreply.github.com> Date: Sat, 13 Jan 2018 18:58:38 +0100 Subject: [PATCH 106/137] [README.md] Clarify macOS name --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 47b0640ab..eb05f848f 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ Or with [MacPorts](https://www.macports.org/): Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://rg3.github.io/youtube-dl/download.html). # DESCRIPTION -**youtube-dl** is a command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version 2.6, 2.7, or 3.2+, and it is not platform specific. It should work on your Unix box, on Windows or on Mac OS X. It is released to the public domain, which means you can modify it, redistribute it or use it however you like. 
+**youtube-dl** is a command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version 2.6, 2.7, or 3.2+, and it is not platform specific. It should work on your Unix box, on Windows or on macOS. It is released to the public domain, which means you can modify it, redistribute it or use it however you like. youtube-dl [OPTIONS] URL [URL...] @@ -863,7 +863,7 @@ Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox). -Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, Mac OS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. +Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. 
Passing cookies to youtube-dl is a good way to workaround login when a particular extractor does not implement it explicitly. Another use case is working around [CAPTCHA](https://en.wikipedia.org/wiki/CAPTCHA) some websites require you to solve in particular cases in order to get access (e.g. YouTube, CloudFlare). From 1d1d60f6dd4d7c33dc690ff5c68fdab11e6c0af7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 15 Jan 2018 22:56:45 +0700 Subject: [PATCH 107/137] [vk] Detect more errors due to copyright complaints (#15259) --- youtube_dl/extractor/vk.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index d4838b3e5..b8ea50362 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -318,9 +318,14 @@ class VKIE(VKBaseIE): 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', expected=True) + ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' + ERRORS = { r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': - 'Video %s has been removed from public access due to rightholder complaint.', + ERROR_COPYRIGHT, + + r'>The video .*? 
was removed from public access by request of the copyright holder.<': + ERROR_COPYRIGHT, r'<!>Please log in or <': 'Video %s is only available for registered users, ' From 1370dba59f140dc5313ed1da1dad7bcba09cfb1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 16 Jan 2018 22:34:16 +0700 Subject: [PATCH 108/137] [twitch] Fix authentication and error capture (closes #14090, closes #15264) --- youtube_dl/extractor/twitch.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index f9164af09..1981b4d4a 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -85,10 +85,15 @@ class TwitchBaseIE(InfoExtractor): if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: response = self._parse_json( e.cause.read().decode('utf-8'), None) - fail(response['message']) + fail(response.get('message') or response['errors'][0]) raise - redirect_url = urljoin(post_url, response['redirect']) + if 'Authenticated successfully' in response.get('message', ''): + return None, None + + redirect_url = urljoin( + post_url, + response.get('redirect') or response['redirect_path']) return self._download_webpage_handle( redirect_url, None, 'Downloading login redirect page', headers=headers) @@ -106,6 +111,10 @@ class TwitchBaseIE(InfoExtractor): 'password': password, }) + # Successful login + if not redirect_page: + return + if re.search(r'(?i)<form[^>]+id="two-factor-submit"', redirect_page) is not None: # TODO: Add mechanism to request an SMS or phone call tfa_token = self._get_tfa_info('two-factor authentication token') From 4471affc348af40409188f133786780edd969623 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 17 Jan 2018 22:03:56 +0700 Subject: [PATCH 109/137] [spiegel] Add support for nexx videos (closes #15285) --- youtube_dl/extractor/spiegel.py | 17 ++++++++++++++++- 1 
file changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 84298fee4..fc995e8c1 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .nexx import NexxEmbedIE +from .nexx import ( + NexxIE, + NexxEmbedIE, +) from .spiegeltv import SpiegeltvIE from ..compat import compat_urlparse from ..utils import ( @@ -51,6 +54,10 @@ class SpiegelIE(InfoExtractor): }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', 'only_matching': True, + }, { + # nexx video + 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -61,6 +68,14 @@ class SpiegelIE(InfoExtractor): if SpiegeltvIE.suitable(handle.geturl()): return self.url_result(handle.geturl(), 'Spiegeltv') + nexx_id = self._search_regex( + r'nexxOmniaId\s*:\s*(\d+)', webpage, 'nexx id', default=None) + if nexx_id: + domain_id = NexxIE._extract_domain_id(webpage) or '748' + return self.url_result( + 'nexx:%s:%s' % (domain_id, nexx_id), ie=NexxIE.ie_key(), + video_id=nexx_id) + video_data = extract_attributes(self._search_regex(r'(<div[^>]+id="spVideoElements"[^>]+>)', webpage, 'video element', default='')) title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage) From cad9caf76b529c4feb611ad85c54fc2f4f15c71e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 18 Jan 2018 22:26:43 +0700 Subject: [PATCH 110/137] [kamcord] Remove extractor (closes #15322) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/kamcord.py | 71 ------------------------------ 2 files changed, 72 deletions(-) delete mode 100644 
youtube_dl/extractor/kamcord.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c82614bf9..b9d6ea807 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -490,7 +490,6 @@ from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE from .kakao import KakaoIE from .kaltura import KalturaIE -from .kamcord import KamcordIE from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE diff --git a/youtube_dl/extractor/kamcord.py b/youtube_dl/extractor/kamcord.py deleted file mode 100644 index b50120d98..000000000 --- a/youtube_dl/extractor/kamcord.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - qualities, -) - - -class KamcordIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kamcord\.com/v/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'https://www.kamcord.com/v/hNYRduDgWb4', - 'md5': 'c3180e8a9cfac2e86e1b88cb8751b54c', - 'info_dict': { - 'id': 'hNYRduDgWb4', - 'ext': 'mp4', - 'title': 'Drinking Madness', - 'uploader': 'jacksfilms', - 'uploader_id': '3044562', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video = self._parse_json( - self._search_regex( - r'window\.__props\s*=\s*({.+?});?(?:\n|\s*</script)', - webpage, 'video'), - video_id)['video'] - - title = video['title'] - - formats = self._extract_m3u8_formats( - video['play']['hls'], video_id, 'mp4', entry_protocol='m3u8_native') - self._sort_formats(formats) - - uploader = video.get('user', {}).get('username') - uploader_id = video.get('user', {}).get('id') - - view_count = int_or_none(video.get('viewCount')) - like_count = int_or_none(video.get('heartCount')) - comment_count = 
int_or_none(video.get('messageCount')) - - preference_key = qualities(('small', 'medium', 'large')) - - thumbnails = [{ - 'url': thumbnail_url, - 'id': thumbnail_id, - 'preference': preference_key(thumbnail_id), - } for thumbnail_id, thumbnail_url in (video.get('thumbnail') or {}).items() - if isinstance(thumbnail_id, compat_str) and isinstance(thumbnail_url, compat_str)] - - return { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'like_count': like_count, - 'comment_count': comment_count, - 'thumbnails': thumbnails, - 'formats': formats, - } From 67408fe0e94c59dd261ae96e7ae09d765042069f Mon Sep 17 00:00:00 2001 From: Varun <mailvarunest@gmail.com> Date: Thu, 18 Jan 2018 21:00:43 +0530 Subject: [PATCH 111/137] [soundcloud] Update client id (closes #15306) --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 6c9816eef..97ff422f0 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -157,7 +157,7 @@ class SoundcloudIE(InfoExtractor): }, ] - _CLIENT_ID = 'c6CU49JDMapyrQo06UxU9xouB9ZVzqCn' + _CLIENT_ID = 'DQskPX1pntALRzMp4HSxya3Mc0AO66Ro' _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' @staticmethod From 68da3d033c0ff88424d1966bc342f3d72a6d677a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 18 Jan 2018 23:37:46 +0700 Subject: [PATCH 112/137] [ChangeLog] Actualize --- ChangeLog | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ChangeLog b/ChangeLog index bfafaca6a..cd9ebccab 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +version <unreleased> + +Extractors +* [soundcloud] Update client id (#15306) +- [kamcord] Remove extractor (#15322) ++ [spiegel] Add support for nexx videos (#15285) +* [twitch] Fix authentication and error capture (#14090, #15264) +* [vk] 
Detect more errors due to copyright complaints (#15259) + + version 2018.01.14 Extractors From e2fc6df16971cf710596915887de25c1c52ed47c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 18 Jan 2018 23:41:44 +0700 Subject: [PATCH 113/137] release 2018.01.18 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 - youtube_dl/version.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 6bc7d0366..079e7629e 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.14** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.01.14 +[debug] youtube-dl version 2018.01.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index cd9ebccab..d678bc7af 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.01.18 Extractors * [soundcloud] Update client id (#15306) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c04a75b88..0caced5bd 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -383,7 +383,6 @@ - **JWPlatform** - **Kakao** - **Kaltura** - - **Kamcord** - **KanalPlay**: Kanal 5/9/11 Play - **Kankan** - **Karaoketv** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 498149110..717c1e899 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.01.14' +__version__ = '2018.01.18' From 154e4fdace1d0caad92ccc8556b4cfe4eb28b00a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 19 Jan 2018 22:49:58 +0700 Subject: [PATCH 114/137] [ringtv] Remove extractor (closes #15345) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/ringtv.py | 44 ------------------------------ 
2 files changed, 45 deletions(-) delete mode 100644 youtube_dl/extractor/ringtv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b9d6ea807..9d3582dfa 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -881,7 +881,6 @@ from .revision3 import ( Revision3IE, ) from .rice import RICEIE -from .ringtv import RingTVIE from .rmcdecouverte import RMCDecouverteIE from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE diff --git a/youtube_dl/extractor/ringtv.py b/youtube_dl/extractor/ringtv.py deleted file mode 100644 index 2c2c707bd..000000000 --- a/youtube_dl/extractor/ringtv.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class RingTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30', - 'md5': 'd25945f5df41cdca2d2587165ac28720', - 'info_dict': { - 'id': '857645', - 'ext': 'mp4', - 'title': 'Video: Luis Collazo says Victor Ortiz "better not quit on Jan. 30" - Ring TV', - 'description': 'Luis Collazo is excited about his Jan. 30 showdown with fellow former welterweight titleholder Victor Ortiz at Barclays Center in his hometown of Brooklyn. The SuperBowl week fight headlines a Golden Boy Live! 
card on Fox Sports 1.', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id').split('-')[0] - webpage = self._download_webpage(url, video_id) - - if mobj.group('type') == 'news': - video_id = self._search_regex( - r'''(?x)<iframe[^>]+src="http://cms\.springboardplatform\.com/ - embed_iframe/[0-9]+/video/([0-9]+)/''', - webpage, 'real video ID') - title = self._og_search_title(webpage) - description = self._html_search_regex( - r'addthis:description="([^"]+)"', - webpage, 'description', fatal=False) - final_url = 'http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/conversion/%s.mp4' % video_id - thumbnail_url = 'http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/snapshots/%s.jpg' % video_id - - return { - 'id': video_id, - 'url': final_url, - 'title': title, - 'thumbnail': thumbnail_url, - 'description': description, - } From 2a3683c3780698dfa1d1175734fca97d1cd6c215 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 19 Jan 2018 18:26:47 +0100 Subject: [PATCH 115/137] prosiebensat1: add another clip ID regexp --- youtube_dl/extractor/prosiebensat1.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index d8a4bd244..7c9743fe8 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -344,6 +344,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): r'clip[iI]d=(\d+)', r'clip[iI]d\s*=\s*["\'](\d+)', r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", + r'proMamsId":"(\d+)', ] _TITLE_REGEXES = [ r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>', From 7d5406216ae061d11815a8e8a97ad6617eeaab38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 20 Jan 2018 00:33:45 +0700 Subject: [PATCH 116/137] [springboardplatform] Add extractor --- 
youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 22 ++++ youtube_dl/extractor/springboardplatform.py | 125 ++++++++++++++++++++ 3 files changed, 148 insertions(+) create mode 100644 youtube_dl/extractor/springboardplatform.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9d3582dfa..f5bac6fd9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -989,6 +989,7 @@ from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import SportBoxEmbedIE from .sportdeutschland import SportDeutschlandIE +from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE from .srgssr import ( SRGSSRIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9b0cd004f..1d9da8115 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -101,6 +101,7 @@ from .vzaar import VzaarIE from .channel9 import Channel9IE from .vshare import VShareIE from .mediasite import MediasiteIE +from .springboardplatform import SpringboardPlatformIE class GenericIE(InfoExtractor): @@ -1938,6 +1939,21 @@ class GenericIE(InfoExtractor): 'timestamp': 1474354800, 'upload_date': '20160920', } + }, + { + 'url': 'http://www.kidzworld.com/article/30935-trolls-the-beat-goes-on-interview-skylar-astin-and-amanda-leighton', + 'info_dict': { + 'id': '1731611', + 'ext': 'mp4', + 'title': 'Official Trailer | TROLLS: THE BEAT GOES ON!', + 'description': 'md5:eb5f23826a027ba95277d105f248b825', + 'timestamp': 1516100691, + 'upload_date': '20180116', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [SpringboardPlatformIE.ie_key()], } # { # # TODO: find another test @@ -2906,6 +2922,12 @@ class GenericIE(InfoExtractor): for mediasite_url in mediasite_urls] return self.playlist_result(entries, video_id, video_title) + springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage) + if 
springboardplatform_urls: + return self.playlist_from_matches( + springboardplatform_urls, video_id, video_title, + ie=SpringboardPlatformIE.ie_key()) + def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): diff --git a/youtube_dl/extractor/springboardplatform.py b/youtube_dl/extractor/springboardplatform.py new file mode 100644 index 000000000..07d99b579 --- /dev/null +++ b/youtube_dl/extractor/springboardplatform.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + xpath_attr, + xpath_text, + xpath_element, + unescapeHTML, + unified_timestamp, +) + + +class SpringboardPlatformIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + cms\.springboardplatform\.com/ + (?: + (?:previews|embed_iframe)/(?P<index>\d+)/video/(?P<id>\d+)| + xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+) + ) + ''' + _TESTS = [{ + 'url': 'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1', + 'md5': '5c3cb7b5c55740d482561099e920f192', + 'info_dict': { + 'id': '981017', + 'ext': 'mp4', + 'title': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX', + 'description': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1409132328, + 'upload_date': '20140827', + 'duration': 193, + }, + }, { + 'url': 'http://cms.springboardplatform.com/embed_iframe/159/video/981017/rab007/rapbasement.com/1/1', + 'only_matching': True, + }, { + 'url': 'http://cms.springboardplatform.com/embed_iframe/20/video/1731611/ki055/kidzworld.com/10', + 'only_matching': True, + }, { + 'url': 'http://cms.springboardplatform.com/xml_feeds_advanced/index/159/rss3/981017/0/0/1/', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + 
r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1', + webpage)] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_2') + index = mobj.group('index') or mobj.group('index_2') + + video = self._download_xml( + 'http://cms.springboardplatform.com/xml_feeds_advanced/index/%s/rss3/%s' + % (index, video_id), video_id) + + item = xpath_element(video, './/item', 'item', fatal=True) + + content = xpath_element( + item, './{http://search.yahoo.com/mrss/}content', 'content', + fatal=True) + title = unescapeHTML(xpath_text(item, './title', 'title', fatal=True)) + + video_url = content.attrib['url'] + + if 'error_video.mp4' in video_url: + raise ExtractorError( + 'Video %s no longer exists' % video_id, expected=True) + + duration = int_or_none(content.get('duration')) + tbr = int_or_none(content.get('bitrate')) + filesize = int_or_none(content.get('fileSize')) + width = int_or_none(content.get('width')) + height = int_or_none(content.get('height')) + + description = unescapeHTML(xpath_text( + item, './description', 'description')) + thumbnail = xpath_attr( + item, './{http://search.yahoo.com/mrss/}thumbnail', 'url', + 'thumbnail') + + timestamp = unified_timestamp(xpath_text( + item, './{http://cms.springboardplatform.com/namespaces.html}created', + 'timestamp')) + + formats = [{ + 'url': video_url, + 'format_id': 'http', + 'tbr': tbr, + 'filesize': filesize, + 'width': width, + 'height': height, + }] + + m3u8_format = formats[0].copy() + m3u8_format.update({ + 'url': re.sub(r'(https?://)cdn\.', r'\1hls.', video_url) + '.m3u8', + 'ext': 'mp4', + 'format_id': 'hls', + 'protocol': 'm3u8_native', + }) + formats.append(m3u8_format) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } From 
e7f3529f68ee7c8ca78366d37f851cb31fa00f31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 20 Jan 2018 17:57:20 +0700 Subject: [PATCH 117/137] [youtube:live] Improve live detection (closes #15365) --- youtube_dl/extractor/youtube.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a01ec1436..f698a5627 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2530,10 +2530,11 @@ class YoutubeLiveIE(YoutubeBaseInfoExtractor): webpage = self._download_webpage(url, channel_id, fatal=False) if webpage: page_type = self._og_search_property( - 'type', webpage, 'page type', default=None) + 'type', webpage, 'page type', default='') video_id = self._html_search_meta( 'videoId', webpage, 'video id', default=None) - if page_type == 'video' and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id): + if page_type.startswith('video') and video_id and re.match( + r'^[0-9A-Za-z_-]{11}$', video_id): return self.url_result(video_id, YoutubeIE.ie_key()) return self.url_result(base_url) From c384d537f882efab10a78a56ce6dcb0a30f54b47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 20 Jan 2018 16:54:21 +0100 Subject: [PATCH 118/137] [util] Improve scientific notation handling in js_to_json (closes #14789) --- test/test_utils.py | 6 ++++++ youtube_dl/utils.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 0857c0fc0..6ef498a66 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -814,6 +814,9 @@ class TestUtil(unittest.TestCase): inp = '''{"duration": "00:01:07"}''' self.assertEqual(js_to_json(inp), '''{"duration": "00:01:07"}''') + inp = '''{segments: [{"offset":-3.885780586188048e-16,"duration":39.75000000000001}]}''' + self.assertEqual(js_to_json(inp), '''{"segments": 
[{"offset":-3.885780586188048e-16,"duration":39.75000000000001}]}''') + def test_js_to_json_edgecases(self): on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}") self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"}) @@ -885,6 +888,9 @@ class TestUtil(unittest.TestCase): on = js_to_json('{/*comment\n*/42/*comment\n*/:/*comment\n*/42/*comment\n*/}') self.assertEqual(json.loads(on), {'42': 42}) + on = js_to_json('{42:4.2e1}') + self.assertEqual(json.loads(on), {'42': 42.0}) + def test_extract_attributes(self): self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'}) self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'}) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 386897a85..2fe9cf585 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2267,7 +2267,7 @@ def js_to_json(code): "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| {comment}|,(?={skip}[\]}}])| - [a-zA-Z_][.a-zA-Z_0-9]*| + (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| [0-9]+(?={skip}:) '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) From c707b1d828c74f1ec8ffbfda7c910aec0c89a6e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 20 Jan 2018 22:58:48 +0700 Subject: [PATCH 119/137] [test_utils] Add tests for malformed JSON handling in js_to_json --- test/test_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 6ef498a66..fdf6031f7 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -891,6 +891,10 @@ class TestUtil(unittest.TestCase): on = js_to_json('{42:4.2e1}') self.assertEqual(json.loads(on), {'42': 42.0}) + def test_js_to_json_malformed(self): + self.assertEqual(js_to_json('42a1'), '42"a1"') + self.assertEqual(js_to_json('42a-1'), '42"a"-1') + def test_extract_attributes(self): self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'}) 
self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'}) From 7df18fcc65aada949c5e3bf0f7cc814dee158c79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 20 Jan 2018 23:19:02 +0700 Subject: [PATCH 120/137] [restudy] Fix extraction (closes #15347) --- youtube_dl/extractor/restudy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/restudy.py b/youtube_dl/extractor/restudy.py index fd50065d4..18e4055f4 100644 --- a/youtube_dl/extractor/restudy.py +++ b/youtube_dl/extractor/restudy.py @@ -29,7 +29,7 @@ class RestudyIE(InfoExtractor): description = self._og_search_description(webpage).strip() formats = self._extract_smil_formats( - 'https://www.restudy.dk/awsmedia/SmilDirectory/video_%s.xml' % video_id, + 'https://cdn.portal.restudy.dk/dynamic/themes/front/awsmedia/SmilDirectory/video_%s.xml' % video_id, video_id) self._sort_formats(formats) From ac458e90a34f7ad93b797176c2c86c50f9ca23ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 20 Jan 2018 23:25:06 +0700 Subject: [PATCH 121/137] [restudy] Extend _VALID_URL (#15347) --- youtube_dl/extractor/restudy.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/restudy.py b/youtube_dl/extractor/restudy.py index 18e4055f4..d47fb45ca 100644 --- a/youtube_dl/extractor/restudy.py +++ b/youtube_dl/extractor/restudy.py @@ -5,8 +5,8 @@ from .common import InfoExtractor class RestudyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?restudy\.dk/video/play/id/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www|portal)\.)?restudy\.dk/video/[^/]+/id/(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'https://www.restudy.dk/video/play/id/1637', 'info_dict': { 'id': '1637', @@ -18,7 +18,10 @@ class RestudyIE(InfoExtractor): # rtmp download 'skip_download': True, } - } + }, { + 'url': 'https://portal.restudy.dk/video/leiden-frosteffekt/id/1637', + 'only_matching': 
True, + }] def _real_extract(self, url): video_id = self._match_id(url) From df58ecbeba72e03b241c28e7c4760ef9b215c402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Jan 2018 17:40:00 +0700 Subject: [PATCH 122/137] [rtvs] Add extractor (closes #9242, closes #15187) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/rtvs.py | 47 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 youtube_dl/extractor/rtvs.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f5bac6fd9..541f73e3c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -900,6 +900,7 @@ from .rtp import RTPIE from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE from .rtvnh import RTVNHIE +from .rtvs import RTVSIE from .rudo import RudoIE from .ruhd import RUHDIE from .ruleporn import RulePornIE diff --git a/youtube_dl/extractor/rtvs.py b/youtube_dl/extractor/rtvs.py new file mode 100644 index 000000000..6573b260d --- /dev/null +++ b/youtube_dl/extractor/rtvs.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RTVSIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv/\d+/(?P<id>\d+)' + _TESTS = [{ + # radio archive + 'url': 'http://www.rtvs.sk/radio/archiv/11224/414872', + 'md5': '134d5d6debdeddf8a5d761cbc9edacb8', + 'info_dict': { + 'id': '414872', + 'ext': 'mp3', + 'title': 'Ostrov pokladov 1 časť.mp3' + }, + 'params': { + 'skip_download': True, + } + }, { + # tv archive + 'url': 'http://www.rtvs.sk/televizia/archiv/8249/63118', + 'md5': '85e2c55cf988403b70cac24f5c086dc6', + 'info_dict': { + 'id': '63118', + 'ext': 'mp4', + 'title': 'Amaro Džives - Náš deň', + 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.' 
+ }, + 'params': { + 'skip_download': True, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + playlist_url = self._search_regex( + r'playlist["\']?\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'playlist url', group='url') + + data = self._download_json( + playlist_url, video_id, 'Downloading playlist')[0] + return self._parse_jwplayer_data(data, video_id=video_id) From 0d9c48de4f7877e05cbb9e89b97104796d01ca70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Jan 2018 17:42:48 +0700 Subject: [PATCH 123/137] [extractor/common] Improve DASH formats extraction for jwplayer (#9242, #15187) --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5b6a09c0b..a072e9bc9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2404,7 +2404,7 @@ class InfoExtractor(object): formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif ext == 'mpd': + elif source_type == 'dash' or ext == 'mpd': formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) elif ext == 'smil': From b2a027fc6f858d08fb0f76e3e9ae777f8f846361 Mon Sep 17 00:00:00 2001 From: helb <helb@helb.cz> Date: Sun, 21 Jan 2018 11:50:53 +0100 Subject: [PATCH 124/137] [franceinter] Fix upload date extraction (closes #14996) --- youtube_dl/extractor/franceinter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 707b9e00d..05806895c 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -33,7 +33,7 @@ class FranceInterIE(InfoExtractor): description = self._og_search_description(webpage) upload_date_str = 
self._search_regex( - r'class=["\']cover-emission-period["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', + r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', webpage, 'upload date', fatal=False) if upload_date_str: upload_date_list = upload_date_str.split() From 655c410063da3400272b1e08d2f7432f2bb9e182 Mon Sep 17 00:00:00 2001 From: squibbysquibby <34315567+squibbysquibby@users.noreply.github.com> Date: Sun, 21 Jan 2018 08:15:11 -0300 Subject: [PATCH 125/137] [test_download] Fix download tests for lazy extractors (closes #13554, closes #13757) --- test/test_download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 209f5f6d6..ebe820dfc 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -92,8 +92,8 @@ class TestDownload(unittest.TestCase): def generator(test_case, tname): def test_template(self): - ie = youtube_dl.extractor.get_info_extractor(test_case['name']) - other_ies = [get_info_extractor(ie_key) for ie_key in test_case.get('add_ie', [])] + ie = youtube_dl.extractor.get_info_extractor(test_case['name'])() + other_ies = [get_info_extractor(ie_key)() for ie_key in test_case.get('add_ie', [])] is_playlist = any(k.startswith('playlist') for k in test_case) test_cases = test_case.get( 'playlist', [] if is_playlist else [test_case]) From 6289e078836454eedda0a441f66347cd40051443 Mon Sep 17 00:00:00 2001 From: catlover999 <33813895+catlover999@users.noreply.github.com> Date: Sun, 19 Nov 2017 22:05:41 +0100 Subject: [PATCH 126/137] [southpark] Add support for collections --- youtube_dl/extractor/southpark.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index d8ce416fc..983437d0b 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -6,7 +6,7 @@ from .mtv import MTVServicesInfoExtractor class 
SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|(?:full-)?episodes)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' @@ -41,7 +41,7 @@ class SouthParkEsIE(SouthParkIE): class SouthParkDeIE(SouthParkIE): IE_NAME = 'southpark.de' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' _TESTS = [{ @@ -75,7 +75,7 @@ class SouthParkDeIE(SouthParkIE): class SouthParkNlIE(SouthParkIE): IE_NAME = 'southpark.nl' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/' _TESTS = [{ @@ -90,7 +90,7 @@ class SouthParkNlIE(SouthParkIE): class SouthParkDkIE(SouthParkIE): IE_NAME = 'southparkstudios.dk' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.dk/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.dk/(?:clips|full-episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/' _TESTS = [{ From 99d6e696fced51bbabc1b70dee347a3990ffb8df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Jan 2018 18:40:37 +0700 Subject: [PATCH 127/137] [southpark] Add tests for collections (closes #14803) --- youtube_dl/extractor/southpark.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git 
a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 983437d0b..48d5db583 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -20,6 +20,9 @@ class SouthParkIE(MTVServicesInfoExtractor): 'timestamp': 1112760000, 'upload_date': '20050406', }, + }, { + 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', + 'only_matching': True, }] @@ -70,6 +73,9 @@ class SouthParkDeIE(SouthParkIE): 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', }, 'playlist_count': 3, + }, { + 'url': 'http://www.southpark.de/collections/2476/superhero-showdown/1', + 'only_matching': True, }] @@ -100,4 +106,7 @@ class SouthParkDkIE(SouthParkIE): 'description': 'Butters is convinced he\'s living in a virtual reality.', }, 'playlist_mincount': 3, + }, { + 'url': 'http://www.southparkstudios.dk/collections/2476/superhero-showdown/1', + 'only_matching': True, }] From e0ab56571ed553d1fcae90ac78c53a84924ca212 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Jan 2018 18:42:34 +0700 Subject: [PATCH 128/137] [southparkdk] Add support for southparkstudios.nu --- youtube_dl/extractor/southpark.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 48d5db583..da75a43a7 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -96,7 +96,7 @@ class SouthParkNlIE(SouthParkIE): class SouthParkDkIE(SouthParkIE): IE_NAME = 'southparkstudios.dk' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.dk/(?:clips|full-episodes|collections)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.(?:dk|nu)/(?:clips|full-episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 
'http://www.southparkstudios.dk/feeds/video-player/mrss/' _TESTS = [{ @@ -109,4 +109,7 @@ class SouthParkDkIE(SouthParkIE): }, { 'url': 'http://www.southparkstudios.dk/collections/2476/superhero-showdown/1', 'only_matching': True, + }, { + 'url': 'http://www.southparkstudios.nu/collections/2476/superhero-showdown/1', + 'only_matching': True, }] From 721a0c3c7b5ee80c002658f7f336c7085e007580 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Jan 2018 21:22:38 +0700 Subject: [PATCH 129/137] [prosiebensat1] Relax clip id --- youtube_dl/extractor/prosiebensat1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 7c9743fe8..7e680a728 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -344,7 +344,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): r'clip[iI]d=(\d+)', r'clip[iI]d\s*=\s*["\'](\d+)', r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", - r'proMamsId":"(\d+)', + r'proMamsId"\s*:\s*"(\d+)', ] _TITLE_REGEXES = [ r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>', From d7da6db4e18e47b0831c73ed013c092c74ee59b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Jan 2018 21:23:24 +0700 Subject: [PATCH 130/137] [ChangeLog] Actualize --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index d678bc7af..1e9c3e071 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version <unreleased> + +Core +* [extractor/common] Improve jwplayer DASH formats extraction (#9242, #15187) +* [utils] Improve scientific notation handling in js_to_json (#14789) + +Extractors ++ [southparkdk] Add support for southparkstudios.nu ++ [southpark] Add support for collections (#14803) +* [franceinter] Fix upload date extraction (#14996) ++ [rtvs] Add support for rtvs.sk (#9242, #15187) +* [restudy] 
Fix extraction and extend URL regular expression (#15347) +* [youtube:live] Improve live detection (#15365) ++ [springboardplatform] Add support for springboardplatform.com +* [prosiebensat1] Add another clip id regular expression (#15290) +- [ringtv] Remove extractor (#15345) + + version 2018.01.18 Extractors From 6e5eacb7705b68c108160780b4ded3b7abd9ebb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Jan 2018 21:26:05 +0700 Subject: [PATCH 131/137] release 2018.01.21 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 079e7629e..145c3ff83 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.21*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.21** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.01.18 +[debug] youtube-dl version 2018.01.21 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 1e9c3e071..65a01fcc7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.01.21 Core * [extractor/common] Improve jwplayer DASH formats extraction (#9242, #15187) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0caced5bd..b0825c58b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -682,7 +682,6 @@ - **revision** - **revision3:embed** - **RICE** - - **RingTV** - **RMCDecouverte** - **RockstarGames** - **RoosterTeeth** @@ -703,6 +702,7 @@ - **rtve.es:live**: RTVE.es live streams - **rtve.es:television** - **RTVNH** + - **RTVS** - **Rudo** - **RUHD** - **RulePorn** @@ -773,6 +773,7 @@ - **Sport5** - **SportBoxEmbed** - **SportDeutschland** + - **SpringboardPlatform** - **Sprout** - **sr:mediathek**: Saarländischer Rundfunk - **SRGSSR** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 717c1e899..11e82f433 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.01.18' +__version__ = 
'2018.01.21' From 021bd012bb55da7d0be8e0ff7565d38ae17551bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 22 Jan 2018 22:30:28 +0700 Subject: [PATCH 132/137] [thesixtyone] Remove extractor (closes #15341) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/thesixtyone.py | 106 ---------------------------- 2 files changed, 107 deletions(-) delete mode 100644 youtube_dl/extractor/thesixtyone.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 541f73e3c..57e74ba62 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1046,7 +1046,6 @@ from .theplatform import ( ThePlatformFeedIE, ) from .thescene import TheSceneIE -from .thesixtyone import TheSixtyOneIE from .thestar import TheStarIE from .thesun import TheSunIE from .theweatherchannel import TheWeatherChannelIE diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py deleted file mode 100644 index d63aef5de..000000000 --- a/youtube_dl/extractor/thesixtyone.py +++ /dev/null @@ -1,106 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import unified_strdate - - -class TheSixtyOneIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?thesixtyone\.com/ - (?:.*?/)* - (?: - s| - song/comments/list| - song - )/(?:[^/]+/)?(?P<id>[A-Za-z0-9]+)/?$''' - _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}' - _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream' - _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop' - _TESTS = [ - { - 'url': 'http://www.thesixtyone.com/s/SrE3zD7s1jt/', - 'md5': '821cc43b0530d3222e3e2b70bb4622ea', - 'info_dict': { - 'id': 'SrE3zD7s1jt', - 'ext': 'mp3', - 'title': 'CASIO - Unicorn War Mixtape', - 'thumbnail': 're:^https?://.*_desktop$', - 'upload_date': '20071217', - 'duration': 3208, - } - }, - { - 'url': 
'http://www.thesixtyone.com/song/comments/list/SrE3zD7s1jt', - 'only_matching': True, - }, - { - 'url': 'http://www.thesixtyone.com/s/ULoiyjuJWli#/s/SrE3zD7s1jt/', - 'only_matching': True, - }, - { - 'url': 'http://www.thesixtyone.com/#/s/SrE3zD7s1jt/', - 'only_matching': True, - }, - { - 'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/', - 'only_matching': True, - }, - { - 'url': 'http://www.thesixtyone.com/maryatmidnight/song/StrawberriesandCream/yvWtLp0c4GQ/', - 'only_matching': True, - }, - ] - - _DECODE_MAP = { - 'x': 'a', - 'm': 'b', - 'w': 'c', - 'q': 'd', - 'n': 'e', - 'p': 'f', - 'a': '0', - 'h': '1', - 'e': '2', - 'u': '3', - 's': '4', - 'i': '5', - 'o': '6', - 'y': '7', - 'r': '8', - 'c': '9' - } - - def _real_extract(self, url): - song_id = self._match_id(url) - - webpage = self._download_webpage( - self._SONG_URL_TEMPLATE.format(song_id), song_id) - - song_data = self._parse_json(self._search_regex( - r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'), song_id) - - if self._search_regex(r'(t61\.s3_audio_load\s*=\s*1\.0;)', webpage, 's3_audio_load marker', default=None): - song_data['audio_server'] = 's3.amazonaws.com' - else: - song_data['audio_server'] = song_data['audio_server'] + '.thesixtyone.com' - - keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']] - url = self._SONG_FILE_URL_TEMPLATE.format( - "".join(reversed(keys)), **song_data) - - formats = [{ - 'format_id': 'sd', - 'url': url, - 'ext': 'mp3', - }] - - return { - 'id': song_id, - 'title': '{artist:} - {name:}'.format(**song_data), - 'formats': formats, - 'comment_count': song_data.get('comments_count'), - 'duration': song_data.get('play_time'), - 'like_count': song_data.get('score'), - 'thumbnail': self._THUMBNAIL_URL_TEMPLATE.format(**song_data), - 'upload_date': unified_strdate(song_data.get('publish_date')), - } From f206126df090d78f30426321473ebd566c3b7866 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 23 Jan 2018 21:53:01 
+0700 Subject: [PATCH 133/137] [compat] Add compat_b64decode --- youtube_dl/compat.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 41ca9adf1..646c9d79c 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import binascii import collections import ctypes @@ -2908,6 +2909,16 @@ except ImportError: # not 2.6+ or is 3.x except ImportError: compat_zip = zip + +if sys.version_info < (3, 3): + def compat_b64decode(s, *args, **kwargs): + if isinstance(s, compat_str): + s = s.encode('ascii') + return base64.b64decode(s, *args, **kwargs) +else: + compat_b64decode = base64.b64decode + + if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): # PyPy2 prior to version 5.4.0 expects byte strings as Windows function # names, see the original PyPy issue [1] and the youtube-dl one [2]. @@ -2930,6 +2941,7 @@ __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', + 'compat_b64decode', 'compat_basestring', 'compat_chr', 'compat_cookiejar', From 5d7d805ca90992cac1cdffbe5d3df3d894d2b979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 23 Jan 2018 21:53:45 +0700 Subject: [PATCH 134/137] [mixcloud] Use compat_b64decode (closes #15394) --- youtube_dl/extractor/mixcloud.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 7b2bb6e20..785b99bc3 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -7,6 +7,7 @@ import re from .common import InfoExtractor from ..compat import ( + compat_b64decode, compat_chr, compat_ord, compat_str, @@ -79,7 +80,7 @@ class MixcloudIE(InfoExtractor): if encrypted_play_info is not None: # Decode - encrypted_play_info = base64.b64decode(encrypted_play_info) + encrypted_play_info = 
compat_b64decode(encrypted_play_info) else: # New path full_info_json = self._parse_json(self._html_search_regex( @@ -109,7 +110,7 @@ class MixcloudIE(InfoExtractor): kpa_target = encrypted_play_info else: kps = ['https://', 'http://'] - kpa_target = base64.b64decode(info_json['streamInfo']['url']) + kpa_target = compat_b64decode(info_json['streamInfo']['url']) for kp in kps: partial_key = self._decrypt_xor_cipher(kpa_target, kp) for quote in ["'", '"']: @@ -165,7 +166,7 @@ class MixcloudIE(InfoExtractor): format_url = stream_info.get(url_key) if not format_url: continue - decrypted = self._decrypt_xor_cipher(key, base64.b64decode(format_url)) + decrypted = self._decrypt_xor_cipher(key, compat_b64decode(format_url)) if not decrypted: continue if url_key == 'hlsUrl': From cf2820710d61742818a906af07f6d6c9669d58a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 23 Jan 2018 22:23:12 +0700 Subject: [PATCH 135/137] Switch codebase to use compat_b64decode --- youtube_dl/aes.py | 4 ++-- youtube_dl/downloader/f4m.py | 6 +++--- youtube_dl/extractor/adn.py | 10 ++++++---- youtube_dl/extractor/bigflix.py | 10 ++++++---- youtube_dl/extractor/chilloutzone.py | 4 ++-- youtube_dl/extractor/chirbit.py | 5 ++--- youtube_dl/extractor/crunchyroll.py | 6 +++--- youtube_dl/extractor/daisuki.py | 3 ++- youtube_dl/extractor/dumpert.py | 4 ++-- youtube_dl/extractor/einthusan.py | 8 ++++---- youtube_dl/extractor/hotnewhiphop.py | 5 ++--- youtube_dl/extractor/infoq.py | 5 ++--- youtube_dl/extractor/leeco.py | 4 ++-- youtube_dl/extractor/mangomolo.py | 11 +++++------ youtube_dl/extractor/mixcloud.py | 1 - youtube_dl/extractor/ooyala.py | 11 +++++++---- youtube_dl/extractor/rtl2.py | 8 ++++---- youtube_dl/extractor/rtve.py | 3 ++- youtube_dl/extractor/shared.py | 10 ++++------ youtube_dl/extractor/teamcoco.py | 8 +++++--- youtube_dl/extractor/tutv.py | 9 +++++---- 21 files changed, 70 insertions(+), 65 deletions(-) diff --git a/youtube_dl/aes.py 
b/youtube_dl/aes.py index c5bb3c4ef..461bb6d41 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -1,8 +1,8 @@ from __future__ import unicode_literals -import base64 from math import ceil +from .compat import compat_b64decode from .utils import bytes_to_intlist, intlist_to_bytes BLOCK_SIZE_BYTES = 16 @@ -180,7 +180,7 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) + data = bytes_to_intlist(compat_b64decode(data)) password = bytes_to_intlist(password.encode('utf-8')) key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index fdb80f42a..15e71be9a 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -1,12 +1,12 @@ from __future__ import division, unicode_literals -import base64 import io import itertools import time from .fragment import FragmentFD from ..compat import ( + compat_b64decode, compat_etree_fromstring, compat_urlparse, compat_urllib_error, @@ -312,7 +312,7 @@ class F4mFD(FragmentFD): boot_info = self._get_bootstrap_from_url(bootstrap_url) else: bootstrap_url = None - bootstrap = base64.b64decode(node.text.encode('ascii')) + bootstrap = compat_b64decode(node.text) boot_info = read_bootstrap_info(bootstrap) return boot_info, bootstrap_url @@ -349,7 +349,7 @@ class F4mFD(FragmentFD): live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: - metadata = base64.b64decode(metadata_node.text.encode('ascii')) + metadata = compat_b64decode(metadata_node.text) else: metadata = None diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index cffdab6ca..64fb755da 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -1,13 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import json import os from .common import InfoExtractor 
from ..aes import aes_cbc_decrypt -from ..compat import compat_ord +from ..compat import ( + compat_b64decode, + compat_ord, +) from ..utils import ( bytes_to_intlist, ExtractorError, @@ -48,9 +50,9 @@ class ADNIE(InfoExtractor): # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( - bytes_to_intlist(base64.b64decode(enc_subtitles[24:])), + bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), bytes_to_intlist(b'\x1b\xe0\x29\x61\x38\x94\x24\x00\x12\xbd\xc5\x80\xac\xce\xbe\xb0'), - bytes_to_intlist(base64.b64decode(enc_subtitles[:24])) + bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( dec_subtitles[:-compat_ord(dec_subtitles[-1])].decode(), diff --git a/youtube_dl/extractor/bigflix.py b/youtube_dl/extractor/bigflix.py index b4ce767af..28e3e59f6 100644 --- a/youtube_dl/extractor/bigflix.py +++ b/youtube_dl/extractor/bigflix.py @@ -1,11 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote, +) class BigflixIE(InfoExtractor): @@ -39,8 +41,8 @@ class BigflixIE(InfoExtractor): webpage, 'title') def decode_url(quoted_b64_url): - return base64.b64decode(compat_urllib_parse_unquote( - quoted_b64_url).encode('ascii')).decode('utf-8') + return compat_b64decode(compat_urllib_parse_unquote( + quoted_b64_url)).decode('utf-8') formats = [] for height, encoded_url in re.findall( diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index d4769da75..5aac21299 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -1,11 +1,11 @@ from __future__ import unicode_literals import re -import base64 import json from .common import InfoExtractor from .youtube import YoutubeIE +from ..compat import 
compat_b64decode from ..utils import ( clean_html, ExtractorError @@ -58,7 +58,7 @@ class ChilloutzoneIE(InfoExtractor): base64_video_info = self._html_search_regex( r'var cozVidData = "(.+?)";', webpage, 'video data') - decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8') + decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8') video_info_dict = json.loads(decoded_video_info) # get video information from dict diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 4815b34be..8d75cdf19 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,10 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import re from .common import InfoExtractor +from ..compat import compat_b64decode from ..utils import parse_duration @@ -44,8 +44,7 @@ class ChirbitIE(InfoExtractor): # Reverse engineered from https://chirb.it/js/chirbit.player.js (look # for soundURL) - audio_url = base64.b64decode( - data_fd[::-1].encode('ascii')).decode('utf-8') + audio_url = compat_b64decode(data_fd[::-1]).decode('utf-8') title = self._search_regex( r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index b92f25447..3efdc8c21 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals import re import json -import base64 import zlib from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( + compat_b64decode, compat_etree_fromstring, compat_urllib_parse_urlencode, compat_urllib_request, @@ -272,8 +272,8 @@ class CrunchyrollIE(CrunchyrollBaseIE): } def _decrypt_subtitles(self, data, iv, id): - data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) - iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8'))) + 
data = bytes_to_intlist(compat_b64decode(data)) + iv = bytes_to_intlist(compat_b64decode(iv)) id = int(id) def obfuscate_key_aux(count, modulo, start): diff --git a/youtube_dl/extractor/daisuki.py b/youtube_dl/extractor/daisuki.py index 5c9ac68a0..dbc1aa5d4 100644 --- a/youtube_dl/extractor/daisuki.py +++ b/youtube_dl/extractor/daisuki.py @@ -10,6 +10,7 @@ from ..aes import ( aes_cbc_decrypt, aes_cbc_encrypt, ) +from ..compat import compat_b64decode from ..utils import ( bytes_to_intlist, bytes_to_long, @@ -93,7 +94,7 @@ class DaisukiMottoIE(InfoExtractor): rtn = self._parse_json( intlist_to_bytes(aes_cbc_decrypt(bytes_to_intlist( - base64.b64decode(encrypted_rtn)), + compat_b64decode(encrypted_rtn)), aes_key, iv)).decode('utf-8').rstrip('\0'), video_id) diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py index c9fc9b5a9..be2e3d378 100644 --- a/youtube_dl/extractor/dumpert.py +++ b/youtube_dl/extractor/dumpert.py @@ -1,10 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import re from .common import InfoExtractor +from ..compat import compat_b64decode from ..utils import ( qualities, sanitized_Request, @@ -42,7 +42,7 @@ class DumpertIE(InfoExtractor): r'data-files="([^"]+)"', webpage, 'data files') files = self._parse_json( - base64.b64decode(files_base64.encode('utf-8')).decode('utf-8'), + compat_b64decode(files_base64).decode('utf-8'), video_id) quality = qualities(['flv', 'mobile', 'tablet', '720p']) diff --git a/youtube_dl/extractor/einthusan.py b/youtube_dl/extractor/einthusan.py index 3f6268637..4485bf8c1 100644 --- a/youtube_dl/extractor/einthusan.py +++ b/youtube_dl/extractor/einthusan.py @@ -1,13 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import json from .common import InfoExtractor from ..compat import ( - compat_urlparse, + compat_b64decode, compat_str, + compat_urlparse, ) from ..utils import ( extract_attributes, @@ -36,9 +36,9 @@ class 
EinthusanIE(InfoExtractor): # reversed from jsoncrypto.prototype.decrypt() in einthusan-PGMovieWatcher.js def _decrypt(self, encrypted_data, video_id): - return self._parse_json(base64.b64decode(( + return self._parse_json(compat_b64decode(( encrypted_data[:10] + encrypted_data[-1] + encrypted_data[12:-1] - ).encode('ascii')).decode('utf-8'), video_id) + )).decode('utf-8'), video_id) def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 34163725f..4703e1894 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -1,8 +1,7 @@ from __future__ import unicode_literals -import base64 - from .common import InfoExtractor +from ..compat import compat_b64decode from ..utils import ( ExtractorError, HEADRequest, @@ -48,7 +47,7 @@ class HotNewHipHopIE(InfoExtractor): if 'mediaKey' not in mkd: raise ExtractorError('Did not get a media key') - redirect_url = base64.b64decode(video_url_base64).decode('utf-8') + redirect_url = compat_b64decode(video_url_base64).decode('utf-8') redirect_req = HEADRequest(redirect_url) req = self._request_webpage( redirect_req, video_id, diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index c3e892feb..391c2f5d0 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -2,9 +2,8 @@ from __future__ import unicode_literals -import base64 - from ..compat import ( + compat_b64decode, compat_urllib_parse_unquote, compat_urlparse, ) @@ -61,7 +60,7 @@ class InfoQIE(BokeCCBaseIE): encoded_id = self._search_regex( r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id', default=None) - real_id = compat_urllib_parse_unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')) + real_id = compat_urllib_parse_unquote(compat_b64decode(encoded_id).decode('utf-8')) playpath = 'mp4:' + real_id return [{ diff --git a/youtube_dl/extractor/leeco.py 
b/youtube_dl/extractor/leeco.py index 0a07c1320..ffe10154b 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import datetime import hashlib import re @@ -9,6 +8,7 @@ import time from .common import InfoExtractor from ..compat import ( + compat_b64decode, compat_ord, compat_str, compat_urllib_parse_urlencode, @@ -329,7 +329,7 @@ class LetvCloudIE(InfoExtractor): raise ExtractorError('Letv cloud returned an unknwon error') def b64decode(s): - return base64.b64decode(s.encode('utf-8')).decode('utf-8') + return compat_b64decode(s).decode('utf-8') formats = [] for media in play_json['data']['video_info']['media'].values(): diff --git a/youtube_dl/extractor/mangomolo.py b/youtube_dl/extractor/mangomolo.py index dbd761a67..482175a34 100644 --- a/youtube_dl/extractor/mangomolo.py +++ b/youtube_dl/extractor/mangomolo.py @@ -1,13 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 - from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import ( - int_or_none, +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote, ) +from ..utils import int_or_none class MangomoloBaseIE(InfoExtractor): @@ -51,4 +50,4 @@ class MangomoloLiveIE(MangomoloBaseIE): _IS_LIVE = True def _get_real_id(self, page_id): - return base64.b64decode(compat_urllib_parse_unquote(page_id).encode()).decode() + return compat_b64decode(compat_urllib_parse_unquote(page_id)).decode() diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 785b99bc3..a56b7690f 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import base64 import functools import itertools import re diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 52580baed..ad8bf03f8 100644 --- 
a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -1,9 +1,13 @@ from __future__ import unicode_literals + import re -import base64 from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_b64decode, + compat_str, + compat_urllib_parse_urlencode, +) from ..utils import ( determine_ext, ExtractorError, @@ -12,7 +16,6 @@ from ..utils import ( try_get, unsmuggle_url, ) -from ..compat import compat_urllib_parse_urlencode class OoyalaBaseIE(InfoExtractor): @@ -44,7 +47,7 @@ class OoyalaBaseIE(InfoExtractor): url_data = try_get(stream, lambda x: x['url']['data'], compat_str) if not url_data: continue - s_url = base64.b64decode(url_data.encode('ascii')).decode('utf-8') + s_url = compat_b64decode(url_data).decode('utf-8') if not s_url or s_url in urls: continue urls.append(s_url) diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py index 666e90e90..18a327d81 100644 --- a/youtube_dl/extractor/rtl2.py +++ b/youtube_dl/extractor/rtl2.py @@ -1,12 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import re from .common import InfoExtractor from ..aes import aes_cbc_decrypt from ..compat import ( + compat_b64decode, compat_ord, compat_str, ) @@ -142,11 +142,11 @@ class RTL2YouIE(RTL2YouBaseIE): stream_data = self._download_json( self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id) - data, iv = base64.b64decode(stream_data['streamUrl']).decode().split(':') + data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':') stream_url = intlist_to_bytes(aes_cbc_decrypt( - bytes_to_intlist(base64.b64decode(data)), + bytes_to_intlist(compat_b64decode(data)), bytes_to_intlist(self._AES_KEY), - bytes_to_intlist(base64.b64decode(iv)) + bytes_to_intlist(compat_b64decode(iv)) )) if b'rtl2_you_video_not_found' in stream_url: raise ExtractorError('video not found', expected=True) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 
fa60ffd5e..ce9db0629 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -7,6 +7,7 @@ import time from .common import InfoExtractor from ..compat import ( + compat_b64decode, compat_struct_unpack, ) from ..utils import ( @@ -21,7 +22,7 @@ from ..utils import ( def _decrypt_url(png): - encrypted_data = base64.b64decode(png.encode('utf-8')) + encrypted_data = compat_b64decode(png) text_index = encrypted_data.find(b'tEXt') text_chunk = encrypted_data[text_index - 4:] length = compat_struct_unpack('!I', text_chunk[:4])[0] diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 89e19e927..b2250afdd 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -1,8 +1,7 @@ from __future__ import unicode_literals -import base64 - from .common import InfoExtractor +from ..compat import compat_b64decode from ..utils import ( ExtractorError, int_or_none, @@ -22,8 +21,8 @@ class SharedBaseIE(InfoExtractor): video_url = self._extract_video_url(webpage, video_id, url) - title = base64.b64decode(self._html_search_meta( - 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8') + title = compat_b64decode(self._html_search_meta( + 'full:title', webpage, 'title')).decode('utf-8') filesize = int_or_none(self._html_search_meta( 'full:size', webpage, 'file size', fatal=False)) @@ -92,5 +91,4 @@ class VivoIE(SharedBaseIE): r'InitializeStream\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'stream', group='url'), video_id, - transform_source=lambda x: base64.b64decode( - x.encode('ascii')).decode('utf-8'))[0] + transform_source=lambda x: compat_b64decode(x).decode('utf-8'))[0] diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 75346393b..9056c8cbc 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,18 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import binascii import re import json from 
.common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_ord, +) from ..utils import ( ExtractorError, qualities, determine_ext, ) -from ..compat import compat_ord class TeamcocoIE(InfoExtractor): @@ -97,7 +99,7 @@ class TeamcocoIE(InfoExtractor): for i in range(len(cur_fragments)): cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii') try: - raw_data = base64.b64decode(cur_sequence) + raw_data = compat_b64decode(cur_sequence) if compat_ord(raw_data[0]) == compat_ord('{'): return json.loads(raw_data.decode('utf-8')) except (TypeError, binascii.Error, UnicodeDecodeError, ValueError): diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index 822372ea1..362318b24 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -1,9 +1,10 @@ from __future__ import unicode_literals -import base64 - from .common import InfoExtractor -from ..compat import compat_parse_qs +from ..compat import ( + compat_b64decode, + compat_parse_qs, +) class TutvIE(InfoExtractor): @@ -26,7 +27,7 @@ class TutvIE(InfoExtractor): data_content = self._download_webpage( 'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') - video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8') + video_url = compat_b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8') return { 'id': internal_id, From dc400ed6a2f79977cda7968b626b1ead35523b37 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 23 Jan 2018 19:06:46 +0100 Subject: [PATCH 136/137] [tbs] update tokenizer url(fixes #15395) --- youtube_dl/extractor/tbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index eab22c38f..edc31729d 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -58,7 +58,7 @@ class TBSIE(TurnerBaseIE): continue if 
stream_data.get('playlistProtection') == 'spe': m3u8_url = self._add_akamai_spe_token( - 'http://www.%s.com/service/token_spe' % site, + 'http://token.vgtf.net/token/token_spe', m3u8_url, media_id, { 'url': url, 'site_name': site[:3].upper(), From 967ebbdb6cdb655815f73482763ed8f6eeff5c96 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 23 Jan 2018 19:22:44 +0100 Subject: [PATCH 137/137] [prosiebensat1] add another clip ID regexp(fixes #15378) --- youtube_dl/extractor/prosiebensat1.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 7e680a728..48757fd4f 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -345,6 +345,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): r'clip[iI]d\s*=\s*["\'](\d+)', r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", r'proMamsId&quot;\s*:\s*&quot;(\d+)', + r'proMamsId"\s*:\s*"(\d+)', ] _TITLE_REGEXES = [ r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',