From 29ac31afaf627363fbc1f757aa50078d343acf1f Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Mon, 11 Dec 2017 12:25:13 +0800 Subject: [PATCH 01/29] simply get the correct webpage, but not parsed to extract information --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/weibo.py | 97 ++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 youtube_dl/extractor/weibo.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2cc3bc463..12dc2e7e8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1286,6 +1286,7 @@ from .webofstories import ( WebOfStoriesIE, WebOfStoriesPlaylistIE, ) +from .weibo import WeiboIE from .weiqitv import WeiqiTVIE from .wimp import WimpIE from .wistia import WistiaIE diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py new file mode 100644 index 000000000..195508e99 --- /dev/null +++ b/youtube_dl/extractor/weibo.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from urllib.request import Request +from urllib.parse import urlencode +import json +import random as rnd + +class WeiboIE(InfoExtractor): + _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' + _TEST = { + 'url': 'https://weibo.com/6275294458/Fp6RGfbff?from=page_1005056275294458_profile&wvr=6&mod=weibotime&type=comment', + 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'info_dict': { + 'id': '42', + 'ext': 'mp4', + 'title': 'Video title goes here', + 'thumbnail': r're:^https?://.*\.jpg$', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + headers = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36', + 'Upgrade-Insecure-Requests': '1', + } + # to get Referer url for genvisitor + webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="first visit the page") + + visitor_url = urlh.geturl() + + data = urlencode({ + "cb": "gen_callback", + "fp": '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', + }).encode() + headers = { + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': '*/*', + 'Referer': visitor_url, + } + + r_genvisitor = Request( + 'https://passport.weibo.com/visitor/genvisitor', + data = data, + headers = headers, + method = 'POST' + ) + webpage,urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") + print("webpage", webpage) + + p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" + i1 = p.find('{') + i2 = p.rfind('}') + j = p[i1:i2+1] # get JSON object + d = json.loads(j) + tid = d["data"]["tid"] + cnfd = "%03d" % d["data"]["confidence"] + + param = urlencode({ + 'a': 'incarnate', + 't': tid, + 'w': 2, + 'c': cnfd, + 'cb': 'cross_domain', + 'from': 'weibo', + '_rand': rnd.random() + }) + gencallback_url = "https://passport.weibo.com/visitor/visitor?" + param + webpage,urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") + print("webpage", webpage) + + webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") + print("webpage", webpage) + + # TODO more code goes here, for example ... + title = self._html_search_regex(r'(.+?)', webpage, 'title') + + video_sources = self._search_regex(r'video-sources=(.+?)', webpage, 'video_sources') + print("video_sources:", video_sources) + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), + # TODO more properties (see youtube_dl/extractor/common.py) + } From 3281af3464a910cb88f22ef0ece4a8323c2a4d38 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Mon, 11 Dec 2017 15:56:54 +0800 Subject: [PATCH 02/29] a working version --- youtube_dl/extractor/weibo.py | 41 +++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 195508e99..9b398e931 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -5,24 +5,19 @@ from .common import InfoExtractor from urllib.request import Request from urllib.parse import urlencode +from urllib import parse import json import random as rnd +from os import path class WeiboIE(InfoExtractor): _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' _TEST = { - 'url': 'https://weibo.com/6275294458/Fp6RGfbff?from=page_1005056275294458_profile&wvr=6&mod=weibotime&type=comment', - 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', 'info_dict': { - 'id': '42', + 'id': 'Fp6RGfbff', 'ext': 'mp4', - 'title': 'Video title goes here', - 'thumbnail': r're:^https?://.*\.jpg$', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) + 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', } } @@ -78,20 +73,34 @@ class WeiboIE(InfoExtractor): }) gencallback_url = "https://passport.weibo.com/visitor/visitor?" + param webpage,urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") - print("webpage", webpage) webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") - print("webpage", webpage) # TODO more code goes here, for example ... title = self._html_search_regex(r'(.+?)', webpage, 'title') - video_sources = self._search_regex(r'video-sources=(.+?)', webpage, 'video_sources') - print("video_sources:", video_sources) + video_sources_text = self._search_regex("video-sources=\\\\\"(.+?)\"", webpage, 'video_sources') + + video_formats = parse.parse_qs(video_sources_text) + + formats = [] + supported_resolutions = ['720', '480'] + for res in supported_resolutions: + f = video_formats.get(res) + if isinstance(f, list): + if len(f) > 0: + vid_url = f[0] + print("%s:%s" % (res, vid_url)) + formats.append({ + 'url': vid_url + }) + self._sort_formats(formats) + uploader = self._og_search_property('nick-name', webpage, 'uploader', default = None) + print(title, uploader) return { 'id': video_id, 'title': title, - 'description': self._og_search_description(webpage), - 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), + 'uploader': uploader, + 'formats': formats # TODO more properties (see youtube_dl/extractor/common.py) } From 0c69958844a446bc3373f45f8f750cbc3202d14e Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Mon, 11 Dec 2017 16:02:14 +0800 Subject: [PATCH 03/29] add other properties; remove print verbose --- youtube_dl/extractor/weibo.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 9b398e931..b835f8975 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -52,7 +52,6 @@ class WeiboIE(InfoExtractor): method = 'POST' ) webpage,urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") - print("webpage", webpage) p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" i1 = p.find('{') @@ -90,13 +89,13 @@ class WeiboIE(InfoExtractor): if isinstance(f, list): if len(f) > 0: vid_url = f[0] - print("%s:%s" % (res, vid_url)) formats.append({ - 'url': vid_url + 'url': vid_url, + 'format': 'mp4', + 'height': int(res), }) self._sort_formats(formats) uploader = self._og_search_property('nick-name', webpage, 'uploader', default = None) - print(title, uploader) return { 'id': video_id, 'title': title, From 447a5a710dcd05741ea8cefa2fe98b333534e07d Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 16:18:35 +0800 Subject: [PATCH 04/29] added weibo mobile site support --- youtube_dl/extractor/extractors.py | 5 +++- youtube_dl/extractor/weibo.py | 46 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 12dc2e7e8..f1ea735b5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1286,7 +1286,10 @@ from .webofstories import ( WebOfStoriesIE, WebOfStoriesPlaylistIE, ) -from .weibo import WeiboIE +from .weibo import ( + WeiboIE, + WeiboMobileIE +) from .weiqitv import WeiqiTVIE from .wimp import WimpIE from .wistia import WistiaIE diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index b835f8975..eda0fa63d 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -9,6 +9,11 @@ from urllib import parse import json import random as rnd from os import path +import re + +from ..utils import ( + js_to_json, +) class WeiboIE(InfoExtractor): _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' @@ -103,3 +108,44 @@ class WeiboIE(InfoExtractor): 'formats': formats # TODO more properties (see youtube_dl/extractor/common.py) } + +class WeiboMobileIE(InfoExtractor): + _VALID_URL = r'https?://m.weibo.cn/status/(?P[0-9]+)(\?.+)?' + _TEST = { + 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', + 'info_dict': { + 'id': '4189191225395228', + 'ext': 'mp4', + 'title': '午睡当然是要甜甜蜜蜜的啦', + 'uploader': '柴犬柴犬' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + headers = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8', + 'Upgrade-Insecure-Requests': '1', + } + # to get Referer url for genvisitor + webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="visit the page") + js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags = re.DOTALL) + weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) + page_info = weibo_info['status']['page_info'] + title = weibo_info['status']['status_title'] + format = { + 'url': page_info['media_info']['stream_url'], + 'format': 'mp4', + } + formats = [format] + uploader = weibo_info['status']['user']['screen_name'] + + return { + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'formats': formats + # TODO more properties (see youtube_dl/extractor/common.py) + } From d2be5bb5af7a1d7108b272315265e103a4358b28 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 16:28:47 +0800 Subject: [PATCH 05/29] change to use compat urllib --- youtube_dl/extractor/weibo.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index eda0fa63d..6a4e0a4cb 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -3,14 +3,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from urllib.request import Request -from urllib.parse import urlencode -from urllib import parse import json import random as rnd from os import path import re +from ..compat import ( + compat_urllib_parse_urlencode as urlencode, + compat_urllib_request as Request, + compat_urlparse as parse, +) from ..utils import ( js_to_json, ) From 951043724f91b3cfce60cf62cc3228a91a04ae81 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 16:38:51 +0800 Subject: [PATCH 06/29] re-format code to pass flake8 --- youtube_dl/extractor/weibo.py | 103 +++++++++++++++++----------------- 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 6a4e0a4cb..b4ac7b9fa 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -5,7 +5,6 @@ from .common import InfoExtractor import json import random as rnd -from os import path import re from ..compat import ( @@ -17,16 +16,17 @@ from ..utils import ( js_to_json, ) + class WeiboIE(InfoExtractor): _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' _TEST = { - 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', - 'info_dict': { - 'id': 'Fp6RGfbff', - 'ext': 'mp4', - 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', - } - } + 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', + 'info_dict': { + 'id': 'Fp6RGfbff', + 'ext': 'mp4', + 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', + } + } def _real_extract(self, url): video_id = self._match_id(url) @@ -38,32 +38,32 @@ class WeiboIE(InfoExtractor): 'Upgrade-Insecure-Requests': '1', } # to get Referer url for genvisitor - webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="first visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="first visit the page") visitor_url = urlh.geturl() data = urlencode({ "cb": "gen_callback", "fp": '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', - }).encode() + }).encode() headers = { - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept': '*/*', - 'Referer': visitor_url, - } + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': '*/*', + 'Referer': visitor_url, + } r_genvisitor = Request( 'https://passport.weibo.com/visitor/genvisitor', - data = data, - headers = headers, - method = 'POST' - ) - webpage,urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") + data=data, + headers=headers, + method='POST' + ) + webpage, urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") - p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" + p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" i1 = p.find('{') i2 = p.rfind('}') - j = p[i1:i2+1] # get JSON object + j = p[i1:i2 + 1] # get JSON object d = json.loads(j) tid = d["data"]["tid"] cnfd = "%03d" % d["data"]["confidence"] @@ -76,17 +76,17 @@ class WeiboIE(InfoExtractor): 'cb': 'cross_domain', 'from': 'weibo', '_rand': rnd.random() - }) + }) gencallback_url = "https://passport.weibo.com/visitor/visitor?" + param - webpage,urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") + webpage, urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") - webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") # TODO more code goes here, for example ... title = self._html_search_regex(r'(.+?)', webpage, 'title') video_sources_text = self._search_regex("video-sources=\\\\\"(.+?)\"", webpage, 'video_sources') - + video_formats = parse.parse_qs(video_sources_text) formats = [] @@ -100,28 +100,29 @@ class WeiboIE(InfoExtractor): 'url': vid_url, 'format': 'mp4', 'height': int(res), - }) + }) self._sort_formats(formats) - uploader = self._og_search_property('nick-name', webpage, 'uploader', default = None) + uploader = self._og_search_property('nick-name', webpage, 'uploader', default=None) return { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'formats': formats - # TODO more properties (see youtube_dl/extractor/common.py) - } + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'formats': formats + # TODO more properties (see youtube_dl/extractor/common.py) + } + class WeiboMobileIE(InfoExtractor): _VALID_URL = r'https?://m.weibo.cn/status/(?P[0-9]+)(\?.+)?' _TEST = { - 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', - 'info_dict': { - 'id': '4189191225395228', - 'ext': 'mp4', - 'title': '午睡当然是要甜甜蜜蜜的啦', - 'uploader': '柴犬柴犬' - } - } + 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', + 'info_dict': { + 'id': '4189191225395228', + 'ext': 'mp4', + 'title': '午睡当然是要甜甜蜜蜜的啦', + 'uploader': '柴犬柴犬' + } + } def _real_extract(self, url): video_id = self._match_id(url) @@ -132,22 +133,22 @@ class WeiboMobileIE(InfoExtractor): 'Upgrade-Insecure-Requests': '1', } # to get Referer url for genvisitor - webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="visit the page") - js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags = re.DOTALL) + webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="visit the page") + js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) page_info = weibo_info['status']['page_info'] title = weibo_info['status']['status_title'] format = { 'url': page_info['media_info']['stream_url'], - 'format': 'mp4', - } + 'format': 'mp4', + } formats = [format] uploader = weibo_info['status']['user']['screen_name'] return { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'formats': formats - # TODO more properties (see youtube_dl/extractor/common.py) - } + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'formats': formats + # TODO more properties (see youtube_dl/extractor/common.py) + } From 25936512245fc571ab716d59e2d73c50d8cad6ce Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 16:46:01 +0800 Subject: [PATCH 07/29] fix compat_urllib_request for python2.7 --- youtube_dl/extractor/weibo.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index b4ac7b9fa..f8a5ee71c 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -9,7 +9,7 @@ import re from ..compat import ( compat_urllib_parse_urlencode as urlencode, - compat_urllib_request as Request, + compat_urllib_request as request, compat_urlparse as parse, ) from ..utils import ( @@ -52,11 +52,10 @@ class WeiboIE(InfoExtractor): 'Referer': visitor_url, } - r_genvisitor = Request( + r_genvisitor = request.Request( 'https://passport.weibo.com/visitor/genvisitor', data=data, headers=headers, - method='POST' ) webpage, urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") @@ -85,7 +84,7 @@ class WeiboIE(InfoExtractor): # TODO more code goes here, for example ... title = self._html_search_regex(r'(.+?)', webpage, 'title') - video_sources_text = self._search_regex("video-sources=\\\\\"(.+?)\"", webpage, 'video_sources') + video_sources_text = self._search_regex(r'video-sources=\\\"(.+?)\"', webpage, 'video_sources') video_formats = parse.parse_qs(video_sources_text) From 42a1012c7767306626c5358a18ad3e86417bd7b7 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 22:20:43 +0800 Subject: [PATCH 08/29] fix according to "https://github.com/rg3/youtube-dl/pull/15079#discussion_r158688607" --- youtube_dl/extractor/weibo.py | 85 +++++++++++++---------------------- 1 file changed, 32 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index f8a5ee71c..2be31fe77 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -4,13 +4,12 @@ from __future__ import unicode_literals from .common import InfoExtractor import json -import random as rnd +import random import re from ..compat import ( - compat_urllib_parse_urlencode as urlencode, - compat_urllib_request as request, - compat_urlparse as parse, + compat_urllib_parse_urlencode, + compat_urlparse, ) from ..utils import ( js_to_json, @@ -30,34 +29,28 @@ class WeiboIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - headers = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36', - 'Upgrade-Insecure-Requests': '1', - } # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="first visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id, note="first visit the page") visitor_url = urlh.geturl() - - data = urlencode({ - "cb": "gen_callback", - "fp": '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', - }).encode() headers = { - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept': '*/*', - 'Referer': visitor_url, + 'Referer': visitor_url } - r_genvisitor = request.Request( - 'https://passport.weibo.com/visitor/genvisitor', - data=data, - headers=headers, - ) - webpage, urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") + fp = { + "os": "2", + "browser": "Gecko57,0,0,0", + "fonts": "undefined", + "screenInfo": "1440*900*24", + "plugins": "" + } + data = compat_urllib_parse_urlencode({ + "cb": "gen_callback", + "fp": json.dumps(fp), + }).encode() + + genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' + webpage, _ = self._download_webpage_handle(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" i1 = p.find('{') @@ -67,29 +60,28 @@ class WeiboIE(InfoExtractor): tid = d["data"]["tid"] cnfd = "%03d" % d["data"]["confidence"] - param = urlencode({ + query = { 'a': 'incarnate', 't': tid, 'w': 2, 'c': cnfd, 'cb': 'cross_domain', 'from': 'weibo', - '_rand': rnd.random() - }) - gencallback_url = "https://passport.weibo.com/visitor/visitor?" + param - webpage, urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") + '_rand': random.random() + } + gencallback_url = "https://passport.weibo.com/visitor/visitor" + self._download_webpage_handle(gencallback_url, video_id, note="gen callback", query=query) - webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") + webpage, _ = self._download_webpage_handle(url, video_id, note="retry to visit the page") - # TODO more code goes here, for example ... title = self._html_search_regex(r'(.+?)', webpage, 'title') video_sources_text = self._search_regex(r'video-sources=\\\"(.+?)\"', webpage, 'video_sources') - video_formats = parse.parse_qs(video_sources_text) + video_formats = compat_urlparse.parse_qs(video_sources_text) formats = [] - supported_resolutions = ['720', '480'] + supported_resolutions = ('720', '480') for res in supported_resolutions: f = video_formats.get(res) if isinstance(f, list): @@ -107,12 +99,11 @@ class WeiboIE(InfoExtractor): 'title': title, 'uploader': uploader, 'formats': formats - # TODO more properties (see youtube_dl/extractor/common.py) } class WeiboMobileIE(InfoExtractor): - _VALID_URL = r'https?://m.weibo.cn/status/(?P[0-9]+)(\?.+)?' + _VALID_URL = r'https?://m\.weibo\.cn/status/(?P[0-9]+)(\?.+)?' _TEST = { 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', 'info_dict': { @@ -125,29 +116,17 @@ class WeiboMobileIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - headers = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8', - 'Upgrade-Insecure-Requests': '1', - } # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="visit the page") + webpage, _ = self._download_webpage_handle(url, video_id, note="visit the page") js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) page_info = weibo_info['status']['page_info'] - title = weibo_info['status']['status_title'] - format = { - 'url': page_info['media_info']['stream_url'], - 'format': 'mp4', - } - formats = [format] - uploader = weibo_info['status']['user']['screen_name'] + title = weibo_info.get('status').get('status_title') + uploader = weibo_info.get('status').get('user').get('screen_name') return { 'id': video_id, 'title': title, 'uploader': uploader, - 'formats': formats - # TODO more properties (see youtube_dl/extractor/common.py) + 'url': page_info['media_info']['stream_url'] } From 5c97ec5ff5fd77a7975e1e946d53a76ccd5ef0de Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Sat, 30 Dec 2017 01:08:56 +0800 Subject: [PATCH 09/29] replace urlencode.encode with urlencode_postdata --- youtube_dl/extractor/weibo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 2be31fe77..0b28952c9 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -8,11 +8,11 @@ import random import re from ..compat import ( - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( js_to_json, + urlencode_postdata, ) @@ -44,10 +44,10 @@ class WeiboIE(InfoExtractor): "screenInfo": "1440*900*24", "plugins": "" } - data = compat_urllib_parse_urlencode({ + data = urlencode_postdata({ "cb": "gen_callback", "fp": json.dumps(fp), - }).encode() + }) genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' webpage, _ = self._download_webpage_handle(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") From 6a41a12d2960efb7b32d3b6ef74cf6237766b569 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Sat, 30 Dec 2017 01:11:30 +0800 Subject: [PATCH 10/29] replace split with strip_jsonp --- youtube_dl/extractor/weibo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 0b28952c9..71e7123e4 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -12,6 +12,7 @@ from ..compat import ( ) from ..utils import ( js_to_json, + strip_jsonp, urlencode_postdata, ) @@ -52,7 +53,7 @@ class WeiboIE(InfoExtractor): genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' webpage, _ = self._download_webpage_handle(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") - p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" + p = strip_jsonp(webpage) i1 = p.find('{') i2 = p.rfind('}') j = p[i1:i2 + 1] # get JSON object From 48058d82dc3b448a72fd5ac1e7fa5492cd11f640 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Sat, 30 Dec 2017 01:14:21 +0800 Subject: [PATCH 11/29] replace unused _download_webpage_handle with _download_webpage --- youtube_dl/extractor/weibo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 71e7123e4..34809bdb2 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -51,7 +51,7 @@ class WeiboIE(InfoExtractor): }) genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' - webpage, _ = self._download_webpage_handle(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") + webpage = self._download_webpage(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") p = strip_jsonp(webpage) i1 = p.find('{') @@ -71,9 +71,9 @@ class WeiboIE(InfoExtractor): '_rand': random.random() } gencallback_url = "https://passport.weibo.com/visitor/visitor" - self._download_webpage_handle(gencallback_url, video_id, note="gen callback", query=query) + self._download_webpage(gencallback_url, video_id, note="gen callback", query=query) - webpage, _ = self._download_webpage_handle(url, video_id, note="retry to visit the page") + webpage = self._download_webpage(url, video_id, note="retry to visit the page") title = self._html_search_regex(r'(.+?)', webpage, 'title') @@ -118,7 +118,7 @@ class WeiboMobileIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) # to get Referer url for genvisitor - webpage, _ = self._download_webpage_handle(url, video_id, note="visit the page") + webpage = self._download_webpage(url, video_id, note="visit the page") js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) page_info = weibo_info['status']['page_info'] From 6648fd8ad6e581354f46c840465cff4c92d2c6f3 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Mon, 1 Jan 2018 18:33:14 +0800 Subject: [PATCH 12/29] changed to use .get to get field from json object --- youtube_dl/extractor/weibo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 34809bdb2..cbe0c3228 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -121,7 +121,7 @@ class WeiboMobileIE(InfoExtractor): webpage = self._download_webpage(url, video_id, note="visit the page") js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) - page_info = weibo_info['status']['page_info'] + page_info = weibo_info.get('status').get('page_info') title = weibo_info.get('status').get('status_title') uploader = weibo_info.get('status').get('user').get('screen_name') From 0b0870f9d0dd3e72be3ff6be6bfa9fa43b693b50 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Thu, 4 Jan 2018 19:25:42 -0600 Subject: [PATCH 13/29] [soundcloud] Fallback to avatar picture for thumbnail (closes #12878) --- youtube_dl/extractor/soundcloud.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 8894f4b0c..6c9816eef 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -136,6 +136,25 @@ class SoundcloudIE(InfoExtractor): 'license': 'all-rights-reserved', }, }, + # no album art, use avatar pic for thumbnail + { + 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real', + 'md5': '59c7872bc44e5d99b7211891664760c2', + 'info_dict': { + 'id': '309699954', + 'ext': 'mp3', + 'title': 'Sideways (Prod. Mad Real)', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'uploader': 'garyvee', + 'upload_date': '20170226', + 'duration': 207, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + }, + 'params': { + 'skip_download': True, + }, + }, ] _CLIENT_ID = 'c6CU49JDMapyrQo06UxU9xouB9ZVzqCn' @@ -160,7 +179,7 @@ class SoundcloudIE(InfoExtractor): name = full_title or track_id if quiet: self.report_extraction(name) - thumbnail = info.get('artwork_url') + thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') ext = 'mp3' From b7c74c04036c07f8a81d3048b482afd6ef384b40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 6 Jan 2018 23:12:30 +0700 Subject: [PATCH 14/29] [lynda] Relax _VALID_URL (closes #15185) --- youtube_dl/extractor/lynda.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 0d6026aad..f5c7abc13 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -94,7 +94,15 @@ class LyndaBaseIE(InfoExtractor): class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' - _VALID_URL = r'https?://(?:www\.)?(?:lynda\.com|educourse\.ga)/(?:[^/]+/[^/]+/(?P\d+)|player/embed)/(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?:lynda\.com|educourse\.ga)/ + (?: + (?:[^/]+/){2,3}(?P\d+)| + player/embed + )/ + (?P\d+) + ''' _TIMECODE_REGEX = r'\[(?P\d+:\d+:\d+[\.,]\d+)\]' @@ -113,6 +121,9 @@ class LyndaIE(LyndaBaseIE): }, { 'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', 'only_matching': True, + }, { + 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html', + 'only_matching': True, }] def _raise_unavailable(self, video_id): @@ -257,7 +268,15 @@ class LyndaCourseIE(LyndaBaseIE): # Course link equals to welcome/introduction video link of same course # We will recognize it as course link - _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P[^/]+/[^/]+/(?P\d+))-\d\.html' + _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P(?:[^/]+/){2,3}(?P\d+))-2\.html' + + _TESTS = [{ + 'url': 'https://www.lynda.com/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', + 'only_matching': True, + }, { + 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 45283afdec81af21ba50ff3aca3d86fb6d2584b0 Mon Sep 17 00:00:00 2001 From: Martin Weinelt Date: Sat, 6 Jan 2018 17:33:40 +0100 Subject: [PATCH 15/29] [motherless] Add support for groups --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/motherless.py | 73 ++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e64defe62..fb0997d39 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -609,7 +609,10 @@ from .mofosex import MofosexIE from .mojvideo import MojvideoIE from .moniker import MonikerIE from .morningstar import MorningstarIE -from .motherless import MotherlessIE +from .motherless import ( + MotherlessIE, + MotherlessGroupIE +) from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 6fe3b6049..90ed91ba6 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -4,8 +4,11 @@ import datetime import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( ExtractorError, + InAdvancePagedList, + orderedSet, str_to_int, unified_strdate, ) @@ -114,3 +117,73 @@ class MotherlessIE(InfoExtractor): 'age_limit': age_limit, 'url': video_url, } + + +class MotherlessGroupIE(InfoExtractor): + _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P[a-z0-9_]+)' + _TESTS = [{ + 'url': 'http://motherless.com/g/movie_scenes', + 'info_dict': { + 'id': 'movie_scenes', + 'title': 'Movie Scenes', + 'description': 'Hot and sexy scenes from "regular" movies... ' + 'Beautiful actresses fully nude... A looot of ' + 'skin! :)Enjoy!', + }, + 'playlist_mincount': 662, + }, { + 'url': 'http://motherless.com/gv/sex_must_be_funny', + 'info_dict': { + 'id': 'sex_must_be_funny', + 'title': 'Sex must be funny', + 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' + 'any kind!' + }, + 'playlist_mincount': 9, + }] + + @classmethod + def suitable(cls, url): + return (False if MotherlessIE.suitable(url) + else super(MotherlessGroupIE, cls).suitable(url)) + + def _extract_entries(self, webpage, base): + return [ + self.url_result( + compat_urlparse.urljoin(base, video_path), + MotherlessIE.ie_key(), video_title=title) + for video_path, title in orderedSet(re.findall( + r'href="/([^"]+)"[^>]+>\s+]+alt="[^-]+-\s([^"]+)"', + webpage)) + ] + + def _real_extract(self, url): + group_id = self._match_id(url) + page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id) + webpage = self._download_webpage(page_url, group_id) + title = self._search_regex( + r'([\w\s]+\w)\s+-', webpage, 'title', fatal=False) + description = self._html_search_meta( + 'description', webpage, fatal=False) + page_count = self._int(self._search_regex( + r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT', + webpage, 'page_count'), 'page_count') + PAGE_SIZE = 80 + + def _get_page(idx): + webpage = self._download_webpage( + page_url, group_id, query={'page': idx + 1}, + note='Downloading page %d/%d' % (idx + 1, page_count) + ) + for entry in self._extract_entries(webpage, url): + yield entry + + playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': group_id, + 'title': title, + 'description': description, + 'entries': playlist + } From f12628f934ff50cc8e6441c4e64fe61019ebae5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 6 Jan 2018 23:58:00 +0700 Subject: [PATCH 16/29] [mitele] Fix extraction (closes #15186) --- youtube_dl/extractor/mitele.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 964dc542c..42759eae8 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,13 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import json import uuid from .common import InfoExtractor from .ooyala import OoyalaIE from ..compat import ( compat_str, - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -42,31 +42,33 @@ class MiTeleBaseIE(InfoExtractor): duration = int_or_none(mmc.get('duration')) for location in mmc['locations']: gat = self._proto_relative_url(location.get('gat'), 'http:') - bas = location.get('bas') - loc = location.get('loc') + gcp = location.get('gcp') ogn = location.get('ogn') - if None in (gat, bas, loc, ogn): + if None in (gat, gcp, ogn): continue token_data = { - 'bas': bas, - 'icd': loc, + 'gcp': gcp, 'ogn': ogn, - 'sta': '0', + 'sta': 0, } media = self._download_json( - '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), - video_id, 'Downloading %s JSON' % location['loc']) - file_ = media.get('file') - if not file_: + gat, video_id, data=json.dumps(token_data).encode('utf-8'), + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }) + stream = media.get('stream') or media.get('file') + if not stream: continue - ext = determine_ext(file_) + ext = determine_ext(stream) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + stream, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) self._sort_formats(formats) return { From a133eb7764594b830cb975e3925972214e932704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Jan 2018 00:02:41 +0700 Subject: [PATCH 17/29] [motherless:group] Capture leading slash of video path --- youtube_dl/extractor/motherless.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 90ed91ba6..4adac691c 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -153,7 +153,7 @@ class MotherlessGroupIE(InfoExtractor): compat_urlparse.urljoin(base, video_path), MotherlessIE.ie_key(), video_title=title) for video_path, title in orderedSet(re.findall( - r'href="/([^"]+)"[^>]+>\s+<img[^>]+alt="[^-]+-\s([^"]+)"', + r'href="(/[^"]+)"[^>]+>\s+<img[^>]+alt="[^-]+-\s([^"]+)"', webpage)) ] From 0a5b1295b7c1aa6395b65ee137087c540b37b32b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Jan 2018 00:31:53 +0700 Subject: [PATCH 18/29] [motherless:group] Relax entry extraction and add a fallback scenario --- youtube_dl/extractor/motherless.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 4adac691c..e24396e79 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -148,14 +148,27 @@ class MotherlessGroupIE(InfoExtractor): else super(MotherlessGroupIE, cls).suitable(url)) def _extract_entries(self, webpage, base): - return [ - self.url_result( - compat_urlparse.urljoin(base, video_path), - MotherlessIE.ie_key(), video_title=title) - for video_path, title in orderedSet(re.findall( - r'href="(/[^"]+)"[^>]+>\s+<img[^>]+alt="[^-]+-\s([^"]+)"', - webpage)) - ] + entries = [] + for mobj in re.finditer( + r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?', + webpage): + video_url = compat_urlparse.urljoin(base, mobj.group('href')) + if not MotherlessIE.suitable(video_url): + continue + video_id = MotherlessIE._match_id(video_url) + title = mobj.group('title') + entries.append(self.url_result( + video_url, ie=MotherlessIE.ie_key(), video_id=video_id, + video_title=title)) + # Alternative fallback + if not entries: + entries = [ + self.url_result( + compat_urlparse.urljoin(base, '/' + video_id), + ie=MotherlessIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'data-codename=["\']([A-Z0-9]+)', webpage))] + return entries def _real_extract(self, url): group_id = self._match_id(url) From b0ead0e09aae6de6026a018cda7019eb7eade919 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Jan 2018 21:49:23 +0700 Subject: [PATCH 19/29] [jwplatform] Add support for multiple embeds (closes #15192) --- youtube_dl/extractor/generic.py | 6 +++--- youtube_dl/extractor/jwplatform.py | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index cc4c90b8c..9b0cd004f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2708,9 +2708,9 @@ class GenericIE(InfoExtractor): return self.url_result(viewlift_url) # Look for JWPlatform embeds - jwplatform_url = JWPlatformIE._extract_url(webpage) - if jwplatform_url: - return self.url_result(jwplatform_url, 'JWPlatform') + jwplatform_urls = JWPlatformIE._extract_urls(webpage) + if jwplatform_urls: + return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) # Look for Digiteka embeds digiteka_url = DigitekaIE._extract_url(webpage) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index c9bcbb08f..63d0dc998 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -23,11 +23,14 @@ class JWPlatformIE(InfoExtractor): @staticmethod def _extract_url(webpage): - mobj = re.search( - r'<(?:script|iframe)[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', + urls = JWPlatformIE._extract_urls(webpage) + return urls[0] if urls else None + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//content\.jwplatform\.com/players/[a-zA-Z0-9]{8})', webpage) - if mobj: - return mobj.group('url') def _real_extract(self, url): video_id = self._match_id(url) From 8faa9576bb4599dc3e77b8d3339122aa4f1230b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Jan 2018 23:48:56 +0700 Subject: [PATCH 20/29] [ChangeLog] Actualize --- ChangeLog | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ChangeLog b/ChangeLog index 96bc471f3..67de65355 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,9 +1,21 @@ version <unreleased> +Core +* [utils] Fix youtube-dl under PyPy3 on Windows +* [YoutubeDL] Output python implementation in debug header + Extractors ++ [jwplatform] Add support for multiple embeds (#15192) +* [mitele] Fix extraction (#15186) ++ [motherless] Add support for groups (#15124) +* [lynda] Relax URL regular expression (#15185) +* [soundcloud] Fallback to avatar picture for thumbnail (#12878) * [youku] Fix list extraction (#15135) * [openload] Fix extraction (#15166) +* [lynda] Skip invalid subtitles (#15159) +* [twitch] Pass video id to url_result when extracting playlist (#15139) * [rtve.es:alacarta] Fix extraction of some new URLs +* [acast] Fix extraction (#15147) version 2017.12.31 From 950b5f296986ed0a2dd9feeb69dbb950592b6047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Jan 2018 23:52:16 +0700 Subject: [PATCH 21/29] release 2018.01.07 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 3f8984943..ad52c8900 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.31*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.31** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.07*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.07** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.12.31 +[debug] youtube-dl version 2018.01.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 67de65355..9c45ae000 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.01.07 Core * [utils] Fix youtube-dl under PyPy3 on Windows diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 75bd5c922..79b343048 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -478,6 +478,7 @@ - **Moniker**: allmyvideos.net and vidspot.net - **Morningstar**: morningstar.com - **Motherless** + - **MotherlessGroup** - **Motorsport**: motorsport.com - **MovieClips** - **MovieFap** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a3f84b9ea..9030e2415 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.12.31' +__version__ = '2018.01.07' From 3a513f29adc42fc46fd8b754806d38444bcee151 Mon Sep 17 00:00:00 2001 From: Luca Steeb <contact@luca-steeb.com> Date: Sat, 6 Jan 2018 20:27:26 +0100 Subject: [PATCH 22/29] fix bilibili extraction (closes #15171) --- youtube_dl/extractor/bilibili.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 1e57310d6..beffcecd0 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -102,6 +102,7 @@ class BiliBiliIE(InfoExtractor): video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) headers = { 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': url } headers.update(self.geo_verification_headers()) @@ -116,10 +117,15 @@ class BiliBiliIE(InfoExtractor): payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() + headers = { + 'Referer': url + } + headers.update(self.geo_verification_headers()) + video_info = self._download_json( 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), video_id, note='Downloading video info page', - headers=self.geo_verification_headers()) + headers=headers) if 'durl' not in video_info: self._report_error(video_info) From 7643916a3794f52169d16df093bd4a2b3abbb323 Mon Sep 17 00:00:00 2001 From: Chih-Hsuan Yen <yan12125@gmail.com> Date: Mon, 8 Jan 2018 01:32:13 +0800 Subject: [PATCH 23/29] [ChangeLog] update after #15188 [ci skip] --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index 9c45ae000..9d37cdcef 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors +* [bilibili] fix extraction (#15188) + + version 2018.01.07 Core From a39e15c516865259735bd8f4f5629de5b0e77847 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 7 Jan 2018 22:15:44 +0100 Subject: [PATCH 24/29] [canalplus] fix extraction(closes #15072) --- youtube_dl/extractor/canalplus.py | 99 ++++--------------------------- 1 file changed, 12 insertions(+), 87 deletions(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index d8bf073f4..51c11cb7e 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -4,59 +4,36 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( - dict_get, # ExtractorError, # HEADRequest, int_or_none, qualities, - remove_end, unified_strdate, ) class CanalplusIE(InfoExtractor): - IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv' - _VALID_URL = r'''(?x) - https?:// - (?: - (?: - (?:(?:www|m)\.)?canalplus\.fr| - (?:www\.)?piwiplus\.fr| - (?:www\.)?d8\.tv| - (?:www\.)?c8\.fr| - (?:www\.)?d17\.tv| - (?:(?:football|www)\.)?cstar\.fr| - (?:www\.)?itele\.fr - )/(?:(?:[^/]+/)*(?P<display_id>[^/?#&]+))?(?:\?.*\bvid=(?P<vid>\d+))?| - player\.canalplus\.fr/#/(?P<id>\d+) - ) - - ''' + IE_DESC = 'mycanal.fr and piwiplus.fr' + _VALID_URL = r'https?://(?:www\.)?(?P<site>mycanal|piwiplus)\.fr/(?:[^/]+/)*(?P<display_id>[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P<id>\d+)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' _SITE_ID_MAP = { - 'canalplus': 'cplus', + 'mycanal': 'cplus', 'piwiplus': 'teletoon', - 'd8': 'd8', - 'c8': 'd8', - 'd17': 'd17', - 'cstar': 'd17', - 'itele': 'itele', } # Only works for direct mp4 URLs _GEO_COUNTRIES = ['FR'] _TESTS = [{ - 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814', + 'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061', 'info_dict': { - 'id': '1405510', - 'display_id': 'pid1830-c-zapping', + 'id': '1397061', + 'display_id': 'lolywood', 'ext': 'mp4', - 'title': 'Zapping - 02/07/2016', - 'description': 'Le meilleur de toutes les chaînes, tous les jours', - 'upload_date': '20160702', + 'title': 'Euro 2016 : Je préfère te prévenir - Lolywood - Episode 34', + 'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e', + 'upload_date': '20160602', }, }, { # geo restricted, bypassed @@ -70,64 +47,12 @@ class CanalplusIE(InfoExtractor): 'upload_date': '20140724', }, 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, { - # geo restricted, bypassed - 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html?vid=1443684', - 'md5': 'bb6f9f343296ab7ebd88c97b660ecf8d', - 'info_dict': { - 'id': '1443684', - 'display_id': 'pid6318-videos-integrales', - 'ext': 'mp4', - 'title': 'Guess my iep ! - TPMP - 07/04/2017', - 'description': 'md5:6f005933f6e06760a9236d9b3b5f17fa', - 'upload_date': '20170407', - }, - 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, { - 'url': 'http://www.itele.fr/chroniques/invite-michael-darmon/rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', - 'info_dict': { - 'id': '1420176', - 'display_id': 'rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', - 'ext': 'mp4', - 'title': 'L\'invité de Michaël Darmon du 14/10/2016 - ', - 'description': 'Chaque matin du lundi au vendredi, Michaël Darmon reçoit un invité politique à 8h25.', - 'upload_date': '20161014', - }, - }, { - 'url': 'http://football.cstar.fr/cstar-minisite-foot/pid7566-feminines-videos.html?vid=1416769', - 'info_dict': { - 'id': '1416769', - 'display_id': 'pid7566-feminines-videos', - 'ext': 'mp4', - 'title': 'France - Albanie : les temps forts de la soirée - 20/09/2016', - 'description': 'md5:c3f30f2aaac294c1c969b3294de6904e', - 'upload_date': '20160921', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://m.canalplus.fr/?vid=1398231', - 'only_matching': True, - }, { - 'url': 'http://www.d17.tv/emissions/pid8303-lolywood.html?vid=1397061', - 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + site, display_id, video_id = re.match(self._VALID_URL, url).groups() - site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]] - - # Beware, some subclasses do not define an id group - display_id = remove_end(dict_get(mobj.groupdict(), ('display_id', 'id', 'vid')), '.html') - - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)', - r'id=["\']canal_video_player(?P<id>\d+)', - r'data-video=["\'](?P<id>\d+)'], - webpage, 'video id', default=mobj.group('vid'), group='id') + site_id = self._SITE_ID_MAP[site] info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) video_data = self._download_json(info_url, video_id, 'Downloading video JSON') @@ -161,7 +86,7 @@ class CanalplusIE(InfoExtractor): format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False)) else: formats.append({ - # the secret extracted ya function in http://player.canalplus.fr/common/js/canalPlayer.js + # the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js 'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes', 'format_id': format_id, 'preference': preference(format_id), From 8005dc68cbdfc15b6353a071ef87d7e57d69ff59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 8 Jan 2018 21:53:03 +0700 Subject: [PATCH 25/29] [ok] Add support for live streams --- youtube_dl/extractor/odnoklassniki.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 8e13bcf1f..5c8b37e18 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -19,11 +19,11 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P<id>[\d-]+)' + _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer|live)/(?P<id>[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', - 'md5': '6ba728d85d60aa2e6dd37c9e70fdc6bc', + 'md5': '0b62089b479e06681abaaca9d204f152', 'info_dict': { 'id': '20079905452', 'ext': 'mp4', @@ -35,7 +35,6 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': int, 'age_limit': 0, }, - 'skip': 'Video has been blocked', }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', @@ -99,6 +98,9 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'http://mobile.ok.ru/video/20079905452', 'only_matching': True, + }, { + 'url': 'https://www.ok.ru/live/484531969818', + 'only_matching': True, }] def _real_extract(self, url): @@ -184,6 +186,10 @@ class OdnoklassnikiIE(InfoExtractor): }) return info + assert title + if provider == 'LIVE_TV_APP': + info['title'] = self._live_title(title) + quality = qualities(('4', '0', '1', '2', '3', '5')) formats = [{ @@ -210,6 +216,20 @@ class OdnoklassnikiIE(InfoExtractor): if fmt_type: fmt['quality'] = quality(fmt_type) + # Live formats + m3u8_url = metadata.get('hlsMasterPlaylistUrl') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8', + m3u8_id='hls', fatal=False)) + rtmp_url = metadata.get('rtmpUrl') + if rtmp_url: + formats.append({ + 'url': rtmp_url, + 'format_id': 'rtmp', + 'ext': 'flv', + }) + self._sort_formats(formats) info['formats'] = formats From 5eca00a2e33a6ca26a7f52589e5d77bab7e5edf4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 9 Jan 2018 18:12:55 +0800 Subject: [PATCH 26/29] [weibo] Misc improvements --- youtube_dl/extractor/weibo.py | 125 ++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index cbe0c3228..3cb4d71a6 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -8,7 +8,8 @@ import random import re from ..compat import ( - compat_urlparse, + compat_parse_qs, + compat_str, ) from ..utils import ( js_to_json, @@ -31,70 +32,71 @@ class WeiboIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id, note="first visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id) visitor_url = urlh.geturl() - headers = { - 'Referer': visitor_url - } - fp = { - "os": "2", - "browser": "Gecko57,0,0,0", - "fonts": "undefined", - "screenInfo": "1440*900*24", - "plugins": "" - } - data = urlencode_postdata({ - "cb": "gen_callback", - "fp": json.dumps(fp), - }) + if 'passport.weibo.com' in visitor_url: + # first visit + visitor_data = self._download_json( + 'https://passport.weibo.com/visitor/genvisitor', video_id, + note='Generating first-visit data', + transform_source=strip_jsonp, + headers={'Referer': visitor_url}, + data=urlencode_postdata({ + 'cb': 'gen_callback', + 'fp': json.dumps({ + 'os': '2', + 'browser': 'Gecko57,0,0,0', + 'fonts': 'undefined', + 'screenInfo': '1440*900*24', + 'plugins': '', + }), + })) - genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' - webpage = self._download_webpage(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") + tid = visitor_data['data']['tid'] + cnfd = '%03d' % visitor_data['data']['confidence'] - p = strip_jsonp(webpage) - i1 = p.find('{') - i2 = p.rfind('}') - j = p[i1:i2 + 1] # get JSON object - d = json.loads(j) - tid = d["data"]["tid"] - cnfd = "%03d" % d["data"]["confidence"] + self._download_webpage( + 'https://passport.weibo.com/visitor/visitor', video_id, + note='Running first-visit callback', + query={ + 'a': 'incarnate', + 't': tid, + 'w': 2, + 'c': cnfd, + 'cb': 'cross_domain', + 'from': 'weibo', + '_rand': random.random(), + }) - query = { - 'a': 'incarnate', - 't': tid, - 'w': 2, - 'c': cnfd, - 'cb': 'cross_domain', - 'from': 'weibo', - '_rand': random.random() - } - gencallback_url = "https://passport.weibo.com/visitor/visitor" - self._download_webpage(gencallback_url, video_id, note="gen callback", query=query) + webpage = self._download_webpage( + url, video_id, note='Revisiting webpage') - webpage = self._download_webpage(url, video_id, note="retry to visit the page") + title = self._html_search_regex( + r'<title>(.+?)', webpage, 'title') - title = self._html_search_regex(r'(.+?)', webpage, 'title') - - video_sources_text = self._search_regex(r'video-sources=\\\"(.+?)\"', webpage, 'video_sources') - - video_formats = compat_urlparse.parse_qs(video_sources_text) + video_formats = compat_parse_qs(self._search_regex( + r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) formats = [] - supported_resolutions = ('720', '480') + supported_resolutions = (480, 720) for res in supported_resolutions: - f = video_formats.get(res) - if isinstance(f, list): - if len(f) > 0: - vid_url = f[0] - formats.append({ - 'url': vid_url, - 'format': 'mp4', - 'height': int(res), - }) + vid_urls = video_formats.get(compat_str(res)) + if not vid_urls or not isinstance(vid_urls, list): + continue + + vid_url = vid_urls[0] + formats.append({ + 'url': vid_url, + 'height': res, + }) + self._sort_formats(formats) - uploader = self._og_search_property('nick-name', webpage, 'uploader', default=None) + + uploader = self._og_search_property( + 'nick-name', webpage, 'uploader', default=None) + return { 'id': video_id, 'title': title, @@ -118,12 +120,17 @@ class WeiboMobileIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) # to get Referer url for genvisitor - webpage = self._download_webpage(url, video_id, note="visit the page") - js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) - weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) - page_info = weibo_info.get('status').get('page_info') - title = weibo_info.get('status').get('status_title') - uploader = weibo_info.get('status').get('user').get('screen_name') + webpage = self._download_webpage(url, video_id, note='visit the page') + + weibo_info = self._parse_json(self._search_regex( + r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};', + webpage, 'js_code', flags=re.DOTALL), + video_id, transform_source=js_to_json) + + status_data = weibo_info.get('status', {}) + page_info = status_data.get('page_info') + title = status_data['status_title'] + uploader = status_data.get('user', {}).get('screen_name') return { 'id': video_id, From 0f71de076144f59fae0b3b7e9a5251f44449cd9b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 9 Jan 2018 18:13:49 +0800 Subject: [PATCH 27/29] [ChangeLog] Update after #15079 --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index 9d37cdcef..51825ccfe 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Extractors ++ [weibo] Add extractor (#15079) * [bilibili] fix extraction (#15188) From 5b23845125ba20b83ab3a41fb8ff4b34e460a5dd Mon Sep 17 00:00:00 2001 From: Chih-Hsuan Yen Date: Tue, 9 Jan 2018 19:35:39 +0800 Subject: [PATCH 28/29] Credit @sprhawk for the Weibo extractor (#15079) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 7e012247c..5a090a3ef 100644 --- a/AUTHORS +++ b/AUTHORS @@ -231,3 +231,4 @@ John Dong Tatsuyuki Ishi Daniel Weber Kay Bouché +Yang Hongbo From 310ea4661ddaea002c86d0ebbf4663b6c943b8f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 9 Jan 2018 22:04:50 +0700 Subject: [PATCH 29/29] [ndr:embed:base] Make separate formats extraction non fatal (closes #15203) --- youtube_dl/extractor/ndr.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 07528d140..aec2ea133 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -190,10 +190,12 @@ class NDREmbedBaseIE(InfoExtractor): ext = determine_ext(src, None) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id='hds')) + src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, + f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls', entry_protocol='m3u8_native')) + src, video_id, 'mp4', m3u8_id='hls', + entry_protocol='m3u8_native', fatal=False)) else: quality = f.get('quality') ff = {