From 419bf5314f31a692f7b24edb285e2a85c8d86bcc Mon Sep 17 00:00:00 2001 From: ZhangXinYang <1069262011@qq.com> Date: Sun, 28 Aug 2016 17:11:58 +0800 Subject: [PATCH 1/2] commit --- youtube_dl/extractor/bilibili.py | 73 +++++++++----------------------- 1 file changed, 19 insertions(+), 54 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index d87c38a02..186f78864 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -1,28 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals +import urllib2 import calendar import datetime import re +import json from .common import InfoExtractor from ..compat import ( - compat_etree_fromstring, compat_str, compat_parse_qs, - compat_xml_parse_error, ) from ..utils import ( - ExtractorError, int_or_none, - float_or_none, - xpath_text, ) class BiliBiliIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P\d+)' _TESTS = [{ @@ -85,75 +80,46 @@ class BiliBiliIE(InfoExtractor): # BiliBili blocks keys from time to time. The current key is extracted from # the Android client # TODO: find the sign algorithm used in the flash player - _APP_KEY = '86385cdc024c0f6c' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) + api = 'http://www.bilibili.com/m/html5?aid=%s&page=1' % video_id + info = json.loads(self._download_webpage(api,video_id)) + url = info['src'] params = compat_parse_qs(self._search_regex( [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], webpage, 'player parameters')) cid = params['cid'][0] - - info_xml_str = self._download_webpage( - 'http://interface.bilibili.com/v_cdn_play', - cid, query={'appkey': self._APP_KEY, 'cid': cid}, - note='Downloading video info page') - - err_msg = None - durls = None - info_xml = None - try: - info_xml = compat_etree_fromstring(info_xml_str.encode('utf-8')) - except compat_xml_parse_error: - info_json = self._parse_json(info_xml_str, video_id, fatal=False) - err_msg = (info_json or {}).get('error_text') - else: - err_msg = xpath_text(info_xml, './message') - - if info_xml is not None: - durls = info_xml.findall('./durl') - if not durls: - if err_msg: - raise ExtractorError('%s said: %s' % (self.IE_NAME, err_msg), expected=True) - else: - raise ExtractorError('No videos found!') + + response = urllib2.Request(url) + html = urllib2.urlopen(response) + size = html.headers['Content-Length'] entries = [] + formats = [{ + 'url': url, + 'filesize': int_or_none(size), + }] - for durl in durls: - size = xpath_text(durl, ['./filesize', './size']) - formats = [{ - 'url': durl.find('./url').text, - 'filesize': int_or_none(size), - }] - for backup_url in durl.findall('./backup_url/url'): - formats.append({ - 'url': backup_url.text, - # backup URLs have lower priorities - 'preference': -2 if 'hd.mp4' in backup_url.text else -3, - }) + self._sort_formats(formats) - self._sort_formats(formats) - - entries.append({ - 'id': '%s_part%s' % (cid, xpath_text(durl, './order')), - 'duration': int_or_none(xpath_text(durl, './length'), 1000), + entries.append({ + 'id': '%s_part' % cid, + 'duration': int_or_none(size), 'formats': formats, }) - title = self._html_search_regex(']+title="([^"]+)">', webpage, 'title') - description = self._html_search_meta('description', webpage) + description = self._html_search_meta('description', webpage) datetime_str = self._html_search_regex( r']+datetime="([^"]+)"', webpage, 'upload time', fatal=False) timestamp = None if datetime_str: timestamp = calendar.timegm(datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple()) - # TODO 'view_count' requires deobfuscating Javascript info = { 'id': compat_str(cid), @@ -161,9 +127,8 @@ class BiliBiliIE(InfoExtractor): 'description': description, 'timestamp': timestamp, 'thumbnail': self._html_search_meta('thumbnailUrl', webpage), - 'duration': float_or_none(xpath_text(info_xml, './timelength'), scale=1000), + 'duration': int_or_none(size), } - uploader_mobj = re.search( r']+href="https?://space\.bilibili\.com/(?P\d+)"[^>]+title="(?P[^"]+)"', webpage) From 815b5f2fa60b63dcd5d1679518a66b8faaa87264 Mon Sep 17 00:00:00 2001 From: ZhangXinYang <1069262011@qq.com> Date: Mon, 29 Aug 2016 15:55:46 +0800 Subject: [PATCH 2/2] Update bilibili.py --- youtube_dl/extractor/bilibili.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 186f78864..3339fe6e6 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -1,11 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import urllib2 + import calendar import datetime import re -import json + from .common import InfoExtractor from ..compat import ( @@ -87,8 +87,8 @@ class BiliBiliIE(InfoExtractor): webpage = self._download_webpage(url, video_id) api = 'http://www.bilibili.com/m/html5?aid=%s&page=1' % video_id - info = json.loads(self._download_webpage(api,video_id)) - url = info['src'] + info = self._download_json(api,video_id) + urlh = info['src'] params = compat_parse_qs(self._search_regex( [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', @@ -96,13 +96,12 @@ class BiliBiliIE(InfoExtractor): webpage, 'player parameters')) cid = params['cid'][0] - response = urllib2.Request(url) - html = urllib2.urlopen(response) - size = html.headers['Content-Length'] - + request_headers =self. _request_webpage(urlh,video_id).headers + size = request_headers['Content-Length'] + entries = [] formats = [{ - 'url': url, + 'url': urlh, 'filesize': int_or_none(size), }]