1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-03-10 20:37:15 +08:00

Refine Bilibili site support for url like:

+ https://www.bilibili.com/video/av28152675/?p=2
+ https://www.bilibili.com/video/av28152675/?p=1

Before this commit, the "?p=<page_id>" parameter was simply ignored, and
this tool could only download the video corresponding to "?p=1".

2 test cases are added.
This commit is contained in:
suhang 2019-01-12 13:42:05 +08:00
parent ed8db0a25c
commit e91a5dd848

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
import hashlib import hashlib
import re import re
import json
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
@ -25,7 +26,42 @@ from ..utils import (
class BiliBiliIE(InfoExtractor): class BiliBiliIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\d+)/play#)(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\d+)/play#)(?P<id>\d+)'
# url like:
# + https://www.bilibili.com/video/av28152675
# + https://www.bilibili.com/video/av28152675/?p=2
_VALID_MULTI_P_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/video/av(?P<id>\d+)(/\?p=(?P<p_id>\d+))?'
_TESTS = [{ _TESTS = [{
'url': 'https://www.bilibili.com/video/av28152675/?p=1',
'md5': '3d0af4158af2e628eb2fc2cad2390da2',
'info_dict': {
'id': '28152675_p1',
'ext': 'flv',
'title': '【2020宏观系列-Ray Dalio】 桥水 达里奥的观点汇总__RAY DALIO- US economy looks like 1937 and we need to be careful',
'description': 'YouTube2020宏观系列一代传奇桥水达里奥的观点汇总。1.美国经济就像1937年 US economy looks like 19372.有70%的概率2020前美国经济陷入衰退3.我们正处于一生一见的长债务周期末端持续补充',
'duration': 222.791,
'timestamp': 1533037003,
'upload_date': '20180731',
'thumbnail': r're:^https?://.+\.jpg',
'uploader': 'RaymondWarrior',
'uploader_id': '599478',
},
}, {
'url': 'https://www.bilibili.com/video/av28152675/?p=2',
'md5': '2ebe1cc8ed1ff74d6c0871b54369ec83',
'info_dict': {
'id': '28152675_p2',
'ext': 'flv',
'title': '【2020宏观系列-Ray Dalio】 桥水 达里奥的观点汇总__Ray Dalio- 70% Chance Of Recession By 2020',
'description': 'YouTube2020宏观系列一代传奇桥水达里奥的观点汇总。1.美国经济就像1937年 US economy looks like 19372.有70%的概率2020前美国经济陷入衰退3.我们正处于一生一见的长债务周期末端持续补充',
'duration': 237.481,
'timestamp': 1533037003,
'upload_date': '20180731',
'thumbnail': r're:^https?://.+\.jpg',
'uploader': 'RaymondWarrior',
'uploader_id': '599478',
},
}, {
'url': 'http://www.bilibili.tv/video/av1074402/', 'url': 'http://www.bilibili.tv/video/av1074402/',
'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
'info_dict': { 'info_dict': {
@ -104,15 +140,45 @@ class BiliBiliIE(InfoExtractor):
else: else:
raise ExtractorError('Can\'t extract Bangumi episode ID') raise ExtractorError('Can\'t extract Bangumi episode ID')
def _extract_pages_info(self, pages_array_json_str, page_id):
why_wrong_msg = "This may be caused by: 1. Your url is not correct. " + \
"2. Bilibili has change the format (someone needs to update this tool for the change.)"
try:
pages_array = json.loads(pages_array_json_str)
except Exception:
raise ExtractorError(
"Failed to parse \"pages\" info as JSON for your Bilibili url. " + why_wrong_msg
)
if not isinstance(pages_array, type([])):
raise ExtractorError(
"\"pages\" info JSON object is not an Array for your Bilibili url. " + why_wrong_msg
)
for page_obj in pages_array:
if not isinstance(page_obj, type({})):
raise ExtractorError(
"\"page\" object in \"pages\" JSON Array is not an Map for your Bilibili url. " + why_wrong_msg
)
if page_id == str(page_obj["page"]):
return (page_obj["cid"], page_obj["part"])
raise ExtractorError(
"We can't find page {} in \"pages\" Object from your Bilibili url. ".format(page_id) + why_wrong_msg
)
def _real_extract(self, url): def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {}) url, smuggled_data = unsmuggle_url(url, {})
mobj = re.match(self._VALID_URL, url) page_name = None
if 'anime/' not in url:
mobj = re.match(self._VALID_MULTI_P_URL, url)
video_id = mobj.group('id') video_id = mobj.group('id')
anime_id = mobj.group('anime_id') anime_id = None
page_id = mobj.group('p_id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
if 'anime/' not in url: if page_id is None:
cid = self._search_regex( cid = self._search_regex(
r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
default=None default=None
@ -122,6 +188,18 @@ class BiliBiliIE(InfoExtractor):
r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
webpage, 'player parameters'))['cid'][0] webpage, 'player parameters'))['cid'][0]
else: else:
pages_array_json_str = self._search_regex(
r'(?:"pages":)(\[[^\[\]]*\])', webpage, 'pages',
default=None
)
(cid, page_name) = self._extract_pages_info(pages_array_json_str, page_id)
else:
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
anime_id = mobj.group('anime_id')
webpage = self._download_webpage(url, video_id)
if 'no_bangumi_tip' not in smuggled_data: if 'no_bangumi_tip' not in smuggled_data:
self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run youtube-dl with %s' % ( self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run youtube-dl with %s' % (
video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id)))
@ -227,7 +305,11 @@ class BiliBiliIE(InfoExtractor):
entry.update(info) entry.update(info)
if len(entries) == 1: if len(entries) == 1:
if page_name is not None:
entries[0]['id'] += "_p{}".format(page_id)
entries[0]['title'] += "__{}".format(page_name)
return entries[0] return entries[0]
else: else:
for idx, entry in enumerate(entries): for idx, entry in enumerate(entries):
entry['id'] = '%s_part%d' % (video_id, (idx + 1)) entry['id'] = '%s_part%d' % (video_id, (idx + 1))