From f226880c6d44098b5e99b05a83ed739e18d15690 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Wed, 14 Mar 2018 01:28:40 +0100
Subject: [PATCH 01/11] [tennistv] Add support for tennistv.com

---
 test/test_utils.py                 |   1 +
 youtube_dl/extractor/extractors.py |   1 +
 youtube_dl/extractor/tennistv.py   | 113 +++++++++++++++++++++++++++++
 youtube_dl/utils.py                |   5 ++
 4 files changed, 120 insertions(+)
 create mode 100644 youtube_dl/extractor/tennistv.py

diff --git a/test/test_utils.py b/test/test_utils.py
index f92c65b59..a1fe6fdb2 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -352,6 +352,7 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_timestamp('2017-03-30T17:52:41Q'), 1490896361)
         self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540)
         self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140)
+        self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363)
 
     def test_determine_ext(self):
         self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 6d6ae89f8..3bde40eb3 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1062,6 +1062,7 @@ from .telequebec import (
 )
 from .teletask import TeleTaskIE
 from .telewebion import TelewebionIE
+from .tennistv import TennisTVIE
 from .testurl import TestURLIE
 from .tf1 import TF1IE
 from .tfo import TFOIE

diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py
new file mode 100644
index 000000000..601a17e57
--- /dev/null
+++ b/youtube_dl/extractor/tennistv.py
@@ -0,0 +1,113 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+
+from ..utils import (
+    ExtractorError,
+    unified_timestamp,
+)
+
+
+class TennisTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tennistv\.com/videos/(?P<id>[-a-z0-9]+)'
+    _TEST = {
+        'url': 'https://www.tennistv.com/videos/indian-wells-2018-verdasco-fritz',
+        'info_dict': {
+            'id': 'indian-wells-2018-verdasco-fritz',
+            'ext': 'mp4',
+            'title': 'Fernando Verdasco v Taylor Fritz',
+            'description': 're:^After his stunning victory.{174}$',
+            'thumbnail': 'https://atp-prod.akamaized.net/api/images/v1/images/112831/landscape/1242/0',
+            'timestamp': 1521017381,
+            'upload_date': '20180314',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires email and password of a subscribed account',
+    }
+    _NETRC_MACHINE = 'tennistv'
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if not username or not password:
+            raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+
+        login_form = {
+            'Email': username,
+            'Password': password,
+        }
+        login_json = json.dumps(login_form)
+        headers = {
+            'content-type': 'application/json',
+            'Referer': 'https://www.tennistv.com/login',
+            'Origin': 'https://www.tennistv.com',
+        }
+
+        login_result = self._download_json(
+            'https://www.tennistv.com/api/users/v1/login', None,
+            note='Logging in',
+            errnote='Login failed (wrong password?)',
+            headers=headers,
+            data=login_json)
+
+        if login_result['error']['errorCode']:
+            raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, login_result['error']['errorMessage']))
+
+        if login_result['entitlement'] != 'SUBSCRIBED':
+            self.report_warning('%s may not be subscribed to %s.' % (username, self.IE_NAME))
+
+        self._session_token = login_result['sessionToken']
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        internal_id = self._search_regex(r'video=([0-9]+)', webpage, 'internal video id')
+
+        headers = {
+            'Origin': 'https://www.tennistv.com',
+            'authorization': 'ATP %s' % self._session_token,
+            'content-type': 'application/json',
+            'Referer': url,
+        }
+        check_data = {
+            'videoID': internal_id,
+            'VideoUrlType': 'HLSV3',
+        }
+        check_json = json.dumps(check_data)
+        check_result = self._download_json(
+            'https://www.tennistv.com/api/users/v1/entitlementchecknondiva',
+            video_id, note='Checking video authorization', headers=headers, data=check_json)
+        formats = self._extract_m3u8_formats(check_result['contentUrl'], video_id, ext='mp4')
+
+        vdata_url = 'https://www.tennistv.com/api/channels/v1/de/none/video/%s' % video_id
+        vdata = self._download_json(vdata_url, video_id)
+
+        timestamp = unified_timestamp(vdata['timestamp'])
+        thumbnail = vdata['video']['thumbnailUrl']
+        description = vdata['displayText']['description']
+        title = vdata['video']['title']
+
+        series = vdata['tour']
+        venue = vdata['displayText']['venue']
+        round_str = vdata['seo']['round']
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+            'timestamp': timestamp,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'series': series,
+            'season': venue,
+            'episode': round_str,
+        }
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index a21455f70..027d12785 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1211,6 +1211,11 @@ def unified_timestamp(date_str, day_first=True):
     if m:
         date_str = date_str[:-len(m.group('tz'))]
 
+    # Python only supports microseconds, so remove nanoseconds
+    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
+    if m:
+        date_str = m.group(1)
+
     for expression in date_formats(day_first):
         try:
             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)

From f3672ac522e85b3eae339a95d34f46e92d8ebaa3 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Wed, 14 Mar 2018 09:55:46 +0100
Subject: [PATCH 02/11] [line] lint (remove space on empty line)

---
 youtube_dl/extractor/line.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/line.py b/youtube_dl/extractor/line.py
index 8414312fc..7f5fa446e 100644
--- a/youtube_dl/extractor/line.py
+++ b/youtube_dl/extractor/line.py
@@ -77,7 +77,7 @@ class LineTVIE(InfoExtractor):
         title = self._og_search_title(webpage)
 
         # like_count requires an additional API request https://tv.line.me/api/likeit/getCount
-        
+
         return {
             'id': video_id,
             'title': title,

From e6e68069f6fe25fe4a2b72487be840ba2ec3c5c6 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Wed, 14 Mar 2018 11:23:09 +0100
Subject: [PATCH 03/11] [tennistv] Correctly encode POST parameters

In Python 3.x, the POST parameters must be bytes, not str.
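
json.dumps() returns text (str) on both Python versions; Python 2's
urllib accepted str request bodies, but Python 3's urllib.request
rejects them, so the payload has to be encoded before it is handed to
_download_json(). A minimal standalone sketch of the change (the
credential values here are placeholders, not real data):

    import json

    login_form = {
        'Email': 'user@example.com',   # placeholder
        'Password': 'correct-horse',   # placeholder
    }
    # str: accepted by Python 2, rejected as a POST body by Python 3
    login_json = json.dumps(login_form)
    # bytes: valid as a POST body on both
    login_json = json.dumps(login_form).encode('utf-8')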
--- ChangeLog | 2 +- youtube_dl/extractor/tennistv.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index f2f0d6143..ad639c805 100644 --- a/ChangeLog +++ b/ChangeLog @@ -2,7 +2,7 @@ version Extractors + [line] Add support for tv.line.me (#9427) - ++ [tennistv] Add support for tennistv.com version 2018.03.10 diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py index 601a17e57..def29b6fa 100644 --- a/youtube_dl/extractor/tennistv.py +++ b/youtube_dl/extractor/tennistv.py @@ -40,7 +40,7 @@ class TennisTVIE(InfoExtractor): 'Email': username, 'Password': password, } - login_json = json.dumps(login_form) + login_json = json.dumps(login_form).encode('utf-8') headers = { 'content-type': 'application/json', 'Referer': 'https://www.tennistv.com/login', @@ -81,7 +81,7 @@ class TennisTVIE(InfoExtractor): 'videoID': internal_id, 'VideoUrlType': 'HLSV3', } - check_json = json.dumps(check_data) + check_json = json.dumps(check_data).encode('utf-8') check_result = self._download_json( 'https://www.tennistv.com/api/users/v1/entitlementchecknondiva', video_id, note='Checking video authorization', headers=headers, data=check_json) From b848a4ca1a4bb5b2f64eb551d1bbd73ddcd2e9b1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 14 Mar 2018 11:48:20 +0100 Subject: [PATCH 04/11] [tennistv] Remove duplicate key in dictionary --- youtube_dl/extractor/tennistv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py index def29b6fa..0c6f70784 100644 --- a/youtube_dl/extractor/tennistv.py +++ b/youtube_dl/extractor/tennistv.py @@ -104,7 +104,6 @@ class TennisTVIE(InfoExtractor): 'title': title, 'description': description, 'formats': formats, - 'timestamp': timestamp, 'thumbnail': thumbnail, 'timestamp': timestamp, 'series': series, From b8c6badc96fa52e1851d2c5803cb9a1563bf9de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Mar 2018 22:38:30 +0700 Subject: [PATCH 05/11] [soundcloud] Update client id (closes #15866) --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 1ca310b90..46332e5c2 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -157,7 +157,7 @@ class SoundcloudIE(InfoExtractor): }, ] - _CLIENT_ID = 'DQskPX1pntALRzMp4HSxya3Mc0AO66Ro' + _CLIENT_ID = 'LvWovRaJZlWCHql0bISuum8Bd2KX79mb' @staticmethod def _extract_urls(webpage): From c95dfb050942e353fa39f83d02bf08dedb13963a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Mar 2018 22:45:05 +0700 Subject: [PATCH 06/11] [ChangeLog] Actualize [ci skip] --- ChangeLog | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index ad639c805..1b6e62135 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,12 @@ version Extractors -+ [line] Add support for tv.line.me (#9427) +* [soundcloud] Update client id (#15866) + [tennistv] Add support for tennistv.com ++ [line] Add support for tv.line.me (#9427) +* [xnxx] Fix extraction (#15817) +* [njpwworld] Fix authentication (#15815) + version 2018.03.10 From 46c6742d4f1b1afa2d6dc787e8b0b119f9c5ee98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Mar 2018 22:49:22 +0700 Subject: [PATCH 07/11] release 2018.03.14 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ 
youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index bc0c5ef18..481e2ed74 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.10** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.14** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.03.10 +[debug] youtube-dl version 2018.03.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 1b6e62135..47736e076 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.03.14 Extractors * [soundcloud] Update client id (#15866) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index cb11f1b42..80358bb14 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -427,6 +427,7 @@ - **limelight** - **limelight:channel** - **limelight:channel_list** + - **LineTV** - **LiTV** - **LiveLeak** - **LiveLeakEmbed** @@ -829,6 +830,7 @@ - **TeleQuebecLive** - **TeleTask** - **Telewebion** + - **TennisTV** - **TF1** - **TFO** - **TheIntercept** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a35d10818..6ce11c39b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.03.10' +__version__ = '2018.03.14' From 27b1c73f14617eec4286cc85c68c87b6635cfff3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 15 Mar 2018 14:33:36 +0100 Subject: [PATCH 08/11] [instagram] fix user videos extraction(fixes #15858) --- youtube_dl/extractor/instagram.py | 106 +++++++++++++----------------- 1 file changed, 47 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index a77f619d2..ac9d92a8d 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals -import itertools +import json import re from .common import InfoExtractor @@ -238,70 +238,58 @@ class InstagramUserIE(InfoExtractor): } def _entries(self, uploader_id): - query = { - '__a': 1, - } - - def get_count(kind): + def get_count(suffix): 
return int_or_none(try_get( - node, lambda x: x['%ss' % kind]['count'])) + node, lambda x: x['edge_media_' + suffix]['count'])) - for page_num in itertools.count(1): - page = self._download_json( - 'https://instagram.com/%s/' % uploader_id, uploader_id, - note='Downloading page %d' % page_num, - fatal=False, query=query) - if not page: - break - - nodes = try_get(page, lambda x: x['user']['media']['nodes'], list) - if not nodes: - break - - max_id = None - - for node in nodes: - node_id = node.get('id') - if node_id: - max_id = node_id - - if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: - continue - video_id = node.get('code') - if not video_id: - continue - - info = self.url_result( - 'https://instagram.com/p/%s/' % video_id, - ie=InstagramIE.ie_key(), video_id=video_id) - - description = try_get( - node, [lambda x: x['caption'], lambda x: x['text']['id']], - compat_str) - thumbnail = node.get('thumbnail_src') or node.get('display_src') - timestamp = int_or_none(node.get('date')) - - comment_count = get_count('comment') - like_count = get_count('like') - view_count = int_or_none(node.get('video_views')) - - info.update({ - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'comment_count': comment_count, - 'like_count': like_count, - 'view_count': view_count, + edges = self._download_json( + 'https://www.instagram.com/graphql/query/', uploader_id, query={ + 'query_hash': '472f257a40c653c64c666ce877d59d2b', + 'variables': json.dumps({ + 'id': uploader_id, + 'first': 999999999, }) + })['data']['user']['edge_owner_to_timeline_media']['edges'] - yield info + for edge in edges: + node = edge['node'] - if not max_id: - break + if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: + continue + video_id = node.get('shortcode') + if not video_id: + continue - query['max_id'] = max_id + info = self.url_result( + 'https://instagram.com/p/%s/' % video_id, + ie=InstagramIE.ie_key(), video_id=video_id) + + description = try_get( + node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) + thumbnail = node.get('thumbnail_src') or node.get('display_src') + timestamp = int_or_none(node.get('taken_at_timestamp')) + + comment_count = get_count('to_comment') + like_count = get_count('preview_like') + view_count = int_or_none(node.get('video_view_count')) + + info.update({ + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'comment_count': comment_count, + 'like_count': like_count, + 'view_count': view_count, + }) + + yield info def _real_extract(self, url): - uploader_id = self._match_id(url) + username = self._match_id(url) + uploader_id = self._download_json( + 'https://instagram.com/%s/' % username, username, query={ + '__a': 1, + })['graphql']['user']['id'] return self.playlist_result( - self._entries(uploader_id), uploader_id, uploader_id) + self._entries(uploader_id), username, username) From 8e70c1bfac98b3d0d304b66ff1d616dd26522acb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 15 Mar 2018 22:37:41 +0700 Subject: [PATCH 09/11] [heise] Improve extraction (closes #15496, closes #15784, closes #15026) --- youtube_dl/extractor/heise.py | 76 +++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 8f49f52ef..5c03780a3 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -7,6 +7,7 @@ from 
.youtube import YoutubeIE
 from ..utils import (
     determine_ext,
     int_or_none,
+    NO_DEFAULT,
     parse_iso8601,
     smuggle_url,
     xpath_text,
@@ -16,18 +17,19 @@ from ..utils import (
 class HeiseIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?heise\.de/(?:[^/]+/)+[^/]+-(?P<id>[0-9]+)\.html'
     _TESTS = [{
+        # kaltura embed
         'url': 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html',
-        'md5': 'ffed432483e922e88545ad9f2f15d30e',
         'info_dict': {
-            'id': '2404147',
+            'id': '1_kkrq94sm',
             'ext': 'mp4',
             'title': "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone",
-            'format_id': 'mp4_720p',
-            'timestamp': 1411812600,
-            'upload_date': '20140927',
+            'timestamp': 1512734959,
+            'upload_date': '20171208',
             'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20',
-            'thumbnail': r're:^https?://.*/gallery/$',
-        }
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         # YouTube embed
         'url': 'http://www.heise.de/newsticker/meldung/Netflix-In-20-Jahren-vom-Videoverleih-zum-TV-Revolutionaer-3814130.html',
         'md5': 'e403d2b43fea8e405e88e3f8623909f1',
         'info_dict': {
             'id': '6kmWbXleKW4',
             'ext': 'mp4',
             'title': 'NEU IM SEPTEMBER | Netflix',
             'description': 'md5:2131f3c7525e540d5fd841de938bd452',
             'upload_date': '20170830',
             'uploader': 'Netflix Deutschland, Österreich und Schweiz',
             'uploader_id': 'netflixdach',
             'categories': ['Entertainment'],
             'tags': 'count:27',
             'age_limit': 0,
         },
     }, {
         'url': 'https://www.heise.de/video/artikel/nachgehakt-Wie-sichert-das-c-t-Tool-Restric-tor-Windows-10-ab-3700244.html',
-        'md5': '4b58058b46625bdbd841fc2804df95fc',
         'info_dict': {
             'id': '1_ntrmio2s',
+            'ext': 'mp4',
+            'title': "nachgehakt: Wie sichert das c't-Tool Restric'tor Windows 10 ab?",
+            'description': 'md5:47e8ffb6c46d85c92c310a512d6db271',
             'timestamp': 1512470717,
             'upload_date': '20171205',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.heise.de/ct/artikel/c-t-uplink-20-8-Staubsaugerroboter-Xiaomi-Vacuum-2-AR-Brille-Meta-2-und-Android-rooten-3959893.html',
+        'info_dict': {
+            'id': '1_59mk80sf',
             'ext': 'mp4',
-            'title': 'ct10 nachgehakt hos restrictor',
+            'title': "c't uplink 20.8: Staubsaugerroboter Xiaomi Vacuum 2, AR-Brille Meta 2 und Android rooten",
+            'description': 'md5:f50fe044d3371ec73a8f79fcebd74afc',
+            'timestamp': 1517567237,
+            'upload_date': '20180202',
         },
         'params': {
             'skip_download': True,
         },
@@ -72,19 +87,40 @@ class HeiseIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        title = self._html_search_meta('fulltitle', webpage, default=None)
-        if not title or title == "c't":
-            title = self._search_regex(
-                r'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"',
-                webpage, 'title')
+        def extract_title(default=NO_DEFAULT):
+            title = self._html_search_meta(
+                ('fulltitle', 'title'), webpage, default=None)
+            if not title or title == "c't":
+                title = self._search_regex(
+                    r'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"',
+                    webpage, 'title', default=None)
+            if not title:
+                title = self._html_search_regex(
+                    r'<h1[^>]+\bclass=["\']article_page_title[^>]+>(.+?)<',
+                    webpage, 'title', default=default)
+            return title
 
-        yt_urls = YoutubeIE._extract_urls(webpage)
-        if yt_urls:
-            return self.playlist_from_matches(yt_urls, video_id, title, ie=YoutubeIE.ie_key())
+        title = extract_title(default=None)
+        description = self._og_search_description(
+            webpage, default=None) or self._html_search_meta(
+            'description', webpage)
 
         kaltura_url = KalturaIE._extract_url(webpage)
         if kaltura_url:
-            return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
+            return {
+                '_type': 'url_transparent',
+                'url': smuggle_url(kaltura_url, {'source_url': url}),
+                'ie_key': KalturaIE.ie_key(),
+                'title': title,
+                'description': description,
+            }
+
+        yt_urls = YoutubeIE._extract_urls(webpage)
+        if yt_urls:
+            return self.playlist_from_matches(
+                yt_urls, video_id, title, ie=YoutubeIE.ie_key())
+
+        title = extract_title()
 
         container_id = self._search_regex(
             r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"',
@@ -115,10 +151,6 @@ class HeiseIE(InfoExtractor):
         })
         self._sort_formats(formats)
 
-        description = self._og_search_description(
-            webpage, default=None) or self._html_search_meta(
-            'description', webpage)
-
         return {
             'id': video_id,
             'title': title,
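
A note on the NO_DEFAULT import added above: youtube-dl's utils module
defines NO_DEFAULT as a unique sentinel object, which lets a helper such
as extract_title() tell "the caller supplied default=None" apart from
"the caller supplied no default at all", in which case a miss should
raise. A rough, self-contained sketch of that pattern (illustrative
code, not the actual heise helper):

    import re

    NO_DEFAULT = object()  # unique sentinel, never equal to a real default

    def find_title(page, default=NO_DEFAULT):
        m = re.search(r'<h1[^>]*>([^<]+)', page)
        if m:
            return m.group(1)
        if default is NO_DEFAULT:
            # no fallback was supplied, so a miss is a hard error
            raise ValueError('unable to extract title')
        return default  # may legitimately be None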
From 3526c3043b5d6ce64d9cf0ccab20ef0b7a1e6a13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Fri, 16 Mar 2018 00:19:17 +0700
Subject: [PATCH 10/11] [bilibili] Fix and improve extraction (closes #15048,
 closes #15430, closes #15622, closes #15863)
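
The playurl API is queried with an md5 signature computed over the query
string plus a secret key, and the new RENDITIONS tuple retries with the
legacy parameter set ('quality=2&type=mp4') before giving up; only the
last attempt is fatal. A condensed sketch of the request signing, based
only on the lines in this diff (the secret _BILIBILI_KEY is defined
elsewhere in the extractor and is elided here):

    import hashlib

    APP_KEY = '84956560bc028eb7'  # _APP_KEY from the extractor
    SECRET = '...'                # _BILIBILI_KEY, not shown in this patch

    def signed_playurl_query(cid, rendition='qn=80&quality=80&type='):
        payload = 'appkey=%s&cid=%s&otype=json&%s' % (APP_KEY, cid, rendition)
        sign = hashlib.md5((payload + SECRET).encode('utf-8')).hexdigest()
        return '%s&sign=%s' % (payload, sign)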
---
 youtube_dl/extractor/bilibili.py | 145 ++++++++++++++++++++-----------
 1 file changed, 93 insertions(+), 52 deletions(-)

diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index beffcecd0..b898223e3 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -27,14 +27,14 @@ class BiliBiliIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.bilibili.tv/video/av1074402/',
-        'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e',
+        'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
         'info_dict': {
             'id': '1074402',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': '【金坷垃】金泡沫',
             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
-            'duration': 308.315,
-            'timestamp': 1398012660,
+            'duration': 308.067,
+            'timestamp': 1398012678,
             'upload_date': '20140420',
             'thumbnail': r're:^https?://.+\.jpg',
             'uploader': '菊子桑',
@@ -59,17 +59,38 @@ class BiliBiliIE(InfoExtractor):
         'url': 'http://www.bilibili.com/video/av8903802/',
         'info_dict': {
             'id': '8903802',
-            'ext': 'mp4',
             'title': '阿滴英文|英文歌分享#6 "Closer',
             'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
-            'uploader': '阿滴英文',
-            'uploader_id': '65880958',
-            'timestamp': 1488382620,
-            'upload_date': '20170301',
-        },
-        'params': {
-            'skip_download': True,  # Test metadata only
         },
+        'playlist': [{
+            'info_dict': {
+                'id': '8903802_part1',
+                'ext': 'flv',
+                'title': '阿滴英文|英文歌分享#6 "Closer',
+                'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
+                'uploader': '阿滴英文',
+                'uploader_id': '65880958',
+                'timestamp': 1488382634,
+                'upload_date': '20170301',
+            },
+            'params': {
+                'skip_download': True,  # Test metadata only
+            },
+        }, {
+            'info_dict': {
+                'id': '8903802_part2',
+                'ext': 'flv',
+                'title': '阿滴英文|英文歌分享#6 "Closer',
+                'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
+                'uploader': '阿滴英文',
+                'uploader_id': '65880958',
+                'timestamp': 1488382634,
+                'upload_date': '20170301',
+            },
+            'params': {
+                'skip_download': True,  # Test metadata only
+            },
+        }]
     }]
 
     _APP_KEY = '84956560bc028eb7'
@@ -92,9 +113,13 @@ class BiliBiliIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         if 'anime/' not in url:
-            cid = compat_parse_qs(self._search_regex(
-                [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
-                 r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
+            cid = self._search_regex(
+                r'cid(?:["\']:|=)(\d+)', webpage, 'cid',
+                default=None
+            ) or compat_parse_qs(self._search_regex(
+                [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
+                 r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)',
+                 r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
                 webpage, 'player parameters'))['cid'][0]
         else:
             if 'no_bangumi_tip' not in smuggled_data:
@@ -114,53 +139,66 @@ class BiliBiliIE(InfoExtractor):
             self._report_error(js)
             cid = js['result']['cid']
 
-        payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid)
-        sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
-
         headers = {
             'Referer': url
         }
         headers.update(self.geo_verification_headers())
 
-        video_info = self._download_json(
-            'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign),
-            video_id, note='Downloading video info page',
-            headers=headers)
-
-        if 'durl' not in video_info:
-            self._report_error(video_info)
-
         entries = []
 
-        for idx, durl in enumerate(video_info['durl']):
-            formats = [{
-                'url': durl['url'],
-                'filesize': int_or_none(durl['size']),
-            }]
-            for backup_url in durl.get('backup_url', []):
-                formats.append({
-                    'url': backup_url,
-                    # backup URLs have lower priorities
-                    'preference': -2 if 'hd.mp4' in backup_url else -3,
+        RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
+        for num, rendition in enumerate(RENDITIONS, start=1):
+            payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
+            sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
+
+            video_info = self._download_json(
+                'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign),
+                video_id, note='Downloading video info page',
+                headers=headers, fatal=num == len(RENDITIONS))
+
+            if not video_info:
+                continue
+
+            if 'durl' not in video_info:
+                if num < len(RENDITIONS):
+                    continue
+                self._report_error(video_info)
+
+            for idx, durl in enumerate(video_info['durl']):
+                formats = [{
+                    'url': durl['url'],
+                    'filesize': int_or_none(durl['size']),
+                }]
+                for backup_url in durl.get('backup_url', []):
+                    formats.append({
+                        'url': backup_url,
+                        # backup URLs have lower priorities
+                        'preference': -2 if 'hd.mp4' in backup_url else -3,
+                    })
+
+                for a_format in formats:
+                    a_format.setdefault('http_headers', {}).update({
+                        'Referer': url,
+                    })
+
+                self._sort_formats(formats)
+
+                entries.append({
+                    'id': '%s_part%s' % (video_id, idx),
+                    'duration': float_or_none(durl.get('length'), 1000),
+                    'formats': formats,
                 })
+            break
 
-            for a_format in formats:
-                a_format.setdefault('http_headers', {}).update({
-                    'Referer': url,
-                })
-
-            self._sort_formats(formats)
-
-            entries.append({
-                'id': '%s_part%s' % (video_id, idx),
-                'duration': float_or_none(durl.get('length'), 1000),
-                'formats': formats,
-            })
-
-        title = self._html_search_regex('<h1[^>]*>([^<]+)', webpage, 'title')
+        title = self._html_search_regex(
+            ('<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
+             '(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
+            group='title')
         description = self._html_search_meta('description', webpage)
         timestamp = unified_timestamp(self._html_search_regex(
-            r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', default=None))
+            r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
+            default=None) or self._html_search_meta(
+                'uploadDate', webpage, 'timestamp', default=None))
         thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
 
         # TODO 'view_count' requires deobfuscating Javascript
@@ -174,13 +212,16 @@ class BiliBiliIE(InfoExtractor):
         }
 
         uploader_mobj = re.search(
-            r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"',
+            r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)',
            webpage)
         if uploader_mobj:
             info.update({
                 'uploader': uploader_mobj.group('name'),
                 'uploader_id': uploader_mobj.group('id'),
             })
+        if not info.get('uploader'):
+            info['uploader'] = self._html_search_meta(
+                'author', webpage, 'uploader', default=None)
 
         for entry in entries:
             entry.update(info)

From d12396085754a597c2c5e621e4a68471871e2cfd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Fri, 16 Mar 2018 03:18:53 +0700
Subject: [PATCH 11/11] [bilibili] Switch to v2 playurl API

---
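Only the endpoint path changes here; the payload construction and md5
signing from the previous patch are untouched, so schematically the
request URL becomes:

    # before: 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign)
    # after:
    url = 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign)

assuming payload and sign are built as shown in the preceding patch.
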
youtube_dl/extractor/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index b898223e3..90697c4a7 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -152,7 +152,7 @@ class BiliBiliIE(InfoExtractor): sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() video_info = self._download_json( - 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), + 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), video_id, note='Downloading video info page', headers=headers, fatal=num == len(RENDITIONS))