From 1a1c9e3f7122c93da81977c79de3f510c4c8bfc2 Mon Sep 17 00:00:00 2001 From: Kyle Date: Fri, 24 May 2019 18:01:49 +0900 Subject: [PATCH 1/9] Add Yahoo Japan News extractor. --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/yahoo.py | 286 +++++++++++++++++++++++++++++ 2 files changed, 287 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 15f54a214..06de556b7 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1448,6 +1448,7 @@ from .yahoo import ( YahooSearchIE, YahooGyaOPlayerIE, YahooGyaOIE, + YahooJapanNewsIE, ) from .yandexdisk import YandexDiskIE from .yandexmusic import ( diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index a3b5f00c8..4d6d04c5c 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import hashlib import itertools import json import re from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( + compat_str, compat_urllib_parse, compat_urlparse, ) @@ -18,7 +20,9 @@ from ..utils import ( int_or_none, mimetype2ext, smuggle_url, + try_get, unescapeHTML, + url_or_none, ) from .brightcove import ( @@ -556,3 +560,285 @@ class YahooGyaOIE(InfoExtractor): 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'), YahooGyaOPlayerIE.ie_key(), video_id)) return self.playlist_result(entries, program_id) + + +class YahooJapanNewsIE(InfoExtractor): + IE_NAME = 'yahoo:japannews' + IE_DESC = 'Yahoo! Japan News' + _VALID_URL = r'https?://(?P(?:news|headlines)\.yahoo\.co\.jp)(/[^\d]*(?P\d[\d-]*\d))?' + _TESTS = [ + { + 'url': 'https://headlines.yahoo.co.jp/videonews/nnn?a=20190531-00000180-nnn-int', + 'info_dict': { + 'id': '20190531-00000180', + 'ext': 'mp4', + 'title': '北“対米担当特別代表を銃殺”韓国紙報じる(日本テレビ系(NNN)) - Yahoo!ニュース', + 'description': '韓国の主要紙である朝鮮日報は、2回目の米朝首脳会談が決裂した責任を問われ、北朝鮮 - Yahoo!ニュース(日本テレビ系(NNN))', + 'thumbnail': r're:^https?://.*\.[a-zA-Z\d]{3,4}$', + }, + 'params': {'skip_download': True}, + }, + { + 'url': 'https://headlines.yahoo.co.jp/hl?a=20190529-00010015-houdoukvq-soci', + 'info_dict': { + 'id': '20190529-00010015', + 'ext': 'mp4', + 'title': '高校屋上から男子高校生 転落\u3000目撃 女子生徒パニックで搬送(FNN.jpプライムオンライン) - Yahoo!ニュース', + 'description': '29日午後、宮崎市の高校の屋上から男子高校生が転落し、重傷となっている。29日午後2 - Yahoo!ニュース(FNN.jpプライムオンライン)', + 'thumbnail': r're:^https?://.*\.[a-zA-Z\d]{3,4}$', + }, + 'params': {'skip_download': True}, + }, + { + 'url': 'https://headlines.yahoo.co.jp/videonews/', + 'info_dict': { + 'id': 'headlines.yahoo.co.jp', + 'ext': 'mp4', + 'title': '映像ニュース - Yahoo!ニュース', + 'description': 'テレビ局などが配信する映像ニュースを掲載。', + }, + 'params': {'skip_download': True}, + }, + { + 'url': 'https://news.yahoo.co.jp', + 'info_dict': { + 'id': 'news.yahoo.co.jp', + 'ext': 'mp4', + 'title': 'Yahoo!ニュース', + 'description': 'Yahoo!ニュースは、新聞・通信社が配信するニュースのほか、映像、雑誌や個人の書き手が執筆する記事など多種多様なニュースを掲載しています。', + }, + 'params': {'skip_download': True}, + }, + { + 'url': 'https://news.yahoo.co.jp/byline/fujitatakanori/20190528-00127666/', + 'only_matching': True, + }, + {'url': 'https://news.yahoo.co.jp/pickup/6325141', 'only_matching': True}, + ] + _USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36' + _ORIGIN = 'https://s.yimg.jp' + + def _extract_formats(self, json_data, content_id): + formats = [] + + video_data = try_get( + json_data, + lambda x: x['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'], + list, + ) + if video_data: + # Article page + for vid in video_data: + delivery = vid.get('delivery') + url = url_or_none(vid.get('Url')) + if not delivery or not url: + continue + elif delivery == 'hls': + formats.extend( + self._extract_m3u8_formats( + url, + content_id, + 'mp4', + entry_protocol='m3u8_native', + m3u8_id='hls', + fatal=False, + ) + ) + elif delivery == 'progressive': + formats.append( + { + 'url': url, + 'format_id': 'http-%s' % compat_str(vid.get('bitrate', '')), + 'ext': determine_ext(url), + 'height': int_or_none(vid.get('height')), + 'width': int_or_none(vid.get('width')), + 'btr': int_or_none(vid.get('bitrate')), + } + ) + else: + # Headline page with multiple videos + for vid in json_data.get('videos', []): + for src in vid.get('sources', []): + url = url_or_none(src.get('src')) + ext = determine_ext(url) + if not url: + continue + if ext == 'm3u8': + formats.extend( + self._extract_m3u8_formats( + url, + content_id, + 'mp4', + entry_protocol='m3u8_native', + m3u8_id='hls', + fatal=False, + ) + ) + else: + formats.append({'url': url, 'ext': ext}) + + self._remove_duplicate_formats(formats) + self._sort_formats(formats) + + return formats + + @staticmethod + def _get_md5(s): + return hashlib.md5(s.encode()).hexdigest() + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + # Headline page without `displayid` defaults to `host`. + display_id = mobj.group('id') or host + webpage = self._download_webpage(url, display_id) + + title = ( + self._og_search_title(webpage, default=None) + or self._html_search_meta('twitter:title', webpage, default=None) + or self._html_search_regex('([^<]+)', webpage, 'title') + ) + description = ( + self._og_search_description(webpage, default=None) + or self._html_search_meta('description', webpage, default=None) + or self._html_search_meta('twitter:description', webpage, default=None) + ) + thumbnail = self._og_search_thumbnail( + webpage, default=None + ) or self._html_search_meta('twitter:image', webpage, default=None) + + if display_id == host: + # Headline page with playlist of videos + formats = [] + + # No single useful `contentid` on page + content_id = display_id + + stream_urls_plists = re.findall( + r'YJNEWS\.LIVESTREAM\.SRC.+=(?:\s+)?["\'](?P[^"\']+plist=(?P\d+)[^"\']+)', + webpage, + ) + if not stream_urls_plists: + # Manually build stream urls ('news.yahoo.co.jp', ...) + sbase_url = 'https://s.yimg.jp/images/media/video/player/html/embed_1.2.5.html?service=news&service_type=video&plist=%s&poster=%s&sp=%s&country=JP' + + params = re.findall( + r'["\']plist["\']:\s*["\'](?P[^"\']+)[^;]+?["\']poster["\']:\s*["\'](?P[^"\']+)[^;]+?["\']spaceId["\']:\s*["\'](?P[^"\']+)', + webpage, + ) + for p in params: + stream_urls_plists.append((sbase_url % p, p[0])) + + for url, plist_id in stream_urls_plists: + embed_page = self._download_webpage( + 'https://s.yimg.jp/images/media/video/player/js/embed_1.0.9.min.js', + plist_id, + headers={ + 'Authority': 's.yimg.jp', + 'Path': '/images/media/video/player/js/embed_1.0.9.min.js', + 'Referer': url, + 'User-Agent': self._USER_AGENT, + }, + ) + account = self._search_regex( + r'\w+\.video_attr\[(["\'])data-account\1\]\s*=\s*\1(?P[\d]+)', + embed_page, + 'account id', + group='accountid', + ) + player = self._search_regex( + r'\w+\.video_attr\[(["\'])data-player\1\]\s*=\s*\1(?P[^"\']+)', + embed_page, + 'player id', + group='playerid', + ) + embed = self._search_regex( + r'(["\'])data-embed\1\s*:\s*\1(?P[^"\']+)', + embed_page, + 'embed', + group='embed', + ) + indexjs_page = self._download_webpage( + 'https://players.brightcove.net/%s/%s_%s/index.min.js' + % (account, player, embed), + plist_id, + headers={'Referer': url, 'User-Agent': self._USER_AGENT}, + ) + policy_key = self._search_regex( + r'policyKey:\s*["\']([^"\']+)', indexjs_page, 'policy key' + ) + + json_data = self._download_json( + 'https://edge.api.brightcove.com/playback/v1/accounts/%s/playlists/%s' + % (account, plist_id), + plist_id, + headers={ + 'Accept': 'application/json;pk=%s' % policy_key, + 'Origin': self._ORIGIN, + 'Referer': url, + 'User-Agent': self._USER_AGENT, + }, + fatal=False, + ) + + formats.extend(self._extract_formats(json_data, content_id)) + + else: + # Article page + app_id = 'dj0zaiZpPVZMTVFJR0FwZWpiMyZzPWNvbnN1bWVyc2VjcmV0Jng9YjU-' + space_id = ( + self._search_regex( + r']+class=(["\'])yvpub-player\1[^>]+spaceid=(?P[^&"\']+)', + webpage, + 'spaceid', + group='spaceid', + default=None, + ) + or self._search_regex( + r'YAHOO\.JP\.srch\.\w+link\.onLoad[^;]+spaceID["\' ]*:["\' ]+([^"\']+)', + webpage, + 'spaceid', + default=None, + ) + or self._search_regex(r'