From b602c16f61ac783b1867808caf39b3ecf9571bad Mon Sep 17 00:00:00 2001
From: Alex Seiler
Date: Fri, 17 Feb 2017 22:58:07 +0100
Subject: [PATCH 1/2] [tegnamedia] Add new information extractors

---
Reviewer note: this copy of the series had lost its line breaks and every
"<...>" span (author e-mail, regex group names, the opening tag of the
js-jwloader pattern). The line structure below is rebuilt from the hunk
headers. The "id" group in each _VALID_URL is what _match_id() reads; the
other group names and the "<div" tag are reconstructions - TODO confirm
against the upstream commit. The author e-mail could not be recovered.

 youtube_dl/extractor/extractors.py |   5 ++
 youtube_dl/extractor/tegnamedia.py | 124 +++++++++++++++++++++++++++++
 2 files changed, 129 insertions(+)
 create mode 100644 youtube_dl/extractor/tegnamedia.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index be3688d5a..cd27fa3f2 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -932,6 +932,11 @@ from .teamcoco import TeamcocoIE
 from .teamfourstar import TeamFourStarIE
 from .techtalks import TechTalksIE
 from .ted import TEDIE
+from .tegnamedia import (
+    NineNewsIE,
+    THVElevenIE,
+    TwelveNewsIE,
+)
 from .tele13 import Tele13IE
 from .telebruxelles import TeleBruxellesIE
 from .telecinco import TelecincoIE
diff --git a/youtube_dl/extractor/tegnamedia.py b/youtube_dl/extractor/tegnamedia.py
new file mode 100644
index 000000000..f46bc40d8
--- /dev/null
+++ b/youtube_dl/extractor/tegnamedia.py
@@ -0,0 +1,124 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    str_or_none,
+    unified_timestamp,
+)
+
+
+class TegnaMediaIE(InfoExtractor):
+    SUBSCRIPTION_KEY = ''
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+        webpage = self._download_webpage(url, show_id)
+
+        player_info = self._html_search_regex(
+            r'<div[^>]+class="js-jwloader"(?P<player_info>[^>]+)', webpage, 'player info')
+        data_id = self._search_regex(
+            r'data-id="(?P<data_id>\d+)"', player_info, 'video id')
+        data_site = self._search_regex(
+            r'data-site="(?P<data_site>\d+)"', player_info, 'data site')
+
+        api_url = 'http://api.tegna-tv.com/video/v2/getAllVideoPathsById/%s/%s?subscription-key=%s' % (data_id, data_site, self.SUBSCRIPTION_KEY)
+        video_json = self._download_json(api_url, show_id)
+
+        video_id = str_or_none(video_json['Id'])
+        title = str_or_none(video_json['Title'])
+        description = str_or_none(video_json['Description'])
+        thumbnail = str_or_none(video_json['Image'])
+
+        duration = parse_duration(str_or_none(video_json['VideoLength']))
+        timestamp = unified_timestamp(str_or_none(video_json['DateCreated']))
+
+        formats = []
+        for elem in video_json.get('Sources'):
+            path = str_or_none(elem['Path'])
+            if elem.get('Format') == 'MP4':
+                formats.append(
+                    {
+                        'url': path,
+                        'format_id': 'mp4-' + str_or_none(elem['EncodingRate']),
+                        'vbr': int_or_none(elem['EncodingRate']),
+                    }
+                )
+            elif elem.get('Format') == 'HLS':
+                forms = self._extract_m3u8_formats(
+                    path, video_id, ext='mp4', entry_protocol='m3u8_native')
+                formats += forms
+            elif elem.get('Format') == 'HDS':
+                # I am not sure how to extract this format, I have tried the
+                # following, but this format seems to be only mentioned
+                # in the json, but not really available:
+                # forms = self._extract_akamai_formats(path, video_id)
+                # formats += forms
+                pass
+
+        self._sort_formats(formats)
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
+
+
+class NineNewsIE(TegnaMediaIE):
+    _VALID_URL = r'https?://(?:www\.)?9news\.com/.+/(?P<id>[0-9]+)'
+    SUBSCRIPTION_KEY = 'ae1d3e46c9914e9b87757fead91d7654'
+
+    _TEST = {
+        'url': 'http://www.9news.com/news/local/father-worries-about-immigration-status/408808900',
+        'md5': 'e367c89e52eed4ff3bcc696d664e4f4b',
+        'info_dict': {
+            'id': '2512310',
+            'ext': 'mp4',
+            'title': 'Father worries about immigration status',
+            'description': '9NEWS @ 9. 2/15/2017',
+            'thumbnail': 'http://kusa-download.edgesuite.net/video/2512310/2512310_Still.jpg',
+            'duration': 96.0,
+            'timestamp': 1487218434,
+            'upload_date': '20170216',
+        }
+    }
+
+    def _real_extract(self, url):
+        return super(NineNewsIE, self)._real_extract(url)
+
+
+class TwelveNewsIE(TegnaMediaIE):
+    _VALID_URL = r'https?://(?:www\.)?12news\.com/.+/(?P<id>[0-9]+)'
+    SUBSCRIPTION_KEY = 'd721cdf2210c493cb8a194d1e53b4ef5'
+
+    _TEST = {
+        'url': 'http://www.12news.com/news/local/valley/dps-stops-wrong-way-driver-after-several-miles/408864874',
+        'info_dict': {
+            'id': '2514219',
+            'ext': 'mp4',
+            'title': '''Megan Melanson's initial court appearance''',
+            'description': 'md5:24188e754669c29700e8dd6d19e4943b',
+            'timestamp': 1487360943,
+            'upload_date': '20170217',
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        return super(TwelveNewsIE, self)._real_extract(url)
+
+
+class THVElevenIE(TegnaMediaIE):
+    _VALID_URL = r'https?://(?:www\.)?thv11\.com/.+/(?P<id>[0-9]+)'
+    SUBSCRIPTION_KEY = 'd8d2110b71e5490f8652a270ef1cc8c2'
+
+    def _real_extract(self, url):
+        return super(THVElevenIE, self)._real_extract(url)
From fba4a4fc68e8e4c829c82a4269e5666ad2ca9ed5 Mon Sep 17 00:00:00 2001
From: Alex Seiler
Date: Sat, 18 Feb 2017 13:12:22 +0100
Subject: [PATCH 2/2] [tegnamedia] Support HDS format

---
 youtube_dl/extractor/tegnamedia.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/youtube_dl/extractor/tegnamedia.py b/youtube_dl/extractor/tegnamedia.py
index f46bc40d8..943bc5120 100644
--- a/youtube_dl/extractor/tegnamedia.py
+++ b/youtube_dl/extractor/tegnamedia.py
@@ -51,12 +51,9 @@ class TegnaMediaIE(InfoExtractor):
                     path, video_id, ext='mp4', entry_protocol='m3u8_native')
                 formats += forms
             elif elem.get('Format') == 'HDS':
-                # I am not sure how to extract this format, I have tried the
-                # following, but this format seems to be only mentioned
-                # in the json, but not really available:
-                # forms = self._extract_akamai_formats(path, video_id)
-                # formats += forms
-                pass
+                path += '/manifest.f4m?hdcode'
+                forms = self._extract_akamai_formats(path, video_id)
+                formats += forms
 
         self._sort_formats(formats)
         return {