From 9ea71d2e7736587291eb915a911c5300a92952a5 Mon Sep 17 00:00:00 2001 From: Jan 'Yenda' Trmal <jtrmal@gmail.com> Date: Wed, 8 Jan 2020 15:53:29 +0100 Subject: [PATCH 1/3] [tvnoe] Fixing/changing the extractor for TV Noe (Czech Christian TV) --- youtube_dl/extractor/tvnoe.py | 51 ++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py index 26a5aeae4..5ad001905 100644 --- a/youtube_dl/extractor/tvnoe.py +++ b/youtube_dl/extractor/tvnoe.py @@ -4,22 +4,18 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( clean_html, - get_element_by_class, - js_to_json, ) class TVNoeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/video/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/porad/(?P<id>[-0-9a-z]+)' _TEST = { - 'url': 'http://www.tvnoe.cz/video/10362', - 'md5': 'aee983f279aab96ec45ab6e2abb3c2ca', + 'url': 'https://www.tvnoe.cz/porad/26011-terra-santa-news-13-11-2019', 'info_dict': { - 'id': '10362', + 'id': '26011-terra-santa-news-13-11-2019', 'ext': 'mp4', - 'series': 'Noční univerzita', - 'title': 'prof. Tomáš Halík, Th.D. - Návrat náboženství a střet civilizací', - 'description': 'md5:f337bae384e1a531a52c55ebc50fff41', + 'series': 'Terra Santa News', + 'title': '13. 11. 
2019', } } @@ -27,22 +23,27 @@ class TVNoeIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - iframe_url = self._search_regex( - r'<iframe[^>]+src="([^"]+)"', webpage, 'iframe URL') + dash_url = self._search_regex( + r"\s*src:\s*\'(?P<url>https?://[^\']+manifest.mpd)\',", webpage, 'mpd') + hls_url = self._search_regex( + r"\s*src:\s*\'(?P<url>https?://[^\']+playlist.m3u8)\',", webpage, 'm3u8') - ifs_page = self._download_webpage(iframe_url, video_id) - jwplayer_data = self._find_jwplayer_data( - ifs_page, video_id, transform_source=js_to_json) - info_dict = self._parse_jwplayer_data( - jwplayer_data, video_id, require_title=False, base_url=iframe_url) + formats = [] + if dash_url: + formats.extend(self._extract_mpd_formats( + dash_url, video_id, mpd_id='dash', fatal=False)) + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) - info_dict.update({ + self._sort_formats(formats) + title = clean_html(self._search_regex( + r"<h2>(?P<title>.*)<\/h2>", webpage, 'title')) + series = clean_html(self._search_regex( + r"<h1>(?P<series>.*)<\/h1>", webpage, 'series')) + return { 'id': video_id, - 'title': clean_html(get_element_by_class( - 'field-name-field-podnazev', webpage)), - 'description': clean_html(get_element_by_class( - 'field-name-body', webpage)), - 'series': clean_html(get_element_by_class('title', webpage)) - }) - - return info_dict + 'title': title, + 'series': series, + 'formats': formats + } From 651fe192d0d98828b83205373dc695a84521659e Mon Sep 17 00:00:00 2001 From: Jan 'Yenda' Trmal <jtrmal@gmail.com> Date: Fri, 10 Jan 2020 10:39:57 +0100 Subject: [PATCH 2/3] coding conventions --- youtube_dl/extractor/tvnoe.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py index 5ad001905..6c6ec8d1f 100644 --- a/youtube_dl/extractor/tvnoe.py +++ b/youtube_dl/extractor/tvnoe.py @@ -23,18 +23,26 @@ class TVNoeIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - dash_url = self._search_regex( - r"\s*src:\s*\'(?P<url>https?://[^\']+manifest.mpd)\',", webpage, 'mpd') - hls_url = self._search_regex( - r"\s*src:\s*\'(?P<url>https?://[^\']+playlist.m3u8)\',", webpage, 'm3u8') - formats = [] + hls_url = self._search_regex( + r"\s*src:\s*\'(?P<url>https?://[^\']+playlist.m3u8)\',", webpage, 'm3u8', fatal=False) + if hls_url: + dash_url = self._search_regex( + r"\s*src:\s*\'(?P<url>https?://[^\']+manifest.mpd)\',", webpage, 'mpd', fatal=False) + else: + dash_url = self._search_regex( + r"\s*src:\s*\'(?P<url>https?://[^\']+manifest.mpd)\',", webpage, 'mpd') + if dash_url: formats.extend(self._extract_mpd_formats( dash_url, video_id, mpd_id='dash', fatal=False)) if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) + if formats: + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 
ext='mp4', m3u8_id='hls', fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, ext='mp4', m3u8_id='hls')) self._sort_formats(formats) title = clean_html(self._search_regex( From 6763b8e88d1f4f2954f752e26ff90ddb6b2287c6 Mon Sep 17 00:00:00 2001 From: Jan 'Yenda' Trmal <jtrmal@gmail.com> Date: Sun, 26 Jan 2020 21:06:34 +0100 Subject: [PATCH 3/3] fixing flake8 error --- youtube_dl/extractor/tvnoe.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py index 6c6ec8d1f..b5ddf8f39 100644 --- a/youtube_dl/extractor/tvnoe.py +++ b/youtube_dl/extractor/tvnoe.py @@ -4,15 +4,21 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( clean_html, + url_or_none, + js_to_json, + try_get +) +from ..compat import ( + compat_str ) class TVNoeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/porad/(?P<id>[-0-9a-z]+)' + _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/porad/(?P<id>[0-9]+).*' _TEST = { 'url': 'https://www.tvnoe.cz/porad/26011-terra-santa-news-13-11-2019', 'info_dict': { - 'id': '26011-terra-santa-news-13-11-2019', + 'id': '26011', 'ext': 'mp4', 'series': 'Terra Santa News', 'title': '13. 11. 
2019', @@ -24,14 +30,12 @@ class TVNoeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) formats = [] - hls_url = self._search_regex( - r"\s*src:\s*\'(?P<url>https?://[^\']+playlist.m3u8)\',", webpage, 'm3u8', fatal=False) - if hls_url: - dash_url = self._search_regex( - r"\s*src:\s*\'(?P<url>https?://[^\']+manifest.mpd)\',", webpage, 'mpd', fatal=False) - else: - dash_url = self._search_regex( - r"\s*src:\s*\'(?P<url>https?://[^\']+manifest.mpd)\',", webpage, 'mpd') + json = self._search_regex(r'(?sm)var *INIT_PLAYER *= *(?P<json>[^;]+);', webpage, 'json') + player_data = self._parse_json(json, video_id, js_to_json) + hls_url = url_or_none(try_get(player_data, + lambda x: x['tracks']['HLS'][0]['src'], compat_str)) + dash_url = url_or_none(try_get(player_data, + lambda x: x['tracks']['DASH'][0]['src'], compat_str)) if dash_url: formats.extend(self._extract_mpd_formats( @@ -45,10 +49,12 @@ class TVNoeIE(InfoExtractor): hls_url, video_id, ext='mp4', m3u8_id='hls')) self._sort_formats(formats) + title = clean_html(self._search_regex( - r"<h2>(?P<title>.*)<\/h2>", webpage, 'title')) + r'<h2>(?P<title>.+)<\/h2>', webpage, 'title', fatal=False)) series = clean_html(self._search_regex( - r"<h1>(?P<series>.*)<\/h1>", webpage, 'series')) + r'<h1>(?P<series>.+)<\/h1>', webpage, 'series', fatal=False)) + return { 'id': video_id, 'title': title,