From 338e916b7cf73ba0c31d2a0796ea716fb07565eb Mon Sep 17 00:00:00 2001
From: Jakub Adam Wieczorek
Date: Tue, 24 May 2016 17:44:57 +0200
Subject: [PATCH] =?UTF-8?q?[tvp]=20Add=20support=20for=20TVP=20Wiadomo?=
 =?UTF-8?q?=C5=9Bci?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is the main state-funded daily news programme in Poland.
---
 youtube_dl/extractor/extractors.py |   2 +-
 youtube_dl/extractor/tvp.py        | 105 +++++++++++++++++------------
 2 files changed, 62 insertions(+), 45 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 05561149a..1beaf0e8b 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -833,7 +833,7 @@ from .tvc import (
 )
 from .tvigle import TvigleIE
 from .tvland import TVLandIE
-from .tvp import TvpIE, TvpSeriesIE
+from .tvp import TvpIE, TvpWiadomosciIE, TvpSeriesIE
 from .tvplay import TVPlayIE
 from .tweakers import TweakersIE
 from .twentyfourvideo import TwentyFourVideoIE
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py
index f57d609d4..b9491867e 100644
--- a/youtube_dl/extractor/tvp.py
+++ b/youtube_dl/extractor/tvp.py
@@ -6,7 +6,56 @@ import re
 from .common import InfoExtractor
 
 
-class TvpIE(InfoExtractor):
+class _TvpCommonIE(InfoExtractor):
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
+
+        title = self._search_regex(
+            r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
+            webpage, 'title', group='title')
+        series_title = self._search_regex(
+            r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1',
+            webpage, 'series', group='series', default=None)
+        if series_title:
+            title = '%s, %s' % (series_title, title)
+
+        thumbnail = self._search_regex(
+            r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
+
+        video_url = self._search_regex(
+            r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None)
+        if not video_url:
+            video_url = self._download_json(
+                'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
+                video_id)['video_url']
+
+        ext = video_url.rsplit('.', 1)[-1]
+        if ext == 'ism/manifest':
+            m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url)
+            formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+        else:
+            if '/' in ext:
+                ext = 'mp4'
+            formats = [{
+                'format_id': 'direct',
+                'url': video_url,
+                'ext': ext,
+            }]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
+
+
+class TvpIE(_TvpCommonIE):
     IE_NAME = 'tvp.pl'
     _VALID_URL = r'https?://(?:vod|www)\.tvp\.pl/.*/(?P<id>\d+)$'
 
@@ -44,52 +93,20 @@ class TvpIE(InfoExtractor):
         },
     }]
 
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
+class TvpWiadomosciIE(_TvpCommonIE):
+    IE_NAME = 'wiadomosci.tvp.pl'
+    _VALID_URL = r'https?://wiadomosci\.tvp\.pl/(?P<id>\d+)/.*$'
 
-        title = self._search_regex(
-            r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
-            webpage, 'title', group='title')
-        series_title = self._search_regex(
-            r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1',
-            webpage, 'series', group='series', default=None)
-        if series_title:
-            title = '%s, %s' % (series_title, title)
-
-        thumbnail = self._search_regex(
-            r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
-
-        video_url = self._search_regex(
-            r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None)
-        if not video_url:
-            video_url = self._download_json(
-                'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
-                video_id)['video_url']
-
-        ext = video_url.rsplit('.', 1)[-1]
-        if ext != 'ism/manifest':
-            if '/' in ext:
-                ext = 'mp4'
-            formats = [{
-                'format_id': 'direct',
-                'url': video_url,
-                'ext': ext,
-            }]
-        else:
-            m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url)
-            formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
-
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'formats': formats,
+    _TESTS = [{
+        'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200',
+        'md5': 'ac104f7adc03b324d1fecfa0fbcfcdb8',
+        'info_dict': {
+            'id': '25169746',
+            'ext': 'mp4',
+            'title': 'Wiadomości, 24.05.2016, 12:00'
         }
+    }]
 
 
 class TvpSeriesIE(InfoExtractor):