From 38d14cd94d948ee71221a56380d24b09179bf58b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Fri, 8 Jan 2016 06:39:46 +0100 Subject: [PATCH 01/15] rework tvp.pl extractor --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/tvp.py | 277 ++++++++++++++++++++----------- 2 files changed, 181 insertions(+), 98 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 625b0bf16..9e92dfb31 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -742,7 +742,7 @@ from .tvc import ( ) from .tvigle import TvigleIE from .tvland import TVLandIE -from .tvp import TvpIE, TvpSeriesIE +from .tvp import TvpIE, TvpLegacyIE from .tvplay import TVPlayIE from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index f57d609d4..4b3bd9982 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -1,12 +1,181 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals +from .common import InfoExtractor, ExtractorError -import re +VIDEO_LISTING_URL = ('http://www.api.v3.tvp.pl/shared/listing.php' + '?dump=json&direct=true&count=-1&parent_id={id}') +META_URL = 'http://www.tvp.pl/shared/video_data.php?dump=json&video_id={id}' +TOKENIZER_URL = 'http://www.tvp.pl/shared/cdn/tokenizer_v2.php?object_id={id}' +IGNORED_MIMETYPES = 'application/vnd.ms-ss', 'application/x-mpegurl' -from .common import InfoExtractor + +class TvpApi: + + def __init__(self, ie): + """:type ie: InfoExtractor""" + self.ie = ie + + def listing(self, id): + json = self._get_json(VIDEO_LISTING_URL, id) + return json + + def meta(self, id): + json = self._get_json(META_URL, id) + return json + + def context(self, id): + meta = self.meta(id) + return meta['context'] + + def formats(self, id): + json = self._get_json(TOKENIZER_URL, id) + status = json['status'] + if status == 'NOT_PLAYABLE': + 
raise ExtractorError("video is not playable") + if status != 'OK': + raise ExtractorError("unknown status: %s", status) + return json['formats'] + + def _get_json(self, url, id): + id = int(id) + formatted_url = url.format(id=id) + return self.ie._download_json(formatted_url, id) class TvpIE(InfoExtractor): + IE_NAME = 'tvp.pl' + _VALID_URL = r'https?://(?:vod\.|www\.)?tvp\.pl/(?P<id>\d+)/.*' + + _TESTS = [{ + 'url': 'http://vod.tvp.pl/4278035/odc-2', + 'md5': 'cdd98303338b8a7f7abab5cd14092bf2', + 'info_dict': { + 'id': '4278035', + 'ext': 'wmv', + 'title': 'Ogniem i mieczem, odc. 2', + 'description': 'Bohun dowiaduje się o złamaniu przez kniahinię danego mu słowa i wyrusza do Rozłogów. Helenie w ostatniej chwili udaje się uciec dzięki pomocy Zagłoby.' + }, + }, { + 'url': 'http://vod.tvp.pl/194536/i-seria-odc-13', + 'md5': '8aa518c15e5cc32dfe8db400dc921fbb', + 'info_dict': { + 'id': '194536', + 'ext': 'mp4', + 'title': 'Czas honoru, I seria – odc. 13', + 'description': 'Czesław prosi Marię o dostarczenie Władkowi zarazki tyfusu. Jeśli zachoruje zostanie przewieziony do szpitala skąd łatwiej będzie go odbić. Czy matka zdecyduje się zarazić syna?' + }, + }, { + 'url': 'http://vod.tvp.pl/17834272/odc-39', + 'md5': 'dafdadb130a45e79bab64aed94b73661', + 'info_dict': { + 'id': '17834272', + 'ext': 'mp4', + 'title': 'Na sygnale, odc. 39', + 'description': 'Ekipa Wiktora ratuje młodą matkę, która spadła ze schodów trzymając na rękach noworodka. Okazuje się, że dziewczyna jest surogatką, a biologiczni rodzice dziecka próbują zmusić ją do oddania synka…', + }, + }, { + 'url': 'http://vod.tvp.pl/4278026/ogniem-i-mieczem', + 'info_dict': { + 'title': 'Ogniem i mieczem', + 'id': '4278026', + 'description': 'Romans z historią w tle', + }, + 'playlist_count': 4, + }, { + 'url': 'http://vod.tvp.pl/9329207/', + 'info_dict': { + 'title': 'Boso przez świat', + 'id': '9329207', + 'description': 'Docieramy do plemion w zapomnianych regionach naszej planety. 
Poznajemy ich kulturę, wierzenia i zwyczaje. Na ile są podobne do naszych? Wojciech Cejrowski jest naszym przewodnikiem po najbardziej dzikich zakątkach globu.', + }, + 'playlist_count': 86, + }] + + @classmethod + def suitable(cls, url): + return super(TvpIE, cls).suitable(url) + + def _real_initialize(self): + self.api = TvpApi(self) + + @staticmethod + def _format_formats(formats, video_id): + + mime_ext = { + 'video/x-ms-wmv': 'wmv', + 'video/mp4': 'mp4' + } + + viable_formats = [] + for f in formats: + if f['mimeType'] in IGNORED_MIMETYPES: + continue + + elif f['mimeType'].startswith('video/'): + viable_formats.append( + {'url': f['url'], + 'ext': mime_ext.get(f['mimeType'], None), + 'vbr': f['totalBitrate']}) + + return viable_formats + + def _get_video(self, context): + id = context['material_id'] + if context['title_root']: + title = context['title_root'] + elif not context['website_title']: + title = context['title'] + else: + title = ', '.join([context['website_title'], context['title']]) + url = context['url'] + description = context['description_root'] + + try: + formats = self._format_formats(self.api.formats(id), id) + except ExtractorError as e: + self.to_screen("%s: %s" % (title, e)) + raise + + self._sort_formats(formats) + + return { + 'id': str(id), + 'url': url, + 'title': title, + 'description': description, + 'formats': formats, + } + + def _get_playlist_videos(self, playlist_id): + ids = [playlist_id] + + while ids: + item_id = ids.pop() + listing = self.api.listing(item_id) + for item in listing['items']: + if 'directory_video' in item['types']: + ids.append(item['_id']) + if 'video' in item['types'] and item['is_released']: + meta = self.api.context(item['_id']) + try: + yield self._get_video(meta) + except ExtractorError: + pass + + def _get_playlist(self, context): + id = context['material_id'] + title = context['title'] + description = context['lead_root'] + + return self.playlist_result(self._get_playlist_videos(id), + str(id), title, 
description) + + def _real_extract(self, url): + video_id = self._match_id(url) + ctx = self.api.context(video_id) + is_playlist = ctx['format_id'] == 0 + return self._get_playlist(ctx) if is_playlist else self._get_video(ctx) + + +class TvpLegacyIE(TvpIE): IE_NAME = 'tvp.pl' _VALID_URL = r'https?://(?:vod|www)\.tvp\.pl/.*/(?P<id>\d+)$' @@ -17,6 +186,7 @@ class TvpIE(InfoExtractor): 'id': '4278035', 'ext': 'wmv', 'title': 'Ogniem i mieczem, odc. 2', + 'description': 'Bohun dowiaduje się o złamaniu przez kniahinię danego mu słowa i wyrusza do Rozłogów. Helenie w ostatniej chwili udaje się uciec dzięki pomocy Zagłoby.', }, }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536', @@ -25,10 +195,11 @@ class TvpIE(InfoExtractor): 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, I seria – odc. 13', + 'description': 'Czesław prosi Marię o dostarczenie Władkowi zarazki tyfusu. Jeśli zachoruje zostanie przewieziony do szpitala skąd łatwiej będzie go odbić. Czy matka zdecyduje się zarazić syna?', }, }, { 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', - 'md5': 'c3b15ed1af288131115ff17a17c19dda', + 'md5': 'b0005b542e5b4de643a9690326ab1257', 'info_dict': { 'id': '17916176', 'ext': 'mp4', @@ -36,104 +207,16 @@ class TvpIE(InfoExtractor): }, }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', - 'md5': 'c3b15ed1af288131115ff17a17c19dda', + 'md5': 'dafdadb130a45e79bab64aed94b73661', 'info_dict': { 'id': '17834272', 'ext': 'mp4', 'title': 'Na sygnale, odc. 39', + 'description': 'Ekipa Wiktora ratuje młodą matkę, która spadła ze schodów trzymając na rękach noworodka. 
Okazuje się, że dziewczyna jest surogatką, a biologiczni rodzice dziecka próbują zmusić ją do oddania synka…', }, }] def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) - - title = self._search_regex( - r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', - webpage, 'title', group='title') - series_title = self._search_regex( - r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1', - webpage, 'series', group='series', default=None) - if series_title: - title = '%s, %s' % (series_title, title) - - thumbnail = self._search_regex( - r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None) - - video_url = self._search_regex( - r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None) - if not video_url: - video_url = self._download_json( - 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, - video_id)['video_url'] - - ext = video_url.rsplit('.', 1)[-1] - if ext != 'ism/manifest': - if '/' in ext: - ext = 'mp4' - formats = [{ - 'format_id': 'direct', - 'url': video_url, - 'ext': ext, - }] - else: - m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url) - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } - - -class TvpSeriesIE(InfoExtractor): - IE_NAME = 'tvp.pl:Series' - _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$' - - _TESTS = [{ - 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem', - 'info_dict': { - 'title': 'Ogniem i mieczem', - 'id': '4278026', - }, - 'playlist_count': 4, - }, { - 'url': 'http://vod.tvp.pl/audycje/podroze/boso-przez-swiat', - 'info_dict': { - 'title': 'Boso przez świat', - 'id': '9329207', - }, - 'playlist_count': 86, - }] - - def _real_extract(self, 
url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id, tries=5) - - title = self._html_search_regex( - r'(?s) id=[\'"]path[\'"]>(?:.*? / ){2}(.*?)</span>', webpage, 'series') - playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id') - playlist = self._download_webpage( - 'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend' - 'edId=0&sort=&page=0&pageSize=10000' % playlist_id, display_id, tries=5, - note='Downloading playlist') - - videos_paths = re.findall( - '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist) - entries = [ - self.url_result('http://vod.tvp.pl%s' % v_path, ie=TvpIE.ie_key()) - for v_path in videos_paths] - - return { - '_type': 'playlist', - 'id': playlist_id, - 'display_id': display_id, - 'title': title, - 'entries': entries, - } + context = self.api.context(video_id) + return self._get_video(context) From 35a5e04066b5a10d58e698c8da072503d3f3a7a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Fri, 8 Jan 2016 06:46:34 +0100 Subject: [PATCH 02/15] unnecessary method --- youtube_dl/extractor/tvp.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 4b3bd9982..f43b8b614 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -89,10 +89,6 @@ class TvpIE(InfoExtractor): 'playlist_count': 86, }] - @classmethod - def suitable(cls, url): - return super(TvpIE, cls).suitable(url) - def _real_initialize(self): self.api = TvpApi(self) From 20f9a56e2eb70c2eef4f95e1395574a8667f71dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Fri, 8 Jan 2016 07:20:40 +0100 Subject: [PATCH 03/15] moar subdomains --- youtube_dl/extractor/tvp.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index f43b8b614..60181d4ef 
100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -4,6 +4,7 @@ VIDEO_LISTING_URL = ('http://www.api.v3.tvp.pl/shared/listing.php' '?dump=json&direct=true&count=-1&parent_id={id}') META_URL = 'http://www.tvp.pl/shared/video_data.php?dump=json&video_id={id}' TOKENIZER_URL = 'http://www.tvp.pl/shared/cdn/tokenizer_v2.php?object_id={id}' +FILE_INFO_URL = 'http://www.tvp.pl/pub/stat/videofileinfo?video_id={id}' IGNORED_MIMETYPES = 'application/vnd.ms-ss', 'application/x-mpegurl' @@ -21,6 +22,10 @@ class TvpApi: json = self._get_json(META_URL, id) return json + def info(self, id): + json = self._get_json(FILE_INFO_URL, id) + return json + def context(self, id): meta = self.meta(id) return meta['context'] @@ -42,7 +47,7 @@ class TvpApi: class TvpIE(InfoExtractor): IE_NAME = 'tvp.pl' - _VALID_URL = r'https?://(?:vod\.|www\.)?tvp\.pl/(?P<id>\d+)/.*' + _VALID_URL = r'https?://(?:vod|www)\.(\w+\.)?tvp\.pl/(?P<id>\d+)/.*' _TESTS = [{ 'url': 'http://vod.tvp.pl/4278035/odc-2', @@ -53,6 +58,15 @@ class TvpIE(InfoExtractor): 'title': 'Ogniem i mieczem, odc. 2', 'description': 'Bohun dowiaduje się o złamaniu przez kniahinię danego mu słowa i wyrusza do Rozłogów. Helenie w ostatniej chwili udaje się uciec dzięki pomocy Zagłoby.' }, + }, { + 'url': 'http://www.rodzinka.tvp.pl/22729075/odc-169', + 'md5': '4dc102e0883555d31b120e8328c02022', + 'info_dict': { + 'id': '22353810', + 'ext': 'mp4', + 'title': 'rodzinka.pl, odc. 169', + 'description': 'Natalia szykuje dla Marii paczkę z ubrankami dla dziecka,\nale ciężko jej się z nimi rozstać – wiążę się z tym zbyt wiele wspomnień. Kacper chce wymusić od Ludwika pieniądze opowiadając o wróżce zębuszcze. 
A czy zna tak zwanego „Skrzata Dlatata”?', + }, }, { 'url': 'http://vod.tvp.pl/194536/i-seria-odc-13', 'md5': '8aa518c15e5cc32dfe8db400dc921fbb', @@ -165,8 +179,14 @@ class TvpIE(InfoExtractor): str(id), title, description) def _real_extract(self, url): - video_id = self._match_id(url) - ctx = self.api.context(video_id) + id = self._match_id(url) + ctx = self.api.context(id) + if ctx['format_id'] == 0: + file_info = self.api.info(id) + original_id = file_info.get('copy_of_object_id') + if original_id: + ctx = self.api.context(original_id) + is_playlist = ctx['format_id'] == 0 return self._get_playlist(ctx) if is_playlist else self._get_video(ctx) From 6ebaca4df3eadd532c7dc067cd6a8f4e6b54f94d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Fri, 8 Jan 2016 07:28:02 +0100 Subject: [PATCH 04/15] encoding for py2 --- youtube_dl/extractor/tvp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 60181d4ef..3bc0b8f0a 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -1,3 +1,4 @@ +# -*- encoding: utf-8 -*- from .common import InfoExtractor, ExtractorError VIDEO_LISTING_URL = ('http://www.api.v3.tvp.pl/shared/listing.php' From dcb069d69cf869b98e3620cc4f6df71dcd723c6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Fri, 8 Jan 2016 07:37:10 +0100 Subject: [PATCH 05/15] There's more of them! 
--- youtube_dl/extractor/tvp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 3bc0b8f0a..e912647b8 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -48,7 +48,7 @@ class TvpApi: class TvpIE(InfoExtractor): IE_NAME = 'tvp.pl' - _VALID_URL = r'https?://(?:vod|www)\.(\w+\.)?tvp\.pl/(?P<id>\d+)/.*' + _VALID_URL = r'https?://(\w+\.)+tvp\.pl/(?P<id>\d+)/.*' _TESTS = [{ 'url': 'http://vod.tvp.pl/4278035/odc-2', From eaef7bdf99c06c0ddc78c85fa609f92ced45b2f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Fri, 8 Jan 2016 07:43:31 +0100 Subject: [PATCH 06/15] more tests --- youtube_dl/extractor/tvp.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index e912647b8..02db4b3a5 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -59,6 +59,14 @@ class TvpIE(InfoExtractor): 'title': 'Ogniem i mieczem, odc. 2', 'description': 'Bohun dowiaduje się o złamaniu przez kniahinię danego mu słowa i wyrusza do Rozłogów. Helenie w ostatniej chwili udaje się uciec dzięki pomocy Zagłoby.' 
}, + }, { + 'url': 'http://warszawa.tvp.pl/23433721/03012016', + 'md5': '8740c6e0532f37e836104f3fb38921d9', + 'info_dict': { + 'id': '23433721', + 'ext': 'mp4', + 'title': 'Echa tygodnia – kraj, 03.01.2016', + }, }, { 'url': 'http://www.rodzinka.tvp.pl/22729075/odc-169', 'md5': '4dc102e0883555d31b120e8328c02022', From a507d7aebd5846b505157f28672ee171b3ac07fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Fri, 8 Jan 2016 09:58:42 +0100 Subject: [PATCH 07/15] remove exception voodoo --- youtube_dl/extractor/tvp.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 02db4b3a5..e40244ad4 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -35,7 +35,7 @@ class TvpApi: json = self._get_json(TOKENIZER_URL, id) status = json['status'] if status == 'NOT_PLAYABLE': - raise ExtractorError("video is not playable") + raise ExtractorError("video is not playable", expected=True) if status != 'OK': raise ExtractorError("unknown status: %s", status) return json['formats'] @@ -147,11 +147,7 @@ class TvpIE(InfoExtractor): url = context['url'] description = context['description_root'] - try: - formats = self._format_formats(self.api.formats(id), id) - except ExtractorError as e: - self.to_screen("%s: %s" % (title, e)) - raise + formats = self._format_formats(self.api.formats(id), id) self._sort_formats(formats) @@ -174,10 +170,7 @@ class TvpIE(InfoExtractor): ids.append(item['_id']) if 'video' in item['types'] and item['is_released']: meta = self.api.context(item['_id']) - try: - yield self._get_video(meta) - except ExtractorError: - pass + yield self._get_video(meta) def _get_playlist(self, context): id = context['material_id'] From c213f8cc3384e8319568bd2f3cea5ae5ad2cfdf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Fri, 8 Jan 2016 10:00:33 +0100 Subject: [PATCH 
08/15] squashing action don't rely on optional key single quotes unnecessary overload remove api class more exceptions more better make _format_formats not static refactor guessing title to a method fix old api calls bring stabilization in the field of id types don't extract videos in a playlist --- youtube_dl/extractor/tvp.py | 128 ++++++++++++++---------------------- 1 file changed, 51 insertions(+), 77 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index e40244ad4..fe79704e8 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -1,55 +1,18 @@ # -*- encoding: utf-8 -*- from .common import InfoExtractor, ExtractorError -VIDEO_LISTING_URL = ('http://www.api.v3.tvp.pl/shared/listing.php' - '?dump=json&direct=true&count=-1&parent_id={id}') -META_URL = 'http://www.tvp.pl/shared/video_data.php?dump=json&video_id={id}' -TOKENIZER_URL = 'http://www.tvp.pl/shared/cdn/tokenizer_v2.php?object_id={id}' -FILE_INFO_URL = 'http://www.tvp.pl/pub/stat/videofileinfo?video_id={id}' -IGNORED_MIMETYPES = 'application/vnd.ms-ss', 'application/x-mpegurl' - - -class TvpApi: - - def __init__(self, ie): - """:type ie: InfoExtractor""" - self.ie = ie - - def listing(self, id): - json = self._get_json(VIDEO_LISTING_URL, id) - return json - - def meta(self, id): - json = self._get_json(META_URL, id) - return json - - def info(self, id): - json = self._get_json(FILE_INFO_URL, id) - return json - - def context(self, id): - meta = self.meta(id) - return meta['context'] - - def formats(self, id): - json = self._get_json(TOKENIZER_URL, id) - status = json['status'] - if status == 'NOT_PLAYABLE': - raise ExtractorError("video is not playable", expected=True) - if status != 'OK': - raise ExtractorError("unknown status: %s", status) - return json['formats'] - - def _get_json(self, url, id): - id = int(id) - formatted_url = url.format(id=id) - return self.ie._download_json(formatted_url, id) - class TvpIE(InfoExtractor): IE_NAME = 'tvp.pl' 
_VALID_URL = r'https?://(\w+\.)+tvp\.pl/(?P<id>\d+)/.*' + _VIDEO_LISTING_URL = ('http://www.api.v3.tvp.pl/shared/listing.php' + '?dump=json&direct=true&count=-1&parent_id={id}') + _META_URL = 'http://www.tvp.pl/shared/video_data.php?dump=json&video_id={id}' + _TOKENIZER_URL = 'http://www.tvp.pl/shared/cdn/tokenizer_v2.php?object_id={id}' + _FILE_INFO_URL = 'http://www.tvp.pl/pub/stat/videofileinfo?video_id={id}' + _IGNORED_MIMETYPES = 'application/vnd.ms-ss', 'application/x-mpegurl' + _TESTS = [{ 'url': 'http://vod.tvp.pl/4278035/odc-2', 'md5': 'cdd98303338b8a7f7abab5cd14092bf2', @@ -112,11 +75,11 @@ class TvpIE(InfoExtractor): 'playlist_count': 86, }] - def _real_initialize(self): - self.api = TvpApi(self) + def _get_json(self, url, entry_id): + formatted_url = url.format(id=int(entry_id)) + return self._download_json(formatted_url, entry_id) - @staticmethod - def _format_formats(formats, video_id): + def _format_formats(self, formats, video_id): mime_ext = { 'video/x-ms-wmv': 'wmv', @@ -125,7 +88,7 @@ class TvpIE(InfoExtractor): viable_formats = [] for f in formats: - if f['mimeType'] in IGNORED_MIMETYPES: + if f['mimeType'] in self._IGNORED_MIMETYPES: continue elif f['mimeType'].startswith('video/'): @@ -136,23 +99,37 @@ class TvpIE(InfoExtractor): return viable_formats - def _get_video(self, context): - id = context['material_id'] - if context['title_root']: - title = context['title_root'] - elif not context['website_title']: - title = context['title'] - else: - title = ', '.join([context['website_title'], context['title']]) - url = context['url'] - description = context['description_root'] + @staticmethod + def _guess_title(item): + title_root = item.get('title_root') + title = item.get('title') + website_title = item.get('website_title') + if title_root: + return item['title_root'] + if title and website_title: + return '{}, {}'.format(website_title, title) + return title - formats = self._format_formats(self.api.formats(id), id) + def _get_video(self, 
context): + video_id = str(context['material_id']) + title = self._guess_title(context) + url = context['url'] + description = context.get('description_root') + + formats_req = self._get_json(self._TOKENIZER_URL, video_id) + req_status = formats_req['status'] + if req_status == 'NOT_PLAYABLE': + raise ExtractorError('(%s) is not playable' % title, + expected=True, video_id=video_id) + elif req_status != 'OK': + raise ExtractorError('(%s) unknown status: %s' % (title, req_status), + video_id=video_id) + formats = self._format_formats(formats_req['formats'], video_id) self._sort_formats(formats) return { - 'id': str(id), + 'id': video_id, 'url': url, 'title': title, 'description': description, @@ -164,30 +141,32 @@ class TvpIE(InfoExtractor): while ids: item_id = ids.pop() - listing = self.api.listing(item_id) + listing = self._get_json(self._VIDEO_LISTING_URL, item_id) for item in listing['items']: if 'directory_video' in item['types']: ids.append(item['_id']) if 'video' in item['types'] and item['is_released']: - meta = self.api.context(item['_id']) - yield self._get_video(meta) + yield { + '_type': 'url', + 'title': self._guess_title(item), + 'url': item['url']} def _get_playlist(self, context): - id = context['material_id'] + pls_id = str(context['material_id']) title = context['title'] - description = context['lead_root'] + description = context.get('lead_root') - return self.playlist_result(self._get_playlist_videos(id), - str(id), title, description) + return self.playlist_result(self._get_playlist_videos(pls_id), + pls_id, title, description) def _real_extract(self, url): - id = self._match_id(url) - ctx = self.api.context(id) + entry_id = self._match_id(url) + ctx = self._get_json(self._META_URL, entry_id)['context'] if ctx['format_id'] == 0: - file_info = self.api.info(id) + file_info = self._get_json(self._FILE_INFO_URL, entry_id) original_id = file_info.get('copy_of_object_id') if original_id: - ctx = self.api.context(original_id) + ctx = 
self._get_json(self._META_URL, original_id)['context'] is_playlist = ctx['format_id'] == 0 return self._get_playlist(ctx) if is_playlist else self._get_video(ctx) @@ -233,8 +212,3 @@ class TvpLegacyIE(TvpIE): 'description': 'Ekipa Wiktora ratuje młodą matkę, która spadła ze schodów trzymając na rękach noworodka. Okazuje się, że dziewczyna jest surogatką, a biologiczni rodzice dziecka próbują zmusić ją do oddania synka…', }, }] - - def _real_extract(self, url): - video_id = self._match_id(url) - context = self.api.context(video_id) - return self._get_video(context) From 77d024527b529beb7651179487399764a9c67a34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Sat, 9 Jan 2016 05:13:04 +0100 Subject: [PATCH 09/15] no need for None in get --- youtube_dl/extractor/tvp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index fe79704e8..273908418 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -94,7 +94,7 @@ class TvpIE(InfoExtractor): elif f['mimeType'].startswith('video/'): viable_formats.append( {'url': f['url'], - 'ext': mime_ext.get(f['mimeType'], None), + 'ext': mime_ext.get(f['mimeType']), 'vbr': f['totalBitrate']}) return viable_formats From bc05fc757552acd46e23eb62d398019ac916e856 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Sat, 9 Jan 2016 05:18:58 +0100 Subject: [PATCH 10/15] increase chance for a title --- youtube_dl/extractor/tvp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 273908418..1aa2146b9 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -153,7 +153,7 @@ class TvpIE(InfoExtractor): def _get_playlist(self, context): pls_id = str(context['material_id']) - title = context['title'] + title = self._guess_title(context) description = 
context.get('lead_root') return self.playlist_result(self._get_playlist_videos(pls_id), From 8bee77a175a7724fde513b0e468e16f0cdaedc88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Sat, 9 Jan 2016 05:41:16 +0100 Subject: [PATCH 11/15] expect less from format json --- youtube_dl/extractor/tvp.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 1aa2146b9..42d318c5e 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -88,14 +88,15 @@ class TvpIE(InfoExtractor): viable_formats = [] for f in formats: - if f['mimeType'] in self._IGNORED_MIMETYPES: + if f.get('mimeType') in self._IGNORED_MIMETYPES: + continue + if 'url' not in f: continue - elif f['mimeType'].startswith('video/'): - viable_formats.append( - {'url': f['url'], - 'ext': mime_ext.get(f['mimeType']), - 'vbr': f['totalBitrate']}) + viable_formats.append( + {'url': f['url'], + 'ext': mime_ext.get(f.get('mimeType')), + 'vbr': f.get('totalBitrate')}) return viable_formats From e8784b67879dd498583ea50498a6ef171ac99ede Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Sat, 9 Jan 2016 05:52:43 +0100 Subject: [PATCH 12/15] not vital part of api --- youtube_dl/extractor/tvp.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 42d318c5e..2446d4404 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -75,9 +75,10 @@ class TvpIE(InfoExtractor): 'playlist_count': 86, }] - def _get_json(self, url, entry_id): + def _get_json(self, url, entry_id, fatal=True): formatted_url = url.format(id=int(entry_id)) - return self._download_json(formatted_url, entry_id) + json = self._download_json(formatted_url, entry_id, fatal=fatal) + return {} if json is None else json def _format_formats(self, formats, 
video_id): @@ -164,7 +165,7 @@ class TvpIE(InfoExtractor): entry_id = self._match_id(url) ctx = self._get_json(self._META_URL, entry_id)['context'] if ctx['format_id'] == 0: - file_info = self._get_json(self._FILE_INFO_URL, entry_id) + file_info = self._get_json(self._FILE_INFO_URL, entry_id, fatal=False) original_id = file_info.get('copy_of_object_id') if original_id: ctx = self._get_json(self._META_URL, original_id)['context'] From 3fb7f85dd40459e56c9b360484f77a14fa5263af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Sun, 10 Jan 2016 05:47:57 +0100 Subject: [PATCH 13/15] unicode_literals --- youtube_dl/extractor/tvp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 2446d4404..66cf07d32 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -1,4 +1,6 @@ # -*- encoding: utf-8 -*- +from __future__ import unicode_literals + from .common import InfoExtractor, ExtractorError From e3225fc155eaa895b386e4e6b1b90368405c2f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Sun, 10 Jan 2016 05:48:59 +0100 Subject: [PATCH 14/15] change dict to url_result --- youtube_dl/extractor/tvp.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 66cf07d32..364047992 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -150,10 +150,9 @@ class TvpIE(InfoExtractor): if 'directory_video' in item['types']: ids.append(item['_id']) if 'video' in item['types'] and item['is_released']: - yield { - '_type': 'url', - 'title': self._guess_title(item), - 'url': item['url']} + yield self.url_result(item['url'], + video_id=item['_id'], + video_title=self._guess_title(item)) def _get_playlist(self, context): pls_id = str(context['material_id']) From e8b7c3220e77f27b55576e80c8654c44198b8f81 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?dzikie=20dro=C5=BCd=C5=BCe?= <daz@hackerspace.pl> Date: Sun, 10 Jan 2016 06:32:54 +0100 Subject: [PATCH 15/15] tvp.info --- youtube_dl/extractor/tvp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 364047992..a14d87792 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -6,7 +6,7 @@ from .common import InfoExtractor, ExtractorError class TvpIE(InfoExtractor): IE_NAME = 'tvp.pl' - _VALID_URL = r'https?://(\w+\.)+tvp\.pl/(?P<id>\d+)/.*' + _VALID_URL = r'https?://(\w+\.)+tvp\.(?:pl|info)/(?P<id>\d+)/.*' _VIDEO_LISTING_URL = ('http://www.api.v3.tvp.pl/shared/listing.php' '?dump=json&direct=true&count=-1&parent_id={id}')