From ef5e6d925f39f0da6b20f556f1aadb6bbf58ccc6 Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Sat, 28 Oct 2017 17:38:01 +0200 Subject: [PATCH 01/15] [seznamzpravy] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/seznamzpravy.py | 75 ++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 youtube_dl/extractor/seznamzpravy.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 18350810b..faa1f4c16 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -927,6 +927,7 @@ from .sendtonews import SendtoNewsIE from .servingsys import ServingSysIE from .servus import ServusIE from .sexu import SexuIE +from .seznamzpravy import SeznamZpravyIE from .shahid import ShahidIE from .shared import ( SharedIE, diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py new file mode 100644 index 000000000..e319483e2 --- /dev/null +++ b/youtube_dl/extractor/seznamzpravy.py @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + urljoin, + int_or_none, +) + + +class SeznamZpravyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?seznam\.cz/zpravy/clanek/(?:[-a-z0-9]+)-(?P<id>[0-9]+)' + _API_URL = 'https://apiclanky.seznam.cz/' + _MAGIC_SUFFIX = 'spl2,2,VOD' + + _TESTS = [{ + 'url': 'https://www.seznam.cz/zpravy/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', + 'md5': '855f9fed87bd93e48775d59671a3a3e3', + 'info_dict': { + 'id': '35990', + 'ext': 'mp4', + 'title': 'Svět bez obalu: Rozhovor s Václavem Marhoulem o zahraničních vojenských misích a aktivních zálohách.', + 'description': 'O nasazení českých vojáků v zahraničí. Marhoul by na mise posílal i zálohy. 
„Nejdříve se ale musí vycvičit,“ říká.', + } + }, { + 'url': 'https://www.seznam.cz/zpravy/clanek/vyzva-volicum-letos-se-na-to-klidne-vykaslete-kdyby-mohly-volby-neco-zmenit-davno-by-je-prece-zakazali-38474', + 'md5': '542ebc27baa3b2dd99d1671c12f5b28c', + 'info_dict': { + 'id': '38474', + 'ext': 'mp4', + 'title': 'Šťastné pondělí Jindřicha Šídla.', + 'description': 'Do voleb zbývají čtyři dny. Jindřich Šídlo proto přichází se zásadním doporučením voličům, jak se letos zachovat. Další díl satirického pořadu.', + } + }, { + 'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489', + 'md5': '3da261b41d776b2c860c191f47517057', + 'info_dict': { + 'id': '38489', + 'ext': 'mp4', + 'title': 'Předseda KDU-ČSL Pavel Bělobrádek ve volební Výzvě Seznamu.', + 'description': 'Předvolební rozhovory s lídry deseti hlavních stran pokračují. Ve Výzvě Jindřicha Šídla odpovídal předseda lidovců Pavel Bělobrádek.', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + api_url = self._API_URL + 'v1/documents/{}'.format(video_id) + data = self._download_json(api_url, video_id) + + if 'video' in data['caption']: + sdn_url = data['caption']['video']['sdn'] + self._MAGIC_SUFFIX + else: + location_url = data['caption']['liveStreamUrl'] + self._MAGIC_SUFFIX + sdn_url = self._download_json(location_url, video_id)['Location'] + + sdn_data = self._download_json(sdn_url, video_id) + + formats = [] + for fmt, fmtdata in sdn_data['data']['mp4'].items(): + resolution = fmtdata.get('resolution') + formats.append({ + 'format_id': fmt, + 'width': int_or_none(resolution[0]) if resolution is not None else None, + 'height': int_or_none(resolution[1]) if resolution is not None else None, + 'url': urljoin(sdn_url, fmtdata['url']), + }) + + formats.sort(key=lambda x: x['height']) + + return { + 'id': video_id, + 'title': data['captionTitle'], + 'description': data.get('perex'), + 'formats': formats, + } From 
3747cf1d8d5836960baf7b3b01ae4ae4a25fdf91 Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Sat, 28 Oct 2017 21:10:20 +0200 Subject: [PATCH 02/15] [seznamzpravy] Fixes per dstftw --- youtube_dl/extractor/seznamzpravy.py | 39 +++++++++++++--------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index e319483e2..7181eb408 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -14,6 +14,7 @@ class SeznamZpravyIE(InfoExtractor): _MAGIC_SUFFIX = 'spl2,2,VOD' _TESTS = [{ + # video with SDN URL 'url': 'https://www.seznam.cz/zpravy/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', 'md5': '855f9fed87bd93e48775d59671a3a3e3', 'info_dict': { @@ -23,15 +24,7 @@ class SeznamZpravyIE(InfoExtractor): 'description': 'O nasazení českých vojáků v zahraničí. Marhoul by na mise posílal i zálohy. „Nejdříve se ale musí vycvičit,“ říká.', } }, { - 'url': 'https://www.seznam.cz/zpravy/clanek/vyzva-volicum-letos-se-na-to-klidne-vykaslete-kdyby-mohly-volby-neco-zmenit-davno-by-je-prece-zakazali-38474', - 'md5': '542ebc27baa3b2dd99d1671c12f5b28c', - 'info_dict': { - 'id': '38474', - 'ext': 'mp4', - 'title': 'Šťastné pondělí Jindřicha Šídla.', - 'description': 'Do voleb zbývají čtyři dny. Jindřich Šídlo proto přichází se zásadním doporučením voličům, jak se letos zachovat. 
Další díl satirického pořadu.', - } - }, { + # video with live stream URL 'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489', 'md5': '3da261b41d776b2c860c191f47517057', 'info_dict': { @@ -44,28 +37,32 @@ class SeznamZpravyIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - api_url = self._API_URL + 'v1/documents/{}'.format(video_id) - data = self._download_json(api_url, video_id) + data = self._download_json(self._API_URL + 'v1/documents/' + video_id, video_id) if 'video' in data['caption']: sdn_url = data['caption']['video']['sdn'] + self._MAGIC_SUFFIX else: - location_url = data['caption']['liveStreamUrl'] + self._MAGIC_SUFFIX - sdn_url = self._download_json(location_url, video_id)['Location'] - - sdn_data = self._download_json(sdn_url, video_id) + sdn_url = self._download_json(data['caption']['liveStreamUrl'] + self._MAGIC_SUFFIX, video_id)['Location'] formats = [] - for fmt, fmtdata in sdn_data['data']['mp4'].items(): - resolution = fmtdata.get('resolution') + for fmt, fmtdata in self._download_json(sdn_url, video_id)['data']['mp4'].items(): + relative_url = fmtdata.get('url') + if not relative_url: + continue + + try: + width, height = fmtdata.get('resolution') + except TypeError: + width, height = None, None + formats.append({ 'format_id': fmt, - 'width': int_or_none(resolution[0]) if resolution is not None else None, - 'height': int_or_none(resolution[1]) if resolution is not None else None, - 'url': urljoin(sdn_url, fmtdata['url']), + 'width': int_or_none(width), + 'height': int_or_none(height), + 'url': urljoin(sdn_url, relative_url), }) - formats.sort(key=lambda x: x['height']) + self._sort_formats(formats) return { 'id': video_id, From 4d8729978f151c7956ee30603781dcb777759ed1 Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Sat, 28 Oct 2017 23:01:11 +0200 Subject: [PATCH 03/15] [seznamzpravy] Removed sometimes-failing test md5 Second test can return at least 
two videos which are not binary identical, so removing the checksum. --- youtube_dl/extractor/seznamzpravy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index 7181eb408..869ffafc8 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -26,7 +26,6 @@ class SeznamZpravyIE(InfoExtractor): }, { # video with live stream URL 'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489', - 'md5': '3da261b41d776b2c860c191f47517057', 'info_dict': { 'id': '38489', 'ext': 'mp4', From defcd753ef25ea21a2d29a7ef57f4dcf57c77766 Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Sun, 29 Oct 2017 21:40:06 +0100 Subject: [PATCH 04/15] [seznamzpravy] Parse HLS and DASH Includes extension of generic MPD extractor and few more fixes per dstftw. --- youtube_dl/extractor/common.py | 9 +++++++++ youtube_dl/extractor/seznamzpravy.py | 21 +++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a69240693..20abd06f2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1979,6 +1979,15 @@ class InfoExtractor(object): }) segment_index += 1 representation_ms_info['fragments'] = fragments + elif 'segment_urls' in representation_ms_info: + # Segment URLs with no SegmentTimeline + # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 + fragments = [] + for segment_url in representation_ms_info['segment_urls']: + fragments.append({ + location_key(segment_url): segment_url, + }) + representation_ms_info['fragments'] = fragments # NB: MPD manifest may contain direct URLs to unfragmented media. # No fragments key is present in this case. 
if 'fragments' in representation_ms_info: diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index 869ffafc8..d1ca50479 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -16,7 +16,9 @@ class SeznamZpravyIE(InfoExtractor): _TESTS = [{ # video with SDN URL 'url': 'https://www.seznam.cz/zpravy/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', - 'md5': '855f9fed87bd93e48775d59671a3a3e3', + 'params': {'skip_download': True}, + # ^ this is here instead of 'file_minsize': 1586, which does not work because + # test_download.py forces expected_minsize to at least 10k when test is running 'info_dict': { 'id': '35990', 'ext': 'mp4', @@ -37,21 +39,23 @@ class SeznamZpravyIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) data = self._download_json(self._API_URL + 'v1/documents/' + video_id, video_id) + title = data['captionTitle'] if 'video' in data['caption']: sdn_url = data['caption']['video']['sdn'] + self._MAGIC_SUFFIX else: sdn_url = self._download_json(data['caption']['liveStreamUrl'] + self._MAGIC_SUFFIX, video_id)['Location'] + sdn_data = self._download_json(sdn_url, video_id) formats = [] - for fmt, fmtdata in self._download_json(sdn_url, video_id)['data']['mp4'].items(): + for fmt, fmtdata in sdn_data.get('data', {}).get('mp4', {}).items(): relative_url = fmtdata.get('url') if not relative_url: continue try: width, height = fmtdata.get('resolution') - except TypeError: + except (TypeError, ValueError): width, height = None, None formats.append({ @@ -61,11 +65,20 @@ class SeznamZpravyIE(InfoExtractor): 'url': urljoin(sdn_url, relative_url), }) + playlists = sdn_data.get('pls', {}) + dash_rel_url = playlists.get('dash', {}).get('url') + if dash_rel_url: + formats.extend(self._extract_mpd_formats(urljoin(sdn_url, dash_rel_url), video_id, mpd_id='dash', fatal=False)) + + hls_rel_url = 
playlists.get('hls', {}).get('url') + if hls_rel_url: + formats.extend(self._extract_m3u8_formats(urljoin(sdn_url, hls_rel_url), video_id, ext='mp4', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) return { 'id': video_id, - 'title': data['captionTitle'], + 'title': title, 'description': data.get('perex'), 'formats': formats, } From 255491dda70da23b2a079ecf8589ecca4027fc94 Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Sun, 29 Oct 2017 23:44:17 +0100 Subject: [PATCH 05/15] [seznamzpravy] Parse multiple videos Also use primarily title instead of captionTitle --- youtube_dl/extractor/seznamzpravy.py | 84 +++++++++++++++++++++------- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index d1ca50479..7b93e86bd 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -14,7 +14,7 @@ class SeznamZpravyIE(InfoExtractor): _MAGIC_SUFFIX = 'spl2,2,VOD' _TESTS = [{ - # video with SDN URL + # two videos on one page, with SDN URL 'url': 'https://www.seznam.cz/zpravy/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', 'params': {'skip_download': True}, # ^ this is here instead of 'file_minsize': 1586, which does not work because @@ -22,7 +22,7 @@ class SeznamZpravyIE(InfoExtractor): 'info_dict': { 'id': '35990', 'ext': 'mp4', - 'title': 'Svět bez obalu: Rozhovor s Václavem Marhoulem o zahraničních vojenských misích a aktivních zálohách.', + 'title': 'Jejich svět na nás útočí. Je lepší bránit se na jejich písečku, říká režisér a major v záloze Marhoul', 'description': 'O nasazení českých vojáků v zahraničí. Marhoul by na mise posílal i zálohy. 
„Nejdříve se ale musí vycvičit,“ říká.', } }, { @@ -31,21 +31,12 @@ class SeznamZpravyIE(InfoExtractor): 'info_dict': { 'id': '38489', 'ext': 'mp4', - 'title': 'Předseda KDU-ČSL Pavel Bělobrádek ve volební Výzvě Seznamu.', + 'title': 'ČSSD a ANO nás s elektronickou evidencí podrazily, říká šéf lidovců', 'description': 'Předvolební rozhovory s lídry deseti hlavních stran pokračují. Ve Výzvě Jindřicha Šídla odpovídal předseda lidovců Pavel Bělobrádek.', } }] - def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_json(self._API_URL + 'v1/documents/' + video_id, video_id) - title = data['captionTitle'] - - if 'video' in data['caption']: - sdn_url = data['caption']['video']['sdn'] + self._MAGIC_SUFFIX - else: - sdn_url = self._download_json(data['caption']['liveStreamUrl'] + self._MAGIC_SUFFIX, video_id)['Location'] - + def _extract_sdn_formats(self, sdn_url, video_id): sdn_data = self._download_json(sdn_url, video_id) formats = [] for fmt, fmtdata in sdn_data.get('data', {}).get('mp4', {}).items(): @@ -75,10 +66,65 @@ class SeznamZpravyIE(InfoExtractor): formats.extend(self._extract_m3u8_formats(urljoin(sdn_url, hls_rel_url), video_id, ext='mp4', m3u8_id='hls', fatal=False)) self._sort_formats(formats) + return formats - return { - 'id': video_id, - 'title': title, - 'description': data.get('perex'), - 'formats': formats, - } + def _extract_caption(self, api_data, video_id): + title = api_data.get('title') or api_data.get('captionTitle') + caption = api_data.get('caption') + if not title or not caption: + return {} + + if 'sdn' in caption.get('video', {}): + sdn_url = caption['video']['sdn'] + self._MAGIC_SUFFIX + elif 'liveStreamUrl' in caption: + sdn_url = self._download_json(caption['liveStreamUrl'] + self._MAGIC_SUFFIX, video_id)['Location'] + else: + return {} + + formats = self._extract_sdn_formats(sdn_url, video_id) + if formats: + return { + 'id': video_id, + 'title': title, + 'description': api_data.get('perex'), + 
'display_id': api_data.get('slug'), + 'formats': formats, + } + + def _extract_content(self, api_data, video_id): + entries = [] + for num, item in enumerate(api_data.get('content', [])): + media = item.get('properties', {}).get('media', {}) + sdn_url_part = media.get('video', {}).get('sdn') + title = media.get('title') + if not sdn_url_part or not title: + continue + + entry_id = '%s-%s' % (video_id, num) + formats = self._extract_sdn_formats(sdn_url_part + self._MAGIC_SUFFIX, entry_id) + if formats: + entries.append({ + 'id': entry_id, + 'title': title, + 'formats': formats, + }) + + return entries + + def _real_extract(self, url): + video_id = self._match_id(url) + api_data = self._download_json(self._API_URL + 'v1/documents/' + video_id, video_id) + + caption = self._extract_caption(api_data, video_id) + content = self._extract_content(api_data, video_id) + + if caption and not content: + return caption + else: + if caption: + content.insert(0, caption) + return { + '_type': 'playlist', + 'entries': content, + 'title': caption.get('title'), + } From d13cb7d70255fe1f80a662d6638e775cfad01386 Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Sun, 29 Oct 2017 23:49:38 +0100 Subject: [PATCH 06/15] [seznamzpravy] Fixed test --- youtube_dl/extractor/seznamzpravy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index 7b93e86bd..66edd13aa 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -31,7 +31,7 @@ class SeznamZpravyIE(InfoExtractor): 'info_dict': { 'id': '38489', 'ext': 'mp4', - 'title': 'ČSSD a ANO nás s elektronickou evidencí podrazily, říká šéf lidovců', + 'title': 'ČSSD a ANO nás s\xa0elektronickou evidencí podrazily, říká šéf lidovců', 'description': 'Předvolební rozhovory s lídry deseti hlavních stran pokračují. 
Ve Výzvě Jindřicha Šídla odpovídal předseda lidovců Pavel Bělobrádek.', } }] From 87c82b2b3d998375196a081ae589be4cd4c7e642 Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Sat, 4 Nov 2017 23:25:16 +0100 Subject: [PATCH 07/15] [test/test_download] In test we download 10000 bytes at max Also removed workaround in seznamzpravy extractor test. --- test/test_download.py | 2 +- youtube_dl/extractor/seznamzpravy.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 209f5f6d6..0513b6731 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -216,7 +216,7 @@ def generator(test_case, tname): expected_minsize = tc.get('file_minsize', 10000) if expected_minsize is not None: if params.get('test'): - expected_minsize = max(expected_minsize, 10000) + expected_minsize = min(expected_minsize, 10000) got_fsize = os.path.getsize(tc_filename) assertGreaterEqual( self, got_fsize, expected_minsize, diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index 66edd13aa..6bcc767d4 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -16,9 +16,7 @@ class SeznamZpravyIE(InfoExtractor): _TESTS = [{ # two videos on one page, with SDN URL 'url': 'https://www.seznam.cz/zpravy/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', - 'params': {'skip_download': True}, - # ^ this is here instead of 'file_minsize': 1586, which does not work because - # test_download.py forces expected_minsize to at least 10k when test is running + 'file_minsize': 1586, 'info_dict': { 'id': '35990', 'ext': 'mp4', From bf4f780ba74c055ffb6d94300267df5d55bff581 Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Fri, 24 Nov 2017 23:29:47 +0100 Subject: [PATCH 08/15] [seznamzpravy] Updated API URL --- youtube_dl/extractor/seznamzpravy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index 6bcc767d4..d2349aa74 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -10,7 +10,7 @@ from ..utils import ( class SeznamZpravyIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?seznam\.cz/zpravy/clanek/(?:[-a-z0-9]+)-(?P<id>[0-9]+)' - _API_URL = 'https://apiclanky.seznam.cz/' + _API_URL = 'https://apizpravy.seznam.cz/' _MAGIC_SUFFIX = 'spl2,2,VOD' _TESTS = [{ From 8e189bb1f91f495c7394da15e4899cde641b83ca Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Sat, 25 Nov 2017 01:51:20 +0100 Subject: [PATCH 09/15] Revert "[test/test_download] In test we download 10000 bytes at max" This reverts commit 87c82b2b3d998375196a081ae589be4cd4c7e642. --- test/test_download.py | 2 +- youtube_dl/extractor/seznamzpravy.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 0513b6731..209f5f6d6 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -216,7 +216,7 @@ def generator(test_case, tname): expected_minsize = tc.get('file_minsize', 10000) if expected_minsize is not None: if params.get('test'): - expected_minsize = min(expected_minsize, 10000) + expected_minsize = max(expected_minsize, 10000) got_fsize = os.path.getsize(tc_filename) assertGreaterEqual( self, got_fsize, expected_minsize, diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index d2349aa74..ed4e7da0c 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -16,7 +16,9 @@ class SeznamZpravyIE(InfoExtractor): _TESTS = [{ # two videos on one page, with SDN URL 'url': 'https://www.seznam.cz/zpravy/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', - 'file_minsize': 1586, + 'params': {'skip_download': True}, + # ^ this is here instead of 'file_minsize': 1586, which does not 
work because + # test_download.py forces expected_minsize to at least 10k when test is running 'info_dict': { 'id': '35990', 'ext': 'mp4', From 16ca00501a150bf419b98ac02052b2c608bdb9fd Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Sat, 25 Nov 2017 02:34:07 +0100 Subject: [PATCH 10/15] [seznamzpravy] use try_get --- youtube_dl/extractor/seznamzpravy.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index ed4e7da0c..f1a4fc933 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -2,9 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( urljoin, int_or_none, + try_get, ) @@ -39,7 +41,8 @@ class SeznamZpravyIE(InfoExtractor): def _extract_sdn_formats(self, sdn_url, video_id): sdn_data = self._download_json(sdn_url, video_id) formats = [] - for fmt, fmtdata in sdn_data.get('data', {}).get('mp4', {}).items(): + mp4_formats = try_get(sdn_data, lambda x: x['data']['mp4'], dict) or {} + for fmt, fmtdata in mp4_formats.items(): relative_url = fmtdata.get('url') if not relative_url: continue @@ -57,11 +60,11 @@ class SeznamZpravyIE(InfoExtractor): }) playlists = sdn_data.get('pls', {}) - dash_rel_url = playlists.get('dash', {}).get('url') + dash_rel_url = try_get(playlists, lambda x: x['dash']['url'], compat_str) if dash_rel_url: formats.extend(self._extract_mpd_formats(urljoin(sdn_url, dash_rel_url), video_id, mpd_id='dash', fatal=False)) - hls_rel_url = playlists.get('hls', {}).get('url') + hls_rel_url = try_get(playlists, lambda x: x['hls']['url'], compat_str) if hls_rel_url: formats.extend(self._extract_m3u8_formats(urljoin(sdn_url, hls_rel_url), video_id, ext='mp4', m3u8_id='hls', fatal=False)) From 548c00833084876dcf5a8c7ff7532864d27263bc Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Mon, 4 Dec 2017 02:45:45 +0100 Subject: [PATCH 11/15] 
[seznamzpravy] Split to article and iframe extractor --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/seznamzpravy.py | 155 ++++++++++++++++----------- 2 files changed, 99 insertions(+), 61 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d8860249a..dd5bb965e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -931,7 +931,10 @@ from .sendtonews import SendtoNewsIE from .servingsys import ServingSysIE from .servus import ServusIE from .sexu import SexuIE -from .seznamzpravy import SeznamZpravyIE +from .seznamzpravy import ( + SeznamZpravyIframeIE, + SeznamZpravyArticleIE, +) from .shahid import ShahidIE from .shared import ( SharedIE, diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index f1a4fc933..2667f7a8c 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -2,7 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlencode, + compat_urllib_parse_urlparse, +) from ..utils import ( urljoin, int_or_none, @@ -10,34 +15,10 @@ from ..utils import ( ) -class SeznamZpravyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?seznam\.cz/zpravy/clanek/(?:[-a-z0-9]+)-(?P<id>[0-9]+)' +class SeznamZpravyGenericIE(InfoExtractor): _API_URL = 'https://apizpravy.seznam.cz/' _MAGIC_SUFFIX = 'spl2,2,VOD' - _TESTS = [{ - # two videos on one page, with SDN URL - 'url': 'https://www.seznam.cz/zpravy/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', - 'params': {'skip_download': True}, - # ^ this is here instead of 'file_minsize': 1586, which does not work because - # test_download.py forces expected_minsize to at least 10k when test is running - 'info_dict': { - 'id': '35990', - 'ext': 
'mp4', - 'title': 'Jejich svět na nás útočí. Je lepší bránit se na jejich písečku, říká režisér a major v záloze Marhoul', - 'description': 'O nasazení českých vojáků v zahraničí. Marhoul by na mise posílal i zálohy. „Nejdříve se ale musí vycvičit,“ říká.', - } - }, { - # video with live stream URL - 'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489', - 'info_dict': { - 'id': '38489', - 'ext': 'mp4', - 'title': 'ČSSD a ANO nás s\xa0elektronickou evidencí podrazily, říká šéf lidovců', - 'description': 'Předvolební rozhovory s lídry deseti hlavních stran pokračují. Ve Výzvě Jindřicha Šídla odpovídal předseda lidovců Pavel Bělobrádek.', - } - }] - def _extract_sdn_formats(self, sdn_url, video_id): sdn_data = self._download_json(sdn_url, video_id) formats = [] @@ -71,63 +52,117 @@ class SeznamZpravyIE(InfoExtractor): self._sort_formats(formats) return formats - def _extract_caption(self, api_data, video_id): + def _raw_id(self, src_url): + return compat_urllib_parse_urlparse(src_url).path.split('/')[-1] + + +class SeznamZpravyIframeIE(SeznamZpravyGenericIE): + _VALID_URL = r'https?://(?:www\.)?seznam\.cz/zpravy/iframe/player\?.*\bsrc=' + _TESTS = [{ + 'url': 
r'https://www.seznam.cz/zpravy/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy', + 'params': {'skip_download': True}, # 'file_minsize': 1586 seems to get killed in test_download.py + 'info_dict': { + 'id': '170889', + 'ext': 'mp4', + 'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)', + } + }] + + def _real_extract(self, url): + params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + src = params['src'][0] + video_id = params.get('contentId', [self._raw_id(src)])[0] + + return { + 'id': video_id, + 'title': params['title'][0], + 'formats': self._extract_sdn_formats(src + self._MAGIC_SUFFIX, video_id), + } + + +class SeznamZpravyArticleIE(SeznamZpravyGenericIE): + _VALID_URL = r'https?://(?:www\.)?seznam\.cz/zpravy/clanek/(?:[-a-z0-9]+)-(?P<id>[0-9]+)' + _TESTS = [{ + # two videos on one page, with SDN URL + 'url': 'https://www.seznam.cz/zpravy/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', + 'params': {'skip_download': True}, + # ^ this is here instead of 'file_minsize': 1586, which does not work because + # 
test_download.py forces expected_minsize to at least 10k when test is running + 'info_dict': { + 'id': '170889', + 'ext': 'mp4', + 'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)', + } + }, { + # video with live stream URL + 'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489', + 'info_dict': { + 'id': '185688', + 'ext': 'mp4', + 'title': 'Předseda KDU-ČSL Pavel Bělobrádek ve volební Výzvě Seznamu', + } + }] + + def _extract_caption(self, api_data, article_id): title = api_data.get('title') or api_data.get('captionTitle') caption = api_data.get('caption') if not title or not caption: return {} if 'sdn' in caption.get('video', {}): - sdn_url = caption['video']['sdn'] + self._MAGIC_SUFFIX + src_url = caption['video']['sdn'] elif 'liveStreamUrl' in caption: - sdn_url = self._download_json(caption['liveStreamUrl'] + self._MAGIC_SUFFIX, video_id)['Location'] + src_url = self._download_json(caption['liveStreamUrl'], article_id)['Location'] else: return {} - formats = self._extract_sdn_formats(sdn_url, video_id) - if formats: - return { - 'id': video_id, - 'title': title, - 'description': api_data.get('perex'), - 'display_id': api_data.get('slug'), - 'formats': formats, - } + return { + 'id': caption.get('uid'), + 'title': caption.get('title'), + 'src': src_url, + } - def _extract_content(self, api_data, video_id): + def _extract_content(self, api_data): entries = [] for num, item in enumerate(api_data.get('content', [])): media = item.get('properties', {}).get('media', {}) - sdn_url_part = media.get('video', {}).get('sdn') + src_url = media.get('video', {}).get('sdn') title = media.get('title') - if not sdn_url_part or not title: + if not src_url or not title: continue - entry_id = '%s-%s' % (video_id, num) - formats = self._extract_sdn_formats(sdn_url_part + self._MAGIC_SUFFIX, entry_id) - if formats: - entries.append({ - 'id': entry_id, - 'title': title, - 'formats': formats, - }) + 
entries.append({ + 'id': media.get('uid'), + 'title': title, + 'src': src_url, + }) return entries - def _real_extract(self, url): - video_id = self._match_id(url) - api_data = self._download_json(self._API_URL + 'v1/documents/' + video_id, video_id) + def _iframe_result(self, info_dict): + video_id = info_dict['id'] or self._raw_id(info_dict['src']) + url = 'https://www.seznam.cz/zpravy/iframe/player?%s' % compat_urllib_parse_urlencode({ + 'src': info_dict['src'], + 'title': info_dict['title'], + 'contentId': video_id, + 'serviceName': 'Seznam Zprávy', + }) + return self.url_result(url, ie='SeznamZpravyIframe', video_id=video_id, video_title=info_dict['title']) - caption = self._extract_caption(api_data, video_id) - content = self._extract_content(api_data, video_id) + def _real_extract(self, url): + article_id = self._match_id(url) + api_data = self._download_json(self._API_URL + 'v1/documents/' + article_id, article_id) + + caption = self._extract_caption(api_data, article_id) + content = self._extract_content(api_data) if caption and not content: - return caption + return self._iframe_result(caption) else: if caption: content.insert(0, caption) - return { - '_type': 'playlist', - 'entries': content, - 'title': caption.get('title'), - } + return self.playlist_result( + [self._iframe_result(x) for x in content], + playlist_id=article_id, + playlist_title=api_data.get('title') or caption.get('title') + ) From 6f203862aa07eaa66aac63655e0361da953b61f7 Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Tue, 12 Dec 2017 19:06:42 +0100 Subject: [PATCH 12/15] [seznamzpravy] New URL --- youtube_dl/extractor/seznamzpravy.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index 2667f7a8c..80b227df7 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -57,9 +57,9 @@ class SeznamZpravyGenericIE(InfoExtractor): class 
SeznamZpravyIframeIE(SeznamZpravyGenericIE): - _VALID_URL = r'https?://(?:www\.)?seznam\.cz/zpravy/iframe/player\?.*\bsrc=' + _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/iframe/player\?.*\bsrc=' _TESTS = [{ - 'url': r'https://www.seznam.cz/zpravy/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy', + 'url': 
r'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy', 'params': {'skip_download': True}, # 'file_minsize': 1586 seems to get killed in test_download.py 'info_dict': { 'id': '170889', @@ -81,10 +81,10 @@ class SeznamZpravyIframeIE(SeznamZpravyGenericIE): class SeznamZpravyArticleIE(SeznamZpravyGenericIE): - _VALID_URL = r'https?://(?:www\.)?seznam\.cz/zpravy/clanek/(?:[-a-z0-9]+)-(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/clanek/(?:[-a-z0-9]+)-(?P<id>[0-9]+)' _TESTS = [{ # two videos on one page, with SDN URL - 'url': 'https://www.seznam.cz/zpravy/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', + 'url': 'https://www.seznamzpravy.cz/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', 'params': {'skip_download': True}, # ^ this is here instead of 'file_minsize': 1586, which does not work because # test_download.py forces expected_minsize to at least 10k when test is running From 7999450ae9fd865981628f38d517bfb87ae1f6e0 
Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Mon, 18 Dec 2017 20:10:42 +0100 Subject: [PATCH 13/15] [seznamzpravy] Fixes per dstftw --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/seznamzpravy.py | 52 +++++++++++++--------------- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 05623b582..88851f5f8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -931,7 +931,7 @@ from .servingsys import ServingSysIE from .servus import ServusIE from .sexu import SexuIE from .seznamzpravy import ( - SeznamZpravyIframeIE, + SeznamZpravyIE, SeznamZpravyArticleIE, ) from .shahid import ShahidIE diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index 80b227df7..83543e15b 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -5,19 +5,31 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_str, - compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, ) from ..utils import ( urljoin, int_or_none, try_get, + update_url_query, ) -class SeznamZpravyGenericIE(InfoExtractor): - _API_URL = 'https://apizpravy.seznam.cz/' - _MAGIC_SUFFIX = 'spl2,2,VOD' +def _raw_id(src_url): + return compat_urllib_parse_urlparse(src_url).path.split('/')[-1] + + +class SeznamZpravyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/iframe/player\?.*\bsrc=' + _TESTS = [{ + 'url': 
r'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy', 'params': {'skip_download': True}, # 'file_minsize': 1586 seems to get killed in test_download.py + 'info_dict': { + 'id': '170889', + 'ext': 'mp4', + 'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)', + } + }] def _extract_sdn_formats(self, sdn_url, video_id): sdn_data = self._download_json(sdn_url, video_id) @@ -52,36 +64,22 @@ class SeznamZpravyGenericIE(InfoExtractor): self._sort_formats(formats) return formats - def _raw_id(self, src_url): - return compat_urllib_parse_urlparse(src_url).path.split('/')[-1] - - -class SeznamZpravyIframeIE(SeznamZpravyGenericIE): - _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/iframe/player\?.*\bsrc=' - _TESTS = [{ - 'url': 
r'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy', 'params': {'skip_download': True}, # 'file_minsize': 1586 seems to get killed in test_download.py - 'info_dict': { - 'id': '170889', - 'ext': 'mp4', - 'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)', - } - }] - def _real_extract(self, url): params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) src = params['src'][0] - video_id = params.get('contentId', [self._raw_id(src)])[0] + video_id = params.get('contentId', [_raw_id(src)])[0] return { 'id': video_id, 'title': params['title'][0], - 'formats': self._extract_sdn_formats(src + self._MAGIC_SUFFIX, video_id), + 'formats': self._extract_sdn_formats(src + 'spl2,2,VOD', video_id), } -class SeznamZpravyArticleIE(SeznamZpravyGenericIE): +class SeznamZpravyArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/clanek/(?:[-a-z0-9]+)-(?P<id>[0-9]+)' + _API_URL = 'https://apizpravy.seznam.cz/' + _TESTS = [{ # two videos on one page, with SDN URL 'url': 
'https://www.seznamzpravy.cz/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', @@ -124,7 +122,7 @@ class SeznamZpravyArticleIE(SeznamZpravyGenericIE): def _extract_content(self, api_data): entries = [] - for num, item in enumerate(api_data.get('content', [])): + for item in api_data.get('content', []): media = item.get('properties', {}).get('media', {}) src_url = media.get('video', {}).get('sdn') title = media.get('title') @@ -140,14 +138,14 @@ class SeznamZpravyArticleIE(SeznamZpravyGenericIE): return entries def _iframe_result(self, info_dict): - video_id = info_dict['id'] or self._raw_id(info_dict['src']) - url = 'https://www.seznam.cz/zpravy/iframe/player?%s' % compat_urllib_parse_urlencode({ + video_id = info_dict['id'] or _raw_id(info_dict['src']) + url = update_url_query('https://www.seznam.cz/zpravy/iframe/player', { 'src': info_dict['src'], 'title': info_dict['title'], 'contentId': video_id, 'serviceName': 'Seznam Zprávy', }) - return self.url_result(url, ie='SeznamZpravyIframe', video_id=video_id, video_title=info_dict['title']) + return self.url_result(url, ie='SeznamZpravy', video_id=video_id, video_title=info_dict['title']) def _real_extract(self, url): article_id = self._match_id(url) From 0b74f2f934c30b516d349d44e903e94e117eca5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Caletka?= Date: Tue, 19 Dec 2017 14:07:06 +0100 Subject: [PATCH 14/15] [seznamzpravy] Add more metadata for SDN streams --- youtube_dl/extractor/seznamzpravy.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index 83543e15b..a60861de3 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -44,12 +44,21 @@ class SeznamZpravyIE(InfoExtractor): width, height = fmtdata.get('resolution') except (TypeError, ValueError): width, height = None, None + tbr = int_or_none(fmtdata['bandwidth']) 
+ if tbr: + tbr = int(tbr / 1000) + duration = int_or_none(fmtdata['duration']) + if duration: + duration = int(duration / 1000) formats.append({ 'format_id': fmt, 'width': int_or_none(width), 'height': int_or_none(height), 'url': urljoin(sdn_url, relative_url), + 'vcodec': fmtdata.get('codec'), + 'tbr': tbr, + 'duration': duration, }) playlists = sdn_data.get('pls', {}) From e0f78d00e091d93b6ac4ec90f07c89aaac683598 Mon Sep 17 00:00:00 2001 From: Petr Novak Date: Tue, 19 Dec 2017 16:15:24 +0100 Subject: [PATCH 15/15] [seznamzpravy] A bit shorter duration+tbr extraction --- youtube_dl/extractor/seznamzpravy.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index a60861de3..0d1e7668e 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -44,12 +44,6 @@ class SeznamZpravyIE(InfoExtractor): width, height = fmtdata.get('resolution') except (TypeError, ValueError): width, height = None, None - tbr = int_or_none(fmtdata['bandwidth']) - if tbr: - tbr = int(tbr / 1000) - duration = int_or_none(fmtdata['duration']) - if duration: - duration = int(duration / 1000) formats.append({ 'format_id': fmt, @@ -57,8 +51,8 @@ class SeznamZpravyIE(InfoExtractor): 'height': int_or_none(height), 'url': urljoin(sdn_url, relative_url), 'vcodec': fmtdata.get('codec'), - 'tbr': tbr, - 'duration': duration, + 'tbr': int_or_none(fmtdata.get('bandwidth'), scale=1000), + 'duration': int_or_none(fmtdata.get('duration'), scale=1000), }) playlists = sdn_data.get('pls', {})