1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-03-13 06:37:14 +08:00

[seznamzpravy] Split to article and iframe extractor

This commit is contained in:
Petr Novak 2017-12-04 02:45:45 +01:00
parent 16ca00501a
commit 548c008330
2 changed files with 99 additions and 61 deletions

View File

@ -931,7 +931,10 @@ from .sendtonews import SendtoNewsIE
from .servingsys import ServingSysIE from .servingsys import ServingSysIE
from .servus import ServusIE from .servus import ServusIE
from .sexu import SexuIE from .sexu import SexuIE
from .seznamzpravy import SeznamZpravyIE from .seznamzpravy import (
SeznamZpravyIframeIE,
SeznamZpravyArticleIE,
)
from .shahid import ShahidIE from .shahid import ShahidIE
from .shared import ( from .shared import (
SharedIE, SharedIE,

View File

@ -2,7 +2,12 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import (
compat_parse_qs,
compat_str,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
)
from ..utils import ( from ..utils import (
urljoin, urljoin,
int_or_none, int_or_none,
@ -10,34 +15,10 @@ from ..utils import (
) )
class SeznamZpravyIE(InfoExtractor): class SeznamZpravyGenericIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?seznam\.cz/zpravy/clanek/(?:[-a-z0-9]+)-(?P<id>[0-9]+)'
_API_URL = 'https://apizpravy.seznam.cz/' _API_URL = 'https://apizpravy.seznam.cz/'
_MAGIC_SUFFIX = 'spl2,2,VOD' _MAGIC_SUFFIX = 'spl2,2,VOD'
_TESTS = [{
# two videos on one page, with SDN URL
'url': 'https://www.seznam.cz/zpravy/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990',
'params': {'skip_download': True},
# ^ this is here instead of 'file_minsize': 1586, which does not work because
# test_download.py forces expected_minsize to at least 10k when test is running
'info_dict': {
'id': '35990',
'ext': 'mp4',
'title': 'Jejich svět na nás útočí. Je lepší bránit se na jejich písečku, říká režisér a major v záloze Marhoul',
'description': 'O nasazení českých vojáků v zahraničí. Marhoul by na mise posílal i zálohy. „Nejdříve se ale musí vycvičit,“ říká.',
}
}, {
# video with live stream URL
'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489',
'info_dict': {
'id': '38489',
'ext': 'mp4',
'title': 'ČSSD a ANO nás s\xa0elektronickou evidencí podrazily, říká šéf lidovců',
'description': 'Předvolební rozhovory s lídry deseti hlavních stran pokračují. Ve Výzvě Jindřicha Šídla odpovídal předseda lidovců Pavel Bělobrádek.',
}
}]
def _extract_sdn_formats(self, sdn_url, video_id): def _extract_sdn_formats(self, sdn_url, video_id):
sdn_data = self._download_json(sdn_url, video_id) sdn_data = self._download_json(sdn_url, video_id)
formats = [] formats = []
@ -71,63 +52,117 @@ class SeznamZpravyIE(InfoExtractor):
self._sort_formats(formats) self._sort_formats(formats)
return formats return formats
def _extract_caption(self, api_data, video_id): def _raw_id(self, src_url):
return compat_urllib_parse_urlparse(src_url).path.split('/')[-1]
class SeznamZpravyIframeIE(SeznamZpravyGenericIE):
_VALID_URL = r'https?://(?:www\.)?seznam\.cz/zpravy/iframe/player\?.*\bsrc='
_TESTS = [{
'url': r'https://www.seznam.cz/zpravy/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy',
'params': {'skip_download': True}, # 'file_minsize': 1586 seems to get killed in test_download.py
'info_dict': {
'id': '170889',
'ext': 'mp4',
'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)',
}
}]
def _real_extract(self, url):
params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
src = params['src'][0]
video_id = params.get('contentId', [self._raw_id(src)])[0]
return {
'id': video_id,
'title': params['title'][0],
'formats': self._extract_sdn_formats(src + self._MAGIC_SUFFIX, video_id),
}
class SeznamZpravyArticleIE(SeznamZpravyGenericIE):
_VALID_URL = r'https?://(?:www\.)?seznam\.cz/zpravy/clanek/(?:[-a-z0-9]+)-(?P<id>[0-9]+)'
_TESTS = [{
# two videos on one page, with SDN URL
'url': 'https://www.seznam.cz/zpravy/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990',
'params': {'skip_download': True},
# ^ this is here instead of 'file_minsize': 1586, which does not work because
# test_download.py forces expected_minsize to at least 10k when test is running
'info_dict': {
'id': '170889',
'ext': 'mp4',
'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)',
}
}, {
# video with live stream URL
'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489',
'info_dict': {
'id': '185688',
'ext': 'mp4',
'title': 'Předseda KDU-ČSL Pavel Bělobrádek ve volební Výzvě Seznamu',
}
}]
def _extract_caption(self, api_data, article_id):
title = api_data.get('title') or api_data.get('captionTitle') title = api_data.get('title') or api_data.get('captionTitle')
caption = api_data.get('caption') caption = api_data.get('caption')
if not title or not caption: if not title or not caption:
return {} return {}
if 'sdn' in caption.get('video', {}): if 'sdn' in caption.get('video', {}):
sdn_url = caption['video']['sdn'] + self._MAGIC_SUFFIX src_url = caption['video']['sdn']
elif 'liveStreamUrl' in caption: elif 'liveStreamUrl' in caption:
sdn_url = self._download_json(caption['liveStreamUrl'] + self._MAGIC_SUFFIX, video_id)['Location'] src_url = self._download_json(caption['liveStreamUrl'], article_id)['Location']
else: else:
return {} return {}
formats = self._extract_sdn_formats(sdn_url, video_id) return {
if formats: 'id': caption.get('uid'),
return { 'title': caption.get('title'),
'id': video_id, 'src': src_url,
'title': title, }
'description': api_data.get('perex'),
'display_id': api_data.get('slug'),
'formats': formats,
}
def _extract_content(self, api_data, video_id): def _extract_content(self, api_data):
entries = [] entries = []
for num, item in enumerate(api_data.get('content', [])): for num, item in enumerate(api_data.get('content', [])):
media = item.get('properties', {}).get('media', {}) media = item.get('properties', {}).get('media', {})
sdn_url_part = media.get('video', {}).get('sdn') src_url = media.get('video', {}).get('sdn')
title = media.get('title') title = media.get('title')
if not sdn_url_part or not title: if not src_url or not title:
continue continue
entry_id = '%s-%s' % (video_id, num) entries.append({
formats = self._extract_sdn_formats(sdn_url_part + self._MAGIC_SUFFIX, entry_id) 'id': media.get('uid'),
if formats: 'title': title,
entries.append({ 'src': src_url,
'id': entry_id, })
'title': title,
'formats': formats,
})
return entries return entries
def _real_extract(self, url): def _iframe_result(self, info_dict):
video_id = self._match_id(url) video_id = info_dict['id'] or self._raw_id(info_dict['src'])
api_data = self._download_json(self._API_URL + 'v1/documents/' + video_id, video_id) url = 'https://www.seznam.cz/zpravy/iframe/player?%s' % compat_urllib_parse_urlencode({
'src': info_dict['src'],
'title': info_dict['title'],
'contentId': video_id,
'serviceName': 'Seznam Zprávy',
})
return self.url_result(url, ie='SeznamZpravyIframe', video_id=video_id, video_title=info_dict['title'])
caption = self._extract_caption(api_data, video_id) def _real_extract(self, url):
content = self._extract_content(api_data, video_id) article_id = self._match_id(url)
api_data = self._download_json(self._API_URL + 'v1/documents/' + article_id, article_id)
caption = self._extract_caption(api_data, article_id)
content = self._extract_content(api_data)
if caption and not content: if caption and not content:
return caption return self._iframe_result(caption)
else: else:
if caption: if caption:
content.insert(0, caption) content.insert(0, caption)
return { return self.playlist_result(
'_type': 'playlist', [self._iframe_result(x) for x in content],
'entries': content, playlist_id=article_id,
'title': caption.get('title'), playlist_title=api_data.get('title') or caption.get('title')
} )