From 44f48e75218d40e53ece247f3bd2a724888536ba Mon Sep 17 00:00:00 2001
From: Ales Jirasek
Date: Wed, 10 Oct 2018 23:47:21 +0200
Subject: [PATCH] [MallTV] Add new extractor, better regexe

update generic JSON_LD_RE to find json_ld in attributes without quotes,
closes #18058
---
Reviewer notes (everything between "---" and the first "diff --git" line is
ignored when the patch is applied):

* This patch was recovered from a copy in which all line breaks had been
  collapsed and every "<...>" span had been stripped. The line structure
  below was rebuilt so that each hunk matches its "@@" header
  (6->7, 7->7, 0->71 and 7->7 lines).
* Restored named groups: "(?P<id>" in _VALID_URL (the group name youtube-dl
  extractors use for the video id), "(?P<src>" in the m3u8 pattern (the
  call passes group='src'), and "<script[^>", "(?P<json_ld>", "</script>"
  in both JSON_LD_RE lines (as in upstream youtube_dl/utils.py).
* ASSUMPTION - TODO confirm against the original commit: the head of the
  m3u8 pattern, "<source src=([\"'])?", is inferred from the surviving
  "\1?" back-reference and from group='src'; the tag/attribute names were
  lost.
* ASSUMPTION - TODO confirm: the split between the Subject line and the
  message body was not recoverable and is a best guess.
* The author's e-mail address in "From:" was stripped and is NOT restored
  here; add it back before using "git am".
* NOTE(review): the "WeiboIE," -/+ pair in extractors.py differs only in
  whitespace (reproduced here as a trailing space). It looks unrelated to
  this change; consider dropping that hunk.

 youtube_dl/extractor/extractors.py |  3 +-
 youtube_dl/extractor/malltv.py     | 71 ++++++++++++++++++++++++++++++
 youtube_dl/utils.py                |  2 +-
 3 files changed, 74 insertions(+), 2 deletions(-)
 create mode 100644 youtube_dl/extractor/malltv.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 464c8d690..268b2cbb1 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -595,6 +595,7 @@ from .mailru import (
     MailRuMusicSearchIE,
 )
 from .makertv import MakerTVIE
+from .malltv import MallTVIE
 from .mangomolo import (
     MangomoloVideoIE,
     MangomoloLiveIE,
@@ -1367,7 +1368,7 @@ from .webofstories import (
     WebOfStoriesPlaylistIE,
 )
 from .weibo import (
-    WeiboIE,
+    WeiboIE, 
     WeiboMobileIE
 )
 from .weiqitv import WeiqiTVIE
diff --git a/youtube_dl/extractor/malltv.py b/youtube_dl/extractor/malltv.py
new file mode 100644
index 000000000..512d999c9
--- /dev/null
+++ b/youtube_dl/extractor/malltv.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class MallTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?mall\.tv/(?:.+/)?(?P<id>.+)'
+    _TESTS = [
+        {
+            'url': ('https://www.mall.tv/18-miliard-pro-neziskovky'
+                    '-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice'),
+            'info_dict': {
+                'id': ('18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-'
+                       'clovek-v-tisni-pijavice'),
+                'ext': 'mp4',
+                'title': ('18 miliard pro neziskovky. Opravdu jsou sportovci '
+                          'nebo Člověk v tísni pijavice?'),
+                'description': ('Pokud někdo hospodaří s penězmi daňových '
+                                'poplatníků, pak logicky chceme vědět, jak s '
+                                'nimi nakládá. Objem dotací pro neziskovky '
+                                'roste, ale opravdu jsou tyto organizace '
+                                '„pijavice", jak o nich hovoří And')
+            },
+            'params': {
+                'skip_download': True
+            }
+        },
+        {
+            'url': ('https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky'
+                    '-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice'),
+            'info_dict': {
+                'id': ('18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-'
+                       'clovek-v-tisni-pijavice'),
+                'ext': 'mp4',
+                'title': ('18 miliard pro neziskovky. Opravdu jsou sportovci '
+                          'nebo Člověk v tísni pijavice?'),
+                'description': ('Pokud někdo hospodaří s penězmi daňových '
+                                'poplatníků, pak logicky chceme vědět, jak s '
+                                'nimi nakládá. Objem dotací pro neziskovky '
+                                'roste, ale opravdu jsou tyto organizace '
+                                '„pijavice", jak o nich hovoří And')
+            },
+            'params': {
+                'skip_download': True
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        info = self._search_json_ld(webpage, video_id, default={})
+
+        format_url = self._html_search_regex(
+            r'<source src=(["\'])?(?P<src>.+?index)\1?[^>]*?>',
+            webpage, 'm3u8 URL', group='src')
+        formats = self._extract_m3u8_formats(format_url+'.m3u8',
+                                             video_id, 'mp4')
+        self._sort_formats(formats)
+        title = info.get('title', self._og_search_title(webpage, fatal=False))
+        thumbnail = info.get('thumbnailUrl', self._og_search_thumbnail(webpage))
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'description': self._og_search_description(webpage),
+            'duration': parse_duration(info.get('duration')),
+            'formats': formats
+        }
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index e84d35d4d..4f21d8821 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -183,7 +183,7 @@ DATE_FORMATS_MONTH_FIRST.extend([
 ])
 
 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
-JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
+JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])?application/ld\+json\1?[^>]*>(?P<json_ld>.+?)</script>'
 
 
 def preferredencoding():