From 95b8a52327dd2e71e6540e43cfa06942cc696a07 Mon Sep 17 00:00:00 2001 From: Michal Duda Date: Sun, 7 Oct 2018 21:33:26 +0200 Subject: [PATCH 1/3] [MallTv] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/malltv.py | 75 ++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) mode change 100644 => 100755 youtube_dl/extractor/extractors.py create mode 100644 youtube_dl/extractor/malltv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py old mode 100644 new mode 100755 index 464c8d690..175824fdf --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -595,6 +595,7 @@ from .mailru import ( MailRuMusicSearchIE, ) from .makertv import MakerTVIE +from .malltv import MallTvIE from .mangomolo import ( MangomoloVideoIE, MangomoloLiveIE, diff --git a/youtube_dl/extractor/malltv.py b/youtube_dl/extractor/malltv.py new file mode 100644 index 000000000..3351d5e8f --- /dev/null +++ b/youtube_dl/extractor/malltv.py @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + NO_DEFAULT, +) + + +class MallTvIE(InfoExtractor): + _VALID_URL = r'https://mall.tv/(?P[^/#?]+)' + _TEST = { + 'url': 'https://mall.tv/tajemstvi-nejkrupavejsich-kurecich-kridylek', + 'info_dict': { + 'id': 'tajemstvi-nejkrupavejsich-kurecich-kridylek', + 'ext': 'mp4', + 'title': 'Tajemství nejkřupavějších kuřecích křidýlek', + 'description': 'md5:f77cbb85d08745bfc85a2768fa34b57d', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 58.0, + 'upload_date': '20180912', + 'timestamp': 1536781320, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + # MAll.tv has malformed type atribute (i.e. missing quotes) + # + JSON_LD_RE_MALLTV_MALFORMED = r'(?is)]+type=application/ld\+json[^>]*>(?P.+?)' + + def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): + json_ld = self._search_regex( + self.JSON_LD_RE_MALLTV_MALFORMED, html, 'JSON-LD', group='json_ld', **kwargs) + default = kwargs.get('default', NO_DEFAULT) + if not json_ld: + return default if default is not NO_DEFAULT else {} + # JSON-LD may be malformed and thus `fatal` should be respected. + # At the same time `default` may be passed that assumes `fatal=False` + # for _search_regex. Let's simulate the same behavior here as well. + fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False + return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage, default=None) + description = self._og_search_description(webpage, default=None) + + ldjson = self._search_json_ld(webpage, video_id, default=None) + + # Again, the malform attribute + # + source = self._search_regex(re.compile(r' Date: Mon, 8 Oct 2018 18:34:59 +0200 Subject: [PATCH 2/3] [MallTv] Add new extractor - removed comments about malformed attrib values --- youtube_dl/extractor/malltv.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/malltv.py b/youtube_dl/extractor/malltv.py index 3351d5e8f..d0557b9c3 100644 --- a/youtube_dl/extractor/malltv.py +++ b/youtube_dl/extractor/malltv.py @@ -29,13 +29,11 @@ class MallTvIE(InfoExtractor): }, } - # MAll.tv has malformed type atribute (i.e. missing quotes) - # - JSON_LD_RE_MALLTV_MALFORMED = r'(?is)]+type=application/ld\+json[^>]*>(?P.+?)' + JSON_LD_RE_UNQUOTED_ATTRIB = r'(?is)]+type=application/ld\+json[^>]*>(?P.+?)' def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( - self.JSON_LD_RE_MALLTV_MALFORMED, html, 'JSON-LD', group='json_ld', **kwargs) + self.JSON_LD_RE_UNQUOTED_ATTRIB, html, 'JSON-LD', group='json_ld', **kwargs) default = kwargs.get('default', NO_DEFAULT) if not json_ld: return default if default is not NO_DEFAULT else {} @@ -47,7 +45,6 @@ class MallTvIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage, default=None) @@ -55,8 +52,6 @@ class MallTvIE(InfoExtractor): ldjson = self._search_json_ld(webpage, video_id, default=None) - # Again, the malform attribute - # source = self._search_regex(re.compile(r' Date: Mon, 8 Oct 2018 20:21:50 +0200 Subject: [PATCH 3/3] [MallTv] Add new extractor - added optional www subdomain --- youtube_dl/extractor/malltv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/malltv.py b/youtube_dl/extractor/malltv.py index d0557b9c3..48d7a7986 100644 --- a/youtube_dl/extractor/malltv.py +++ b/youtube_dl/extractor/malltv.py @@ -10,9 +10,9 @@ from ..utils import ( class MallTvIE(InfoExtractor): - _VALID_URL = r'https://mall.tv/(?P[^/#?]+)' + _VALID_URL = r'https://(?:www\.)?mall.tv/(?P[^/#?]+)' _TEST = { - 'url': 'https://mall.tv/tajemstvi-nejkrupavejsich-kurecich-kridylek', + 'url': 'https://www.mall.tv/tajemstvi-nejkrupavejsich-kurecich-kridylek', 'info_dict': { 'id': 'tajemstvi-nejkrupavejsich-kurecich-kridylek', 'ext': 'mp4',