From d05e77148c0a656ead9383901c2efa80c4020284 Mon Sep 17 00:00:00 2001
From: Olivier Mehani
Date: Sat, 14 Dec 2019 00:05:25 +1100
Subject: [PATCH] [abc:iview:shows] Handle human-friendly landing pages

Add an extractor to redirect human-friendly page URLs to their canonical
video URL.

This includes 'movie length' TV shows (#16868)

Signed-off-by: Olivier Mehani
---
Review note: the abc.py payload below was revised during review, so the
post-image blob id on its index line is that of the original submission.

 youtube_dl/extractor/abc.py        | 65 ++++++++++++++++++++++++++++++
 youtube_dl/extractor/extractors.py |  1 +
 2 files changed, 66 insertions(+)

diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py
index 4ac323bf6..3620fec4f 100644
--- a/youtube_dl/extractor/abc.py
+++ b/youtube_dl/extractor/abc.py
@@ -191,3 +191,68 @@ class ABCIViewIE(InfoExtractor):
             'subtitles': subtitles,
             'is_live': is_live,
         }
+
+
+class ABCIViewShowIE(InfoExtractor):
+    """
+    Stub extractor for human-friendly iview show landing pages.
+
+    It looks up the canonical video URL embedded in the landing page and
+    hands it over to ABCIViewIE.
+    """
+    IE_NAME = 'abc.net.au:iview:shows'
+    _VALID_URL = r'https?://iview\.abc\.net\.au/show/(?P<id>[^/?#]+)'
+    # The canonical URL to look for; the pattern expects backslash-escaped
+    # double quotes around both the key and the value
+    _CANONICAL_URL = r'd_canonicalUrl\\":\\"(?P<url>https://iview\.abc\.net\.au/video/(?P<id>[^/?#\\"]+))\\"'
+    _GEO_COUNTRIES = ['AU']
+
+    # ABC iview programs are normally available for 14 days only.
+    _TESTS = [{
+        'url': 'https://iview.abc.net.au/show/stick-man',
+        'md5': 'cde42d728b3b7c2b32b1b94b4a548afc',
+        'info_dict': {
+            'id': 'ZW0021A001S00',
+            'ext': 'mp4',
+            'title': "Stick Man",
+            'series': "Stick Man",
+            'description': 'md5:ffc3ab0c9df0255d646924dbd29fa0d5',
+            'uploader_id': 'abc4kids',
+            'timestamp': 1576249200,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        canonical_url = self._match_canonical_url(webpage)
+        self.report_canonical_url(video_id, canonical_url)
+
+        # Hand the canonical URL back to the downloader, which dispatches it
+        # to ABCIViewIE, instead of instantiating that extractor by hand
+        return self.url_result(canonical_url, ie=ABCIViewIE.ie_key())
+
+    # The below method may be moved to common.py if the redirection
+    # to canonical URL pattern is more widespread
+    @classmethod
+    def _match_canonical_url(cls, webpage):
+        """Return the canonical video URL found in webpage.
+
+        Raises ExtractorError if the page does not contain one.
+        """
+        # Local import: the module-level imports of abc.py are outside this hunk
+        from ..utils import ExtractorError
+
+        # The re module caches compiled patterns, no need to do it by hand
+        m = re.search(cls._CANONICAL_URL, webpage)
+        if m is None:
+            # An assert would be stripped when running under python -O
+            raise ExtractorError('Unable to find canonical URL')
+        return compat_str(m.group('url'))
+
+    def report_canonical_url(self, video_id, canonical_url):
+        """Report URL redirect."""
+        self.to_screen('%s: Canonical URL: %s' % (video_id, canonical_url))
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index fd93730fa..e50f3192e 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from .abc import (
     ABCIE,
     ABCIViewIE,
+    ABCIViewShowIE,
 )
 from .abcnews import (
     AbcNewsIE,