From 3dcb639d07285d7eded13c1407b6e3d8dd24e26b Mon Sep 17 00:00:00 2001 From: AndersVittrup Date: Sat, 11 Jan 2020 09:04:55 +0100 Subject: [PATCH 1/8] Added DRTV playlist extractor --- youtube_dl/extractor/drtv.py | 69 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 70 insertions(+) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 390e79f8c..ae5594471 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -21,6 +21,8 @@ from ..utils import ( unified_timestamp, update_url_query, url_or_none, + urljoin, + base_url ) @@ -296,6 +298,73 @@ class DRTVIE(InfoExtractor): } +class DRTVPlaylistIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?(?:dr\.dk)/drtv/(?:serie|saeson)/ + ) + (?P[\da-z_-]+) + ''' + _TEST = { + 'url': 'https://www.dr.dk/drtv/serie/spise-med-price_43537', + 'info_dict': { + 'id': 'spise-med-price_43537', + 'title': 'Spise med Price' + }, + 'playlist_mincount': 2, + } + + @classmethod + def suitable(cls, url): + return False if DRTVIE.suitable(url) else super( + DRTVPlaylistIE, cls).suitable(url) + + def _extract_series(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + episodes = [] + for season in re.finditer(r'href="(?P/drtv/saeson/.+?)"', webpage): + season_url = urljoin(base_url(url), season.group('url')) + episodes = episodes + self._extract_episode_from_season(season_url) + + return episodes + + def _extract_episode_from_season(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + episodes = [] + + for episode in re.finditer(r'href="(?P/drtv/se/.+?)"', webpage): + episode_url = urljoin(base_url(url), episode.group('url')) + episodes.append(episode_url) + + return episodes + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + title = self._html_search_regex( + r'

(.+?)

', webpage, + 'title', default=None) + + if title: + title = re.sub(r'\s*\|\s*.+?$', '', title) + + episodes = [] + if 'serie' in url: + episodes = self._extract_series(url) + elif 'saeson' in url: + episodes = self._extract_episode_from_season(url) + + entries = [self.url_result(ep, ie=DRTVIE.ie_key()) for ep in episodes] + + return self.playlist_result(entries, playlist_id, title) + + class DRTVLiveIE(InfoExtractor): IE_NAME = 'drtv:live' _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P[\da-z-]+)' diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7b05f5410..4b8179315 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -277,6 +277,7 @@ from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE from .drtv import ( DRTVIE, + DRTVPlaylistIE, DRTVLiveIE, ) from .dtube import DTubeIE From 97fc210349a4607ef8e83df3e4b35382e61df27d Mon Sep 17 00:00:00 2001 From: AndersVittrup Date: Sat, 11 Jan 2020 09:11:30 +0100 Subject: [PATCH 2/8] New test with series only with --- youtube_dl/extractor/drtv.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index ae5594471..ef24924f9 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -306,7 +306,14 @@ class DRTVPlaylistIE(InfoExtractor): ) (?P[\da-z_-]+) ''' - _TEST = { + _TESTS = [{ + 'url': 'https://www.dr.dk/drtv/serie/tv-avisen-21_00_160258', + 'info_dict': { + 'id': 'tv-avisen-21_00_160258', + 'title': 'TV AVISEN 21:00' + }, + 'playlist_mincount': 2, + },{ 'url': 'https://www.dr.dk/drtv/serie/spise-med-price_43537', 'info_dict': { 'id': 'spise-med-price_43537', @@ -314,6 +321,7 @@ class DRTVPlaylistIE(InfoExtractor): }, 'playlist_mincount': 2, } + ] @classmethod def suitable(cls, url): @@ -328,6 +336,9 @@ class DRTVPlaylistIE(InfoExtractor): for season in re.finditer(r'href="(?P/drtv/saeson/.+?)"', webpage): season_url = urljoin(base_url(url), season.group('url')) episodes = episodes + self._extract_episode_from_season(season_url) + + if len(episodes) == 0: + episodes = episodes + self._extract_episode_from_season(url) return episodes From 87aac421fae805772a44fb80878d57753405f460 Mon Sep 17 00:00:00 2001 From: AndersVittrup Date: Sat, 11 Jan 2020 09:12:54 +0100 Subject: [PATCH 3/8] Fixed flake8 --- youtube_dl/extractor/drtv.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index ef24924f9..0a87dd684 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -306,21 +306,22 @@ class DRTVPlaylistIE(InfoExtractor): ) (?P[\da-z_-]+) ''' - _TESTS = [{ - 'url': 'https://www.dr.dk/drtv/serie/tv-avisen-21_00_160258', - 'info_dict': { - 'id': 'tv-avisen-21_00_160258', - 'title': 'TV AVISEN 21:00' - }, - 'playlist_mincount': 2, - },{ - 'url': 'https://www.dr.dk/drtv/serie/spise-med-price_43537', - 'info_dict': { - 'id': 'spise-med-price_43537', - 'title': 'Spise med Price' - }, - 'playlist_mincount': 2, - } + _TESTS = [ + { + 'url': 'https://www.dr.dk/drtv/serie/tv-avisen-21_00_160258', + 'info_dict': { + 'id': 'tv-avisen-21_00_160258', + 'title': 'TV AVISEN 21:00' + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://www.dr.dk/drtv/serie/spise-med-price_43537', + 'info_dict': { + 'id': 'spise-med-price_43537', + 'title': 'Spise med Price' + }, + 'playlist_mincount': 2, + } ] @classmethod @@ -336,7 +337,7 @@ class DRTVPlaylistIE(InfoExtractor): for season in re.finditer(r'href="(?P/drtv/saeson/.+?)"', webpage): season_url = urljoin(base_url(url), season.group('url')) episodes = episodes + self._extract_episode_from_season(season_url) - + if len(episodes) == 0: episodes = episodes + self._extract_episode_from_season(url) From eeb818adcc60ff26bba74845c0de69ad4cf8908c Mon Sep 17 00:00:00 2001 From: Anders Vittrup Date: Thu, 6 Feb 2020 13:46:46 +0100 Subject: [PATCH 4/8] Fixed Alphabetic --- youtube_dl/extractor/drtv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 0a87dd684..4073896ea 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -10,6 +10,7 @@ from .common import InfoExtractor from ..aes import aes_cbc_decrypt from ..compat import compat_urllib_parse_unquote from ..utils import ( + base_url, bytes_to_intlist, ExtractorError, int_or_none, @@ -21,8 +22,7 @@ from ..utils import ( unified_timestamp, update_url_query, url_or_none, - urljoin, - base_url + urljoin ) From 261e6dbfea5b6a7a2f6413d8c1f0f54285540333 Mon Sep 17 00:00:00 2001 From: Anders Vittrup Date: Thu, 6 Feb 2020 14:01:23 +0100 Subject: [PATCH 5/8] Fixed unused group --- youtube_dl/extractor/drtv.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 4073896ea..db5427a32 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -301,9 +301,7 @@ class DRTVIE(InfoExtractor): class DRTVPlaylistIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// - (?: - (?:www\.)?(?:dr\.dk)/drtv/(?:serie|saeson)/ - ) + (?:www\.)?(?:dr\.dk)/drtv/(?:serie|saeson)/ (?P[\da-z_-]+) ''' _TESTS = [ From 9b8cdd572fe9b20d71ce7cca0584ca9255c99312 Mon Sep 17 00:00:00 2001 From: Anders Vittrup Date: Thu, 6 Feb 2020 14:06:19 +0100 Subject: [PATCH 6/8] Added dr-massive.com to validator --- youtube_dl/extractor/drtv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index db5427a32..bc5627b7f 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -301,7 +301,7 @@ class DRTVIE(InfoExtractor): class DRTVPlaylistIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// - (?:www\.)?(?:dr\.dk)/drtv/(?:serie|saeson)/ + (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:serie|saeson)/ (?P[\da-z_-]+) ''' _TESTS = [ From 5279d2f4e1312e5525ffe76eaed51f9ee90738a1 Mon Sep 17 00:00:00 2001 From: AndersVittrup Date: Thu, 6 Feb 2020 21:49:55 +0100 Subject: [PATCH 7/8] Begin fixing series resolve --- youtube_dl/extractor/drtv.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index bc5627b7f..2f7709a51 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import binascii import hashlib import re - +import json from .common import InfoExtractor from ..aes import aes_cbc_decrypt @@ -21,6 +21,7 @@ from ..utils import ( try_get, unified_timestamp, update_url_query, + url_basename, url_or_none, urljoin ) @@ -352,11 +353,21 @@ class DRTVPlaylistIE(InfoExtractor): episodes.append(episode_url) return episodes + + def _extract_json_data(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + return json.loads(re.search(r'(?P{"app":.*?})<\/', webpage).group('json')) + def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + json = self._extract_json_data(url) + + + webpage = self._download_webpage(url, playlist_id) title = self._html_search_regex( r'

(.+?)

', webpage, 'title', default=None) @@ -364,11 +375,24 @@ class DRTVPlaylistIE(InfoExtractor): if title: title = re.sub(r'\s*\|\s*.+?$', '', title) + seasons = [] episodes = [] + base = re.search(r'(?P.*?/drtv)', url).group() + if 'serie' in url: - episodes = self._extract_series(url) + series_item = re.search(r'(?P/serie/[\da-z_-]+)', url).group('item') + seasons = [ i['path'] for i in json.get('cache', {}).get('page', {}).get(series_item, {}).get('item', {}).get('show', {}).get('seasons', {}).get('items', {}) ] elif 'saeson' in url: - episodes = self._extract_episode_from_season(url) + seasons = [url] + + episodes = [] + + ep = self._extract_json_data(base + seasons[0]) + items = ep.get('cache', {}).get('page', {}).get(seasons[0], {}).get('item', {}).get('episodes', {}).get('items', {}) + + episodes = [ + base + i['watchPath'] for i in items + ] entries = [self.url_result(ep, ie=DRTVIE.ie_key()) for ep in episodes] From cd0f150ce996a932c72a5f49fd8c9d56cdaf5e5e Mon Sep 17 00:00:00 2001 From: AndersVittrup Date: Fri, 7 Feb 2020 15:21:28 +0100 Subject: [PATCH 8/8] Fixed flake8 and now using json data for extraction --- youtube_dl/extractor/drtv.py | 112 ++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 2f7709a51..26a9ff082 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -10,7 +10,6 @@ from .common import InfoExtractor from ..aes import aes_cbc_decrypt from ..compat import compat_urllib_parse_unquote from ..utils import ( - base_url, bytes_to_intlist, ExtractorError, int_or_none, @@ -21,7 +20,6 @@ from ..utils import ( try_get, unified_timestamp, update_url_query, - url_basename, url_or_none, urljoin ) @@ -312,7 +310,7 @@ class DRTVPlaylistIE(InfoExtractor): 'id': 'tv-avisen-21_00_160258', 'title': 'TV AVISEN 21:00' }, - 'playlist_mincount': 2, + 'playlist_mincount': 20, }, { 'url': 'https://www.dr.dk/drtv/serie/spise-med-price_43537', 'info_dict': { @@ -320,6 +318,13 @@ class DRTVPlaylistIE(InfoExtractor): 'title': 'Spise med Price' }, 'playlist_mincount': 2, + }, { + 'url': 'https://www.dr.dk/drtv/saeson/spise-med-price_163641', + 'info_dict': { + 'id': 'spise-med-price_163641', + 'title': 'Spise med Price' + }, + 'playlist_mincount': 2, } ] @@ -328,46 +333,15 @@ class DRTVPlaylistIE(InfoExtractor): return False if DRTVIE.suitable(url) else super( DRTVPlaylistIE, cls).suitable(url) - def _extract_series(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - episodes = [] - for season in re.finditer(r'href="(?P/drtv/saeson/.+?)"', webpage): - season_url = urljoin(base_url(url), season.group('url')) - episodes = episodes + self._extract_episode_from_season(season_url) - - if len(episodes) == 0: - episodes = episodes + self._extract_episode_from_season(url) - - return episodes - - def _extract_episode_from_season(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - episodes = [] - - for episode in re.finditer(r'href="(?P/drtv/se/.+?)"', webpage): - episode_url = urljoin(base_url(url), episode.group('url')) - episodes.append(episode_url) - - return episodes - - def _extract_json_data(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - + def _extract_json_data(self, webpage): return json.loads(re.search(r'(?P{"app":.*?})<\/', webpage).group('json')) - def _real_extract(self, url): playlist_id = self._match_id(url) - - json = self._extract_json_data(url) - + base = re.search(r'(?P.*?/drtv/)', url).group() webpage = self._download_webpage(url, playlist_id) + json = self._extract_json_data(webpage) title = self._html_search_regex( r'

(.+?)

', webpage, 'title', default=None) @@ -375,26 +349,58 @@ class DRTVPlaylistIE(InfoExtractor): if title: title = re.sub(r'\s*\|\s*.+?$', '', title) - seasons = [] - episodes = [] - base = re.search(r'(?P.*?/drtv)', url).group() + def iterate_all(iterable, returned="key"): + """Returns an iterator that returns all keys or values + of a (nested) iterable. - if 'serie' in url: - series_item = re.search(r'(?P/serie/[\da-z_-]+)', url).group('item') - seasons = [ i['path'] for i in json.get('cache', {}).get('page', {}).get(series_item, {}).get('item', {}).get('show', {}).get('seasons', {}).get('items', {}) ] - elif 'saeson' in url: - seasons = [url] + Arguments: + - iterable: or + - returned: "key" or "value" + Returns: + - + """ + + if isinstance(iterable, dict): + for key, value in iterable.items(): + if returned == "key": + yield key + elif returned == "value": + if not (isinstance(value, dict) or isinstance(value, list)): + yield value + else: + raise ValueError("'returned' keyword only accepts 'key' or 'value'.") + for ret in iterate_all(value, returned=returned): + yield ret + elif isinstance(iterable, list): + for el in iterable: + for ret in iterate_all(el, returned=returned): + yield ret + + seasons = [url] + if 'saeson' not in url: + seasons = list(dict.fromkeys([ + re.search(r'/(?Psaeson/[\da-z_-]+)', str(i)).group('season') for i in list(iterate_all(json, "value")) + if re.search(r'/(saeson/[\da-z_-]+)', str(i)) + and i != re.search(r'drtv(?P/.+)', url).group('item') + ])) episodes = [] - - ep = self._extract_json_data(base + seasons[0]) - items = ep.get('cache', {}).get('page', {}).get(seasons[0], {}).get('item', {}).get('episodes', {}).get('items', {}) - - episodes = [ - base + i['watchPath'] for i in items - ] + for season in seasons: + if season == url: + season_data = json + else: + season_url = urljoin(base, season) + season_display_id = self._match_id(season_url) + season_webpage = self._download_webpage(season_url, season_display_id) + season_data = self._extract_json_data(season_webpage) - entries = [self.url_result(ep, ie=DRTVIE.ie_key()) for ep in episodes] + episodes.extend([ + re.search(r'/(?Pse/[\da-z_-]+)', str(i)).group('item') for i in list(iterate_all(season_data, "value")) + if re.search(r'/(se/[\da-z_-]+)', str(i)) + ]) + episodes = list(dict.fromkeys(episodes)) + + entries = [self.url_result(urljoin(base, ep), ie=DRTVIE.ie_key()) for ep in episodes] return self.playlist_result(entries, playlist_id, title)