From 4c2279c34f3e828df80373173195bdd9f87215c0 Mon Sep 17 00:00:00 2001 From: Forthrin Date: Wed, 8 May 2019 07:51:14 +0200 Subject: [PATCH 1/2] [tv2] Updated extractor according to site changes (was broken) --- youtube_dl/extractor/tv2.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index d5071e8a5..0f34f4941 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -15,12 +15,12 @@ from ..utils import ( class TV2IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/.*/(?P\d+)/?$' _TEST = { 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { 'id': '916509', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Se Frode Gryttens hyllest av Steven Gerrard', 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.', 'timestamp': 1431715610, @@ -37,14 +37,19 @@ class TV2IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + assets = re.findall(r'assetId\s*:\s*(\d+)', webpage) formats = [] format_urls = [] for protocol in ('HDS', 'HLS'): - data = self._download_json( - 'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol), - video_id, 'Downloading play JSON')['playback'] - for item in data['items']['item']: + items = self._download_json( + 'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (assets[0], protocol), + video_id, 'Downloading play JSON')['playback']['items']['item'] + # the item/items elements have a non-intuitive, non-reliable layout + if not isinstance(items, list): + items = [items] + for item in items: video_url = item.get('url') if not video_url or video_url in format_urls: continue @@ -72,7 +77,7 @@ class TV2IE(InfoExtractor): self._sort_formats(formats) asset = self._download_json( - 'http://sumo.tv2.no/api/web/asset/%s.json' % video_id, + 'http://sumo.tv2.no/api/web/asset/%s.json' % assets[0], video_id, 'Downloading metadata JSON')['asset'] title = asset['title'] @@ -108,7 +113,7 @@ class TV2ArticleIE(InfoExtractor): 'info_dict': { 'id': '6930542', 'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret', - 'description': 'md5:339573779d3eea3542ffe12006190954', + 'description': 'De fire siktede nekter fortsatt for å ha stjålet pingvinbabyene, men innrømmer å ha åpnet luken til de små kyllingene.', }, 'playlist_count': 2, }, { @@ -121,8 +126,7 @@ class TV2ArticleIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - # Old embed pattern (looks unused nowadays) - assets = re.findall(r'data-assetid=["\'](\d+)', webpage) + assets = re.findall(r'assetId\s*:\s*(\d+)', webpage) if not assets: # New embed pattern From 2e938416402b61739f6611da229dec47faa2847a Mon Sep 17 00:00:00 2001 From: Forthrin Date: Wed, 8 May 2019 08:32:33 +0200 Subject: [PATCH 2/2] [tv2] Removed TV2ArticleIE as TV2IE can handle all necessary extraction --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/tv2.py | 94 ++++++++++-------------------- 2 files changed, 33 insertions(+), 66 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 0d0732dcb..6e02aec0b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1201,10 +1201,7 @@ from .tunein import ( from .tunepk import TunePkIE from .turbo import TurboIE from .tutv import TutvIE -from .tv2 import ( - TV2IE, - TV2ArticleIE, -) +from .tv2 import TV2IE from .tv2hu import TV2HuIE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 0f34f4941..ab9fb263d 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -8,32 +8,45 @@ from ..utils import ( determine_ext, int_or_none, float_or_none, - js_to_json, parse_iso8601, - remove_end, ) class TV2IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tv2\.no/.*/(?P\d+)/?$' - _TEST = { - 'url': 'http://www.tv2.no/v/916509/', - 'info_dict': { - 'id': '916509', - 'ext': 'flv', - 'title': 'Se Frode Gryttens hyllest av Steven Gerrard', - 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.', - 'timestamp': 1431715610, - 'upload_date': '20150515', - 'duration': 156.967, - 'view_count': int, - 'categories': list, + _TESTS = [ + { + 'url': 'http://www.tv2.no/v/916509/', + 'info_dict': { + 'id': '916509', + 'ext': 'flv', + 'title': 'Se Frode Gryttens hyllest av Steven Gerrard', + 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.', + 'timestamp': 1431715610, + 'upload_date': '20150515', + 'duration': 156.967, + 'view_count': int, + 'categories': list, + }, + 'params': { + 'skip_download': True, + }, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } + { + 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', + 'info_dict': { + 'id': '6930542', + 'title': 'Russen hetset etter pingvintyveriet', + 'description': 'Etter at fire russ er siktet for pinvintyveriet i Atlandethavsparken i Ålesund opplever resten av byens russ å bli hetset på åpen gate.', + 'upload_date': '20150516', + 'timestamp': 1431803333, + 'ext': 'flv', + }, + 'params': { + 'skip_download': True, + }, + } + ] def _real_extract(self, url): video_id = self._match_id(url) @@ -104,46 +117,3 @@ class TV2IE(InfoExtractor): 'categories': categories, 'formats': formats, } - - -class TV2ArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P\d+)' - _TESTS = [{ - 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', - 'info_dict': { - 'id': '6930542', - 'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret', - 'description': 'De fire siktede nekter fortsatt for å ha stjålet pingvinbabyene, men innrømmer å ha åpnet luken til de små kyllingene.', - }, - 'playlist_count': 2, - }, { - 'url': 'http://www.tv2.no/a/6930542', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - assets = re.findall(r'assetId\s*:\s*(\d+)', webpage) - - if not assets: - # New embed pattern - for v in re.findall(r'TV2ContentboxVideo\(({.+?})\)', webpage): - video = self._parse_json( - v, playlist_id, transform_source=js_to_json, fatal=False) - if not video: - continue - asset = video.get('assetId') - if asset: - assets.append(asset) - - entries = [ - self.url_result('http://www.tv2.no/v/%s' % asset_id, 'TV2') - for asset_id in assets] - - title = remove_end(self._og_search_title(webpage), ' - TV2.no') - description = remove_end(self._og_search_description(webpage), ' - TV2.no') - - return self.playlist_result(entries, playlist_id, title, description)