From 3c3e04c97541daa0937d38b405d600d454e4f5a1 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Mon, 9 Jan 2017 21:19:55 +0100 Subject: [PATCH 1/9] [twentymin] Began to fix 20min.ch extractor. --- youtube_dl/extractor/twentymin.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index b721ecb0a..2f93bdb36 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -60,6 +60,7 @@ class TwentyMinutenIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id + print('DISPLAY_ID: {}'.format(display_id)) webpage = self._download_webpage(url, display_id) @@ -75,13 +76,23 @@ class TwentyMinutenIE(InfoExtractor): if not title: title = remove_end(re.sub( r'^20 [Mm]inuten.*? -', '', self._og_search_title(webpage)), ' - News') + print('TITLE: {}'.format(title)) + # if not video_id: + # video_id = self._search_regex( + # r'"file\d?"\s*,\s*\"(\d+)', webpage, 'video id') if not video_id: - video_id = self._search_regex( - r'"file\d?"\s*,\s*\"(\d+)', webpage, 'video id') + videoplayer_url = self._html_search_regex( + r']+src="((?:https?:)?//www\.20min\.ch/videoplayer/videoplayer\.html\?params=*?[^"]+)"', + webpage, '20min embed URL', default=None) + vid = re.match(r'videoID@\d+', videoplayer_url) + print(vid) + + description = self._html_search_meta( 'description', webpage, 'description') + print('DESCRIPTION: {}'.format(description)) thumbnail = self._og_search_thumbnail(webpage) return { From 69807d59ed02b6adf393cc2c116b6ef27e8decca Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Tue, 10 Jan 2017 11:01:14 +0100 Subject: [PATCH 2/9] [twentymin] Updated the 20min extractor to support the site after the update. --- youtube_dl/extractor/twentymin.py | 32 +++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index 2f93bdb36..bc7fb8c13 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -13,7 +13,7 @@ class TwentyMinutenIE(InfoExtractor): _TESTS = [{ # regular video 'url': 'http://www.20min.ch/videotv/?vid=469148&cid=2', - 'md5': 'b52d6bc6ea6398e6a38f12cfd418149c', + 'md5': 'e7264320db31eed8c38364150c12496e', 'info_dict': { 'id': '469148', 'ext': 'flv', @@ -34,6 +34,18 @@ class TwentyMinutenIE(InfoExtractor): 'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg' }, 'skip': '"This video is no longer available" is shown both on the web page and in the downloaded file.', + }, { + # news article with video + 'url': 'http://www.20min.ch/schweiz/news/story/So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552', + 'md5': '807f9e1e06a69b77440a9b315e52e580', + 'info_dict': { + 'id': '523629', + 'display_id': 'So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552', + 'ext': 'mp4', + 'title': 'So kommen Sie bei Eis und Schnee sicher an', + 'description': 'Schneegestöber und Glatteis führten in den letzten Tagen zu zahlreichen Strassenunfällen. Ein Experte erklärt, worauf man nun beim Autofahren achten muss.', + 'thumbnail': 'http://www.20min.ch/images/content/2/7/0/27032552/81/teaserbreit.jpg', + } }, { # YouTube embed 'url': 'http://www.20min.ch/ro/sports/football/story/Il-marque-une-bicyclette-de-plus-de-30-metres--21115184', @@ -78,27 +90,27 @@ class TwentyMinutenIE(InfoExtractor): r'^20 [Mm]inuten.*? -', '', self._og_search_title(webpage)), ' - News') print('TITLE: {}'.format(title)) - # if not video_id: - # video_id = self._search_regex( - # r'"file\d?"\s*,\s*\"(\d+)', webpage, 'video id') if not video_id: - videoplayer_url = self._html_search_regex( - r']+src="((?:https?:)?//www\.20min\.ch/videoplayer/videoplayer\.html\?params=*?[^"]+)"', + params = self._html_search_regex( + r']+src="(?:https?:)?//www\.20min\.ch/videoplayer/videoplayer\.html\?params=(.+?[^"])"', webpage, '20min embed URL', default=None) - vid = re.match(r'videoID@\d+', videoplayer_url) - print(vid) - + print('PARMAS: {}'.format(params)) + video_id = self._search_regex( + r'.*videoId@(\d+)', + params, 'Video Id', default=None) if params is not None else '' + print('VIDEO ID: {}'.format(video_id)) description = self._html_search_meta( 'description', webpage, 'description') print('DESCRIPTION: {}'.format(description)) thumbnail = self._og_search_thumbnail(webpage) + print('THUMBNAIL: {}'.format(thumbnail)) return { 'id': video_id, 'display_id': display_id, - 'url': 'http://speed.20min-tv.ch/%sm.flv' % video_id, + 'url': 'http://podcast.20min-tv.ch/podcast/20min/%sh.mp4' % video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, From eaffc609a7fe6db645e35c36a5969806f4a040cd Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Wed, 11 Jan 2017 19:17:05 +0100 Subject: [PATCH 3/9] [twentymin] Fixed 20min information extractor. Tried to add better video description handler, but the site seems to be too inconsistent... --- youtube_dl/extractor/twentymin.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index bc7fb8c13..2fe6d3a22 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -4,7 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( + remove_end, + ExtractorError, + clean_html, + get_element_by_class +) class TwentyMinutenIE(InfoExtractor): @@ -99,10 +104,16 @@ class TwentyMinutenIE(InfoExtractor): r'.*videoId@(\d+)', params, 'Video Id', default=None) if params is not None else '' print('VIDEO ID: {}'.format(video_id)) + if not video_id: # the article does not contain a video + raise ExtractorError('No media links found on %s.' % url, expected=True) - - description = self._html_search_meta( - 'description', webpage, 'description') + # # Try to use the real video description: + # description = clean_html(get_element_by_class('caption', webpage)) + # # Otherwise, use the lead text of the article as the video description: + # if not description: + # description = self._html_search_meta( + # 'description', webpage, 'description') + description = self._html_search_meta('description', webpage, 'description') print('DESCRIPTION: {}'.format(description)) thumbnail = self._og_search_thumbnail(webpage) print('THUMBNAIL: {}'.format(thumbnail)) From 451ae1f8b1bab294e978c61f576dcf8409dff1c9 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Wed, 11 Jan 2017 19:27:44 +0100 Subject: [PATCH 4/9] [twentymin] Clean up. --- youtube_dl/extractor/twentymin.py | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index 2fe6d3a22..f9e40d7c1 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -6,9 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( remove_end, - ExtractorError, - clean_html, - get_element_by_class + ExtractorError ) @@ -21,7 +19,7 @@ class TwentyMinutenIE(InfoExtractor): 'md5': 'e7264320db31eed8c38364150c12496e', 'info_dict': { 'id': '469148', - 'ext': 'flv', + 'ext': 'mp4', 'title': '85 000 Franken für 15 perfekte Minuten', 'description': 'Was die Besucher vom Silvesterzauber erwarten können. (Video: Alice Grosjean/Murat Temel)', 'thumbnail': 'http://thumbnails.20min-tv.ch/server063/469148/frame-72-469148.jpg' @@ -49,19 +47,19 @@ class TwentyMinutenIE(InfoExtractor): 'ext': 'mp4', 'title': 'So kommen Sie bei Eis und Schnee sicher an', 'description': 'Schneegestöber und Glatteis führten in den letzten Tagen zu zahlreichen Strassenunfällen. Ein Experte erklärt, worauf man nun beim Autofahren achten muss.', - 'thumbnail': 'http://www.20min.ch/images/content/2/7/0/27032552/81/teaserbreit.jpg', + 'thumbnail': 'http://www.20min.ch/images/content/2/7/0/27032552/83/teaserbreit.jpg', } }, { # YouTube embed 'url': 'http://www.20min.ch/ro/sports/football/story/Il-marque-une-bicyclette-de-plus-de-30-metres--21115184', - 'md5': 'cec64d59aa01c0ed9dbba9cf639dd82f', + 'md5': 'e7e237fd98da2a3cc1422ce683df234d', 'info_dict': { 'id': 'ivM7A7SpDOs', 'ext': 'mp4', 'title': 'GOLAZO DE CHILENA DE JAVI GÓMEZ, FINALISTA AL BALÓN DE CLM 2016', 'description': 'md5:903c92fbf2b2f66c09de514bc25e9f5a', 'upload_date': '20160424', - 'uploader': 'RTVCM Castilla-La Mancha', + 'uploader': 'CMM Castilla-La Mancha Media', 'uploader_id': 'RTVCM', }, 'add_ie': ['Youtube'], @@ -77,7 +75,6 @@ class TwentyMinutenIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id - print('DISPLAY_ID: {}'.format(display_id)) webpage = self._download_webpage(url, display_id) @@ -93,30 +90,19 @@ class TwentyMinutenIE(InfoExtractor): if not title: title = remove_end(re.sub( r'^20 [Mm]inuten.*? -', '', self._og_search_title(webpage)), ' - News') - print('TITLE: {}'.format(title)) if not video_id: params = self._html_search_regex( r']+src="(?:https?:)?//www\.20min\.ch/videoplayer/videoplayer\.html\?params=(.+?[^"])"', webpage, '20min embed URL', default=None) - print('PARMAS: {}'.format(params)) video_id = self._search_regex( r'.*videoId@(\d+)', params, 'Video Id', default=None) if params is not None else '' - print('VIDEO ID: {}'.format(video_id)) if not video_id: # the article does not contain a video raise ExtractorError('No media links found on %s.' % url, expected=True) - # # Try to use the real video description: - # description = clean_html(get_element_by_class('caption', webpage)) - # # Otherwise, use the lead text of the article as the video description: - # if not description: - # description = self._html_search_meta( - # 'description', webpage, 'description') description = self._html_search_meta('description', webpage, 'description') - print('DESCRIPTION: {}'.format(description)) thumbnail = self._og_search_thumbnail(webpage) - print('THUMBNAIL: {}'.format(thumbnail)) return { 'id': video_id, From 77eac436cbc266af5be132141c3c36a3d099d10a Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Wed, 11 Jan 2017 19:34:04 +0100 Subject: [PATCH 5/9] [twentymin] flake8 --- youtube_dl/extractor/twentymin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index f9e40d7c1..ab35d6be2 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -98,7 +98,7 @@ class TwentyMinutenIE(InfoExtractor): video_id = self._search_regex( r'.*videoId@(\d+)', params, 'Video Id', default=None) if params is not None else '' - if not video_id: # the article does not contain a video + if not video_id: # the article does not contain a video raise ExtractorError('No media links found on %s.' % url, expected=True) description = self._html_search_meta('description', webpage, 'description') From 8bba00cfcb403e3ad5884e079ea255a1cee737b6 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Wed, 11 Jan 2017 19:45:51 +0100 Subject: [PATCH 6/9] [twentymin] Revert to old style --- youtube_dl/extractor/twentymin.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index ab35d6be2..3e57c5ce7 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -101,7 +101,8 @@ class TwentyMinutenIE(InfoExtractor): if not video_id: # the article does not contain a video raise ExtractorError('No media links found on %s.' % url, expected=True) - description = self._html_search_meta('description', webpage, 'description') + description = self._html_search_meta( + 'description', webpage, 'description') thumbnail = self._og_search_thumbnail(webpage) return { From 61f5aff95e890056d02d546e9ae1ad0c53d446a1 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Tue, 17 Jan 2017 17:03:03 +0100 Subject: [PATCH 7/9] [twentymin] Revert to old behavior if article does not contain any videos, and include both video formats. --- youtube_dl/extractor/twentymin.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index 3e57c5ce7..4f42747dd 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -4,10 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - remove_end, - ExtractorError -) +from ..utils import remove_end class TwentyMinutenIE(InfoExtractor): @@ -40,7 +37,7 @@ class TwentyMinutenIE(InfoExtractor): }, { # news article with video 'url': 'http://www.20min.ch/schweiz/news/story/So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552', - 'md5': '807f9e1e06a69b77440a9b315e52e580', + 'md5': '372917ba85ed969e176d287ae54b2f94', 'info_dict': { 'id': '523629', 'display_id': 'So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552', @@ -98,8 +95,6 @@ class TwentyMinutenIE(InfoExtractor): video_id = self._search_regex( r'.*videoId@(\d+)', params, 'Video Id', default=None) if params is not None else '' - if not video_id: # the article does not contain a video - raise ExtractorError('No media links found on %s.' % url, expected=True) description = self._html_search_meta( 'description', webpage, 'description') @@ -108,8 +103,16 @@ class TwentyMinutenIE(InfoExtractor): return { 'id': video_id, 'display_id': display_id, - 'url': 'http://podcast.20min-tv.ch/podcast/20min/%sh.mp4' % video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, + 'formats': [{ + 'format_id': 'sd', + 'url': 'http://podcast.20min-tv.ch/podcast/20min/%s.mp4' % video_id, + 'preference': -2 + }, { + 'format_id': 'hd', + 'url': 'http://podcast.20min-tv.ch/podcast/20min/%sh.mp4' % video_id, + 'preference': -1 + }] } From b14da99b3c5f379e09adcb5db6b889939e13b2f2 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Tue, 17 Jan 2017 17:39:35 +0100 Subject: [PATCH 8/9] [twentymin] Code cleanup for regular expression search in html. --- youtube_dl/extractor/twentymin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index 4f42747dd..d7fc4c4b5 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -91,10 +91,10 @@ class TwentyMinutenIE(InfoExtractor): if not video_id: params = self._html_search_regex( r']+src="(?:https?:)?//www\.20min\.ch/videoplayer/videoplayer\.html\?params=(.+?[^"])"', - webpage, '20min embed URL', default=None) + webpage, '20min embed URL', default='') video_id = self._search_regex( r'.*videoId@(\d+)', - params, 'Video Id', default=None) if params is not None else '' + params, 'Video Id', default='') description = self._html_search_meta( 'description', webpage, 'description') From 7b2fd0f5b2e58c2b9beab1531d19f3e7dd0ab03a Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Tue, 17 Jan 2017 18:15:20 +0100 Subject: [PATCH 9/9] [twentymin] Removed default values in regular expression search and avoided code duplication for video formats. --- youtube_dl/extractor/twentymin.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index d7fc4c4b5..68d5a0cb5 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -91,28 +91,29 @@ class TwentyMinutenIE(InfoExtractor): if not video_id: params = self._html_search_regex( r']+src="(?:https?:)?//www\.20min\.ch/videoplayer/videoplayer\.html\?params=(.+?[^"])"', - webpage, '20min embed URL', default='') + webpage, '20min embed URL') video_id = self._search_regex( r'.*videoId@(\d+)', - params, 'Video Id', default='') + params, 'Video Id') description = self._html_search_meta( 'description', webpage, 'description') thumbnail = self._og_search_thumbnail(webpage) + formats = [] + format_preferences = [('sd', ''), ('hd', 'h')] + for format_id, url_extension in format_preferences: + format_url = 'http://podcast.20min-tv.ch/podcast/20min/%s%s.mp4' % (video_id, url_extension) + formats.append({ + 'format_id': format_id, + 'url': format_url, + }) + return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, - 'formats': [{ - 'format_id': 'sd', - 'url': 'http://podcast.20min-tv.ch/podcast/20min/%s.mp4' % video_id, - 'preference': -2 - }, { - 'format_id': 'hd', - 'url': 'http://podcast.20min-tv.ch/podcast/20min/%sh.mp4' % video_id, - 'preference': -1 - }] + 'formats': formats, }