From 877741a9e58f76f2d4101e25d6b9e12e58a8a08b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A9stin=20Reed?= Date: Fri, 30 Sep 2016 20:03:25 +0200 Subject: [PATCH 1/4] [clubic] Rely on _match_id and _parse_json --- youtube_dl/extractor/clubic.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py index 2fba93543..f7ee3a8f8 100644 --- a/youtube_dl/extractor/clubic.py +++ b/youtube_dl/extractor/clubic.py @@ -1,9 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import re - from .common import InfoExtractor from ..utils import ( clean_html, @@ -30,16 +27,14 @@ class ClubicIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id player_page = self._download_webpage(player_url, video_id) - config_json = self._search_regex( + config = self._parse_json(self._search_regex( r'(?m)M6\.Player\.config\s*=\s*(\{.+?\});$', player_page, - 'configuration') - config = json.loads(config_json) + 'configuration'), video_id) video_info = config['videoInfo'] sources = config['sources'] From c3e0904bcae108f7f70c6df8336544b84937266e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 1 Oct 2016 16:37:49 +0800 Subject: [PATCH 2/4] [twitch] Skip a 404 test --- youtube_dl/extractor/twitch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index bc352391e..46c2cfe7b 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -247,6 +247,7 @@ class TwitchVodIE(TwitchItemBaseIE): # m3u8 download 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }] def _real_extract(self, url): From ce032ddae4818eaa01bda16c7789c5ddb9af0faa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A9stin=20Reed?= Date: Fri, 30 Sep 
2016 20:06:08 +0200 Subject: [PATCH 3/4] [criterion] Rely on _match_id, improve regex and add thumbnail to test --- youtube_dl/extractor/criterion.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py index ad32673a8..cf6a5d6cb 100644 --- a/youtube_dl/extractor/criterion.py +++ b/youtube_dl/extractor/criterion.py @@ -1,8 +1,6 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor @@ -16,20 +14,20 @@ class CriterionIE(InfoExtractor): 'ext': 'mp4', 'title': 'Le Samouraï', 'description': 'md5:a2b4b116326558149bef81f76dcbb93f', + 'thumbnail': 're:^https?://.*\.jpg$', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) final_url = self._search_regex( - r'so.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url') + r'so\.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url') title = self._og_search_title(webpage) description = self._html_search_meta('description', webpage) thumbnail = self._search_regex( - r'so.addVariable\("thumbnailURL", "(.+?)"\)\;', + r'so\.addVariable\("thumbnailURL", "(.+?)"\)\;', webpage, 'thumbnail url') return { From d955ee4a71382b163561954e4b9b91d73d0aca04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A9stin=20Reed?= Date: Sat, 1 Oct 2016 14:44:33 +0200 Subject: [PATCH 4/4] [anysex] Improve metadata extraction --- youtube_dl/extractor/anysex.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/anysex.py b/youtube_dl/extractor/anysex.py index ad86d6e58..628b13e86 100644 --- a/youtube_dl/extractor/anysex.py +++ b/youtube_dl/extractor/anysex.py @@ -4,8 +4,11 @@ import re from .common import InfoExtractor from ..utils import ( - parse_duration, + 
get_element_by_attribute, + get_element_by_class, int_or_none, + parse_duration, + js_to_json, ) @@ -19,6 +22,7 @@ class AnySexIE(InfoExtractor): 'ext': 'mp4', 'title': 'Busty and sexy blondie in her bikini strips for you', 'description': 'md5:de9e418178e2931c10b62966474e1383', + 'thumbnail': 're:^https?://.*\.jpg$', 'categories': ['Erotic'], 'duration': 270, 'age_limit': 18, @@ -26,24 +30,20 @@ class AnySexIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex(r"video_url\s*:\s*'([^']+)'", webpage, 'video URL') + video_data = self._parse_json(self._search_regex( + r'var\s+flashvars\s*=\s*({[^}]+});', webpage, 'video data'), + video_id, transform_source=js_to_json) + video_url = video_data['video_url'] title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title') - description = self._html_search_regex( - r'
<div class="description"[^>]*>([^<]+)</div>
', webpage, 'description', fatal=False) - thumbnail = self._html_search_regex( - r'preview_url\s*:\s*\'(.*?)\'', webpage, 'thumbnail', fatal=False) categories = re.findall( r'<a href="http://anysex\.com/categories/[^"]+" title="[^"]*">([^<]+)</a>', webpage) - duration = parse_duration(self._search_regex( - r'<b>Duration:</b> (?:<q itemprop="duration">)?(\d+:\d+)', webpage, 'duration', fatal=False)) + duration = parse_duration(get_element_by_attribute('itemprop', 'duration', webpage)) view_count = int_or_none(self._html_search_regex( r'<b>Views:</b> (\d+)', webpage, 'view count', fatal=False)) @@ -52,8 +52,8 @@ class AnySexIE(InfoExtractor): 'url': video_url, 'ext': 'mp4', 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'description': get_element_by_class('description', webpage), + 'thumbnail': video_data.get('preview_url'), 'categories': categories, 'duration': duration, 'view_count': view_count,