From ec40feeced8ba2f095573605548016f6d3b5697e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A9stin=20Reed?= Date: Sat, 1 Oct 2016 14:51:22 +0200 Subject: [PATCH] [anysex] Improve metadata extraction --- youtube_dl/extractor/anysex.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/anysex.py b/youtube_dl/extractor/anysex.py index ad86d6e58..628b13e86 100644 --- a/youtube_dl/extractor/anysex.py +++ b/youtube_dl/extractor/anysex.py @@ -4,8 +4,11 @@ import re from .common import InfoExtractor from ..utils import ( - parse_duration, + get_element_by_attribute, + get_element_by_class, int_or_none, + parse_duration, + js_to_json, ) @@ -19,6 +22,7 @@ class AnySexIE(InfoExtractor): 'ext': 'mp4', 'title': 'Busty and sexy blondie in her bikini strips for you', 'description': 'md5:de9e418178e2931c10b62966474e1383', + 'thumbnail': 're:^https?://.*\.jpg$', 'categories': ['Erotic'], 'duration': 270, 'age_limit': 18, @@ -26,24 +30,20 @@ class AnySexIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex(r"video_url\s*:\s*'([^']+)'", webpage, 'video URL') + video_data = self._parse_json(self._search_regex( + r'var\s+flashvars\s*=\s*({[^}]+});', webpage, 'video data'), + video_id, transform_source=js_to_json) + video_url = video_data['video_url'] title = self._html_search_regex(r'(.*?)', webpage, 'title') - description = self._html_search_regex( - r'
]*>([^<]+)
', webpage, 'description', fatal=False) - thumbnail = self._html_search_regex( - r'preview_url\s*:\s*\'(.*?)\'', webpage, 'thumbnail', fatal=False) categories = re.findall( r'([^<]+)', webpage) - duration = parse_duration(self._search_regex( - r'Duration: (?:)?(\d+:\d+)', webpage, 'duration', fatal=False)) + duration = parse_duration(get_element_by_attribute('itemprop', 'duration', webpage)) view_count = int_or_none(self._html_search_regex( r'Views: (\d+)', webpage, 'view count', fatal=False)) @@ -52,8 +52,8 @@ class AnySexIE(InfoExtractor): 'url': video_url, 'ext': 'mp4', 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'description': get_element_by_class('description', webpage), + 'thumbnail': video_data.get('preview_url'), 'categories': categories, 'duration': duration, 'view_count': view_count,