From f369799459fcaaada15b882d6da20ee15c2e46b5 Mon Sep 17 00:00:00 2001 From: nindogo Date: Mon, 20 May 2019 23:41:47 +0300 Subject: [PATCH 1/2] Initial commit of modifications. Change the way the website is parsed and how the data is presented to YouTubeDl. --- youtube_dl/extractor/pornflip.py | 95 +++++++++++++++++--------------- 1 file changed, 51 insertions(+), 44 deletions(-) diff --git a/youtube_dl/extractor/pornflip.py b/youtube_dl/extractor/pornflip.py index 025985fbc..c806a0203 100644 --- a/youtube_dl/extractor/pornflip.py +++ b/youtube_dl/extractor/pornflip.py @@ -1,21 +1,40 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_str, -) from ..utils import ( int_or_none, - try_get, unified_timestamp, + unified_strdate, + parse_duration, + str_to_int, ) class PornFlipIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:v|embed)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?pornflip\.com(?:/v||/embed)/(?P[^/?#&]+)' _TESTS = [{ + 'url': 'https://www.pornflip.com/k27gGfg7cqt/green-hair', + 'info_dict': { + 'id': 'k27gGfg7cqt', + 'ext': 'mp4', + 'title': 'Green hair', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 992, + 'timestamp': 1555970182, + 'upload_date': '20190422', + 'uploader_id': '402056', + 'uploader': 'berserk993', + 'view_count': int, + 'age_limit': 18, + 'only_matching': True + }, + 'params': { + 'skip_download': False, + } + }, { 'url': 'https://www.pornflip.com/v/wz7DfNhMmep', 'md5': '98c46639849145ae1fd77af532a9278c', 'info_dict': { @@ -30,6 +49,10 @@ class PornFlipIE(InfoExtractor): 'uploader': 'figifoto', 'view_count': int, 'age_limit': 18, + 'only_matching': True + }, + 'params': { + 'skip_download': True, } }, { 'url': 'https://www.pornflip.com/embed/wz7DfNhMmep', @@ -51,51 +74,35 @@ class PornFlipIE(InfoExtractor): webpage = self._download_webpage( 'https://www.pornflip.com/v/%s' % video_id, video_id) - flashvars = compat_parse_qs(self._search_regex( - r']+flashvars=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'flashvars', group='flashvars')) - - title = flashvars['video_vars[title]'][0] - - def flashvar(kind): - return try_get( - flashvars, lambda x: x['video_vars[%s]' % kind][0], compat_str) - - formats = [] - for key, value in flashvars.items(): - if not (value and isinstance(value, list)): - continue - format_url = value[0] - if key == 'video_vars[hds_manifest]': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - continue - height = self._search_regex( - r'video_vars\[video_urls\]\[(\d+)', key, 'height', default=None) - if not height: - continue - formats.append({ - 'url': format_url, - 'format_id': 'http-%s' % height, - 'height': int_or_none(height), - }) + mpd_url = self._search_regex(r'data-src=[\'\"](.*?)[\'\"]', webpage, 'mpd_url', fatal=False).replace(r'&', r'&') + mpd_id = (mpd_url.split('/')[4] or 'DASH') + formats = list() + formats.extend(self._extract_mpd_formats(mpd_url, video_id, mpd_id=mpd_id,)) self._sort_formats(formats) - uploader = self._html_search_regex( - (r']+class="name"[^>]*>\s*]+>\s*(?P[^<]+)', - r']+content=(["\'])[^>]*\buploaded by (?P.+?)\1'), - webpage, 'uploader', fatal=False, group='uploader') + title, uploader = self._search_regex('(.*?)', webpage, 'title').rsplit(',', 1) + title = title.strip() + uploader = uploader.strip() + + thumbnail = self._search_regex(r'background:\s*?url\((.*?)\)', webpage, 'thumbnail', default=None) + + views = str_to_int(self._search_regex(r'\s*?(.*?)', webpage, 'views')) + uploader_id_regex = re.compile(r'item=(\d+?)\&') + uploader_id = re.findall(uploader_id_regex, webpage)[0] + upload_date = self._html_search_meta('uploadDate', webpage, 'upload_date') return { 'id': video_id, 'formats': formats, 'title': title, - 'thumbnail': flashvar('big_thumb'), - 'duration': int_or_none(flashvar('duration')), - 'timestamp': unified_timestamp(self._html_search_meta( - 'uploadDate', webpage, 'timestamp')), - 'uploader_id': flashvar('author_id'), + 'url': mpd_url, + 'thumbnail': thumbnail, + 'duration': int_or_none(parse_duration(self._html_search_meta( + 'duration', webpage, 'duration'))), + 'timestamp': unified_timestamp(upload_date), + 'upload_date': unified_strdate(upload_date), + 'uploader_id': uploader_id, 'uploader': uploader, - 'view_count': int_or_none(flashvar('views')), + 'view_count': int_or_none(views), 'age_limit': 18, } From 6bee1627a811fe4c3ece3c76f34de87e826b5724 Mon Sep 17 00:00:00 2001 From: nindogo Date: Tue, 21 May 2019 03:20:06 +0300 Subject: [PATCH 2/2] Commit for first review. --- youtube_dl/extractor/pornflip.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/pornflip.py b/youtube_dl/extractor/pornflip.py index c806a0203..cf74cc85d 100644 --- a/youtube_dl/extractor/pornflip.py +++ b/youtube_dl/extractor/pornflip.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -75,7 +73,7 @@ class PornFlipIE(InfoExtractor): 'https://www.pornflip.com/v/%s' % video_id, video_id) mpd_url = self._search_regex(r'data-src=[\'\"](.*?)[\'\"]', webpage, 'mpd_url', fatal=False).replace(r'&', r'&') - mpd_id = (mpd_url.split('/')[4] or 'DASH') + mpd_id = (mpd_url.split('/')[-2] or 'DASH') formats = list() formats.extend(self._extract_mpd_formats(mpd_url, video_id, mpd_id=mpd_id,)) self._sort_formats(formats) @@ -86,10 +84,10 @@ class PornFlipIE(InfoExtractor): thumbnail = self._search_regex(r'background:\s*?url\((.*?)\)', webpage, 'thumbnail', default=None) - views = str_to_int(self._search_regex(r'\s*?(.*?)', webpage, 'views')) - uploader_id_regex = re.compile(r'item=(\d+?)\&') - uploader_id = re.findall(uploader_id_regex, webpage)[0] - upload_date = self._html_search_meta('uploadDate', webpage, 'upload_date') + view_count = str_to_int(self._search_regex(r'class=[\'\"]views[\'\"]>\s*?(.*?)', webpage, 'view_count')) + uploader_id = int_or_none(self._search_regex(r'item=(\d+?)\&', webpage, 'uploader_id')) + iso_8601_datetime_extended = self._html_search_meta('uploadDate', webpage, 'iso_8601_datetime_extended') + '''http://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a003169814.htm''' return { 'id': video_id, @@ -99,10 +97,10 @@ class PornFlipIE(InfoExtractor): 'thumbnail': thumbnail, 'duration': int_or_none(parse_duration(self._html_search_meta( 'duration', webpage, 'duration'))), - 'timestamp': unified_timestamp(upload_date), - 'upload_date': unified_strdate(upload_date), + 'timestamp': unified_timestamp(iso_8601_datetime_extended), + 'upload_date': unified_strdate(iso_8601_datetime_extended), 'uploader_id': uploader_id, 'uploader': uploader, - 'view_count': int_or_none(views), + 'view_count': int_or_none(view_count), 'age_limit': 18, }