From 4de58cdf6d9ce4d8a0e974f21bfcf81f330e04ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Pale=C4=8Dek?= Date: Wed, 30 May 2018 15:14:51 +0200 Subject: [PATCH 1/2] [alphaporno] Fix test errors This also allows selection of different formats when available --- youtube_dl/extractor/alphaporno.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/alphaporno.py b/youtube_dl/extractor/alphaporno.py index 3a6d99f6b..7bf9546e0 100644 --- a/youtube_dl/extractor/alphaporno.py +++ b/youtube_dl/extractor/alphaporno.py @@ -1,11 +1,14 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( parse_iso8601, parse_duration, parse_filesize, int_or_none, + js_to_json, ) @@ -35,11 +38,21 @@ class AlphaPornoIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r"video_id\s*:\s*'([^']+)'", webpage, 'video id', default=None) + video_id = re.sub(r'^https?://.*/embed/', '', self._html_search_meta('embedUrl', webpage, 'video id')) + + sources = self._parse_json( + self._search_regex(r'sources\s*:\s*(\[[^\]]*\])', webpage, 'source data'), video_id, + transform_source=js_to_json + ) + + formats = [] + for s in sources: + video_url = s['file'] + formats.append({ + 'url': video_url, + 'height': int_or_none(re.sub('^(\d+)[pi].*', r'\1', s.get('label') or '')) + }) - video_url = self._search_regex( - r"video_url\s*:\s*'([^']+)'", webpage, 'video url') ext = self._html_search_meta( 'encodingFormat', webpage, 'ext', default='.mp4')[1:] @@ -64,7 +77,6 @@ class AlphaPornoIE(InfoExtractor): return { 'id': video_id, 'display_id': display_id, - 'url': video_url, 'ext': ext, 'title': title, 'thumbnail': thumbnail, @@ -74,4 +86,5 @@ class AlphaPornoIE(InfoExtractor): 'tbr': bitrate, 'categories': categories, 'age_limit': age_limit, + 'formats': formats, } From cf7233eec06e7c1bdc5d6d99bd9c41c05429053f Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Ji=C5=99=C3=AD=20Pale=C4=8Dek?= Date: Mon, 4 Jun 2018 14:57:08 +0200 Subject: [PATCH 2/2] [alphaporno] Use _extract_jwplayer_data to get jwplayer playlist --- youtube_dl/extractor/alphaporno.py | 48 +++++++++++++----------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/alphaporno.py b/youtube_dl/extractor/alphaporno.py index 7bf9546e0..d832739c9 100644 --- a/youtube_dl/extractor/alphaporno.py +++ b/youtube_dl/extractor/alphaporno.py @@ -40,51 +40,43 @@ class AlphaPornoIE(InfoExtractor): video_id = re.sub(r'^https?://.*/embed/', '', self._html_search_meta('embedUrl', webpage, 'video id')) - sources = self._parse_json( - self._search_regex(r'sources\s*:\s*(\[[^\]]*\])', webpage, 'source data'), video_id, - transform_source=js_to_json - ) - - formats = [] - for s in sources: - video_url = s['file'] - formats.append({ - 'url': video_url, - 'height': int_or_none(re.sub('^(\d+)[pi].*', r'\1', s.get('label') or '')) - }) - - ext = self._html_search_meta( - 'encodingFormat', webpage, 'ext', default='.mp4')[1:] + result = self._extract_jwplayer_data(webpage, video_id, require_title=False) title = self._search_regex( [r'', r'class="title" itemprop="name">([^<]+)<'], webpage, 'title') - thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail') timestamp = parse_iso8601(self._html_search_meta( 'uploadDate', webpage, 'upload date')) duration = parse_duration(self._html_search_meta( 'duration', webpage, 'duration')) - filesize_approx = parse_filesize(self._html_search_meta( - 'contentSize', webpage, 'file size')) - bitrate = int_or_none(self._html_search_meta( - 'bitrate', webpage, 'bitrate')) + filesize_approx = self._html_search_meta( + 'contentSize', webpage, 'file size') + + # bitrates are taken from the URL; the document only contains + # a single value for the lowest quality + for f in result.get('formats') or []: + m = re.search(r'[?&]br=(\d+)', f.get('url') or '') + if m: + f['tbr'] = 
int(m.group(1)) + + # filesizes are concatenated together in the meta tag + if filesize_approx: + filesizes = re.findall(r'\s*[\d.]+\s*[A-Za-z]+', filesize_approx) + for f, size in zip(result.get('formats') or [], filesizes): + f['filesize_approx'] = parse_filesize(size) + categories = self._html_search_meta( 'keywords', webpage, 'categories', default='').split(',') age_limit = self._rta_search(webpage) - return { - 'id': video_id, + result.update({ 'display_id': display_id, - 'ext': ext, 'title': title, - 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, - 'filesize_approx': filesize_approx, - 'tbr': bitrate, 'categories': categories, 'age_limit': age_limit, - 'formats': formats, - } + }) + return result