From 08850e913c0cd450a71faf697f2af7513300235f Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Sat, 31 Dec 2016 14:35:11 +0100 Subject: [PATCH 1/7] [infoq] Add audio only format if available Refactor cookie code into a function. Renamed formats to http_video, http_audio, rtmp_video Renamed extract functions to video instead of videos as they return one or no video. --- youtube_dl/extractor/infoq.py | 46 ++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index cca0b8a93..b15b6f689 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import base64 -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_urlparse, +) from ..utils import determine_ext from .bokecc import BokeCCBaseIE @@ -35,7 +38,7 @@ class InfoQIE(BokeCCBaseIE): }, }] - def _extract_rtmp_videos(self, webpage): + def _extract_rtmp_video(self, webpage): # The server URL is hardcoded video_url = 'rtmpe://video.infoq.com/cfx/st/' @@ -47,25 +50,43 @@ class InfoQIE(BokeCCBaseIE): playpath = 'mp4:' + real_id return [{ - 'format_id': 'rtmp', + 'format_id': 'rtmp_video', 'url': video_url, 'ext': determine_ext(playpath), 'play_path': playpath, }] - def _extract_http_videos(self, webpage): - http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') - + def _extract_cookie(self, webpage): policy = self._search_regex(r'InfoQConstants.scp\s*=\s*\'([^\']+)\'', webpage, 'policy') signature = self._search_regex(r'InfoQConstants.scs\s*=\s*\'([^\']+)\'', webpage, 'signature') key_pair_id = self._search_regex(r'InfoQConstants.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id') + return 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( + policy, signature, key_pair_id) + + def _extract_http_video(self, webpage): + http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') + return [{ + 'format_id': 'http_video', + 'url': http_video_url, + 'ext': determine_ext(http_video_url), + 'http_headers': { + 'Cookie': self._extract_cookie(webpage) + }, + }] + + def _extract_http_audio(self, webpage): + http_audio_url = self._search_regex(r']*?name="filename"[^>]*?value="([^\"]+)"[^>]*?>', webpage, 'audio URL', fatal=False) + if http_audio_url is None: + return [] + http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) return [{ - 'format_id': 'http', - 'url': http_video_url, + 'format_id': 'http_audio', + 'url': http_audio_url, + 'ext': determine_ext(http_audio_url, ""), + 'vcodec': 'none', 'http_headers': { - 'Cookie': 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( - policy, signature, key_pair_id), + 'Cookie': self._extract_cookie(webpage) }, }] @@ -80,7 +101,10 @@ class InfoQIE(BokeCCBaseIE): # for China videos, HTTP video URL exists but always fails with 403 formats = self._extract_bokecc_formats(webpage, video_id) else: - formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage) + formats = ( + self._extract_rtmp_video(webpage) + + self._extract_http_video(webpage) + + self._extract_http_audio(webpage)) self._sort_formats(formats) From 24158ee4719e8fa887b7481d92db0adf83357b24 Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Tue, 17 Jan 2017 23:24:32 +0100 Subject: [PATCH 2/7] [infoq] Rename to _extract_cookies as it more than one --- youtube_dl/extractor/infoq.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index b15b6f689..5400c638d 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -56,7 +56,7 @@ class InfoQIE(BokeCCBaseIE): 'play_path': playpath, }] - def _extract_cookie(self, webpage): + def _extract_cookies(self, webpage): policy = self._search_regex(r'InfoQConstants.scp\s*=\s*\'([^\']+)\'', webpage, 'policy') signature = self._search_regex(r'InfoQConstants.scs\s*=\s*\'([^\']+)\'', webpage, 'signature') key_pair_id = self._search_regex(r'InfoQConstants.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id') @@ -70,7 +70,7 @@ class InfoQIE(BokeCCBaseIE): 'url': http_video_url, 'ext': determine_ext(http_video_url), 'http_headers': { - 'Cookie': self._extract_cookie(webpage) + 'Cookie': self._extract_cookies(webpage) }, }] @@ -86,7 +86,7 @@ class InfoQIE(BokeCCBaseIE): 'ext': determine_ext(http_audio_url, ""), 'vcodec': 'none', 'http_headers': { - 'Cookie': self._extract_cookie(webpage) + 'Cookie': self._extract_cookies(webpage) }, }] From 6d7c50761237a26ee8c7a86d1d84b8b8bd384b8d Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Tue, 17 Jan 2017 23:28:09 +0100 Subject: [PATCH 3/7] [infoq] Remove redundant determine_ext --- youtube_dl/extractor/infoq.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 5400c638d..9e41476cd 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -68,7 +68,6 @@ class InfoQIE(BokeCCBaseIE): return [{ 'format_id': 'http_video', 'url': http_video_url, - 'ext': determine_ext(http_video_url), 'http_headers': { 'Cookie': self._extract_cookies(webpage) }, @@ -83,7 +82,6 @@ class InfoQIE(BokeCCBaseIE): return [{ 'format_id': 'http_audio', 'url': http_audio_url, - 'ext': determine_ext(http_audio_url, ""), 'vcodec': 'none', 'http_headers': { 'Cookie': self._extract_cookies(webpage) From 31f5fc2c3226950c7ca01cb946d3777f272f2a7e Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Tue, 17 Jan 2017 23:28:33 +0100 Subject: [PATCH 4/7] [infoq] Add comment about hardcoded URL --- youtube_dl/extractor/infoq.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 9e41476cd..84799271e 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -77,6 +77,8 @@ class InfoQIE(BokeCCBaseIE): http_audio_url = self._search_regex(r']*?name="filename"[^>]*?value="([^\"]+)"[^>]*?>', webpage, 'audio URL', fatal=False) if http_audio_url is None: return [] + # base URL is found in the Location header in the response returned by + # GET https://www.infoq.com/mp3download.action?filename=... when logged in. http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) return [{ From 11a8f1679bdfad55a95dc42d38962910b6cb98a6 Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Mon, 23 Jan 2017 20:42:31 +0100 Subject: [PATCH 5/7] [infoq] Use _hidden_inputs instead of messy regex --- youtube_dl/extractor/infoq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 84799271e..d63e95854 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -74,7 +74,8 @@ class InfoQIE(BokeCCBaseIE): }] def _extract_http_audio(self, webpage): - http_audio_url = self._search_regex(r']*?name="filename"[^>]*?value="([^\"]+)"[^>]*?>', webpage, 'audio URL', fatal=False) + fields = self._hidden_inputs(webpage) + http_audio_url = fields['filename'] if http_audio_url is None: return [] # base URL is found in the Location header in the response returned by From 00d8d755fcd949e8374e548345c74e4288717542 Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Mon, 23 Jan 2017 20:43:42 +0100 Subject: [PATCH 6/7] [infoq] Probe if audio URL is valid Make it possible to pass headers to _is_valid_url --- youtube_dl/extractor/common.py | 4 ++-- youtube_dl/extractor/infoq.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6fa7c334e..a257721f1 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1015,13 +1015,13 @@ class InfoExtractor(object): unique_formats.append(f) formats[:] = unique_formats - def _is_valid_url(self, url, video_id, item='video'): + def _is_valid_url(self, url, video_id, item='video', headers={}): url = self._proto_relative_url(url, scheme='http:') # For now assume non HTTP(S) URLs always valid if not (url.startswith('http://') or url.startswith('https://')): return True try: - self._request_webpage(url, video_id, 'Checking %s URL' % item) + self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True except ExtractorError as e: if isinstance(e.cause, compat_urllib_error.URLError): diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index d63e95854..08f9e86a8 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -73,22 +73,28 @@ class InfoQIE(BokeCCBaseIE): }, }] - def _extract_http_audio(self, webpage): + def _extract_http_audio(self, webpage, video_id): fields = self._hidden_inputs(webpage) http_audio_url = fields['filename'] if http_audio_url is None: return [] + + cookies_header = {'Cookie': self._extract_cookies(webpage)} + # base URL is found in the Location header in the response returned by # GET https://www.infoq.com/mp3download.action?filename=... when logged in. http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) + # audio file seem to be missing some times even if there is a download link + # so probe URL to make sure + if not self._is_valid_url(http_audio_url, video_id, headers=cookies_header): + return [] + return [{ 'format_id': 'http_audio', 'url': http_audio_url, 'vcodec': 'none', - 'http_headers': { - 'Cookie': self._extract_cookies(webpage) - }, + 'http_headers': cookies_header, }] def _real_extract(self, url): @@ -105,7 +111,7 @@ class InfoQIE(BokeCCBaseIE): formats = ( self._extract_rtmp_video(webpage) + self._extract_http_video(webpage) + - self._extract_http_audio(webpage)) + self._extract_http_audio(webpage, video_id)) self._sort_formats(formats) From 94b7fc14af53510c7cc1dc61fe231219d5b1859e Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Mon, 23 Jan 2017 20:54:05 +0100 Subject: [PATCH 7/7] [infoq] Add audio only test --- youtube_dl/extractor/infoq.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 08f9e86a8..9fb71e8ef 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -36,6 +36,18 @@ class InfoQIE(BokeCCBaseIE): 'ext': 'flv', 'description': 'md5:308d981fb28fa42f49f9568322c683ff', }, + }, { + 'url': 'https://www.infoq.com/presentations/Simple-Made-Easy', + 'md5': '0e34642d4d9ef44bf86f66f6399672db', + 'info_dict': { + 'id': 'Simple-Made-Easy', + 'title': 'Simple Made Easy', + 'ext': 'mp3', + 'description': 'md5:3e0e213a8bbd074796ef89ea35ada25b', + }, + 'params': { + 'format': 'bestaudio', + }, }] def _extract_rtmp_video(self, webpage):