From cc1c263c95987df80a37a9446ab907a6df1ef49f Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Mon, 12 Sep 2016 11:53:37 -0700 Subject: [PATCH 1/4] Fix Bloomberg extractor. (closes #10630) The JSON being sent from Bloomberg uses single quotes, which is invalid JSON, and contains function calls. Regex both of those out. Additionally, the API endpoint requires an additional parameter, and must be called over HTTPS now. Change that as well. Signed-off-by: Adam Buchbinder --- youtube_dl/extractor/bloomberg.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 2a8cd64b9..9b07ca2d4 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -48,13 +48,20 @@ class BloombergIE(InfoExtractor): r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>.+?)\1', webpage, 'id', group='url', default=None) if not video_id: - bplayer_data = self._parse_json(self._search_regex( - r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name) + bplayer_json = self._search_regex(r'BPlayer\(null,\s*({[^;]+})\);', + webpage, 'id') + # It's not good JSON; it uses single quotes and contains function + # calls. Sweep that under the rug. 
+ bplayer_json = bplayer_json.replace("\'", '"') + bplayer_json = re.sub("\w+\(([^)]+)\)", '"FUNCTION"', bplayer_json) + bplayer_data = self._parse_json(bplayer_json, name) video_id = bplayer_data['id'] + video_id_type = bplayer_data['idType'] title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( - 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) + 'https://www.bloomberg.com/api/embed?id=%s&idType=%s' % + (video_id, video_id_type), video_id) formats = [] for stream in embed_info['streams']: stream_url = stream.get('url') From 7f2681f60096ab3e448fe0ddb6c1bc86e5b62d1a Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Mon, 12 Sep 2016 11:53:37 -0700 Subject: [PATCH 2/4] Fix Bloomberg extractor. (closes #10630) The JSON being sent from Bloomberg uses single quotes, which is invalid JSON, and contains function calls. Regex both of those out. Additionally, the API endpoint requires an additional parameter, and must be called over HTTPS now. Change that as well. Signed-off-by: Adam Buchbinder --- youtube_dl/extractor/bloomberg.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 2a8cd64b9..9b07ca2d4 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -48,13 +48,20 @@ class BloombergIE(InfoExtractor): r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>.+?)\1', webpage, 'id', group='url', default=None) if not video_id: - bplayer_data = self._parse_json(self._search_regex( - r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name) + bplayer_json = self._search_regex(r'BPlayer\(null,\s*({[^;]+})\);', + webpage, 'id') + # It's not good JSON; it uses single quotes and contains function + # calls. Sweep that under the rug. 
+ bplayer_json = bplayer_json.replace("\'", '"') + bplayer_json = re.sub("\w+\(([^)]+)\)", '"FUNCTION"', bplayer_json) + bplayer_data = self._parse_json(bplayer_json, name) video_id = bplayer_data['id'] + video_id_type = bplayer_data['idType'] title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( - 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) + 'https://www.bloomberg.com/api/embed?id=%s&idType=%s' % + (video_id, video_id_type), video_id) formats = [] for stream in embed_info['streams']: stream_url = stream.get('url') From 037e1d83e0f0247ffa43baf33c9a8873aa0710e4 Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Mon, 12 Sep 2016 11:53:37 -0700 Subject: [PATCH 3/4] Fix Bloomberg extractor. (closes #10630) The JSON being sent from Bloomberg uses single quotes, which is invalid JSON, and contains function calls. Regex both of those out. Additionally, the API endpoint requires an additional parameter, and must be called over HTTPS now. Change that as well. Signed-off-by: Adam Buchbinder --- youtube_dl/extractor/bloomberg.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 2a8cd64b9..9b07ca2d4 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -48,13 +48,20 @@ class BloombergIE(InfoExtractor): r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>.+?)\1', webpage, 'id', group='url', default=None) if not video_id: - bplayer_data = self._parse_json(self._search_regex( - r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name) + bplayer_json = self._search_regex(r'BPlayer\(null,\s*({[^;]+})\);', + webpage, 'id') + # It's not good JSON; it uses single quotes and contains function + # calls. Sweep that under the rug. 
+ bplayer_json = bplayer_json.replace("\'", '"') + bplayer_json = re.sub("\w+\(([^)]+)\)", '"FUNCTION"', bplayer_json) + bplayer_data = self._parse_json(bplayer_json, name) video_id = bplayer_data['id'] + video_id_type = bplayer_data['idType'] title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( - 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) + 'https://www.bloomberg.com/api/embed?id=%s&idType=%s' % + (video_id, video_id_type), video_id) formats = [] for stream in embed_info['streams']: stream_url = stream.get('url') From d4d790d0e58b07c6f11ca760c36c52f78fb8bee2 Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Mon, 12 Sep 2016 19:43:00 -0700 Subject: [PATCH 4/4] Handle video_id_type for older-type Bloomberg videos. --- youtube_dl/extractor/bloomberg.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 9b07ca2d4..c7c9ad7f8 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -47,7 +47,9 @@ class BloombergIE(InfoExtractor): video_id = self._search_regex( r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>.+?)\1', webpage, 'id', group='url', default=None) - if not video_id: + if video_id: + video_id_type = 'BMMR' + else: bplayer_json = self._search_regex(r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id') # It's not good JSON; it uses single quotes and contains function