From 45abe2051dafed38f4c8319a7bfe9bdbf3e372b4 Mon Sep 17 00:00:00 2001 From: Nehal Patel Date: Wed, 6 Jul 2016 23:36:29 -0500 Subject: [PATCH 1/5] [BrainPOP] Add new extractor --- youtube_dl/extractor/brainpop.py | 47 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 48 insertions(+) create mode 100644 youtube_dl/extractor/brainpop.py diff --git a/youtube_dl/extractor/brainpop.py b/youtube_dl/extractor/brainpop.py new file mode 100644 index 000000000..6b3dd6a92 --- /dev/null +++ b/youtube_dl/extractor/brainpop.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BrainPOPIE(InfoExtractor): + _VALID_URL = r'https?:\/\/(?:(.+)\.)?brainpop\.com\/(?P<id>[^\r\n]+)' + _TEST = { + 'url': 'https://www.brainpop.com/english/freemovies/williamshakespeare/', + 'md5': '676d936271b628dc05e4cec377751919', + 'info_dict': { + 'id': 'english/freemovies/williamshakespeare/', + 'ext': 'mp4', + 'title': 'William Shakespeare - BrainPOP', + 'thumbnail': 're:^https?://.*\.png$', + 'description': 'He could do comedies, tragedies, histories and poetry. Learn about the greatest playwright in the history of the English language!', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + self.report_extraction(video_id) + + ec_token = self._html_search_regex(r'ec_token : \'(.+)\'', webpage, "token") + movie_cdn_path = self._html_search_regex(r'movie_cdn_path : \'(.+)\'', webpage, "cdn path") + mp4 = self._html_search_regex(r'mp4":"([^"]*)', webpage, "mp4") + + url = movie_cdn_path + mp4.replace("\\", "") + "?" 
+ ec_token + + title = self._html_search_regex(r'type":"Movie","name":"([^"]*)"', webpage, "title") + + thumbnail_cdn = self._html_search_regex(r"'cdn_path' : '([^']*)'", webpage, "thumbnail cdn") + thumbnail_image = self._html_search_regex(r'type":"Movie","name":"[^"]*","image":"([^"]*)"', webpage, "thumbnail") + thumbnail = thumbnail_cdn + thumbnail_image.replace("\\", "") + + description = self._html_search_regex(r'type":"Movie","name":"[^"]*","image":"[^"]*","description":"([^"]*)"', webpage, "description") + + return { + 'id': video_id, + 'url': url, + 'title': title, + 'thumbnail': thumbnail, + 'description': description, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4e2a2f2e9..cc45f5c23 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -93,6 +93,7 @@ from .bokecc import BokeCCIE from .bpb import BpbIE from .br import BRIE from .bravotv import BravoTVIE +from .brainpop import BrainPOPIE from .breakcom import BreakIE from .brightcove import ( BrightcoveLegacyIE, From f56a9dbdbc20eebc7c93a5ea45ddcdf841236e9c Mon Sep 17 00:00:00 2001 From: Nehal Patel Date: Wed, 6 Jul 2016 23:53:10 -0500 Subject: [PATCH 2/5] [BrainPOP] Clean up code and account for non-mandatory fields --- youtube_dl/extractor/brainpop.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/brainpop.py b/youtube_dl/extractor/brainpop.py index 6b3dd6a92..1dee770e8 100644 --- a/youtube_dl/extractor/brainpop.py +++ b/youtube_dl/extractor/brainpop.py @@ -24,19 +24,19 @@ class BrainPOPIE(InfoExtractor): self.report_extraction(video_id) - ec_token = self._html_search_regex(r'ec_token : \'(.+)\'', webpage, "token") - movie_cdn_path = self._html_search_regex(r'movie_cdn_path : \'(.+)\'', webpage, "cdn path") - mp4 = self._html_search_regex(r'mp4":"([^"]*)', webpage, "mp4") + ec_token = self._html_search_regex(r'ec_token : \'(.+)\'', webpage, 'token') + 
movie_cdn_path = self._html_search_regex(r'movie_cdn_path : \'(.+)\'', webpage, 'cdn path') + mp4 = self._html_search_regex(r'mp4":"([^"]*)', webpage, 'mp4') - url = movie_cdn_path + mp4.replace("\\", "") + "?" + ec_token + url = movie_cdn_path + mp4.replace('\\', '') + '?' + ec_token - title = self._html_search_regex(r'type":"Movie","name":"([^"]*)"', webpage, "title") + title = self._html_search_regex(r'type":"Movie","name":"([^"]*)"', webpage, 'title') or self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title') - thumbnail_cdn = self._html_search_regex(r"'cdn_path' : '([^']*)'", webpage, "thumbnail cdn") - thumbnail_image = self._html_search_regex(r'type":"Movie","name":"[^"]*","image":"([^"]*)"', webpage, "thumbnail") - thumbnail = thumbnail_cdn + thumbnail_image.replace("\\", "") + thumbnail_cdn = self._html_search_regex(r"'cdn_path' : '([^']*)'", webpage, 'thumbnail cdn', fatal=False) + thumbnail_image = self._html_search_regex(r'type":"Movie","name":"[^"]*","image":"([^"]*)"', webpage, 'thumbnail', fatal=False) + thumbnail = thumbnail_cdn + thumbnail_image.replace('\\', '') - description = self._html_search_regex(r'type":"Movie","name":"[^"]*","image":"[^"]*","description":"([^"]*)"', webpage, "description") + description = self._html_search_regex(r'type":"Movie","name":"[^"]*","image":"[^"]*","description":"([^"]*)"', webpage, 'description', fatal=False) return { 'id': video_id, From b00d17edeaaa18715472061857bf539a6a2f2bdf Mon Sep 17 00:00:00 2001 From: Nehal Patel Date: Fri, 8 Jul 2016 20:20:57 -0500 Subject: [PATCH 3/5] [BrainPOP] Switch from regex to parsing JSON and include both resolutions --- youtube_dl/extractor/brainpop.py | 41 +++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/brainpop.py b/youtube_dl/extractor/brainpop.py index 1dee770e8..a930942b2 100644 --- a/youtube_dl/extractor/brainpop.py +++ b/youtube_dl/extractor/brainpop.py @@ -24,24 +24,43 @@ class BrainPOPIE(InfoExtractor): 
self.report_extraction(video_id) - ec_token = self._html_search_regex(r'ec_token : \'(.+)\'', webpage, 'token') - movie_cdn_path = self._html_search_regex(r'movie_cdn_path : \'(.+)\'', webpage, 'cdn path') - mp4 = self._html_search_regex(r'mp4":"([^"]*)', webpage, 'mp4') + ec_token = self._html_search_regex(r"ec_token : '([^']*)'", webpage, 'token') - url = movie_cdn_path + mp4.replace('\\', '') + '?' + ec_token + settings = self._parse_json(self._html_search_regex(r'var settings = ([^;]*)', webpage, 'settings'), video_id) + title = settings['title'] + description = settings['description'] - title = self._html_search_regex(r'type":"Movie","name":"([^"]*)"', webpage, 'title') or self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title') + global_content = self._parse_json(self._html_search_regex(r'var global_content = ([^;]*)', webpage, 'global content').replace("'", '"'), video_id) + cdn_path = global_content['cdn_path'] + movie_cdn_path = global_content['movie_cdn_path'] - thumbnail_cdn = self._html_search_regex(r"'cdn_path' : '([^']*)'", webpage, 'thumbnail cdn', fatal=False) - thumbnail_image = self._html_search_regex(r'type":"Movie","name":"[^"]*","image":"([^"]*)"', webpage, 'thumbnail', fatal=False) - thumbnail = thumbnail_cdn + thumbnail_image.replace('\\', '') + content = self._parse_json(self._html_search_regex(r'var content = ([^;]*)', webpage, 'content'), video_id) + movies = content['category']['unit']['topic']['movies'] + screenshots = content['category']['unit']['topic']['screenshots'] - description = self._html_search_regex(r'type":"Movie","name":"[^"]*","image":"[^"]*","description":"([^"]*)"', webpage, 'description', fatal=False) + formats = [] + formats.append({ + 'url': movie_cdn_path + movies['mp4'] + '?' + ec_token, + 'height': 768, + 'width': 768, + }) + formats.append({ + 'url': movie_cdn_path + movies['mp4_small'] + '?' 
+ ec_token, + 'height': 480, + 'width': 480, + }) + self._sort_formats(formats) + + thumbnails = [] + for (i, screenshot) in enumerate(screenshots): + thumbnails.append({ + 'url': cdn_path + screenshot, + }) return { 'id': video_id, - 'url': url, 'title': title, - 'thumbnail': thumbnail, + 'formats': formats, + 'thumbnails': thumbnails, 'description': description, } From 7022e24b1dc8897cbbdd807b32fbf2691b7ecf44 Mon Sep 17 00:00:00 2001 From: Nehal Patel Date: Tue, 12 Jul 2016 19:08:03 -0500 Subject: [PATCH 4/5] [BrainPOP] Optimize regex and extractor, improve metadata, and add subscription video detection --- youtube_dl/extractor/brainpop.py | 51 ++++++++++++++++---------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/brainpop.py b/youtube_dl/extractor/brainpop.py index a930942b2..f3fc66ee1 100644 --- a/youtube_dl/extractor/brainpop.py +++ b/youtube_dl/extractor/brainpop.py @@ -2,42 +2,44 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + remove_end +) class BrainPOPIE(InfoExtractor): - _VALID_URL = r'https?:\/\/(?:(.+)\.)?brainpop\.com\/(?P<id>[^\r\n]+)' + _VALID_URL = r'https?:\/\/(?:(.+)\.)?brainpop\.com\/[^/]+/[^/]+/(?P<id>[^/?#&]+)' _TEST = { 'url': 'https://www.brainpop.com/english/freemovies/williamshakespeare/', 'md5': '676d936271b628dc05e4cec377751919', 'info_dict': { - 'id': 'english/freemovies/williamshakespeare/', + 'id': '3026', + 'display_id': 'williamshakespeare', 'ext': 'mp4', - 'title': 'William Shakespeare - BrainPOP', + 'title': 'William Shakespeare', 'thumbnail': 're:^https?://.*\.png$', 'description': 'He could do comedies, tragedies, histories and poetry. 
Learn about the greatest playwright in the history of the English language!', } } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - self.report_extraction(video_id) + content = self._parse_json(self._html_search_regex(r'var content = ([^;]*)', webpage, 'content'), display_id) + + if content['category']['unit']['topic']['free'] == 'no': + self.raise_login_required('%s is only available for users with Subscriptions' % display_id) + global_content = self._parse_json(self._html_search_regex(r'var global_content = ([^;]*)', webpage, 'global content').replace("'", '"'), display_id) + cdn_path = global_content.get('cdn_path', 'https://cdn.brainpop.com') + movie_cdn_path = global_content.get('movie_cdn_path', 'https://svideos.brainpop.com') ec_token = self._html_search_regex(r"ec_token : '([^']*)'", webpage, 'token') - settings = self._parse_json(self._html_search_regex(r'var settings = ([^;]*)', webpage, 'settings'), video_id) - title = settings['title'] - description = settings['description'] + screenshots = content['category']['unit']['topic'].get('screenshots', {}) + thumbnails = [{'url': cdn_path + screenshot} for screenshot in screenshots] - global_content = self._parse_json(self._html_search_regex(r'var global_content = ([^;]*)', webpage, 'global content').replace("'", '"'), video_id) - cdn_path = global_content['cdn_path'] - movie_cdn_path = global_content['movie_cdn_path'] - - content = self._parse_json(self._html_search_regex(r'var content = ([^;]*)', webpage, 'content'), video_id) movies = content['category']['unit']['topic']['movies'] - screenshots = content['category']['unit']['topic']['screenshots'] - formats = [] formats.append({ 'url': movie_cdn_path + movies['mp4'] + '?' 
+ ec_token, @@ -50,17 +52,14 @@ class BrainPOPIE(InfoExtractor): 'width': 480, }) self._sort_formats(formats) - - thumbnails = [] - for (i, screenshot) in enumerate(screenshots): - thumbnails.append({ - 'url': cdn_path + screenshot, - }) + + settings = self._parse_json(self._html_search_regex(r'var settings = ([^;]*)', webpage, 'settings'), display_id) return { - 'id': video_id, - 'title': title, - 'formats': formats, + 'id': content['category']['unit']['topic']['EntryID'], + 'display_id': display_id, + 'title': remove_end(settings['title'], ' - BrainPOP'), + 'description': settings['description'], 'thumbnails': thumbnails, - 'description': description, + 'formats': formats, } From f02b57d5a7d59cbc63a3c36d9172d57fe0f315b7 Mon Sep 17 00:00:00 2001 From: Nehal Patel Date: Tue, 12 Jul 2016 19:51:50 -0500 Subject: [PATCH 5/5] [BrainPOP] Trim code and make optional metadata less brittle --- youtube_dl/extractor/brainpop.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/brainpop.py b/youtube_dl/extractor/brainpop.py index f3fc66ee1..7f825c114 100644 --- a/youtube_dl/extractor/brainpop.py +++ b/youtube_dl/extractor/brainpop.py @@ -26,20 +26,20 @@ class BrainPOPIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - content = self._parse_json(self._html_search_regex(r'var content = ([^;]*)', webpage, 'content'), display_id) - - if content['category']['unit']['topic']['free'] == 'no': + content = self._parse_json(self._html_search_regex(r'var content = ([^;]*)', webpage, 'content JSON'), display_id) + topic = content['category']['unit']['topic'] + + if topic['free'] == 'no': self.raise_login_required('%s is only available for users with Subscriptions' % display_id) - global_content = self._parse_json(self._html_search_regex(r'var global_content = ([^;]*)', webpage, 'global content').replace("'", '"'), display_id) + global_content = 
self._parse_json(self._html_search_regex(r'var global_content = ([^;]*)', webpage, 'global content JSON').replace("'", '"'), display_id) cdn_path = global_content.get('cdn_path', 'https://cdn.brainpop.com') movie_cdn_path = global_content.get('movie_cdn_path', 'https://svideos.brainpop.com') ec_token = self._html_search_regex(r"ec_token : '([^']*)'", webpage, 'token') - screenshots = content['category']['unit']['topic'].get('screenshots', {}) - thumbnails = [{'url': cdn_path + screenshot} for screenshot in screenshots] + thumbnails = [{'url': cdn_path + screenshot} for screenshot in topic.get('screenshots', {})] - movies = content['category']['unit']['topic']['movies'] + movies = topic['movies'] formats = [] formats.append({ 'url': movie_cdn_path + movies['mp4'] + '?' + ec_token, @@ -52,14 +52,14 @@ class BrainPOPIE(InfoExtractor): 'width': 480, }) self._sort_formats(formats) - - settings = self._parse_json(self._html_search_regex(r'var settings = ([^;]*)', webpage, 'settings'), display_id) + + settings = self._parse_json(self._html_search_regex(r'var settings = ([^;]*)', webpage, 'settings JSON', '{}'), display_id) return { - 'id': content['category']['unit']['topic']['EntryID'], + 'id': topic['EntryID'], 'display_id': display_id, - 'title': remove_end(settings['title'], ' - BrainPOP'), - 'description': settings['description'], + 'title': remove_end(settings.get('title', display_id), ' - BrainPOP'), + 'description': settings.get('description', ''), 'thumbnails': thumbnails, 'formats': formats, }