From ab72d143071cc253224b6e35962edb1dc9a34939 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Fri, 7 Apr 2017 23:23:31 -0400 Subject: [PATCH 1/4] [kaltura] Support iframe embeds, with test Note that these need to go back through the Generic extractor because the iframe URLs may be redirects that cannot be parsed by KalturaIE without being followed, and Generic checks for such redirects and follows them. Hence dropping the IE from url_result(). --- youtube_dl/extractor/generic.py | 17 ++++++++++++++++- youtube_dl/extractor/kaltura.py | 7 +++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 73911940c..4bfa3f8a1 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1080,6 +1080,21 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # Kaltura iframe embed + 'url': 'http://www.gsd.harvard.edu/event/i-m-pei-a-centennial-celebration/', + 'md5': 'ae5ace8eb09dc1a35d03b579a9c2cc44', + 'info_dict': { + 'id': '0_f2cfbpwy', + 'ext': 'mp4', + 'title': 'I. M. 
Pei: A Centennial Celebration', + 'description': 'md5:1db8f40c69edc46ca180ba30c567f37c', + 'upload_date': '20170403', + 'uploader_id': 'batchUser', + 'timestamp': 1491232186, + }, + 'add_ie': ['Kaltura'], + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -2290,7 +2305,7 @@ class GenericIE(InfoExtractor): # Look for Kaltura embeds kaltura_url = KalturaIE._extract_url(webpage) if kaltura_url: - return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) + return self.url_result(smuggle_url(kaltura_url, {'source_url': url})) # Look for Eagle.Platform embeds eagleplatform_url = EaglePlatformIE._extract_url(webpage) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 54374ea76..f1e8b25cc 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -139,6 +139,13 @@ class KalturaIE(InfoExtractor): url = smuggle_url(url, {'service_url': service_url.group(1)}) return url + # Check for an iframe, which may require redirection. + mobj = re.search( + r"]+src=['\"](?P(https?:)?//www\.kaltura\.com/[^'\"]+)['\"]", + webpage) + if mobj: + return mobj.group('url') + def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): params = actions[0] if len(actions) > 1: From 07970f3ae299e0ccfc1f1601ce22ad0121666019 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 8 Apr 2017 00:53:23 -0400 Subject: [PATCH 2/4] [Kaltura] Skip failing Kaltura_1 test Fails like so: > /Users/jhawk/src/youtube-dl/youtube_dl/extractor/kaltura.py(266)_real_extract() -> data_url = info['dataUrl'] (Pdb) info {u'message': u'Entry id "0_l5ye1133" not found', u'code': u'ENTRY_ID_NOT_FOUND', u'args': {u'ENTRY_ID': u'0_l5ye1133'}, u'objectType': u'KalturaAPIException'} Also note another URL to the same video, which might be helpful in figuring out the right Kaltura entry ID, at least maybe? 
--- youtube_dl/extractor/kaltura.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index f1e8b25cc..741dd8dc5 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -91,6 +91,7 @@ class KalturaIE(InfoExtractor): }], }, }, + 'skip': 'Gone. Maybe https://www.safaribooksonline.com/library/tutorials/introduction-to-python-anon/3469/', 'params': { 'skip_download': True, }, From 4905e589d74e72fccb6e050d4e0db0d96c8e3930 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 8 Apr 2017 04:21:52 -0400 Subject: [PATCH 3/4] [kaltura] Be rigorous on iframe Per @dstftw, don't pull out just any kaltura.com iframes, make sure they have /p/{PARTNER_ID} and &entry_id={ENTRY_ID} and return a kaltura: URL. Go back to specifying the IE is Kaltura in url_result(). --- youtube_dl/extractor/generic.py | 2 +- youtube_dl/extractor/kaltura.py | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4bfa3f8a1..658533cf6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2305,7 +2305,7 @@ class GenericIE(InfoExtractor): # Look for Kaltura embeds kaltura_url = KalturaIE._extract_url(webpage) if kaltura_url: - return self.url_result(smuggle_url(kaltura_url, {'source_url': url})) + return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) # Look for Eagle.Platform embeds eagleplatform_url = EaglePlatformIE._extract_url(webpage) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 741dd8dc5..6e992ee4b 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -128,7 +128,17 @@ class KalturaIE(InfoExtractor): (?P["\'])entry_?[Ii]d(?P=q2) )\s*:\s* (?P["\'])(?P(?:(?!(?P=q3)).)+)(?P=q3) - ''', webpage)) + ''', webpage) or + re.search( + # + r'''(?xs) + (?P["\']) + 
(?:https?:)?//(?:www\.)?kaltura\.com/p/(?P\d+)/ + (?:(?!(?P=q1)).)* + [\?&]entry_id=(?P(?:(?!(?P=q1))[^&])+) + (?P=q1) + ''', webpage) + ) if mobj: embed_info = mobj.groupdict() url = 'kaltura:%(partner_id)s:%(id)s' % embed_info @@ -140,13 +150,6 @@ class KalturaIE(InfoExtractor): url = smuggle_url(url, {'service_url': service_url.group(1)}) return url - # Check for an iframe, which may require redirection. - mobj = re.search( - r"]+src=['\"](?P(https?:)?//www\.kaltura\.com/[^'\"]+)['\"]", - webpage) - if mobj: - return mobj.group('url') - def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): params = actions[0] if len(actions) > 1: From a8df272cabe79552524759c7a1231c420603c9a9 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 8 Apr 2017 05:33:25 -0400 Subject: [PATCH 4/4] [Kaltura] comment; anchor RE to 'iframe' Remove the example of an iframe spec from the spec page, and instead link to it. Do so at the top of the method as it documents the previous two re.search()s as well. Limit the iframe search to urls that actually appear in an r'''(?xs) - (?P["\']) + ]+src=(?P["\']) (?:https?:)?//(?:www\.)?kaltura\.com/p/(?P\d+)/ (?:(?!(?P=q1)).)* [\?&]entry_id=(?P(?:(?!(?P=q1))[^&])+)