From 70d117053d867e91799ab4eabb2ac68acbd32e37 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Thu, 6 Oct 2016 01:09:53 -0400 Subject: [PATCH 01/29] [IQM2] Add new extractor first cut --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/iqm2.py | 81 ++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 youtube_dl/extractor/iqm2.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index feee06004..6be43a613 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -392,6 +392,7 @@ from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .iqiyi import IqiyiIE +from .iqm2 import IQM2IE from .ir90tv import Ir90TvIE from .ivi import ( IviIE, diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py new file mode 100644 index 000000000..06df0df0c --- /dev/null +++ b/youtube_dl/extractor/iqm2.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse +from .generic import GenericIE + +# IQM2 aka Accela is a municipal meeting management platform that +# (among other things) stores livestreamed video from municipal +# meetings. After a hefty (several-hour) processing time, that video +# is avaialble in easily downloadable form from their web portal, but +# prior to that, the video can only be watched in realtime through +# JWPlayer. This extractor is designed to download the realtime video +# prior to download links being available. See: +# http://www.iqm2.com/About/Accela.aspx +# http://www.accela.com/ + +# This makes it challenging to produce a test case for, because the +# extractor will want to follow the processed and easily downloadble +# version. 
So there may be interesting bugs during the race condition +# time before the processed video is available (which is really the +# only time this extractor is especially important). + +# This is also a relatively braindead extractor. It parses a given page like +# http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679# +# to determine the location of an inner div defined by a URL of the form +# http://cambridgema.iqm2.com/Citizens/VideoScreen.aspx?MediaID=1563&Frame=SplitView + +# and then simply hands that URL to the generic extractor, which +# matches it under the "Broaden the findall a little bit: JWPlayer JS +# loader" (line 2372 as of 6 Oct 2016). + +# This also appears to be the only example to date of an extractor +# that calls-out to the GenericIE generic extractor, so it may be +# useful as an example. Or perhaps it means that there's a better way +# to do this and it should be rewritten differently, esp. to not +# leverage the generic. + +# Contributed by John Hawkinson , 6 Oct 2016. + + +# https://CambridgeMA.IQM2.com/Citizens/VideoMain.aspx?MeetingID=1679 + +class IQM2IE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)?iqm2\.com/Citizens/SplitView.aspx\?Mode=Video&MeetingID=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#', + 'md5': '478ea30eee1966f7be0d8dd623122148', + 'info_dict': { + 'id': '1563_720', + 'ext': 'mp4', + 'title': 'Cambridge, MA (2)', + 'uploader': 'cambridgema.iqm2.com', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + # title = self._html_search_regex(r'

(.*?)

', webpage, 'title') + title = 'Video title goes here' + + purl = compat_urllib_parse_urlparse(url) + hostname = purl.hostname + print "URL is", url, "at", hostname + nurl = self._html_search_regex(r'
Date: Thu, 6 Oct 2016 01:29:58 -0400 Subject: [PATCH 02/29] Case-insensitive URL match --- youtube_dl/extractor/iqm2.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 06df0df0c..eab14cee2 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -32,6 +32,11 @@ from .generic import GenericIE # matches it under the "Broaden the findall a little bit: JWPlayer JS # loader" (line 2372 as of 6 Oct 2016). +# It appears that the metadata associated with the video (like it's +# title) does not appear anywhere in the 2 HTML pages that get +# downloaded through this extractor. So it would need to download +# additional HTTP resources in order to get appropriate metadata. + # This also appears to be the only example to date of an extractor # that calls-out to the GenericIE generic extractor, so it may be # useful as an example. Or perhaps it means that there's a better way @@ -40,11 +45,34 @@ from .generic import GenericIE # Contributed by John Hawkinson , 6 Oct 2016. - +# Potential test URLs: +# http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679 # https://CambridgeMA.IQM2.com/Citizens/VideoMain.aspx?MeetingID=1679 class IQM2IE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?iqm2\.com/Citizens/SplitView.aspx\?Mode=Video&MeetingID=(?P<id>[0-9]+)' + + # xxx is really right that InfoExtractor.suitable() calls re.compile() + # on _VALID_URL in a case-sensitive fashion? It's obviously reasonable + # for the path portion of a URL to be case-sensitive, but the hostname + # ought not to be. And it seems like strict adherence might mess up a + # bunch of extractors in funny-cased URLs? Redefine suitable() to search + # case-insentitively.
Note this also changes the re.match() call at the + # start of _real_extract() + # + # In this case, we commonly see both iqm2.com and IQM2.com + + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + + # This does not use has/getattr intentionally - we want to know whether + # we have cached the regexp for *this* class, whereas getattr would also + # match the superclass + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL, flags=re.IGNORECASE) + return cls._VALID_URL_RE.match(url) is not None + + _VALID_URL = r'https?://(?:\w+\.)?iqm2\.com/Citizens/\w+.aspx\?.*MeetingID=(?P<id>[0-9]+)' _TEST = { 'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#', 'md5': '478ea30eee1966f7be0d8dd623122148', @@ -62,7 +90,7 @@ class IQM2IE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, flags=re.IGNORECASE) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) From 841b2af1582711612007ab0433749e98cf9f4f7b Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Thu, 6 Oct 2016 01:30:44 -0400 Subject: [PATCH 03/29] strip unnecessary comments from template --- youtube_dl/extractor/iqm2.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index eab14cee2..0ddb41300 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -81,11 +81,6 @@ class IQM2IE(InfoExtractor): 'ext': 'mp4', 'title': 'Cambridge, MA (2)', 'uploader': 'cambridgema.iqm2.com', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) } } From 9adff20bb5b41b363aba2e9c13f6c5758dc0395b Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Thu, 6 Oct 2016 01:33:34 -0400 Subject:
[PATCH 04/29] strip unused title= code --- youtube_dl/extractor/iqm2.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 0ddb41300..80071a2e8 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -89,8 +89,6 @@ class IQM2IE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - # title = self._html_search_regex(r'

(.*?)

', webpage, 'title') - title = 'Video title goes here' purl = compat_urllib_parse_urlparse(url) hostname = purl.hostname From e2135e879165d905afd1b509d9959e53a2901df4 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Thu, 6 Oct 2016 01:38:25 -0400 Subject: [PATCH 05/29] Handle relative URLs with urlparse.urljoin() --- youtube_dl/extractor/iqm2.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 80071a2e8..389517984 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse +from ..compat import compat_urlparse from .generic import GenericIE # IQM2 aka Accela is a municipal meeting management platform that @@ -90,13 +90,14 @@ class IQM2IE(InfoExtractor): webpage = self._download_webpage(url, video_id) - purl = compat_urllib_parse_urlparse(url) - hostname = purl.hostname - print "URL is", url, "at", hostname - nurl = self._html_search_regex(r'
Date: Thu, 6 Oct 2016 01:42:30 -0400 Subject: [PATCH 06/29] Comment verbatim example of
--- youtube_dl/extractor/iqm2.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 389517984..dd78ce6cf 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -90,14 +90,20 @@ class IQM2IE(InfoExtractor): webpage = self._download_webpage(url, video_id) - print "Original URL is", url + # print "Original URL is", url + # We want to extract an inner URL like this: + #
+ #
inner_url_rel = self._html_search_regex( r'
Date: Thu, 6 Oct 2016 01:51:06 -0400 Subject: [PATCH 07/29] copyedit comments --- youtube_dl/extractor/iqm2.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index dd78ce6cf..09525cbe1 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -10,38 +10,38 @@ from .generic import GenericIE # IQM2 aka Accela is a municipal meeting management platform that # (among other things) stores livestreamed video from municipal # meetings. After a hefty (several-hour) processing time, that video -# is avaialble in easily downloadable form from their web portal, but +# is available in easily downloadable form from their web portal, but # prior to that, the video can only be watched in realtime through # JWPlayer. This extractor is designed to download the realtime video -# prior to download links being available. See: +# prior to download links being available. For more info on Accela, see: # http://www.iqm2.com/About/Accela.aspx # http://www.accela.com/ -# This makes it challenging to produce a test case for, because the -# extractor will want to follow the processed and easily downloadble -# version. So there may be interesting bugs during the race condition -# time before the processed video is available (which is really the -# only time this extractor is especially important). +# This processing makes it challenging to produce a test case for, +# because the extractor will want to find the processed and easily +# downloadable version. So there may be interesting bugs during the +# race condition time before the processed video is available (which +# is really the only time this extractor is especially important). # This is also a relatively braindead extractor. 
It parses a given page like -# http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679# +# http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679 # to determine the location of an inner div defined by a URL of the form # http://cambridgema.iqm2.com/Citizens/VideoScreen.aspx?MediaID=1563&Frame=SplitView -# and then simply hands that URL to the generic extractor, which -# matches it under the "Broaden the findall a little bit: JWPlayer JS -# loader" (line 2372 as of 6 Oct 2016). +# and then simply hands that URL to the GenericIE generic extractor, +# which matches it under the "Broaden the findall a little bit: +# JWPlayer JS loader" (line 2372 as of 6 Oct 2016). -# It appears that the metadata associated with the video (like it's +# It appears that the metadata associated with the video (like its # title) does not appear anywhere in the 2 HTML pages that get # downloaded through this extractor. So it would need to download -# additional HTTP resources in order to get appropriate metadata. +# additional HTTP resources in order to get "real" metadata. # This also appears to be the only example to date of an extractor -# that calls-out to the GenericIE generic extractor, so it may be +# that calls-out to the generic extractor, so it may be # useful as an example. Or perhaps it means that there's a better way # to do this and it should be rewritten differently, esp. to not -# leverage the generic. +# leverage the generic? (xxx) # Contributed by John Hawkinson , 6 Oct 2016. @@ -56,10 +56,10 @@ class IQM2IE(InfoExtractor): # for the path portion of a URL to be case-sensitive, but the hostname # ought not to be. And it seems like strict adherence might mess up a # bunch of extractors in funny-cased URLs? Redefine suitable() to search - # case-insentitively. Note this also changes the re.match() call at the + # case-insensitively. 
Note this also changes the re.match() call at the # start of _real_extract() # - # In this case, we commonly see both iqm2.com and IQM2.com + # In this case, we commonly see both iqm2.com and IQM2.com. @classmethod def suitable(cls, url): From e468f9196851b9e72618d4b0b864276acc8f2ee6 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 8 Oct 2016 19:17:32 -0400 Subject: [PATCH 08/29] Move test cases from comment to _TESTS --- youtube_dl/extractor/iqm2.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 09525cbe1..9ece64421 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -45,10 +45,6 @@ from .generic import GenericIE # Contributed by John Hawkinson , 6 Oct 2016. -# Potential test URLs: -# http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679 -# https://CambridgeMA.IQM2.com/Citizens/VideoMain.aspx?MeetingID=1679 - class IQM2IE(InfoExtractor): # xxx is really right that InfoExtractor.suitable() calls re.compile() @@ -73,7 +69,7 @@ class IQM2IE(InfoExtractor): return cls._VALID_URL_RE.match(url) is not None _VALID_URL = r'https?://(?:\w+\.)?iqm2\.com/Citizens/\w+.aspx\?.*MeetingID=(?P<id>[0-9]+)' - _TEST = { + _TESTS = [ { 'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#', 'md5': '478ea30eee1966f7be0d8dd623122148', 'info_dict': { @@ -81,8 +77,10 @@ class IQM2IE(InfoExtractor): 'ext': 'mp4', 'title': 'Cambridge, MA (2)', 'uploader': 'cambridgema.iqm2.com', - } - } + }}, { + 'url': 'https://CambridgeMA.IQM2.com/Citizens/VideoMain.aspx?MeetingID=1679', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, flags=re.IGNORECASE) From 71fbddb78a33e4f8716c0fdec8cccbb4788d4f90 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 8 Oct 2016 19:21:33 -0400 Subject: [PATCH 09/29] Use (?i) for case-insensitivity in URLs ---
youtube_dl/extractor/iqm2.py | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 9ece64421..3d1231628 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -47,28 +47,8 @@ from .generic import GenericIE class IQM2IE(InfoExtractor): - # xxx is really right that InfoExtractor.suitable() calls re.compile() - # on _VALID_URL in a case-sensitive fashion? It's obviously reasonable - # for the path portion of a URL to be case-sensitive, but the hostname - # ought not to be. And it seems like strict adherence might mess up a - # bunch of extractors in funny-cased URLs? Redefine suitable() to search - # case-insensitively. Note this also changes the re.match() call at the - # start of _real_extract() - # - # In this case, we commonly see both iqm2.com and IQM2.com. - - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - - # This does not use has/getattr intentionally - we want to know whether - # we have cached the regexp for *this* class, whereas getattr would also - # match the superclass - if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL, flags=re.IGNORECASE) - return cls._VALID_URL_RE.match(url) is not None - - _VALID_URL = r'https?://(?:\w+\.)?iqm2\.com/Citizens/\w+.aspx\?.*MeetingID=(?P<id>[0-9]+)' + # We commonly see both iqm2.com and IQM2.com.
+ _VALID_URL = r'(?i)https?://(?:\w+\.)?iqm2\.com/Citizens/\w+.aspx\?.*MeetingID=(?P<id>[0-9]+)' _TESTS = [ { 'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#', 'md5': '478ea30eee1966f7be0d8dd623122148', @@ -83,7 +63,7 @@ class IQM2IE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url, flags=re.IGNORECASE) + mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) From e6a1522743358ea9254339a756e77819af2d86f6 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 8 Oct 2016 19:23:59 -0400 Subject: [PATCH 10/29] Tighten up regex comment --- youtube_dl/extractor/iqm2.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 3d1231628..7c639b3a3 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -70,12 +70,8 @@ class IQM2IE(InfoExtractor): # print "Original URL is", url - # We want to extract an inner URL like this: #
- #
+ #
inner_url_rel = self._html_search_regex( r'
Date: Sat, 8 Oct 2016 19:31:50 -0400 Subject: [PATCH 11/29] Use url_result instead of instance of GenericIE() --- youtube_dl/extractor/iqm2.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 7c639b3a3..a348917f0 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -27,22 +27,13 @@ from .generic import GenericIE # http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679 # to determine the location of an inner div defined by a URL of the form # http://cambridgema.iqm2.com/Citizens/VideoScreen.aspx?MediaID=1563&Frame=SplitView - -# and then simply hands that URL to the GenericIE generic extractor, -# which matches it under the "Broaden the findall a little bit: -# JWPlayer JS loader" (line 2372 as of 6 Oct 2016). +# and then feed it to the generic extractor. # It appears that the metadata associated with the video (like its # title) does not appear anywhere in the 2 HTML pages that get # downloaded through this extractor. So it would need to download # additional HTTP resources in order to get "real" metadata. -# This also appears to be the only example to date of an extractor -# that calls-out to the generic extractor, so it may be -# useful as an example. Or perhaps it means that there's a better way -# to do this and it should be rewritten differently, esp. to not -# leverage the generic? (xxx) - # Contributed by John Hawkinson , 6 Oct 2016. class IQM2IE(InfoExtractor): @@ -80,4 +71,7 @@ class IQM2IE(InfoExtractor): inner_url = compat_urlparse.urljoin(url, inner_url_rel) # print "Joined URL is", inner_url - return GenericIE(self._downloader)._real_extract(inner_url) + # Generic extractor matches this under the "Broaden the + # findall a little bit: JWPlayer JS loader" (line 2372 as of 6 + # Oct 2016, dcdb292fddc82ae11f4c0b647815a45c88a6b6d5). 
+ return self.url_result(inner_url, 'Generic') From 0bdb0c707b3703b20a39f2171f6a3bcb3064cf38 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 8 Oct 2016 19:38:28 -0400 Subject: [PATCH 12/29] Condense comments, distribute --- youtube_dl/extractor/iqm2.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index a348917f0..5200b1f01 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -17,22 +17,12 @@ from .generic import GenericIE # http://www.iqm2.com/About/Accela.aspx # http://www.accela.com/ -# This processing makes it challenging to produce a test case for, -# because the extractor will want to find the processed and easily -# downloadable version. So there may be interesting bugs during the -# race condition time before the processed video is available (which -# is really the only time this extractor is especially important). +# This processing makes hard to test since there's only a narrow +# window when it matters. After that the extractor finds links to +# the processed video intead. -# This is also a relatively braindead extractor. It parses a given page like -# http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679 -# to determine the location of an inner div defined by a URL of the form -# http://cambridgema.iqm2.com/Citizens/VideoScreen.aspx?MediaID=1563&Frame=SplitView -# and then feed it to the generic extractor. - -# It appears that the metadata associated with the video (like its -# title) does not appear anywhere in the 2 HTML pages that get -# downloaded through this extractor. So it would need to download -# additional HTTP resources in order to get "real" metadata. +# No metadata is retrieved, as that would require finding a metadata +# URL and retreiving a 3rd HTTP resource. # Contributed by John Hawkinson , 6 Oct 2016. 
@@ -61,8 +51,12 @@ class IQM2IE(InfoExtractor): # print "Original URL is", url - #
- #
+ # Simple extractor: take, e.g. + # http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679 + # and look for + #
+ #
+ # and feed the canonicalized src element to the generic extractor inner_url_rel = self._html_search_regex( r'
Date: Sat, 8 Oct 2016 21:01:43 -0400 Subject: [PATCH 13/29] Some instances don't have downloadable video E.g. somervillecityma.iqm2.com only has the JWPlayer video. Makes a better test case, so add it as the first. --- youtube_dl/extractor/iqm2.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 5200b1f01..15eaac1a3 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -9,13 +9,17 @@ from .generic import GenericIE # IQM2 aka Accela is a municipal meeting management platform that # (among other things) stores livestreamed video from municipal -# meetings. After a hefty (several-hour) processing time, that video -# is available in easily downloadable form from their web portal, but -# prior to that, the video can only be watched in realtime through -# JWPlayer. This extractor is designed to download the realtime video -# prior to download links being available. For more info on Accela, see: +# meetings. In some cases (e.g. cambridgema.iqm2.com), after a hefty +# (several-hour) processing time, that video is available in easily +# downloadable form from their web portal, but prior to that, the +# video can only be watched in realtime through JWPlayer. Other +# (somervillecityma.iqm2.com) instances don't seem to ever offer a +# downloadable form. This extractor is designed to download the +# realtime video without the download links being available. For more +# info on Accela, see: # http://www.iqm2.com/About/Accela.aspx # http://www.accela.com/ +# https://github.com/Accela-Inc/leg-man-api-docs # This processing makes hard to test since there's only a narrow # window when it matters. After that the extractor finds links to @@ -31,6 +35,14 @@ class IQM2IE(InfoExtractor): # We commonly see both iqm2.com and IQM2.com. 
_VALID_URL = r'(?i)https?://(?:\w+\.)?iqm2\.com/Citizens/\w+.aspx\?.*MeetingID=(?P<id>[0-9]+)' _TESTS = [ { + 'url': 'http://somervillecityma.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=2308', + 'md5': '9ef458ff6c93f8b9323cf79db4ede9cf', + 'info_dict': { + 'id': '70472_480', + 'ext': 'mp4', + 'title': 'City of Somerville, Massachusetts', + 'uploader': 'somervillecityma.iqm2.com', + }}, { 'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#', 'md5': '478ea30eee1966f7be0d8dd623122148', 'info_dict': { From 9a2ed02b6539df289cc970ec3d1fe1bfef8656ac Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 8 Oct 2016 22:46:09 -0400 Subject: [PATCH 14/29] debugging print -> self.to_screen() --- youtube_dl/extractor/iqm2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 15eaac1a3..435978258 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -75,7 +75,9 @@ class IQM2IE(InfoExtractor): # print "inner_URL is", inner_url_rel inner_url = compat_urlparse.urljoin(url, inner_url_rel) - # print "Joined URL is", inner_url + + if self._downloader.params.get('verbose'): + self.to_screen('Invoking downloader on %s' % inner_url) # Generic extractor matches this under the "Broaden the # findall a little bit: JWPlayer JS loader" (line 2372 as of 6 From 2cc84715bcab3f834c3e493e4fd1b0221fad1e56 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 8 Oct 2016 23:11:07 -0400 Subject: [PATCH 15/29] Set to_generic -> True to suppress fallback msg --- youtube_dl/extractor/iqm2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 435978258..9ff545455 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -1,5 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals +from ..utils import smuggle_url import re @@ -82,4 +83,6 @@
class IQM2IE(InfoExtractor): # Generic extractor matches this under the "Broaden the # findall a little bit: JWPlayer JS loader" (line 2372 as of 6 # Oct 2016, dcdb292fddc82ae11f4c0b647815a45c88a6b6d5). - return self.url_result(inner_url, 'Generic') + + return self.url_result(smuggle_url(inner_url, {'to_generic': True}), + 'Generic') From f46aea8404fdaa2594549be6868d9be5d95bcbfa Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 8 Oct 2016 23:12:37 -0400 Subject: [PATCH 16/29] remove blank line --- youtube_dl/extractor/iqm2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 9ff545455..01e409d87 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -83,6 +83,5 @@ class IQM2IE(InfoExtractor): # Generic extractor matches this under the "Broaden the # findall a little bit: JWPlayer JS loader" (line 2372 as of 6 # Oct 2016, dcdb292fddc82ae11f4c0b647815a45c88a6b6d5). - return self.url_result(smuggle_url(inner_url, {'to_generic': True}), 'Generic') From 8d8acd193efdb1441276bcf35fed2231c3dc1c60 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sun, 9 Oct 2016 08:00:15 -0400 Subject: [PATCH 17/29] Use _match_id() instead of re.match() Oops, when I created this extractor I copied the sample code from the 2014 manpage on my system, thus missing 4bc77c8417ca0340d09dcebb311d06aa7d5ba0ac's introduction of the _match_id() helper function. 
--- youtube_dl/extractor/iqm2.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 01e409d87..d20b78146 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -57,9 +57,7 @@ class IQM2IE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) # print "Original URL is", url From 6589917a0b7d2ba742b557094933e65b2b3f0dbd Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sun, 9 Oct 2016 09:15:52 -0400 Subject: [PATCH 18/29] Remove #'d debugging per @yan12125 --- youtube_dl/extractor/iqm2.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index d20b78146..6244990c4 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -60,8 +60,6 @@ class IQM2IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - # print "Original URL is", url - # Simple extractor: take, e.g. # http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679 # and look for @@ -71,7 +69,6 @@ class IQM2IE(InfoExtractor): inner_url_rel = self._html_search_regex( r'
Date: Sun, 9 Oct 2016 09:18:23 -0400 Subject: [PATCH 19/29] Remove verbose printing It should move to YoutubeDL.process_ie_result per @yan12125, future pull request forthcoming. --- youtube_dl/extractor/iqm2.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 6244990c4..57f96a036 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -72,9 +72,6 @@ class IQM2IE(InfoExtractor): inner_url = compat_urlparse.urljoin(url, inner_url_rel) - if self._downloader.params.get('verbose'): - self.to_screen('Invoking downloader on %s' % inner_url) - # Generic extractor matches this under the "Broaden the # findall a little bit: JWPlayer JS loader" (line 2372 as of 6 # Oct 2016, dcdb292fddc82ae11f4c0b647815a45c88a6b6d5). From c1ce8deed8b1c4bbf762282c2b3b229494ffa0f6 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Mon, 10 Oct 2016 01:57:56 -0400 Subject: [PATCH 20/29] re-fill _TESTS (whitespace) --- youtube_dl/extractor/iqm2.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 57f96a036..f3930a5c9 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -35,23 +35,26 @@ class IQM2IE(InfoExtractor): # We commonly see both iqm2.com and IQM2.com. 
_VALID_URL = r'(?i)https?://(?:\w+\.)?iqm2\.com/Citizens/\w+.aspx\?.*MeetingID=(?P[0-9]+)' - _TESTS = [ { + _TESTS = [ + { 'url': 'http://somervillecityma.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=2308', - 'md5': '9ef458ff6c93f8b9323cf79db4ede9cf', - 'info_dict': { - 'id': '70472_480', - 'ext': 'mp4', - 'title': 'City of Somerville, Massachusetts', - 'uploader': 'somervillecityma.iqm2.com', - }}, { - 'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#', - 'md5': '478ea30eee1966f7be0d8dd623122148', - 'info_dict': { - 'id': '1563_720', - 'ext': 'mp4', - 'title': 'Cambridge, MA (2)', - 'uploader': 'cambridgema.iqm2.com', - }}, { + 'md5': '9ef458ff6c93f8b9323cf79db4ede9cf', + 'info_dict': { + 'id': '70472_480', + 'ext': 'mp4', + 'title': 'City of Somerville, Massachusetts', + 'uploader': 'somervillecityma.iqm2.com', + }}, + { + 'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#', + 'md5': '478ea30eee1966f7be0d8dd623122148', + 'info_dict': { + 'id': '1563_720', + 'ext': 'mp4', + 'title': 'Cambridge, MA (2)', + 'uploader': 'cambridgema.iqm2.com', + }}, + { 'url': 'https://CambridgeMA.IQM2.com/Citizens/VideoMain.aspx?MeetingID=1679', 'only_matching': True, }] From 00253f8312b0860fef8d9178d03fcb39cef6e0cd Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Mon, 10 Oct 2016 02:07:34 -0400 Subject: [PATCH 21/29] [JWPlatformBase] handle a few more cases * If our parsed JSON ends up as a list, rather than a dict, then store it in ['sources'] as that list, rather than trying to wrap it in an array, which leads to type errors. (Such a list indicates multiple file formats/sources, rather than a playlist.) 
* Allow format labels like 'SD 480' and 'HD 720' in addition to '1080p' --- youtube_dl/extractor/jwplatform.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 5d56e0a28..ff7097160 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -50,7 +50,11 @@ class JWPlatformBaseIE(InfoExtractor): # JWPlayer backward compatibility: flattened sources # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 if 'sources' not in video_data: - video_data['sources'] = [video_data] + if isinstance(video_data, list): + video_data = {'sources': video_data } + video_data['tracks'] = video_data['sources'][0].get('tracks') + else: + video_data['sources'] = [video_data] this_video_id = video_id or video_data['mediaid'] @@ -78,9 +82,10 @@ class JWPlatformBaseIE(InfoExtractor): height = int_or_none(source.get('height')) if height is None: # Often no height is provided but there is a label in - # format like 1080p. 
+ # format like 1080p or 'SD 480' height = int_or_none(self._search_regex( - r'^(\d{3,})[pP]$', source.get('label') or '', + [r'^(\d{3,})[pP]$', r'^[SH]D (\d{3,})$'], + source.get('label') or '', 'height', default=None)) a_format = { 'url': source_url, From 87af84de37ce812634c031c63bc7b49beb0252b1 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Mon, 10 Oct 2016 02:12:44 -0400 Subject: [PATCH 22/29] Leverage JWPlatformBase instead of Generic Per @yan12125's suggestion: * Redefine _find_jwplayer_data() to use the SetupJWPlayer RE that's IQM2-specific * Retrieve the secondary webpage on our own * Search for the title just like generic does --- youtube_dl/extractor/iqm2.py | 40 ++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index f3930a5c9..1fa14ac07 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -6,7 +6,7 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse -from .generic import GenericIE +from .jwplatform import JWPlatformBaseIE # IQM2 aka Accela is a municipal meeting management platform that # (among other things) stores livestreamed video from municipal @@ -31,52 +31,62 @@ from .generic import GenericIE # Contributed by John Hawkinson , 6 Oct 2016. -class IQM2IE(InfoExtractor): +class IQM2IE(JWPlatformBaseIE): # We commonly see both iqm2.com and IQM2.com.
_VALID_URL = r'(?i)https?://(?:\w+\.)?iqm2\.com/Citizens/\w+.aspx\?.*MeetingID=(?P[0-9]+)' _TESTS = [ - { + { # This is a "realtime" case 'url': 'http://somervillecityma.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=2308', 'md5': '9ef458ff6c93f8b9323cf79db4ede9cf', 'info_dict': { - 'id': '70472_480', + 'id': '2308', 'ext': 'mp4', 'title': 'City of Somerville, Massachusetts', - 'uploader': 'somervillecityma.iqm2.com', }}, { + # This is a "postprocessed" case 'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#', 'md5': '478ea30eee1966f7be0d8dd623122148', 'info_dict': { - 'id': '1563_720', + 'id': '1679', 'ext': 'mp4', - 'title': 'Cambridge, MA (2)', - 'uploader': 'cambridgema.iqm2.com', + 'title': 'Cambridge, MA', }}, { 'url': 'https://CambridgeMA.IQM2.com/Citizens/VideoMain.aspx?MeetingID=1679', 'only_matching': True, }] + def _find_jwplayer_data(self, webpage): + mobj = re.search(r'SetupJWPlayer\(eval\(\'(?P[^)]+)\'\)', webpage) + if mobj: + return mobj.group('options') + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - # Simple extractor: take, e.g. + # Take, e.g. # http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679 # and look for #
#
- # and feed the canonicalized src element to the generic extractor + # and then parse the canonicalized src element inner_url_rel = self._html_search_regex( r'
(.*?)', webpage, 'video title', + default='video') + info_dict['title'] = video_title + + return info_dict From cb2e9ec69e135aefb4bb1c8f3e40bcddac3bfd7a Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Mon, 10 Oct 2016 02:24:39 -0400 Subject: [PATCH 23/29] Use subsidiary page's media ID, not parent page Video IDs should be based on the unique ID of the video, not the meeting ID of the parent page that links to the media page. Unfortunately we don't learn the media ID until after downloading the first page. --- youtube_dl/extractor/iqm2.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 1fa14ac07..ee3b46450 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -40,7 +40,7 @@ class IQM2IE(JWPlatformBaseIE): 'url': 'http://somervillecityma.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=2308', 'md5': '9ef458ff6c93f8b9323cf79db4ede9cf', 'info_dict': { - 'id': '2308', + 'id': '70472', 'ext': 'mp4', 'title': 'City of Somerville, Massachusetts', }}, @@ -49,7 +49,7 @@ class IQM2IE(JWPlatformBaseIE): 'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#', 'md5': '478ea30eee1966f7be0d8dd623122148', 'info_dict': { - 'id': '1679', + 'id': '1563', 'ext': 'mp4', 'title': 'Cambridge, MA', }}, @@ -64,8 +64,8 @@ class IQM2IE(JWPlatformBaseIE): return mobj.group('options') def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + parent_id = self._match_id(url) + webpage = self._download_webpage(url, parent_id) # Take, e.g. 
# http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679 @@ -78,6 +78,10 @@ class IQM2IE(JWPlatformBaseIE): webpage, 'url'); inner_url = compat_urlparse.urljoin(url, inner_url_rel) + mobj = re.match( + r'(?i)https?://(?:\w+\.)?iqm2\.com/Citizens/\w+.aspx\?.*MediaID=(?P[0-9]+)', + inner_url) + video_id = mobj.group('id') webpage = self._download_webpage(inner_url, video_id) info_dict = self._extract_jwplayer_data( From 02f4d4e44f0b0ec7c4a1e0b3d991e068c5f5e86e Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Mon, 10 Oct 2016 10:55:42 -0400 Subject: [PATCH 24/29] Move JWPlayer JSON array handling to IQM2 Out of @yan12125's concern that the presumption that JWPlayer data as an array representing multiple formats rather than a playlist might be specific to IQM2, move this code from jwplatform.py to iqm2.py. JWPlatformBase now reverts to throwing a TypeError if it gets an array. Now IQM2 redefines the _extract_jwplayer_data() method as well, but it continues to leverage JWPlatformBase for _parse_jwplayer_data(), which is the bulk of the work. 
--- youtube_dl/extractor/iqm2.py | 14 +++++++++++++- youtube_dl/extractor/jwplatform.py | 6 +----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index ee3b46450..367aec9ff 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -1,12 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -from ..utils import smuggle_url import re from .common import InfoExtractor from ..compat import compat_urlparse from .jwplatform import JWPlatformBaseIE +from ..utils import js_to_json # IQM2 aka Accela is a municipal meeting management platform that # (among other things) stores livestreamed video from municipal @@ -63,6 +63,18 @@ class IQM2IE(JWPlatformBaseIE): if mobj: return mobj.group('options') + def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): + jwplayer_data = self._parse_json( + self._find_jwplayer_data(webpage), video_id, + transform_source=js_to_json) + + assert(isinstance(jwplayer_data, list)) + jwplayer_data = {'sources': jwplayer_data } + jwplayer_data['tracks'] = jwplayer_data['sources'][0].get('tracks') + + return self._parse_jwplayer_data( + jwplayer_data, video_id, *args, **kwargs) + def _real_extract(self, url): parent_id = self._match_id(url) webpage = self._download_webpage(url, parent_id) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index ff7097160..ea848f529 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -50,11 +50,7 @@ class JWPlatformBaseIE(InfoExtractor): # JWPlayer backward compatibility: flattened sources # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 if 'sources' not in video_data: - if isinstance(video_data, list): - video_data = {'sources': video_data } - video_data['tracks'] = video_data['sources'][0].get('tracks') - else: - video_data['sources'] = [video_data] + video_data['sources'] = [video_data] 
this_video_id = video_id or video_data['mediaid'] From 3cd6469e364e23f6dd4ff6f322af707353ec2043 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Tue, 18 Oct 2016 00:00:11 -0400 Subject: [PATCH 25/29] [iqm2] Improve jwplayer_data regexp Surprise! URLs can have parentheses in them, just like @dstftw reminded us in https://github.com/rg3/youtube-dl/pull/10926/files/001a30f3352bcc829d8e6d4060af2a19cc2c4a82#r83538385 Use . instead of [^)] and anchor the regexp with \)\); at the end. Add an only_matching test case for this, although it may switch from realtime to processed in a few hours... --- youtube_dl/extractor/iqm2.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 367aec9ff..c5576e5fd 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -56,10 +56,15 @@ class IQM2IE(JWPlatformBaseIE): { 'url': 'https://CambridgeMA.IQM2.com/Citizens/VideoMain.aspx?MeetingID=1679', 'only_matching': True, - }] + }, + { + 'url': 'https://CambridgeMA.IQM2.com/Citizens/VideoMain.aspx?MeetingID=1594', + 'only_matching': True, + } + ] def _find_jwplayer_data(self, webpage): - mobj = re.search(r'SetupJWPlayer\(eval\(\'(?P[^)]+)\'\)', webpage) + mobj = re.search(r'SetupJWPlayer\(eval\(\'(?P.+)\'\)\);', webpage) if mobj: return mobj.group('options') From 6f919d74c4f5689c51d90986d95b94df472b545c Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Tue, 24 Jan 2017 00:40:14 -0500 Subject: [PATCH 26/29] [iqm2] reduce documentation to 1 line Comply with youtube-dl coding standards. Reduce top comment to single-line summary per @yan12125. Move explanation of problems testing extractor into tests.
--- youtube_dl/extractor/iqm2.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index c5576e5fd..f2a3a801c 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -8,23 +8,7 @@ from ..compat import compat_urlparse from .jwplatform import JWPlatformBaseIE from ..utils import js_to_json -# IQM2 aka Accela is a municipal meeting management platform that -# (among other things) stores livestreamed video from municipal -# meetings. In some cases (e.g. cambridgema.iqm2.com), after a hefty -# (several-hour) processing time, that video is available in easily -# downloadable form from their web portal, but prior to that, the -# video can only be watched in realtime through JWPlayer. Other -# (somervillecityma.iqm2.com) instances don't seem to ever offer a -# downloadable form. This extractor is designed to download the -# realtime video without the download links being available. For more -# info on Accela, see: -# http://www.iqm2.com/About/Accela.aspx -# http://www.accela.com/ -# https://github.com/Accela-Inc/leg-man-api-docs - -# This processing makes hard to test since there's only a narrow -# window when it matters. After that the extractor finds links to -# the processed video intead. +# IQM2 aka Accela stores livestreamed video from municipal meetings. # No metadata is retrieved, as that would require finding a metadata # URL and retreiving a 3rd HTTP resource. @@ -36,7 +20,19 @@ class IQM2IE(JWPlatformBaseIE): # We commonly see both iqm2.com and IQM2.com. _VALID_URL = r'(?i)https?://(?:\w+\.)?iqm2\.com/Citizens/\w+.aspx\?.*MeetingID=(?P[0-9]+)' _TESTS = [ - { # This is a "realtime" case + # In some cases (e.g. 
cambridgema.iqm2.com), after a hefty + # (several-hour) processing time, that video is available in easily + # downloadable form from their web portal, but prior to that, the + # video can only be watched in realtime through JWPlayer. Other + # (somervillecityma.iqm2.com) instances don't seem to ever offer a + # downloadable form. This extractor is designed to download the + # realtime video without the download links being available. + # + # This processing makes it hard to test since there's only a narrow + # window when it matters. After that the extractor finds links to the + # processed video intead. + { + # This is a "realtime" case 'url': 'http://somervillecityma.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=2308', 'md5': '9ef458ff6c93f8b9323cf79db4ede9cf', 'info_dict': { From 0a2d66088c3851ac79600c09110d0b098cd3ed5d Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Tue, 24 Jan 2017 00:45:34 -0500 Subject: [PATCH 27/29] [iqm2] don't reuse webpage var twice --- youtube_dl/extractor/iqm2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index f2a3a801c..cffd0790a 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -78,7 +78,7 @@ class IQM2IE(JWPlatformBaseIE): def _real_extract(self, url): parent_id = self._match_id(url) - webpage = self._download_webpage(url, parent_id) + parent_page = self._download_webpage(url, parent_id) # Take, e.g. # http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679 @@ -88,7 +88,7 @@ class IQM2IE(JWPlatformBaseIE): # and then parse the canonicalized src element inner_url_rel = self._html_search_regex( r'
Date: Sat, 4 Mar 2017 19:04:17 -0500 Subject: [PATCH 28/29] move toplevel comment into IE_DESC and move metadata comment further down. --- youtube_dl/extractor/iqm2.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index cffd0790a..e025429f7 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -8,15 +8,10 @@ from ..compat import compat_urlparse from .jwplatform import JWPlatformBaseIE from ..utils import js_to_json -# IQM2 aka Accela stores livestreamed video from municipal meetings. - -# No metadata is retrieved, as that would require finding a metadata -# URL and retreiving a 3rd HTTP resource. - # Contributed by John Hawkinson , 6 Oct 2016. class IQM2IE(JWPlatformBaseIE): - + IE_DESC = 'IQM2 (aka Accela) livestreamed video from municipal meetings' # We commonly see both iqm2.com and IQM2.com. _VALID_URL = r'(?i)https?://(?:\w+\.)?iqm2\.com/Citizens/\w+.aspx\?.*MeetingID=(?P[0-9]+)' _TESTS = [ @@ -105,5 +100,8 @@ class IQM2IE(JWPlatformBaseIE): r'(?s)(.*?)', webpage, 'video title', default='video') info_dict['title'] = video_title - + + # No metadata is retrieved, as that would require finding a metadata + # URL and retrieving a 3rd HTTP resource. + return info_dict From c7645161440eb3f190203feaac532fe97c9d1592 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Tue, 7 Mar 2017 23:50:34 -0500 Subject: [PATCH 29/29] [IQM2] flake8 for PEP 8 --- youtube_dl/extractor/iqm2.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/iqm2.py b/youtube_dl/extractor/iqm2.py index 34f5808e8..f8a08f489 100644 --- a/youtube_dl/extractor/iqm2.py +++ b/youtube_dl/extractor/iqm2.py @@ -9,6 +9,7 @@ from ..utils import js_to_json # Contributed by John Hawkinson , 6 Oct 2016. 
+ class IQM2IE(InfoExtractor): IE_DESC = 'IQM2 (aka Accela) livestreamed video from municipal meetings' # We commonly see both iqm2.com and IQM2.com. @@ -35,7 +36,7 @@ class IQM2IE(InfoExtractor): 'title': 'City of Somerville, Massachusetts', }}, { - # This is a "postprocessed" case + # This is a "postprocessed" case 'url': 'http://cambridgema.iqm2.com/Citizens/SplitView.aspx?Mode=Video&MeetingID=1679#', 'md5': '478ea30eee1966f7be0d8dd623122148', 'info_dict': { @@ -50,23 +51,23 @@ class IQM2IE(InfoExtractor): { 'url': 'https://CambridgeMA.IQM2.com/Citizens/VideoMain.aspx?MeetingID=1594', 'only_matching': True, - } + }, ] def _find_jwplayer_data(self, webpage): mobj = re.search(r'SetupJWPlayer\(eval\(\'(?P.+)\'\)\);', webpage) if mobj: return mobj.group('options') - + def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): jwplayer_data = self._parse_json( self._find_jwplayer_data(webpage), video_id, transform_source=js_to_json) assert(isinstance(jwplayer_data, list)) - jwplayer_data = {'sources': jwplayer_data } + jwplayer_data = {'sources': jwplayer_data} jwplayer_data['tracks'] = jwplayer_data['sources'][0].get('tracks') - + return self._parse_jwplayer_data( jwplayer_data, video_id, *args, **kwargs) @@ -82,7 +83,7 @@ class IQM2IE(InfoExtractor): # and then parse the canonicalized src element inner_url_rel = self._html_search_regex( r'