From ecccfc8f8f30bbf931ba0e4dea0e7cec2891f469 Mon Sep 17 00:00:00 2001 From: jaykbull Date: Wed, 1 Nov 2017 09:44:10 +0100 Subject: [PATCH 1/2] Ignore missing attributes in MPD manifests. Some sites, like thisav, do not include all expected attributes in their MPD manifests. I don't know the MPD/DASH spec at all, so I can't tell if this makes the manifest itself non-compliant... That said, it works when played in a browser. By treating these attributes as not required in code and simply moving along when we cannot find them, we seem to be able to successfully download videos from such sites. This closes https://github.com/rg3/youtube-dl/issues/13784. --- youtube_dl/extractor/common.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 52f2055b5..ddbc04050 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1806,7 +1806,9 @@ class InfoExtractor(object): def extract_Initialization(source): initialization = source.find(_add_ns('Initialization')) if initialization is not None: - ms_info['initialization_url'] = initialization.attrib['sourceURL'] + initialization_source_url = initialization.attrib.get('sourceURL') + if initialization_source_url is not None: + ms_info['initialization_url'] = initialization_source_url segment_list = element.find(_add_ns('SegmentList')) if segment_list is not None: @@ -1814,7 +1816,9 @@ class InfoExtractor(object): extract_Initialization(segment_list) segment_urls_e = segment_list.findall(_add_ns('SegmentURL')) if segment_urls_e: - ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] + segment_urls = [segment.attrib.get('media') for segment in segment_urls_e] + if segment_urls[0] is not None: + ms_info['segment_urls'] = segment_urls else: segment_template = element.find(_add_ns('SegmentTemplate')) if segment_template is not None: From 3a65c37d6eb14b5e5e893263adaa71e7dcce0ca4 Mon Sep 
17 00:00:00 2001 From: jaykbull Date: Fri, 3 Nov 2017 20:57:13 +0100 Subject: [PATCH 2/2] Add test-case for thisav. Fix processing of empty mpd_base_url when mpd_url is present. --- test/test_InfoExtractor.py | 17 ++++ test/testdata/mpd/thisav.mpd | 141 +++++++++++++++++++++++++++++++++ youtube_dl/extractor/common.py | 3 + 3 files changed, 161 insertions(+) create mode 100644 test/testdata/mpd/thisav.mpd diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index f18a823fc..0dc4cc6b0 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -563,6 +563,23 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 1080, }] ), + ( + # https://github.com/rg3/youtube-dl/issues/13784 + 'thisav', + 'http://unknown/manifest.mpd', + [{ + 'url': 'http://unknown/300708_dashinit.mp4', + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_note': 'DASH video', + 'protocol': None, + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.64001e', + 'tbr': 574.578, + 'width': 640, + 'height': 426, + }] + ) ] for mpd_file, mpd_url, expected_formats in _TEST_CASES: diff --git a/test/testdata/mpd/thisav.mpd b/test/testdata/mpd/thisav.mpd new file mode 100644 index 000000000..4cb397fb0 --- /dev/null +++ b/test/testdata/mpd/thisav.mpd @@ -0,0 +1,141 @@ + + + + + 300708 + + + + + + + + + 300708_dashinit.mp4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ddbc04050..c6691e3b8 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1864,6 +1864,9 @@ class InfoExtractor(object): base_url = base_url_e.text + base_url if re.match(r'^https?://', base_url): break + if mpd_base_url 
== '' and re.match(r'^https?://', mpd_url): + mpd_base_url = "/".join(mpd_url.split("/")[0:-1]) + if mpd_base_url and not re.match(r'^https?://', base_url): if not mpd_base_url.endswith('/') and not base_url.startswith('/'): mpd_base_url += '/'