diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 4dd1a6e59..513823b9b 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.11.15*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.11.15** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.02** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.11.15 +[debug] youtube-dl version 2017.12.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index c4a84c597..63837d62b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,54 @@ -version +version 2017.12.02 + +Core ++ [downloader/fragment] Commit part file after each fragment ++ [extractor/common] Add 
durations for DASH fragments with bare SegmentURLs ++ [extractor/common] Add support for DASH manifests with SegmentLists with + bare SegmentURLs (#14844) ++ [utils] Add hvc1 codec code to parse_codecs Extractors +* [xhamster] Fix extraction (#14884) +* [youku] Update ccode (#14872) +* [mnet] Fix format extraction (#14883) ++ [xiami] Add Referer header to API request +* [mtv] Correct scc extension in extracted subtitles (#13730) +* [vvvvid] Fix extraction for kenc videos (#13406) ++ [br] Add support for BR Mediathek videos (#14560, #14788) ++ [daisuki] Add support for motto.daisuki.com (#14681) +* [odnoklassniki] Fix API metadata request (#14862) +* [itv] Fix HLS formats extraction ++ [pbs] Add another media id regular expression + + +version 2017.11.26 + +Core +* [extractor/common] Use final URL when dumping request (#14769) + +Extractors +* [fczenit] Fix extraction +- [firstpost] Remove extractor +* [freespeech] Fix extraction +* [nexx] Extract more formats ++ [openload] Add support for openload.link (#14763) +* [empflix] Relax URL regular expression +* [empflix] Fix extraction +* [tnaflix] Don't modify download URLs (#14811) +- [gamersyde] Remove extractor +* [francetv:generationwhat] Fix extraction ++ [massengeschmacktv] Add support for Massengeschmack TV +* [fox9] Fix extraction +* [faz] Fix extraction and add support for Perform Group embeds (#14714) ++ [performgroup] Add support for performgroup.com ++ [jwplatform] Add support for iframes (#14828) +* [culturebox] Fix extraction (#14827) * [youku] Fix extraction; update ccode (#14815) -+ [JWPlatform] support iframes (#14828) +* [livestream] Make SMIL extraction non fatal (#14792) ++ [drtuber] Add support for mobile URLs (#14772) ++ [spankbang] Add support for mobile URLs (#14771) +* [instagram] Fix description, timestamp and counters extraction (#14755) + version 2017.11.15 diff --git a/Makefile b/Makefile index c74eea792..1c760bef8 100644 --- a/Makefile +++ b/Makefile @@ -36,8 +36,17 @@ test: ot: 
offlinetest +# Keep this list in sync with devscripts/run_tests.sh offlinetest: codetest - $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py --exclude test_socks.py + $(PYTHON) -m nose --verbose test \ + --exclude test_age_restriction.py \ + --exclude test_download.py \ + --exclude test_iqiyi_sdk_interpreter.py \ + --exclude test_socks.py \ + --exclude test_subtitles.py \ + --exclude test_write_annotations.py \ + --exclude test_youtube_lists.py \ + --exclude test_youtube_signature.py tar: youtube-dl.tar.gz @@ -110,11 +119,10 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash- --exclude '*~' \ --exclude '__pycache__' \ --exclude '.git' \ - --exclude 'testdata' \ --exclude 'docs/_build' \ -- \ bin devscripts test youtube_dl docs \ ChangeLog LICENSE README.md README.txt \ Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion \ - youtube-dl.zsh youtube-dl.fish setup.py \ + youtube-dl.zsh youtube-dl.fish setup.py setup.cfg \ youtube-dl diff --git a/devscripts/run_tests.sh b/devscripts/run_tests.sh index 6ba26720d..dd37a80f5 100755 --- a/devscripts/run_tests.sh +++ b/devscripts/run_tests.sh @@ -1,6 +1,7 @@ #!/bin/bash -DOWNLOAD_TESTS="age_restriction|download|subtitles|write_annotations|iqiyi_sdk_interpreter|youtube_lists" +# Keep this list in sync with the `offlinetest` target in Makefile +DOWNLOAD_TESTS="age_restriction|download|iqiyi_sdk_interpreter|socks|subtitles|write_annotations|youtube_lists|youtube_signature" test_set="" multiprocess_args="" diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 6009df571..0287a4011 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -112,11 +112,12 @@ - **BokeCC** - **BostonGlobe** - **Bpb**: Bundeszentrale für politische Bildung - - **BR**: Bayerischer Rundfunk Mediathek + - **BR**: 
Bayerischer Rundfunk - **BravoTV** - **Break** - **brightcove:legacy** - **brightcove:new** + - **BRMediathek**: Bayerischer Rundfunk Mediathek - **bt:article**: Bergens Tidende Articles - **bt:vestlendingen**: Bergens Tidende - Vestlendingen - **BuzzFeed** @@ -198,8 +199,8 @@ - **dailymotion:playlist** - **dailymotion:user** - **DailymotionCloud** - - **Daisuki** - - **DaisukiPlaylist** + - **DaisukiMotto** + - **DaisukiMottoPlaylist** - **daum.net** - **daum.net:clip** - **daum.net:playlist** @@ -266,10 +267,8 @@ - **fc2** - **fc2:embed** - **Fczenit** - - **fernsehkritik.tv** - **filmon** - **filmon:channel** - - **Firstpost** - **FiveTV** - **Flickr** - **Flipagram** @@ -283,7 +282,7 @@ - **foxnews:article** - **foxnews:insider** - **FoxSports** - - **france2.fr:generation-quoi** + - **france2.fr:generation-what** - **FranceCulture** - **FranceInter** - **FranceTV** @@ -301,7 +300,6 @@ - **GameInformer** - **GameOne** - **gameone:playlist** - - **Gamersyde** - **GameSpot** - **GameStar** - **Gaskrank** @@ -441,6 +439,7 @@ - **mangomolo:live** - **mangomolo:video** - **ManyVids** + - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** @@ -608,6 +607,7 @@ - **pcmag** - **PearVideo** - **People** + - **PerformGroup** - **periscope**: Periscope - **periscope:user**: Periscope user videos - **PhilharmonieDeParis**: Philharmonie de Paris diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 686c63efa..8a372d2c9 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -562,7 +562,89 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'width': 1920, 'height': 1080, }] - ), + ), ( + # https://github.com/rg3/youtube-dl/pull/14844 + 'urls_only', + 'http://unknown/manifest.mpd', + [{ + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_144p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 
'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 200, + 'width': 256, + 'height': 144, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_240p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 400, + 'width': 424, + 'height': 240, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_360p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 800, + 'width': 640, + 'height': 360, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_480p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 1200, + 'width': 856, + 'height': 480, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_576p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 1600, + 'width': 1024, + 'height': 576, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_720p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 2400, + 'width': 1280, + 'height': 720, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_1080p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 4400, + 'width': 1920, + 'height': 1080, + }] + ) ] for mpd_file, mpd_url, expected_formats in _TEST_CASES: @@ -601,5 +683,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + if 
__name__ == '__main__': unittest.main() diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 4af92fbd4..f0f5a8470 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -466,11 +466,11 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL({'simulate': True}) self.assertEqual(ydl._default_format_spec({}), 'bestvideo+bestaudio/best') - ydl = YDL({'is_live': True}) - self.assertEqual(ydl._default_format_spec({}), 'best/bestvideo+bestaudio') + ydl = YDL({}) + self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best/bestvideo+bestaudio') - ydl = YDL({'simulate': True, 'is_live': True}) - self.assertEqual(ydl._default_format_spec({}), 'bestvideo+bestaudio/best') + ydl = YDL({'simulate': True}) + self.assertEqual(ydl._default_format_spec({'is_live': True}), 'bestvideo+bestaudio/best') ydl = YDL({'outtmpl': '-'}) self.assertEqual(ydl._default_format_spec({}), 'best/bestvideo+bestaudio') diff --git a/test/testdata/mpd/urls_only.mpd b/test/testdata/mpd/urls_only.mpd new file mode 100644 index 000000000..2b9d595d3 --- /dev/null +++ b/test/testdata/mpd/urls_only.mpd @@ -0,0 +1,218 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 93002e45a..7bb61a541 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -107,6 +107,7 @@ class FragmentFD(FileDownloader): def _append_fragment(self, ctx, frag_content): try: ctx['dest_stream'].write(frag_content) + ctx['dest_stream'].flush() finally: if self.__do_ytdl_file(ctx): 
self._write_ytdl_file(ctx) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 2c32b6ae2..9bde7f2d8 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -1,20 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, int_or_none, parse_duration, + parse_iso8601, xpath_element, xpath_text, ) class BRIE(InfoExtractor): - IE_DESC = 'Bayerischer Rundfunk Mediathek' + IE_DESC = 'Bayerischer Rundfunk' _VALID_URL = r'(?Phttps?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P[a-z0-9\-_]+)\.html' _TESTS = [ @@ -123,10 +126,10 @@ class BRIE(InfoExtractor): for asset in assets.findall('asset'): format_url = xpath_text(asset, ['downloadUrl', 'url']) asset_type = asset.get('type') - if asset_type == 'HDS': + if asset_type.startswith('HDS'): formats.extend(self._extract_f4m_formats( format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False)) - elif asset_type == 'HLS': + elif asset_type.startswith('HLS'): formats.extend(self._extract_m3u8_formats( format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False)) else: @@ -169,3 +172,140 @@ class BRIE(InfoExtractor): } for variant in variants.findall('variant') if xpath_text(variant, 'url')] thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) return thumbnails + + +class BRMediathekIE(InfoExtractor): + IE_DESC = 'Bayerischer Rundfunk Mediathek' + _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?Pav:[0-9a-f]{24})' + + _TESTS = [{ + 'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e', + 'md5': 'fdc3d485835966d1622587d08ba632ec', + 'info_dict': { + 'id': 'av:5a1e6a6e8fce6d001871cc8e', + 'ext': 'mp4', + 'title': 'Die Sendung vom 28.11.2017', + 'description': 'md5:6000cdca5912ab2277e5b7339f201ccc', + 'timestamp': 1511942766, + 'upload_date': '20171129', + } + 
}] + + def _real_extract(self, url): + clip_id = self._match_id(url) + + clip = self._download_json( + 'https://proxy-base.master.mango.express/graphql', + clip_id, data=json.dumps({ + "query": """{ + viewer { + clip(id: "%s") { + title + description + duration + createdAt + ageRestriction + videoFiles { + edges { + node { + publicLocation + fileSize + videoProfile { + width + height + bitrate + encoding + } + } + } + } + captionFiles { + edges { + node { + publicLocation + } + } + } + teaserImages { + edges { + node { + imageFiles { + edges { + node { + publicLocation + width + height + } + } + } + } + } + } + } + } +}""" % clip_id}).encode(), headers={ + 'Content-Type': 'application/json', + })['data']['viewer']['clip'] + title = clip['title'] + + formats = [] + for edge in clip.get('videoFiles', {}).get('edges', []): + node = edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + ext = determine_ext(n_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + n_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + video_profile = node.get('videoProfile', {}) + tbr = int_or_none(video_profile.get('bitrate')) + format_id = 'http' + if tbr: + format_id += '-%d' % tbr + formats.append({ + 'format_id': format_id, + 'url': n_url, + 'width': int_or_none(video_profile.get('width')), + 'height': int_or_none(video_profile.get('height')), + 'tbr': tbr, + 'filesize': int_or_none(node.get('fileSize')), + }) + self._sort_formats(formats) + + subtitles = {} + for edge in clip.get('captionFiles', {}).get('edges', []): + node = edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + subtitles.setdefault('de', []).append({ + 'url': n_url, + }) + + thumbnails = [] + for edge in clip.get('teaserImages', {}).get('edges', []): + for image_edge in edge.get('node', {}).get('imageFiles', {}).get('edges', []): + node = image_edge.get('node', {}) + n_url = node.get('publicLocation') + if not 
n_url: + continue + thumbnails.append({ + 'url': n_url, + 'width': int_or_none(node.get('width')), + 'height': int_or_none(node.get('height')), + }) + + return { + 'id': clip_id, + 'title': title, + 'description': clip.get('description'), + 'duration': int_or_none(clip.get('duration')), + 'timestamp': parse_iso8601(clip.get('createdAt')), + 'age_limit': int_or_none(clip.get('ageRestriction')), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 15999411b..80a9c982f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1978,11 +1978,18 @@ class InfoExtractor(object): elif 'segment_urls' in representation_ms_info: # Segment URLs with no SegmentTimeline # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 + # https://github.com/rg3/youtube-dl/pull/14844 fragments = [] + segment_duration = float_or_none( + representation_ms_info['segment_duration'], + representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None for segment_url in representation_ms_info['segment_urls']: - fragments.append({ + fragment = { location_key(segment_url): segment_url, - }) + } + if segment_duration: + fragment['duration'] = segment_duration + fragments.append(fragment) representation_ms_info['fragments'] = fragments # NB: MPD manifest may contain direct URLs to unfragmented media. # No fragments key is present in this case. 
diff --git a/youtube_dl/extractor/daisuki.py b/youtube_dl/extractor/daisuki.py index 58cc98666..5c9ac68a0 100644 --- a/youtube_dl/extractor/daisuki.py +++ b/youtube_dl/extractor/daisuki.py @@ -13,33 +13,30 @@ from ..aes import ( from ..utils import ( bytes_to_intlist, bytes_to_long, - clean_html, + extract_attributes, ExtractorError, intlist_to_bytes, - get_element_by_id, js_to_json, int_or_none, long_to_bytes, pkcs1pad, - remove_end, ) -class DaisukiIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?daisuki\.net/[^/]+/[^/]+/[^/]+/watch\.[^.]+\.(?P\d+)\.html' +class DaisukiMottoIE(InfoExtractor): + _VALID_URL = r'https?://motto\.daisuki\.net/framewatch/embed/[^/]+/(?P[0-9a-zA-Z]{3})' _TEST = { - 'url': 'http://www.daisuki.net/tw/en/anime/watch.TheIdolMasterCG.11213.html', + 'url': 'http://motto.daisuki.net/framewatch/embed/embedDRAGONBALLSUPERUniverseSurvivalsaga/V2e/760/428', 'info_dict': { - 'id': '11213', + 'id': 'V2e', 'ext': 'mp4', - 'title': '#01 Who is in the pumpkin carriage? - THE IDOLM@STER CINDERELLA GIRLS', + 'title': '#117 SHOWDOWN OF LOVE! 
ANDROIDS VS UNIVERSE 2!!', 'subtitles': { 'mul': [{ 'ext': 'ttml', }], }, - 'creator': 'BANDAI NAMCO Entertainment', }, 'params': { 'skip_download': True, # AES-encrypted HLS stream @@ -73,15 +70,17 @@ class DaisukiIE(InfoExtractor): n, e = self._RSA_KEY encrypted_aeskey = long_to_bytes(pow(bytes_to_long(padded_aeskey), e, n)) - init_data = self._download_json('http://www.daisuki.net/bin/bgn/init', video_id, query={ - 's': flashvars.get('s', ''), - 'c': flashvars.get('ss3_prm', ''), - 'e': url, - 'd': base64.b64encode(intlist_to_bytes(aes_cbc_encrypt( - bytes_to_intlist(json.dumps(data)), - aes_key, iv))).decode('ascii'), - 'a': base64.b64encode(encrypted_aeskey).decode('ascii'), - }, note='Downloading JSON metadata' + (' (try #%d)' % (idx + 1) if idx > 0 else '')) + init_data = self._download_json( + 'http://motto.daisuki.net/fastAPI/bgn/init/', + video_id, query={ + 's': flashvars.get('s', ''), + 'c': flashvars.get('ss3_prm', ''), + 'e': url, + 'd': base64.b64encode(intlist_to_bytes(aes_cbc_encrypt( + bytes_to_intlist(json.dumps(data)), + aes_key, iv))).decode('ascii'), + 'a': base64.b64encode(encrypted_aeskey).decode('ascii'), + }, note='Downloading JSON metadata' + (' (try #%d)' % (idx + 1) if idx > 0 else '')) if 'rtn' in init_data: encrypted_rtn = init_data['rtn'] @@ -98,14 +97,11 @@ class DaisukiIE(InfoExtractor): aes_key, iv)).decode('utf-8').rstrip('\0'), video_id) + title = rtn['title_str'] + formats = self._extract_m3u8_formats( rtn['play_url'], video_id, ext='mp4', entry_protocol='m3u8_native') - title = remove_end(self._og_search_title(webpage), ' - DAISUKI') - - creator = self._html_search_regex( - r'Creator\s*:\s*([^<]+)', webpage, 'creator', fatal=False) - subtitles = {} caption_url = rtn.get('caption_url') if caption_url: @@ -120,21 +116,18 @@ class DaisukiIE(InfoExtractor): 'title': title, 'formats': formats, 'subtitles': subtitles, - 'creator': creator, } -class DaisukiPlaylistIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)daisuki\.net/[^/]+/[^/]+/[^/]+/detail\.(?P[a-zA-Z0-9]+)\.html' +class DaisukiMottoPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://motto\.daisuki\.net/(?Pinformation)/' _TEST = { - 'url': 'http://www.daisuki.net/tw/en/anime/detail.TheIdolMasterCG.html', + 'url': 'http://motto.daisuki.net/information/', 'info_dict': { - 'id': 'TheIdolMasterCG', - 'title': 'THE IDOLM@STER CINDERELLA GIRLS', - 'description': 'md5:0f2c028a9339f7a2c7fbf839edc5c5d8', + 'title': 'DRAGON BALL SUPER', }, - 'playlist_count': 26, + 'playlist_mincount': 117, } def _real_extract(self, url): @@ -142,18 +135,19 @@ class DaisukiPlaylistIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - episode_pattern = r'''(?sx) - ]+delay="[^"]+/(\d+)/movie\.jpg".+? - ]+class=".*?\bepisodeNumber\b.*?">(?:]+>)?([^<]+)''' - entries = [{ - '_type': 'url_transparent', - 'url': url.replace('detail', 'watch').replace('.html', '.' + movie_id + '.html'), - 'episode_id': episode_id, - 'episode_number': int_or_none(episode_id), - } for movie_id, episode_id in re.findall(episode_pattern, webpage)] + entries = [] + for li in re.findall(r'(]+?data-product_id="[a-zA-Z0-9]{3}"[^>]+>)', webpage): + attr = extract_attributes(li) + ad_id = attr.get('data-ad_id') + product_id = attr.get('data-product_id') + if ad_id and product_id: + episode_id = attr.get('data-chapter') + entries.append({ + '_type': 'url_transparent', + 'url': 'http://motto.daisuki.net/framewatch/embed/%s/%s/760/428' % (ad_id, product_id), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode_id), + 'ie_key': 'DaisukiMotto', + }) - playlist_title = remove_end( - self._og_search_title(webpage, fatal=False), ' - Anime - DAISUKI') - playlist_description = clean_html(get_element_by_id('synopsisTxt', webpage)) - - return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + return self.playlist_result(entries, playlist_title='DRAGON BALL SUPER') diff --git 
a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dd5bb965e..c2a1c3bb9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -127,7 +127,10 @@ from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bostonglobe import BostonGlobeIE from .bpb import BpbIE -from .br import BRIE +from .br import ( + BRIE, + BRMediathekIE, +) from .bravotv import BravoTVIE from .breakcom import BreakIE from .brightcove import ( @@ -246,8 +249,8 @@ from .dailymotion import ( DailymotionCloudIE, ) from .daisuki import ( - DaisukiIE, - DaisukiPlaylistIE, + DaisukiMottoIE, + DaisukiMottoPlaylistIE, ) from .daum import ( DaumIE, @@ -344,7 +347,6 @@ from .filmon import ( FilmOnIE, FilmOnChannelIE, ) -from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE from .fivetv import FiveTVIE @@ -374,7 +376,7 @@ from .francetv import ( FranceTVIE, FranceTVEmbedIE, FranceTVInfoIE, - GenerationQuoiIE, + GenerationWhatIE, CultureboxIE, ) from .freesound import FreesoundIE @@ -390,7 +392,6 @@ from .gameone import ( GameOneIE, GameOnePlaylistIE, ) -from .gamersyde import GamersydeIE from .gamespot import GameSpotIE from .gamestar import GameStarIE from .gaskrank import GaskrankIE diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py index 8d1010b88..8db7c5963 100644 --- a/youtube_dl/extractor/fczenit.py +++ b/youtube_dl/extractor/fczenit.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + float_or_none, +) class FczenitIE(InfoExtractor): @@ -14,6 +17,8 @@ class FczenitIE(InfoExtractor): 'id': '41044', 'ext': 'mp4', 'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»', + 'timestamp': 1462283735, + 'upload_date': '20160503', }, } @@ -21,28 +26,31 @@ class FczenitIE(InfoExtractor): video_id = self._match_id(url) webpage = 
self._download_webpage(url, video_id) - video_title = self._html_search_regex( - r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title') + msi_id = self._search_regex( + r"(?s)config\s*=\s*{.+?video_id\s*:\s*'([^']+)'", webpage, 'msi id') - video_items = self._parse_json(self._search_regex( - r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'), - video_id) - - def merge_dicts(*dicts): - ret = {} - for a_dict in dicts: - ret.update(a_dict) - return ret + msi_data = self._download_json( + 'http://player.fc-zenit.ru/msi/video', msi_id, query={ + 'video': msi_id, + })['data'] + title = msi_data['name'] formats = [{ - 'url': compat_urlparse.urljoin(url, video_url), - 'tbr': int(tbr), - } for tbr, video_url in merge_dicts(*video_items).items()] + 'format_id': q.get('label'), + 'url': q['url'], + 'height': int_or_none(q.get('label')), + } for q in msi_data['qualities'] if q.get('url')] self._sort_formats(formats) + tags = [tag['label'] for tag in msi_data.get('tags', []) if tag.get('label')] + return { 'id': video_id, - 'title': video_title, + 'title': title, + 'thumbnail': msi_data.get('preview'), 'formats': formats, + 'duration': float_or_none(msi_data.get('duration')), + 'timestamp': int_or_none(msi_data.get('date')), + 'tags': tags, } diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py deleted file mode 100644 index e8936cb24..000000000 --- a/youtube_dl/extractor/firstpost.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class FirstpostIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?firstpost\.com/[^/]+/.*-(?P[0-9]+)\.html' - - _TEST = { - 'url': 'http://www.firstpost.com/india/india-to-launch-indigenous-aircraft-carrier-monday-1025403.html', - 'md5': 'ee9114957692f01fb1263ed87039112a', - 'info_dict': { - 'id': '1025403', - 'ext': 'mp4', - 'title': 'India to launch indigenous aircraft carrier INS Vikrant today', - 'description': 
'md5:feef3041cb09724e0bdc02843348f5f4', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - page = self._download_webpage(url, video_id) - - title = self._html_search_meta('twitter:title', page, 'title', fatal=True) - description = self._html_search_meta('twitter:description', page, 'title') - - data = self._download_xml( - 'http://www.firstpost.com/getvideoxml-%s.xml' % video_id, video_id, - 'Downloading video XML') - - item = data.find('./playlist/item') - thumbnail = item.find('./image').text - - formats = [ - { - 'url': details.find('./file').text, - 'format_id': details.find('./label').text.strip(), - 'width': int(details.find('./width').text.strip()), - 'height': int(details.find('./height').text.strip()), - } for details in item.findall('./source/file_details') if details.find('./file').text - ] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 037e538cc..5a3abeaff 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..compat import compat_urlparse @@ -308,31 +307,32 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): return self._extract_video(video_id, catalogue) -class GenerationQuoiIE(InfoExtractor): - IE_NAME = 'france2.fr:generation-quoi' - _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P[^/?#]+)' +class GenerationWhatIE(InfoExtractor): + IE_NAME = 'france2.fr:generation-what' + _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P[^/?#]+)' - _TEST = { - 'url': 'http://generation-quoi.france2.fr/portrait/garde-a-vous', + _TESTS = [{ + 'url': 'http://generation-what.francetv.fr/portrait/video/present-arms', 'info_dict': { - 'id': 'k7FJX8VBcvvLmX4wA5Q', + 
'id': 'wtvKYUG45iw', 'ext': 'mp4', - 'title': 'Génération Quoi - Garde à Vous', - 'uploader': 'Génération Quoi', + 'title': 'Generation What - Garde à vous - FRA', + 'uploader': 'Generation What', + 'uploader_id': 'UCHH9p1eetWCgt4kXBYCb3_w', + 'upload_date': '20160411', }, - 'params': { - # It uses Dailymotion - 'skip_download': True, - }, - } + }, { + 'url': 'http://generation-what.francetv.fr/europe/video/present-arms', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) - info_url = compat_urlparse.urljoin(url, '/medias/video/%s.json' % display_id) - info_json = self._download_webpage(info_url, display_id) - info = json.loads(info_json) - return self.url_result('http://www.dailymotion.com/video/%s' % info['id'], - ie='Dailymotion') + webpage = self._download_webpage(url, display_id) + youtube_id = self._search_regex( + r"window\.videoURL\s*=\s*'([0-9A-Za-z_-]{11})';", + webpage, 'youtube id') + return self.url_result(youtube_id, 'Youtube', youtube_id) class CultureboxIE(FranceTVBaseInfoExtractor): diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py index 7fa271b51..486a49c05 100644 --- a/youtube_dl/extractor/freespeech.py +++ b/youtube_dl/extractor/freespeech.py @@ -1,37 +1,34 @@ from __future__ import unicode_literals -import re -import json - from .common import InfoExtractor class FreespeechIE(InfoExtractor): IE_NAME = 'freespeech.org' - _VALID_URL = r'https?://(?:www\.)?freespeech\.org/video/(?P.+)' + _VALID_URL = r'https?://(?:www\.)?freespeech\.org/stories/(?P<id>.+)' _TEST = { 'add_ie': ['Youtube'], - 'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0', + 'url': 'http://www.freespeech.org/stories/fcc-announces-net-neutrality-rollback-whats-stake/', 'info_dict': { - 'id': 'poKsVCZ64uU', - 'ext': 'webm', - 'title': 'Obama, Romney Campaign in Colorado Ahead of Debate', - 'description': 'Obama, Romney Campaign in Colorado Ahead of Debate', - 
'uploader': 'freespeechtv', + 'id': 'waRk6IPqyWM', + 'ext': 'mp4', + 'title': 'What\'s At Stake - Net Neutrality Special', + 'description': 'Presented by MNN and FSTV', + 'upload_date': '20170728', 'uploader_id': 'freespeechtv', - 'upload_date': '20121002', + 'uploader': 'freespeechtv', }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - webpage = self._download_webpage(url, title) - info_json = self._search_regex(r'jQuery\.extend\(Drupal\.settings, ({.*?})\);', webpage, 'info') - info = json.loads(info_json) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + youtube_url = self._search_regex( + r'data-video-url="([^"]+)"', + webpage, 'youtube url') return { '_type': 'url', - 'url': info['jw_player']['basic_video_node_player']['file'], + 'url': youtube_url, 'ie_key': 'Youtube', } diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py deleted file mode 100644 index a218a6944..000000000 --- a/youtube_dl/extractor/gamersyde.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - js_to_json, - parse_duration, - remove_start, -) - - -class GamersydeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_(?P<display_id>[\da-z_]+)-(?P<id>\d+)_[a-z]{2}\.html' - _TEST = { - 'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html', - 'md5': 'f38d400d32f19724570040d5ce3a505f', - 'info_dict': { - 'id': '34371', - 'ext': 'mp4', - 'duration': 372, - 'title': 'Bloodborne - Birth of a hero', - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - playlist = self._parse_json( - self._search_regex( - r'(?s)playlist: 
\[({.+?})\]\s*}\);', webpage, 'files'), - display_id, transform_source=js_to_json) - - formats = [] - for source in playlist['sources']: - video_url = source.get('file') - if not video_url: - continue - format_id = source.get('label') - f = { - 'url': video_url, - 'format_id': format_id, - } - m = re.search(r'^(?P<height>\d+)[pP](?P<fps>\d+)fps', format_id) - if m: - f.update({ - 'height': int(m.group('height')), - 'fps': int(m.group('fps')), - }) - formats.append(f) - self._sort_formats(formats) - - title = remove_start(playlist['title'], '%s - ' % video_id) - thumbnail = playlist.get('image') - duration = parse_duration(self._search_regex( - r'Length:</label>([^<]+)<', webpage, 'duration', fatal=False)) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 26c48e4b8..413a219dc 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import uuid import xml.etree.ElementTree as etree import json +import re from .common import InfoExtractor from ..compat import ( @@ -142,9 +143,9 @@ class ITVIE(InfoExtractor): f['url'] = rtmp_url formats.append(f) - ios_playlist_url = params.get('data-video-playlist') + ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') hmac = params.get('data-video-hmac') - if ios_playlist_url and hmac: + if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url): headers = self.geo_verification_headers() headers.update({ 'Accept': 'application/vnd.itv.vod.playlist.v2+json', @@ -159,12 +160,12 @@ class ITVIE(InfoExtractor): 'token': '' }, 'device': { - 'manufacturer': 'Apple', - 'model': 'iPad', + 'manufacturer': 'Safari', + 'model': '5', 'os': { - 'name': 'iPhone OS', - 'version': '9.3', - 'type': 'ios' + 'name': 'Windows NT', + 'version': '6.1', + 
'type': 'desktop' } }, 'client': { @@ -173,10 +174,10 @@ class ITVIE(InfoExtractor): }, 'variantAvailability': { 'featureset': { - 'min': ['hls', 'aes'], - 'max': ['hls', 'aes'] + 'min': ['hls', 'aes', 'outband-webvtt'], + 'max': ['hls', 'aes', 'outband-webvtt'] }, - 'platformTag': 'mobile' + 'platformTag': 'dotcom' } }).encode(), headers=headers, fatal=False) if ios_playlist: diff --git a/youtube_dl/extractor/mnet.py b/youtube_dl/extractor/mnet.py index 6a85dcbd5..0e26ca1b3 100644 --- a/youtube_dl/extractor/mnet.py +++ b/youtube_dl/extractor/mnet.py @@ -40,21 +40,29 @@ class MnetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + # TODO: extract rtmp formats + # no stype -> rtmp url + # stype=H -> m3u8 url + # stype=M -> mpd url info = self._download_json( - 'http://content.api.mnet.com/player/vodConfig?id=%s&ctype=CLIP' % video_id, - video_id, 'Downloading vod config JSON')['data']['info'] + 'http://content.api.mnet.com/player/vodConfig', + video_id, 'Downloading vod config JSON', query={ + 'id': video_id, + 'ctype': 'CLIP', + 'stype': 'H', + })['data']['info'] title = info['title'] - rtmp_info = self._download_json( - info['cdn'], video_id, 'Downloading vod cdn JSON') - - formats = [{ - 'url': rtmp_info['serverurl'] + rtmp_info['fileurl'], - 'ext': 'flv', - 'page_url': url, - 'player_url': 'http://flvfile.mnet.com/service/player/201602/cjem_player_tv.swf?v=201602191318', - }] + cdn_data = self._download_json( + info['cdn'], video_id, 'Downloading vod cdn JSON')['data'][0] + m3u8_url = cdn_data['url'] + token = cdn_data.get('token') + if token and token != '-': + m3u8_url += '?' 
+ token + formats = self._extract_wowza_formats( + m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp', 'f4m']) + self._sort_formats(formats) description = info.get('ment') duration = parse_duration(info.get('time')) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 1154a3536..7a3b57abd 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -115,10 +115,17 @@ class MTVServicesInfoExtractor(InfoExtractor): if transcript.get('kind') != 'captions': continue lang = transcript.get('srclang') - subtitles[lang] = [{ - 'url': compat_str(typographic.get('src')), - 'ext': typographic.get('format') - } for typographic in transcript.findall('./typographic')] + for typographic in transcript.findall('./typographic'): + sub_src = typographic.get('src') + if not sub_src: + continue + ext = typographic.get('format') + if ext == 'cea-608': + ext = 'scc' + subtitles.setdefault(lang, []).append({ + 'url': compat_str(sub_src), + 'ext': ext + }) return subtitles def _get_video_info(self, itemdoc, use_hls=True): diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 071879ba4..9203c0477 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -28,7 +28,7 @@ class NexxIE(InfoExtractor): _TESTS = [{ # movie 'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907', - 'md5': '16746bfc28c42049492385c989b26c4a', + 'md5': '828cea195be04e66057b846288295ba1', 'info_dict': { 'id': '128907', 'ext': 'mp4', @@ -42,9 +42,6 @@ class NexxIE(InfoExtractor): 'timestamp': 1384264416, 'upload_date': '20131112', }, - 'params': { - 'format': 'bestvideo', - }, }, { # episode 'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858', @@ -62,7 +59,6 @@ class NexxIE(InfoExtractor): 'season_number': 2, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -193,35 +189,67 @@ class NexxIE(InfoExtractor): stream_data = video['streamdata'] language = general.get('language_raw') or '' - # TODO: reverse 
more cdns and formats + # TODO: reverse more cdns cdn = stream_data['cdnType'] assert cdn == 'azure' azure_locator = stream_data['azureLocator'] - AZURE_URL = 'http://nx-p%02d.akamaized.net/' + AZURE_URL = 'http://nx%s%02d.akamaized.net/' - for secure in ('s', ''): - cdn_shield = stream_data.get('cdnShieldHTTP%s' % secure.upper()) - if cdn_shield: - azure_base = 'http%s://%s' % (secure, cdn_shield) - break - else: - azure_base = AZURE_URL % int(stream_data['azureAccount'].replace('nexxplayplus', '')) + def get_cdn_shield_base(shield_type='', prefix='-p'): + for secure in ('', 's'): + cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) + if cdn_shield: + return 'http%s://%s' % (secure, cdn_shield) + else: + return AZURE_URL % (prefix, int(stream_data['azureAccount'].replace('nexxplayplus', ''))) + azure_stream_base = get_cdn_shield_base() is_ml = ',' in language - azure_m3u8_url = '%s%s/%s_src%s.ism/Manifest(format=m3u8-aapl)' % ( - azure_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( + azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' protection_token = try_get( video, lambda x: x['protectiondata']['token'], compat_str) if protection_token: - azure_m3u8_url += '?hdnts=%s' % protection_token + azure_manifest_url += '?hdnts=%s' % protection_token formats = self._extract_m3u8_formats( - azure_m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='%s-hls' % cdn) + azure_manifest_url % '(format=m3u8-aapl)', + video_id, 'mp4', 'm3u8_native', + m3u8_id='%s-hls' % cdn, fatal=False) + formats.extend(self._extract_mpd_formats( + azure_manifest_url % '(format=mpd-time-csf)', + video_id, mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_ism_formats( + azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) + + azure_progressive_base = get_cdn_shield_base('Prog', '-d') + azure_file_distribution = 
stream_data.get('azureFileDistribution') + if azure_file_distribution: + fds = azure_file_distribution.split(',') + if fds: + for fd in fds: + ss = fd.split(':') + if len(ss) == 2: + tbr = int_or_none(ss[0]) + if tbr: + f = { + 'url': '%s%s/%s_src_%s_%d.mp4' % ( + azure_progressive_base, azure_locator, video_id, ss[1], tbr), + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + width_height = ss[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 854b6800c..8e13bcf1f 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -14,6 +14,7 @@ from ..utils import ( int_or_none, qualities, unescapeHTML, + urlencode_postdata, ) @@ -56,7 +57,7 @@ class OdnoklassnikiIE(InfoExtractor): 'url': 'http://ok.ru/video/64211978996595-1', 'md5': '2f206894ffb5dbfcce2c5a14b909eea5', 'info_dict': { - 'id': '64211978996595-1', + 'id': 'V_VztHT5BzY', 'ext': 'mp4', 'title': 'Космическая среда от 26 августа 2015', 'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0', @@ -127,9 +128,14 @@ class OdnoklassnikiIE(InfoExtractor): if metadata: metadata = self._parse_json(metadata, video_id) else: + data = {} + st_location = flashvars.get('location') + if st_location: + data['st.location'] = st_location metadata = self._download_json( compat_urllib_parse_unquote(flashvars['metadataUrl']), - video_id, 'Downloading metadata JSON') + video_id, 'Downloading metadata JSON', + data=urlencode_postdata(data)) movie = metadata['movie'] diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index b50d6c77b..a99af12a4 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -140,7 +140,7 @@ class PhantomJSwrapper(object): for name in self._TMP_FILE_NAMES: try: 
os.remove(self._TMP_FILES[name].name) - except: + except (IOError, OSError): pass def _save_cookies(self, url): @@ -242,7 +242,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:openload\.(?:co|io)|oload\.tv)/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.tv)/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -286,6 +286,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.tv/embed/KnG-kKZdcfY/', 'only_matching': True, + }, { + 'url': 'http://www.openload.link/f/KnG-kKZdcfY', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 74fe8017e..c1fb580ca 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -49,13 +49,13 @@ class ORFTVthekIE(InfoExtractor): 'params': { 'skip_download': True, # rtsp downloads }, - '_skip': 'Blocked outside of Austria / Germany', + 'skip': 'Blocked outside of Austria / Germany', }, { 'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141', - 'skip_download': True, + 'only_matching': True, }, { 'url': 'http://tvthek.orf.at/profile/Universum/35429', - 'skip_download': True, + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index b51dcbe10..f11d5da52 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -421,6 +421,7 @@ class PBSIE(InfoExtractor): r'class="coveplayerid">([^<]+)<', # coveplayer r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer + 
r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", ] media_id = self._search_regex( diff --git a/youtube_dl/extractor/rozhlas.py b/youtube_dl/extractor/rozhlas.py index f8eda8dea..fccf69401 100644 --- a/youtube_dl/extractor/rozhlas.py +++ b/youtube_dl/extractor/rozhlas.py @@ -21,7 +21,7 @@ class RozhlasIE(InfoExtractor): } }, { 'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed', - 'skip_download': True, + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 7e6ec3430..0c2f8f119 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -21,6 +21,8 @@ class TNAFlixNetworkBaseIE(InfoExtractor): r'flashvars\.config\s*=\s*escape\("([^"]+)"', r'<input[^>]+name="config\d?" value="([^"]+)"', ] + _HOST = 'tna' + _VKEY_SUFFIX = '' _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"' _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"' _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"' @@ -72,7 +74,13 @@ class TNAFlixNetworkBaseIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') if 'display_id' in mobj.groupdict() else video_id + for display_id_key in ('display_id', 'display_id_2'): + if display_id_key in mobj.groupdict(): + display_id = mobj.group(display_id_key) + if display_id: + break + else: + display_id = video_id webpage = self._download_webpage(url, display_id) @@ -81,8 +89,8 @@ class TNAFlixNetworkBaseIE(InfoExtractor): if not cfg_url: inputs = self._hidden_inputs(webpage) - cfg_url = ('https://cdn-fck.tnaflix.com/tnaflix/%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' - % (inputs['vkey'], inputs['nkey'], video_id)) + cfg_url = ('https://cdn-fck.%sflix.com/%sflix/%s%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' + % (self._HOST, self._HOST, inputs['vkey'], self._VKEY_SUFFIX, inputs['nkey'], video_id)) 
cfg_xml = self._download_xml( cfg_url, display_id, 'Downloading metadata', @@ -91,7 +99,8 @@ class TNAFlixNetworkBaseIE(InfoExtractor): formats = [] def extract_video_url(vl): - return re.sub(r'speed=\d+', 'speed=', unescapeHTML(vl.text)) + # Any URL modification now results in HTTP Error 403: Forbidden + return unescapeHTML(vl.text) video_link = cfg_xml.find('./videoLink') if video_link is not None: @@ -192,18 +201,21 @@ class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): webpage)] -class TNAFlixIE(TNAFlixNetworkBaseIE): +class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE): + _DESCRIPTION_REGEX = r'(?s)>Description:</[^>]+>(.+?)<' + _UPLOADER_REGEX = r'<span>by\s*<a[^>]+\bhref=["\']/profile/[^>]+>([^<]+)<' + _CATEGORIES_REGEX = r'(?s)<span[^>]*>Categories:</span>(.+?)</div>' + + +class TNAFlixIE(TNAEMPFlixBaseIE): _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' _TITLE_REGEX = r'<title>(.+?) - (?:TNAFlix Porn Videos|TNAFlix\.com)' - _DESCRIPTION_REGEX = r'(?s)>Description:]+>(.+?)<' - _UPLOADER_REGEX = r'\s*Verified Member\s*\s*(.+?)<' - _CATEGORIES_REGEX = r'(?s)]*>Categories:(.+?)' _TESTS = [{ # anonymous uploader, no categories 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': 'ecf3498417d09216374fc5907f9c6ec0', + 'md5': '7e569419fe6d69543d01e6be22f5f7c4', 'info_dict': { 'id': '553878', 'display_id': 'Carmella-Decesare-striptease', @@ -228,7 +240,7 @@ class TNAFlixIE(TNAFlixNetworkBaseIE): 'duration': 164, 'age_limit': 18, 'uploader': 'bobwhite39', - 'categories': ['Amateur Porn', 'Squirting Videos', 'Teen Girls 18+'], + 'categories': list, } }, { 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', @@ -236,14 +248,15 @@ class TNAFlixIE(TNAFlixNetworkBaseIE): }] -class EMPFlixIE(TNAFlixNetworkBaseIE): - _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P.+?)-(?P[0-9]+)\.html' +class EMPFlixIE(TNAEMPFlixBaseIE): + _VALID_URL = 
r'https?://(?:www\.)?empflix\.com/(?:videos/(?P.+?)-|[^/]+/(?P[^/]+)/video)(?P[0-9]+)' - _UPLOADER_REGEX = r']+class="infoTitle"[^>]*>Uploaded By:(.+?)' + _HOST = 'emp' + _VKEY_SUFFIX = '-1' _TESTS = [{ 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'b1bc15b6412d33902d6e5952035fcabc', + 'md5': 'bc30d48b91a7179448a0bda465114676', 'info_dict': { 'id': '33051', 'display_id': 'Amateur-Finger-Fuck', @@ -259,6 +272,9 @@ class EMPFlixIE(TNAFlixNetworkBaseIE): }, { 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', 'only_matching': True, + }, { + 'url': 'https://www.empflix.com/amateur-porn/Amateur-Finger-Fuck/video33051', + 'only_matching': True, }] diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index e59ed2661..17c0adc15 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( int_or_none, @@ -41,7 +43,7 @@ class TouTvIE(InfoExtractor): email, password = self._get_login_info() if email is None: return - state = 'http://ici.tou.tv//' + state = 'http://ici.tou.tv/' webpage = self._download_webpage(state, None, 'Downloading homepage') toutvlogin = self._parse_json(self._search_regex( r'(?s)toutvlogin\s*=\s*({.+?});', webpage, 'toutvlogin'), None, js_to_json) @@ -54,16 +56,30 @@ class TouTvIE(InfoExtractor): 'scope': 'media-drmt openid profile email id.write media-validation.read.privileged', 'state': state, }) - login_form = self._search_regex( - r'(?s)(]+(?:id|name)="Form-login".+?)', login_webpage, 'login form') - form_data = self._hidden_inputs(login_form) + + def extract_form_url_and_data(wp, default_form_url, form_spec_re=''): + form, form_elem = re.search( + r'(?s)((]+?%s[^>]*?>).+?)' % form_spec_re, wp).groups() + form_data = self._hidden_inputs(form) + form_url = extract_attributes(form_elem).get('action') or 
default_form_url + return form_url, form_data + + post_url, form_data = extract_form_url_and_data( + login_webpage, + 'https://services.radio-canada.ca/auth/oauth/v2/authorize/login', + r'(?:id|name)="Form-login"') form_data.update({ 'login-email': email, 'login-password': password, }) - post_url = extract_attributes(login_form).get('action') or authorize_url - _, urlh = self._download_webpage_handle( + consent_webpage = self._download_webpage( post_url, None, 'Logging in', data=urlencode_postdata(form_data)) + post_url, form_data = extract_form_url_and_data( + consent_webpage, + 'https://services.radio-canada.ca/auth/oauth/v2/authorize/consent') + _, urlh = self._download_webpage_handle( + post_url, None, 'Following Redirection', + data=urlencode_postdata(form_data)) self._access_token = self._search_regex( r'access_token=([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', urlh.geturl(), 'access token') diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index e64873bce..ac35d55a9 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -28,10 +28,10 @@ class VidziIE(InfoExtractor): }, }, { 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', - 'skip_download': True, + 'only_matching': True, }, { 'url': 'http://vidzi.cc/cghql9yq6emu.html', - 'skip_download': True, + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 656a4b9e5..3d0dc403b 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -22,6 +22,9 @@ class VVVVIDIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ping Pong', }, + 'params': { + 'skip_download': True, + }, }, { # video_type == 'video/rcs' 'url': 'https://www.vvvvid.it/#!show/376/death-note-live-action/377/482493/episodio-01', @@ -31,6 +34,9 @@ class VVVVIDIE(InfoExtractor): 'ext': 'mp4', 'title': 'Episodio 01', }, + 'params': { + 'skip_download': True, + }, }] _conn_id = None @@ 
-116,8 +122,20 @@ class VVVVIDIE(InfoExtractor): embed_code = ds(embed_code) video_type = video_data.get('video_type') if video_type in ('video/rcs', 'video/kenc'): - formats.extend(self._extract_akamai_formats( - embed_code, video_id)) + embed_code = re.sub(r'https?://([^/]+)/z/', r'https://\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') + if video_type == 'video/kenc': + kenc = self._download_json( + 'https://www.vvvvid.it/kenc', video_id, query={ + 'action': 'kt', + 'conn_id': self._conn_id, + 'url': embed_code, + }, fatal=False) or {} + kenc_message = kenc.get('message') + if kenc_message: + embed_code += '?' + ds(kenc_message) + formats.extend(self._extract_m3u8_formats( + embed_code, video_id, 'mp4', + m3u8_id='hls', fatal=False)) else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index be3624ef2..52f8ded2f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -6,10 +6,12 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( clean_html, + determine_ext, dict_get, ExtractorError, int_or_none, parse_duration, + try_get, unified_strdate, ) @@ -32,6 +34,7 @@ class XHamsterIE(InfoExtractor): 'display_id': 'femaleagent_shy_beauty_takes_the_bait', 'ext': 'mp4', 'title': 'FemaleAgent Shy beauty takes the bait', + 'timestamp': 1350194821, 'upload_date': '20121014', 'uploader': 'Ruseful2011', 'duration': 893, @@ -45,6 +48,7 @@ class XHamsterIE(InfoExtractor): 'display_id': 'britney_spears_sexy_booty', 'ext': 'mp4', 'title': 'Britney Spears Sexy Booty', + 'timestamp': 1379123460, 'upload_date': '20130914', 'uploader': 'jojo747400', 'duration': 200, @@ -61,6 +65,7 @@ class XHamsterIE(InfoExtractor): 'id': '5667973', 'ext': 'mp4', 'title': '....', + 'timestamp': 1454948101, 'upload_date': '20160208', 'uploader': 'parejafree', 
'duration': 72, @@ -96,6 +101,83 @@ class XHamsterIE(InfoExtractor): if error: raise ExtractorError(error, expected=True) + age_limit = self._rta_search(webpage) + + def get_height(s): + return int_or_none(self._search_regex( + r'^(\d+)[pP]', s, 'height', default=None)) + + initials = self._parse_json( + self._search_regex( + r'window\.initials\s*=\s*({.+?})\s*;\s*\n', webpage, 'initials', + default='{}'), + video_id, fatal=False) + if initials: + video = initials['videoModel'] + title = video['title'] + formats = [] + for format_id, formats_dict in video['sources'].items(): + if not isinstance(formats_dict, dict): + continue + for quality, format_item in formats_dict.items(): + if format_id == 'download': + # Download link takes some time to be generated, + # skipping for now + continue + if not isinstance(format_item, dict): + continue + format_url = format_item.get('link') + filesize = int_or_none( + format_item.get('size'), invscale=1000000) + else: + format_url = format_item + filesize = None + if not isinstance(format_url, compat_str): + continue + formats.append({ + 'format_id': '%s-%s' % (format_id, quality), + 'url': format_url, + 'ext': determine_ext(format_url, 'mp4'), + 'height': get_height(quality), + 'filesize': filesize, + }) + self._sort_formats(formats) + + categories_list = video.get('categories') + if isinstance(categories_list, list): + categories = [] + for c in categories_list: + if not isinstance(c, dict): + continue + c_name = c.get('name') + if isinstance(c_name, compat_str): + categories.append(c_name) + else: + categories = None + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': video.get('description'), + 'timestamp': int_or_none(video.get('created')), + 'uploader': try_get( + video, lambda x: x['author']['name'], compat_str), + 'thumbnail': video.get('thumbURL'), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': 
int_or_none(try_get( + video, lambda x: x['rating']['likes'], int)), + 'dislike_count': int_or_none(try_get( + video, lambda x: x['rating']['dislikes'], int)), + 'comment_count': int_or_none(video.get('views')), + 'age_limit': age_limit, + 'categories': categories, + 'formats': formats, + } + + # Old layout fallback + title = self._html_search_regex( [r']*>([^<]+)', r']+itemprop=".*?caption.*?"[^>]+content="(.+?)"', @@ -119,8 +201,7 @@ class XHamsterIE(InfoExtractor): formats.append({ 'format_id': format_id, 'url': format_url, - 'height': int_or_none(self._search_regex( - r'^(\d+)[pP]', format_id, 'height', default=None)) + 'height': get_height(format_id), }) video_url = self._search_regex( @@ -167,8 +248,6 @@ class XHamsterIE(InfoExtractor): mobj = re.search(r'Comments \((?P\d+)\)', webpage) comment_count = mobj.group('commentcount') if mobj else 0 - age_limit = self._rta_search(webpage) - categories_html = self._search_regex( r'(?s)Categories:.+?)', webpage, 'categories', default=None) diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index d017e03de..7f871c8ec 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -40,9 +40,12 @@ class XiamiBaseIE(InfoExtractor): 'subtitles': subtitles, } - def _extract_tracks(self, item_id, typ=None): + def _extract_tracks(self, item_id, referer, typ=None): playlist = self._download_json( - '%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''), item_id) + '%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''), + item_id, headers={ + 'Referer': referer, + }) return [ self._extract_track(track, item_id) for track in playlist['data']['trackList']] @@ -135,13 +138,13 @@ class XiamiSongIE(XiamiBaseIE): }] def _real_extract(self, url): - return self._extract_tracks(self._match_id(url))[0] + return self._extract_tracks(self._match_id(url), url)[0] class XiamiPlaylistBaseIE(XiamiBaseIE): def _real_extract(self, url): item_id = 
self._match_id(url) - return self.playlist_result(self._extract_tracks(item_id, self._TYPE), item_id) + return self.playlist_result(self._extract_tracks(item_id, url, self._TYPE), item_id) class XiamiAlbumIE(XiamiPlaylistBaseIE): diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 6822a30bc..f0ba01197 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0502', + 'ccode': '0501', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, @@ -240,7 +240,7 @@ class YoukuShowIE(InfoExtractor): }, { # Ongoing playlist. The initial page is the last one 'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html', - 'only_matchine': True, + 'only_matching': True, }] def _extract_entries(self, playlist_data_url, show_id, note, query): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 34866a54b..eccbc0b1f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2350,6 +2350,7 @@ def mimetype2ext(mt): 'ttml+xml': 'ttml', 'x-flv': 'flv', 'x-mp4-fragmented': 'mp4', + 'x-ms-sami': 'sami', 'x-ms-wmv': 'wmv', 'mpegurl': 'm3u8', 'x-mpegurl': 'm3u8', @@ -2372,7 +2373,7 @@ def parse_codecs(codecs_str): vcodec, acodec = None, None for full_codec in splited_codecs: codec = full_codec.split('.')[0] - if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'): + if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'): if not vcodec: vcodec = full_codec elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1c3cbefeb..88bf1d652 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = 
'2017.11.15' +__version__ = '2017.12.02'