diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 81fe10d54..be6e6ddab 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.10.20*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.10.20** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.11.06*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.11.06** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.10.20 +[debug] youtube-dl version 2017.11.06 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 547b55981..cedab4723 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,50 @@ +version + +Extractors ++ [wsj] Recognize another URL pattern (#14704) + + +version 2017.11.06 + +Core ++ [extractor/common] Add 
protocol for f4m formats +* [f4m] Prefer baseURL for relative URLs (#14660) +* [extractor/common] Respect URL query in _extract_wowza_formats (#14645) + +Extractors ++ [hotstar:playlist] Add support for playlists (#12465) +* [hotstar] Bypass geo restriction (#14672) +- [22tracks] Remove extractor (#11024, #14628) ++ [skysport] Add support for ooyala videos protected with embed_token (#14641) +* [gamespot] Extract formats referenced with new data fields (#14652) +* [spankbang] Detect unavailable videos (#14644) + + +version 2017.10.29 + +Core +* [extractor/common] Prefix format id for audio only HLS formats ++ [utils] Add support for zero years and months in parse_duration + +Extractors +* [egghead] Fix extraction (#14388) ++ [fxnetworks] Extract series metadata (#14603) ++ [younow] Add support for younow.com (#9255, #9432, #12436) +* [dctptv] Fix extraction (#14599) +* [youtube] Restrict embed regex (#14600) +* [vimeo] Restrict iframe embed regex (#14600) +* [soundgasm] Improve extraction (#14588) +- [myvideo] Remove extractor (#8557) ++ [nbc] Add support for classic-tv videos (#14575) ++ [vrtnu] Add support for cookies authentication and simplify (#11873) ++ [canvas] Add support for vrt.be/vrtnu (#11873) +* [twitch:clips] Fix title extraction (#14566) ++ [ndtv] Add support for sub-sites (#14534) +* [dramafever] Fix login error message extraction ++ [nick] Add support for more nickelodeon sites (no, dk, se, ch, fr, es, pt, + ro, hu) (#14553) + + version 2017.10.20 Core diff --git a/docs/supportedsites.md b/docs/supportedsites.md index be5de22df..6009df571 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -3,8 +3,6 @@ - **1up.com** - **20min** - **220.ro** - - **22tracks:genre** - - **22tracks:track** - **24video** - **3qsdn**: 3Q SDN - **3sat** @@ -342,6 +340,7 @@ - **HornBunny** - **HotNewHipHop** - **HotStar** + - **hotstar:playlist** - **Howcast** - **HowStuffWorks** - **HRTi** @@ -498,7 +497,6 @@ - **MySpace:album** - **MySpass** - **Myvi** - - 
**myvideo** (Currently broken) - **MyVidster** - **n-tv.de** - **natgeo** @@ -977,6 +975,7 @@ - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **Vrak** - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be + - **VrtNU**: VrtNU.be - **vrv** - **vrv:series** - **VShare** @@ -1035,6 +1034,9 @@ - **YouJizz** - **youku**: 优酷 - **youku:show** + - **YouNowChannel** + - **YouNowLive** + - **YouNowMoment** - **YouPorn** - **YourUpload** - **youtube**: YouTube.com diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index f18a823fc..686c63efa 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -574,6 +574,32 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + def test_parse_f4m_formats(self): + _TEST_CASES = [ + ( + # https://github.com/rg3/youtube-dl/issues/14660 + 'custom_base_url', + 'http://api.new.livestream.com/accounts/6115179/events/6764928/videos/144884262.f4m', + [{ + 'manifest_url': 'http://api.new.livestream.com/accounts/6115179/events/6764928/videos/144884262.f4m', + 'ext': 'flv', + 'format_id': '2148', + 'protocol': 'f4m', + 'tbr': 2148, + 'width': 1280, + 'height': 720, + }] + ), + ] + + for f4m_file, f4m_url, expected_formats in _TEST_CASES: + with io.open('./test/testdata/f4m/%s.f4m' % f4m_file, + mode='r', encoding='utf-8') as f: + formats = self.ie._parse_f4m_formats( + compat_etree_fromstring(f.read().encode('utf-8')), + f4m_url, None) + self.ie._sort_formats(formats) + expect_value(self, formats, expected_formats, None) if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index efa73d0f4..cc13f795c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -540,6 +540,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('87 Min.'), 5220) self.assertEqual(parse_duration('PT1H0.040S'), 3600.04) 
self.assertEqual(parse_duration('PT00H03M30SZ'), 210) + self.assertEqual(parse_duration('P0Y0M0DT0H4M20.880S'), 260.88) def test_fix_xml_ampersands(self): self.assertEqual( diff --git a/test/testdata/f4m/custom_base_url.f4m b/test/testdata/f4m/custom_base_url.f4m new file mode 100644 index 000000000..74e1539e8 --- /dev/null +++ b/test/testdata/f4m/custom_base_url.f4m @@ -0,0 +1,10 @@ + + + recorded + http://vod.livestream.com/events/0000000000673980/ + 269.293 + AAAAm2Fic3QAAAAAAAAAAQAAAAPoAAAAAAAEG+0AAAAAAAAAAAAAAAAAAQAAABlhc3J0AAAAAAAAAAABAAAAAQAAAC4BAAAAVmFmcnQAAAAAAAAD6AAAAAAEAAAAAQAAAAAAAAAAAAAXcAAAAC0AAAAAAAQHQAAAE5UAAAAuAAAAAAAEGtUAAAEYAAAAAAAAAAAAAAAAAAAAAAA= + + AgAKb25NZXRhRGF0YQgAAAAIAAhkdXJhdGlvbgBAcNSwIMSbpgAFd2lkdGgAQJQAAAAAAAAABmhlaWdodABAhoAAAAAAAAAJZnJhbWVyYXRlAEA4/7DoLwW3AA12aWRlb2RhdGFyYXRlAECe1DLgjcobAAx2aWRlb2NvZGVjaWQAQBwAAAAAAAAADWF1ZGlvZGF0YXJhdGUAQGSimlvaPKQADGF1ZGlvY29kZWNpZABAJAAAAAAAAAAACQ== + + diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index c8fde9a89..fdb80f42a 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -243,8 +243,17 @@ def remove_encrypted_media(media): media)) -def _add_ns(prop): - return '{http://ns.adobe.com/f4m/1.0}%s' % prop +def _add_ns(prop, ver=1): + return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop) + + +def get_base_url(manifest): + base_url = xpath_text( + manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)], + 'base URL', default=None) + if base_url: + base_url = base_url.strip() + return base_url class F4mFD(FragmentFD): @@ -330,13 +339,13 @@ class F4mFD(FragmentFD): rate, media = list(filter( lambda f: int(f[0]) == requested_bitrate, formats))[0] - base_url = compat_urlparse.urljoin(man_url, media.attrib['url']) + # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. 
+ man_base_url = get_base_url(doc) or man_url + + base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) - # From Adobe F4M 3.0 spec: - # The element SHALL be the base URL for all relative - # (HTTP-based) URLs in the manifest. If is not present, said - # URLs should be relative to the location of the containing document. - boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url) + boot_info, bootstrap_url = self._parse_bootstrap_node( + bootstrap_node, man_base_url) live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 69d363311..34c2b363e 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -78,7 +78,7 @@ class AnimeOnDemandIE(InfoExtractor): post_url = urljoin(self._LOGIN_URL, post_url) response = self._download_webpage( - post_url, None, 'Logging in as %s' % username, + post_url, None, 'Logging in', data=urlencode_postdata(login_form), headers={ 'Referer': self._LOGIN_URL, }) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 01fa308ff..1a31ebe08 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -87,7 +87,7 @@ class AtresPlayerIE(InfoExtractor): self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') response = self._download_webpage( - request, None, 'Logging in as %s' % username) + request, None, 'Logging in') error = self._html_search_regex( r'(?s)]+class="[^"]*\blist_error\b[^"]*">(.+?)', diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index 0eb1930c2..633c57553 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -59,7 +59,7 @@ class BambuserIE(InfoExtractor): 
self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Referer', self._LOGIN_URL) response = self._download_webpage( - request, None, 'Logging in as %s' % username) + request, None, 'Logging in') login_error = self._html_search_regex( r'(?s)
(.+?)
', diff --git a/youtube_dl/extractor/cartoonnetwork.py b/youtube_dl/extractor/cartoonnetwork.py index 086ec90c9..6aeebd7b3 100644 --- a/youtube_dl/extractor/cartoonnetwork.py +++ b/youtube_dl/extractor/cartoonnetwork.py @@ -31,7 +31,7 @@ class CartoonNetworkIE(TurnerBaseIE): 'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?networkName=CN2&' + query, video_id, { 'secure': { 'media_src': 'http://androidhls-secure.cdn.turner.com/toon/big', - 'tokenizer_src': 'http://www.cartoonnetwork.com/cntv/mvpd/processors/services/token_ipadAdobe.do', + 'tokenizer_src': 'https://token.vgtf.net/token/token_mobile', }, }, { 'url': url, diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index 39938c9ac..bec0a825a 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -93,7 +93,7 @@ class CCMAIE(InfoExtractor): 'description': clean_html(informacio.get('descripcio')), 'duration': duration, 'timestamp': timestamp, - 'thumnails': thumbnails, + 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e832d056c..ffc80ec36 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -29,7 +29,10 @@ from ..compat import ( compat_urlparse, compat_xml_parse_error, ) -from ..downloader.f4m import remove_encrypted_media +from ..downloader.f4m import ( + get_base_url, + remove_encrypted_media, +) from ..utils import ( NO_DEFAULT, age_restricted, @@ -1255,11 +1258,8 @@ class InfoExtractor(object): media_nodes = remove_encrypted_media(media_nodes) if not media_nodes: return formats - base_url = xpath_text( - manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], - 'base URL', default=None) - if base_url: - base_url = base_url.strip() + + manifest_base_url = get_base_url(manifest) bootstrap_info = xpath_element( manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', 
'{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], @@ -1291,7 +1291,7 @@ class InfoExtractor(object): continue manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') - else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) + else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) # If media_url is itself a f4m manifest do the recursive extraction # since bitrates in parent manifest (this one) and media_url manifest # may differ leading to inability to resolve the format by requested @@ -1326,6 +1326,7 @@ class InfoExtractor(object): 'url': manifest_url, 'manifest_url': manifest_url, 'ext': 'flv' if bootstrap_info is not None else None, + 'protocol': 'f4m', 'tbr': tbr, 'width': width, 'height': height, @@ -1417,7 +1418,7 @@ class InfoExtractor(object): media_url = media.get('URI') if media_url: format_id = [] - for v in (group_id, name): + for v in (m3u8_id, group_id, name): if v: format_id.append(v) f = { @@ -2249,27 +2250,35 @@ class InfoExtractor(object): return formats def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): + query = compat_urlparse.urlparse(url).query url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) url_base = self._search_regex( r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') http_base_url = '%s:%s' % ('http', url_base) formats = [] + + def manifest_url(manifest): + m_url = '%s/%s' % (http_base_url, manifest) + if query: + m_url += '?%s' % query + return m_url + if 'm3u8' not in skip_protocols: formats.extend(self._extract_m3u8_formats( - http_base_url + '/playlist.m3u8', video_id, 'mp4', + manifest_url('playlist.m3u8'), video_id, 'mp4', m3u8_entry_protocol, m3u8_id='hls', fatal=False)) if 'f4m' not in skip_protocols: formats.extend(self._extract_f4m_formats( - http_base_url + '/manifest.f4m', + manifest_url('manifest.f4m'), video_id, f4m_id='hds', fatal=False)) 
if 'dash' not in skip_protocols: formats.extend(self._extract_mpd_formats( - http_base_url + '/manifest.mpd', + manifest_url('manifest.mpd'), video_id, mpd_id='dash', fatal=False)) if re.search(r'(?:/smil:|\.smil)', url_base): if 'smil' not in skip_protocols: rtmp_formats = self._extract_smil_formats( - http_base_url + '/jwplayer.smil', + manifest_url('jwplayer.smil'), video_id, fatal=False) for rtmp_format in rtmp_formats: rtsp_format = rtmp_format.copy() diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 8bdaf0c2c..18ef3da10 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -43,6 +43,17 @@ class CrunchyrollBaseIE(InfoExtractor): if username is None: return + self._download_webpage( + 'https://www.crunchyroll.com/?a=formhandler', + None, 'Logging in', 'Wrong login info', + data=urlencode_postdata({ + 'formname': 'RpcApiUser_Login', + 'next_url': 'https://www.crunchyroll.com/acct/membership', + 'name': username, + 'password': password, + })) + + ''' login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') @@ -86,6 +97,7 @@ class CrunchyrollBaseIE(InfoExtractor): raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') + ''' def _real_initialize(self): self._login() diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 00fbbff2f..3a6d0560e 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -2,53 +2,85 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate +from ..compat import compat_str +from ..utils import ( + float_or_none, + unified_strdate, +) class DctpTvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(#/)?filme/(?P.+?)/$' + _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P[^/?#&]+)' _TEST = { 'url': 
'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', - 'md5': '174dd4a8a6225cf5655952f969cfbe24', 'info_dict': { 'id': '95eaa4f33dad413aa17b4ee613cccc6c', 'display_id': 'videoinstallation-fuer-eine-kaufhausfassade', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Videoinstallation für eine Kaufhausfassade', 'description': 'Kurzfilm', 'upload_date': '20110407', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 71.24, + }, + 'params': { + # rtmp download + 'skip_download': True, }, } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) - object_id = self._html_search_meta('DC.identifier', webpage) + webpage = self._download_webpage(url, display_id) - servers_json = self._download_json( - 'http://www.dctp.tv/elastic_streaming_client/get_streaming_server/', - video_id, note='Downloading server list') - server = servers_json[0]['server'] - m3u8_path = self._search_regex( - r'\'([^\'"]+/playlist\.m3u8)"', webpage, 'm3u8 path') - formats = self._extract_m3u8_formats( - 'http://%s%s' % (server, m3u8_path), video_id, ext='mp4', - entry_protocol='m3u8_native') + video_id = self._html_search_meta( + 'DC.identifier', webpage, 'video id', + default=None) or self._search_regex( + r'id=["\']uuid[^>]+>([^<]+)<', webpage, 'video id') title = self._og_search_title(webpage) + + servers = self._download_json( + 'http://www.dctp.tv/streaming_servers/', display_id, + note='Downloading server list', fatal=False) + + if servers: + endpoint = next( + server['endpoint'] + for server in servers + if isinstance(server.get('endpoint'), compat_str) and + 'cloudfront' in server['endpoint']) + else: + endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/' + + app = self._search_regex( + r'^rtmpe?://[^/]+/(?P.*)$', endpoint, 'app') + + formats = [{ + 'url': endpoint, + 'app': app, + 'play_path': 'mp4:%s_dctp_0500_4x3.m4v' % video_id, + 'page_url': url, + 'player_url': 
'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-109.swf', + 'ext': 'flv', + }] + description = self._html_search_meta('DC.description', webpage) upload_date = unified_strdate( self._html_search_meta('DC.date.created', webpage)) thumbnail = self._og_search_thumbnail(webpage) + duration = float_or_none(self._search_regex( + r'id=["\']duration_in_ms[^+]>(\d+)', webpage, 'duration', + default=None), scale=1000) return { - 'id': object_id, + 'id': video_id, 'title': title, 'formats': formats, - 'display_id': video_id, + 'display_id': display_id, 'description': description, 'upload_date': upload_date, 'thumbnail': thumbnail, + 'duration': duration, } diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 95883a037..6b60e542b 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -54,7 +54,7 @@ class DramaFeverBaseIE(AMPIE): request = sanitized_Request( self._LOGIN_URL, urlencode_postdata(login_form)) response = self._download_webpage( - request, None, 'Logging in as %s' % username) + request, None, 'Logging in') if all(logout_pattern not in response for logout_pattern in ['href="/accounts/logout/"', '>Log out<']): diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index e4a3046af..edabaafe6 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + determine_ext, int_or_none, try_get, unified_timestamp, @@ -17,7 +19,7 @@ class EggheadCourseIE(InfoExtractor): 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', 'playlist_count': 29, 'info_dict': { - 'id': 'professor-frisby-introduces-composable-functional-javascript', + 'id': '72', 'title': 'Professor Frisby Introduces Composable Functional JavaScript', 'description': 're:(?s)^This course 
teaches the ubiquitous.*You\'ll start composing functionality before you know it.$', }, @@ -26,14 +28,28 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - course = self._download_json( - 'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id) + lessons = self._download_json( + 'https://egghead.io/api/v1/series/%s/lessons' % playlist_id, + playlist_id, 'Downloading course lessons JSON') - entries = [ - self.url_result( - 'wistia:%s' % lesson['wistia_id'], ie='Wistia', - video_id=lesson['wistia_id'], video_title=lesson.get('title')) - for lesson in course['lessons'] if lesson.get('wistia_id')] + entries = [] + for lesson in lessons: + lesson_url = lesson.get('http_url') + if not lesson_url or not isinstance(lesson_url, compat_str): + continue + lesson_id = lesson.get('id') + if lesson_id: + lesson_id = compat_str(lesson_id) + entries.append(self.url_result( + lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) + + course = self._download_json( + 'https://egghead.io/api/v1/series/%s' % playlist_id, + playlist_id, 'Downloading course JSON', fatal=False) or {} + + playlist_id = course.get('id') + if playlist_id: + playlist_id = compat_str(playlist_id) return self.playlist_result( entries, playlist_id, course.get('title'), @@ -43,11 +59,12 @@ class EggheadCourseIE(InfoExtractor): class EggheadLessonIE(InfoExtractor): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' - _VALID_URL = r'https://egghead\.io/lessons/(?P[^/?#&]+)' - _TEST = { + _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P[^/?#&]+)' + _TESTS = [{ 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', 'info_dict': { - 'id': 'fv5yotjxcg', + 'id': '1196', + 'display_id': 'javascript-linear-data-flow-with-container-style-types-box', 'ext': 'mp4', 'title': 'Create linear data flow with container style types (Box)', 'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e', @@ -60,25 
+77,51 @@ class EggheadLessonIE(InfoExtractor): }, 'params': { 'skip_download': True, + 'format': 'bestvideo', }, - } + }, { + 'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application', + 'only_matching': True, + }] def _real_extract(self, url): - lesson_id = self._match_id(url) + display_id = self._match_id(url) lesson = self._download_json( - 'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id) + 'https://egghead.io/api/v1/lessons/%s' % display_id, display_id) + + lesson_id = compat_str(lesson['id']) + title = lesson['title'] + + formats = [] + for _, format_url in lesson['media_urls'].items(): + if not format_url or not isinstance(format_url, compat_str): + continue + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, lesson_id, 'mp4', entry_protocol='m3u8', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, lesson_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, + }) + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'ie_key': 'Wistia', - 'url': 'wistia:%s' % lesson['wistia_id'], - 'id': lesson['wistia_id'], - 'title': lesson.get('title'), + 'id': lesson_id, + 'display_id': display_id, + 'title': title, 'description': lesson.get('summary'), 'thumbnail': lesson.get('thumb_nail'), 'timestamp': unified_timestamp(lesson.get('published_at')), 'duration': int_or_none(lesson.get('duration')), 'view_count': int_or_none(lesson.get('plays_count')), 'tags': try_get(lesson, lambda x: x['tag_list'], list), + 'series': try_get( + lesson, lambda x: x['series']['title'], compat_str), + 'formats': formats, } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 18350810b..d084707ee 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -432,7 +432,10 @@ from .hitbox import HitboxIE, HitboxLiveIE from 
.hitrecord import HitRecordIE from .hornbunny import HornBunnyIE from .hotnewhiphop import HotNewHipHopIE -from .hotstar import HotStarIE +from .hotstar import ( + HotStarIE, + HotStarPlaylistIE, +) from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE from .hrti import ( @@ -1110,10 +1113,6 @@ from .tvplayer import TVPlayerIE from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE -from .twentytwotracks import ( - TwentyTwoTracksIE, - TwentyTwoTracksGenreIE -) from .twitch import ( TwitchVideoIE, TwitchChapterIE, @@ -1335,6 +1334,11 @@ from .youku import ( YoukuIE, YoukuShowIE, ) +from .younow import ( + YouNowLiveIE, + YouNowChannelIE, + YouNowMomentIE, +) from .youporn import YouPornIE from .yourupload import YourUploadIE from .youtube import ( diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 8c37509ec..107f658ba 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -57,7 +57,7 @@ class FunimationIE(InfoExtractor): try: data = self._download_json( 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/', - None, 'Logging in as %s' % username, data=urlencode_postdata({ + None, 'Logging in', data=urlencode_postdata({ 'username': username, 'password': password, })) diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py index 629897317..37549fb01 100644 --- a/youtube_dl/extractor/fxnetworks.py +++ b/youtube_dl/extractor/fxnetworks.py @@ -3,27 +3,31 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - update_url_query, extract_attributes, + int_or_none, parse_age_limit, smuggle_url, + update_url_query, ) class FXNetworksIE(AdobePassIE): _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P\d+)' _TESTS = [{ - 'url': 'http://www.fxnetworks.com/video/719841347694', - 'md5': 
'1447d4722e42ebca19e5232ab93abb22', + 'url': 'http://www.fxnetworks.com/video/1032565827847', + 'md5': '8d99b97b4aa7a202f55b6ed47ea7e703', 'info_dict': { - 'id': '719841347694', + 'id': 'dRzwHC_MMqIv', 'ext': 'mp4', - 'title': 'Vanpage', - 'description': 'F*ck settling down. You\'re the Worst returns for an all new season August 31st on FXX.', + 'title': 'First Look: Better Things - Season 2', + 'description': 'Because real life is like a fart. Watch this FIRST LOOK to see what inspired the new season of Better Things.', 'age_limit': 14, 'uploader': 'NEWA-FNG-FX', - 'upload_date': '20160706', - 'timestamp': 1467844741, + 'upload_date': '20170825', + 'timestamp': 1503686274, + 'episode_number': 0, + 'season_number': 2, + 'series': 'Better Things', }, 'add_ie': ['ThePlatform'], }, { @@ -64,6 +68,9 @@ class FXNetworksIE(AdobePassIE): 'id': video_id, 'title': title, 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), + 'series': video_data.get('data-show-title'), + 'episode_number': int_or_none(video_data.get('data-episode')), + 'season_number': int_or_none(video_data.get('data-season')), 'thumbnail': video_data.get('data-large-thumb'), 'age_limit': parse_age_limit(rating), 'ie_key': 'ThePlatform', diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 02804d297..ab647dd41 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(OnceIE): - _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P\d+)/?' 
+ _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article)s/(?:[^/]+/\d+-|embed/)(?P\d+)' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', @@ -35,6 +35,12 @@ class GameSpotIE(OnceIE): 'params': { 'skip_download': True, # m3u8 downloads }, + }, { + 'url': 'https://www.gamespot.com/videos/embed/6439218/', + 'only_matching': True, + }, { + 'url': 'https://www.gamespot.com/articles/the-last-of-us-2-receives-new-ps4-trailer/1100-6454469/', + 'only_matching': True, }] def _real_extract(self, url): @@ -52,7 +58,7 @@ class GameSpotIE(OnceIE): manifest_url = f4m_url formats.extend(self._extract_f4m_formats( f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False)) - m3u8_url = streams.get('m3u8_stream') + m3u8_url = dict_get(streams, ('m3u8_stream', 'adaptive_stream')) if m3u8_url: manifest_url = m3u8_url m3u8_formats = self._extract_m3u8_formats( @@ -60,7 +66,7 @@ class GameSpotIE(OnceIE): m3u8_id='hls', fatal=False) formats.extend(m3u8_formats) progressive_url = dict_get( - streams, ('progressive_hd', 'progressive_high', 'progressive_low')) + streams, ('progressive_hd', 'progressive_high', 'progressive_low', 'other_lr')) if progressive_url and manifest_url: qualities_basename = self._search_regex( r'/([^/]+)\.csmil/', @@ -105,7 +111,8 @@ class GameSpotIE(OnceIE): onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri') if onceux_url: formats.extend(self._extract_once_formats(re.sub( - r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url))) + r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url), + http_formats_preference=-1)) if not formats: for quality in ['sd', 'hd']: diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index 3a7a66a34..d28af36ec 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -1,22 +1,47 @@ # coding: utf-8 from __future__ import 
unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - ExtractorError, determine_ext, + ExtractorError, int_or_none, ) -class HotStarIE(InfoExtractor): +class HotStarBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['IN'] + + def _download_json(self, *args, **kwargs): + response = super(HotStarBaseIE, self)._download_json(*args, **kwargs) + if response['resultCode'] != 'OK': + if kwargs.get('fatal'): + raise ExtractorError( + response['errorDescription'], expected=True) + return None + return response['resultObj'] + + def _download_content_info(self, content_id): + return self._download_json( + 'https://account.hotstar.com/AVS/besc', content_id, query={ + 'action': 'GetAggregatedContentDetails', + 'appVersion': '5.0.40', + 'channel': 'PCTV', + 'contentId': content_id, + })['contentInfo'][0] + + +class HotStarIE(HotStarBaseIE): _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P\d{10})' _TESTS = [{ 'url': 'http://www.hotstar.com/on-air-with-aib--english-1000076273', 'info_dict': { 'id': '1000076273', 'ext': 'mp4', - 'title': 'On Air With AIB - English', + 'title': 'On Air With AIB', 'description': 'md5:c957d8868e9bc793ccb813691cc4c434', 'timestamp': 1447227000, 'upload_date': '20151111', @@ -34,23 +59,11 @@ class HotStarIE(InfoExtractor): 'only_matching': True, }] - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', fatal=True, query=None): - json_data = super(HotStarIE, self)._download_json( - url_or_request, video_id, note, fatal=fatal, query=query) - if json_data['resultCode'] != 'OK': - if fatal: - raise ExtractorError(json_data['errorDescription']) - return None - return json_data['resultObj'] - def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://account.hotstar.com/AVS/besc', video_id, query={ - 'action': 'GetAggregatedContentDetails', - 'channel': 'PCTV', - 'contentId': video_id, - })['contentInfo'][0] 
+ + video_data = self._download_content_info(video_id) + title = video_data['episodeTitle'] if video_data.get('encrypted') == 'Y': @@ -99,3 +112,51 @@ class HotStarIE(InfoExtractor): 'episode_number': int_or_none(video_data.get('episodeNumber')), 'series': video_data.get('contentTitle'), } + + +class HotStarPlaylistIE(HotStarBaseIE): + IE_NAME = 'hotstar:playlist' + _VALID_URL = r'(?Phttps?://(?:www\.)?hotstar\.com/tv/[^/]+/(?P\d+))/(?P[^/]+)/(?P\d+)' + _TESTS = [{ + 'url': 'http://www.hotstar.com/tv/pratidaan/14982/episodes/14812/9993', + 'info_dict': { + 'id': '14812', + }, + 'playlist_mincount': 75, + }, { + 'url': 'http://www.hotstar.com/tv/pratidaan/14982/popular-clips/9998/9998', + 'only_matching': True, + }] + _ITEM_TYPES = { + 'episodes': 'EPISODE', + 'popular-clips': 'CLIPS', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + base_url = mobj.group('url') + content_id = mobj.group('content_id') + playlist_type = mobj.group('type') + + content_info = self._download_content_info(content_id) + playlist_id = compat_str(content_info['categoryId']) + + collection = self._download_json( + 'https://search.hotstar.com/AVS/besc', playlist_id, query={ + 'action': 'SearchContents', + 'appVersion': '5.0.40', + 'channel': 'PCTV', + 'moreFilters': 'series:%s;' % playlist_id, + 'query': '*', + 'searchOrder': 'last_broadcast_date desc,year desc,title asc', + 'type': self._ITEM_TYPES.get(playlist_type, 'EPISODE'), + }) + + entries = [ + self.url_result( + '%s/_/%s' % (base_url, video['contentId']), + ie=HotStarIE.ie_key(), video_id=video['contentId']) + for video in collection['response']['docs'] + if video.get('contentId')] + + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 4667335e0..20db31f86 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +import itertools 
import re from .common import InfoExtractor @@ -7,7 +8,6 @@ from ..compat import compat_str from ..utils import ( get_element_by_attribute, int_or_none, - limit_length, lowercase_escape, try_get, ) @@ -212,7 +212,7 @@ class InstagramIE(InfoExtractor): class InstagramUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' _TEST = { @@ -221,82 +221,79 @@ class InstagramUserIE(InfoExtractor): 'id': 'porsche', 'title': 'porsche', }, - 'playlist_mincount': 2, - 'playlist': [{ - 'info_dict': { - 'id': '614605558512799803_462752227', - 'ext': 'mp4', - 'title': '#Porsche Intelligent Performance.', - 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'Porsche', - 'uploader_id': 'porsche', - 'timestamp': 1387486713, - 'upload_date': '20131219', - }, - }], + 'playlist_count': 5, 'params': { 'extract_flat': True, 'skip_download': True, + 'playlistend': 5, } } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader_id = mobj.group('username') + def _entries(self, uploader_id): + query = { + '__a': 1, + } - entries = [] - page_count = 0 - media_url = 'http://instagram.com/%s/media' % uploader_id - while True: + def get_count(kind): + return int_or_none(try_get( + node, lambda x: x['%ss' % kind]['count'])) + + for page_num in itertools.count(1): page = self._download_json( - media_url, uploader_id, - note='Downloading page %d ' % (page_count + 1), - ) - page_count += 1 + 'https://instagram.com/%s/' % uploader_id, uploader_id, + note='Downloading page %d' % page_num, + fatal=False, query=query) + if not page: + break - for it in page['items']: - if it.get('type') != 'video': + nodes = try_get(page, lambda x: x['user']['media']['nodes'], list) + if not nodes: + break + + max_id = None + + for node in nodes: + node_id = node.get('id') + if node_id: + max_id = node_id + + if 
node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: + continue + video_id = node.get('code') + if not video_id: continue - like_count = int_or_none(it.get('likes', {}).get('count')) - user = it.get('user', {}) - formats = [{ - 'format_id': k, - 'height': v.get('height'), - 'width': v.get('width'), - 'url': v['url'], - } for k, v in it['videos'].items()] - self._sort_formats(formats) + info = self.url_result( + 'https://instagram.com/p/%s/' % video_id, + ie=InstagramIE.ie_key(), video_id=video_id) - thumbnails_el = it.get('images', {}) - thumbnail = thumbnails_el.get('thumbnail', {}).get('url') + description = try_get( + node, [lambda x: x['caption'], lambda x: x['text']['id']], + compat_str) + thumbnail = node.get('thumbnail_src') or node.get('display_src') + timestamp = int_or_none(node.get('date')) - # In some cases caption is null, which corresponds to None - # in python. As a result, it.get('caption', {}) gives None - title = (it.get('caption') or {}).get('text', it['id']) + comment_count = get_count('comment') + like_count = get_count('like') + view_count = int_or_none(node.get('video_views')) - entries.append({ - 'id': it['id'], - 'title': limit_length(title, 80), - 'formats': formats, + info.update({ + 'description': description, 'thumbnail': thumbnail, - 'webpage_url': it.get('link'), - 'uploader': user.get('full_name'), - 'uploader_id': user.get('username'), + 'timestamp': timestamp, + 'comment_count': comment_count, 'like_count': like_count, - 'timestamp': int_or_none(it.get('created_time')), + 'view_count': view_count, }) - if not page['items']: - break - max_id = page['items'][-1]['id'].split('_')[0] - media_url = ( - 'http://instagram.com/%s/media?max_id=%s' % ( - uploader_id, max_id)) + yield info - return { - '_type': 'playlist', - 'entries': entries, - 'id': uploader_id, - 'title': uploader_id, - } + if not max_id: + break + + query['max_id'] = max_id + + def _real_extract(self, url): + uploader_id = self._match_id(url) + 
return self.playlist_result( + self._entries(uploader_id), uploader_id, uploader_id) diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 8b83e1f76..a9f9b10c4 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -70,7 +70,7 @@ class NocoIE(InfoExtractor): return login = self._download_json( - self._LOGIN_URL, None, 'Logging in as %s' % username, + self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata({ 'a': 'login', 'cookie': '1', diff --git a/youtube_dl/extractor/once.py b/youtube_dl/extractor/once.py index a637c8ecf..8ae5fadd8 100644 --- a/youtube_dl/extractor/once.py +++ b/youtube_dl/extractor/once.py @@ -11,7 +11,7 @@ class OnceIE(InfoExtractor): ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8' PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4' - def _extract_once_formats(self, url): + def _extract_once_formats(self, url, http_formats_preference=None): domain_id, application_id, media_item_id = re.match( OnceIE._VALID_URL, url).groups() formats = self._extract_m3u8_formats( @@ -35,6 +35,7 @@ class OnceIE(InfoExtractor): 'format_id': adaptive_format['format_id'].replace( 'hls', 'http'), 'protocol': 'http', + 'preference': http_formats_preference, }) progressive_formats.append(progressive_format) self._check_formats(progressive_formats, media_item_id) diff --git a/youtube_dl/extractor/pandatv.py b/youtube_dl/extractor/pandatv.py index c86d70771..13a2e7efc 100644 --- a/youtube_dl/extractor/pandatv.py +++ b/youtube_dl/extractor/pandatv.py @@ -33,7 +33,7 @@ class PandaTVIE(InfoExtractor): video_id = self._match_id(url) config = self._download_json( - 'https://www.panda.tv/api_room?roomid=%s' % video_id, video_id) + 'https://www.panda.tv/api_room_v2?roomid=%s' % video_id, video_id) error_code = config.get('errno', 0) if error_code is not 0: @@ -66,6 +66,11 @@ class PandaTVIE(InfoExtractor): plflag1 = '4' 
live_panda = 'live_panda' if plflag0 < 1 else '' + plflag_auth = self._parse_json(video_info['plflag_list'], video_id) + sign = plflag_auth['auth']['sign'] + ts = plflag_auth['auth']['time'] + rid = plflag_auth['auth']['rid'] + quality_key = qualities(['OD', 'HD', 'SD']) suffix = ['_small', '_mid', ''] formats = [] @@ -77,8 +82,8 @@ class PandaTVIE(InfoExtractor): continue for pref, (ext, pl) in enumerate((('m3u8', '-hls'), ('flv', ''))): formats.append({ - 'url': 'https://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s' - % (pl, plflag1, room_key, live_panda, suffix[quality], ext), + 'url': 'https://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s?sign=%s&ts=%s&rid=%s' + % (pl, plflag1, room_key, live_panda, suffix[quality], ext, sign, ts, rid), 'format_id': '%s-%s' % (k, ext), 'quality': quality, 'source_preference': pref, diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index a6a2c273f..d4b1d34ca 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -67,7 +67,7 @@ class PatreonIE(InfoExtractor): 'https://www.patreon.com/processLogin', compat_urllib_parse_urlencode(login_form).encode('utf-8') ) - login_page = self._download_webpage(request, None, note='Logging in as %s' % username) + login_page = self._download_webpage(request, None, note='Logging in') if re.search(r'onLoginFailed', login_page): raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index f6a9131b1..4bf0aa786 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -116,7 +116,7 @@ class PluralsightIE(PluralsightBaseIE): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) response = self._download_webpage( - post_url, None, 'Logging in as %s' % username, + post_url, None, 'Logging in', data=urlencode_postdata(login_form), headers={'Content-Type': 
'application/x-www-form-urlencoded'}) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 46dfc78f5..8b703800e 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -68,7 +68,7 @@ class RoosterTeethIE(InfoExtractor): login_request = self._download_webpage( self._LOGIN_URL, None, - note='Logging in as %s' % username, + note='Logging in', data=urlencode_postdata(login_form), headers={ 'Referer': self._LOGIN_URL, diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 909a6ba97..cc6698f88 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -61,7 +61,7 @@ class SafariBaseIE(InfoExtractor): request = sanitized_Request( self._LOGIN_URL, urlencode_postdata(login_form), headers=headers) login_page = self._download_webpage( - request, None, 'Logging in as %s' % username) + request, None, 'Logging in') if not is_logged(login_page): raise ExtractorError( diff --git a/youtube_dl/extractor/skysports.py b/youtube_dl/extractor/skysports.py index 4ca9f6b3c..efcbb36a9 100644 --- a/youtube_dl/extractor/skysports.py +++ b/youtube_dl/extractor/skysports.py @@ -2,7 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import strip_or_none +from ..utils import ( + extract_attributes, + smuggle_url, + strip_or_none, + urljoin, +) class SkySportsIE(InfoExtractor): @@ -22,12 +27,22 @@ class SkySportsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + video_data = extract_attributes(self._search_regex( + r'(]+>)', webpage, 'video data')) + + video_url = 'ooyala:%s' % video_data['data-video-id'] + if video_data.get('data-token-required') == 'true': + token_fetch_options = self._parse_json(video_data.get('data-token-fetch-options', '{}'), video_id, fatal=False) or {} + token_fetch_url = token_fetch_options.get('url') + if 
token_fetch_url: + embed_token = self._download_webpage(urljoin(url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url(video_url, {'embed_token': embed_token.strip('"')}) return { '_type': 'url_transparent', 'id': video_id, - 'url': 'ooyala:%s' % self._search_regex( - r'data-video-id="([^"]+)"', webpage, 'ooyala id'), + 'url': video_url, 'title': self._og_search_title(webpage), 'description': strip_or_none(self._og_search_description(webpage)), 'ie_key': 'Ooyala', diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index e004e2c5a..3d78a9d76 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -8,36 +8,49 @@ from .common import InfoExtractor class SoundgasmIE(InfoExtractor): IE_NAME = 'soundgasm' - _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[0-9a-zA-Z_\-]+)/(?P[0-9a-zA-Z_\-]+)' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_-]+)/(?P<display_id>[0-9a-zA-Z_-]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', 'md5': '010082a2c802c5275bb00030743e75ad', 'info_dict': { 'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9', 'ext': 'm4a', - 'title': 'ytdl_Piano-sample', - 'description': 'Royalty Free Sample Music' + 'title': 'Piano sample', + 'description': 'Royalty Free Sample Music', + 'uploader': 'ytdl', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('title') - audio_title = mobj.group('user') + '_' + mobj.group('title') + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) + audio_url = self._html_search_regex( - r'(?s)m4a\:\s"([^"]+)"', webpage, 'audio URL') - audio_id = re.split(r'\/|\.', audio_url)[-2] + r'(?s)m4a\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'audio URL', group='url') + + title = self._search_regex( + r'<div[^>]+\bclass=["\']jp-title[^>]+>([^<]+)', + webpage, 'title', default=display_id) + description = 
self._html_search_regex( - r'(?s)<li>Description:\s(.*?)<\/li>', webpage, 'description', - fatal=False) + (r'(?s)<div[^>]+\bclass=["\']jp-description[^>]+>(.+?)</div>', + r'(?s)<li>Description:\s(.*?)<\/li>'), + webpage, 'description', fatal=False) + + audio_id = self._search_regex( + r'/([^/]+)\.m4a', audio_url, 'audio id', default=display_id) return { 'id': audio_id, 'display_id': display_id, 'url': audio_url, - 'title': audio_title, - 'description': description + 'vcodec': 'none', + 'title': title, + 'description': description, + 'uploader': mobj.group('user'), } diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 3394c7e6b..2863e53b5 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ExtractorError class SpankBangIE(InfoExtractor): @@ -33,6 +34,10 @@ class SpankBangIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if re.search(r'<[^>]+\bid=["\']video_removed', webpage): + raise ExtractorError( + 'Video %s is not available' % video_id, expected=True) + stream_key = self._html_search_regex( r'''var\s+stream_key\s*=\s*['"](.+?)['"]''', webpage, 'stream key') diff --git a/youtube_dl/extractor/tva.py b/youtube_dl/extractor/tva.py index b57abeaa4..0b863df2f 100644 --- a/youtube_dl/extractor/tva.py +++ b/youtube_dl/extractor/tva.py @@ -32,6 +32,8 @@ class TVAIE(InfoExtractor): video_data = self._download_json( 'https://videos.tva.ca/proxy/item/_' + video_id, video_id, headers={ 'Accept': 'application/json', + }, query={ + 'appId': '5955fc5f23eec60006c951f1', }) def get_attribute(key): diff --git a/youtube_dl/extractor/twentytwotracks.py b/youtube_dl/extractor/twentytwotracks.py deleted file mode 100644 index d6c0ab184..000000000 --- a/youtube_dl/extractor/twentytwotracks.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import 
unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import int_or_none - -# 22Tracks regularly replace the audio tracks that can be streamed on their -# site. The tracks usually expire after 1 months, so we can't add tests. - - -class TwentyTwoTracksIE(InfoExtractor): - _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/(?P<id>\d+)' - IE_NAME = '22tracks:track' - - _API_BASE = 'http://22tracks.com/api' - - def _extract_info(self, city, genre_name, track_id=None): - item_id = track_id if track_id else genre_name - - cities = self._download_json( - '%s/cities' % self._API_BASE, item_id, - 'Downloading cities info', - 'Unable to download cities info') - city_id = [x['id'] for x in cities if x['slug'] == city][0] - - genres = self._download_json( - '%s/genres/%s' % (self._API_BASE, city_id), item_id, - 'Downloading %s genres info' % city, - 'Unable to download %s genres info' % city) - genre = [x for x in genres if x['slug'] == genre_name][0] - genre_id = genre['id'] - - tracks = self._download_json( - '%s/tracks/%s' % (self._API_BASE, genre_id), item_id, - 'Downloading %s genre tracks info' % genre_name, - 'Unable to download track info') - - return [x for x in tracks if x['id'] == item_id][0] if track_id else [genre['title'], tracks] - - def _get_track_url(self, filename, track_id): - token = self._download_json( - 'http://22tracks.com/token.php?desktop=true&u=/128/%s' % filename, - track_id, 'Downloading token', 'Unable to download token') - return 'http://audio.22tracks.com%s?st=%s&e=%d' % (token['filename'], token['st'], token['e']) - - def _extract_track_info(self, track_info, track_id): - download_url = self._get_track_url(track_info['filename'], track_id) - title = '%s - %s' % (track_info['artist'].strip(), track_info['title'].strip()) - return { - 'id': track_id, - 'url': download_url, - 'ext': 'mp3', - 'title': title, - 'duration': int_or_none(track_info.get('duration')), - 'timestamp': 
int_or_none(track_info.get('published_at') or track_info.get('created')) - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - city = mobj.group('city') - genre = mobj.group('genre') - track_id = mobj.group('id') - - track_info = self._extract_info(city, genre, track_id) - return self._extract_track_info(track_info, track_id) - - -class TwentyTwoTracksGenreIE(TwentyTwoTracksIE): - _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/?$' - IE_NAME = '22tracks:genre' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - city = mobj.group('city') - genre = mobj.group('genre') - - genre_title, tracks = self._extract_info(city, genre) - - entries = [ - self._extract_track_info(track_info, track_info['id']) - for track_info in tracks] - - return self.playlist_result(entries, genre, genre_title) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index fefcd2807..bf57eac01 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -101,7 +101,7 @@ class TwitchBaseIE(InfoExtractor): fail(clean_html(login_page)) redirect_page, handle = login_step( - login_page, handle, 'Logging in as %s' % username, { + login_page, handle, 'Logging in', { 'username': username, 'password': password, }) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 207c4a6a7..c248ea727 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -164,7 +164,7 @@ class UdemyIE(InfoExtractor): }) response = self._download_webpage( - self._LOGIN_URL, None, 'Logging in as %s' % username, + self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(login_form), headers={ 'Referer': self._ORIGIN_URL, diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 853e5c75f..ad2a2a4b7 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -99,7 +99,7 @@ class VikiBaseIE(InfoExtractor): login = 
self._call_api( 'sessions.json', None, - 'Logging in as %s' % username, post_data=login_form) + 'Logging in', post_data=login_form) self._token = login.get('token') if not self._token: diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index c3f71b45e..cedb54876 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -412,7 +412,7 @@ class VimeoIE(VimeoBaseInfoExtractor): urls = [] # Look for embedded (iframe) Vimeo player for mobj in re.finditer( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1', webpage): urls.append(VimeoIE._smuggle_referrer(unescapeHTML(mobj.group('url')), url)) PLAIN_EMBED_RE = ( diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 105e172d5..0d8376522 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -67,7 +67,7 @@ class VKBaseIE(InfoExtractor): login_page = self._download_webpage( 'https://login.vk.com/?act=login', None, - note='Logging in as %s' % username, + note='Logging in', data=urlencode_postdata(login_form)) if re.search(r'onLoginFailed', login_page): diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index 9b5487710..67236f377 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -13,7 +13,7 @@ class WSJIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=| - https?://(?:www\.)?(?:wsj|barrons)\.com/video/[^/]+/| + https?://(?:www\.)?(?:wsj|barrons)\.com/video/(?:[^/]+/)+| wsj: ) (?P<id>[a-fA-F0-9-]{36}) @@ -38,6 +38,9 @@ class WSJIE(InfoExtractor): }, { 'url': 'http://www.barrons.com/video/capitalism-deserves-more-respect-from-millennials/F301217E-6F46-43AE-B8D2-B7180D642EE9.html', 'only_matching': True, + }, { + 'url': 
'https://www.wsj.com/video/series/a-brief-history-of/the-modern-cell-carrier-how-we-got-here/980E2187-401D-48A1-B82B-1486CEE06CB9', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/younow.py b/youtube_dl/extractor/younow.py new file mode 100644 index 000000000..04dbc87fc --- /dev/null +++ b/youtube_dl/extractor/younow.py @@ -0,0 +1,202 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + try_get, +) + +CDN_API_BASE = 'https://cdn.younow.com/php/api' +MOMENT_URL_FORMAT = '%s/moment/fetch/id=%%s' % CDN_API_BASE + + +class YouNowLiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.younow.com/AmandaPadeezy', + 'info_dict': { + 'id': 'AmandaPadeezy', + 'ext': 'mp4', + 'is_live': True, + 'title': 'March 26, 2017', + 'thumbnail': r're:^https?://.*\.jpg$', + 'tags': ['girls'], + 'categories': ['girls'], + 'uploader': 'AmandaPadeezy', + 'uploader_id': '6716501', + 'uploader_url': 'https://www.younow.com/AmandaPadeezy', + 'creator': 'AmandaPadeezy', + }, + 'skip': True, + } + + @classmethod + def suitable(cls, url): + return (False + if YouNowChannelIE.suitable(url) or YouNowMomentIE.suitable(url) + else super(YouNowLiveIE, cls).suitable(url)) + + def _real_extract(self, url): + username = self._match_id(url) + + data = self._download_json( + 'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' + % username, username) + + if data.get('errorCode') != 0: + raise ExtractorError(data['errorMsg'], expected=True) + + uploader = try_get( + data, lambda x: x['user']['profileUrlString'], + compat_str) or username + + return { + 'id': uploader, + 'is_live': True, + 'title': self._live_title(uploader), + 'thumbnail': data.get('awsUrl'), + 'tags': data.get('tags'), + 'categories': data.get('tags'), + 'uploader': 
uploader, + 'uploader_id': data.get('userId'), + 'uploader_url': 'https://www.younow.com/%s' % username, + 'creator': uploader, + 'view_count': int_or_none(data.get('viewers')), + 'like_count': int_or_none(data.get('likes')), + 'formats': [{ + 'url': '%s/broadcast/videoPath/hls=1/broadcastId=%s/channelId=%s' + % (CDN_API_BASE, data['broadcastId'], data['userId']), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + } + + +def _extract_moment(item, fatal=True): + moment_id = item.get('momentId') + if not moment_id: + if not fatal: + return + raise ExtractorError('Unable to extract moment id') + + moment_id = compat_str(moment_id) + + title = item.get('text') + if not title: + title = 'YouNow %s' % ( + item.get('momentType') or item.get('titleType') or 'moment') + + uploader = try_get(item, lambda x: x['owner']['name'], compat_str) + uploader_id = try_get(item, lambda x: x['owner']['userId']) + uploader_url = 'https://www.younow.com/%s' % uploader if uploader else None + + entry = { + 'extractor_key': 'YouNowMoment', + 'id': moment_id, + 'title': title, + 'view_count': int_or_none(item.get('views')), + 'like_count': int_or_none(item.get('likes')), + 'timestamp': int_or_none(item.get('created')), + 'creator': uploader, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + 'formats': [{ + 'url': 'https://hls.younow.com/momentsplaylists/live/%s/%s.m3u8' + % (moment_id, moment_id), + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }], + } + + return entry + + +class YouNowChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/]+)/channel' + _TEST = { + 'url': 'https://www.younow.com/its_Kateee_/channel', + 'info_dict': { + 'id': '14629760', + 'title': 'its_Kateee_ moments' + }, + 'playlist_mincount': 8, + } + + def _entries(self, username, channel_id): + created_before = 0 + for page_num in itertools.count(1): + if created_before is None: + break + info = self._download_json( + 
'%s/moment/profile/channelId=%s/createdBefore=%d/records=20' + % (CDN_API_BASE, channel_id, created_before), username, + note='Downloading moments page %d' % page_num) + items = info.get('items') + if not items or not isinstance(items, list): + break + for item in items: + if not isinstance(item, dict): + continue + item_type = item.get('type') + if item_type == 'moment': + entry = _extract_moment(item, fatal=False) + if entry: + yield entry + elif item_type == 'collection': + moments = item.get('momentsIds') + if isinstance(moments, list): + for moment_id in moments: + m = self._download_json( + MOMENT_URL_FORMAT % moment_id, username, + note='Downloading %s moment JSON' % moment_id, + fatal=False) + if m and isinstance(m, dict) and m.get('item'): + entry = _extract_moment(m['item']) + if entry: + yield entry + created_before = int_or_none(item.get('created')) + + def _real_extract(self, url): + username = self._match_id(url) + channel_id = compat_str(self._download_json( + 'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' + % username, username, note='Downloading user information')['userId']) + return self.playlist_result( + self._entries(username, channel_id), channel_id, + '%s moments' % username) + + +class YouNowMomentIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/[^/]+/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.younow.com/GABO.../20712117/36319236/3b316doc/m', + 'md5': 'a30c70eadb9fb39a1aa3c8c0d22a0807', + 'info_dict': { + 'id': '20712117', + 'ext': 'mp4', + 'title': 'YouNow capture', + 'view_count': int, + 'like_count': int, + 'timestamp': 1490432040, + 'upload_date': '20170325', + 'uploader': 'GABO...', + 'uploader_id': 35917228, + }, + } + + @classmethod + def suitable(cls, url): + return (False + if YouNowChannelIE.suitable(url) + else super(YouNowMomentIE, cls).suitable(url)) + + def _real_extract(self, url): + video_id = self._match_id(url) + item = self._download_json(MOMENT_URL_FORMAT % video_id, video_id) + 
return _extract_moment(item['item']) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5aef555fb..9943dddc1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1391,7 +1391,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) (["\']) (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/.+?) + (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) \1''', webpage)] # lazyYT YouTube embed diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 59fb33435..34866a54b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1835,10 +1835,20 @@ def parse_duration(s): days, hours, mins, secs, ms = m.groups() else: m = re.match( - r'''(?ix)(?:P?T)? + r'''(?ix)(?:P? + (?: + [0-9]+\s*y(?:ears?)?\s* + )? + (?: + [0-9]+\s*m(?:onths?)?\s* + )? + (?: + [0-9]+\s*w(?:eeks?)?\s* + )? (?: (?P<days>[0-9]+)\s*d(?:ays?)?\s* )? + T)? (?: (?P<hours>[0-9]+)\s*h(?:ours?)?\s* )? diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4d1686670..8b67d23fe 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.10.20' +__version__ = '2017.11.06'