diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 3f8984943..ad52c8900 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.31*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.31** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.07*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.07** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.12.31 +[debug] youtube-dl version 2018.01.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/AUTHORS b/AUTHORS index 7e012247c..5a090a3ef 100644 --- a/AUTHORS +++ b/AUTHORS @@ -231,3 +231,4 @@ John Dong Tatsuyuki Ishi Daniel Weber Kay Bouché +Yang Hongbo diff --git a/ChangeLog b/ChangeLog index 96bc471f3..51825ccfe 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,9 +1,28 @@ version Extractors ++ [weibo] Add extractor (#15079) +* [bilibili] fix extraction (#15188) + + +version 2018.01.07 + +Core +* [utils] Fix youtube-dl under PyPy3 on Windows +* [YoutubeDL] Output python implementation in debug header + +Extractors ++ [jwplatform] Add support for multiple embeds (#15192) +* [mitele] Fix extraction (#15186) ++ [motherless] Add support for groups (#15124) +* [lynda] Relax URL regular expression (#15185) +* [soundcloud] Fallback to avatar picture for thumbnail (#12878) * [youku] Fix list extraction (#15135) * [openload] Fix extraction (#15166) +* [lynda] Skip invalid subtitles (#15159) +* [twitch] Pass video id to url_result when extracting playlist (#15139) * [rtve.es:alacarta] Fix extraction of some new URLs +* [acast] Fix extraction (#15147) version 2017.12.31 diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 75bd5c922..79b343048 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -478,6 +478,7 @@ - **Moniker**: allmyvideos.net and vidspot.net - **Morningstar**: morningstar.com - **Motherless** + - **MotherlessGroup** - **Motorsport**: motorsport.com - **MovieClips** - **MovieFap** diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 1e57310d6..beffcecd0 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -102,6 +102,7 @@ class BiliBiliIE(InfoExtractor): video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) headers = { 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': url } headers.update(self.geo_verification_headers()) @@ -116,10 +117,15 @@ class BiliBiliIE(InfoExtractor): payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() + headers = { + 'Referer': url + } + headers.update(self.geo_verification_headers()) + video_info = self._download_json( 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), video_id, note='Downloading video info page', - headers=self.geo_verification_headers()) + headers=headers) if 'durl' not in video_info: self._report_error(video_info) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index d8bf073f4..51c11cb7e 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -4,59 +4,36 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( - dict_get, # ExtractorError, # HEADRequest, int_or_none, qualities, - remove_end, unified_strdate, ) class CanalplusIE(InfoExtractor): - IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv' - _VALID_URL = r'''(?x) - https?:// - (?: - (?: - (?:(?:www|m)\.)?canalplus\.fr| - (?:www\.)?piwiplus\.fr| - (?:www\.)?d8\.tv| - (?:www\.)?c8\.fr| - (?:www\.)?d17\.tv| - (?:(?:football|www)\.)?cstar\.fr| - (?:www\.)?itele\.fr - )/(?:(?:[^/]+/)*(?P[^/?#&]+))?(?:\?.*\bvid=(?P\d+))?| - player\.canalplus\.fr/#/(?P\d+) - ) - - ''' + IE_DESC = 'mycanal.fr and piwiplus.fr' + _VALID_URL = r'https?://(?:www\.)?(?Pmycanal|piwiplus)\.fr/(?:[^/]+/)*(?P[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P\d+)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' _SITE_ID_MAP = { - 'canalplus': 'cplus', + 'mycanal': 'cplus', 'piwiplus': 'teletoon', - 'd8': 'd8', - 'c8': 'd8', - 'd17': 'd17', - 'cstar': 'd17', - 'itele': 'itele', } # Only works for direct mp4 URLs _GEO_COUNTRIES = ['FR'] _TESTS = [{ - 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814', + 'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061', 'info_dict': { - 'id': '1405510', - 'display_id': 'pid1830-c-zapping', + 'id': '1397061', + 'display_id': 'lolywood', 'ext': 'mp4', - 'title': 'Zapping - 02/07/2016', - 'description': 'Le meilleur de toutes les chaînes, tous les jours', - 'upload_date': '20160702', + 'title': 'Euro 2016 : Je préfère te prévenir - Lolywood - Episode 34', + 'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e', + 'upload_date': '20160602', }, }, { # geo restricted, bypassed @@ -70,64 +47,12 @@ class CanalplusIE(InfoExtractor): 'upload_date': '20140724', }, 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, { - # geo restricted, bypassed - 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html?vid=1443684', - 'md5': 'bb6f9f343296ab7ebd88c97b660ecf8d', - 'info_dict': { - 'id': '1443684', - 'display_id': 'pid6318-videos-integrales', - 'ext': 'mp4', - 'title': 'Guess my iep ! - TPMP - 07/04/2017', - 'description': 'md5:6f005933f6e06760a9236d9b3b5f17fa', - 'upload_date': '20170407', - }, - 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, { - 'url': 'http://www.itele.fr/chroniques/invite-michael-darmon/rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', - 'info_dict': { - 'id': '1420176', - 'display_id': 'rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', - 'ext': 'mp4', - 'title': 'L\'invité de Michaël Darmon du 14/10/2016 - ', - 'description': 'Chaque matin du lundi au vendredi, Michaël Darmon reçoit un invité politique à 8h25.', - 'upload_date': '20161014', - }, - }, { - 'url': 'http://football.cstar.fr/cstar-minisite-foot/pid7566-feminines-videos.html?vid=1416769', - 'info_dict': { - 'id': '1416769', - 'display_id': 'pid7566-feminines-videos', - 'ext': 'mp4', - 'title': 'France - Albanie : les temps forts de la soirée - 20/09/2016', - 'description': 'md5:c3f30f2aaac294c1c969b3294de6904e', - 'upload_date': '20160921', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://m.canalplus.fr/?vid=1398231', - 'only_matching': True, - }, { - 'url': 'http://www.d17.tv/emissions/pid8303-lolywood.html?vid=1397061', - 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + site, display_id, video_id = re.match(self._VALID_URL, url).groups() - site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]] - - # Beware, some subclasses do not define an id group - display_id = remove_end(dict_get(mobj.groupdict(), ('display_id', 'id', 'vid')), '.html') - - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r']+?videoId=(["\'])(?P\d+)', - r'id=["\']canal_video_player(?P\d+)', - r'data-video=["\'](?P\d+)'], - webpage, 'video id', default=mobj.group('vid'), group='id') + site_id = self._SITE_ID_MAP[site] info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) video_data = self._download_json(info_url, video_id, 'Downloading video JSON') @@ -161,7 +86,7 @@ class CanalplusIE(InfoExtractor): format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False)) else: formats.append({ - # the secret extracted ya function in http://player.canalplus.fr/common/js/canalPlayer.js + # the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js 'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes', 'format_id': format_id, 'preference': preference(format_id), diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e64defe62..a3ad4df1f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -609,7 +609,10 @@ from .mofosex import MofosexIE from .mojvideo import MojvideoIE from .moniker import MonikerIE from .morningstar import MorningstarIE -from .motherless import MotherlessIE +from .motherless import ( + MotherlessIE, + MotherlessGroupIE +) from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE @@ -1295,6 +1298,10 @@ from .webofstories import ( WebOfStoriesIE, WebOfStoriesPlaylistIE, ) +from .weibo import ( + WeiboIE, + WeiboMobileIE +) from .weiqitv import WeiqiTVIE from .wimp import WimpIE from .wistia import WistiaIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index cc4c90b8c..9b0cd004f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2708,9 +2708,9 @@ class GenericIE(InfoExtractor): return self.url_result(viewlift_url) # Look for JWPlatform embeds - jwplatform_url = JWPlatformIE._extract_url(webpage) - if jwplatform_url: - return self.url_result(jwplatform_url, 'JWPlatform') + jwplatform_urls = JWPlatformIE._extract_urls(webpage) + if jwplatform_urls: + return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) # Look for Digiteka embeds digiteka_url = DigitekaIE._extract_url(webpage) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index c9bcbb08f..63d0dc998 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -23,11 +23,14 @@ class JWPlatformIE(InfoExtractor): @staticmethod def _extract_url(webpage): - mobj = re.search( - r'<(?:script|iframe)[^>]+?src=["\'](?P(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', + urls = JWPlatformIE._extract_urls(webpage) + return urls[0] if urls else None + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//content\.jwplatform\.com/players/[a-zA-Z0-9]{8})', webpage) - if mobj: - return mobj.group('url') def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 0d6026aad..f5c7abc13 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -94,7 +94,15 @@ class LyndaBaseIE(InfoExtractor): class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' - _VALID_URL = r'https?://(?:www\.)?(?:lynda\.com|educourse\.ga)/(?:[^/]+/[^/]+/(?P\d+)|player/embed)/(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?:lynda\.com|educourse\.ga)/ + (?: + (?:[^/]+/){2,3}(?P\d+)| + player/embed + )/ + (?P\d+) + ''' _TIMECODE_REGEX = r'\[(?P\d+:\d+:\d+[\.,]\d+)\]' @@ -113,6 +121,9 @@ class LyndaIE(LyndaBaseIE): }, { 'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', 'only_matching': True, + }, { + 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html', + 'only_matching': True, }] def _raise_unavailable(self, video_id): @@ -257,7 +268,15 @@ class LyndaCourseIE(LyndaBaseIE): # Course link equals to welcome/introduction video link of same course # We will recognize it as course link - _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P[^/]+/[^/]+/(?P\d+))-\d\.html' + _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P(?:[^/]+/){2,3}(?P\d+))-2\.html' + + _TESTS = [{ + 'url': 'https://www.lynda.com/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', + 'only_matching': True, + }, { + 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 964dc542c..42759eae8 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,13 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import json import uuid from .common import InfoExtractor from .ooyala import OoyalaIE from ..compat import ( compat_str, - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -42,31 +42,33 @@ class MiTeleBaseIE(InfoExtractor): duration = int_or_none(mmc.get('duration')) for location in mmc['locations']: gat = self._proto_relative_url(location.get('gat'), 'http:') - bas = location.get('bas') - loc = location.get('loc') + gcp = location.get('gcp') ogn = location.get('ogn') - if None in (gat, bas, loc, ogn): + if None in (gat, gcp, ogn): continue token_data = { - 'bas': bas, - 'icd': loc, + 'gcp': gcp, 'ogn': ogn, - 'sta': '0', + 'sta': 0, } media = self._download_json( - '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), - video_id, 'Downloading %s JSON' % location['loc']) - file_ = media.get('file') - if not file_: + gat, video_id, data=json.dumps(token_data).encode('utf-8'), + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }) + stream = media.get('stream') or media.get('file') + if not stream: continue - ext = determine_ext(file_) + ext = determine_ext(stream) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + stream, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 6fe3b6049..e24396e79 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -4,8 +4,11 @@ import datetime import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( ExtractorError, + InAdvancePagedList, + orderedSet, str_to_int, unified_strdate, ) @@ -114,3 +117,86 @@ class MotherlessIE(InfoExtractor): 'age_limit': age_limit, 'url': video_url, } + + +class MotherlessGroupIE(InfoExtractor): + _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P[a-z0-9_]+)' + _TESTS = [{ + 'url': 'http://motherless.com/g/movie_scenes', + 'info_dict': { + 'id': 'movie_scenes', + 'title': 'Movie Scenes', + 'description': 'Hot and sexy scenes from "regular" movies... ' + 'Beautiful actresses fully nude... A looot of ' + 'skin! :)Enjoy!', + }, + 'playlist_mincount': 662, + }, { + 'url': 'http://motherless.com/gv/sex_must_be_funny', + 'info_dict': { + 'id': 'sex_must_be_funny', + 'title': 'Sex must be funny', + 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' + 'any kind!' + }, + 'playlist_mincount': 9, + }] + + @classmethod + def suitable(cls, url): + return (False if MotherlessIE.suitable(url) + else super(MotherlessGroupIE, cls).suitable(url)) + + def _extract_entries(self, webpage, base): + entries = [] + for mobj in re.finditer( + r'href="(?P/[^"]+)"[^>]*>(?:\s*]+alt="[^-]+-\s(?P[^"]+)")?', + webpage): + video_url = compat_urlparse.urljoin(base, mobj.group('href')) + if not MotherlessIE.suitable(video_url): + continue + video_id = MotherlessIE._match_id(video_url) + title = mobj.group('title') + entries.append(self.url_result( + video_url, ie=MotherlessIE.ie_key(), video_id=video_id, + video_title=title)) + # Alternative fallback + if not entries: + entries = [ + self.url_result( + compat_urlparse.urljoin(base, '/' + video_id), + ie=MotherlessIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'data-codename=["\']([A-Z0-9]+)', webpage))] + return entries + + def _real_extract(self, url): + group_id = self._match_id(url) + page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id) + webpage = self._download_webpage(page_url, group_id) + title = self._search_regex( + r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False) + description = self._html_search_meta( + 'description', webpage, fatal=False) + page_count = self._int(self._search_regex( + r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT', + webpage, 'page_count'), 'page_count') + PAGE_SIZE = 80 + + def _get_page(idx): + webpage = self._download_webpage( + page_url, group_id, query={'page': idx + 1}, + note='Downloading page %d/%d' % (idx + 1, page_count) + ) + for entry in self._extract_entries(webpage, url): + yield entry + + playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': group_id, + 'title': title, + 'description': description, + 'entries': playlist + } diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 07528d140..aec2ea133 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -190,10 +190,12 @@ class NDREmbedBaseIE(InfoExtractor): ext = determine_ext(src, None) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id='hds')) + src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, + f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls', entry_protocol='m3u8_native')) + src, video_id, 'mp4', m3u8_id='hls', + entry_protocol='m3u8_native', fatal=False)) else: quality = f.get('quality') ff = { diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 8e13bcf1f..5c8b37e18 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -19,11 +19,11 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P<id>[\d-]+)' + _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer|live)/(?P<id>[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', - 'md5': '6ba728d85d60aa2e6dd37c9e70fdc6bc', + 'md5': '0b62089b479e06681abaaca9d204f152', 'info_dict': { 'id': '20079905452', 'ext': 'mp4', @@ -35,7 +35,6 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': int, 'age_limit': 0, }, - 'skip': 'Video has been blocked', }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', @@ -99,6 +98,9 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'http://mobile.ok.ru/video/20079905452', 'only_matching': True, + }, { + 'url': 'https://www.ok.ru/live/484531969818', + 'only_matching': True, }] def _real_extract(self, url): @@ -184,6 +186,10 @@ class OdnoklassnikiIE(InfoExtractor): }) return info + assert title + if provider == 'LIVE_TV_APP': + info['title'] = self._live_title(title) + quality = qualities(('4', '0', '1', '2', '3', '5')) formats = [{ @@ -210,6 +216,20 @@ class OdnoklassnikiIE(InfoExtractor): if fmt_type: fmt['quality'] = quality(fmt_type) + # Live formats + m3u8_url = metadata.get('hlsMasterPlaylistUrl') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8', + m3u8_id='hls', fatal=False)) + rtmp_url = metadata.get('rtmpUrl') + if rtmp_url: + formats.append({ + 'url': rtmp_url, + 'format_id': 'rtmp', + 'ext': 'flv', + }) + self._sort_formats(formats) info['formats'] = formats diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 8894f4b0c..6c9816eef 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -136,6 +136,25 @@ class SoundcloudIE(InfoExtractor): 'license': 'all-rights-reserved', }, }, + # no album art, use avatar pic for thumbnail + { + 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real', + 'md5': '59c7872bc44e5d99b7211891664760c2', + 'info_dict': { + 'id': '309699954', + 'ext': 'mp3', + 'title': 'Sideways (Prod. Mad Real)', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'uploader': 'garyvee', + 'upload_date': '20170226', + 'duration': 207, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + }, + 'params': { + 'skip_download': True, + }, + }, ] _CLIENT_ID = 'c6CU49JDMapyrQo06UxU9xouB9ZVzqCn' @@ -160,7 +179,7 @@ class SoundcloudIE(InfoExtractor): name = full_title or track_id if quiet: self.report_extraction(name) - thumbnail = info.get('artwork_url') + thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') ext = 'mp3' diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py new file mode 100644 index 000000000..3cb4d71a6 --- /dev/null +++ b/youtube_dl/extractor/weibo.py @@ -0,0 +1,140 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +import json +import random +import re + +from ..compat import ( + compat_parse_qs, + compat_str, +) +from ..utils import ( + js_to_json, + strip_jsonp, + urlencode_postdata, +) + + +class WeiboIE(InfoExtractor): + _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)' + _TEST = { + 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', + 'info_dict': { + 'id': 'Fp6RGfbff', + 'ext': 'mp4', + 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + # to get Referer url for genvisitor + webpage, urlh = self._download_webpage_handle(url, video_id) + + visitor_url = urlh.geturl() + + if 'passport.weibo.com' in visitor_url: + # first visit + visitor_data = self._download_json( + 'https://passport.weibo.com/visitor/genvisitor', video_id, + note='Generating first-visit data', + transform_source=strip_jsonp, + headers={'Referer': visitor_url}, + data=urlencode_postdata({ + 'cb': 'gen_callback', + 'fp': json.dumps({ + 'os': '2', + 'browser': 'Gecko57,0,0,0', + 'fonts': 'undefined', + 'screenInfo': '1440*900*24', + 'plugins': '', + }), + })) + + tid = visitor_data['data']['tid'] + cnfd = '%03d' % visitor_data['data']['confidence'] + + self._download_webpage( + 'https://passport.weibo.com/visitor/visitor', video_id, + note='Running first-visit callback', + query={ + 'a': 'incarnate', + 't': tid, + 'w': 2, + 'c': cnfd, + 'cb': 'cross_domain', + 'from': 'weibo', + '_rand': random.random(), + }) + + webpage = self._download_webpage( + url, video_id, note='Revisiting webpage') + + title = self._html_search_regex( + r'<title>(.+?)', webpage, 'title') + + video_formats = compat_parse_qs(self._search_regex( + r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) + + formats = [] + supported_resolutions = (480, 720) + for res in supported_resolutions: + vid_urls = video_formats.get(compat_str(res)) + if not vid_urls or not isinstance(vid_urls, list): + continue + + vid_url = vid_urls[0] + formats.append({ + 'url': vid_url, + 'height': res, + }) + + self._sort_formats(formats) + + uploader = self._og_search_property( + 'nick-name', webpage, 'uploader', default=None) + + return { + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'formats': formats + } + + +class WeiboMobileIE(InfoExtractor): + _VALID_URL = r'https?://m\.weibo\.cn/status/(?P[0-9]+)(\?.+)?' + _TEST = { + 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', + 'info_dict': { + 'id': '4189191225395228', + 'ext': 'mp4', + 'title': '午睡当然是要甜甜蜜蜜的啦', + 'uploader': '柴犬柴犬' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + # to get Referer url for genvisitor + webpage = self._download_webpage(url, video_id, note='visit the page') + + weibo_info = self._parse_json(self._search_regex( + r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};', + webpage, 'js_code', flags=re.DOTALL), + video_id, transform_source=js_to_json) + + status_data = weibo_info.get('status', {}) + page_info = status_data.get('page_info') + title = status_data['status_title'] + uploader = status_data.get('user', {}).get('screen_name') + + return { + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'url': page_info['media_info']['stream_url'] + } diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a3f84b9ea..9030e2415 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.12.31' +__version__ = '2018.01.07'