diff --git a/AUTHORS b/AUTHORS index ded9e87d2..d1693224e 100644 --- a/AUTHORS +++ b/AUTHORS @@ -139,3 +139,4 @@ slangangular Behrouz Abbasi ngld nyuszika7h +Shaun Walbridge diff --git a/README.md b/README.md index 15baf75ce..542a7c26a 100644 --- a/README.md +++ b/README.md @@ -272,6 +272,7 @@ The `-o` option allows users to indicate a template for the output file names. T - `autonumber`: The sequence will be replaced by a five-digit number that will be increased with each download, starting at zero. - `playlist`: The name or the id of the playlist that contains the video. - `playlist_index`: The index of the video in the playlist, a five-digit number. + - `format_id`: The sequence will be replaced by the format code specified by `--format`. The current default template is `%(title)s-%(id)s.%(ext)s`. @@ -544,7 +545,7 @@ If you want to add support for a new site, you can follow this quick list (assum ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want. +7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. 8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: @@ -572,7 +573,7 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) ``` -Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L69). For a start, if you want to intercept youtube-dl's output, set a `logger` object. +Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L117-L265). For a start, if you want to intercept youtube-dl's output, set a `logger` object. Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 83d21bd15..7459a1944 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -158,6 +158,7 @@ from .eporner import EpornerIE from .eroprofile import EroProfileIE from .escapist import EscapistIE from .espn import ESPNIE +from .esri import EsriVideoIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE from .expotv import ExpoTVIE @@ -522,6 +523,7 @@ from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE from .sexykarma import SexyKarmaIE +from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE from .sina import SinaIE diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5982055be..16ae4b98f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -640,7 +640,7 @@ class InfoExtractor(object): @staticmethod def _meta_regex(prop): return r'''(?isx)]+(?:itemprop|name|property|id)=(["\']?)%s\1) + (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(prop) def _og_search_property(self, prop, html, name=None, **kargs): diff --git a/youtube_dl/extractor/esri.py b/youtube_dl/extractor/esri.py new file mode 100644 index 000000000..bf5d2019f --- /dev/null +++ b/youtube_dl/extractor/esri.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + parse_filesize, + unified_strdate, +) + + +class EsriVideoIE(InfoExtractor): + _VALID_URL = r'https?://video\.esri\.com/watch/(?P[0-9]+)' + _TEST = { + 'url': 'https://video.esri.com/watch/1124/arcgis-online-_dash_-developing-applications', + 'md5': 'd4aaf1408b221f1b38227a9bbaeb95bc', + 'info_dict': { + 'id': '1124', + 'ext': 'mp4', + 'title': 'ArcGIS Online - Developing Applications', + 'description': 'Jeremy Bartley demonstrates how to develop applications with ArcGIS Online.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 185, + 'upload_date': '20120419', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + formats = [] + for width, height, content in re.findall( + r'(?s)
  • (\d+)x(\d+):(.+?)
  • ', webpage): + for video_url, ext, filesize in re.findall( + r']+href="([^"]+)">([^<]+) \(([^<]+)\)', content): + formats.append({ + 'url': compat_urlparse.urljoin(url, video_url), + 'ext': ext.lower(), + 'format_id': '%s-%s' % (ext.lower(), height), + 'width': int(width), + 'height': int(height), + 'filesize_approx': parse_filesize(filesize), + }) + self._sort_formats(formats) + + title = self._html_search_meta('title', webpage, 'title') + description = self._html_search_meta( + 'description', webpage, 'description', fatal=False) + + thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail', fatal=False) + if thumbnail: + thumbnail = re.sub(r'_[st]\.jpg$', '_x.jpg', thumbnail) + + duration = int_or_none(self._search_regex( + [r'var\s+videoSeconds\s*=\s*(\d+)', r"'duration'\s*:\s*(\d+)"], + webpage, 'duration', fatal=False)) + + upload_date = unified_strdate(self._html_search_meta( + 'last-modified', webpage, 'upload date', fatal=None)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'upload_date': upload_date, + 'formats': formats + } diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index deead220a..5b9157ed4 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -11,6 +11,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + clean_html, int_or_none, ) @@ -70,6 +71,15 @@ class LyndaBaseIE(InfoExtractor): 'Confirming log in and log out from another device') if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')): + if 'login error' in login_page: + mobj = re.search( + r'(?s)]+class="topmost">(?P[^<]+)</h1>\s*<div>(?P<description>.+?)</div>', + login_page) + if mobj: + raise ExtractorError( + 'lynda returned error: %s - %s' + % (mobj.group('title'), clean_html(mobj.group('description'))), + expected=True) raise ExtractorError('Unable to log in') diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py index 88dcd4f73..69e4bcd1a 100644 --- a/youtube_dl/extractor/moniker.py +++ b/youtube_dl/extractor/moniker.py @@ -9,7 +9,10 @@ from ..compat import ( compat_urllib_parse, compat_urllib_request, ) -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + remove_start, +) class MonikerIE(InfoExtractor): @@ -24,6 +27,14 @@ class MonikerIE(InfoExtractor): 'ext': 'mp4', 'title': 'youtube-dl test video', }, + }, { + 'url': 'http://allmyvideos.net/embed-jih3nce3x6wn', + 'md5': '710883dee1bfc370ecf9fa6a89307c88', + 'info_dict': { + 'id': 'jih3nce3x6wn', + 'ext': 'mp4', + 'title': 'youtube-dl test video', + }, }, { 'url': 'http://vidspot.net/l2ngsmhs8ci5', 'md5': '710883dee1bfc370ecf9fa6a89307c88', @@ -38,7 +49,10 @@ class MonikerIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + orig_video_id = self._match_id(url) + video_id = remove_start(orig_video_id, 'embed-') + url = url.replace(orig_video_id, video_id) + assert re.match(self._VALID_URL, url) is not None orig_webpage = self._download_webpage(url, video_id) if '>File Not Found<' in orig_webpage: diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py new file mode 100644 index 000000000..6e9903d5e --- /dev/null +++ b/youtube_dl/extractor/shahid.py @@ -0,0 +1,107 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) + + +class ShahidIE(InfoExtractor): + _VALID_URL = r'https?://shahid\.mbc\.net/ar/episode/(?P<id>\d+)/?' + _TESTS = [{ + 'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html', + 'info_dict': { + 'id': '90574', + 'ext': 'm3u8', + 'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3', + 'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان', + 'duration': 2972, + 'timestamp': 1422057420, + 'upload_date': '20150123', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + # shahid plus subscriber only + 'url': 'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html', + 'only_matching': True + }] + + def _handle_error(self, response): + if not isinstance(response, dict): + return + error = response.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), + expected=True) + + def _download_json(self, url, video_id, note='Downloading JSON metadata'): + response = super(ShahidIE, self)._download_json(url, video_id, note)['data'] + self._handle_error(response) + return response + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + api_vars = { + 'id': video_id, + 'type': 'player', + 'url': 'http://api.shahid.net/api/v1_1', + 'playerType': 'episode', + } + + flashvars = self._search_regex( + r'var\s+flashvars\s*=\s*({[^}]+})', webpage, 'flashvars', default=None) + if flashvars: + for key in api_vars.keys(): + value = self._search_regex( + r'\b%s\s*:\s*(?P<q>["\'])(?P<value>.+?)(?P=q)' % key, + flashvars, 'type', default=None, group='value') + if value: + api_vars[key] = value + + player = self._download_json( + 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html' + % (video_id, api_vars['type']), video_id, 'Downloading player JSON') + + formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4') + + video = self._download_json( + '%s/%s/%s?%s' % ( + api_vars['url'], api_vars['playerType'], api_vars['id'], + compat_urllib_parse.urlencode({ + 'apiKey': 'sh@hid0nlin3', + 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', + }).encode('utf-8')), + video_id, 'Downloading video JSON') + + video = video[api_vars['playerType']] + + title = video['title'] + description = video.get('description') + thumbnail = video.get('thumbnailUrl') + duration = int_or_none(video.get('duration')) + timestamp = parse_iso8601(video.get('referenceDate')) + categories = [ + category['name'] + for category in video.get('genres', []) if 'name' in category] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'categories': categories, + 'formats': formats, + }