diff --git a/.gitignore b/.gitignore index 86312d4e4..0422adf44 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,5 @@ updates_key.pem test/testdata .tox youtube-dl.zsh +.idea +.idea/* \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index c6cc7a994..cc21fae8f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,14 +2,16 @@ language: python python: - "2.6" - "2.7" + - "3.2" - "3.3" - "3.4" + - "3.5" +sudo: false script: nosetests test --verbose notifications: email: - filippo.valsorda@gmail.com - phihag@phihag.de - - jaime.marquinez.ferrandiz+travis@gmail.com - yasoob.khld@gmail.com # irc: # channels: diff --git a/AUTHORS b/AUTHORS index bd2e967e3..ce350e96c 100644 --- a/AUTHORS +++ b/AUTHORS @@ -89,3 +89,63 @@ Oskar Jauch Matthew Rayfield t0mm0 Tithen-Firion +Zack Fernandes +cryptonaut +Adrian Kretz +Mathias Rav +Petr Kutalek +Will Glynn +Max Reimann +Cédric Luthi +Thijs Vermeir +Joel Leclerc +Christopher Krooss +Ondřej Caletka +Dinesh S +Johan K. Jensen +Yen Chi Hsuan +Enam Mijbah Noor +David Luhmer +Shaya Goldberg +Paul Hartmann +Frans de Jonge +Robin de Rooij +Ryan Schmidt +Leslie P. Polzer +Duncan Keall +Alexander Mamay +Devin J. Pohly +Eduardo Ferro Aldama +Jeff Buchbinder +Amish Bhadeshia +Joram Schrijver +Will W. +Mohammad Teimori Pabandi +Roman Le Négrate +Matthias Küch +Julian Richen +Ping O. +Mister Hat +Peter Ding +jackyzy823 +George Brighton +Remita Amine +Aurélio A. Heckert +Bernhard Minks +sceext +Zach Bruggeman +Tjark Saul +slangangular +Behrouz Abbasi +ngld +nyuszika7h +Shaun Walbridge +Lee Jenkins +Anssi Hannula +Lukáš Lalinský +Qijiang Fan +Rémy Léone +Marco Ferragina +reiv +Muratcan Simsek +Evan Lu diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..d15267d7e --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,155 @@ +**Please include the full output of youtube-dl when run with `-v`**, i.e. add `-v` flag to your command line, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: +``` +$ youtube-dl -v http://www.youtube.com/watch?v=BaW_jenozKcj +[debug] System config: [] +[debug] User config: [] +[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] +[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 +[debug] youtube-dl version 2015.12.06 +[debug] Git HEAD: 135392e +[debug] Python version 2.6.6 - Windows-2003Server-5.2.3790-SP2 +[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 +[debug] Proxy map: {} +... +``` +**Do not post screenshots of verbose log only plain text is acceptable.** + +The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. + +Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist): + +### Is the description of the issue itself sufficient? + +We often get issue reports that we cannot really decipher. While in most cases we eventually get the required information after asking back multiple times, this poses an unnecessary drain on our resources. Many contributors, including myself, are also not native speakers, so we may misread some parts. + +So please elaborate on what feature you are requesting, or what bug you want to be fixed. Make sure that it's obvious + +- What the problem is +- How it could be fixed +- How your proposed solution would look like + +If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. + +For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. + +If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). + +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `http://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `http://www.youtube.com/`) is *not* an example URL. + +### Are you using the latest version? + +Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. + +### Is the issue already documented? + +Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/rg3/youtube-dl/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. + +### Why are existing options not enough? + +Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#synopsis). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. + +### Is there enough context in your bug report? + +People want to solve problems, and often think they do us a favor by breaking down their larger problems (e.g. wanting to skip already downloaded files) to a specific request (e.g. requesting us to look whether the file exists before downloading the info page). However, what often happens is that they break down the problem into two steps: One simple, and one impossible (or extremely complicated one). + +We are then presented with a very complicated request when the original problem could be solved far easier, e.g. by recording the downloaded video IDs in a separate file. To avoid this, you must include the greater context where it is non-obvious. In particular, every feature request that does not consist of adding support for a new site should contain a use case scenario that explains in what situation the missing feature would be useful. + +### Does the issue involve one problem, and one problem only? + +Some of our users seem to think there is a limit of issues they can or should open. There is no limit of issues they can or should open. While it may seem appealing to be able to dump all your issues into one ticket, that means that someone who solves one of your issues cannot mark the issue as closed. Typically, reporting a bunch of issues leads to the ticket lingering since nobody wants to attack that behemoth, until someone mercifully splits the issue into multiple ones. + +In particular, every site support request issue should only pertain to services at one site (generally under a common domain, but always using the same backend technology). Do not request support for vimeo user videos, Whitehouse podcasts, and Google Plus pages in the same issue. Also, make sure that you don't post bug reports alongside feature requests. As a rule of thumb, a feature request does not include outputs of youtube-dl that are not immediately related to the feature at hand. Do not post reports of a network error alongside the request for a new video service. + +### Is anyone going to need the feature? + +Only post features that you (or an incapacitated friend you can personally talk to) require. Do not post features because they seem like a good idea. If they are really useful, they will be requested by someone who requires them. + +### Is your question about youtube-dl? + +It may sound strange, but some bug reports we receive are completely unrelated to youtube-dl and relate to a different or even the reporter's own application. Please make sure that you are actually using youtube-dl. If you are using a UI for youtube-dl, report the bug to the maintainer of the actual application providing the UI. On the other hand, if your UI for youtube-dl fails in some way you believe is related to youtube-dl, by all means, go ahead and report the bug. + +# DEVELOPER INSTRUCTIONS + +Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution. + +To run youtube-dl as a developer, you don't need to build anything either. Simply execute + + python -m youtube_dl + +To run the test, simply invoke your favorite test runner, or execute a test file directly; any of the following work: + + python -m unittest discover + python test/test_download.py + nosetests + +If you want to create a build of youtube-dl yourself, you'll need + +* python +* make +* pandoc +* zip +* nosetests + +### Adding support for a new site + +If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`): + +1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) +2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git` +3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor` +4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`: + ```python + # coding: utf-8 + from __future__ import unicode_literals + + from .common import InfoExtractor + + + class YourExtractorIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' + _TEST = { + 'url': 'http://yourextractor.com/watch/42', + 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'info_dict': { + 'id': '42', + 'ext': 'mp4', + 'title': 'Video title goes here', + 'thumbnail': 're:^https?://.*\.jpg$', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # TODO more code goes here, for example ... + title = self._html_search_regex(r'

(.+?)

', webpage, 'title') + + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), + # TODO more properties (see youtube_dl/extractor/common.py) + } + ``` +5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. +8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). +9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: + + $ git add youtube_dl/extractor/__init__.py + $ git add youtube_dl/extractor/yourextractor.py + $ git commit -m '[yourextractor] Add new extractor' + $ git push origin yourextractor + +10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. + +In any case, thank you very much for your contributions! + diff --git a/Makefile b/Makefile index 3e1debc7e..f826c1685 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,8 @@ -all: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish +all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part - -cleanall: clean - rm -f youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe + find . -name "*.pyc" -delete PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin @@ -35,13 +33,22 @@ install: youtube-dl youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtu install -d $(DESTDIR)$(SYSCONFDIR)/fish/completions install -m 644 youtube-dl.fish $(DESTDIR)$(SYSCONFDIR)/fish/completions/youtube-dl.fish +codetest: + flake8 . + test: #nosetests --with-coverage --cover-package=youtube_dl --cover-html --verbose --processes 4 test nosetests --verbose test + $(MAKE) codetest + +ot: offlinetest + +offlinetest: codetest + nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py tar: youtube-dl.tar.gz -.PHONY: all clean install test tar bash-completion pypi-files zsh-completion fish-completion +.PHONY: all clean install test tar bash-completion pypi-files zsh-completion fish-completion ot offlinetest codetest supportedsites pypi-files: youtube-dl.bash-completion README.txt youtube-dl.1 youtube-dl.fish @@ -54,28 +61,34 @@ youtube-dl: youtube_dl/*.py youtube_dl/*/*.py chmod a+x youtube-dl README.md: youtube_dl/*.py youtube_dl/*/*.py - COLUMNS=80 python -m youtube_dl --help | python devscripts/make_readme.py + COLUMNS=80 $(PYTHON) youtube_dl/__main__.py --help | $(PYTHON) devscripts/make_readme.py + +CONTRIBUTING.md: README.md + $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md + +supportedsites: + $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md README.txt: README.md pandoc -f markdown -t plain README.md -o README.txt youtube-dl.1: README.md - python devscripts/prepare_manpage.py >youtube-dl.1.temp.md + $(PYTHON) devscripts/prepare_manpage.py >youtube-dl.1.temp.md pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1 rm -f youtube-dl.1.temp.md youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in - python devscripts/bash-completion.py + $(PYTHON) devscripts/bash-completion.py bash-completion: youtube-dl.bash-completion youtube-dl.zsh: youtube_dl/*.py youtube_dl/*/*.py devscripts/zsh-completion.in - python devscripts/zsh-completion.py + $(PYTHON) devscripts/zsh-completion.py zsh-completion: youtube-dl.zsh youtube-dl.fish: youtube_dl/*.py youtube_dl/*/*.py devscripts/fish-completion.in - python devscripts/fish-completion.py + $(PYTHON) devscripts/fish-completion.py fish-completion: youtube-dl.fish diff --git a/README.md b/README.md index d6e7ff902..3a4707227 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,29 @@ youtube-dl - download videos from youtube.com or other video platforms -# SYNOPSIS -**youtube-dl** [OPTIONS] URL [URL...] +- [INSTALLATION](#installation) +- [DESCRIPTION](#description) +- [OPTIONS](#options) +- [CONFIGURATION](#configuration) +- [OUTPUT TEMPLATE](#output-template) +- [FORMAT SELECTION](#format-selection) +- [VIDEO SELECTION](#video-selection) +- [FAQ](#faq) +- [DEVELOPER INSTRUCTIONS](#developer-instructions) +- [EMBEDDING YOUTUBE-DL](#embedding-youtube-dl) +- [BUGS](#bugs) +- [COPYRIGHT](#copyright) # INSTALLATION To install it right away for all UNIX users (Linux, OS X, etc.), type: sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl - sudo chmod a+x /usr/local/bin/youtube-dl + sudo chmod a+rx /usr/local/bin/youtube-dl If you do not have curl, you can alternatively use a recent wget: sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl - sudo chmod a+x /usr/local/bin/youtube-dl + sudo chmod a+rx /usr/local/bin/youtube-dl Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29). @@ -25,7 +35,7 @@ You can also use pip: sudo pip install youtube-dl -Alternatively, refer to the developer instructions below for how to check out and work with the git repository. For further options, including PGP signatures, see https://rg3.github.io/youtube-dl/download.html . +Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://rg3.github.io/youtube-dl/download.html). # DESCRIPTION **youtube-dl** is a small command-line program to download videos from @@ -34,29 +44,28 @@ YouTube.com and a few more sites. It requires the Python interpreter, version your Unix box, on Windows or on Mac OS X. It is released to the public domain, which means you can modify it, redistribute it or use it however you like. + youtube-dl [OPTIONS] URL [URL...] + # OPTIONS - -h, --help print this help text and exit - --version print program version and exit - -U, --update update this program to latest version. Make + -h, --help Print this help text and exit + --version Print program version and exit + -U, --update Update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed) - -i, --ignore-errors continue on download errors, for example to + -i, --ignore-errors Continue on download errors, for example to skip unavailable videos in a playlist --abort-on-error Abort downloading of further videos (in the playlist or the command line) if an error occurs - --dump-user-agent display the current browser identification - --list-extractors List all supported extractors and the URLs - they would handle + --dump-user-agent Display the current browser identification + --list-extractors List all supported extractors --extractor-descriptions Output descriptions of all supported extractors - --proxy URL Use the specified HTTP/HTTPS proxy. Pass in - an empty string (--proxy "") for direct - connection - --socket-timeout None Time to wait before giving up, in seconds + --force-generic-extractor Force extraction to use the generic + extractor --default-search PREFIX Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos - from google videos for youtube-dl "large + from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws @@ -65,37 +74,82 @@ which means you can modify it, redistribute it or use it however you like. this is not possible instead of searching. --ignore-config Do not read configuration files. When given in the global configuration file /etc - /youtube-dl.conf: do not read the user - configuration in ~/.config/youtube-dl.conf - (%APPDATA%/youtube-dl/config.txt on - Windows) + /youtube-dl.conf: Do not read the user + configuration in ~/.config/youtube- + dl/config (%APPDATA%/youtube-dl/config.txt + on Windows) --flat-playlist Do not extract the videos of a playlist, only list them. + --no-color Do not emit color codes in output + +## Network Options: + --proxy URL Use the specified HTTP/HTTPS proxy. Pass in + an empty string (--proxy "") for direct + connection + --socket-timeout SECONDS Time to wait before giving up, in seconds + --source-address IP Client-side IP address to bind to + (experimental) + -4, --force-ipv4 Make all connections via IPv4 + (experimental) + -6, --force-ipv6 Make all connections via IPv6 + (experimental) + --cn-verification-proxy URL Use this proxy to verify the IP address for + some Chinese sites. The default proxy + specified by --proxy (or none, if the + options is not present) is used for the + actual downloading. (experimental) ## Video Selection: - --playlist-start NUMBER playlist video to start at (default is 1) - --playlist-end NUMBER playlist video to end at (default is last) - --match-title REGEX download only matching titles (regex or + --playlist-start NUMBER Playlist video to start at (default is 1) + --playlist-end NUMBER Playlist video to end at (default is last) + --playlist-items ITEM_SPEC Playlist video items to download. Specify + indices of the videos in the playlist + separated by commas like: "--playlist-items + 1,2,5,8" if you want to download videos + indexed 1, 2, 5, 8 in the playlist. You can + specify range: "--playlist-items + 1-3,7,10-13", it will download the videos + at index 1, 2, 3, 7, 10, 11, 12 and 13. + --match-title REGEX Download only matching titles (regex or caseless sub-string) - --reject-title REGEX skip download for matching titles (regex or + --reject-title REGEX Skip download for matching titles (regex or caseless sub-string) --max-downloads NUMBER Abort after downloading NUMBER files --min-filesize SIZE Do not download any videos smaller than SIZE (e.g. 50k or 44.6m) --max-filesize SIZE Do not download any videos larger than SIZE (e.g. 50k or 44.6m) - --date DATE download only videos uploaded in this date - --datebefore DATE download only videos uploaded on or before + --date DATE Download only videos uploaded in this date + --datebefore DATE Download only videos uploaded on or before this date (i.e. inclusive) - --dateafter DATE download only videos uploaded on or after + --dateafter DATE Download only videos uploaded on or after this date (i.e. inclusive) --min-views COUNT Do not download any videos with less than COUNT views --max-views COUNT Do not download any videos with more than COUNT views - --no-playlist If the URL refers to a video and a - playlist, download only the video. - --age-limit YEARS download only videos suitable for the given + --match-filter FILTER Generic video filter (experimental). + Specify any key (see help for -o for a list + of available keys) to match if the key is + present, !key to check if the key is not + present,key > NUMBER (like "comment_count > + 12", also works with >=, <, <=, !=, =) to + compare against a number, and & to require + multiple matches. Values which are not + known are excluded unless you put a + question mark (?) after the operator.For + example, to only match videos that have + been liked more than 100 times and disliked + less than 50 times (or the dislike + functionality is not available at the given + service), but who also have a description, + use --match-filter "like_count > 100 & + dislike_count + (requires ffmpeg or - avconv), for example -f - bestvideo+bestaudio. - --all-formats download all available video formats - --prefer-free-formats prefer free video formats unless a specific + -f, --format FORMAT Video format code, see the "FORMAT + SELECTION" for all the info + --all-formats Download all available video formats + --prefer-free-formats Prefer free video formats unless a specific one is requested - --max-quality FORMAT highest quality format to download - -F, --list-formats list all available formats - --youtube-skip-dash-manifest Do not download the DASH manifest on - YouTube videos + -F, --list-formats List all available formats of requested + videos + --youtube-skip-dash-manifest Do not download the DASH manifests and + related data on YouTube videos + --merge-output-format FORMAT If a merge is required (e.g. + bestvideo+bestaudio), output to given + container format. One of mkv, mp4, ogg, + webm, flv. Ignored if no merge is required ## Subtitle Options: - --write-sub write subtitle file - --write-auto-sub write automatic subtitle file (youtube - only) - --all-subs downloads all the available subtitles of - the video - --list-subs lists all available subtitles for the video - --sub-format FORMAT subtitle format (default=srt) ([sbv/vtt] - youtube only) - --sub-lang LANGS languages of the subtitles to download + --write-sub Write subtitle file + --write-auto-sub Write automatically generated subtitle file + (YouTube only) + --all-subs Download all the available subtitles of the + video + --list-subs List all available subtitles for the video + --sub-format FORMAT Subtitle format, accepts formats + preference, for example: "srt" or + "ass/srt/best" + --sub-lang LANGS Languages of the subtitles to download (optional) separated by commas, use IETF language tags like 'en,pt' ## Authentication Options: - -u, --username USERNAME login with this account ID - -p, --password PASSWORD account password - -2, --twofactor TWOFACTOR two-factor auth code - -n, --netrc use .netrc authentication data - --video-password PASSWORD video password (vimeo, smotri) + -u, --username USERNAME Login with this account ID + -p, --password PASSWORD Account password. If this option is left + out, youtube-dl will ask interactively. + -2, --twofactor TWOFACTOR Two-factor auth code + -n, --netrc Use .netrc authentication data + --video-password PASSWORD Video password (vimeo, smotri, youku) ## Post-processing Options: - -x, --extract-audio convert video files to audio-only files + -x, --extract-audio Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe) - --audio-format FORMAT "best", "aac", "vorbis", "mp3", "m4a", - "opus", or "wav"; "best" by default - --audio-quality QUALITY ffmpeg/avconv audio quality specification, - insert a value between 0 (better) and 9 - (worse) for VBR or a specific bitrate like - 128K (default 5) + --audio-format FORMAT Specify audio format: "best", "aac", + "vorbis", "mp3", "m4a", "opus", or "wav"; + "best" by default + --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert + a value between 0 (better) and 9 (worse) + for VBR or a specific bitrate like 128K + (default 5) --recode-video FORMAT Encode the video to another format if necessary (currently supported: - mp4|flv|ogg|webm|mkv) - -k, --keep-video keeps the video file on disk after the - post-processing; the video is erased by - default - --no-post-overwrites do not overwrite post-processed files; the + mp4|flv|ogg|webm|mkv|avi) + --postprocessor-args ARGS Give these arguments to the postprocessor + -k, --keep-video Keep the video file on disk after the post- + processing; the video is erased by default + --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default - --embed-subs embed subtitles in the video (only for mp4 - videos) - --embed-thumbnail embed thumbnail in the audio as cover art - --add-metadata write metadata to the video file - --xattrs write metadata to the video file's xattrs + --embed-subs Embed subtitles in the video (only for mkv + and mp4 videos) + --embed-thumbnail Embed thumbnail in the audio as cover art + --add-metadata Write metadata to the video file + --metadata-from-title FORMAT Parse additional metadata like song title / + artist from the video title. The format + syntax is the same as --output, the parsed + parameters replace existing values. + Additional templates: %(album)s, + %(artist)s. Example: --metadata-from-title + "%(artist)s - %(title)s" matches a title + like "Coldplay - Paradise" + --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) + --fixup POLICY Automatically correct known faults of the + file. One of never (do nothing), warn (only + emit a warning), detect_or_warn (the + default; fix file if we can, warn + otherwise) --prefer-avconv Prefer avconv over ffmpeg for running the postprocessors (default) --prefer-ffmpeg Prefer ffmpeg over avconv for running the postprocessors + --ffmpeg-location PATH Location of the ffmpeg/avconv binary; + either the path to the binary or its + containing directory. --exec CMD Execute a command on the file after downloading, similar to find's -exec syntax. Example: --exec 'adb push {} /sdcard/Music/ && rm {}' + --convert-subtitles FORMAT Convert the subtitles to other format + (currently supported: srt|ass|vtt) # CONFIGURATION -You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\\youtube-dl.conf`. +You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime and use a proxy: +``` +--extract-audio +--no-mtime +--proxy 127.0.0.1:3128 +``` + +You can use `--ignore-config` if you want to disable the configuration file for a particular youtube-dl run. + +### Authentication with `.netrc` file + +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on per extractor basis. For that you will need to create a`.netrc` file in your `$HOME` and restrict permissions to read/write by you only: +``` +touch $HOME/.netrc +chmod a-rwx,u+rw $HOME/.netrc +``` +After that you can add credentials for extractor in the following format, where *extractor* is the name of extractor in lowercase: +``` +machine login password +``` +For example: +``` +machine youtube login myaccount@gmail.com password my_youtube_password +machine twitch login my_twitch_account_name password my_twitch_password +``` +To activate authentication with the `.netrc` file you should pass `--netrc` to youtube-dl or place it in the [configuration file](#configuration). + +On Windows you may also need to setup the `%HOME%` environment variable manually. # OUTPUT TEMPLATE -The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parenthesis, followed by a lowercase S. Allowed names are: +The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a lowercase S. Allowed names are: - `id`: The sequence will be replaced by the video identifier. - `url`: The sequence will be replaced by the video URL. @@ -327,8 +446,10 @@ The `-o` option allows users to indicate a template for the output file names. T - `ext`: The sequence will be replaced by the appropriate extension (like flv or mp4). - `epoch`: The sequence will be replaced by the Unix epoch when creating the file. - `autonumber`: The sequence will be replaced by a five-digit number that will be increased with each download, starting at zero. - - `playlist`: The name or the id of the playlist that contains the video. - - `playlist_index`: The index of the video in the playlist, a five-digit number. + - `playlist`: The sequence will be replaced by the name or the id of the playlist that contains the video. + - `playlist_index`: The sequence will be replaced by the index of the video in the playlist padded with leading zeros according to the total length of the playlist. + - `format_id`: The sequence will be replaced by the format code specified by `--format`. + - `duration`: The sequence will be replaced by the length of the video in seconds. The current default template is `%(title)s-%(id)s.%(ext)s`. @@ -341,9 +462,20 @@ $ youtube-dl --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc --restrict-filena youtube-dl_test_video_.mp4 # A simple file name ``` +# FORMAT SELECTION + +By default youtube-dl tries to download the best quality, but sometimes you may want to download in a different format. +The simplest case is requesting a specific format, for example `-f 22`. You can get the list of available formats using `--list-formats`, you can also use a file extension (currently it supports aac, m4a, mp3, mp4, ogg, wav, webm) or the special names `best`, `bestvideo`, `bestaudio` and `worst`. + +If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`. You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f +` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`. Format selectors can also be grouped using parentheses, for example if you want to download the best mp4 and webm formats with a height lower than 480 you can use `-f '(mp4,webm)[height<480]'`. + +Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading the best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. + +If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download the best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl. + # VIDEO SELECTION -Videos can be filtered by their upload date using the options `--date`, `--datebefore` or `--dateafter`, they accept dates in two formats: +Videos can be filtered by their upload date using the options `--date`, `--datebefore` or `--dateafter`. They accept dates in two formats: - Absolute dates: Dates in the format `YYYYMMDD`. - Relative dates: Dates in the format `(now|today)[+-][0-9](day|week|month|year)(s)?` @@ -357,7 +489,7 @@ $ youtube-dl --dateafter now-6months # Download only the videos uploaded on January 1, 1970 $ youtube-dl --date 19700101 -$ # will only download the videos uploaded in the 200x decade +$ # Download only the videos uploaded in the 200x decade $ youtube-dl --dateafter 20000101 --datebefore 20091231 ``` @@ -369,7 +501,7 @@ If you've followed [our manual installation instructions](http://rg3.github.io/y If you have used pip, a simple `sudo pip install -U youtube-dl` is sufficient to update. -If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distributions serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. +If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. As a last resort, you can also uninstall the version installed by your package manager and follow our manual installation instructions. For that, remove the distribution's package, with a line like @@ -391,11 +523,11 @@ YouTube changed their playlist format in March 2014 and later on, so you'll need If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to [report bugs](https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to the [Ubuntu packaging guys](mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl) - all they have to do is update the package to a somewhat recent version. See above for a way to update. -### Do I always have to pass in `--max-quality FORMAT`, or `-citw`? +### Do I always have to pass `-citw`? -By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, `--max-quality` *limits* the video quality (so if you want the best quality, do NOT pass it in), and the only option out of `-citw` that is regularly useful is `-i`. +By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, the only option out of `-citw` that is regularly useful is `-i`. -### Can you please put the -b option back? +### Can you please put the `-b` option back? Most people asking this question are not aware that youtube-dl now defaults to downloading the highest available quality as reported by YouTube, which will be 1080p or 720p in some cases, so you no longer need the `-b` option. For some specific videos, maybe YouTube does not report them to be available in a specific high quality format you're interested in. In that case, simply request it with the `-f` option and youtube-dl will try to download it. @@ -403,23 +535,59 @@ Most people asking this question are not aware that youtube-dl now defaults to d Apparently YouTube requires you to pass a CAPTCHA test if you download too much. We're [considering to provide a way to let you solve the CAPTCHA](https://github.com/rg3/youtube-dl/issues/154), but at the moment, your best course of action is pointing a webbrowser to the youtube URL, solving the CAPTCHA, and restart youtube-dl. +### Do I need any other programs? + +youtube-dl works fine on its own on most sites. However, if you want to convert video/audio, you'll need [avconv](https://libav.org/) or [ffmpeg](https://www.ffmpeg.org/). On some sites - most notably YouTube - videos can be retrieved in a higher quality format without sound. youtube-dl will detect whether avconv/ffmpeg is present and automatically pick the best option. + +Videos or video formats streamed via RTMP protocol can only be downloaded when [rtmpdump](https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either [mplayer](http://mplayerhq.hu/) or [mpv](https://mpv.io/) to be installed. + ### I have downloaded a video but how can I play it? Once the video is fully downloaded, use any video player, such as [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). -### The links provided by youtube-dl -g are not working anymore +### I extracted a video URL with `-g`, but it does not play on another machine / in my webbrowser. -The URLs youtube-dl outputs require the downloader to have the correct cookies. Use the `--cookies` option to write the required cookies into a file, and advise your downloader to read cookies from that file. Some sites also require a common user agent to be used, use `--dump-user-agent` to see the one in use by youtube-dl. +It depends a lot on the service. In many cases, requests for the video (to download/play it) must come from the same IP address and with the same cookies. Use the `--cookies` option to write the required cookies into a file, and advise your downloader to read cookies from that file. Some sites also require a common user agent to be used, use `--dump-user-agent` to see the one in use by youtube-dl. + +It may be beneficial to use IPv6; in some cases, the restrictions are only applied to IPv4. Some services (sometimes only for a subset of videos) do not restrict the video URL by IP address, cookie, or user-agent, but these are the exception rather than the rule. + +Please bear in mind that some URL protocols are **not** supported by browsers out of the box, including RTMP. If you are using `-g`, your own downloader must support these as well. + +If you want to play the video on a machine that is not running youtube-dl, you can relay the video content from the machine that runs youtube-dl. You can use `-o -` to let youtube-dl stream a video to stdout, or simply allow the player to download the files written by youtube-dl in turn. ### ERROR: no fmt_url_map or conn information found in video info -youtube has switched to a new video info format in July 2011 which is not supported by old versions of youtube-dl. You can update youtube-dl with `sudo youtube-dl --update`. +YouTube has switched to a new video info format in July 2011 which is not supported by old versions of youtube-dl. See [above](#how-do-i-update-youtube-dl) for how to update youtube-dl. -### ERROR: unable to download video ### +### ERROR: unable to download video -youtube requires an additional signature since September 2012 which is not supported by old versions of youtube-dl. You can update youtube-dl with `sudo youtube-dl --update`. +YouTube requires an additional signature since September 2012 which is not supported by old versions of youtube-dl. See [above](#how-do-i-update-youtube-dl) for how to update youtube-dl. -### SyntaxError: Non-ASCII character ### +### Video URL contains an ampersand and I'm getting some strange output `[1] 2839` or `'v' is not recognized as an internal or external command` + +That's actually the output from your shell. Since ampersand is one of the special shell characters it's interpreted by the shell preventing you from passing the whole URL to youtube-dl. To disable your shell from interpreting the ampersands (or any other special characters) you have to either put the whole URL in quotes or escape them with a backslash (which approach will work depends on your shell). + +For example if your URL is https://www.youtube.com/watch?t=4&v=BaW_jenozKc you should end up with following command: + +```youtube-dl 'https://www.youtube.com/watch?t=4&v=BaW_jenozKc'``` + +or + +```youtube-dl https://www.youtube.com/watch?t=4\&v=BaW_jenozKc``` + +For Windows you have to use the double quotes: + +```youtube-dl "https://www.youtube.com/watch?t=4&v=BaW_jenozKc"``` + +### ExtractorError: Could not find JS function u'OF' + +In February 2015, the new YouTube player contained a character sequence in a string that was misinterpreted by old versions of youtube-dl. See [above](#how-do-i-update-youtube-dl) for how to update youtube-dl. + +### HTTP Error 429: Too Many Requests or 402: Payment Required + +These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. + +### SyntaxError: Non-ASCII character The error @@ -436,6 +604,59 @@ Since June 2012 (#342) youtube-dl is packed as an executable zipfile, simply unz To run the exe you need to install first the [Microsoft Visual C++ 2008 Redistributable Package](http://www.microsoft.com/en-us/download/details.aspx?id=29). +### On Windows, how should I set up ffmpeg and youtube-dl? Where should I put the exe files? + +If you put youtube-dl and ffmpeg in the same directory that you're running the command from, it will work, but that's rather cumbersome. + +To make a different directory work - either for ffmpeg, or for youtube-dl, or for both - simply create the directory (say, `C:\bin`, or `C:\Users\\bin`), put all the executables directly in there, and then [set your PATH environment variable](https://www.java.com/en/download/help/path.xml) to include that directory. + +From then on, after restarting your shell, you will be able to access both youtube-dl and ffmpeg (and youtube-dl will be able to find ffmpeg) by simply typing `youtube-dl` or `ffmpeg`, no matter what directory you're in. + +### How do I put downloads into a specific folder? + +Use the `-o` to specify an [output template](#output-template), for example `-o "/home/user/videos/%(title)s-%(id)s.%(ext)s"`. If you want this for all of your downloads, put the option into your [configuration file](#configuration). + +### How do I download a video starting with a `-`? + +Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the options with `--`: + + youtube-dl -- -wNyEUrxzFU + youtube-dl "http://www.youtube.com/watch?v=-wNyEUrxzFU" + +### How do I pass cookies to youtube-dl? + +Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. + +Passing cookies to youtube-dl is a good way to workaround login when a particular extractor does not implement it explicitly. + +### Can you add support for this anime video site, or site which shows current movies for free? + +As a matter of policy (as well as legality), youtube-dl does not include support for services that specialize in infringing copyright. As a rule of thumb, if you cannot easily find a video that the service is quite obviously allowed to distribute (i.e. that has been uploaded by the creator, the creator's distributor, or is published under a free license), the service is probably unfit for inclusion to youtube-dl. + +A note on the service that they don't host the infringing content, but just link to those who do, is evidence that the service should **not** be included into youtube-dl. The same goes for any DMCA note when the whole front page of the service is filled with videos they are not allowed to distribute. A "fair use" note is equally unconvincing if the service shows copyright-protected videos in full without authorization. + +Support requests for services that **do** purchase the rights to distribute their content are perfectly fine though. If in doubt, you can simply include a source that mentions the legitimate purchase of content. + +### How can I speed up work on my issue? + +(Also known as: Help, my important issue not being solved!) The youtube-dl core developer team is quite small. While we do our best to solve as many issues as possible, sometimes that can take quite a while. To speed up your issue, here's what you can do: + +First of all, please do report the issue [at our issue tracker](https://yt-dl.org/bugs). That allows us to coordinate all efforts by users and developers, and serves as a unified point. Unfortunately, the youtube-dl project has grown too large to use personal email as an effective communication channel. + +Please read the [bug reporting instructions](#bugs) below. A lot of bugs lack all the necessary information. If you can, offer proxy, VPN, or shell access to the youtube-dl developers. If you are able to, test the issue from multiple computers in multiple countries to exclude local censorship or misconfiguration issues. + +If nobody is interested in solving your issue, you are welcome to take matters into your own hands and submit a pull request (or coerce/pay somebody else to do so). + +Feel free to bump the issue from time to time by writing a small comment ("Issue is still present in youtube-dl version ...from France, but fixed from Belgium"), but please not more than once a month. Please do not declare your issue as `important` or `urgent`. + +### How can I detect whether a given URL is supported by youtube-dl? + +For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/video/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. + +It is *not* possible to detect whether a URL is supported or not. That's because youtube-dl contains a generic extractor which matches **all** URLs. You may be tempted to disable, exclude, or remove the generic extractor, but the generic extractor not only allows users to extract videos from lots of websites that embed a video from another service, but may also be used to extract video from a service that it's hosting itself. Therefore, we neither recommend nor support disabling, excluding, or removing the generic extractor. + +If you want to find out whether a given URL is supported, simply call youtube-dl with it. If you get no videos back, chances are the URL is either not referring to a video or unsupported. You can find out which by examining the output (if you run youtube-dl on the console) or catching an `UnsupportedError` exception if you run it from a Python program. + # DEVELOPER INSTRUCTIONS Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution. @@ -496,19 +717,20 @@ If you want to add support for a new site, you can follow this quick list (assum webpage = self._download_webpage(url, video_id) # TODO more code goes here, for example ... - title = self._html_search_regex(r'

(.*?)

', webpage, 'title') + title = self._html_search_regex(r'

(.+?)

', webpage, 'title') return { 'id': video_id, 'title': title, 'description': self._og_search_description(webpage), + 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), # TODO more properties (see youtube_dl/extractor/common.py) } ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want. -8. If you can, check the code with [pyflakes](https://pypi.python.org/pypi/pyflakes) (a good idea) and [pep8](https://pypi.python.org/pypi/pep8) (optional, ignore E501). +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. +8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/__init__.py @@ -526,23 +748,77 @@ youtube-dl makes the best effort to be a good command-line program, and thus sho From a Python program, you can embed youtube-dl in a more powerful fashion, like this: - import youtube_dl +```python +from __future__ import unicode_literals +import youtube_dl - ydl_opts = {} - with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) +ydl_opts = {} +with youtube_dl.YoutubeDL(ydl_opts) as ydl: + ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) +``` -Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L69). For a start, if you want to intercept youtube-dl's output, set a `logger` object. +Most likely, you'll want to use various options. For a list of what can be done, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L121-L269). For a start, if you want to intercept youtube-dl's output, set a `logger` object. + +Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file: + +```python +from __future__ import unicode_literals +import youtube_dl + + +class MyLogger(object): + def debug(self, msg): + pass + + def warning(self, msg): + pass + + def error(self, msg): + print(msg) + + +def my_hook(d): + if d['status'] == 'finished': + print('Done downloading, now converting ...') + + +ydl_opts = { + 'format': 'bestaudio/best', + 'postprocessors': [{ + 'key': 'FFmpegExtractAudio', + 'preferredcodec': 'mp3', + 'preferredquality': '192', + }], + 'logger': MyLogger(), + 'progress_hooks': [my_hook], +} +with youtube_dl.YoutubeDL(ydl_opts) as ydl: + ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) +``` # BUGS -Bugs and suggestions should be reported at: . Unless you were prompted so or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. +Bugs and suggestions should be reported at: . Unless you were prompted so or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](http://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). -Please include the full output of the command when run with `--verbose`. The output (including the first lines) contain important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. +**Please include the full output of youtube-dl when run with `-v`**, i.e. add `-v` flag to your command line, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: +``` +$ youtube-dl -v http://www.youtube.com/watch?v=BaW_jenozKcj +[debug] System config: [] +[debug] User config: [] +[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] +[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 +[debug] youtube-dl version 2015.12.06 +[debug] Git HEAD: 135392e +[debug] Python version 2.6.6 - Windows-2003Server-5.2.3790-SP2 +[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 +[debug] Proxy map: {} +... +``` +**Do not post screenshots of verbose log only plain text is acceptable.** -For discussions, join us in the irc channel #youtube-dl on freenode. +The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. -When you submit a request, please re-read it once to avoid a couple of mistakes (you can and should use this as a checklist): +Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist): ### Is the description of the issue itself sufficient? @@ -554,19 +830,21 @@ So please elaborate on what feature you are requesting, or what bug you want to - How it could be fixed - How your proposed solution would look like -If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a commiter myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. +If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. -For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. +For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -Site support requests **must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. +If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). + +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `http://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `http://www.youtube.com/`) is *not* an example URL. ### Are you using the latest version? -Before reporting any issue, type youtube-dl -U. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. +Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. ### Is the issue already documented? -Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or at https://github.com/rg3/youtube-dl/search?type=Issues . If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. +Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/rg3/youtube-dl/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. ### Why are existing options not enough? @@ -586,7 +864,7 @@ In particular, every site support request issue should only pertain to services ### Is anyone going to need the feature? -Only post features that you (or an incapicated friend you can personally talk to) require. Do not post features because they seem like a good idea. If they are really useful, they will be requested by someone who requires them. +Only post features that you (or an incapacitated friend you can personally talk to) require. Do not post features because they seem like a good idea. If they are really useful, they will be requested by someone who requires them. ### Is your question about youtube-dl? @@ -596,4 +874,4 @@ It may sound strange, but some bug reports we receive are completely unrelated t youtube-dl is released into the public domain by the copyright holders. -This README file was originally written by Daniel Bolton () and is likewise released into the public domain. +This README file was originally written by [Daniel Bolton](https://github.com/dbbolton) and is likewise released into the public domain. diff --git a/devscripts/bash-completion.py b/devscripts/bash-completion.py index cd26cc089..ce68f26f9 100755 --- a/devscripts/bash-completion.py +++ b/devscripts/bash-completion.py @@ -5,7 +5,7 @@ import os from os.path import dirname as dirn import sys -sys.path.append(dirn(dirn((os.path.abspath(__file__))))) +sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) import youtube_dl BASH_COMPLETION_FILE = "youtube-dl.bash-completion" diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 216282712..7a219ebe9 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -28,7 +28,7 @@ for test in get_testcases(): if METHOD == 'EURISTIC': try: webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() - except: + except Exception: print('\nFail: {0}'.format(test['name'])) continue @@ -45,12 +45,12 @@ for test in get_testcases(): RESULT = ('.' + domain + '\n' in LIST or '\n' + domain + '\n' in LIST) - if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] - or test['info_dict']['age_limit'] != 18): + if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] or + test['info_dict']['age_limit'] != 18): print('\nPotential missing age_limit check: {0}'.format(test['name'])) - elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] - and test['info_dict']['age_limit'] == 18): + elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] and + test['info_dict']['age_limit'] == 18): print('\nPotential false negative: {0}'.format(test['name'])) else: diff --git a/devscripts/fish-completion.py b/devscripts/fish-completion.py index c2f238798..41629d87d 100755 --- a/devscripts/fish-completion.py +++ b/devscripts/fish-completion.py @@ -6,7 +6,7 @@ import os from os.path import dirname as dirn import sys -sys.path.append(dirn(dirn((os.path.abspath(__file__))))) +sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) import youtube_dl from youtube_dl.utils import shell_quote diff --git a/devscripts/generate_aes_testdata.py b/devscripts/generate_aes_testdata.py new file mode 100644 index 000000000..2e389fc8e --- /dev/null +++ b/devscripts/generate_aes_testdata.py @@ -0,0 +1,42 @@ +from __future__ import unicode_literals + +import codecs +import subprocess + +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.utils import intlist_to_bytes +from youtube_dl.aes import aes_encrypt, key_expansion + +secret_msg = b'Secret message goes here' + + +def hex_str(int_list): + return codecs.encode(intlist_to_bytes(int_list), 'hex') + + +def openssl_encode(algo, key, iv): + cmd = ['openssl', 'enc', '-e', '-' + algo, '-K', hex_str(key), '-iv', hex_str(iv)] + prog = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + out, _ = prog.communicate(secret_msg) + return out + +iv = key = [0x20, 0x15] + 14 * [0] + +r = openssl_encode('aes-128-cbc', key, iv) +print('aes_cbc_decrypt') +print(repr(r)) + +password = key +new_key = aes_encrypt(password, key_expansion(password)) +r = openssl_encode('aes-128-ctr', new_key, iv) +print('aes_decrypt_text 16') +print(repr(r)) + +password = key + 16 * [0] +new_key = aes_encrypt(password, key_expansion(password)) * (32 // 16) +r = openssl_encode('aes-256-ctr', new_key, iv) +print('aes_decrypt_text 32') +print(repr(r)) diff --git a/devscripts/gh-pages/update-sites.py b/devscripts/gh-pages/update-sites.py index f0f0481c7..503c1372f 100755 --- a/devscripts/gh-pages/update-sites.py +++ b/devscripts/gh-pages/update-sites.py @@ -6,7 +6,7 @@ import os import textwrap # We must be able to import youtube_dl -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) import youtube_dl @@ -16,7 +16,7 @@ def main(): template = tmplf.read() ie_htmls = [] - for ie in sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME.lower()): + for ie in youtube_dl.list_extractors(age_limit=None): ie_html = '{}'.format(ie.IE_NAME) ie_desc = getattr(ie, 'IE_DESC', None) if ie_desc is False: diff --git a/devscripts/make_contributing.py b/devscripts/make_contributing.py new file mode 100755 index 000000000..5e454a429 --- /dev/null +++ b/devscripts/make_contributing.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +import io +import optparse +import re + + +def main(): + parser = optparse.OptionParser(usage='%prog INFILE OUTFILE') + options, args = parser.parse_args() + if len(args) != 2: + parser.error('Expected an input and an output filename') + + infile, outfile = args + + with io.open(infile, encoding='utf-8') as inf: + readme = inf.read() + + bug_text = re.search( + r'(?s)#\s*BUGS\s*[^\n]*\s*(.*?)#\s*COPYRIGHT', readme).group(1) + dev_text = re.search( + r'(?s)(#\s*DEVELOPER INSTRUCTIONS.*?)#\s*EMBEDDING YOUTUBE-DL', + readme).group(1) + + out = bug_text + dev_text + + with io.open(outfile, 'w', encoding='utf-8') as outf: + outf.write(out) + +if __name__ == '__main__': + main() diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py new file mode 100644 index 000000000..8cb4a4638 --- /dev/null +++ b/devscripts/make_supportedsites.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +import io +import optparse +import os +import sys + + +# Import youtube_dl +ROOT_DIR = os.path.join(os.path.dirname(__file__), '..') +sys.path.insert(0, ROOT_DIR) +import youtube_dl + + +def main(): + parser = optparse.OptionParser(usage='%prog OUTFILE.md') + options, args = parser.parse_args() + if len(args) != 1: + parser.error('Expected an output filename') + + outfile, = args + + def gen_ies_md(ies): + for ie in ies: + ie_md = '**{0}**'.format(ie.IE_NAME) + ie_desc = getattr(ie, 'IE_DESC', None) + if ie_desc is False: + continue + if ie_desc is not None: + ie_md += ': {0}'.format(ie.IE_DESC) + if not ie.working(): + ie_md += ' (Currently broken)' + yield ie_md + + ies = sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME.lower()) + out = '# Supported sites\n' + ''.join( + ' - ' + md + '\n' + for md in gen_ies_md(ies)) + + with io.open(outfile, 'w', encoding='utf-8') as outf: + outf.write(out) + +if __name__ == '__main__': + main() diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index f66bebfea..776e6556e 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -8,13 +8,55 @@ import re ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) README_FILE = os.path.join(ROOT_DIR, 'README.md') + +def filter_options(readme): + ret = '' + in_options = False + for line in readme.split('\n'): + if line.startswith('# '): + if line[2:].startswith('OPTIONS'): + in_options = True + else: + in_options = False + + if in_options: + if line.lstrip().startswith('-'): + option, description = re.split(r'\s{2,}', line.lstrip()) + split_option = option.split(' ') + + if not split_option[-1].startswith('-'): # metavar + option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]]) + + # Pandoc's definition_lists. See http://pandoc.org/README.html + # for more information. + ret += '\n%s\n: %s\n' % (option, description) + else: + ret += line.lstrip() + '\n' + else: + ret += line + '\n' + + return ret + with io.open(README_FILE, encoding='utf-8') as f: readme = f.read() -PREFIX = '%YOUTUBE-DL(1)\n\n# NAME\n' -readme = re.sub(r'(?s)# INSTALLATION.*?(?=# DESCRIPTION)', '', readme) +PREFIX = '''%YOUTUBE-DL(1) + +# NAME + +youtube\-dl \- download videos from youtube.com or other video platforms + +# SYNOPSIS + +**youtube-dl** \[OPTIONS\] URL [URL...] + +''' +readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme) +readme = re.sub(r'\s+youtube-dl \[OPTIONS\] URL \[URL\.\.\.\]', '', readme) readme = PREFIX + readme +readme = filter_options(readme) + if sys.version_info < (3, 0): print(readme.encode('utf-8')) else: diff --git a/devscripts/release.sh b/devscripts/release.sh index 691517ceb..61806961c 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -35,7 +35,7 @@ if [ ! -z "$useless_files" ]; then echo "ERROR: Non-.py files in youtube_dl: $us if [ ! -f "updates_key.pem" ]; then echo 'ERROR: updates_key.pem missing'; exit 1; fi /bin/echo -e "\n### First of all, testing..." -make cleanall +make clean if $skip_tests ; then echo 'SKIPPING TESTS' else @@ -45,9 +45,9 @@ fi /bin/echo -e "\n### Changing version in version.py..." sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py -/bin/echo -e "\n### Committing README.md and youtube_dl/version.py..." -make README.md -git add README.md youtube_dl/version.py +/bin/echo -e "\n### Committing documentation and youtube_dl/version.py..." +make README.md CONTRIBUTING.md supportedsites +git add README.md CONTRIBUTING.md docs/supportedsites.md youtube_dl/version.py git commit -m "release $version" /bin/echo -e "\n### Now tagging, signing and pushing..." diff --git a/devscripts/zsh-completion.py b/devscripts/zsh-completion.py index f200f2c80..04728e8e2 100755 --- a/devscripts/zsh-completion.py +++ b/devscripts/zsh-completion.py @@ -5,7 +5,7 @@ import os from os.path import dirname as dirn import sys -sys.path.append(dirn(dirn((os.path.abspath(__file__))))) +sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) import youtube_dl ZSH_COMPLETION_FILE = "youtube-dl.zsh" diff --git a/docs/supportedsites.md b/docs/supportedsites.md new file mode 100644 index 000000000..1a5c7cde9 --- /dev/null +++ b/docs/supportedsites.md @@ -0,0 +1,730 @@ +# Supported sites + - **1tv**: Первый канал + - **1up.com** + - **220.ro** + - **22tracks:genre** + - **22tracks:track** + - **24video** + - **3sat** + - **4tube** + - **56.com** + - **5min** + - **8tracks** + - **91porn** + - **9gag** + - **abc.net.au** + - **Abc7News** + - **AcademicEarth:Course** + - **acast** + - **acast:channel** + - **AddAnime** + - **AdobeTV** + - **AdobeTVChannel** + - **AdobeTVShow** + - **AdobeTVVideo** + - **AdultSwim** + - **Aftonbladet** + - **AirMozilla** + - **AlJazeera** + - **Allocine** + - **AlphaPorno** + - **anitube.se** + - **AnySex** + - **Aparat** + - **AppleConnect** + - **AppleDaily**: 臺灣蘋果日報 + - **appletrailers** + - **appletrailers:section** + - **archive.org**: archive.org videos + - **ARD** + - **ARD:mediathek** + - **arte.tv** + - **arte.tv:+7** + - **arte.tv:concert** + - **arte.tv:creative** + - **arte.tv:ddc** + - **arte.tv:embed** + - **arte.tv:future** + - **AtresPlayer** + - **ATTTechChannel** + - **AudiMedia** + - **audiomack** + - **audiomack:album** + - **Azubu** + - **BaiduVideo**: 百度视频 + - **bambuser** + - **bambuser:channel** + - **Bandcamp** + - **Bandcamp:album** + - **bbc**: BBC + - **bbc.co.uk**: BBC iPlayer + - **bbc.co.uk:article**: BBC articles + - **BeatportPro** + - **Beeg** + - **BehindKink** + - **Bet** + - **Bild**: Bild.de + - **BiliBili** + - **BleacherReport** + - **BleacherReportCMS** + - **blinkx** + - **Bloomberg** + - **Bpb**: Bundeszentrale für politische Bildung + - **BR**: Bayerischer Rundfunk Mediathek + - **Break** + - **brightcove:legacy** + - **brightcove:new** + - **bt:article**: Bergens Tidende Articles + - **bt:vestlendingen**: Bergens Tidende - Vestlendingen + - **BuzzFeed** + - **BYUtv** + - **Camdemy** + - **CamdemyFolder** + - **canalc2.tv** + - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv + - **CBS** + - **CBSNews**: CBS News + - **CBSSports** + - **CeskaTelevize** + - **channel9**: Channel 9 + - **Chaturbate** + - **Chilloutzone** + - **chirbit** + - **chirbit:profile** + - **Cinchcast** + - **Cinemassacre** + - **Clipfish** + - **cliphunter** + - **Clipsyndicate** + - **cloudtime**: CloudTime + - **Cloudy** + - **Clubic** + - **Clyp** + - **cmt.com** + - **CNET** + - **CNN** + - **CNNArticle** + - **CNNBlogs** + - **CollegeHumor** + - **CollegeRama** + - **ComCarCoff** + - **ComedyCentral** + - **ComedyCentralShows**: The Daily Show / The Colbert Report + - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED + - **Cracked** + - **Criterion** + - **CrooksAndLiars** + - **Crunchyroll** + - **crunchyroll:playlist** + - **CSpan**: C-SPAN + - **CtsNews**: 華視新聞 + - **culturebox.francetvinfo.fr** + - **dailymotion** + - **dailymotion:playlist** + - **dailymotion:user** + - **DailymotionCloud** + - **daum.net** + - **DBTV** + - **DCN** + - **DctpTv** + - **DeezerPlaylist** + - **defense.gouv.fr** + - **democracynow** + - **DHM**: Filmarchiv - Deutsches Historisches Museum + - **Discovery** + - **Dotsub** + - **DouyuTV**: 斗鱼 + - **DPlay** + - **dramafever** + - **dramafever:series** + - **DRBonanza** + - **Dropbox** + - **DrTuber** + - **DRTV** + - **Dump** + - **Dumpert** + - **dvtv**: http://video.aktualne.cz/ + - **EaglePlatform** + - **EbaumsWorld** + - **EchoMsk** + - **eHow** + - **Einthusan** + - **eitb.tv** + - **EllenTV** + - **EllenTV:clips** + - **ElPais**: El País + - **Embedly** + - **EMPFlix** + - **Engadget** + - **Eporner** + - **EroProfile** + - **Escapist** + - **ESPN** (Currently broken) + - **EsriVideo** + - **Europa** + - **EveryonesMixtape** + - **exfm**: ex.fm + - **ExpoTV** + - **ExtremeTube** + - **facebook** + - **faz.net** + - **fc2** + - **Fczenit** + - **fernsehkritik.tv** + - **Firstpost** + - **FiveTV** + - **Flickr** + - **Folketinget**: Folketinget (ft.dk; Danish parliament) + - **FootyRoom** + - **Foxgay** + - **FoxNews**: Fox News and Fox Business Video + - **FoxSports** + - **france2.fr:generation-quoi** + - **FranceCulture** + - **FranceInter** + - **francetv**: France 2, 3, 4, 5 and Ô + - **francetvinfo.fr** + - **Freesound** + - **freespeech.org** + - **FreeVideo** + - **Funimation** + - **FunnyOrDie** + - **GameInformer** + - **Gamekings** + - **GameOne** + - **gameone:playlist** + - **Gamersyde** + - **GameSpot** + - **GameStar** + - **Gametrailers** + - **Gazeta** + - **GDCVault** + - **generic**: Generic downloader that works on some sites + - **Gfycat** + - **GiantBomb** + - **Giga** + - **Glide**: Glide mobile video messages (glide.me) + - **Globo** + - **GloboArticle** + - **GodTube** + - **GoldenMoustache** + - **Golem** + - **GoogleDrive** + - **Goshgay** + - **GPUTechConf** + - **Groupon** + - **Hark** + - **HearThisAt** + - **Heise** + - **HellPorno** + - **Helsinki**: helsinki.fi + - **HentaiStigma** + - **HistoricFilms** + - **History** + - **hitbox** + - **hitbox:live** + - **HornBunny** + - **HotNewHipHop** + - **Howcast** + - **HowStuffWorks** + - **HuffPost**: Huffington Post + - **Hypem** + - **Iconosquare** + - **ign.com** + - **imdb**: Internet Movie Database trailers + - **imdb:list**: Internet Movie Database lists + - **Imgur** + - **ImgurAlbum** + - **Ina** + - **Indavideo** + - **IndavideoEmbed** + - **InfoQ** + - **Instagram** + - **instagram:user**: Instagram user profile + - **InternetVideoArchive** + - **IPrima** + - **iqiyi**: 爱奇艺 + - **Ir90Tv** + - **ivi**: ivi.ru + - **ivi:compilation**: ivi.ru compilations + - **Izlesene** + - **JadoreCettePub** + - **JeuxVideo** + - **Jove** + - **jpopsuki.tv** + - **Jukebox** + - **JWPlatform** + - **Kaltura** + - **KanalPlay**: Kanal 5/9/11 Play + - **Kankan** + - **Karaoketv** + - **KarriereVideos** + - **keek** + - **KeezMovies** + - **KhanAcademy** + - **KickStarter** + - **kontrtube**: KontrTube.ru - Труба зовёт + - **KrasView**: Красвью + - **Ku6** + - **kuwo:album**: 酷我音乐 - 专辑 + - **kuwo:category**: 酷我音乐 - 分类 + - **kuwo:chart**: 酷我音乐 - 排行榜 + - **kuwo:mv**: 酷我音乐 - MV + - **kuwo:singer**: 酷我音乐 - 歌手 + - **kuwo:song**: 酷我音乐 + - **la7.tv** + - **Laola1Tv** + - **Lecture2Go** + - **Letv**: 乐视网 + - **LetvPlaylist** + - **LetvTv** + - **Libsyn** + - **life:embed** + - **lifenews**: LIFE | NEWS + - **limelight** + - **limelight:channel** + - **limelight:channel_list** + - **LiveLeak** + - **livestream** + - **livestream:original** + - **LnkGo** + - **lrt.lt** + - **lynda**: lynda.com videos + - **lynda:course**: lynda.com online courses + - **m6** + - **macgamestore**: MacGameStore trailers + - **mailru**: Видео@Mail.Ru + - **MakerTV** + - **Malemotion** + - **MDR**: MDR.DE and KiKA + - **media.ccc.de** + - **metacafe** + - **Metacritic** + - **Mgoon** + - **Minhateca** + - **MinistryGrid** + - **miomio.tv** + - **MiTele**: mitele.es + - **mixcloud** + - **MLB** + - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net + - **Mofosex** + - **Mojvideo** + - **Moniker**: allmyvideos.net and vidspot.net + - **mooshare**: Mooshare.biz + - **Morningstar**: morningstar.com + - **Motherless** + - **Motorsport**: motorsport.com + - **MovieClips** + - **MovieFap** + - **Moviezine** + - **MPORA** + - **MSNBC** + - **MTV** + - **mtv.de** + - **mtviggy.com** + - **mtvservices:embedded** + - **MuenchenTV**: münchen.tv + - **MusicPlayOn** + - **muzu.tv** + - **Mwave** + - **MySpace** + - **MySpace:album** + - **MySpass** + - **Myvi** + - **myvideo** + - **MyVidster** + - **n-tv.de** + - **NationalGeographic** + - **Naver** + - **NBA** + - **NBC** + - **NBCNews** + - **NBCSports** + - **NBCSportsVPlayer** + - **ndr**: NDR.de - Norddeutscher Rundfunk + - **ndr:embed** + - **ndr:embed:base** + - **NDTV** + - **NerdCubedFeed** + - **Nerdist** + - **netease:album**: 网易云音乐 - 专辑 + - **netease:djradio**: 网易云音乐 - 电台 + - **netease:mv**: 网易云音乐 - MV + - **netease:playlist**: 网易云音乐 - 歌单 + - **netease:program**: 网易云音乐 - 电台节目 + - **netease:singer**: 网易云音乐 - 歌手 + - **netease:song**: 网易云音乐 + - **Netzkino** + - **Newgrounds** + - **Newstube** + - **NextMedia**: 蘋果日報 + - **NextMediaActionNews**: 蘋果日報 - 動新聞 + - **nfb**: National Film Board of Canada + - **nfl.com** + - **nhl.com** + - **nhl.com:news**: NHL news + - **nhl.com:videocenter**: NHL videocenter category + - **niconico**: ニコニコ動画 + - **NiconicoPlaylist** + - **njoy**: N-JOY + - **njoy:embed** + - **Noco** + - **Normalboots** + - **NosVideo** + - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz + - **novamov**: NovaMov + - **nowness** + - **nowness:playlist** + - **nowness:series** + - **NowTV** + - **NowTVList** + - **nowvideo**: NowVideo + - **npo**: npo.nl and ntr.nl + - **npo.nl:live** + - **npo.nl:radio** + - **npo.nl:radio:fragment** + - **NRK** + - **NRKPlaylist** + - **NRKTV**: NRK TV and NRK Radio + - **ntv.ru** + - **Nuvid** + - **NYTimes** + - **NYTimesArticle** + - **ocw.mit.edu** + - **Odnoklassniki** + - **OktoberfestTV** + - **on.aol.com** + - **OnionStudios** + - **Ooyala** + - **OoyalaExternal** + - **orf:fm4**: radio FM4 + - **orf:iptv**: iptv.ORF.at + - **orf:oe1**: Radio Österreich 1 + - **orf:tvthek**: ORF TVthek + - **parliamentlive.tv**: UK parliament videos + - **Patreon** + - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) + - **Periscope**: Periscope + - **PhilharmonieDeParis**: Philharmonie de Paris + - **Phoenix** + - **Photobucket** + - **Pinkbike** + - **Pladform** + - **PlanetaPlay** + - **play.fm** + - **played.to** + - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz + - **Playvid** + - **Playwire** + - **pluralsight** + - **pluralsight:course** + - **plus.google**: Google Plus + - **pluzz.francetv.fr** + - **podomatic** + - **PornHd** + - **PornHub** + - **PornHubPlaylist** + - **Pornotube** + - **PornoVoisines** + - **PornoXO** + - **PrimeShareTV** + - **PromptFile** + - **prosiebensat1**: ProSiebenSat.1 Digital + - **Puls4** + - **Pyvideo** + - **qqmusic**: QQ音乐 + - **qqmusic:album**: QQ音乐 - 专辑 + - **qqmusic:playlist**: QQ音乐 - 歌单 + - **qqmusic:singer**: QQ音乐 - 歌手 + - **qqmusic:toplist**: QQ音乐 - 排行榜 + - **QuickVid** + - **R7** + - **radio.de** + - **radiobremen** + - **radiofrance** + - **RadioJavan** + - **Rai** + - **RBMARadio** + - **RDS**: RDS.ca + - **RedTube** + - **Restudy** + - **ReverbNation** + - **RingTV** + - **RottenTomatoes** + - **Roxwel** + - **RTBF** + - **Rte** + - **rtl.nl**: rtl.nl and rtlxl.nl + - **RTL2** + - **RTP** + - **RTS**: RTS.ch + - **rtve.es:alacarta**: RTVE a la carta + - **rtve.es:infantil**: RTVE infantil + - **rtve.es:live**: RTVE.es live streams + - **RTVNH** + - **RUHD** + - **rutube**: Rutube videos + - **rutube:channel**: Rutube channels + - **rutube:embed**: Rutube embedded videos + - **rutube:movie**: Rutube movies + - **rutube:person**: Rutube person videos + - **RUTV**: RUTV.RU + - **Ruutu** + - **safari**: safaribooksonline.com online video + - **safari:course**: safaribooksonline.com online courses + - **Sandia**: Sandia National Laboratories + - **Sapo**: SAPO Vídeos + - **savefrom.net** + - **SBS**: sbs.com.au + - **SciVee** + - **screen.yahoo:search**: Yahoo screen search + - **Screencast** + - **ScreencastOMatic** + - **ScreenwaveMedia** + - **SenateISVP** + - **ServingSys** + - **Sexu** + - **SexyKarma**: Sexy Karma and Watch Indian Porn + - **Shahid** + - **Shared**: shared.sx and vivo.sx + - **ShareSix** + - **Sina** + - **skynewsarabia:video** + - **skynewsarabia:video** + - **Slideshare** + - **Slutload** + - **smotri**: Smotri.com + - **smotri:broadcast**: Smotri.com broadcasts + - **smotri:community**: Smotri.com community videos + - **smotri:user**: Smotri.com user videos + - **SnagFilms** + - **SnagFilmsEmbed** + - **Snotr** + - **Sohu** + - **soundcloud** + - **soundcloud:playlist** + - **soundcloud:search**: Soundcloud search + - **soundcloud:set** + - **soundcloud:user** + - **soundgasm** + - **soundgasm:profile** + - **southpark.cc.com** + - **southpark.cc.com:español** + - **southpark.de** + - **southpark.nl** + - **southparkstudios.dk** + - **Space** + - **SpankBang** + - **Spankwire** + - **Spiegel** + - **Spiegel:Article**: Articles on spiegel.de + - **Spiegeltv** + - **Spike** + - **Sport5** + - **SportBox** + - **SportBoxEmbed** + - **SportDeutschland** + - **Sportschau** + - **Srf** + - **SRMediathek**: Saarländischer Rundfunk + - **SSA** + - **stanfordoc**: Stanford Open ClassRoom + - **Steam** + - **Stitcher** + - **streamcloud.eu** + - **StreamCZ** + - **StreetVoice** + - **SunPorno** + - **SVT** + - **SVTPlay**: SVT Play and Öppet arkiv + - **SWRMediathek** + - **Syfy** + - **SztvHu** + - **Tagesschau** + - **Tapely** + - **Tass** + - **teachertube**: teachertube.com videos + - **teachertube:user:collection**: teachertube.com user and collection videos + - **TeachingChannel** + - **Teamcoco** + - **TeamFour** + - **TechTalks** + - **techtv.mit.edu** + - **ted** + - **Tele13** + - **TeleBruxelles** + - **Telecinco**: telecinco.es, cuatro.com and mediaset.es + - **Telegraaf** + - **TeleMB** + - **TeleTask** + - **TenPlay** + - **TestTube** + - **TF1** + - **TheOnion** + - **ThePlatform** + - **ThePlatformFeed** + - **TheSixtyOne** + - **ThisAmericanLife** + - **ThisAV** + - **THVideo** + - **THVideoPlaylist** + - **tinypic**: tinypic.com videos + - **tlc.com** + - **tlc.de** + - **TMZ** + - **TMZArticle** + - **TNAFlix** + - **toggle** + - **tou.tv** + - **Toypics**: Toypics user profile + - **ToypicsUser**: Toypics user profile + - **TrailerAddict** (Currently broken) + - **Trilulilu** + - **TruTube** + - **Tube8** + - **TubiTv** + - **Tudou** + - **Tumblr** + - **TuneIn** + - **Turbo** + - **Tutv** + - **tv.dfb.de** + - **TV2** + - **TV2Article** + - **TV4**: tv4.se and tv4play.se + - **TVC** + - **TVCArticle** + - **tvigle**: Интернет-телевидение Tvigle.ru + - **tvp.pl** + - **tvp.pl:Series** + - **TVPlay**: TV3Play and related services + - **Tweakers** + - **twitch:bookmarks** + - **twitch:chapter** + - **twitch:past_broadcasts** + - **twitch:profile** + - **twitch:stream** + - **twitch:video** + - **twitch:vod** + - **twitter** + - **twitter:card** + - **Ubu** + - **udemy** + - **udemy:course** + - **UDNEmbed**: 聯合影音 + - **Ultimedia** + - **Unistra** + - **Urort**: NRK P3 Urørt + - **ustream** + - **ustream:channel** + - **Varzesh3** + - **Vbox7** + - **VeeHD** + - **Veoh** + - **Vessel** + - **Vesti**: Вести.Ru + - **Vevo** + - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet + - **vh1.com** + - **Vice** + - **Viddler** + - **video.google:search**: Google Video search + - **video.mit.edu** + - **VideoDetective** + - **videofy.me** + - **VideoMega** + - **VideoPremium** + - **VideoTt**: video.tt - Your True Tube + - **videoweed**: VideoWeed + - **Vidme** + - **Vidzi** + - **vier** + - **vier:videos** + - **Viewster** + - **Viidea** + - **viki** + - **viki:channel** + - **vimeo** + - **vimeo:album** + - **vimeo:channel** + - **vimeo:group** + - **vimeo:likes**: Vimeo user likes + - **vimeo:review**: Review pages on vimeo + - **vimeo:user** + - **vimeo:watchlater**: Vimeo watch later list, "vimeowatchlater" keyword (requires authentication) + - **Vimple**: Vimple - one-click video hosting + - **Vine** + - **vine:user** + - **vk**: VK + - **vk:uservideos**: VK - User's Videos + - **vlive** + - **Vodlocker** + - **VoiceRepublic** + - **Vporn** + - **vpro**: npo.nl and ntr.nl + - **VRT** + - **vube**: Vube.com + - **VuClip** + - **vulture.com** + - **Walla** + - **WashingtonPost** + - **wat.tv** + - **WayOfTheMaster** + - **WDR** + - **wdr:mobile** + - **WDRMaus**: Sendung mit der Maus + - **WebOfStories** + - **WebOfStoriesPlaylist** + - **Weibo** + - **wholecloud**: WholeCloud + - **Wimp** + - **Wistia** + - **WNL** + - **WorldStarHipHop** + - **wrzuta.pl** + - **WSJ**: Wall Street Journal + - **XBef** + - **XboxClips** + - **XFileShare**: XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me + - **XHamster** + - **XHamsterEmbed** + - **XMinus** + - **XNXX** + - **Xstream** + - **XTube** + - **XTubeUser**: XTube user profile + - **Xuite**: 隨意窩Xuite影音 + - **XVideos** + - **XXXYMovies** + - **Yahoo**: Yahoo screen and movies + - **Yam**: 蕃薯藤yam天空部落 + - **yandexmusic:album**: Яндекс.Музыка - Альбом + - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист + - **yandexmusic:track**: Яндекс.Музыка - Трек + - **YesJapan** + - **yinyuetai:video**: 音悦Tai + - **Ynet** + - **YouJizz** + - **youku**: 优酷 + - **YouPorn** + - **YourUpload** + - **youtube**: YouTube.com + - **youtube:channel**: YouTube.com channels + - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) + - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) + - **youtube:playlist**: YouTube.com playlists + - **youtube:playlists**: YouTube.com user/channel playlists + - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) + - **youtube:search**: YouTube.com searches + - **youtube:search:date**: YouTube.com searches, newest videos first + - **youtube:search_url**: YouTube.com search URLs + - **youtube:show**: YouTube.com (multi-season) shows + - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) + - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) + - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **Zapiks** + - **ZDF** + - **ZDFChannel** + - **zingmp3:album**: mp3.zing.vn albums + - **zingmp3:song**: mp3.zing.vn songs diff --git a/setup.cfg b/setup.cfg index e57d130e3..26857750c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,6 @@ [wheel] universal = True + +[flake8] +exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,setup.py,build,.git +ignore = E402,E501,E731 diff --git a/setup.py b/setup.py index 4686260e0..bfe931f5b 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ py2exe_options = { "compressed": 1, "optimize": 2, "dist_dir": '.', - "dll_excludes": ['w9xpopen.exe'], + "dll_excludes": ['w9xpopen.exe', 'crypt32.dll'], } py2exe_console = [{ diff --git a/test/helper.py b/test/helper.py index 9a7f0746e..bdd7acca4 100644 --- a/test/helper.py +++ b/test/helper.py @@ -82,51 +82,90 @@ class FakeYDL(YoutubeDL): def gettestcases(include_onlymatching=False): for ie in youtube_dl.extractor.gen_extractors(): - t = getattr(ie, '_TEST', None) - if t: - assert not hasattr(ie, '_TESTS'), \ - '%s has _TEST and _TESTS' % type(ie).__name__ - tests = [t] - else: - tests = getattr(ie, '_TESTS', []) - for t in tests: - if not include_onlymatching and t.get('only_matching', False): - continue - t['name'] = type(ie).__name__[:-len('IE')] - yield t + for tc in ie.get_testcases(include_onlymatching): + yield tc md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() -def expect_info_dict(self, expected_dict, got_dict): +def expect_value(self, got, expected, field): + if isinstance(expected, compat_str) and expected.startswith('re:'): + match_str = expected[len('re:'):] + match_rex = re.compile(match_str) + + self.assertTrue( + isinstance(got, compat_str), + 'Expected a %s object, but got %s for field %s' % ( + compat_str.__name__, type(got).__name__, field)) + self.assertTrue( + match_rex.match(got), + 'field %s (value: %r) should match %r' % (field, got, match_str)) + elif isinstance(expected, compat_str) and expected.startswith('startswith:'): + start_str = expected[len('startswith:'):] + self.assertTrue( + isinstance(got, compat_str), + 'Expected a %s object, but got %s for field %s' % ( + compat_str.__name__, type(got).__name__, field)) + self.assertTrue( + got.startswith(start_str), + 'field %s (value: %r) should start with %r' % (field, got, start_str)) + elif isinstance(expected, compat_str) and expected.startswith('contains:'): + contains_str = expected[len('contains:'):] + self.assertTrue( + isinstance(got, compat_str), + 'Expected a %s object, but got %s for field %s' % ( + compat_str.__name__, type(got).__name__, field)) + self.assertTrue( + contains_str in got, + 'field %s (value: %r) should contain %r' % (field, got, contains_str)) + elif isinstance(expected, type): + self.assertTrue( + isinstance(got, expected), + 'Expected type %r for field %s, but got value %r of type %r' % (expected, field, got, type(got))) + elif isinstance(expected, dict) and isinstance(got, dict): + expect_dict(self, got, expected) + elif isinstance(expected, list) and isinstance(got, list): + self.assertEqual( + len(expected), len(got), + 'Expect a list of length %d, but got a list of length %d for field %s' % ( + len(expected), len(got), field)) + for index, (item_got, item_expected) in enumerate(zip(got, expected)): + type_got = type(item_got) + type_expected = type(item_expected) + self.assertEqual( + type_expected, type_got, + 'Type mismatch for list item at index %d for field %s, expected %r, got %r' % ( + index, field, type_expected, type_got)) + expect_value(self, item_got, item_expected, field) + else: + if isinstance(expected, compat_str) and expected.startswith('md5:'): + got = 'md5:' + md5(got) + elif isinstance(expected, compat_str) and expected.startswith('mincount:'): + self.assertTrue( + isinstance(got, (list, dict)), + 'Expected field %s to be a list or a dict, but it is of type %s' % ( + field, type(got).__name__)) + expected_num = int(expected.partition(':')[2]) + assertGreaterEqual( + self, len(got), expected_num, + 'Expected %d items in field %s, but only got %d' % (expected_num, field, len(got))) + return + self.assertEqual( + expected, got, + 'Invalid value for field %s, expected %r, got %r' % (field, expected, got)) + + +def expect_dict(self, got_dict, expected_dict): for info_field, expected in expected_dict.items(): - if isinstance(expected, compat_str) and expected.startswith('re:'): - got = got_dict.get(info_field) - match_str = expected[len('re:'):] - match_rex = re.compile(match_str) + got = got_dict.get(info_field) + expect_value(self, got, expected, info_field) - self.assertTrue( - isinstance(got, compat_str), - 'Expected a %s object, but got %s for field %s' % ( - compat_str.__name__, type(got).__name__, info_field)) - self.assertTrue( - match_rex.match(got), - 'field %s (value: %r) should match %r' % (info_field, got, match_str)) - elif isinstance(expected, type): - got = got_dict.get(info_field) - self.assertTrue(isinstance(got, expected), - 'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got))) - else: - if isinstance(expected, compat_str) and expected.startswith('md5:'): - got = 'md5:' + md5(got_dict.get(info_field)) - else: - got = got_dict.get(info_field) - self.assertEqual(expected, got, - 'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) +def expect_info_dict(self, got_dict, expected_dict): + expect_dict(self, got_dict, expected_dict) # Check for the presence of mandatory fields - if got_dict.get('_type') != 'playlist': + if got_dict.get('_type') not in ('playlist', 'multi_video'): for key in ('id', 'url', 'title', 'ext'): self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key) # Check for mandatory fields that are automatically set by YoutubeDL @@ -136,7 +175,7 @@ def expect_info_dict(self, expected_dict, got_dict): # Are checkable fields missing from the test case definition? test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) for key, value in got_dict.items() - if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location')) + if value and key in ('id', 'title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location', 'age_limit')) missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys()) if missing_keys: def _repr(v): @@ -144,11 +183,19 @@ def expect_info_dict(self, expected_dict, got_dict): return "'%s'" % v.replace('\\', '\\\\').replace("'", "\\'").replace('\n', '\\n') else: return repr(v) - info_dict_str = ''.join( - ' %s: %s,\n' % (_repr(k), _repr(v)) - for k, v in test_info_dict.items()) + info_dict_str = '' + if len(missing_keys) != len(expected_dict): + info_dict_str += ''.join( + ' %s: %s,\n' % (_repr(k), _repr(v)) + for k, v in test_info_dict.items() if k not in missing_keys) + + if info_dict_str: + info_dict_str += '\n' + info_dict_str += ''.join( + ' %s: %s,\n' % (_repr(k), _repr(test_info_dict[k])) + for k in missing_keys) write_string( - '\n\'info_dict\': {\n' + info_dict_str + '}\n', out=sys.stderr) + '\n\'info_dict\': {\n' + info_dict_str + '},\n', out=sys.stderr) self.assertFalse( missing_keys, 'Missing keys in test definition: %s' % ( @@ -161,7 +208,9 @@ def assertRegexpMatches(self, text, regexp, msg=None): else: m = re.match(regexp, text) if not m: - note = 'Regexp didn\'t match: %r not found in %r' % (regexp, text) + note = 'Regexp didn\'t match: %r not found' % (regexp) + if len(text) < 1000: + note += ' in %r' % text if msg is None: msg = note else: diff --git a/test/parameters.json b/test/parameters.json index 098cd0cd0..7bf59c25f 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -7,8 +7,7 @@ "forcethumbnail": false, "forcetitle": false, "forceurl": false, - "format": null, - "format_limit": null, + "format": "best", "ignoreerrors": false, "listformats": null, "logtostderr": false, @@ -28,7 +27,7 @@ "retries": 10, "simulate": false, "subtitleslang": null, - "subtitlesformat": "srt", + "subtitlesformat": "best", "test": true, "updatetime": true, "usenetrc": false, @@ -39,5 +38,6 @@ "writesubtitles": false, "allsubtitles": false, "listssubtitles": false, - "socket_timeout": 20 + "socket_timeout": 20, + "fixup": "never" } diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 13c18ed95..938466a80 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -35,10 +35,36 @@ class TestInfoExtractor(unittest.TestCase): + + + + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') + self.assertEqual(ie._og_search_video_url(html, default=None), None) + self.assertEqual(ie._og_search_property('foobar', html), 'Foo') + self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar') + self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar') + + def test_html_search_meta(self): + ie = self.ie + html = ''' + + + + + + + ''' + + self.assertEqual(ie._html_search_meta('a', html), '1') + self.assertEqual(ie._html_search_meta('b', html), '2') + self.assertEqual(ie._html_search_meta('c', html), '3') + self.assertEqual(ie._html_search_meta('d', html), '4') + self.assertEqual(ie._html_search_meta('e', html), '5') + self.assertEqual(ie._html_search_meta('f', html), '6') if __name__ == '__main__': unittest.main() diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index f8e4f930e..0388c0bf3 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -8,9 +8,16 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import copy + from test.helper import FakeYDL, assertRegexpMatches from youtube_dl import YoutubeDL +from youtube_dl.compat import compat_str from youtube_dl.extractor import YoutubeIE +from youtube_dl.postprocessor.common import PostProcessor +from youtube_dl.utils import ExtractorError, match_filter_func + +TEST_URL = 'http://localhost/sample.mp4' class YDL(FakeYDL): @@ -43,8 +50,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {'ext': 'webm', 'height': 460, 'url': 'x'}, - {'ext': 'mp4', 'height': 460, 'url': 'y'}, + {'ext': 'webm', 'height': 460, 'url': TEST_URL}, + {'ext': 'mp4', 'height': 460, 'url': TEST_URL}, ] info_dict = _make_result(formats) yie = YoutubeIE(ydl) @@ -57,8 +64,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {'ext': 'webm', 'height': 720, 'url': 'a'}, - {'ext': 'mp4', 'height': 1080, 'url': 'b'}, + {'ext': 'webm', 'height': 720, 'url': TEST_URL}, + {'ext': 'mp4', 'height': 1080, 'url': TEST_URL}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -71,9 +78,9 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {'ext': 'webm', 'height': 720, 'url': '_'}, - {'ext': 'mp4', 'height': 720, 'url': '_'}, - {'ext': 'flv', 'height': 720, 'url': '_'}, + {'ext': 'webm', 'height': 720, 'url': TEST_URL}, + {'ext': 'mp4', 'height': 720, 'url': TEST_URL}, + {'ext': 'flv', 'height': 720, 'url': TEST_URL}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -85,8 +92,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {'ext': 'flv', 'height': 720, 'url': '_'}, - {'ext': 'webm', 'height': 720, 'url': '_'}, + {'ext': 'flv', 'height': 720, 'url': TEST_URL}, + {'ext': 'webm', 'height': 720, 'url': TEST_URL}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -95,45 +102,13 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['ext'], 'flv') - def test_format_limit(self): - formats = [ - {'format_id': 'meh', 'url': 'http://example.com/meh', 'preference': 1}, - {'format_id': 'good', 'url': 'http://example.com/good', 'preference': 2}, - {'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3}, - {'format_id': 'excellent', 'url': 'http://example.com/exc', 'preference': 4}, - ] - info_dict = _make_result(formats) - - ydl = YDL() - ydl.process_ie_result(info_dict) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], 'excellent') - - ydl = YDL({'format_limit': 'good'}) - assert ydl.params['format_limit'] == 'good' - ydl.process_ie_result(info_dict.copy()) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], 'good') - - ydl = YDL({'format_limit': 'great', 'format': 'all'}) - ydl.process_ie_result(info_dict.copy()) - self.assertEqual(ydl.downloaded_info_dicts[0]['format_id'], 'meh') - self.assertEqual(ydl.downloaded_info_dicts[1]['format_id'], 'good') - self.assertEqual(ydl.downloaded_info_dicts[2]['format_id'], 'great') - self.assertTrue('3' in ydl.msgs[0]) - - ydl = YDL() - ydl.params['format_limit'] = 'excellent' - ydl.process_ie_result(info_dict.copy()) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], 'excellent') - def test_format_selection(self): formats = [ - {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': '_'}, - {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': '_'}, - {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': '_'}, - {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': '_'}, + {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, + {'format_id': 'example-with-dashes', 'ext': 'webm', 'preference': 1, 'url': TEST_URL}, + {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': TEST_URL}, + {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': TEST_URL}, + {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': TEST_URL}, ] info_dict = _make_result(formats) @@ -162,12 +137,17 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '35') + ydl = YDL({'format': 'example-with-dashes'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'example-with-dashes') + def test_format_selection_audio(self): formats = [ - {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': '_'}, - {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': '_'}, - {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': '_'}, - {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': '_'}, + {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL}, + {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': TEST_URL}, + {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': TEST_URL}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': TEST_URL}, ] info_dict = _make_result(formats) @@ -182,8 +162,8 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], 'audio-low') formats = [ - {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': '_'}, - {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': '_'}, + {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, + {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': TEST_URL}, ] info_dict = _make_result(formats) @@ -192,11 +172,42 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'vid-high') + def test_format_selection_audio_exts(self): + formats = [ + {'format_id': 'mp3-64', 'ext': 'mp3', 'abr': 64, 'url': 'http://_', 'vcodec': 'none'}, + {'format_id': 'ogg-64', 'ext': 'ogg', 'abr': 64, 'url': 'http://_', 'vcodec': 'none'}, + {'format_id': 'aac-64', 'ext': 'aac', 'abr': 64, 'url': 'http://_', 'vcodec': 'none'}, + {'format_id': 'mp3-32', 'ext': 'mp3', 'abr': 32, 'url': 'http://_', 'vcodec': 'none'}, + {'format_id': 'aac-32', 'ext': 'aac', 'abr': 32, 'url': 'http://_', 'vcodec': 'none'}, + ] + + info_dict = _make_result(formats) + ydl = YDL({'format': 'best'}) + ie = YoutubeIE(ydl) + ie._sort_formats(info_dict['formats']) + ydl.process_ie_result(copy.deepcopy(info_dict)) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'aac-64') + + ydl = YDL({'format': 'mp3'}) + ie = YoutubeIE(ydl) + ie._sort_formats(info_dict['formats']) + ydl.process_ie_result(copy.deepcopy(info_dict)) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'mp3-64') + + ydl = YDL({'prefer_free_formats': True}) + ie = YoutubeIE(ydl) + ie._sort_formats(info_dict['formats']) + ydl.process_ie_result(copy.deepcopy(info_dict)) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'ogg-64') + def test_format_selection_video(self): formats = [ - {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'}, - {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': '_'}, - {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': '_'}, + {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': TEST_URL}, ] info_dict = _make_result(formats) @@ -218,35 +229,223 @@ class TestFormatSelection(unittest.TestCase): # 3D '85', '84', '102', '83', '101', '82', '100', # Dash video - '138', '137', '248', '136', '247', '135', '246', + '137', '248', '136', '247', '135', '246', '245', '244', '134', '243', '133', '242', '160', # Dash audio '141', '172', '140', '171', '139', ] - for f1id, f2id in zip(order, order[1:]): - f1 = YoutubeIE._formats[f1id].copy() - f1['format_id'] = f1id - f1['url'] = 'url:' + f1id - f2 = YoutubeIE._formats[f2id].copy() - f2['format_id'] = f2id - f2['url'] = 'url:' + f2id + def format_info(f_id): + info = YoutubeIE._formats[f_id].copy() + info['format_id'] = f_id + info['url'] = 'url:' + f_id + return info + formats_order = [format_info(f_id) for f_id in order] + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], '137+141') + self.assertEqual(downloaded['ext'], 'mp4') + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo[height>=999999]+bestaudio/best'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], '38') + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo/best,bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['137', '141']) + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['137+141', '248+141']) + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])[height<=720]+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['136+141', '247+141']) + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=none]/bestvideo[ext=webm])+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['248+141']) + + for f1, f2 in zip(formats_order, formats_order[1:]): info_dict = _make_result([f1, f2], extractor='youtube') - ydl = YDL() + ydl = YDL({'format': 'best/bestvideo'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], f1id) + self.assertEqual(downloaded['format_id'], f1['format_id']) info_dict = _make_result([f2, f1], extractor='youtube') - ydl = YDL() + ydl = YDL({'format': 'best/bestvideo'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], f1id) + self.assertEqual(downloaded['format_id'], f1['format_id']) + + def test_invalid_format_specs(self): + def assert_syntax_error(format_spec): + ydl = YDL({'format': format_spec}) + info_dict = _make_result([{'format_id': 'foo', 'url': TEST_URL}]) + self.assertRaises(SyntaxError, ydl.process_ie_result, info_dict) + + assert_syntax_error('bestvideo,,best') + assert_syntax_error('+bestaudio') + assert_syntax_error('bestvideo+') + assert_syntax_error('/') + + def test_format_filtering(self): + formats = [ + {'format_id': 'A', 'filesize': 500, 'width': 1000}, + {'format_id': 'B', 'filesize': 1000, 'width': 500}, + {'format_id': 'C', 'filesize': 1000, 'width': 400}, + {'format_id': 'D', 'filesize': 2000, 'width': 600}, + {'format_id': 'E', 'filesize': 3000}, + {'format_id': 'F'}, + {'format_id': 'G', 'filesize': 1000000}, + ] + for f in formats: + f['url'] = 'http://_/' + f['ext'] = 'unknown' + info_dict = _make_result(formats) + + ydl = YDL({'format': 'best[filesize<3000]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'D') + + ydl = YDL({'format': 'best[filesize<=3000]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'E') + + ydl = YDL({'format': 'best[filesize <= ? 3000]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'F') + + ydl = YDL({'format': 'best [filesize = 1000] [width>450]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'B') + + ydl = YDL({'format': 'best [filesize = 1000] [width!=450]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'C') + + ydl = YDL({'format': '[filesize>?1]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'G') + + ydl = YDL({'format': '[filesize<1M]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'E') + + ydl = YDL({'format': '[filesize<1MiB]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'G') + + ydl = YDL({'format': 'all[width>=400][width<=600]'}) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['B', 'C', 'D']) + + ydl = YDL({'format': 'best[height<40]'}) + try: + ydl.process_ie_result(info_dict) + except ExtractorError: + pass + self.assertEqual(ydl.downloaded_info_dicts, []) + + +class TestYoutubeDL(unittest.TestCase): + def test_subtitles(self): + def s_formats(lang, autocaption=False): + return [{ + 'ext': ext, + 'url': 'http://localhost/video.%s.%s' % (lang, ext), + '_auto': autocaption, + } for ext in ['vtt', 'srt', 'ass']] + subtitles = dict((l, s_formats(l)) for l in ['en', 'fr', 'es']) + auto_captions = dict((l, s_formats(l, True)) for l in ['it', 'pt', 'es']) + info_dict = { + 'id': 'test', + 'title': 'Test', + 'url': 'http://localhost/video.mp4', + 'subtitles': subtitles, + 'automatic_captions': auto_captions, + 'extractor': 'TEST', + } + + def get_info(params={}): + params.setdefault('simulate', True) + ydl = YDL(params) + ydl.report_warning = lambda *args, **kargs: None + return ydl.process_video_result(info_dict, download=False) + + result = get_info() + self.assertFalse(result.get('requested_subtitles')) + self.assertEqual(result['subtitles'], subtitles) + self.assertEqual(result['automatic_captions'], auto_captions) + + result = get_info({'writesubtitles': True}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['en'])) + self.assertTrue(subs['en'].get('data') is None) + self.assertEqual(subs['en']['ext'], 'ass') + + result = get_info({'writesubtitles': True, 'subtitlesformat': 'foo/srt'}) + subs = result['requested_subtitles'] + self.assertEqual(subs['en']['ext'], 'srt') + + result = get_info({'writesubtitles': True, 'subtitleslangs': ['es', 'fr', 'it']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'fr'])) + + result = get_info({'writesubtitles': True, 'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'pt'])) + self.assertFalse(subs['es']['_auto']) + self.assertTrue(subs['pt']['_auto']) + + result = get_info({'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'pt'])) + self.assertTrue(subs['es']['_auto']) + self.assertTrue(subs['pt']['_auto']) def test_add_extra_info(self): test_dict = { @@ -282,5 +481,156 @@ class TestFormatSelection(unittest.TestCase): 'vbr': 10, }), '^\s*10k$') + def test_postprocessors(self): + filename = 'post-processor-testfile.mp4' + audiofile = filename + '.mp3' + + class SimplePP(PostProcessor): + def run(self, info): + with open(audiofile, 'wt') as f: + f.write('EXAMPLE') + return [info['filepath']], info + + def run_pp(params, PP): + with open(filename, 'wt') as f: + f.write('EXAMPLE') + ydl = YoutubeDL(params) + ydl.add_post_processor(PP()) + ydl.post_process(filename, {'filepath': filename}) + + run_pp({'keepvideo': True}, SimplePP) + self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename) + self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile) + os.unlink(filename) + os.unlink(audiofile) + + run_pp({'keepvideo': False}, SimplePP) + self.assertFalse(os.path.exists(filename), '%s exists' % filename) + self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile) + os.unlink(audiofile) + + class ModifierPP(PostProcessor): + def run(self, info): + with open(info['filepath'], 'wt') as f: + f.write('MODIFIED') + return [], info + + run_pp({'keepvideo': False}, ModifierPP) + self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename) + os.unlink(filename) + + def test_match_filter(self): + class FilterYDL(YDL): + def __init__(self, *args, **kwargs): + super(FilterYDL, self).__init__(*args, **kwargs) + self.params['simulate'] = True + + def process_info(self, info_dict): + super(YDL, self).process_info(info_dict) + + def _match_entry(self, info_dict, incomplete): + res = super(FilterYDL, self)._match_entry(info_dict, incomplete) + if res is None: + self.downloaded_info_dicts.append(info_dict) + return res + + first = { + 'id': '1', + 'url': TEST_URL, + 'title': 'one', + 'extractor': 'TEST', + 'duration': 30, + 'filesize': 10 * 1024, + } + second = { + 'id': '2', + 'url': TEST_URL, + 'title': 'two', + 'extractor': 'TEST', + 'duration': 10, + 'description': 'foo', + 'filesize': 5 * 1024, + } + videos = [first, second] + + def get_videos(filter_=None): + ydl = FilterYDL({'match_filter': filter_}) + for v in videos: + ydl.process_ie_result(v, download=True) + return [v['id'] for v in ydl.downloaded_info_dicts] + + res = get_videos() + self.assertEqual(res, ['1', '2']) + + def f(v): + if v['id'] == '1': + return None + else: + return 'Video id is not 1' + res = get_videos(f) + self.assertEqual(res, ['1']) + + f = match_filter_func('duration < 30') + res = get_videos(f) + self.assertEqual(res, ['2']) + + f = match_filter_func('description = foo') + res = get_videos(f) + self.assertEqual(res, ['2']) + + f = match_filter_func('description =? foo') + res = get_videos(f) + self.assertEqual(res, ['1', '2']) + + f = match_filter_func('filesize > 5KiB') + res = get_videos(f) + self.assertEqual(res, ['1']) + + def test_playlist_items_selection(self): + entries = [{ + 'id': compat_str(i), + 'title': compat_str(i), + 'url': TEST_URL, + } for i in range(1, 5)] + playlist = { + '_type': 'playlist', + 'id': 'test', + 'entries': entries, + 'extractor': 'test:playlist', + 'extractor_key': 'test:playlist', + 'webpage_url': 'http://example.com', + } + + def get_ids(params): + ydl = YDL(params) + # make a copy because the dictionary can be modified + ydl.process_ie_result(playlist.copy()) + return [int(v['id']) for v in ydl.downloaded_info_dicts] + + result = get_ids({}) + self.assertEqual(result, [1, 2, 3, 4]) + + result = get_ids({'playlistend': 10}) + self.assertEqual(result, [1, 2, 3, 4]) + + result = get_ids({'playlistend': 2}) + self.assertEqual(result, [1, 2]) + + result = get_ids({'playliststart': 10}) + self.assertEqual(result, []) + + result = get_ids({'playliststart': 2}) + self.assertEqual(result, [2, 3, 4]) + + result = get_ids({'playlist_items': '2-4'}) + self.assertEqual(result, [2, 3, 4]) + + result = get_ids({'playlist_items': '2,4'}) + self.assertEqual(result, [2, 4]) + + result = get_ids({'playlist_items': '10'}) + self.assertEqual(result, []) + + if __name__ == '__main__': unittest.main() diff --git a/test/test_aes.py b/test/test_aes.py new file mode 100644 index 000000000..315a3f5ae --- /dev/null +++ b/test/test_aes.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_decrypt_text +from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes +import base64 + +# the encrypted data can be generate with 'devscripts/generate_aes_testdata.py' + + +class TestAES(unittest.TestCase): + def setUp(self): + self.key = self.iv = [0x20, 0x15] + 14 * [0] + self.secret_msg = b'Secret message goes here' + + def test_encrypt(self): + msg = b'message' + key = list(range(16)) + encrypted = aes_encrypt(bytes_to_intlist(msg), key) + decrypted = intlist_to_bytes(aes_decrypt(encrypted, key)) + self.assertEqual(decrypted, msg) + + def test_cbc_decrypt(self): + data = bytes_to_intlist( + b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd" + ) + decrypted = intlist_to_bytes(aes_cbc_decrypt(data, self.key, self.iv)) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + + def test_decrypt_text(self): + password = intlist_to_bytes(self.key).decode('utf-8') + encrypted = base64.b64encode( + intlist_to_bytes(self.iv[:8]) + + b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' + ).decode('utf-8') + decrypted = (aes_decrypt_text(encrypted, password, 16)) + self.assertEqual(decrypted, self.secret_msg) + + password = intlist_to_bytes(self.key).decode('utf-8') + encrypted = base64.b64encode( + intlist_to_bytes(self.iv[:8]) + + b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83' + ).decode('utf-8') + decrypted = (aes_decrypt_text(encrypted, password, 32)) + self.assertEqual(decrypted, self.secret_msg) + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index 5be065c43..6f5513faa 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -45,11 +45,6 @@ class TestAgeRestriction(unittest.TestCase): 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', '505835.mp4', 2, old_age=25) - def test_pornotube(self): - self._assert_restricted( - 'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing', - '1689755.flv', 13) - if __name__ == '__main__': unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index bd4fe17bf..a0c11e6c1 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -14,7 +14,6 @@ from test.helper import gettestcases from youtube_dl.extractor import ( FacebookIE, gen_extractors, - TwitchIE, YoutubeIE, ) @@ -60,7 +59,7 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user']) def test_youtube_feeds(self): - self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watch_later']) + self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites']) @@ -72,18 +71,6 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) - def test_twitch_channelid_matching(self): - self.assertTrue(TwitchIE.suitable('twitch.tv/vanillatv')) - self.assertTrue(TwitchIE.suitable('www.twitch.tv/vanillatv')) - self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/vanillatv')) - self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/vanillatv/')) - - def test_twitch_videoid_matching(self): - self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/vanillatv/b/328087483')) - - def test_twitch_chapterid_matching(self): - self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/tsm_theoddone/c/2349361')) - def test_youtube_extract(self): assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') @@ -115,15 +102,13 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch(':ythistory', ['youtube:history']) self.assertMatch(':thedailyshow', ['ComedyCentralShows']) self.assertMatch(':tds', ['ComedyCentralShows']) - self.assertMatch(':colbertreport', ['ComedyCentralShows']) - self.assertMatch(':cr', ['ComedyCentralShows']) def test_vimeo_matching(self): - self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel']) - self.assertMatch('http://vimeo.com/channels/31259', ['vimeo:channel']) - self.assertMatch('http://vimeo.com/channels/31259/53576664', ['vimeo']) - self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user']) - self.assertMatch('http://vimeo.com/user7108434/videos', ['vimeo:user']) + self.assertMatch('https://vimeo.com/channels/tributes', ['vimeo:channel']) + self.assertMatch('https://vimeo.com/channels/31259', ['vimeo:channel']) + self.assertMatch('https://vimeo.com/channels/31259/53576664', ['vimeo']) + self.assertMatch('https://vimeo.com/user7108434', ['vimeo:user']) + self.assertMatch('https://vimeo.com/user7108434/videos', ['vimeo:user']) self.assertMatch('https://vimeo.com/user21297594/review/75524534/3c257a1b5d', ['vimeo:review']) # https://github.com/rg3/youtube-dl/issues/1930 @@ -136,8 +121,8 @@ class TestAllURLsMatching(unittest.TestCase): def test_pbs(self): # https://github.com/rg3/youtube-dl/issues/2350 - self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS']) - self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS']) + self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['pbs']) + self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['pbs']) def test_yahoo_https(self): # https://github.com/rg3/youtube-dl/issues/2701 diff --git a/test/test_compat.py b/test/test_compat.py index 1eb454e06..b6bfad05e 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -13,7 +13,12 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, + compat_etree_fromstring, compat_expanduser, + compat_shlex_split, + compat_str, + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, ) @@ -42,5 +47,46 @@ class TestCompat(unittest.TestCase): dir(youtube_dl.compat))) - set(['unicode_literals']) self.assertEqual(all_names, sorted(present_names)) + def test_compat_urllib_parse_unquote(self): + self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def') + self.assertEqual(compat_urllib_parse_unquote('%7e/abc+def'), '~/abc+def') + self.assertEqual(compat_urllib_parse_unquote(''), '') + self.assertEqual(compat_urllib_parse_unquote('%'), '%') + self.assertEqual(compat_urllib_parse_unquote('%%'), '%%') + self.assertEqual(compat_urllib_parse_unquote('%%%'), '%%%') + self.assertEqual(compat_urllib_parse_unquote('%2F'), '/') + self.assertEqual(compat_urllib_parse_unquote('%2f'), '/') + self.assertEqual(compat_urllib_parse_unquote('%E6%B4%A5%E6%B3%A2'), '津波') + self.assertEqual( + compat_urllib_parse_unquote(''' +%%a'''), + ''' +%%a''') + self.assertEqual( + compat_urllib_parse_unquote('''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%'''), + '''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%Things%''') + + def test_compat_urllib_parse_unquote_plus(self): + self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def') + self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def') + + def test_compat_shlex_split(self): + self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) + + def test_compat_etree_fromstring(self): + xml = ''' + + foo + 中文 + spam + + ''' + doc = compat_etree_fromstring(xml.encode('utf-8')) + self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) + self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) + self.assertTrue(isinstance(doc.find('normal').text, compat_str)) + self.assertTrue(isinstance(doc.find('chinese').text, compat_str)) + self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str)) + if __name__ == '__main__': unittest.main() diff --git a/test/test_download.py b/test/test_download.py index a009aa475..a3f1c0644 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -89,7 +89,7 @@ def generator(test_case): for tc in test_cases: info_dict = tc.get('info_dict', {}) - if not tc.get('file') and not (info_dict.get('id') and info_dict.get('ext')): + if not (info_dict.get('id') and info_dict.get('ext')): raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?') if 'skip' in test_case: @@ -102,7 +102,7 @@ def generator(test_case): params = get_params(test_case.get('params', {})) if is_playlist and 'playlist' not in test_case: - params.setdefault('extract_flat', True) + params.setdefault('extract_flat', 'in_playlist') params.setdefault('skip_download', True) ydl = YoutubeDL(params, auto_init=False) @@ -116,7 +116,7 @@ def generator(test_case): expect_warnings(ydl, test_case.get('expected_warnings', [])) def get_tc_filename(tc): - return tc.get('file') or ydl.prepare_filename(tc.get('info_dict', {})) + return ydl.prepare_filename(tc.get('info_dict', {})) res_dict = None @@ -136,7 +136,9 @@ def generator(test_case): # We're not using .download here sine that is just a shim # for outside error handling, and returns the exit code # instead of the result dict. - res_dict = ydl.extract_info(test_case['url']) + res_dict = ydl.extract_info( + test_case['url'], + force_generic_extractor=params.get('force_generic_extractor', False)) except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): @@ -153,9 +155,9 @@ def generator(test_case): break if is_playlist: - self.assertEqual(res_dict['_type'], 'playlist') + self.assertTrue(res_dict['_type'] in ['playlist', 'multi_video']) self.assertTrue('entries' in res_dict) - expect_info_dict(self, test_case.get('info_dict', {}), res_dict) + expect_info_dict(self, res_dict, test_case.get('info_dict', {})) if 'playlist_mincount' in test_case: assertGreaterEqual( @@ -204,7 +206,7 @@ def generator(test_case): with io.open(info_json_fn, encoding='utf-8') as infof: info_dict = json.load(infof) - expect_info_dict(self, tc.get('info_dict', {}), info_dict) + expect_info_dict(self, info_dict, tc.get('info_dict', {})) finally: try_rm_tcs_files() if is_playlist and res_dict is not None and res_dict.get('entries'): diff --git a/test/test_execution.py b/test/test_execution.py index 60df187de..620db080e 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +# coding: utf-8 + from __future__ import unicode_literals import unittest @@ -6,6 +8,9 @@ import unittest import sys import os import subprocess +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.utils import encodeArgument rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -27,5 +32,12 @@ class TestExecution(unittest.TestCase): def test_main_exec(self): subprocess.check_call([sys.executable, 'youtube_dl/__main__.py', '--version'], cwd=rootDir, stdout=_DEV_NULL) + def test_cmdline_umlauts(self): + p = subprocess.Popen( + [sys.executable, 'youtube_dl/__main__.py', encodeArgument('ä'), '--version'], + cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE) + _, stderr = p.communicate() + self.assertFalse(stderr) + if __name__ == '__main__': unittest.main() diff --git a/test/test_http.py b/test/test_http.py new file mode 100644 index 000000000..f2e305b6f --- /dev/null +++ b/test/test_http.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl import YoutubeDL +from youtube_dl.compat import compat_http_server, compat_urllib_request +import ssl +import threading + +TEST_DIR = os.path.dirname(os.path.abspath(__file__)) + + +class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + def log_message(self, format, *args): + pass + + def do_GET(self): + if self.path == '/video.html': + self.send_response(200) + self.send_header('Content-Type', 'text/html; charset=utf-8') + self.end_headers() + self.wfile.write(b'
.*)\.(?P(flv)|(mp4))' _TEST = { 'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0', - 'file': 'zpsc0c3b9fa.mp4', 'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99', 'info_dict': { + 'id': 'zpsc0c3b9fa', + 'ext': 'mp4', 'timestamp': 1367669341, 'upload_date': '20130504', 'uploader': 'rachaneronas', @@ -33,7 +34,7 @@ class PhotobucketIE(InfoExtractor): info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);', webpage, 'info json') info = json.loads(info_json) - url = compat_urllib_parse.unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url')) + url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url')) return { 'id': video_id, 'url': url, diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py new file mode 100644 index 000000000..a52210fab --- /dev/null +++ b/youtube_dl/extractor/pinkbike.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_end, + remove_start, + str_to_int, + unified_strdate, +) + + +class PinkbikeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://www.pinkbike.com/video/402811/', + 'md5': '4814b8ca7651034cd87e3361d5c2155a', + 'info_dict': { + 'id': '402811', + 'ext': 'mp4', + 'title': 'Brandon Semenuk - RAW 100', + 'description': 'Official release: www.redbull.ca/rupertwalker', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 100, + 'upload_date': '20150406', + 'uploader': 'revelco', + 'location': 'Victoria, British Columbia, Canada', + 'view_count': int, + 'comment_count': int, + } + }, { + 'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://www.pinkbike.com/video/%s' % video_id, video_id) + + formats = [] + for _, format_id, src in re.findall( + r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage): + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)) + formats.append({ + 'url': src, + 'format_id': format_id, + 'height': height, + }) + self._sort_formats(formats) + + title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike') + description = self._html_search_regex( + r'(?s)id="media-description"[^>]*>(.+?)<', + webpage, 'description', default=None) or remove_start( + self._og_search_description(webpage), title + '. ') + thumbnail = self._og_search_thumbnail(webpage) + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration')) + + uploader = self._search_regex( + r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False) + upload_date = unified_strdate(self._search_regex( + r'class="fullTime"[^>]+title="([^"]+)"', + webpage, 'upload date', fatal=False)) + + location = self._html_search_regex( + r'(?s)
Location
\s*
(.+?)<', + webpage, 'location', fatal=False) + + def extract_count(webpage, label): + return str_to_int(self._search_regex( + r']+class="stat-num"[^>]*>([\d,.]+)\s*]+class="stat-label"[^>]*>%s' % label, + webpage, label, fatal=False)) + + view_count = extract_count(webpage, 'Views') + comment_count = extract_count(webpage, 'Comments') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'upload_date': upload_date, + 'uploader': uploader, + 'location': location, + 'view_count': view_count, + 'comment_count': comment_count, + 'formats': formats + } diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py new file mode 100644 index 000000000..bc559d1df --- /dev/null +++ b/youtube_dl/extractor/pladform.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + xpath_text, + qualities, +) + + +class PladformIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?: + out\.pladform\.ru/player| + static\.pladform\.ru/player\.swf + ) + \?.*\bvideoid=| + video\.pladform\.ru/catalog/video/videoid/ + ) + (?P\d+) + ''' + _TESTS = [{ + # http://muz-tv.ru/kinozal/view/7400/ + 'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293', + 'md5': '61f37b575dd27f1bb2e1854777fe31f4', + 'info_dict': { + 'id': '100183293', + 'ext': 'mp4', + 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', + 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 694, + 'age_limit': 0, + }, + }, { + 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', + 'only_matching': True, + }, { + 'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+src="(?P(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_xml( + 'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id, + video_id) + + if video.tag == 'error': + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, video.text), + expected=True) + + quality = qualities(('ld', 'sd', 'hd')) + + formats = [{ + 'url': src.text, + 'format_id': src.get('quality'), + 'quality': quality(src.get('quality')), + } for src in video.findall('./src')] + self._sort_formats(formats) + + webpage = self._download_webpage( + 'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, + video_id) + + title = self._og_search_title(webpage, fatal=False) or xpath_text( + video, './/title', 'title', fatal=True) + description = self._search_regex( + r'\s*

([^<]+)

', webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) or xpath_text( + video, './/cover', 'cover') + + duration = int_or_none(xpath_text(video, './/time', 'duration')) + age_limit = int_or_none(xpath_text(video, './/age18', 'age limit')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/planetaplay.py b/youtube_dl/extractor/planetaplay.py index 596c621d7..06505e96f 100644 --- a/youtube_dl/extractor/planetaplay.py +++ b/youtube_dl/extractor/planetaplay.py @@ -18,7 +18,8 @@ class PlanetaPlayIE(InfoExtractor): 'id': '3586', 'ext': 'flv', 'title': 'md5:e829428ee28b1deed00de90de49d1da1', - } + }, + 'skip': 'Not accessible from Travis CI server', } _SONG_FORMATS = { diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py index 17880471d..2856af96f 100644 --- a/youtube_dl/extractor/played.py +++ b/youtube_dl/extractor/played.py @@ -5,10 +5,10 @@ import re import os.path from .common import InfoExtractor +from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, - compat_urllib_parse, - compat_urllib_request, + sanitized_Request, ) @@ -24,11 +24,11 @@ class PlayedIE(InfoExtractor): 'ext': 'flv', 'title': 'youtube-dl_test_video.mp4', }, + 'skip': 'Removed for copyright infringement.', # oh wow } def _real_extract(self, url): video_id = self._match_id(url) - orig_webpage = self._download_webpage(url, video_id) m_error = re.search( @@ -36,9 +36,7 @@ class PlayedIE(InfoExtractor): if m_error: raise ExtractorError(m_error.group('msg'), expected=True) - fields = re.findall( - r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage) - data = dict(fields) + data = self._hidden_inputs(orig_webpage) self._sleep(2, video_id) @@ -46,7 +44,7 @@ class PlayedIE(InfoExtractor): headers = { b'Content-Type': b'application/x-www-form-urlencoded', } - req = compat_urllib_request.Request(url, post, headers) + req = sanitized_Request(url, post, headers) webpage = self._download_webpage( req, video_id, note='Downloading video page ...') diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py index ebc046804..e766ccca3 100644 --- a/youtube_dl/extractor/playfm.py +++ b/youtube_dl/extractor/playfm.py @@ -4,83 +4,72 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - compat_urllib_parse, - compat_urllib_request, ExtractorError, - float_or_none, int_or_none, - str_to_int, + parse_iso8601, ) class PlayFMIE(InfoExtractor): IE_NAME = 'play.fm' - _VALID_URL = r'https?://(?:www\.)?play\.fm/[^?#]*(?P[0-9]{8})(?P[0-9]{6})(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?play\.fm/(?P(?:[^/]+/)+(?P[^/]+))/?(?:$|[?#])' _TEST = { - 'url': 'http://www.play.fm/recording/leipzigelectronicmusicbatofarparis_fr20140712137220', + 'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12', 'md5': 'c505f8307825a245d0c7ad1850001f22', 'info_dict': { - 'id': '137220', + 'id': '71276', 'ext': 'mp3', - 'title': 'LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', - 'uploader': 'Sven Tasnadi', - 'uploader_id': 'sventasnadi', - 'duration': 5627.428, - 'upload_date': '20140712', + 'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', + 'description': '', + 'duration': 5627, + 'timestamp': 1406033781, + 'upload_date': '20140722', + 'uploader': 'Dan Drastic', + 'uploader_id': '71170', 'view_count': int, 'comment_count': int, - 'thumbnail': 're:^https?://.*\.jpg$', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - upload_date = mobj.group('upload_date') + slug = mobj.group('slug') - rec_data = compat_urllib_parse.urlencode({'rec_id': video_id}) - req = compat_urllib_request.Request( - 'http://www.play.fm/flexRead/recording', data=rec_data) - req.add_header('Content-Type', 'application/x-www-form-urlencoded') - rec_doc = self._download_xml(req, video_id) + recordings = self._download_json( + 'http://v2api.play.fm/recordings/slug/%s' % slug, video_id) - error_node = rec_doc.find('./error') - if error_node is not None: - raise ExtractorError('An error occured: %s (code %s)' % ( - error_node.text, rec_doc.find('./status').text)) + error = recordings.get('error') + if isinstance(error, dict): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error.get('message')), + expected=True) - recording = rec_doc.find('./recording') - title = recording.find('./title').text - view_count = str_to_int(recording.find('./stats/playcount').text) - comment_count = str_to_int(recording.find('./stats/comments').text) - duration = float_or_none(recording.find('./duration').text, scale=1000) - thumbnail = recording.find('./image').text - - artist = recording.find('./artists/artist') - uploader = artist.find('./name').text - uploader_id = artist.find('./slug').text - - video_url = '%s//%s/%s/%s/offset/0/sh/%s/rec/%s/jingle/%s/loc/%s' % ( - 'http:', recording.find('./url').text, - recording.find('./_class').text, recording.find('./file_id').text, - rec_doc.find('./uuid').text, video_id, - rec_doc.find('./jingle/file_id').text, - 'http%3A%2F%2Fwww.play.fm%2Fplayer', - ) + audio_url = recordings['audio'] + video_id = compat_str(recordings.get('id') or video_id) + title = recordings['title'] + description = recordings.get('description') + duration = int_or_none(recordings.get('recordingDuration')) + timestamp = parse_iso8601(recordings.get('created_at')) + uploader = recordings.get('page', {}).get('title') + uploader_id = compat_str(recordings.get('page', {}).get('id')) + view_count = int_or_none(recordings.get('playCount')) + comment_count = int_or_none(recordings.get('commentCount')) + categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')] return { 'id': video_id, - 'url': video_url, - 'ext': 'mp3', - 'filesize': int_or_none(recording.find('./size').text), + 'url': audio_url, 'title': title, - 'upload_date': upload_date, - 'view_count': view_count, - 'comment_count': comment_count, + 'description': description, 'duration': duration, - 'thumbnail': thumbnail, + 'timestamp': timestamp, 'uploader': uploader, 'uploader_id': uploader_id, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, } diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py new file mode 100644 index 000000000..e360404f7 --- /dev/null +++ b/youtube_dl/extractor/playtvak.py @@ -0,0 +1,181 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, + compat_urllib_parse, +) +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + qualities, +) + + +class PlaytvakIE(InfoExtractor): + IE_DESC = 'Playtvak.cz, iDNES.cz and Lidovky.cz' + _VALID_URL = r'https?://(?:.+?\.)?(?:playtvak|idnes|lidovky|metro)\.cz/.*\?(?:c|idvideo)=(?P[^&]+)' + _TESTS = [{ + 'url': 'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko', + 'md5': '4525ae312c324b4be2f4603cc78ceb4a', + 'info_dict': { + 'id': 'A150730_150323_hodinovy-manzel_kuko', + 'ext': 'mp4', + 'title': 'Vyžeňte vosy a sršně ze zahrady', + 'description': 'md5:f93d398691044d303bc4a3de62f3e976', + 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'duration': 279, + 'timestamp': 1438732860, + 'upload_date': '20150805', + 'is_live': False, + } + }, { # live video test + 'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat', + 'info_dict': { + 'id': 'A150624_164934_planespotting_cat', + 'ext': 'flv', + 'title': 're:^Přímý přenos iDNES.cz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', + 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # idnes.cz + 'url': 'http://zpravy.idnes.cz/pes-zavreny-v-aute-rozbijeni-okynek-v-aute-fj5-/domaci.aspx?c=A150809_104116_domaci_pku', + 'md5': '819832ba33cd7016e58a6658577fe289', + 'info_dict': { + 'id': 'A150809_104116_domaci_pku', + 'ext': 'mp4', + 'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se', + 'description': 'md5:01e73f02329e2e5760bd5eed4d42e3c2', + 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'duration': 39, + 'timestamp': 1438969140, + 'upload_date': '20150807', + 'is_live': False, + } + }, { # lidovky.cz + 'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE', + 'md5': 'c7209ac4ba9d234d4ad5bab7485bcee8', + 'info_dict': { + 'id': 'A150808_214044_ln-video_ELE', + 'ext': 'mp4', + 'title': 'Táhni! Demonstrace proti imigrantům budila emoce', + 'description': 'md5:97c81d589a9491fbfa323c9fa3cca72c', + 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'timestamp': 1439052180, + 'upload_date': '20150808', + 'is_live': False, + } + }, { # metro.cz + 'url': 'http://www.metro.cz/video-pod-billboardem-se-na-vltavske-roztocil-kolotoc-deti-vozil-jen-par-hodin-1hx-/metro-extra.aspx?c=A141111_173251_metro-extra_row', + 'md5': '84fc1deedcac37b7d4a6ccae7c716668', + 'info_dict': { + 'id': 'A141111_173251_metro-extra_row', + 'ext': 'mp4', + 'title': 'Recesisté udělali z billboardu kolotoč', + 'description': 'md5:7369926049588c3989a66c9c1a043c4c', + 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'timestamp': 1415725500, + 'upload_date': '20141111', + 'is_live': False, + } + }, { + 'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + info_url = self._html_search_regex( + r'Misc\.videoFLV\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') + + parsed_url = compat_urlparse.urlparse(info_url) + + qs = compat_urlparse.parse_qs(parsed_url.query) + qs.update({ + 'reklama': ['0'], + 'type': ['js'], + }) + + info_url = compat_urlparse.urlunparse( + parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + + json_info = self._download_json( + info_url, video_id, + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) + + item = None + for i in json_info['items']: + if i.get('type') == 'video' or i.get('type') == 'stream': + item = i + break + if not item: + raise ExtractorError('No suitable stream found') + + quality = qualities(('low', 'middle', 'high')) + + formats = [] + for fmt in item['video']: + video_url = fmt.get('file') + if not video_url: + continue + + format_ = fmt['format'] + format_id = '%s_%s' % (format_, fmt['quality']) + preference = None + + if format_ in ('mp4', 'webm'): + ext = format_ + elif format_ == 'rtmp': + ext = 'flv' + elif format_ == 'apple': + ext = 'mp4' + # Some streams have mp3 audio which does not play + # well with ffmpeg filter aac_adtstoasc + preference = -1 + elif format_ == 'adobe': # f4m manifest fails with 404 in 80% of requests + continue + else: # Other formats not supported yet + continue + + formats.append({ + 'url': video_url, + 'ext': ext, + 'format_id': format_id, + 'quality': quality(fmt.get('quality')), + 'preference': preference, + }) + self._sort_formats(formats) + + title = item['title'] + is_live = item['type'] == 'stream' + if is_live: + title = self._live_title(title) + description = self._og_search_description(webpage, default=None) or self._html_search_meta( + 'description', webpage, 'description') + timestamp = None + duration = None + if not is_live: + duration = int_or_none(item.get('length')) + timestamp = item.get('published') + if timestamp: + timestamp = parse_iso8601(timestamp[:-5]) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': item.get('image'), + 'duration': duration, + 'timestamp': timestamp, + 'is_live': is_live, + 'formats': formats, + } diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py index cd3905acb..2eb4fd96d 100644 --- a/youtube_dl/extractor/playvid.py +++ b/youtube_dl/extractor/playvid.py @@ -3,31 +3,32 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, +) from ..utils import ( - ExtractorError, clean_html, - compat_urllib_parse, + ExtractorError, ) class PlayvidIE(InfoExtractor): - _VALID_URL = r'^https?://www\.playvid\.com/watch(\?v=|/)(?P.+?)(?:#|$)' + _VALID_URL = r'https?://www\.playvid\.com/watch(\?v=|/)(?P.+?)(?:#|$)' _TEST = { - 'url': 'http://www.playvid.com/watch/agbDDi7WZTV', - 'md5': '44930f8afa616efdf9482daf4fe53e1e', + 'url': 'http://www.playvid.com/watch/RnmBNgtrrJu', + 'md5': 'ffa2f6b2119af359f544388d8c01eb6c', 'info_dict': { - 'id': 'agbDDi7WZTV', + 'id': 'RnmBNgtrrJu', 'ext': 'mp4', - 'title': 'Michelle Lewin in Miami Beach', - 'duration': 240, + 'title': 'md5:9256d01c6317e3f703848b5906880dc8', + 'duration': 82, 'age_limit': 18, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) m_error = re.search( @@ -44,7 +45,7 @@ class PlayvidIE(InfoExtractor): flashvars = self._html_search_regex( r'flashvars="(.+?)"', webpage, 'flashvars') - infos = compat_urllib_parse.unquote(flashvars).split(r'&') + infos = compat_urllib_parse_unquote(flashvars).split(r'&') for info in infos: videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info) if videovars_match: @@ -52,7 +53,7 @@ class PlayvidIE(InfoExtractor): val = videovars_match.group(2) if key == 'title': - video_title = compat_urllib_parse.unquote_plus(val) + video_title = compat_urllib_parse_unquote_plus(val) if key == 'duration': try: duration = int(val) diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py new file mode 100644 index 000000000..6d138ef25 --- /dev/null +++ b/youtube_dl/extractor/playwire.py @@ -0,0 +1,78 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + float_or_none, + int_or_none, +) + + +class PlaywireIE(InfoExtractor): + _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P\d+)/(?:videos/v2|embed|config)/(?P\d+)' + _TESTS = [{ + 'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json', + 'md5': 'e6398701e3595888125729eaa2329ed9', + 'info_dict': { + 'id': '3353705', + 'ext': 'mp4', + 'title': 'S04_RM_UCL_Rus', + 'thumbnail': 're:^https?://.*\.png$', + 'duration': 145.94, + }, + }, { + 'url': 'http://cdn.playwire.com/11625/embed/85228.html', + 'only_matching': True, + }, { + 'url': 'http://config.playwire.com/12421/videos/v2/3389892/zeus.json', + 'only_matching': True, + }, { + 'url': 'http://cdn.playwire.com/v2/12342/config/1532636.json', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id') + + player = self._download_json( + 'http://config.playwire.com/%s/videos/v2/%s/zeus.json' % (publisher_id, video_id), + video_id) + + title = player['settings']['title'] + duration = float_or_none(player.get('duration'), 1000) + + content = player['content'] + thumbnail = content.get('poster') + src = content['media']['f4m'] + + f4m = self._download_xml(src, video_id) + base_url = xpath_text(f4m, './{http://ns.adobe.com/f4m/1.0}baseURL', 'base url', fatal=True) + formats = [] + for media in f4m.findall('./{http://ns.adobe.com/f4m/1.0}media'): + media_url = media.get('url') + if not media_url: + continue + tbr = int_or_none(media.get('bitrate')) + width = int_or_none(media.get('width')) + height = int_or_none(media.get('height')) + f = { + 'url': '%s/%s' % (base_url, media.attrib['url']), + 'tbr': tbr, + 'width': width, + 'height': height, + } + if not (tbr or width or height): + f['quality'] = 1 if '-hd.' in media_url else 0 + formats.append(f) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py new file mode 100644 index 000000000..55c11b3bf --- /dev/null +++ b/youtube_dl/extractor/pluralsight.py @@ -0,0 +1,291 @@ +from __future__ import unicode_literals + +import re +import json +import random +import collections + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + int_or_none, + parse_duration, + qualities, + sanitized_Request, +) + + +class PluralsightBaseIE(InfoExtractor): + _API_BASE = 'http://app.pluralsight.com' + + +class PluralsightIE(PluralsightBaseIE): + IE_NAME = 'pluralsight' + _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/training/player\?' + _LOGIN_URL = 'https://app.pluralsight.com/id/' + + _NETRC_MACHINE = 'pluralsight' + + _TESTS = [{ + 'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas', + 'md5': '4d458cf5cf4c593788672419a8dd4cf8', + 'info_dict': { + 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', + 'ext': 'mp4', + 'title': 'Management of SQL Server - Demo Monitoring', + 'duration': 338, + }, + 'skip': 'Requires pluralsight account credentials', + }, { + 'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live', + 'only_matching': True, + }, { + # available without pluralsight account + 'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started', + 'only_matching': True, + }] + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'Username': username.encode('utf-8'), + 'Password': password.encode('utf-8'), + }) + + post_url = self._search_regex( + r']+action=(["\'])(?P.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + request = sanitized_Request( + post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + + response = self._download_webpage( + request, None, 'Logging in as %s' % username) + + error = self._search_regex( + r']+class="field-validation-error"[^>]*>([^<]+)', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + + if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')): + raise ExtractorError('Unable to log in') + + def _real_extract(self, url): + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + + author = qs.get('author', [None])[0] + name = qs.get('name', [None])[0] + clip_id = qs.get('clip', [None])[0] + course = qs.get('course', [None])[0] + + if any(not f for f in (author, name, clip_id, course,)): + raise ExtractorError('Invalid URL', expected=True) + + display_id = '%s-%s' % (name, clip_id) + + webpage = self._download_webpage(url, display_id) + + modules = self._search_regex( + r'moduleCollection\s*:\s*new\s+ModuleCollection\((\[.+?\])\s*,\s*\$rootScope\)', + webpage, 'modules', default=None) + + if modules: + collection = self._parse_json(modules, display_id) + else: + # Webpage may be served in different layout (see + # https://github.com/rg3/youtube-dl/issues/7607) + collection = self._parse_json( + self._search_regex( + r'var\s+initialState\s*=\s*({.+?});\n', webpage, 'initial state'), + display_id)['course']['modules'] + + module, clip = None, None + + for module_ in collection: + if name in (module_.get('moduleName'), module_.get('name')): + module = module_ + for clip_ in module_.get('clips', []): + clip_index = clip_.get('clipIndex') + if clip_index is None: + clip_index = clip_.get('index') + if clip_index is None: + continue + if compat_str(clip_index) == clip_id: + clip = clip_ + break + + if not clip: + raise ExtractorError('Unable to resolve clip') + + QUALITIES = { + 'low': {'width': 640, 'height': 480}, + 'medium': {'width': 848, 'height': 640}, + 'high': {'width': 1024, 'height': 768}, + 'high-widescreen': {'width': 1280, 'height': 720}, + } + + QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',) + quality_key = qualities(QUALITIES_PREFERENCE) + + AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) + + ALLOWED_QUALITIES = ( + AllowedQuality('webm', ['high', ]), + AllowedQuality('mp4', ['low', 'medium', 'high', ]), + ) + + # Some courses also offer widescreen resolution for high quality (see + # https://github.com/rg3/youtube-dl/issues/7766) + widescreen = True if re.search( + r'courseSupportsWidescreenVideoFormats\s*:\s*true', webpage) else False + best_quality = 'high-widescreen' if widescreen else 'high' + if widescreen: + for allowed_quality in ALLOWED_QUALITIES: + allowed_quality.qualities.append(best_quality) + + # In order to minimize the number of calls to ViewClip API and reduce + # the probability of being throttled or banned by Pluralsight we will request + # only single format until formats listing was explicitly requested. + if self._downloader.params.get('listformats', False): + allowed_qualities = ALLOWED_QUALITIES + else: + def guess_allowed_qualities(): + req_format = self._downloader.params.get('format') or 'best' + req_format_split = req_format.split('-', 1) + if len(req_format_split) > 1: + req_ext, req_quality = req_format_split + for allowed_quality in ALLOWED_QUALITIES: + if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: + return (AllowedQuality(req_ext, (req_quality, )), ) + req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4' + return (AllowedQuality(req_ext, (best_quality, )), ) + allowed_qualities = guess_allowed_qualities() + + formats = [] + for ext, qualities_ in allowed_qualities: + for quality in qualities_: + f = QUALITIES[quality].copy() + clip_post = { + 'a': author, + 'cap': 'false', + 'cn': clip_id, + 'course': course, + 'lc': 'en', + 'm': name, + 'mt': ext, + 'q': '%dx%d' % (f['width'], f['height']), + } + request = sanitized_Request( + '%s/training/Player/ViewClip' % self._API_BASE, + json.dumps(clip_post).encode('utf-8')) + request.add_header('Content-Type', 'application/json;charset=utf-8') + format_id = '%s-%s' % (ext, quality) + clip_url = self._download_webpage( + request, display_id, 'Downloading %s URL' % format_id, fatal=False) + + # Pluralsight tracks multiple sequential calls to ViewClip API and start + # to return 429 HTTP errors after some time (see + # https://github.com/rg3/youtube-dl/pull/6989). Moreover it may even lead + # to account ban (see https://github.com/rg3/youtube-dl/issues/6842). + # To somewhat reduce the probability of these consequences + # we will sleep random amount of time before each call to ViewClip. + self._sleep( + random.randint(2, 5), display_id, + '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') + + if not clip_url: + continue + f.update({ + 'url': clip_url, + 'ext': ext, + 'format_id': format_id, + 'quality': quality_key(quality), + }) + formats.append(f) + self._sort_formats(formats) + + # TODO: captions + # http://www.pluralsight.com/training/Player/ViewClip + cap = true + # or + # http://www.pluralsight.com/training/Player/Captions + # { a = author, cn = clip_id, lc = end, m = name } + + return { + 'id': clip['clipName'], + 'title': '%s - %s' % (module['title'], clip['title']), + 'duration': int_or_none(clip.get('duration')) or parse_duration(clip.get('formattedDuration')), + 'creator': author, + 'formats': formats + } + + +class PluralsightCourseIE(PluralsightBaseIE): + IE_NAME = 'pluralsight:course' + _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P[^/]+)' + _TESTS = [{ + # Free course from Pluralsight Starter Subscription for Microsoft TechNet + # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz + 'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas', + 'info_dict': { + 'id': 'hosting-sql-server-windows-azure-iaas', + 'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals', + 'description': 'md5:61b37e60f21c4b2f91dc621a977d0986', + }, + 'playlist_count': 31, + }, { + # available without pluralsight account + 'url': 'https://www.pluralsight.com/courses/angularjs-get-started', + 'only_matching': True, + }, { + 'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents', + 'only_matching': True, + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + + # TODO: PSM cookie + + course = self._download_json( + '%s/data/course/%s' % (self._API_BASE, course_id), + course_id, 'Downloading course JSON') + + title = course['title'] + description = course.get('description') or course.get('shortDescription') + + course_data = self._download_json( + '%s/data/course/content/%s' % (self._API_BASE, course_id), + course_id, 'Downloading course data JSON') + + entries = [] + for module in course_data: + for clip in module.get('clips', []): + player_parameters = clip.get('playerParameters') + if not player_parameters: + continue + entries.append(self.url_result( + '%s/training/player?%s' % (self._API_BASE, player_parameters), + 'Pluralsight')) + + return self.playlist_result(entries, course_id, title, description) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py new file mode 100644 index 000000000..3e15533e9 --- /dev/null +++ b/youtube_dl/extractor/porn91.py @@ -0,0 +1,73 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from ..compat import compat_urllib_parse +from .common import InfoExtractor +from ..utils import ( + parse_duration, + int_or_none, + ExtractorError, +) + + +class Porn91IE(InfoExtractor): + IE_NAME = '91porn' + _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P[\w\d]+)' + + _TEST = { + 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', + 'md5': '6df8f6d028bc8b14f5dbd73af742fb20', + 'info_dict': { + 'id': '7e42283b4f5ab36da134', + 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', + 'ext': 'mp4', + 'duration': 431, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id + self._set_cookie('91porn.com', 'language', 'cn_CN') + webpage = self._download_webpage(url, video_id, 'get HTML content') + + if '作为游客,你每天只可观看10个视频' in webpage: + raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) + + title = self._search_regex( + r'
([^<]+)
', webpage, 'title') + title = title.replace('\n', '') + + # get real url + file_id = self._search_regex( + r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id') + sec_code = self._search_regex( + r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code') + max_vid = self._search_regex( + r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid') + url_params = compat_urllib_parse.urlencode({ + 'VID': file_id, + 'mp4': '1', + 'seccode': sec_code, + 'max_vid': max_vid, + }) + info_cn = self._download_webpage( + 'http://91porn.com/getfile.php?' + url_params, video_id, + 'get real video url') + video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url') + + duration = parse_duration(self._search_regex( + r'时长:\s*\s*(\d+:\d+)', webpage, 'duration', fatal=False)) + + comment_count = int_or_none(self._search_regex( + r'留言:\s*\s*(\d+)', webpage, 'comment count', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'duration': duration, + 'comment_count': comment_count, + 'age_limit': self._rta_search(webpage), + } diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index bac484c67..57c78ba52 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -8,7 +8,6 @@ from ..utils import ( int_or_none, js_to_json, qualities, - determine_ext, ) @@ -37,7 +36,8 @@ class PornHdIE(InfoExtractor): webpage = self._download_webpage(url, display_id or video_id) title = self._html_search_regex( - r'(.+) porn HD.+?', webpage, 'title') + [r']+class=["\']video-name["\'][^>]*>([^<]+)', + r'(.+?) - .*?[Pp]ornHD.*?'], webpage, 'title') description = self._html_search_regex( r'
([^<]+)
', webpage, 'description', fatal=False) view_count = int_or_none(self._html_search_regex( @@ -45,13 +45,19 @@ class PornHdIE(InfoExtractor): thumbnail = self._search_regex( r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) - quality = qualities(['SD', 'HD']) - formats = [{ - 'url': source['file'], - 'format_id': '%s-%s' % (source['label'], determine_ext(source['file'])), - 'quality': quality(source['label']), - } for source in json.loads(js_to_json(self._search_regex( - r"(?s)'sources'\s*:\s*(\[.+?\])", webpage, 'sources')))] + quality = qualities(['sd', 'hd']) + sources = json.loads(js_to_json(self._search_regex( + r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]", + webpage, 'sources'))) + formats = [] + for qname, video_url in sources.items(): + if not video_url: + continue + formats.append({ + 'url': video_url, + 'format_id': qname, + 'quality': quality(qname), + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 2ca15b717..08275687d 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -4,10 +4,14 @@ import os import re from .common import InfoExtractor -from ..utils import ( +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, compat_urllib_parse_urlparse, - compat_urllib_request, - compat_urllib_parse, +) +from ..utils import ( + ExtractorError, + sanitized_Request, str_to_int, ) from ..aes import ( @@ -16,8 +20,8 @@ from ..aes import ( class PornHubIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P[0-9a-f]+)' - _TEST = { + _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P[0-9a-z]+)' + _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': '882f488fa1f0026f023f33576004a2ed', 'info_dict': { @@ -27,38 +31,63 @@ class PornHubIE(InfoExtractor): "title": "Seductive Indian beauty strips down and fingers her pink pussy", "age_limit": 18 } - } + }, { + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', + 'only_matching': True, + }, { + 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', + 'only_matching': True, + }] + + @classmethod + def _extract_url(cls, webpage): + mobj = re.search( + r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage) + if mobj: + return mobj.group('url') def _extract_count(self, pattern, webpage, name): - count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False) - if count: - count = str_to_int(count) - return count + return str_to_int(self._search_regex( + pattern, webpage, '%s count' % name, fatal=False)) def _real_extract(self, url): video_id = self._match_id(url) - req = compat_urllib_request.Request(url) + req = sanitized_Request( + 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id) req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) + error_msg = self._html_search_regex( + r'(?s)
(.*?)
', + webpage, 'error message', default=None) + if error_msg: + error_msg = re.sub(r'\s+', ' ', error_msg) + raise ExtractorError( + 'PornHub said: %s' % error_msg, + expected=True, video_id=video_id) + video_title = self._html_search_regex(r'

]+>([^<]+)', webpage, 'title') video_uploader = self._html_search_regex( - r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|]+>(.+?)<', webpage, 'uploader', fatal=False) thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False) if thumbnail: - thumbnail = compat_urllib_parse.unquote(thumbnail) + thumbnail = compat_urllib_parse_unquote(thumbnail) - view_count = self._extract_count(r'([\d,\.]+) views', webpage, 'view') - like_count = self._extract_count(r'([\d,\.]+)', webpage, 'like') - dislike_count = self._extract_count(r'([\d,\.]+)', webpage, 'dislike') + view_count = self._extract_count( + r'([\d,\.]+) views', webpage, 'view') + like_count = self._extract_count( + r'([\d,\.]+)', webpage, 'like') + dislike_count = self._extract_count( + r'([\d,\.]+)', webpage, 'dislike') comment_count = self._extract_count( - r'All comments \(([\d,\.]+)', webpage, 'comment') + r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') - video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) + video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage))) if webpage.find('"encrypted":true') != -1: - password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password')) + password = compat_urllib_parse_unquote_plus( + self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) formats = [] @@ -68,7 +97,7 @@ class PornHubIE(InfoExtractor): format = path.split('/')[5].split('_')[:2] format = "-".join(format) - m = re.match(r'^(?P[0-9]+)P-(?P[0-9]+)K$', format) + m = re.match(r'^(?P[0-9]+)[pP]-(?P[0-9]+)[kK]$', format) if m is None: height = None tbr = None @@ -98,3 +127,34 @@ class PornHubIE(InfoExtractor): 'formats': formats, 'age_limit': 18, } + + +class PornHubPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P\d+)' + _TESTS = [{ + 'url': 'http://www.pornhub.com/playlist/6201671', + 'info_dict': { + 'id': '6201671', + 'title': 'P0p4', + }, + 'playlist_mincount': 35, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub') + for video_url in set(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage)) + ] + + playlist = self._parse_json( + self._search_regex( + r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'), + playlist_id) + + return self.playlist_result( + entries, playlist_id, playlist.get('title'), playlist.get('description')) diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py index 5253aa3d3..5398e708b 100644 --- a/youtube_dl/extractor/pornotube.py +++ b/youtube_dl/extractor/pornotube.py @@ -1,56 +1,92 @@ from __future__ import unicode_literals -import re +import json from .common import InfoExtractor from ..utils import ( - compat_urllib_parse, - - unified_strdate, + int_or_none, + sanitized_Request, ) class PornotubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com(/c/(?P[0-9]+))?(/m/(?P[0-9]+))(/(?P.+))$' + _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com/(?:[^?#]*?)/video/(?P<id>[0-9]+)' _TEST = { - 'url': 'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing', - 'md5': '374dd6dcedd24234453b295209aa69b6', + 'url': 'http://www.pornotube.com/orientation/straight/video/4964/title/weird-hot-and-wet-science', + 'md5': '60fc5a4f0d93a97968fc7999d98260c9', 'info_dict': { - 'id': '1689755', - 'ext': 'flv', - 'upload_date': '20090708', - 'title': 'Marilyn-Monroe-Bathing', - 'age_limit': 18 + 'id': '4964', + 'ext': 'mp4', + 'upload_date': '20141203', + 'title': 'Weird Hot and Wet Science', + 'description': 'md5:a8304bef7ef06cb4ab476ca6029b01b0', + 'categories': ['Adult Humor', 'Blondes'], + 'uploader': 'Alpha Blue Archives', + 'thumbnail': 're:^https?://.*\\.jpg$', + 'timestamp': 1417582800, + 'age_limit': 18, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + video_id = self._match_id(url) - video_id = mobj.group('videoid') - video_title = mobj.group('title') + # Fetch origin token + js_config = self._download_webpage( + 'http://www.pornotube.com/assets/src/app/config.js', video_id, + note='Download JS config') + originAuthenticationSpaceKey = self._search_regex( + r"constant\('originAuthenticationSpaceKey',\s*'([^']+)'", + js_config, 'originAuthenticationSpaceKey') - # Get webpage content - webpage = self._download_webpage(url, video_id) + # Fetch actual token + token_req_data = { + 'authenticationSpaceKey': originAuthenticationSpaceKey, + 'credentials': 'Clip Application', + } + token_req = sanitized_Request( + 'https://api.aebn.net/auth/v1/token/primal', + data=json.dumps(token_req_data).encode('utf-8')) + token_req.add_header('Content-Type', 'application/json') + token_req.add_header('Origin', 'http://www.pornotube.com') + token_answer = self._download_json( + token_req, video_id, note='Requesting primal token') + token = token_answer['tokenKey'] - # Get the video URL - VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' - video_url = self._search_regex(VIDEO_URL_RE, webpage, 'video url') - video_url = compat_urllib_parse.unquote(video_url) + # Get video URL + delivery_req = sanitized_Request( + 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id) + delivery_req.add_header('Authorization', token) + delivery_info = self._download_json( + delivery_req, video_id, note='Downloading delivery information') + video_url = delivery_info['mediaUrl'] - # Get the uploaded date - VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' - upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, 'upload date', fatal=False) - if upload_date: - upload_date = unified_strdate(upload_date) - age_limit = self._rta_search(webpage) + # Get additional info (title etc.) + info_req = sanitized_Request( + 'https://api.aebn.net/content/v1/clips/%s?expand=' + 'title,description,primaryImageNumber,startSecond,endSecond,' + 'movie.title,movie.MovieId,movie.boxCoverFront,movie.stars,' + 'movie.studios,stars.name,studios.name,categories.name,' + 'clipActive,movieActive,publishDate,orientations' % video_id) + info_req.add_header('Authorization', token) + info = self._download_json( + info_req, video_id, note='Downloading metadata') + + timestamp = int_or_none(info.get('publishDate'), scale=1000) + uploader = info.get('studios', [{}])[0].get('name') + movie_id = info['movie']['movieId'] + thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( + movie_id, movie_id, info['primaryImageNumber']) + categories = [c['name'] for c in info.get('categories')] return { 'id': video_id, 'url': video_url, - 'upload_date': upload_date, - 'title': video_title, - 'ext': 'flv', - 'format': 'flv', - 'age_limit': age_limit, + 'title': info['title'], + 'description': info.get('description'), + 'timestamp': timestamp, + 'uploader': uploader, + 'thumbnail': thumbnail, + 'categories': categories, + 'age_limit': 18, } diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py new file mode 100644 index 000000000..eba4dfbb3 --- /dev/null +++ b/youtube_dl/extractor/pornovoisines.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import random + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + float_or_none, + unified_strdate, +) + + +class PornoVoisinesIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)' + + _VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \ + '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4' + + _SERVER_NUMBERS = (1, 2) + + _TEST = { + 'url': 'http://www.pornovoisines.com/showvideo/1285/recherche-appartement/', + 'md5': '5ac670803bc12e9e7f9f662ce64cf1d1', + 'info_dict': { + 'id': '1285', + 'display_id': 'recherche-appartement', + 'ext': 'mp4', + 'title': 'Recherche appartement', + 'description': 'md5:819ea0b785e2a04667a1a01cdc89594e', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20140925', + 'duration': 120, + 'view_count': int, + 'average_rating': float, + 'categories': ['Débutantes', 'Scénario', 'Sodomie'], + 'age_limit': 18, + } + } + + @classmethod + def build_video_url(cls, num): + return cls._VIDEO_URL_TEMPLATE % (random.choice(cls._SERVER_NUMBERS), num) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, video_id) + + video_url = self.build_video_url(video_id) + + title = self._html_search_regex( + r'<h1>(.+?)</h1>', webpage, 'title', flags=re.DOTALL) + description = self._html_search_regex( + r'<article id="descriptif">(.+?)</article>', + webpage, "description", fatal=False, flags=re.DOTALL) + + thumbnail = self._search_regex( + r'<div id="mediaspace%s">\s*<img src="/?([^"]+)"' % video_id, + webpage, 'thumbnail', fatal=False) + if thumbnail: + thumbnail = 'http://www.pornovoisines.com/%s' % thumbnail + + upload_date = unified_strdate(self._search_regex( + r'Publié le ([\d-]+)', webpage, 'upload date', fatal=False)) + duration = int_or_none(self._search_regex( + 'Durée (\d+)', webpage, 'duration', fatal=False)) + view_count = int_or_none(self._search_regex( + r'(\d+) vues', webpage, 'view count', fatal=False)) + average_rating = self._search_regex( + r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False) + if average_rating: + average_rating = float_or_none(average_rating.replace(',', '.')) + + categories = self._html_search_meta( + 'keywords', webpage, 'categories', fatal=False) + if categories: + categories = [category.strip() for category in categories.split(',')] + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'categories': categories, + 'age_limit': 18, + } diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py new file mode 100644 index 000000000..85aae9576 --- /dev/null +++ b/youtube_dl/extractor/primesharetv.py @@ -0,0 +1,62 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ( + ExtractorError, + sanitized_Request, +) + + +class PrimeShareTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?primeshare\.tv/download/(?P<id>[\da-zA-Z]+)' + + _TEST = { + 'url': 'http://primeshare.tv/download/238790B611', + 'md5': 'b92d9bf5461137c36228009f31533fbc', + 'info_dict': { + 'id': '238790B611', + 'ext': 'mp4', + 'title': 'Public Domain - 1960s Commercial - Crest Toothpaste-YKsuFona', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + if '>File not exist<' in webpage: + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + fields = self._hidden_inputs(webpage) + + headers = { + 'Referer': url, + 'Content-Type': 'application/x-www-form-urlencoded', + } + + wait_time = int(self._search_regex( + r'var\s+cWaitTime\s*=\s*(\d+)', + webpage, 'wait time', default=7)) + 1 + self._sleep(wait_time, video_id) + + req = sanitized_Request( + url, compat_urllib_parse.urlencode(fields), headers) + video_page = self._download_webpage( + req, video_id, 'Downloading video page') + + video_url = self._search_regex( + r"url\s*:\s*'([^']+\.primeshare\.tv(?::443)?/file/[^']+)'", + video_page, 'video url') + + title = self._html_search_regex( + r'<h1>Watch\s*(?: )?\s*\((.+?)(?:\s*\[\.\.\.\])?\)\s*(?: )?\s*<strong>', + video_page, 'title') + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'ext': 'mp4', + } diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py index 7fcde086c..d5357283a 100644 --- a/youtube_dl/extractor/promptfile.py +++ b/youtube_dl/extractor/promptfile.py @@ -4,11 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urllib_parse from ..utils import ( - ExtractorError, determine_ext, - compat_urllib_parse, - compat_urllib_request, + ExtractorError, + sanitized_Request, ) @@ -33,12 +33,9 @@ class PromptFileIE(InfoExtractor): raise ExtractorError('Video %s does not exist' % video_id, expected=True) - fields = dict(re.findall(r'''(?x)type="hidden"\s+ - name="(.+?)"\s+ - value="(.*?)" - ''', webpage)) + fields = self._hidden_inputs(webpage) post = compat_urllib_parse.urlencode(fields) - req = compat_urllib_request.Request(url, post) + req = sanitized_Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') webpage = self._download_webpage( req, video_id, 'Downloading video page') diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 32d747ede..baa54a3af 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -5,8 +5,14 @@ import re from hashlib import sha1 from .common import InfoExtractor -from ..utils import ( +from ..compat import ( compat_urllib_parse, +) +from ..utils import ( + ExtractorError, + determine_ext, + float_or_none, + int_or_none, unified_strdate, ) @@ -14,15 +20,20 @@ from ..utils import ( class ProSiebenSat1IE(InfoExtractor): IE_NAME = 'prosiebensat1' IE_DESC = 'ProSiebenSat.1 Digital' - _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)' _TESTS = [ { + # Tests changes introduced in https://github.com/rg3/youtube-dl/pull/6242 + # in response to fixing https://github.com/rg3/youtube-dl/issues/6215: + # - malformed f4m manifest support + # - proper handling of URLs starting with `https?://` in 2.0 manifests + # - recursive child f4m manifests extraction 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', 'info_dict': { 'id': '2104602', 'ext': 'mp4', - 'title': 'Staffel 2, Episode 18 - Jahresrückblick', + 'title': 'Episode 18 - Staffel 2', 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', 'upload_date': '20131231', 'duration': 5845.04, @@ -85,7 +96,7 @@ class ProSiebenSat1IE(InfoExtractor): 'ext': 'mp4', 'title': 'Im Interview: Kai Wiesinger', 'description': 'md5:e4e5370652ec63b95023e914190b4eb9', - 'upload_date': '20140225', + 'upload_date': '20140203', 'duration': 522.56, }, 'params': { @@ -100,7 +111,7 @@ class ProSiebenSat1IE(InfoExtractor): 'ext': 'mp4', 'title': 'Jagd auf Fertigkost im Elsthal - Teil 2', 'description': 'md5:2669cde3febe9bce13904f701e774eb6', - 'upload_date': '20140225', + 'upload_date': '20141014', 'duration': 2410.44, }, 'params': { @@ -152,18 +163,29 @@ class ProSiebenSat1IE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist', + 'info_dict': { + 'id': '439664', + 'title': 'Episode 8 - Ganze Folge - Playlist', + 'description': 'md5:63b8963e71f481782aeea877658dec84', + }, + 'playlist_count': 2, + }, ] _CLIPID_REGEXES = [ r'"clip_id"\s*:\s+"(\d+)"', r'clipid: "(\d+)"', r'clip[iI]d=(\d+)', + r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", ] _TITLE_REGEXES = [ r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>', r'<header class="clearfix">\s*<h3>(.+?)</h3>', r'<!-- start video -->\s*<h1>(.+?)</h1>', r'<h1 class="att-name">\s*(.+?)</h1>', + r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>', ] _DESCRIPTION_REGEXES = [ r'<p itemprop="description">\s*(.+?)</p>', @@ -178,15 +200,23 @@ class ProSiebenSat1IE(InfoExtractor): r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>', r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>', ] + _PAGE_TYPE_REGEXES = [ + r'<meta name="page_type" content="([^"]+)">', + r"'itemType'\s*:\s*'([^']*)'", + ] + _PLAYLIST_ID_REGEXES = [ + r'content[iI]d=(\d+)', + r"'itemId'\s*:\s*'([^']*)'", + ] + _PLAYLIST_CLIP_REGEXES = [ + r'(?s)data-qvt=.+?<a href="([^"]+)"', + ] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - + def _extract_clip(self, url, webpage): clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id') - access_token = 'testclient' - client_name = 'kolibri-1.2.5' + access_token = 'prosieben' + client_name = 'kolibri-2.0.19-splec4' client_location = url videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({ @@ -196,10 +226,13 @@ class ProSiebenSat1IE(InfoExtractor): 'ids': clip_id, }) - videos = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON') + video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0] - duration = float(videos[0]['duration']) - source_ids = [source['id'] for source in videos[0]['sources']] + if video.get('is_protected') is True: + raise ExtractorError('This video is DRM protected.', expected=True) + + duration = float_or_none(video.get('duration')) + source_ids = [source['id'] for source in video['sources']] source_ids_str = ','.join(map(str, source_ids)) g = '01!8d8F_)r9]4s[qeuXfP%' @@ -246,27 +279,37 @@ class ProSiebenSat1IE(InfoExtractor): urls_sources = urls_sources.values() def fix_bitrate(bitrate): + bitrate = int_or_none(bitrate) + if not bitrate: + return None return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate for source in urls_sources: protocol = source['protocol'] + source_url = source['url'] if protocol == 'rtmp' or protocol == 'rtmpe': - mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url']) + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) if not mobj: continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] formats.append({ - 'url': mobj.group('url'), - 'app': mobj.group('app'), - 'play_path': mobj.group('playpath'), + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', 'page_url': 'http://www.prosieben.de', 'vbr': fix_bitrate(source['bitrate']), 'ext': 'mp4', 'format_id': '%s_%s' % (source['cdn'], source['bitrate']), }) + elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats(source_url, clip_id)) else: formats.append({ - 'url': source['url'], + 'url': source_url, 'vbr': fix_bitrate(source['bitrate']), }) @@ -281,3 +324,31 @@ class ProSiebenSat1IE(InfoExtractor): 'duration': duration, 'formats': formats, } + + def _extract_playlist(self, url, webpage): + playlist_id = self._html_search_regex( + self._PLAYLIST_ID_REGEXES, webpage, 'playlist id') + for regex in self._PLAYLIST_CLIP_REGEXES: + playlist_clips = re.findall(regex, webpage) + if playlist_clips: + title = self._html_search_regex( + self._TITLE_REGEXES, webpage, 'title') + description = self._html_search_regex( + self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) + entries = [ + self.url_result( + re.match('(.+?//.+?)/', url).group(1) + clip_path, + 'ProSiebenSat1') + for clip_path in playlist_clips] + return self.playlist_result(entries, playlist_id, title, description) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + page_type = self._search_regex( + self._PAGE_TYPE_REGEXES, webpage, + 'page type', default='clip').lower() + if page_type == 'clip': + return self._extract_clip(url, webpage) + elif page_type == 'playlist': + return self._extract_playlist(url, webpage) diff --git a/youtube_dl/extractor/puls4.py b/youtube_dl/extractor/puls4.py new file mode 100644 index 000000000..cce84b9e4 --- /dev/null +++ b/youtube_dl/extractor/puls4.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + unified_strdate, + int_or_none, +) + + +class Puls4IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puls4\.com/video/[^/]+/play/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.puls4.com/video/pro-und-contra/play/2716816', + 'md5': '49f6a6629747eeec43cef6a46b5df81d', + 'info_dict': { + 'id': '2716816', + 'ext': 'mp4', + 'title': 'Pro und Contra vom 23.02.2015', + 'description': 'md5:293e44634d9477a67122489994675db6', + 'duration': 2989, + 'upload_date': '20150224', + 'uploader': 'PULS_4', + }, + 'skip': 'Only works from Germany', + }, { + 'url': 'http://www.puls4.com/video/kult-spielfilme/play/1298106', + 'md5': '6a48316c8903ece8dab9b9a7bf7a59ec', + 'info_dict': { + 'id': '1298106', + 'ext': 'mp4', + 'title': 'Lucky Fritz', + }, + 'skip': 'Only works from Germany', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + error_message = self._html_search_regex( + r'<div class="message-error">(.+?)</div>', + webpage, 'error message', default=None) + if error_message: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) + + real_url = self._html_search_regex( + r'\"fsk-button\".+?href=\"([^"]+)', + webpage, 'fsk_button', default=None) + if real_url: + webpage = self._download_webpage(real_url, video_id) + + player = self._search_regex( + r'p4_video_player(?:_iframe)?\("video_\d+_container"\s*,(.+?)\);\s*\}', + webpage, 'player') + + player_json = self._parse_json( + '[%s]' % player, video_id, + transform_source=lambda s: s.replace('undefined,', '')) + + formats = None + result = None + + for v in player_json: + if isinstance(v, list) and not formats: + formats = [{ + 'url': f['url'], + 'format': 'hd' if f.get('hd') else 'sd', + 'width': int_or_none(f.get('size_x')), + 'height': int_or_none(f.get('size_y')), + 'tbr': int_or_none(f.get('bitrate')), + } for f in v] + self._sort_formats(formats) + elif isinstance(v, dict) and not result: + result = { + 'id': video_id, + 'title': v['videopartname'].strip(), + 'description': v.get('videotitle'), + 'duration': int_or_none(v.get('videoduration') or v.get('episodeduration')), + 'upload_date': unified_strdate(v.get('clipreleasetime')), + 'uploader': v.get('channel'), + } + + result['formats'] = formats + + return result diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py new file mode 100644 index 000000000..1ba3bbddf --- /dev/null +++ b/youtube_dl/extractor/qqmusic.py @@ -0,0 +1,344 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import time +import re + +from .common import InfoExtractor +from ..utils import ( + sanitized_Request, + strip_jsonp, + unescapeHTML, + clean_html, +) + + +class QQMusicIE(InfoExtractor): + IE_NAME = 'qqmusic' + IE_DESC = 'QQ音乐' + _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' + _TESTS = [{ + 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', + 'md5': '9ce1c1c8445f561506d2e3cfb0255705', + 'info_dict': { + 'id': '004295Et37taLD', + 'ext': 'mp3', + 'title': '可惜没如果', + 'release_date': '20141227', + 'creator': '林俊杰', + 'description': 'md5:d327722d0361576fde558f1ac68a7065', + 'thumbnail': 're:^https?://.*\.jpg$', + } + }, { + 'note': 'There is no mp3-320 version of this song.', + 'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV', + 'md5': 'fa3926f0c585cda0af8fa4f796482e3e', + 'info_dict': { + 'id': '004MsGEo3DdNxV', + 'ext': 'mp3', + 'title': '如果', + 'release_date': '20050626', + 'creator': '李季美', + 'description': 'md5:46857d5ed62bc4ba84607a805dccf437', + 'thumbnail': 're:^https?://.*\.jpg$', + } + }, { + 'note': 'lyrics not in .lrc format', + 'url': 'http://y.qq.com/#type=song&mid=001JyApY11tIp6', + 'info_dict': { + 'id': '001JyApY11tIp6', + 'ext': 'mp3', + 'title': 'Shadows Over Transylvania', + 'release_date': '19970225', + 'creator': 'Dark Funeral', + 'description': 'md5:ed14d5bd7ecec19609108052c25b2c11', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + }] + + _FORMATS = { + 'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320}, + 'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128}, + 'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10} + } + + # Reference: m_r_GetRUin() in top_player.js + # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js + @staticmethod + def m_r_get_ruin(): + curMs = int(time.time() * 1000) % 1000 + return int(round(random.random() * 2147483647) * curMs % 1E10) + + def _real_extract(self, url): + mid = self._match_id(url) + + detail_info_page = self._download_webpage( + 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid, + mid, note='Download song detail info', + errnote='Unable to get song detail info', encoding='gbk') + + song_name = self._html_search_regex( + r"songname:\s*'([^']+)'", detail_info_page, 'song name') + + publish_time = self._html_search_regex( + r'发行时间:(\d{4}-\d{2}-\d{2})', detail_info_page, + 'publish time', default=None) + if publish_time: + publish_time = publish_time.replace('-', '') + + singer = self._html_search_regex( + r"singer:\s*'([^']+)", detail_info_page, 'singer', default=None) + + lrc_content = self._html_search_regex( + r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>', + detail_info_page, 'LRC lyrics', default=None) + if lrc_content: + lrc_content = lrc_content.replace('\\n', '\n') + + thumbnail_url = None + albummid = self._search_regex( + [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], + detail_info_page, 'album mid', default=None) + if albummid: + thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \ + % (albummid[-2:-1], albummid[-1], albummid) + + guid = self.m_r_get_ruin() + + vkey = self._download_json( + 'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid, + mid, note='Retrieve vkey', errnote='Unable to get vkey', + transform_source=strip_jsonp)['key'] + + formats = [] + for format_id, details in self._FORMATS.items(): + formats.append({ + 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' + % (details['prefix'], mid, details['ext'], vkey, guid), + 'format': format_id, + 'format_id': format_id, + 'preference': details['preference'], + 'abr': details.get('abr'), + }) + self._check_formats(formats, mid) + self._sort_formats(formats) + + actual_lrc_lyrics = ''.join( + line + '\n' for line in re.findall( + r'(?m)^(\[[0-9]{2}:[0-9]{2}(?:\.[0-9]{2,})?\][^\n]*|\[[^\]]*\])', lrc_content)) + + info_dict = { + 'id': mid, + 'formats': formats, + 'title': song_name, + 'release_date': publish_time, + 'creator': singer, + 'description': lrc_content, + 'thumbnail': thumbnail_url + } + if actual_lrc_lyrics: + info_dict['subtitles'] = { + 'origin': [{ + 'ext': 'lrc', + 'data': actual_lrc_lyrics, + }] + } + return info_dict + + +class QQPlaylistBaseIE(InfoExtractor): + @staticmethod + def qq_static_url(category, mid): + return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid) + + @classmethod + def get_entries_from_page(cls, page): + entries = [] + + for item in re.findall(r'class="data"[^<>]*>([^<>]+)</', page): + song_mid = unescapeHTML(item).split('|')[-5] + entries.append(cls.url_result( + 'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic', + song_mid)) + + return entries + + +class QQMusicSingerIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:singer' + IE_DESC = 'QQ音乐 - 歌手' + _VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' + _TEST = { + 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', + 'info_dict': { + 'id': '001BLpXF2DyJe2', + 'title': '林俊杰', + 'description': 'md5:2a222d89ba4455a3af19940c0481bb78', + }, + 'playlist_count': 12, + } + + def _real_extract(self, url): + mid = self._match_id(url) + + singer_page = self._download_webpage( + self.qq_static_url('singer', mid), mid, 'Download singer page') + + entries = self.get_entries_from_page(singer_page) + + singer_name = self._html_search_regex( + r"singername\s*:\s*'([^']+)'", singer_page, 'singer name', + default=None) + + singer_id = self._html_search_regex( + r"singerid\s*:\s*'([0-9]+)'", singer_page, 'singer id', + default=None) + + singer_desc = None + + if singer_id: + req = sanitized_Request( + 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg?utf8=1&outCharset=utf-8&format=xml&singerid=%s' % singer_id) + req.add_header( + 'Referer', 'http://s.plcloud.music.qq.com/xhr_proxy_utf8.html') + singer_desc_page = self._download_xml( + req, mid, 'Donwload singer description XML') + + singer_desc = singer_desc_page.find('./data/info/desc').text + + return self.playlist_result(entries, mid, singer_name, singer_desc) + + +class QQMusicAlbumIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:album' + IE_DESC = 'QQ音乐 - 专辑' + _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' + + _TESTS = [{ + 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', + 'info_dict': { + 'id': '000gXCTb2AhRR1', + 'title': '我们都是这样长大的', + 'description': 'md5:179c5dce203a5931970d306aa9607ea6', + }, + 'playlist_count': 4, + }, { + 'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3', + 'info_dict': { + 'id': '002Y5a3b3AlCu3', + 'title': '그리고...', + 'description': 'md5:a48823755615508a95080e81b51ba729', + }, + 'playlist_count': 8, + }] + + def _real_extract(self, url): + mid = self._match_id(url) + + album = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid, + mid, 'Download album page')['data'] + + entries = [ + self.url_result( + 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] + ) for song in album['list'] + ] + album_name = album.get('name') + album_detail = album.get('desc') + if album_detail is not None: + album_detail = album_detail.strip() + + return self.playlist_result(entries, mid, album_name, album_detail) + + +class QQMusicToplistIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:toplist' + IE_DESC = 'QQ音乐 - 排行榜' + _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)' + + _TESTS = [{ + 'url': 'http://y.qq.com/#type=toplist&p=global_123', + 'info_dict': { + 'id': 'global_123', + 'title': '美国iTunes榜', + }, + 'playlist_count': 10, + }, { + 'url': 'http://y.qq.com/#type=toplist&p=top_3', + 'info_dict': { + 'id': 'top_3', + 'title': 'QQ音乐巅峰榜·欧美', + 'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统' + '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据' + '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:' + '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放' + }, + 'playlist_count': 100, + }, { + 'url': 'http://y.qq.com/#type=toplist&p=global_106', + 'info_dict': { + 'id': 'global_106', + 'title': '韩国Mnet榜', + }, + 'playlist_count': 50, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + list_type, num_id = list_id.split("_") + + toplist_json = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json' + % (list_type, num_id), + list_id, 'Download toplist page') + + entries = [ + self.url_result( + 'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid'] + ) for song in toplist_json['songlist'] + ] + + topinfo = toplist_json.get('topinfo', {}) + list_name = topinfo.get('ListName') + list_description = topinfo.get('info') + return self.playlist_result(entries, list_id, list_name, list_description) + + +class QQMusicPlaylistIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:playlist' + IE_DESC = 'QQ音乐 - 歌单' + _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' + + _TEST = { + 'url': 'http://y.qq.com/#type=taoge&id=3462654915', + 'info_dict': { + 'id': '3462654915', + 'title': '韩国5月新歌精选下旬', + 'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4', + }, + 'playlist_count': 40, + } + + def _real_extract(self, url): + list_id = self._match_id(url) + + list_json = self._download_json( + 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s' + % list_id, list_id, 'Download list page', + transform_source=strip_jsonp)['cdlist'][0] + + entries = [ + self.url_result( + 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] + ) for song in list_json['songlist'] + ] + + list_name = list_json.get('dissname') + list_description = clean_html(unescapeHTML(list_json.get('desc'))) + return self.playlist_result(entries, list_id, list_name, list_description) diff --git a/youtube_dl/extractor/quickvid.py b/youtube_dl/extractor/quickvid.py index 3bc78060d..f414e2384 100644 --- a/youtube_dl/extractor/quickvid.py +++ b/youtube_dl/extractor/quickvid.py @@ -3,8 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( +from ..compat import ( compat_urlparse, +) +from ..utils import ( determine_ext, int_or_none, ) @@ -22,6 +24,7 @@ class QuickVidIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.(?:png|jpg|gif)$', 'view_count': int, }, + 'skip': 'Not accessible from Travis CI server', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py new file mode 100644 index 000000000..976c8feec --- /dev/null +++ b/youtube_dl/extractor/r7.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + unescapeHTML, + int_or_none, +) + + +class R7IE(InfoExtractor): + _VALID_URL = r'''(?x)https?:// + (?: + (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/| + noticias\.r7\.com(?:/[^/]+)+/[^/]+-| + player\.r7\.com/video/i/ + ) + (?P<id>[\da-f]{24}) + ''' + _TESTS = [{ + 'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html', + 'md5': '403c4e393617e8e8ddc748978ee8efde', + 'info_dict': { + 'id': '54e7050b0cf2ff57e0279389', + 'ext': 'mp4', + 'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 98, + 'like_count': int, + 'view_count': int, + }, + }, { + 'url': 'http://esportes.r7.com/videos/cigano-manda-recado-aos-fas/idmedia/4e176727b51a048ee6646a1b.html', + 'only_matching': True, + }, { + 'url': 'http://noticias.r7.com/record-news/video/representante-do-instituto-sou-da-paz-fala-sobre-fim-do-estatuto-do-desarmamento-5480fc580cf2285b117f438d/', + 'only_matching': True, + }, { + 'url': 'http://player.r7.com/video/i/54e7050b0cf2ff57e0279389?play=true&video=http://vsh.r7.com/54e7050b0cf2ff57e0279389/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-ATOS_copy.mp4&linkCallback=http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html&thumbnail=http://vtb.r7.com/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-thumb.jpg&idCategory=192&share=true&layout=full&full=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://player.r7.com/video/i/%s' % video_id, video_id) + + item = self._parse_json(js_to_json(self._search_regex( + r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id) + + title = unescapeHTML(item['title']) + thumbnail = item.get('init', {}).get('thumbUri') + duration = None + + statistics = item.get('statistics', {}) + like_count = int_or_none(statistics.get('likes')) + view_count = int_or_none(statistics.get('views')) + + formats = [] + for format_key, format_dict in item['playlist'][0].items(): + src = format_dict.get('src') + if not src: + continue + format_id = format_dict.get('format') or format_key + if duration is None: + duration = format_dict.get('duration') + if '.f4m' in src: + formats.extend(self._extract_f4m_formats(src, video_id, preference=-1)) + elif src.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2)) + else: + formats.append({ + 'url': src, + 'format_id': format_id, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'like_count': like_count, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/radiobremen.py b/youtube_dl/extractor/radiobremen.py new file mode 100644 index 000000000..0d706312e --- /dev/null +++ b/youtube_dl/extractor/radiobremen.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_duration + + +class RadioBremenIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?radiobremen\.de/mediathek/(?:index\.html)?\?id=(?P<id>[0-9]+)' + IE_NAME = 'radiobremen' + + _TEST = { + 'url': 'http://www.radiobremen.de/mediathek/index.html?id=114720', + 'info_dict': { + 'id': '114720', + 'ext': 'mp4', + 'duration': 1685, + 'width': 512, + 'title': 'buten un binnen vom 22. Dezember', + 'thumbnail': 're:https?://.*\.jpg$', + 'description': 'Unter anderem mit diesen Themen: 45 Flüchtlinge sind in Worpswede angekommen +++ Freies Internet für alle: Bremer arbeiten an einem flächendeckenden W-Lan-Netzwerk +++ Aktivisten kämpfen für das Unibad +++ So war das Wetter 2014 +++', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + meta_url = "http://www.radiobremen.de/apps/php/mediathek/metadaten.php?id=%s" % video_id + meta_doc = self._download_webpage( + meta_url, video_id, 'Downloading metadata') + title = self._html_search_regex( + r"<h1.*>(?P<title>.+)</h1>", meta_doc, "title") + description = self._html_search_regex( + r"<p>(?P<description>.*)</p>", meta_doc, "description", fatal=False) + duration = parse_duration(self._html_search_regex( + r"Länge:</td>\s+<td>(?P<duration>[0-9]+:[0-9]+)</td>", + meta_doc, "duration", fatal=False)) + + page_doc = self._download_webpage( + url, video_id, 'Downloading video information') + mobj = re.search( + r"ardformatplayerclassic\(\'playerbereich\',\'(?P<width>[0-9]+)\',\'.*\',\'(?P<video_id>[0-9]+)\',\'(?P<secret>[0-9]+)\',\'(?P<thumbnail>.+)\',\'\'\)", + page_doc) + video_url = ( + "http://dl-ondemand.radiobremen.de/mediabase/%s/%s_%s_%s.mp4" % + (video_id, video_id, mobj.group("secret"), mobj.group('width'))) + + formats = [{ + 'url': video_url, + 'ext': 'mp4', + 'width': int(mobj.group("width")), + }] + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'formats': formats, + 'thumbnail': mobj.group('thumbnail'), + } diff --git a/youtube_dl/extractor/radiode.py b/youtube_dl/extractor/radiode.py new file mode 100644 index 000000000..aa5f6f8ad --- /dev/null +++ b/youtube_dl/extractor/radiode.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RadioDeIE(InfoExtractor): + IE_NAME = 'radio.de' + _VALID_URL = r'https?://(?P<id>.+?)\.(?:radio\.(?:de|at|fr|pt|es|pl|it)|rad\.io)' + _TEST = { + 'url': 'http://ndr2.radio.de/', + 'info_dict': { + 'id': 'ndr2', + 'ext': 'mp3', + 'title': 're:^NDR 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:591c49c702db1a33751625ebfb67f273', + 'thumbnail': 're:^https?://.*\.png', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + radio_id = self._match_id(url) + webpage = self._download_webpage(url, radio_id) + jscode = self._search_regex( + r"'components/station/stationService':\s*\{\s*'?station'?:\s*(\{.*?\s*\}),\n", + webpage, 'broadcast') + + broadcast = self._parse_json(jscode, radio_id) + title = self._live_title(broadcast['name']) + description = broadcast.get('description') or broadcast.get('shortDescription') + thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl') or broadcast.get('logo100x100') + + formats = [{ + 'url': stream['streamUrl'], + 'ext': stream['streamContentFormat'].lower(), + 'acodec': stream['streamContentFormat'], + 'abr': stream['bitRate'], + 'asr': stream['sampleRate'] + } for stream in broadcast['streamUrls']] + self._sort_formats(formats) + + return { + 'id': radio_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/radiojavan.py b/youtube_dl/extractor/radiojavan.py new file mode 100644 index 000000000..884c28420 --- /dev/null +++ b/youtube_dl/extractor/radiojavan.py @@ -0,0 +1,67 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import( + unified_strdate, + str_to_int, +) + + +class RadioJavanIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P<id>[^/]+)/?' + _TEST = { + 'url': 'http://www.radiojavan.com/videos/video/chaartaar-ashoobam', + 'md5': 'e85208ffa3ca8b83534fca9fe19af95b', + 'info_dict': { + 'id': 'chaartaar-ashoobam', + 'ext': 'mp4', + 'title': 'Chaartaar - Ashoobam', + 'thumbnail': 're:^https?://.*\.jpe?g$', + 'upload_date': '20150215', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + formats = [{ + 'url': 'https://media.rdjavan.com/media/music_video/%s' % video_path, + 'format_id': '%sp' % height, + 'height': int(height), + } for height, video_path in re.findall(r"RJ\.video(\d+)p\s*=\s*'/?([^']+)'", webpage)] + self._sort_formats(formats) + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + upload_date = unified_strdate(self._search_regex( + r'class="date_added">Date added: ([^<]+)<', + webpage, 'upload date', fatal=False)) + + view_count = str_to_int(self._search_regex( + r'class="views">Plays: ([\d,]+)', + webpage, 'view count', fatal=False)) + like_count = str_to_int(self._search_regex( + r'class="rating">([\d,]+) likes', + webpage, 'like count', fatal=False)) + dislike_count = str_to_int(self._search_regex( + r'class="rating">([\d,]+) dislikes', + webpage, 'dislike count', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 2d39ecfe4..278b1d2bf 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -2,23 +2,30 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse, + compat_urlparse, +) from ..utils import ( + ExtractorError, + determine_ext, parse_duration, unified_strdate, - compat_urllib_parse, + int_or_none, + xpath_text, ) -class RaiIE(SubtitlesInfoExtractor): - _VALID_URL = r'(?P<url>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' +class RaiTVIE(InfoExtractor): + _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': 'c064c0b2d09c278fb293116ef5d0a32d', + 'md5': '96382709b61dd64a6b88e0f791e6df4c', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Report del 07/04/2014', 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', 'upload_date': '20140407', @@ -27,16 +34,14 @@ class RaiIE(SubtitlesInfoExtractor): }, { 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - 'md5': '8bb9c151924ce241b74dd52ef29ceafa', + 'md5': 'd9751b78eac9710d62c2447b224dea39', 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'TG PRIMO TEMPO', - 'description': '', 'upload_date': '20140612', 'duration': 1758, }, - 'skip': 'Error 404', }, { 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', @@ -52,64 +57,111 @@ class RaiIE(SubtitlesInfoExtractor): }, { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html', - 'md5': '35694f062977fe6619943f08ed935730', 'info_dict': { 'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132', 'ext': 'mp4', 'title': 'Alluvione in Sardegna e dissesto idrogeologico', 'description': 'Edizione delle ore 20:30 ', + }, + 'skip': 'invalid urls', + }, + { + 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', + 'md5': '496ab63e420574447f70d02578333437', + 'info_dict': { + 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', + 'ext': 'flv', + 'title': 'Il Candidato - Primo episodio: "Le Primarie"', + 'description': 'md5:364b604f7db50594678f483353164fb8', + 'upload_date': '20140923', + 'duration': 386, } }, ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) + media = self._download_json( + 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id, + video_id, 'Downloading video JSON') - media = self._download_json('%s?json' % mobj.group('url'), video_id, 'Downloading video JSON') - - title = media.get('name') - description = media.get('desc') - thumbnail = media.get('image_300') or media.get('image_medium') or media.get('image') - duration = parse_duration(media.get('length')) - uploader = media.get('author') - upload_date = unified_strdate(media.get('date')) + thumbnails = [] + for image_type in ('image', 'image_medium', 'image_300'): + thumbnail_url = media.get(image_type) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + }) + subtitles = [] formats = [] - - for format_id in ['wmv', 'm3u8', 'mediaUri', 'h264']: - media_url = media.get(format_id) - if not media_url: - continue + media_type = media['type'] + if 'Audio' in media_type: formats.append({ - 'url': media_url, - 'format_id': format_id, - 'ext': 'mp4', + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), }) + elif 'Video' in media_type: + def fix_xml(xml): + return xml.replace(' tag elementi', '').replace('>/', '</') - if self._downloader.params.get('listsubtitles', False): - page = self._download_webpage(url, video_id) - self._list_available_subtitles(video_id, page) - return + relinker = self._download_xml( + media['mediaUri'] + '&output=43', video_id, transform_source=fix_xml) - subtitles = {} - if self._have_to_download_any_subtitles: - page = self._download_webpage(url, video_id) - subtitles = self.extract_subtitles(video_id, page) + has_subtitle = False + + for element in relinker.findall('element'): + media_url = xpath_text(element, 'url') + ext = determine_ext(media_url) + content_type = xpath_text(element, 'content-type') + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + media_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', + fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif ext == 'f4m': + f4m_formats = self._extract_f4m_formats( + media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', + video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + elif ext == 'stl': + has_subtitle = True + elif content_type.startswith('video/'): + bitrate = int_or_none(xpath_text(element, 'bitrate')) + formats.append({ + 'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', + }) + elif content_type.startswith('image/'): + thumbnails.append({ + 'url': media_url, + }) + + self._sort_formats(formats) + + if has_subtitle: + webpage = self._download_webpage(url, video_id) + subtitles = self._get_subtitles(video_id, webpage) + else: + raise ExtractorError('not a media file') return { 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - 'duration': duration, + 'title': media['name'], + 'description': media.get('desc'), + 'thumbnails': thumbnails, + 'uploader': media.get('author'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(media.get('length')), 'formats': formats, 'subtitles': subtitles, } - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): subtitles = {} m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage) if m: @@ -118,5 +170,41 @@ class RaiIE(SubtitlesInfoExtractor): SRT_EXT = '.srt' if captions.endswith(STL_EXT): captions = captions[:-len(STL_EXT)] + SRT_EXT - subtitles['it'] = 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions) + subtitles['it'] = [{ + 'ext': 'srt', + 'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions), + }] return subtitles + + +class RaiIE(InfoExtractor): + _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _TESTS = [ + { + 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', + 'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7', + 'info_dict': { + 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', + 'ext': 'flv', + 'title': 'Il pacco', + 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'upload_date': '20141221', + }, + } + ] + + @classmethod + def suitable(cls, url): + return False if RaiTVIE.suitable(url) else super(RaiIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + iframe_url = self._search_regex( + [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', + r'drawMediaRaiTV\(["\'](.+?)["\']'], + webpage, 'iframe') + if not iframe_url.startswith('http'): + iframe_url = compat_urlparse.urljoin(url, iframe_url) + return self.url_result(iframe_url) diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py new file mode 100644 index 000000000..796adfdf9 --- /dev/null +++ b/youtube_dl/extractor/rds.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_iso8601, +) + + +class RDSIE(InfoExtractor): + IE_DESC = 'RDS.ca' + _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)' + + _TESTS = [{ + 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', + 'info_dict': { + 'id': '3.1132799', + 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', + 'ext': 'mp4', + 'title': 'Fowler Jr. prend la direction de Jacksonville', + 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ', + 'timestamp': 1430397346, + 'upload_date': '20150430', + 'duration': 154.354, + 'age_limit': 0, + } + }, { + 'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + # TODO: extract f4m from 9c9media.com + video_url = self._search_regex( + r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"', + webpage, 'video url') + + title = self._og_search_title(webpage) or self._html_search_meta( + 'title', webpage, 'title', fatal=True) + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage, 'description') + thumbnail = self._og_search_thumbnail(webpage) or self._search_regex( + [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"', + r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'], + webpage, 'thumbnail', fatal=False) + timestamp = parse_iso8601(self._search_regex( + r'<span[^>]+itemprop="uploadDate"[^>]+content="([^"]+)"', + webpage, 'upload date', fatal=False)) + duration = parse_duration(self._search_regex( + r'<span[^>]+itemprop="duration"[^>]+content="([^"]+)"', + webpage, 'duration', fatal=False)) + age_limit = self._family_friendly_search(webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 846b76c81..d6054d717 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,17 +1,19 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ExtractorError class RedTubeIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.redtube.com/66418', + 'md5': '7b8c22b5e7098a3e1c09709df1126d2d', 'info_dict': { 'id': '66418', 'ext': 'mp4', - "title": "Sucked on a toilet", - "age_limit": 18, + 'title': 'Sucked on a toilet', + 'age_limit': 18, } } @@ -19,6 +21,9 @@ class RedTubeIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): + raise ExtractorError('Video %s has been removed' % video_id, expected=True) + video_url = self._html_search_regex( r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') video_title = self._html_search_regex( diff --git a/youtube_dl/extractor/restudy.py b/youtube_dl/extractor/restudy.py new file mode 100644 index 000000000..b17c2bfc0 --- /dev/null +++ b/youtube_dl/extractor/restudy.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RestudyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?restudy\.dk/video/play/id/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://www.restudy.dk/video/play/id/1637', + 'info_dict': { + 'id': '1637', + 'ext': 'flv', + 'title': 'Leiden-frosteffekt', + 'description': 'Denne video er et eksperiment med flydende kvælstof.', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage).strip() + description = self._og_search_description(webpage).strip() + + formats = self._extract_smil_formats( + 'https://www.restudy.dk/awsmedia/SmilDirectory/video_%s.xml' % video_id, + video_id) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + } diff --git a/youtube_dl/extractor/ringtv.py b/youtube_dl/extractor/ringtv.py index 59dc137cc..efa4afeb6 100644 --- a/youtube_dl/extractor/ringtv.py +++ b/youtube_dl/extractor/ringtv.py @@ -6,12 +6,13 @@ from .common import InfoExtractor class RingTVIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' + _VALID_URL = r'http://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' _TEST = { "url": "http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30", - "file": "857645.mp4", "md5": "d25945f5df41cdca2d2587165ac28720", "info_dict": { + 'id': '857645', + 'ext': 'mp4', "title": 'Video: Luis Collazo says Victor Ortiz "better not quit on Jan. 30" - Ring TV', "description": 'Luis Collazo is excited about his Jan. 30 showdown with fellow former welterweight titleholder Victor Ortiz at Barclays Center in his hometown of Brooklyn. The SuperBowl week fight headlines a Golden Boy Live! card on Fox Sports 1.', } diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py index c1500b82f..e8bb20a08 100644 --- a/youtube_dl/extractor/rottentomatoes.py +++ b/youtube_dl/extractor/rottentomatoes.py @@ -10,8 +10,9 @@ class RottenTomatoesIE(VideoDetectiveIE): _TEST = { 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', - 'file': '613340.mp4', 'info_dict': { + 'id': '613340', + 'ext': 'mp4', 'title': 'TOY STORY 3', 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', }, diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index dce64e151..e42b319a3 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -1,49 +1,70 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import json - from .common import InfoExtractor +from ..utils import ( + int_or_none, + unescapeHTML, +) class RTBFIE(InfoExtractor): - _VALID_URL = r'https?://www.rtbf.be/video/[^\?]+\?id=(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P<id>\d+)' + _TESTS = [{ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', 'md5': '799f334ddf2c0a582ba80c44655be570', 'info_dict': { 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', - 'description': 'Football - Diables Rouges', 'duration': 3099, - 'timestamp': 1398456336, - 'upload_date': '20140425', } - } + }, { + # geo restricted + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', + 'only_matching': True, + }] + + _QUALITIES = [ + ('mobile', 'mobile'), + ('web', 'SD'), + ('url', 'MD'), + ('high', 'HD'), + ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id) + webpage = self._download_webpage( + 'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id) - data = json.loads(self._html_search_regex( - r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data'] + data = self._parse_json( + unescapeHTML(self._search_regex( + r'data-media="([^"]+)"', webpage, 'data video')), + video_id) - video_url = data.get('downloadUrl') or data.get('url') - - if data['provider'].lower() == 'youtube': + if data.get('provider').lower() == 'youtube': + video_url = data.get('downloadUrl') or data.get('url') return self.url_result(video_url, 'Youtube') + formats = [] + for key, format_id in self._QUALITIES: + format_url = data['sources'].get(key) + if format_url: + formats.append({ + 'format_id': format_id, + 'url': format_url, + }) return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'title': data['title'], 'description': data.get('description') or data.get('subtitle'), - 'thumbnail': data['thumbnail']['large'], + 'thumbnail': data.get('thumbnail'), 'duration': data.get('duration') or data.get('realDuration'), - 'timestamp': data['created'], - 'view_count': data['viewCount'], + 'timestamp': int_or_none(data.get('created')), + 'view_count': int_or_none(data.get('viewCount')), } diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py new file mode 100644 index 000000000..d9cfbf180 --- /dev/null +++ b/youtube_dl/extractor/rte.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + float_or_none, +) + + +class RteIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', + 'info_dict': { + 'id': '10478715', + 'ext': 'mp4', + 'title': 'Watch iWitness online', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'iWitness : The spirit of Ireland, one voice and one minute at a time.', + 'duration': 60.046, + }, + 'params': { + 'skip_download': 'f4m fails with --test atm' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + description = self._html_search_meta('description', webpage, 'description') + duration = float_or_none(self._html_search_meta( + 'duration', webpage, 'duration', fatal=False), 1000) + + thumbnail_id = self._search_regex( + r'<meta name="thumbnail" content="uri:irus:(.*?)" />', webpage, 'thumbnail') + thumbnail = 'http://img.rasset.ie/' + thumbnail_id + '.jpg' + + feeds_url = self._html_search_meta("feeds-prefix", webpage, 'feeds url') + video_id + json_string = self._download_json(feeds_url, video_id) + + # f4m_url = server + relative_url + f4m_url = json_string['shows'][0]['media:group'][0]['rte:server'] + json_string['shows'][0]['media:group'][0]['url'] + f4m_formats = self._extract_f4m_formats(f4m_url, video_id) + f4m_formats = [{ + 'format_id': f['format_id'], + 'url': f['url'], + 'ext': 'mp4', + 'width': f['width'], + 'height': f['height'], + } for f in f4m_formats] + + return { + 'id': video_id, + 'title': title, + 'formats': f4m_formats, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + } diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py new file mode 100644 index 000000000..25f7faf76 --- /dev/null +++ b/youtube_dl/extractor/rtl2.py @@ -0,0 +1,85 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re +from .common import InfoExtractor + + +class RTL2IE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?rtl2\.de/[^?#]*?/(?P<id>[^?#/]*?)(?:$|/(?:$|[?#]))' + _TESTS = [{ + 'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', + 'info_dict': { + 'id': 'folge-203-0', + 'ext': 'f4v', + 'title': 'GRIP sucht den Sommerkönig', + 'description': 'Matthias, Det und Helge treten gegeneinander an.' + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', + 'info_dict': { + 'id': '21040-anna-erwischt-alex', + 'ext': 'mp4', + 'title': 'Anna erwischt Alex!', + 'description': 'Anna ist Alex\' Tochter bei Köln 50667.' + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + # Some rtl2 urls have no slash at the end, so append it. + if not url.endswith('/'): + url += '/' + + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + mobj = re.search( + r'<div[^>]+data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', + webpage) + if mobj: + vico_id = mobj.group('vico_id') + vivi_id = mobj.group('vivi_id') + else: + vico_id = self._html_search_regex( + r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') + vivi_id = self._html_search_regex( + r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') + info_url = 'http://www.rtl2.de/video/php/get_video.php?vico_id=' + vico_id + '&vivi_id=' + vivi_id + + info = self._download_json(info_url, video_id) + video_info = info['video'] + title = video_info['titel'] + description = video_info.get('beschreibung') + thumbnail = video_info.get('image') + + download_url = video_info['streamurl'] + download_url = download_url.replace('\\', '') + stream_url = 'mp4:' + self._html_search_regex(r'ondemand/(.*)', download_url, 'stream URL') + rtmp_conn = ["S:connect", "O:1", "NS:pageUrl:" + url, "NB:fpad:0", "NN:videoFunction:1", "O:0"] + + formats = [{ + 'url': download_url, + 'play_path': stream_url, + 'player_url': 'http://www.rtl2.de/flashplayer/vipo_player.swf', + 'page_url': url, + 'flash_version': 'LNX 11,2,202,429', + 'rtmp_conn': rtmp_conn, + 'no_resume': True, + }] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'description': description, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index d029b0ec5..543d94417 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -1,16 +1,25 @@ +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import parse_duration +from ..utils import ( + int_or_none, + parse_duration, +) -class RtlXlIE(InfoExtractor): - IE_NAME = 'rtlxl.nl' - _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)' +class RtlNlIE(InfoExtractor): + IE_NAME = 'rtl.nl' + IE_DESC = 'rtl.nl and rtlxl.nl' + _VALID_URL = r'''(?x) + https?://(?:www\.)? + (?: + rtlxl\.nl/\#!/[^/]+/| + rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid= + ) + (?P<id>[0-9a-f-]+)''' - _TEST = { + _TESTS = [{ 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', 'md5': 'cc16baa36a6c169391f0764fa6b16654', 'info_dict': { @@ -22,29 +31,72 @@ class RtlXlIE(InfoExtractor): 'upload_date': '20140814', 'duration': 576.880, }, - } + }, { + 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', + 'md5': 'dea7474214af1271d91ef332fb8be7ea', + 'info_dict': { + 'id': '84ae5571-ac25-4225-ae0c-ef8d9efb2aed', + 'ext': 'mp4', + 'timestamp': 1424039400, + 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag', + 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', + 'upload_date': '20150215', + 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', + } + }, { + # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275) + 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false', + 'info_dict': { + 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', + 'ext': 'mp4', + 'title': 'RTL Nieuws - Meer beelden van overval juwelier', + 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', + 'timestamp': 1437233400, + 'upload_date': '20150718', + 'duration': 30.474, + }, + 'params': { + 'skip_download': True, + }, + }, { + # encrypted m3u8 streams, georestricted + 'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7', + 'only_matching': True, + }, { + 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uuid = mobj.group('uuid') - + uuid = self._match_id(url) info = self._download_json( - 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid, + 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid, uuid) material = info['material'][0] - episode_info = info['episodes'][0] + title = info['abstracts'][0]['name'] + subtitle = material.get('title') + if subtitle: + title += ' - %s' % subtitle + description = material.get('synopsis') - progname = info['abstracts'][0]['name'] - subtitle = material['title'] or info['episodes'][0]['name'] + meta = info.get('meta', {}) - # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118) - videopath = material['videopath'].replace('.f4m', '.m3u8') - m3u8_url = 'http://manifest.us.rtl.nl' + videopath + # m3u8 streams are encrypted and may not be handled properly by older ffmpeg/avconv. + # To workaround this previously adaptive -> flash trick was used to obtain + # unencrypted m3u8 streams (see https://github.com/rg3/youtube-dl/issues/4118) + # and bypass georestrictions as well. + # Currently, unencrypted m3u8 playlists are (intentionally?) invalid and therefore + # unusable albeit can be fixed by simple string replacement (see + # https://github.com/rg3/youtube-dl/pull/6337) + # Since recent ffmpeg and avconv handle encrypted streams just fine encrypted + # streams are used now. + videopath = material['videopath'] + m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') - video_urlpart = videopath.split('/flash/')[1][:-5] + video_urlpart = videopath.split('/adaptive/')[1][:-5] PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' formats.extend([ @@ -58,14 +110,29 @@ class RtlXlIE(InfoExtractor): 'quality': 0, } ]) - self._sort_formats(formats) + thumbnails = [] + + for p in ('poster_base_url', '"thumb_base_url"'): + if not meta.get(p): + continue + + thumbnails.append({ + 'url': self._proto_relative_url(meta[p] + uuid), + 'width': int_or_none(self._search_regex( + r'/sz=([0-9]+)', meta[p], 'thumbnail width', fatal=False)), + 'height': int_or_none(self._search_regex( + r'/sz=[0-9]+x([0-9]+)', + meta[p], 'thumbnail height', fatal=False)) + }) + return { 'id': uuid, - 'title': '%s - %s' % (progname, subtitle), + 'title': title, 'formats': formats, 'timestamp': material['original_date'], - 'description': episode_info['synopsis'], + 'description': description, 'duration': parse_duration(material.get('duration')), + 'thumbnails': thumbnails, } diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py deleted file mode 100644 index 285c3c4be..000000000 --- a/youtube_dl/extractor/rtlnow.py +++ /dev/null @@ -1,156 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - clean_html, - unified_strdate, - int_or_none, -) - - -class RTLnowIE(InfoExtractor): - """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" - _VALID_URL = r'''(?x) - (?:https?://)? - (?P<url> - (?P<domain> - rtl-now\.rtl\.de| - rtl2now\.rtl2\.de| - (?:www\.)?voxnow\.de| - (?:www\.)?rtlnitronow\.de| - (?:www\.)?superrtlnow\.de| - (?:www\.)?n-tvnow\.de) - /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\? - (?:container_id|film_id)=(?P<video_id>[0-9]+)& - player=1(?:&season=[0-9]+)?(?:&.*)? - )''' - - _TESTS = [ - { - 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', - 'info_dict': { - 'id': '90419', - 'ext': 'flv', - 'title': 'Ahornallee - Folge 1 - Der Einzug', - 'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de', - 'upload_date': '20070416', - 'duration': 1685, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only works from Germany', - }, - { - 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', - 'info_dict': { - 'id': '69756', - 'ext': 'flv', - 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.', - 'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0', - 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg', - 'upload_date': '20120519', - 'duration': 1245, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only works from Germany', - }, - { - 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', - 'info_dict': { - 'id': '13883', - 'ext': 'flv', - 'title': 'Voxtours - Südafrika-Reporter II', - 'description': 'md5:de7f8d56be6fd4fed10f10f57786db00', - 'upload_date': '20090627', - 'duration': 1800, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1', - 'info_dict': { - 'id': '99205', - 'ext': 'flv', - 'title': 'Medicopter 117 - Angst!', - 'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin', - 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg', - 'upload_date': '20080928', - 'duration': 2691, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_page_url = 'http://%s/' % mobj.group('domain') - video_id = mobj.group('video_id') - - webpage = self._download_webpage('http://' + mobj.group('url'), video_id) - - mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage) - if mobj: - raise ExtractorError(clean_html(mobj.group(1)), expected=True) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage, default=None) - - upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date')) - - mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage) - duration = int(mobj.group('seconds')) if mobj else None - - playerdata_url = self._html_search_regex( - r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url') - - playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML') - - videoinfo = playerdata.find('./playlist/videoinfo') - - formats = [] - for filename in videoinfo.findall('filename'): - mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text) - if mobj: - fmt = { - 'url': mobj.group('url'), - 'play_path': 'mp4:' + mobj.group('play_path'), - 'page_url': video_page_url, - 'player_url': video_page_url + 'includes/vodplayer.swf', - } - else: - fmt = { - 'url': filename.text, - } - fmt.update({ - 'width': int_or_none(filename.get('width')), - 'height': int_or_none(filename.get('height')), - 'vbr': int_or_none(filename.get('bitrate')), - 'ext': 'flv', - }) - formats.append(fmt) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py new file mode 100644 index 000000000..82b323cdd --- /dev/null +++ b/youtube_dl/extractor/rtp.py @@ -0,0 +1,89 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RTPIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?' + _TESTS = [{ + 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', + 'md5': 'e736ce0c665e459ddb818546220b4ef8', + 'info_dict': { + 'id': 'e174042', + 'ext': 'mp3', + 'title': 'Paixões Cruzadas', + 'description': 'As paixões musicais de António Cartaxo e António Macedo', + 'thumbnail': 're:^https?://.*\.jpg', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta( + 'twitter:title', webpage, display_name='title', fatal=True) + description = self._html_search_meta('description', webpage) + thumbnail = self._og_search_thumbnail(webpage) + + player_config = self._search_regex( + r'(?s)RTPPLAY\.player\.newPlayer\(\s*(\{.*?\})\s*\)', webpage, 'player config') + config = self._parse_json(player_config, video_id) + + path, ext = config.get('file').rsplit('.', 1) + formats = [{ + 'format_id': 'rtmp', + 'ext': ext, + 'vcodec': config.get('type') == 'audio' and 'none' or None, + 'preference': -2, + 'url': 'rtmp://{streamer:s}/{application:s}'.format(**config), + 'app': config.get('application'), + 'play_path': '{ext:s}:{path:s}'.format(ext=ext, path=path), + 'page_url': url, + 'rtmp_live': config.get('live', False), + 'player_url': 'http://programas.rtp.pt/play/player.swf?v3', + 'rtmp_real_time': True, + }] + + # Construct regular HTTP download URLs + replacements = { + 'audio': { + 'format_id': 'mp3', + 'pattern': r'^nas2\.share/wavrss/', + 'repl': 'http://rsspod.rtp.pt/podcasts/', + 'vcodec': 'none', + }, + 'video': { + 'format_id': 'mp4_h264', + 'pattern': r'^nas2\.share/h264/', + 'repl': 'http://rsspod.rtp.pt/videocasts/', + 'vcodec': 'h264', + }, + } + r = replacements[config['type']] + if re.match(r['pattern'], config['file']) is not None: + formats.append({ + 'format_id': r['format_id'], + 'url': re.sub(r['pattern'], r['repl'], config['file']), + 'vcodec': r['vcodec'], + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index dc59a5e5c..12639f08b 100644 --- a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -4,18 +4,31 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_urlparse, +) from ..utils import ( int_or_none, parse_duration, parse_iso8601, unescapeHTML, - compat_str, + xpath_text, ) class RTSIE(InfoExtractor): IE_DESC = 'RTS.ch' - _VALID_URL = r'https?://(?:www\.)?rts\.ch/(?:(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html|play/tv/[^/]+/video/(?P<display_id_new>.+?)\?id=(?P<id_new>[0-9]+))' + _VALID_URL = r'''(?x) + (?: + rts:(?P<rts_id>\d+)| + https?:// + (?:www\.)?rts\.ch/ + (?: + (?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html| + play/tv/[^/]+/video/(?P<display_id_new>.+?)\?id=(?P<id_new>[0-9]+) + ) + )''' _TESTS = [ { @@ -118,6 +131,15 @@ class RTSIE(InfoExtractor): 'view_count': int, }, }, + { + # article with videos on rhs + 'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html', + 'info_dict': { + 'id': '6693917', + 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', + }, + 'playlist_mincount': 5, + }, { 'url': 'http://www.rts.ch/play/tv/le-19h30/video/le-chantier-du-nouveau-parlement-vaudois-a-permis-une-trouvaille-historique?id=6348280', 'only_matching': True, @@ -126,7 +148,7 @@ class RTSIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url) - video_id = m.group('id') or m.group('id_new') + video_id = m.group('rts_id') or m.group('id') or m.group('id_new') display_id = m.group('display_id') or m.group('display_id_new') def download_json(internal_id): @@ -139,6 +161,15 @@ class RTSIE(InfoExtractor): # video_id extracted out of URL is not always a real id if 'video' not in all_info and 'audio' not in all_info: page = self._download_webpage(url, display_id) + + # article with videos on rhs + videos = re.findall( + r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:rts:video:(\d+)"', + page) + if videos: + entries = [self.url_result('rts:%s' % video_urn, 'RTS') for video_urn in videos] + return self.playlist_result(entries, video_id, self._og_search_title(page)) + internal_id = self._html_search_regex( r'<(?:video|audio) data-id="([0-9]+)"', page, 'internal video id') @@ -157,11 +188,27 @@ class RTSIE(InfoExtractor): return int_or_none(self._search_regex( r'-([0-9]+)k\.', url, 'bitrate', default=None)) - formats = [{ - 'format_id': fid, - 'url': furl, - 'tbr': extract_bitrate(furl), - } for fid, furl in info['streams'].items()] + formats = [] + for format_id, format_url in info['streams'].items(): + if format_url.endswith('.f4m'): + token = self._download_xml( + 'http://tp.srgssr.ch/token/akahd.xml?stream=%s/*' % compat_urllib_parse_urlparse(format_url).path, + video_id, 'Downloading %s token' % format_id) + auth_params = xpath_text(token, './/authparams', 'auth params') + if not auth_params: + continue + formats.extend(self._extract_f4m_formats( + '%s?%s&hdcore=3.4.0&plugin=aasp-3.4.0.132.66' % (format_url, auth_params), + video_id, f4m_id=format_id)) + elif format_url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id)) + else: + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'tbr': extract_bitrate(format_url), + }) if 'media' in info: formats.extend([{ @@ -170,6 +217,7 @@ class RTSIE(InfoExtractor): 'tbr': media['rate'] or extract_bitrate(media['url']), } for media in info['media'] if media.get('rate')]) + self._check_formats(formats, video_id) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 0ce22d60c..603d7bd00 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -7,13 +7,17 @@ import time from .common import InfoExtractor from ..utils import ( - struct_unpack, + ExtractorError, + float_or_none, remove_end, + sanitized_Request, + std_headers, + struct_unpack, ) def _decrypt_url(png): - encrypted_data = base64.b64decode(png) + encrypted_data = base64.b64decode(png.encode('utf-8')) text_index = encrypted_data.find(b'tEXt') text_chunk = encrypted_data[text_index - 4:] length = struct_unpack('!I', text_chunk[:4])[0] @@ -57,7 +61,7 @@ def _decrypt_url(png): class RTVEALaCartaIE(InfoExtractor): IE_NAME = 'rtve.es:alacarta' IE_DESC = 'RTVE a la carta' - _VALID_URL = r'http://www\.rtve\.es/alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' + _VALID_URL = r'http://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', @@ -66,6 +70,7 @@ class RTVEALaCartaIE(InfoExtractor): 'id': '2491869', 'ext': 'mp4', 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', + 'duration': 5024.566, }, }, { 'note': 'Live stream', @@ -74,18 +79,41 @@ class RTVEALaCartaIE(InfoExtractor): 'id': '1694255', 'ext': 'flv', 'title': 'TODO', - } + }, + 'skip': 'The f4m manifest can\'t be used yet', + }, { + 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', + 'only_matching': True, }] + def _real_initialize(self): + user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') + manager_info = self._download_json( + 'http://www.rtve.es/odin/loki/' + user_agent_b64, + None, 'Fetching manager info') + self._manager = manager_info['manager'] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') info = self._download_json( 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, video_id)['page']['items'][0] - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % video_id - png = self._download_webpage(png_url, video_id, 'Downloading url information') + if info['state'] == 'DESPU': + raise ExtractorError('The video is no longer available', expected=True) + png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) + png_request = sanitized_Request(png_url) + png_request.add_header('Referer', url) + png = self._download_webpage(png_request, video_id, 'Downloading url information') video_url = _decrypt_url(png) + if not video_url.endswith('.f4m'): + video_url = video_url.replace( + 'resources/', 'auth/resources/' + ).replace('.net.rtve', '.multimedia.cdn.rtve') + + subtitles = None + if info.get('sbtFile') is not None: + subtitles = self.extract_subtitles(video_id, info['sbtFile']) return { 'id': video_id, @@ -93,6 +121,57 @@ class RTVEALaCartaIE(InfoExtractor): 'url': video_url, 'thumbnail': info.get('image'), 'page_url': url, + 'subtitles': subtitles, + 'duration': float_or_none(info.get('duration'), scale=1000), + } + + def _get_subtitles(self, video_id, sub_file): + subs = self._download_json( + sub_file + '.json', video_id, + 'Downloading subtitles info')['page']['items'] + return dict( + (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) + for s in subs) + + +class RTVEInfantilIE(InfoExtractor): + IE_NAME = 'rtve.es:infantil' + IE_DESC = 'RTVE infantil' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P<show>[^/]*)/video/(?P<short_title>[^/]*)/(?P<id>[0-9]+)/' + + _TESTS = [{ + 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', + 'md5': '915319587b33720b8e0357caaa6617e6', + 'info_dict': { + 'id': '3040283', + 'ext': 'mp4', + 'title': 'Maneras de vivir', + 'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG', + 'duration': 357.958, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_json( + 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, + video_id)['page']['items'][0] + + webpage = self._download_webpage(url, video_id) + vidplayer_id = self._search_regex( + r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') + + png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id + png = self._download_webpage(png_url, video_id, 'Downloading url information') + video_url = _decrypt_url(png) + + return { + 'id': video_id, + 'ext': 'mp4', + 'title': info['title'], + 'url': video_url, + 'thumbnail': info.get('image'), + 'duration': float_or_none(info.get('duration'), scale=1000), } diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py new file mode 100644 index 000000000..7c9d4b0cd --- /dev/null +++ b/youtube_dl/extractor/rtvnh.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class RTVNHIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.rtvnh.nl/video/131946', + 'md5': '6e1d0ab079e2a00b6161442d3ceacfc1', + 'info_dict': { + 'id': '131946', + 'ext': 'mp4', + 'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw', + 'thumbnail': 're:^https?:.*\.jpg$' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + meta = self._parse_json(self._download_webpage( + 'http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id) + + status = meta.get('status') + if status != 200: + raise ExtractorError( + '%s returned error code %d' % (self.IE_NAME, status), expected=True) + + formats = self._extract_smil_formats( + 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False) + + for item in meta['source']['fb']: + if item.get('type') == 'hls': + formats.extend(self._extract_m3u8_formats( + item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) + elif item.get('type') == '': + formats.append({'url': item['file']}) + + return { + 'id': video_id, + 'title': meta['title'].strip(), + 'thumbnail': meta.get('image'), + 'formats': formats + } diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 6941d96fb..9db62adb1 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -5,19 +5,21 @@ import re import itertools from .common import InfoExtractor -from ..utils import ( +from ..compat import ( compat_str, +) +from ..utils import ( + determine_ext, unified_strdate, - ExtractorError, ) class RutubeIE(InfoExtractor): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' - _VALID_URL = r'https?://rutube\.ru/video/(?P<id>[\da-z]{32})' + _VALID_URL = r'https?://rutube\.ru/(?:video|play/embed)/(?P<id>[\da-z]{32})' - _TEST = { + _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', @@ -28,17 +30,19 @@ class RutubeIE(InfoExtractor): 'uploader': 'NTDRussian', 'uploader_id': '29790', 'upload_date': '20131016', + 'age_limit': 0, }, 'params': { # It requires ffmpeg (m3u8 download) 'skip_download': True, }, - } + }, { + 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) video = self._download_json( 'http://rutube.ru/api/video/%s/?format=json' % video_id, video_id, 'Downloading video JSON') @@ -50,10 +54,25 @@ class RutubeIE(InfoExtractor): 'http://rutube.ru/api/play/options/%s/?format=json' % video_id, video_id, 'Downloading options JSON') - m3u8_url = options['video_balancer'].get('m3u8') - if m3u8_url is None: - raise ExtractorError('Couldn\'t find m3u8 manifest url') - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + formats = [] + for format_id, format_url in options['video_balancer'].items(): + ext = determine_ext(format_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif ext == 'f4m': + f4m_formats = self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id, fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) return { 'id': video['id'], @@ -70,6 +89,40 @@ class RutubeIE(InfoExtractor): } +class RutubeEmbedIE(InfoExtractor): + IE_NAME = 'rutube:embed' + IE_DESC = 'Rutube embedded videos' + _VALID_URL = 'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)' + + _TESTS = [{ + 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', + 'info_dict': { + 'id': 'a10e53b86e8f349080f718582ce4c661', + 'ext': 'mp4', + 'upload_date': '20131223', + 'uploader_id': '297833', + 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', + 'uploader': 'subziro89 ILya', + 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', + }, + 'params': { + 'skip_download': 'Requires ffmpeg', + }, + }, { + 'url': 'http://rutube.ru/play/embed/8083783', + 'only_matching': True, + }] + + def _real_extract(self, url): + embed_id = self._match_id(url) + webpage = self._download_webpage(url, embed_id) + + canonical_url = self._html_search_regex( + r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage, + 'Canonical URL') + return self.url_result(canonical_url, 'Rutube') + + class RutubeChannelIE(InfoExtractor): IE_NAME = 'rutube:channel' IE_DESC = 'Rutube channels' @@ -114,8 +167,7 @@ class RutubeMovieIE(RutubeChannelIE): _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - movie_id = mobj.group('id') + movie_id = self._match_id(url) movie = self._download_json( self._MOVIE_TEMPLATE % movie_id, movie_id, 'Downloading movie JSON') diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index a73e6f331..f7fe1fece 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -84,18 +84,27 @@ class RUTVIE(InfoExtractor): 'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ', 'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c', }, + 'skip': 'Translation has finished', + }, + { + 'url': 'http://player.rutv.ru/iframe/live/id/21/showZoomBtn/false/isPlay/true/', + 'info_dict': { + 'id': '21', + 'ext': 'mp4', + 'title': 're:^Россия 24. Прямой эфир [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, - 'skip': 'Translation has finished', }, ] @classmethod def _extract_url(cls, webpage): mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) + r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) if mobj: return mobj.group('url') @@ -119,8 +128,10 @@ class RUTVIE(InfoExtractor): elif video_path.startswith('index/iframe/cast_id'): video_type = 'live' + is_live = video_type == 'live' + json_data = self._download_json( - 'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if video_type == 'live' else '', video_id), + 'http://player.rutv.ru/iframe/data%s/id/%s' % ('live' if is_live else 'video', video_id), video_id, 'Downloading JSON') if json_data['errors']: @@ -147,6 +158,7 @@ class RUTVIE(InfoExtractor): for transport, links in media['sources'].items(): for quality, url in links.items(): + preference = -1 if priority_transport == transport else -2 if transport == 'rtmp': mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url) if not mobj: @@ -160,12 +172,12 @@ class RUTVIE(InfoExtractor): 'rtmp_live': True, 'ext': 'flv', 'vbr': int(quality), + 'preference': preference, } elif transport == 'm3u8': - fmt = { - 'url': url, - 'ext': 'mp4', - } + formats.extend(self._extract_m3u8_formats( + url, video_id, 'mp4', preference=preference, m3u8_id='hls')) + continue else: fmt = { 'url': url @@ -174,21 +186,18 @@ class RUTVIE(InfoExtractor): 'width': width, 'height': height, 'format_id': '%s-%s' % (transport, quality), - 'preference': -1 if priority_transport == transport else -2, }) formats.append(fmt) - if not formats: - raise ExtractorError('No media links available for %s' % video_id) - self._sort_formats(formats) return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': description, 'thumbnail': thumbnail, 'view_count': view_count, 'duration': duration, 'formats': formats, + 'is_live': is_live, } diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py new file mode 100644 index 000000000..e417bf661 --- /dev/null +++ b/youtube_dl/extractor/ruutu.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse +from ..utils import ( + determine_ext, + int_or_none, + xpath_attr, + xpath_text, +) + + +class RuutuIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ruutu\.fi/video/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'http://www.ruutu.fi/video/2058907', + 'md5': 'ab2093f39be1ca8581963451b3c0234f', + 'info_dict': { + 'id': '2058907', + 'ext': 'mp4', + 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!', + 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 114, + 'age_limit': 0, + }, + }, + { + 'url': 'http://www.ruutu.fi/video/2057306', + 'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9', + 'info_dict': { + 'id': '2057306', + 'ext': 'mp4', + 'title': 'Superpesis: katso koko kausi Ruudussa', + 'description': 'md5:da2736052fef3b2bd5e0005e63c25eac', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 40, + 'age_limit': 0, + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_xml = self._download_xml( + 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id, video_id) + + formats = [] + processed_urls = [] + + def extract_formats(node): + for child in node: + if child.tag.endswith('Files'): + extract_formats(child) + elif child.tag.endswith('File'): + video_url = child.text + if (not video_url or video_url in processed_urls or + any(p in video_url for p in ('NOT_USED', 'NOT-USED'))): + return + processed_urls.append(video_url) + ext = determine_ext(video_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif ext == 'f4m': + f4m_formats = self._extract_f4m_formats( + video_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + else: + proto = compat_urllib_parse_urlparse(video_url).scheme + if not child.tag.startswith('HTTP') and proto != 'rtmp': + continue + preference = -1 if proto == 'rtmp' else 1 + label = child.get('label') + tbr = int_or_none(child.get('bitrate')) + width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]] + formats.append({ + 'format_id': '%s-%s' % (proto, label if label else tbr), + 'url': video_url, + 'width': width, + 'height': height, + 'tbr': tbr, + 'preference': preference, + }) + + extract_formats(video_xml.find('./Clip')) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True), + 'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'), + 'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'), + 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')), + 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py new file mode 100644 index 000000000..7de7b7273 --- /dev/null +++ b/youtube_dl/extractor/safari.py @@ -0,0 +1,156 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE + +from ..utils import ( + ExtractorError, + sanitized_Request, + smuggle_url, + std_headers, + urlencode_postdata, +) + + +class SafariBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' + _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>' + _NETRC_MACHINE = 'safari' + + _API_BASE = 'https://www.safaribooksonline.com/api/v1/book' + _API_FORMAT = 'json' + + LOGGED_IN = False + + def _real_initialize(self): + # We only need to log in once for courses or individual videos + if not self.LOGGED_IN: + self._login() + SafariBaseIE.LOGGED_IN = True + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + self.raise_login_required('safaribooksonline.com account is required') + + headers = std_headers + if 'Referer' not in headers: + headers['Referer'] = self._LOGIN_URL + + login_page = self._download_webpage( + self._LOGIN_URL, None, + 'Downloading login form') + + csrf = self._html_search_regex( + r"name='csrfmiddlewaretoken'\s+value='([^']+)'", + login_page, 'csrf token') + + login_form = { + 'csrfmiddlewaretoken': csrf, + 'email': username, + 'password1': password, + 'login': 'Sign In', + 'next': '', + } + + request = sanitized_Request( + self._LOGIN_URL, urlencode_postdata(login_form), headers=headers) + login_page = self._download_webpage( + request, None, 'Logging in as %s' % username) + + if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: + raise ExtractorError( + 'Login failed; make sure your credentials are correct and try again.', + expected=True) + + self.to_screen('Login successful') + + +class SafariIE(SafariBaseIE): + IE_NAME = 'safari' + IE_DESC = 'safaribooksonline.com online video' + _VALID_URL = r'''(?x)https?:// + (?:www\.)?safaribooksonline\.com/ + (?: + library/view/[^/]+| + api/v1/book + )/ + (?P<course_id>[^/]+)/ + (?:chapter(?:-content)?/)? + (?P<part>part\d+)\.html + ''' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', + 'md5': '5b0c4cc1b3c1ba15dda7344085aa5592', + 'info_dict': { + 'id': '2842601850001', + 'ext': 'mp4', + 'title': 'Introduction', + }, + 'skip': 'Requires safaribooksonline account credentials', + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', + 'only_matching': True, + }, { + # non-digits in course id + 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('course_id') + part = mobj.group('part') + + webpage = self._download_webpage( + '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part), + part) + + bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + if not bc_url: + raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True) + + return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy') + + +class SafariCourseIE(SafariBaseIE): + IE_NAME = 'safari:course' + IE_DESC = 'safaribooksonline.com online courses' + + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>[^/]+)/?(?:[#?]|$)' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', + 'info_dict': { + 'id': '9780133392838', + 'title': 'Hadoop Fundamentals LiveLessons', + }, + 'playlist_count': 22, + 'skip': 'Requires safaribooksonline account credentials', + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', + 'only_matching': True, + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + + course_json = self._download_json( + '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), + course_id, 'Downloading course JSON') + + if 'chapters' not in course_json: + raise ExtractorError( + 'No chapters found for course %s' % course_id, expected=True) + + entries = [ + self.url_result(chapter, 'Safari') + for chapter in course_json['chapters']] + + course_title = course_json['title'] + + return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py new file mode 100644 index 000000000..759898a49 --- /dev/null +++ b/youtube_dl/extractor/sandia.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import json +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + mimetype2ext, + sanitized_Request, + unified_strdate, +) + + +class SandiaIE(InfoExtractor): + IE_DESC = 'Sandia National Laboratories' + _VALID_URL = r'https?://digitalops\.sandia\.gov/Mediasite/Play/(?P<id>[0-9a-f]+)' + _TEST = { + 'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d', + 'md5': '9422edc9b9a60151727e4b6d8bef393d', + 'info_dict': { + 'id': '24aace4429fc450fb5b38cdbf424a66e1d', + 'ext': 'mp4', + 'title': 'Xyce Software Training - Section 1', + 'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}', + 'upload_date': '20120904', + 'duration': 7794, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + req = sanitized_Request(url) + req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4') + webpage = self._download_webpage(req, video_id) + + js_path = self._search_regex( + r'<script type="text/javascript" src="(/Mediasite/FileServer/Presentation/[^"]+)"', + webpage, 'JS code URL') + js_url = compat_urlparse.urljoin(url, js_path) + + js_code = self._download_webpage( + js_url, video_id, note='Downloading player') + + def extract_str(key, **args): + return self._search_regex( + r'Mediasite\.PlaybackManifest\.%s\s*=\s*(.+);\s*?\n' % re.escape(key), + js_code, key, **args) + + def extract_data(key, **args): + data_json = extract_str(key, **args) + if data_json is None: + return data_json + return self._parse_json( + data_json, video_id, transform_source=js_to_json) + + formats = [] + for i in itertools.count(): + fd = extract_data('VideoUrls[%d]' % i, default=None) + if fd is None: + break + formats.append({ + 'format_id': '%s' % i, + 'format_note': fd['MimeType'].partition('/')[2], + 'ext': mimetype2ext(fd['MimeType']), + 'url': fd['Location'], + 'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None, + }) + self._sort_formats(formats) + + slide_baseurl = compat_urlparse.urljoin( + url, extract_data('SlideBaseUrl')) + slide_template = slide_baseurl + re.sub( + r'\{0:D?([0-9+])\}', r'%0\1d', extract_data('SlideImageFileNameTemplate')) + slides = [] + last_slide_time = 0 + for i in itertools.count(1): + sd = extract_str('Slides[%d]' % i, default=None) + if sd is None: + break + timestamp = int_or_none(self._search_regex( + r'^Mediasite\.PlaybackManifest\.CreateSlide\("[^"]*"\s*,\s*([0-9]+),', + sd, 'slide %s timestamp' % i, fatal=False)) + slides.append({ + 'url': slide_template % i, + 'duration': timestamp - last_slide_time, + }) + last_slide_time = timestamp + formats.append({ + 'format_id': 'slides', + 'protocol': 'slideshow', + 'url': json.dumps(slides), + 'preference': -10000, # Downloader not yet written + }) + self._sort_formats(formats) + + title = extract_data('Title') + description = extract_data('Description', fatal=False) + duration = int_or_none(extract_data( + 'Duration', fatal=False), scale=1000) + upload_date = unified_strdate(extract_data('AirDate', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'upload_date': upload_date, + 'duration': duration, + } diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index b8775c2f9..d6ee2d9e2 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -1,18 +1,12 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import json -import re from .common import InfoExtractor -from ..utils import ( - js_to_json, - remove_end, -) class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/(?:single/)?(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P<id>[0-9]+)' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -22,38 +16,36 @@ class SBSIE(InfoExtractor): 'info_dict': { 'id': '320403011771', 'ext': 'mp4', - 'title': 'Dingo Conservation', - 'description': 'Dingoes are on the brink of extinction; most of the animals we think are dingoes are in fact crossbred with wild dogs. This family run a dingo conservation park to prevent their extinction', + 'title': 'Dingo Conservation (The Feed)', + 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', 'thumbnail': 're:http://.*\.jpg', + 'duration': 308, }, - 'add_ies': ['generic'], }, { 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', 'only_matching': True, + }, { + 'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + video_id = self._match_id(url) - release_urls_json = js_to_json(self._search_regex( - r'(?s)playerParams\.releaseUrls\s*=\s*(\{.*?\n\});\n', - webpage, '')) - release_urls = json.loads(release_urls_json) - theplatform_url = ( - release_urls.get('progressive') or release_urls.get('standard')) + webpage = self._download_webpage( + 'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id) - title = remove_end(self._og_search_title(webpage), ' (The Feed)') - description = self._html_search_meta('description', webpage) - thumbnail = self._og_search_thumbnail(webpage) + player_params = self._parse_json( + self._search_regex( + r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'), + video_id) + + urls = player_params['releaseUrls'] + theplatform_url = (urls.get('progressive') or urls.get('standard') or + urls.get('html') or player_params['relatedItemsURL']) return { '_type': 'url_transparent', 'id': video_id, 'url': theplatform_url, - - 'title': title, - 'description': description, - 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index c145f6fc7..dfd897ba3 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -1,14 +1,14 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - ExtractorError, +from ..compat import ( compat_parse_qs, compat_urllib_request, ) +from ..utils import ( + ExtractorError, +) class ScreencastIE(InfoExtractor): @@ -57,8 +57,7 @@ class ScreencastIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._html_search_regex( diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py new file mode 100644 index 000000000..05337421c --- /dev/null +++ b/youtube_dl/extractor/screencastomatic.py @@ -0,0 +1,49 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + ExtractorError, + js_to_json, +) + + +class ScreencastOMaticIE(InfoExtractor): + _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P<id>[0-9a-zA-Z]+)' + _TEST = { + 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', + 'md5': '483583cb80d92588f15ccbedd90f0c18', + 'info_dict': { + 'id': 'c2lD3BeOPl', + 'ext': 'mp4', + 'title': 'Welcome to 3-4 Philosophy @ DECV!', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + setup_js = self._search_regex( + r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", + webpage, 'setup code') + data = self._parse_json(setup_js, video_id, transform_source=js_to_json) + try: + video_data = next( + m for m in data['modes'] if m.get('type') == 'html5') + except StopIteration: + raise ExtractorError('Could not find any video entries!') + video_url = compat_urlparse.urljoin(url, video_data['config']['file']) + thumbnail = data.get('image') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'url': video_url, + 'ext': 'mp4', + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py new file mode 100644 index 000000000..05f93904c --- /dev/null +++ b/youtube_dl/extractor/screenwavemedia.py @@ -0,0 +1,140 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + js_to_json, +) + + +class ScreenwaveMediaIE(InfoExtractor): + _VALID_URL = r'https?://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)' + EMBED_PATTERN = r'src=(["\'])(?P<url>(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=.+?)\1' + _TESTS = [{ + 'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + playerdata = self._download_webpage( + 'http://player.screenwavemedia.com/player.php?id=%s' % video_id, + video_id, 'Downloading player webpage') + + vidtitle = self._search_regex( + r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/') + + playerconfig = self._download_webpage( + 'http://player.screenwavemedia.com/player.js', + video_id, 'Downloading playerconfig webpage') + + videoserver = self._search_regex(r'SWMServer\s*=\s*"([\d\.]+)"', playerdata, 'videoserver') + + sources = self._parse_json( + js_to_json( + re.sub( + r'(?s)/\*.*?\*/', '', + self._search_regex( + r"sources\s*:\s*(\[[^\]]+?\])", playerconfig, + 'sources', + ).replace( + "' + thisObj.options.videoserver + '", + videoserver + ).replace( + "' + playerVidId + '", + video_id + ) + ) + ), + video_id, fatal=False + ) + + # Fallback to hardcoded sources if JS changes again + if not sources: + self.report_warning('Falling back to a hardcoded list of streams') + sources = [{ + 'file': 'http://%s/vod/%s_%s.mp4' % (videoserver, video_id, format_id), + 'type': 'mp4', + 'label': format_label, + } for format_id, format_label in ( + ('low', '144p Low'), ('med', '160p Med'), ('high', '360p High'), ('hd1', '720p HD1'))] + sources.append({ + 'file': 'http://%s/vod/smil:%s.smil/playlist.m3u8' % (videoserver, video_id), + 'type': 'hls', + }) + + formats = [] + for source in sources: + if source['type'] == 'hls': + formats.extend(self._extract_m3u8_formats(source['file'], video_id)) + else: + file_ = source.get('file') + if not file_: + continue + format_label = source.get('label') + format_id = self._search_regex( + r'_(.+?)\.[^.]+$', file_, 'format id', default=None) + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_label, 'height', default=None)) + formats.append({ + 'url': source['file'], + 'format_id': format_id, + 'format': format_label, + 'ext': source.get('type'), + 'height': height, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': vidtitle, + 'formats': formats, + } + + +class TeamFourIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/video/(?P<id>[a-z0-9\-]+)/?' + _TEST = { + 'url': 'http://teamfourstar.com/video/a-moment-with-tfs-episode-4/', + 'info_dict': { + 'id': 'TeamFourStar-5292a02f20bfa', + 'ext': 'mp4', + 'upload_date': '20130401', + 'description': 'Check out this and more on our website: http://teamfourstar.com\nTFS Store: http://sharkrobot.com/team-four-star\nFollow on Twitter: http://twitter.com/teamfourstar\nLike on FB: http://facebook.com/teamfourstar', + 'title': 'A Moment With TFS Episode 4', + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + playerdata_url = self._search_regex( + r'src="(http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', + webpage, 'player data URL') + + video_title = self._html_search_regex( + r'<div class="heroheadingtitle">(?P<title>.+?)</div>', + webpage, 'title') + video_date = unified_strdate(self._html_search_regex( + r'<div class="heroheadingdate">(?P<date>.+?)</div>', + webpage, 'date', fatal=False)) + video_description = self._html_search_regex( + r'(?s)<div class="postcontent">(?P<description>.+?)</div>', + webpage, 'description', fatal=False) + video_thumbnail = self._og_search_thumbnail(webpage) + + return { + '_type': 'url_transparent', + 'display_id': display_id, + 'title': video_title, + 'description': video_description, + 'upload_date': video_date, + 'thumbnail': video_thumbnail, + 'url': playerdata_url, + } diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py new file mode 100644 index 000000000..474ebb49b --- /dev/null +++ b/youtube_dl/extractor/senateisvp.py @@ -0,0 +1,145 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + unsmuggle_url, +) +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) + + +class SenateISVPIE(InfoExtractor): + _COMM_MAP = [ + ["ag", "76440", "http://ag-f.akamaihd.net"], + ["aging", "76442", "http://aging-f.akamaihd.net"], + ["approps", "76441", "http://approps-f.akamaihd.net"], + ["armed", "76445", "http://armed-f.akamaihd.net"], + ["banking", "76446", "http://banking-f.akamaihd.net"], + ["budget", "76447", "http://budget-f.akamaihd.net"], + ["cecc", "76486", "http://srs-f.akamaihd.net"], + ["commerce", "80177", "http://commerce1-f.akamaihd.net"], + ["csce", "75229", "http://srs-f.akamaihd.net"], + ["dpc", "76590", "http://dpc-f.akamaihd.net"], + ["energy", "76448", "http://energy-f.akamaihd.net"], + ["epw", "76478", "http://epw-f.akamaihd.net"], + ["ethics", "76449", "http://ethics-f.akamaihd.net"], + ["finance", "76450", "http://finance-f.akamaihd.net"], + ["foreign", "76451", "http://foreign-f.akamaihd.net"], + ["govtaff", "76453", "http://govtaff-f.akamaihd.net"], + ["help", "76452", "http://help-f.akamaihd.net"], + ["indian", "76455", "http://indian-f.akamaihd.net"], + ["intel", "76456", "http://intel-f.akamaihd.net"], + ["intlnarc", "76457", "http://intlnarc-f.akamaihd.net"], + ["jccic", "85180", "http://jccic-f.akamaihd.net"], + ["jec", "76458", "http://jec-f.akamaihd.net"], + ["judiciary", "76459", "http://judiciary-f.akamaihd.net"], + ["rpc", "76591", "http://rpc-f.akamaihd.net"], + ["rules", "76460", "http://rules-f.akamaihd.net"], + ["saa", "76489", "http://srs-f.akamaihd.net"], + ["smbiz", "76461", "http://smbiz-f.akamaihd.net"], + ["srs", "75229", "http://srs-f.akamaihd.net"], + ["uscc", "76487", "http://srs-f.akamaihd.net"], + ["vetaff", "76462", "http://vetaff-f.akamaihd.net"], + ["arch", "", "http://ussenate-f.akamaihd.net/"] + ] + _IE_NAME = 'senate.gov' + _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)' + _TESTS = [{ + 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', + 'info_dict': { + 'id': 'judiciary031715', + 'ext': 'flv', + 'title': 'Integrated Senate Video Player', + 'thumbnail': 're:^https?://.*\.(?:jpg|png)$', + } + }, { + 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', + 'info_dict': { + 'id': 'commerce011514', + 'ext': 'flv', + 'title': 'Integrated Senate Video Player' + } + }, { + 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', + # checksum differs each time + 'info_dict': { + 'id': 'intel090613', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + } + }, { + # From http://www.c-span.org/video/?96791-1 + 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', + 'only_matching': True, + }] + + @staticmethod + def _search_iframe_url(webpage): + mobj = re.search( + r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", + webpage) + if mobj: + return mobj.group('url') + + def _get_info_for_comm(self, committee): + for entry in self._COMM_MAP: + if entry[0] == committee: + return entry[1:] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs')) + if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): + raise ExtractorError('Invalid URL', expected=True) + + video_id = re.sub(r'.mp4$', '', qs['filename'][0]) + + webpage = self._download_webpage(url, video_id) + + if smuggled_data.get('force_title'): + title = smuggled_data['force_title'] + else: + title = self._html_search_regex(r'<title>([^<]+)', webpage, video_id) + poster = qs.get('poster') + thumbnail = poster[0] if poster else None + + video_type = qs['type'][0] + committee = video_type if video_type == 'arch' else qs['comm'][0] + stream_num, domain = self._get_info_for_comm(committee) + + formats = [] + if video_type == 'arch': + filename = video_id if '.' in video_id else video_id + '.mp4' + formats = [{ + # All parameters in the query string are necessary to prevent a 403 error + 'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=', + }] + else: + hdcore_sign = 'hdcore=3.1.0' + url_params = (domain, video_id, stream_num) + f4m_url = '%s/z/%s_1@%s/manifest.f4m?' % url_params + hdcore_sign + m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params + for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): + # URLs without the extra param induce an 404 error + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.append(entry) + for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'): + mobj = re.search(r'(?P(?:-p|-b)).m3u8', entry['url']) + if mobj: + entry['format_id'] += mobj.group('tag') + formats.append(entry) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/servingsys.py b/youtube_dl/extractor/servingsys.py index 16dc3736b..c013d678f 100644 --- a/youtube_dl/extractor/servingsys.py +++ b/youtube_dl/extractor/servingsys.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -13,10 +11,15 @@ class ServingSysIE(InfoExtractor): _TEST = { 'url': 'http://bs.serving-sys.com/BurstingPipe/adServer.bs?cn=is&c=23&pl=VAST&pli=5349193&PluID=0&pos=7135&ord=[timestamp]&cim=1?', + 'info_dict': { + 'id': '5349193', + 'title': 'AdAPPter_Hyundai_demo', + }, 'playlist': [{ - 'file': '29955898.flv', 'md5': 'baed851342df6846eb8677a60a011a0f', 'info_dict': { + 'id': '29955898', + 'ext': 'flv', 'title': 'AdAPPter_Hyundai_demo (1)', 'duration': 74, 'tbr': 1378, @@ -24,9 +27,10 @@ class ServingSysIE(InfoExtractor): 'height': 400, }, }, { - 'file': '29907998.flv', 'md5': '979b4da2655c4bc2d81aeb915a8c5014', 'info_dict': { + 'id': '29907998', + 'ext': 'flv', 'title': 'AdAPPter_Hyundai_demo (2)', 'duration': 34, 'width': 854, @@ -37,14 +41,13 @@ class ServingSysIE(InfoExtractor): 'params': { 'playlistend': 2, }, - 'skip': 'Blocked in the US [sic]', + '_skip': 'Blocked in the US [sic]', } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - pl_id = mobj.group('id') - + pl_id = self._match_id(url) vast_doc = self._download_xml(url, pl_id) + title = vast_doc.find('.//AdTitle').text media = vast_doc.find('.//MediaFile').text info_url = self._search_regex(r'&adData=([^&]+)&', media, 'info URL') diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py index c833fc8ee..e33483674 100644 --- a/youtube_dl/extractor/sexykarma.py +++ b/youtube_dl/extractor/sexykarma.py @@ -24,11 +24,12 @@ class SexyKarmaIE(InfoExtractor): 'title': 'Taking a quick pee.', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'wildginger7', - 'upload_date': '20141007', + 'upload_date': '20141008', 'duration': 22, 'view_count': int, 'comment_count': int, 'categories': list, + 'age_limit': 18, } }, { 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', @@ -45,6 +46,7 @@ class SexyKarmaIE(InfoExtractor): 'view_count': int, 'comment_count': int, 'categories': list, + 'age_limit': 18, } }, { 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', @@ -61,6 +63,7 @@ class SexyKarmaIE(InfoExtractor): 'view_count': int, 'comment_count': int, 'categories': list, + 'age_limit': 18, } }] @@ -114,4 +117,5 @@ class SexyKarmaIE(InfoExtractor): 'view_count': view_count, 'comment_count': comment_count, 'categories': categories, + 'age_limit': 18, } diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py new file mode 100644 index 000000000..f76fb12c0 --- /dev/null +++ b/youtube_dl/extractor/shahid.py @@ -0,0 +1,107 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) + + +class ShahidIE(InfoExtractor): + _VALID_URL = r'https?://shahid\.mbc\.net/ar/episode/(?P\d+)/?' + _TESTS = [{ + 'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html', + 'info_dict': { + 'id': '90574', + 'ext': 'mp4', + 'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3', + 'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان', + 'duration': 2972, + 'timestamp': 1422057420, + 'upload_date': '20150123', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + # shahid plus subscriber only + 'url': 'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html', + 'only_matching': True + }] + + def _handle_error(self, response): + if not isinstance(response, dict): + return + error = response.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), + expected=True) + + def _download_json(self, url, video_id, note='Downloading JSON metadata'): + response = super(ShahidIE, self)._download_json(url, video_id, note)['data'] + self._handle_error(response) + return response + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + api_vars = { + 'id': video_id, + 'type': 'player', + 'url': 'http://api.shahid.net/api/v1_1', + 'playerType': 'episode', + } + + flashvars = self._search_regex( + r'var\s+flashvars\s*=\s*({[^}]+})', webpage, 'flashvars', default=None) + if flashvars: + for key in api_vars.keys(): + value = self._search_regex( + r'\b%s\s*:\s*(?P["\'])(?P.+?)(?P=q)' % key, + flashvars, 'type', default=None, group='value') + if value: + api_vars[key] = value + + player = self._download_json( + 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html' + % (video_id, api_vars['type']), video_id, 'Downloading player JSON') + + formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4') + + video = self._download_json( + '%s/%s/%s?%s' % ( + api_vars['url'], api_vars['playerType'], api_vars['id'], + compat_urllib_parse.urlencode({ + 'apiKey': 'sh@hid0nlin3', + 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', + })), + video_id, 'Downloading video JSON') + + video = video[api_vars['playerType']] + + title = video['title'] + description = video.get('description') + thumbnail = video.get('thumbnailUrl') + duration = int_or_none(video.get('duration')) + timestamp = parse_iso8601(video.get('referenceDate')) + categories = [ + category['name'] + for category in video.get('genres', []) if 'name' in category] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'categories': categories, + 'formats': formats, + } diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index fdc31603a..8eda3c864 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -1,51 +1,64 @@ from __future__ import unicode_literals -import re import base64 from .common import InfoExtractor +from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, - compat_urllib_request, - compat_urllib_parse, int_or_none, + sanitized_Request, ) class SharedIE(InfoExtractor): - _VALID_URL = r'http://shared\.sx/(?P[\da-z]{10})' + IE_DESC = 'shared.sx and vivo.sx' + _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P[\da-z]{10})' - _TEST = { + _TESTS = [{ 'url': 'http://shared.sx/0060718775', 'md5': '106fefed92a8a2adb8c98e6a0652f49b', 'info_dict': { 'id': '0060718775', 'ext': 'mp4', 'title': 'Bmp4', + 'filesize': 1720110, }, - } + }, { + 'url': 'http://vivo.sx/d7ddda0e78', + 'md5': '15b3af41be0b4fe01f4df075c2678b2c', + 'info_dict': { + 'id': 'd7ddda0e78', + 'ext': 'mp4', + 'title': 'Chicken', + 'filesize': 528031, + }, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - page = self._download_webpage(url, video_id) + if '>File does not exist<' in webpage: + raise ExtractorError( + 'Video %s does not exist' % video_id, expected=True) - if re.search(r'>File does not exist<', page) is not None: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - download_form = dict(re.findall(r'\d+).html)|(.*?(\#|(vid=)|b/)(?P\d+?)($|&|\-)))) | @@ -23,9 +21,10 @@ class SinaIE(InfoExtractor): _TESTS = [ { 'url': 'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898', - 'file': '110028898.flv', 'md5': 'd65dd22ddcf44e38ce2bf58a10c3e71f', 'info_dict': { + 'id': '110028898', + 'ext': 'flv', 'title': '《中国新闻》 朝鲜要求巴拿马立即释放被扣船员', } }, @@ -39,10 +38,6 @@ class SinaIE(InfoExtractor): }, ] - @classmethod - def suitable(cls, url): - return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None - def _extract_video(self, video_id): data = compat_urllib_parse.urlencode({'vid': video_id}) url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data, @@ -59,12 +54,12 @@ class SinaIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) + mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') if mobj.group('token') is not None: # The video id is in the redirected url self.to_screen('Getting video id') - request = compat_urllib_request.Request(url) + request = sanitized_Request(url) request.get_method = lambda: 'HEAD' (_, urlh) = self._download_webpage_handle(request, 'NA', False) return self._real_extract(urlh.geturl()) diff --git a/youtube_dl/extractor/skynewsarabia.py b/youtube_dl/extractor/skynewsarabia.py new file mode 100644 index 000000000..05e1b02ad --- /dev/null +++ b/youtube_dl/extractor/skynewsarabia.py @@ -0,0 +1,117 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + parse_iso8601, + parse_duration, +) + + +class SkyNewsArabiaBaseIE(InfoExtractor): + _IMAGE_BASE_URL = 'http://www.skynewsarabia.com/web/images' + + def _call_api(self, path, value): + return self._download_json('http://api.skynewsarabia.com/web/rest/v2/%s/%s.json' % (path, value), value) + + def _get_limelight_media_id(self, url): + return self._search_regex(r'/media/[^/]+/([a-z0-9]{32})', url, 'limelight media id') + + def _get_image_url(self, image_path_template, width='1600', height='1200'): + return self._IMAGE_BASE_URL + image_path_template.format(width=width, height=height) + + def _extract_video_info(self, video_data): + video_id = compat_str(video_data['id']) + topic = video_data.get('topicTitle') + return { + '_type': 'url_transparent', + 'url': 'limelight:media:%s' % self._get_limelight_media_id(video_data['videoUrl'][0]['url']), + 'id': video_id, + 'title': video_data['headline'], + 'description': video_data.get('summary'), + 'thumbnail': self._get_image_url(video_data['mediaAsset']['imageUrl']), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('runTime')), + 'tags': video_data.get('tags', []), + 'categories': [topic] if topic else [], + 'webpage_url': 'http://www.skynewsarabia.com/web/video/%s' % video_id, + 'ie_key': 'LimelightMedia', + } + + +class SkyNewsArabiaIE(SkyNewsArabiaBaseIE): + IE_NAME = 'skynewsarabia:video' + _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/video/(?P[0-9]+)' + _TEST = { + 'url': 'http://www.skynewsarabia.com/web/video/794902/%D9%86%D8%B5%D9%81-%D9%85%D9%84%D9%8A%D9%88%D9%86-%D9%85%D8%B5%D8%A8%D8%A7%D8%AD-%D8%B4%D8%AC%D8%B1%D8%A9-%D9%83%D8%B1%D9%8A%D8%B3%D9%85%D8%A7%D8%B3', + 'info_dict': { + 'id': '794902', + 'ext': 'flv', + 'title': 'نصف مليون مصباح على شجرة كريسماس', + 'description': 'md5:22f1b27f0850eeb10c7e59b1f16eb7c6', + 'upload_date': '20151128', + 'timestamp': 1448697198, + 'duration': 2119, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._call_api('video', video_id) + return self._extract_video_info(video_data) + + +class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE): + IE_NAME = 'skynewsarabia:video' + _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9', + 'info_dict': { + 'id': '794549', + 'ext': 'flv', + 'title': 'بالفيديو.. ألعاب ذكية تحاكي واقع المنطقة', + 'description': 'md5:0c373d29919a851e080ee4edd0c5d97f', + 'upload_date': '20151126', + 'timestamp': 1448559336, + 'duration': 281.6, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.skynewsarabia.com/web/article/794844/%D8%A7%D8%B3%D8%AA%D9%87%D8%AF%D8%A7%D9%81-%D9%82%D9%88%D8%A7%D8%B1%D8%A8-%D8%A7%D9%94%D8%B3%D9%84%D8%AD%D8%A9-%D9%84%D9%85%D9%8A%D9%84%D9%8A%D8%B4%D9%8A%D8%A7%D8%AA-%D8%A7%D9%84%D8%AD%D9%88%D8%AB%D9%8A-%D9%88%D8%B5%D8%A7%D9%84%D8%AD', + 'info_dict': { + 'id': '794844', + 'title': 'إحباط تهريب أسلحة لميليشيات الحوثي وصالح بجنوب اليمن', + 'description': 'md5:5c927b8b2e805796e7f693538d96fc7e', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + article_data = self._call_api('article', article_id) + media_asset = article_data['mediaAsset'] + if media_asset['type'] == 'VIDEO': + topic = article_data.get('topicTitle') + return { + '_type': 'url_transparent', + 'url': 'limelight:media:%s' % self._get_limelight_media_id(media_asset['videoUrl'][0]['url']), + 'id': article_id, + 'title': article_data['headline'], + 'description': article_data.get('summary'), + 'thumbnail': self._get_image_url(media_asset['imageUrl']), + 'timestamp': parse_iso8601(article_data.get('date')), + 'tags': article_data.get('tags', []), + 'categories': [topic] if topic else [], + 'webpage_url': url, + 'ie_key': 'LimelightMedia', + } + entries = [self._extract_video_info(item) for item in article_data.get('inlineItems', []) if item['type'] == 'VIDEO'] + return self.playlist_result(entries, article_id, article_data['headline'], article_data.get('summary')) diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 5864b9936..0b717a1e4 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -4,8 +4,10 @@ import re import json from .common import InfoExtractor -from ..utils import ( +from ..compat import ( compat_urlparse, +) +from ..utils import ( ExtractorError, ) @@ -28,7 +30,7 @@ class SlideshareIE(InfoExtractor): page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) slideshare_obj = self._search_regex( - r'var slideshare_object = ({.*?}); var user_info =', + r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);', webpage, 'slideshare object') info = json.loads(slideshare_obj) if info['slideshow']['type'] != 'video': @@ -39,7 +41,7 @@ class SlideshareIE(InfoExtractor): ext = info['jsplayer']['video_extension'] video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext) description = self._html_search_regex( - r']*>(.*?)

', webpage, + r'(?s)]+itemprop="description"[^>]*>(.+?)

', webpage, 'description', fatal=False) return { diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 0751efc61..30210c8a3 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -7,11 +7,11 @@ import hashlib import uuid from .common import InfoExtractor +from ..compat import compat_urllib_parse from ..utils import ( - compat_urllib_parse, - compat_urllib_request, ExtractorError, int_or_none, + sanitized_Request, unified_strdate, ) @@ -51,7 +51,7 @@ class SmotriIE(InfoExtractor): 'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', }, }, - # video-password + # video-password, not approved by moderator { 'url': 'http://smotri.com/video/view/?id=v1390466a13c', 'md5': 'f6331cef33cad65a0815ee482a54440b', @@ -67,8 +67,26 @@ class SmotriIE(InfoExtractor): 'params': { 'videopassword': 'qwerty', }, + 'skip': 'Video is not approved by moderator', }, - # age limit + video-password + # video-password + { + 'url': 'http://smotri.com/video/view/?id=v6984858774#', + 'md5': 'f11e01d13ac676370fc3b95b9bda11b0', + 'info_dict': { + 'id': 'v6984858774', + 'ext': 'mp4', + 'title': 'Дача Солженицина ПАРОЛЬ 223322', + 'uploader': 'psavari1', + 'uploader_id': 'psavari1', + 'upload_date': '20081103', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + 'videopassword': '223322', + }, + }, + # age limit + video-password, not approved by moderator { 'url': 'http://smotri.com/video/view/?id=v15408898bcf', 'md5': '91e909c9f0521adf5ee86fbe073aad70', @@ -84,12 +102,31 @@ class SmotriIE(InfoExtractor): }, 'params': { 'videopassword': '333' - } + }, + 'skip': 'Video is not approved by moderator', + }, + # age limit + video-password + { + 'url': 'http://smotri.com/video/view/?id=v7780025814', + 'md5': 'b4599b068422559374a59300c5337d72', + 'info_dict': { + 'id': 'v7780025814', + 'ext': 'mp4', + 'title': 'Sexy Beach (пароль 123)', + 'uploader': 'вАся', + 'uploader_id': 'asya_prosto', + 'upload_date': '20081218', + 'thumbnail': 're:^https?://.*\.jpg$', + 'age_limit': 18, + }, + 'params': { + 'videopassword': '123' + }, }, # swf player { 'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500', - 'md5': '4d47034979d9390d14acdf59c4935bc2', + 'md5': '31099eeb4bc906712c5f40092045108d', 'info_dict': { 'id': 'v9188090500', 'ext': 'mp4', @@ -120,9 +157,6 @@ class SmotriIE(InfoExtractor): def _search_meta(self, name, html, display_name=None): if display_name is None: display_name = name - return self._html_search_regex( - r'' % re.escape(name), - html, display_name, fatal=False) return self._html_search_meta(name, html, display_name) def _real_extract(self, url): @@ -136,19 +170,31 @@ class SmotriIE(InfoExtractor): 'getvideoinfo': '1', } - request = compat_urllib_request.Request( + video_password = self._downloader.params.get('videopassword', None) + if video_password: + video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() + + request = sanitized_Request( 'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') video = self._download_json(request, video_id, 'Downloading video JSON') - if video.get('_moderate_no') or not video.get('moderated'): - raise ExtractorError('Video %s has not been approved by moderator' % video_id, expected=True) - - if video.get('error'): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - video_url = video.get('_vidURL') or video.get('_vidURL_mp4') + + if not video_url: + if video.get('_moderate_no'): + raise ExtractorError( + 'Video %s has not been approved by moderator' % video_id, expected=True) + + if video.get('error'): + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + if video.get('_pass_protected') == 1: + msg = ('Invalid video password' if video_password + else 'This video is protected by a password, use the --video-password option') + raise ExtractorError(msg, expected=True) + title = video['title'] thumbnail = video['_imgURL'] upload_date = unified_strdate(video['added']) @@ -274,15 +320,15 @@ class SmotriBroadcastIE(InfoExtractor): broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') if re.search('>Режиссер с логином
"%s"
не существует<' % broadcast_id, broadcast_page) is not None: - raise ExtractorError('Broadcast %s does not exist' % broadcast_id, expected=True) + raise ExtractorError( + 'Broadcast %s does not exist' % broadcast_id, expected=True) # Adult content if re.search('EroConfirmText">', broadcast_page) is not None: (username, password) = self._get_login_info() if username is None: - raise ExtractorError('Erotic broadcasts allowed only for registered users, ' - 'use --username and --password options to provide account credentials.', expected=True) + self.raise_login_required('Erotic broadcasts allowed only for registered users') login_form = { 'login-hint53': '1', @@ -291,9 +337,11 @@ class SmotriBroadcastIE(InfoExtractor): 'password': password, } - request = compat_urllib_request.Request(broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form)) + request = sanitized_Request( + broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') - broadcast_page = self._download_webpage(request, broadcast_id, 'Logging in and confirming age') + broadcast_page = self._download_webpage( + request, broadcast_id, 'Logging in and confirming age') if re.search('>Неверный логин или пароль<', broadcast_page) is not None: raise ExtractorError('Unable to log in: bad username or password', expected=True) @@ -303,7 +351,7 @@ class SmotriBroadcastIE(InfoExtractor): adult_content = False ticket = self._html_search_regex( - 'window\.broadcast_control\.addFlashVar\\(\'file\', \'([^\']+)\'\\);', + r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'([^']+)'\)", broadcast_page, 'broadcast ticket') url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket @@ -312,26 +360,31 @@ class SmotriBroadcastIE(InfoExtractor): if broadcast_password: url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() - broadcast_json_page = self._download_webpage(url, broadcast_id, 'Downloading broadcast JSON') + broadcast_json_page = self._download_webpage( + url, broadcast_id, 'Downloading broadcast JSON') try: broadcast_json = json.loads(broadcast_json_page) protected_broadcast = broadcast_json['_pass_protected'] == 1 if protected_broadcast and not broadcast_password: - raise ExtractorError('This broadcast is protected by a password, use the --video-password option', expected=True) + raise ExtractorError( + 'This broadcast is protected by a password, use the --video-password option', + expected=True) broadcast_offline = broadcast_json['is_play'] == 0 if broadcast_offline: raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True) rtmp_url = broadcast_json['_server'] - if not rtmp_url.startswith('rtmp://'): + mobj = re.search(r'^rtmp://[^/]+/(?P.+)/?$', rtmp_url) + if not mobj: raise ExtractorError('Unexpected broadcast rtmp URL') broadcast_playpath = broadcast_json['_streamName'] + broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL']) broadcast_thumbnail = broadcast_json['_imgURL'] - broadcast_title = broadcast_json['title'] + broadcast_title = self._live_title(broadcast_json['title']) broadcast_description = broadcast_json['description'] broadcaster_nick = broadcast_json['nick'] broadcaster_login = broadcast_json['login'] @@ -352,6 +405,9 @@ class SmotriBroadcastIE(InfoExtractor): 'age_limit': 18 if adult_content else 0, 'ext': 'flv', 'play_path': broadcast_playpath, + 'player_url': 'http://pics.smotri.com/broadcast_play.swf', + 'app': broadcast_app, 'rtmp_live': True, - 'rtmp_conn': rtmp_conn + 'rtmp_conn': rtmp_conn, + 'is_live': True, } diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py new file mode 100644 index 000000000..6977afb27 --- /dev/null +++ b/youtube_dl/extractor/snagfilms.py @@ -0,0 +1,181 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + int_or_none, + js_to_json, + parse_duration, +) + + +class SnagFilmsEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P[\da-f-]{36})' + _TESTS = [{ + 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', + 'md5': '2924e9215c6eff7a55ed35b72276bd93', + 'info_dict': { + 'id': '74849a00-85a9-11e1-9660-123139220831', + 'ext': 'mp4', + 'title': '#whilewewatch', + } + }, { + # invalid labels, 360p is better that 480p + 'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036', + 'md5': '882fca19b9eb27ef865efeeaed376a48', + 'info_dict': { + 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036', + 'ext': 'mp4', + 'title': 'Life in Limbo', + } + }, { + 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+?src=(["\'])(?P(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + if '>This film is not playable in your area.<' in webpage: + raise ExtractorError( + 'Film %s is not playable in your area.' % video_id, expected=True) + + formats = [] + for source in self._parse_json(js_to_json(self._search_regex( + r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): + file_ = source.get('file') + if not file_: + continue + type_ = source.get('type') + ext = determine_ext(file_) + format_id = source.get('label') or ext + if all(v == 'm3u8' for v in (type_, ext)): + formats.extend(self._extract_m3u8_formats( + file_, video_id, 'mp4', m3u8_id='hls')) + else: + bitrate = int_or_none(self._search_regex( + [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], + file_, 'bitrate', default=None)) + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)) + formats.append({ + 'url': file_, + 'format_id': format_id, + 'tbr': bitrate, + 'height': height, + }) + self._sort_formats(formats) + + title = self._search_regex( + [r"title\s*:\s*'([^']+)'", r'([^<]+)'], + webpage, 'title') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } + + +class SnagFilmsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P[^?#]+)' + _TESTS = [{ + 'url': 'http://www.snagfilms.com/films/title/lost_for_life', + 'md5': '19844f897b35af219773fd63bdec2942', + 'info_dict': { + 'id': '0000014c-de2f-d5d6-abcf-ffef58af0017', + 'display_id': 'lost_for_life', + 'ext': 'mp4', + 'title': 'Lost for Life', + 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 4489, + 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals'] + } + }, { + 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', + 'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd', + 'info_dict': { + 'id': '00000145-d75c-d96e-a9c7-ff5c67b20000', + 'display_id': 'the_world_cut_project/india', + 'ext': 'mp4', + 'title': 'India', + 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 979, + 'categories': ['Documentary', 'Sports', 'Politics'] + } + }, { + # Film is not playable in your area. + 'url': 'http://www.snagfilms.com/films/title/inside_mecca', + 'only_matching': True, + }, { + # Film is not available. + 'url': 'http://www.snagfilms.com/show/augie_alone/flirting', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + if ">Sorry, the Film you're looking for is not available.<" in webpage: + raise ExtractorError( + 'Film %s is not available.' % display_id, expected=True) + + film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') + + snag = self._parse_json( + self._search_regex( + 'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'), + display_id) + + for item in snag: + if item.get('data', {}).get('film', {}).get('id') == film_id: + data = item['data']['film'] + title = data['title'] + description = clean_html(data.get('synopsis')) + thumbnail = data.get('image') + duration = int_or_none(data.get('duration') or data.get('runtime')) + categories = [ + category['title'] for category in data.get('categories', []) + if category.get('title')] + break + else: + title = self._search_regex( + r'itemprop="title">([^<]+)<', webpage, 'title') + description = self._html_search_regex( + r'(?s)
(.+?)
', + webpage, 'description', default=None) or self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'([^<]+)<', + webpage, 'duration', fatal=False)) + categories = re.findall(r'
([^<]+)', webpage) + + return { + '_type': 'url_transparent', + 'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id, + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'categories': categories, + } diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py deleted file mode 100644 index c663e56d4..000000000 --- a/youtube_dl/extractor/sockshare.py +++ /dev/null @@ -1,83 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from ..utils import ( - ExtractorError, - compat_urllib_parse, - compat_urllib_request, - determine_ext, -) -import re - -from .common import InfoExtractor - - -class SockshareIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P[0-9A-Za-z]+)' - _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.

' - _TEST = { - 'url': 'http://www.sockshare.com/file/437BE28B89D799D7', - 'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd', - 'info_dict': { - 'id': '437BE28B89D799D7', - 'title': 'big_buck_bunny_720p_surround.avi', - 'ext': 'avi', - 'thumbnail': 're:^http://.*\.jpg$', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - url = 'http://sockshare.com/file/%s' % video_id - webpage = self._download_webpage(url, video_id) - - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - confirm_hash = self._html_search_regex(r'''(?x)(.+)', - r'var name = "([^"]+)";'), - webpage, 'title', default=None) - thumbnail = self._html_search_regex( - r'my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P\d+)\.shtml.*?' - _TEST = { + _TESTS = [{ + 'note': 'This video is available only in Mainland China', 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', - 'md5': 'bde8d9a6ffd82c63a1eefaef4eeefec7', + 'md5': '29175c8cadd8b5cc4055001e85d6b372', 'info_dict': { 'id': '382479172', 'ext': 'mp4', 'title': 'MV:Far East Movement《The Illest》', }, - 'skip': 'Only available from China', - } + 'skip': 'On available in China', + }, { + 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', + 'md5': '699060e75cf58858dd47fb9c03c42cfb', + 'info_dict': { + 'id': '409385080', + 'ext': 'mp4', + 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', + } + }, { + 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', + 'md5': '9bf34be48f2f4dadcb226c74127e203c', + 'info_dict': { + 'id': '78693464', + 'ext': 'mp4', + 'title': '【爱范品】第31期:MWC见不到的奇葩手机', + } + }, { + 'note': 'Multipart video', + 'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml', + 'info_dict': { + 'id': '78910339', + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + }, + 'playlist': [{ + 'md5': 'bdbfb8f39924725e6589c146bc1883ad', + 'info_dict': { + 'id': '78910339_part1', + 'ext': 'mp4', + 'duration': 294, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', + 'info_dict': { + 'id': '78910339_part2', + 'ext': 'mp4', + 'duration': 300, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'md5': '8407e634175fdac706766481b9443450', + 'info_dict': { + 'id': '78910339_part3', + 'ext': 'mp4', + 'duration': 150, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }] + }, { + 'note': 'Video with title containing dash', + 'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml', + 'info_dict': { + 'id': '78932792', + 'ext': 'mp4', + 'title': 'youtube-dl testing video', + }, + 'params': { + 'skip_download': True + } + }] def _real_extract(self, url): @@ -29,69 +95,117 @@ class SohuIE(InfoExtractor): base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid=' else: base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' - data_url = base_data_url + str(vid_id) - data_json = self._download_webpage( - data_url, video_id, - note='Downloading JSON data for ' + str(vid_id)) - return json.loads(data_json) + + req = sanitized_Request(base_data_url + vid_id) + + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + req.add_header('Ytdl-request-proxy', cn_verification_proxy) + + return self._download_json( + req, video_id, + 'Downloading JSON data for %s' % vid_id) mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') mytv = mobj.group('mytv') is not None webpage = self._download_webpage(url, video_id) - raw_title = self._html_search_regex(r'(?s)(.+?)', - webpage, 'video title') - title = raw_title.partition('-')[0].strip() - vid = self._html_search_regex(r'var vid ?= ?["\'](\d+)["\']', webpage, - 'video path') - data = _fetch_data(vid, mytv) + title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage)) - QUALITIES = ('ori', 'super', 'high', 'nor') - vid_ids = [data['data'][q + 'Vid'] - for q in QUALITIES - if data['data'][q + 'Vid'] != 0] - if not vid_ids: - raise ExtractorError('No formats available for this video') + vid = self._html_search_regex( + r'var vid ?= ?["\'](\d+)["\']', + webpage, 'video path') + vid_data = _fetch_data(vid, mytv) + if vid_data['play'] != 1: + if vid_data.get('status') == 12: + raise ExtractorError( + 'Sohu said: There\'s something wrong in the video.', + expected=True) + else: + raise ExtractorError( + 'Sohu said: The video is only licensed to users in Mainland China.', + expected=True) - # For now, we just pick the highest available quality - vid_id = vid_ids[-1] + formats_json = {} + for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'): + vid_id = vid_data['data'].get('%sVid' % format_id) + if not vid_id: + continue + vid_id = compat_str(vid_id) + formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv) - format_data = data if vid == vid_id else _fetch_data(vid_id, mytv) - part_count = format_data['data']['totalBlocks'] - allot = format_data['allot'] - prot = format_data['prot'] - clipsURL = format_data['data']['clipsURL'] - su = format_data['data']['su'] + part_count = vid_data['data']['totalBlocks'] playlist = [] for i in range(part_count): - part_url = ('http://%s/?prot=%s&file=%s&new=%s' % - (allot, prot, clipsURL[i], su[i])) - part_str = self._download_webpage( - part_url, video_id, - note='Downloading part %d of %d' % (i + 1, part_count)) + formats = [] + for format_id, format_data in formats_json.items(): + allot = format_data['allot'] - part_info = part_str.split('|') - video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + data = format_data['data'] + clips_url = data['clipsURL'] + su = data['su'] - video_info = { - 'id': '%s_part%02d' % (video_id, i + 1), + video_url = 'newflv.sohu.ccgslb.net' + cdnId = None + retries = 0 + + while 'newflv.sohu.ccgslb.net' in video_url: + params = { + 'prot': 9, + 'file': clips_url[i], + 'new': su[i], + 'prod': 'flash', + 'rb': 1, + } + + if cdnId is not None: + params['idc'] = cdnId + + download_note = 'Downloading %s video URL part %d of %d' % ( + format_id, i + 1, part_count) + + if retries > 0: + download_note += ' (retry #%d)' % retries + part_info = self._parse_json(self._download_webpage( + 'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)), + video_id, download_note), video_id) + + video_url = part_info['url'] + cdnId = part_info.get('nid') + + retries += 1 + if retries > 5: + raise ExtractorError('Failed to get video URL') + + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'filesize': data['clipsBytes'][i], + 'width': data['width'], + 'height': data['height'], + 'fps': data['fps'], + }) + self._sort_formats(formats) + + playlist.append({ + 'id': '%s_part%d' % (video_id, i + 1), 'title': title, - 'url': video_url, - 'ext': 'mp4', - } - playlist.append(video_info) + 'duration': vid_data['data']['clipsDuration'][i], + 'formats': formats, + }) if len(playlist) == 1: info = playlist[0] info['id'] = video_id else: info = { - '_type': 'playlist', + '_type': 'multi_video', 'entries': playlist, 'id': video_id, + 'title': title, } return info diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index ab9483d2d..02e64e094 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -4,12 +4,17 @@ from __future__ import unicode_literals import re import itertools -from .common import InfoExtractor -from ..utils import ( +from .common import ( + InfoExtractor, + SearchInfoExtractor +) +from ..compat import ( compat_str, compat_urlparse, compat_urllib_parse, - +) +from ..utils import ( + encode_dict, ExtractorError, int_or_none, unified_strdate, @@ -28,7 +33,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''(?x)^(?:https?://)? (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?P[\w\d-]+)/ - (?!sets/|likes/?(?:$|[?#])) + (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -112,7 +117,7 @@ class SoundcloudIE(InfoExtractor): }, ] - _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' + _CLIENT_ID = '02gUJC0hH2ct1EGOcYXQIzRFU91c72Ea' _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' def report_resolve(self, video_id): @@ -179,7 +184,7 @@ class SoundcloudIE(InfoExtractor): 'format_id': key, 'url': url, 'play_path': 'mp3:' + path, - 'ext': ext, + 'ext': 'flv', 'vcodec': 'none', }) @@ -199,8 +204,9 @@ class SoundcloudIE(InfoExtractor): if f['format_id'].startswith('rtmp'): f['protocol'] = 'rtmp' - self._sort_formats(formats) - result['formats'] = formats + self._check_formats(formats, track_id) + self._sort_formats(formats) + result['formats'] = formats return result @@ -219,7 +225,12 @@ class SoundcloudIE(InfoExtractor): info_json_url += "&secret_token=" + token elif mobj.group('player'): query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - return self.url_result(query['url'][0]) + real_url = query['url'][0] + # If the token is in the query of the original url we have to + # manually add it + if 'secret_token' in query: + real_url += '?secret_token=' + query['secret_token'][0] + return self.url_result(real_url) else: # extract uploader (which is in the url) uploader = mobj.group('uploader') @@ -240,11 +251,12 @@ class SoundcloudIE(InfoExtractor): class SoundcloudSetIE(SoundcloudIE): - _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' IE_NAME = 'soundcloud:set' _TESTS = [{ 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', 'info_dict': { + 'id': '2284613', 'title': 'The Royal Concept EP', }, 'playlist_mincount': 6, @@ -271,70 +283,153 @@ class SoundcloudSetIE(SoundcloudIE): info = self._download_json(resolv_url, full_title) if 'errors' in info: - for err in info['errors']: - self._downloader.report_error('unable to download video webpage: %s' % compat_str(err['error_message'])) - return + msgs = (compat_str(err['error_message']) for err in info['errors']) + raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) + + entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in info['tracks']] return { '_type': 'playlist', - 'entries': [self._extract_info_dict(track, secret_token=token) for track in info['tracks']], - 'id': info['id'], + 'entries': entries, + 'id': '%s' % info['id'], 'title': info['title'], } class SoundcloudUserIE(SoundcloudIE): - _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m)\.)?soundcloud\.com/ + (?P<user>[^/]+) + (?:/ + (?P<rsrc>tracks|sets|reposts|likes|spotlight) + )? + /?(?:[?#].*)?$ + ''' IE_NAME = 'soundcloud:user' _TESTS = [{ - 'url': 'https://soundcloud.com/the-concept-band', + 'url': 'https://soundcloud.com/the-akashic-chronicler', 'info_dict': { - 'id': '9615865', - 'title': 'The Royal Concept', + 'id': '114582580', + 'title': 'The Akashic Chronicler (All)', }, - 'playlist_mincount': 12 + 'playlist_mincount': 111, }, { - 'url': 'https://soundcloud.com/the-concept-band/likes', + 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks', 'info_dict': { - 'id': '9615865', - 'title': 'The Royal Concept', + 'id': '114582580', + 'title': 'The Akashic Chronicler (Tracks)', + }, + 'playlist_mincount': 50, + }, { + 'url': 'https://soundcloud.com/the-akashic-chronicler/sets', + 'info_dict': { + 'id': '114582580', + 'title': 'The Akashic Chronicler (Playlists)', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://soundcloud.com/the-akashic-chronicler/reposts', + 'info_dict': { + 'id': '114582580', + 'title': 'The Akashic Chronicler (Reposts)', + }, + 'playlist_mincount': 7, + }, { + 'url': 'https://soundcloud.com/the-akashic-chronicler/likes', + 'info_dict': { + 'id': '114582580', + 'title': 'The Akashic Chronicler (Likes)', + }, + 'playlist_mincount': 321, + }, { + 'url': 'https://soundcloud.com/grynpyret/spotlight', + 'info_dict': { + 'id': '7098329', + 'title': 'Grynpyret (Spotlight)', }, 'playlist_mincount': 1, }] + _API_BASE = 'https://api.soundcloud.com' + _API_V2_BASE = 'https://api-v2.soundcloud.com' + + _BASE_URL_MAP = { + 'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE, + 'tracks': '%s/users/%%s/tracks' % _API_BASE, + 'sets': '%s/users/%%s/playlists' % _API_V2_BASE, + 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE, + 'likes': '%s/users/%%s/likes' % _API_V2_BASE, + 'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE, + } + + _TITLE_MAP = { + 'all': 'All', + 'tracks': 'Tracks', + 'sets': 'Playlists', + 'reposts': 'Reposts', + 'likes': 'Likes', + 'spotlight': 'Spotlight', + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader = mobj.group('user') - resource = mobj.group('rsrc') - if resource is None: - resource = 'tracks' - elif resource == 'likes': - resource = 'favorites' url = 'http://soundcloud.com/%s/' % uploader resolv_url = self._resolv_url(url) user = self._download_json( resolv_url, uploader, 'Downloading user info') - base_url = 'http://api.soundcloud.com/users/%s/%s.json?' % (uploader, resource) + + resource = mobj.group('rsrc') or 'all' + base_url = self._BASE_URL_MAP[resource] % user['id'] + + next_href = None entries = [] for i in itertools.count(): - data = compat_urllib_parse.urlencode({ - 'offset': i * 50, - 'limit': 50, - 'client_id': self._CLIENT_ID, - }) - new_entries = self._download_json( - base_url + data, uploader, 'Downloading track page %s' % (i + 1)) - if len(new_entries) == 0: + if not next_href: + data = compat_urllib_parse.urlencode({ + 'offset': i * 50, + 'limit': 50, + 'client_id': self._CLIENT_ID, + 'linked_partitioning': '1', + 'representation': 'speedy', + }) + next_href = base_url + '?' + data + + response = self._download_json( + next_href, uploader, 'Downloading track page %s' % (i + 1)) + + collection = response['collection'] + + if not collection: self.to_screen('%s: End page received' % uploader) break - entries.extend(self._extract_info_dict(e, quiet=True) for e in new_entries) + + def resolve_permalink_url(candidates): + for cand in candidates: + if isinstance(cand, dict): + permalink_url = cand.get('permalink_url') + if permalink_url and permalink_url.startswith('http'): + return permalink_url + + for e in collection: + permalink_url = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) + if permalink_url: + entries.append(self.url_result(permalink_url)) + + if 'next_href' in response: + next_href = response['next_href'] + if not next_href: + break + else: + next_href = None return { '_type': 'playlist', 'id': compat_str(user['id']), - 'title': user['username'], + 'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]), 'entries': entries, } @@ -369,9 +464,7 @@ class SoundcloudPlaylistIE(SoundcloudIE): data = self._download_json( base_url + data, playlist_id, 'Downloading playlist') - entries = [ - self._extract_info_dict(t, quiet=True, secret_token=token) - for t in data['tracks']] + entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in data['tracks']] return { '_type': 'playlist', @@ -380,3 +473,60 @@ class SoundcloudPlaylistIE(SoundcloudIE): 'description': data.get('description'), 'entries': entries, } + + +class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): + IE_NAME = 'soundcloud:search' + IE_DESC = 'Soundcloud search' + _MAX_RESULTS = float('inf') + _TESTS = [{ + 'url': 'scsearch15:post-avant jazzcore', + 'info_dict': { + 'title': 'post-avant jazzcore', + }, + 'playlist_count': 15, + }] + + _SEARCH_KEY = 'scsearch' + _MAX_RESULTS_PER_PAGE = 200 + _DEFAULT_RESULTS_PER_PAGE = 50 + _API_V2_BASE = 'https://api-v2.soundcloud.com' + + def _get_collection(self, endpoint, collection_id, **query): + limit = min( + query.get('limit', self._DEFAULT_RESULTS_PER_PAGE), + self._MAX_RESULTS_PER_PAGE) + query['limit'] = limit + query['client_id'] = self._CLIENT_ID + query['linked_partitioning'] = '1' + query['offset'] = 0 + data = compat_urllib_parse.urlencode(encode_dict(query)) + next_url = '{0}{1}?{2}'.format(self._API_V2_BASE, endpoint, data) + + collected_results = 0 + + for i in itertools.count(1): + response = self._download_json( + next_url, collection_id, 'Downloading page {0}'.format(i), + 'Unable to download API page') + + collection = response.get('collection', []) + if not collection: + break + + collection = list(filter(bool, collection)) + collected_results += len(collection) + + for item in collection: + yield self.url_result(item['uri'], SoundcloudIE.ie_key()) + + if not collection or collected_results >= limit: + break + + next_url = response.get('next_href') + if not next_url: + break + + def _get_n_results(self, query, n): + tracks = self._get_collection('/search/tracks', query, limit=n, q=query) + return self.playlist_result(tracks, playlist_title=query) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index a4f8ce6c3..3a4ddf57e 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -7,6 +7,7 @@ from .common import InfoExtractor class SoundgasmIE(InfoExtractor): + IE_NAME = 'soundgasm' _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_\-]+)/(?P<title>[0-9a-zA-Z_\-]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', @@ -38,3 +39,26 @@ class SoundgasmIE(InfoExtractor): 'title': audio_title, 'description': description } + + +class SoundgasmProfileIE(InfoExtractor): + IE_NAME = 'soundgasm:profile' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$' + _TEST = { + 'url': 'http://soundgasm.net/u/ytdl', + 'info_dict': { + 'id': 'ytdl', + }, + 'playlist_count': 1, + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + webpage = self._download_webpage(url, profile_id) + + entries = [ + self.url_result(audio_url, 'Soundgasm') + for audio_url in re.findall(r'href="([^"]+/u/%s/[^"]+)' % profile_id, webpage)] + + return self.playlist_result(entries, profile_id) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index c20397b3d..87b650468 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -1,3 +1,4 @@ +# encoding: utf-8 from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor @@ -5,7 +6,7 @@ from .mtv import MTVServicesInfoExtractor class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.cc\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' @@ -20,9 +21,20 @@ class SouthParkIE(MTVServicesInfoExtractor): }] -class SouthparkDeIE(SouthParkIE): +class SouthParkEsIE(SouthParkIE): + IE_NAME = 'southpark.cc.com:español' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))' + _LANG = 'es' + + _TESTS = [{ + 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', + 'playlist_count': 4, + }] + + +class SouthParkDeIE(SouthParkIE): IE_NAME = 'southpark.de' - _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' _TESTS = [{ @@ -33,4 +45,34 @@ class SouthparkDeIE(SouthParkIE): 'title': 'The Government Won\'t Respect My Privacy', 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', }, + }, { + # non-ASCII characters in initial URL + 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen', + 'playlist_count': 4, + }, { + # non-ASCII characters in redirect URL + 'url': 'http://www.southpark.de/alle-episoden/s18e09', + 'playlist_count': 4, + }] + + +class SouthParkNlIE(SouthParkIE): + IE_NAME = 'southpark.nl' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))' + _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/' + + _TESTS = [{ + 'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free', + 'playlist_count': 4, + }] + + +class SouthParkDkIE(SouthParkIE): + IE_NAME = 'southparkstudios.dk' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.dk/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))' + _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/' + + _TESTS = [{ + 'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop', + 'playlist_count': 4, }] diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py index c2d0d36a6..ebb5d6ec0 100644 --- a/youtube_dl/extractor/space.py +++ b/youtube_dl/extractor/space.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .brightcove import BrightcoveIE +from .brightcove import BrightcoveLegacyIE from ..utils import RegexNotFoundError, ExtractorError class SpaceIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html' _TEST = { - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', 'info_dict': { 'id': '2780937028001', @@ -31,8 +31,8 @@ class SpaceIE(InfoExtractor): brightcove_url = self._og_search_video_url(webpage) except RegexNotFoundError: # Other videos works fine with the info from the object - brightcove_url = BrightcoveIE._extract_brightcove_url(webpage) + brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) if brightcove_url is None: raise ExtractorError( 'The webpage does not contain a video', expected=True) - return self.url_result(brightcove_url, BrightcoveIE.ie_key()) + return self.url_result(brightcove_url, BrightcoveLegacyIE.ie_key()) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py new file mode 100644 index 000000000..7f060b15b --- /dev/null +++ b/youtube_dl/extractor/spankbang.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SpankBangIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|[a-z]{2})\.)?spankbang\.com/(?P<id>[\da-z]+)/video' + _TEST = { + 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', + 'md5': '1cc433e1d6aa14bc376535b8679302f7', + 'info_dict': { + 'id': '3vvn', + 'ext': 'mp4', + 'title': 'fantasy solo', + 'description': 'dillion harper masturbates on a bed', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'silly2587', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + stream_key = self._html_search_regex( + r'''var\s+stream_key\s*=\s*['"](.+?)['"]''', + webpage, 'stream key') + + formats = [{ + 'url': 'http://spankbang.com/_%s/%s/title/%sp__mp4' % (video_id, stream_key, height), + 'ext': 'mp4', + 'format_id': '%sp' % height, + 'height': int(height), + } for height in re.findall(r'<span[^>]+q_(\d+)p', webpage)] + self._sort_formats(formats) + + title = self._html_search_regex( + r'(?s)<h1>(.+?)</h1>', webpage, 'title') + description = self._search_regex( + r'class="desc"[^>]*>([^<]+)', + webpage, 'description', default=None) + thumbnail = self._og_search_thumbnail(webpage) + uploader = self._search_regex( + r'class="user"[^>]*>([^<]+)', + webpage, 'uploader', fatal=False) + + age_limit = self._rta_search(webpage) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'formats': formats, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 94602e89e..692fd78e8 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -3,19 +3,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( +from ..compat import ( + compat_urllib_parse_unquote, compat_urllib_parse_urlparse, - compat_urllib_request, - compat_urllib_parse, - unified_strdate, +) +from ..utils import ( + sanitized_Request, str_to_int, + unified_strdate, ) from ..aes import aes_decrypt_text class SpankwireIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<id>[0-9]+)/?)' + _TESTS = [{ + # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', 'md5': '8bbfde12b101204b39e4b9fe7eb67095', 'info_dict': { @@ -25,24 +28,37 @@ class SpankwireIE(InfoExtractor): 'description': 'Crazy Bitch X rated music video.', 'uploader': 'oreusz', 'uploader_id': '124697', - 'upload_date': '20070508', + 'upload_date': '20070507', 'age_limit': 18, } - } + }, { + # download URL pattern: */mp4_<format_id>_<video_id>.mp4 + 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', + 'md5': '09b3c20833308b736ae8902db2f8d7e6', + 'info_dict': { + 'id': '1921551', + 'ext': 'mp4', + 'title': 'Titcums Compiloation I', + 'description': 'cum on tits', + 'uploader': 'dannyh78999', + 'uploader_id': '3056053', + 'upload_date': '20150822', + 'age_limit': 18, + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('videoid') - url = 'http://www.' + mobj.group('url') + video_id = mobj.group('id') - req = compat_urllib_request.Request(url) + req = sanitized_Request('http://www.' + mobj.group('url')) req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) title = self._html_search_regex( r'<h1>([^<]+)', webpage, 'title') description = self._html_search_regex( - r'<div\s+id="descriptionContent">([^<]+)<', + r'(?s)<div\s+id="descriptionContent">(.+?)</div>', webpage, 'description', fatal=False) thumbnail = self._html_search_regex( r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']', @@ -52,7 +68,7 @@ class SpankwireIE(InfoExtractor): r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False) uploader_id = self._html_search_regex( - r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', + r'by:\s*<a href="/(?:user/viewProfile|Profile\.aspx)\?.*?UserId=(\d+).*?"', webpage, 'uploader id', fatal=False) upload_date = unified_strdate(self._html_search_regex( r'</a> on (.+?) at \d+:\d+', @@ -62,14 +78,15 @@ class SpankwireIE(InfoExtractor): r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>', webpage, 'view count', fatal=False)) comment_count = str_to_int(self._html_search_regex( - r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>', + r'<span\s+id="spCommentCount"[^>]*>([\d,\.]+)</span>', webpage, 'comment count', fatal=False)) - video_urls = list(map( - compat_urllib_parse.unquote, - re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage))) + videos = re.findall( + r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage) + heights = [int(video[0]) for video in videos] + video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos])) if webpage.find('flashvars\.encrypted = "true"') != -1: - password = self._html_search_regex( + password = self._search_regex( r'flashvars\.video_title = "([^"]+)', webpage, 'password').replace('+', ' ') video_urls = list(map( @@ -77,21 +94,22 @@ class SpankwireIE(InfoExtractor): video_urls)) formats = [] - for video_url in video_urls: + for height, video_url in zip(heights, video_urls): path = compat_urllib_parse_urlparse(video_url).path - format = path.split('/')[4].split('_')[:2] - resolution, bitrate_str = format - format = "-".join(format) - height = int(resolution.rstrip('Pp')) - tbr = int(bitrate_str.rstrip('Kk')) - formats.append({ + _, quality = path.split('/')[4].split('_')[:2] + f = { 'url': video_url, - 'resolution': resolution, - 'format': format, - 'tbr': tbr, 'height': height, - 'format_id': format, - }) + } + tbr = self._search_regex(r'^(\d+)[Kk]$', quality, 'tbr', default=None) + if tbr: + f.update({ + 'tbr': int(tbr), + 'format_id': '%dp' % height, + }) + else: + f['format_id'] = quality + formats.append(f) self._sort_formats(formats) age_limit = self._rta_search(webpage) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 1e55a9ffb..39a7aaf9d 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -9,7 +9,7 @@ from .spiegeltv import SpiegeltvIE class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed)?(?:\.html)?(?:#.*)?$' + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', 'md5': '2c2754212136f35fb4b19767d242f66e', @@ -39,6 +39,9 @@ class SpiegelIE(InfoExtractor): 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', } + }, { + 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -55,28 +58,30 @@ class SpiegelIE(InfoExtractor): description = self._html_search_meta('description', webpage, 'description') base_url = self._search_regex( - r'var\s+server\s*=\s*"([^"]+)\"', webpage, 'server URL') + [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'], + webpage, 'server URL', group='url') xml_url = base_url + video_id + '.xml' idoc = self._download_xml(xml_url, video_id) - formats = [ - { - 'format_id': n.tag.rpartition('type')[2], - 'url': base_url + n.find('./filename').text, - 'width': int(n.find('./width').text), - 'height': int(n.find('./height').text), - 'abr': int(n.find('./audiobitrate').text), - 'vbr': int(n.find('./videobitrate').text), - 'vcodec': n.find('./codec').text, - 'acodec': 'MP4A', - } - for n in list(idoc) - # Blacklist type 6, it's extremely LQ and not available on the same server - if n.tag.startswith('type') and n.tag != 'type6' - ] + formats = [] + for n in list(idoc): + if n.tag.startswith('type') and n.tag != 'type6': + format_id = n.tag.rpartition('type')[2] + video_url = base_url + n.find('./filename').text + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'width': int(n.find('./width').text), + 'height': int(n.find('./height').text), + 'abr': int(n.find('./audiobitrate').text), + 'vbr': int(n.find('./videobitrate').text), + 'vcodec': n.find('./codec').text, + 'acodec': 'MP4A', + }) duration = float(idoc[0].findall('./duration')[0].text) + self._check_formats(formats, video_id) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index 98cf92d89..034bd47ff 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -2,7 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import float_or_none +from ..compat import compat_urllib_parse_urlparse +from ..utils import ( + determine_ext, + float_or_none, +) class SpiegeltvIE(InfoExtractor): @@ -17,7 +21,7 @@ class SpiegeltvIE(InfoExtractor): 'thumbnail': 're:http://.*\.jpg$', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, } }, { @@ -51,9 +55,43 @@ class SpiegeltvIE(InfoExtractor): is_wide = media_json['is_wide'] server_json = self._download_json( - 'http://www.spiegel.tv/streaming_servers/', video_id, - note='Downloading server information') - server = server_json[0]['endpoint'] + 'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json', + video_id, note='Downloading server information') + + format = '16x9' if is_wide else '4x3' + + formats = [] + for streamingserver in server_json['streamingserver']: + endpoint = streamingserver.get('endpoint') + if not endpoint: + continue + play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format) + if endpoint.startswith('rtmp'): + formats.append({ + 'url': endpoint, + 'format_id': 'rtmp', + 'app': compat_urllib_parse_urlparse(endpoint).path[1:], + 'play_path': play_path, + 'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf', + 'ext': 'flv', + 'rtmp_live': True, + }) + elif determine_ext(endpoint) == 'm3u8': + formats.append({ + 'url': endpoint.replace('[video]', play_path), + 'ext': 'm4v', + 'format_id': 'hls', # Prefer hls since it allows to workaround georestriction + 'protocol': 'm3u8', + 'preference': 1, + 'http_headers': { + 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side + }, + }) + else: + formats.append({ + 'url': endpoint, + }) + self._check_formats(formats, video_id) thumbnails = [] for image in media_json['images']: @@ -65,16 +103,12 @@ class SpiegeltvIE(InfoExtractor): description = media_json['subtitle'] duration = float_or_none(media_json.get('duration_in_ms'), scale=1000) - format = '16x9' if is_wide else '4x3' - - url = server + 'mp4:' + uuid + '_spiegeltv_0500_' + format + '.m4v' return { 'id': video_id, 'title': title, - 'url': url, - 'ext': 'm4v', 'description': description, 'duration': duration, - 'thumbnails': thumbnails + 'thumbnails': thumbnails, + 'formats': formats, } diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index a3adf54e3..182f286df 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -1,14 +1,12 @@ from __future__ import unicode_literals -import re - from .mtv import MTVServicesInfoExtractor class SpikeIE(MTVServicesInfoExtractor): _VALID_URL = r'''(?x)https?:// - (www\.spike\.com/(video-clips|episodes)/.+| - m\.spike\.com/videos/video.rbml\?id=(?P<mobile_id>[^&]+)) + (?:www\.spike\.com/(?:video-(?:clips|playlists)|(?:full-)?episodes)/.+| + m\.spike\.com/videos/video\.rbml\?id=(?P<id>[^&]+)) ''' _TEST = { 'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle', @@ -25,8 +23,7 @@ class SpikeIE(MTVServicesInfoExtractor): _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s' def _real_extract(self, url): - mobj = re.search(self._VALID_URL, url) - mobile_id = mobj.group('mobile_id') - if mobile_id is not None: + mobile_id = self._match_id(url) + if mobile_id: url = 'http://www.spike.com/video-clips/%s' % mobile_id return super(SpikeIE, self)._real_extract(url) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index becdf658f..86d509ae5 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -4,37 +4,36 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( - parse_duration, - parse_iso8601, + unified_strdate, ) class SportBoxIE(InfoExtractor): - _VALID_URL = r'https?://news\.sportbox\.ru/Vidy_sporta/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)' - _TESTS = [ - { - 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', - 'md5': 'ff56a598c2cf411a9a38a69709e97079', - 'info_dict': { - 'id': '80822', - 'ext': 'mp4', - 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', - 'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed', - 'thumbnail': 're:^https?://.*\.jpg$', - 'timestamp': 1411896237, - 'upload_date': '20140928', - 'duration': 4846, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', - 'only_matching': True, - } - ] + _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)' + _TESTS = [{ + 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', + 'md5': 'ff56a598c2cf411a9a38a69709e97079', + 'info_dict': { + 'id': '80822', + 'ext': 'mp4', + 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', + 'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20140928', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', + 'only_matching': True, + }, { + 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -42,35 +41,75 @@ class SportBoxIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'src="/vdl/player/media/(\d+)"', webpage, 'video id') - - player = self._download_webpage( - 'http://news.sportbox.ru/vdl/player/media/%s' % video_id, - display_id, 'Downloading player webpage') - - hls = self._search_regex( - r"var\s+original_hls_file\s*=\s*'([^']+)'", player, 'hls file') - - formats = self._extract_m3u8_formats(hls, display_id, 'mp4') + player = self._search_regex( + r'src="/?(vdl/player/[^"]+)"', webpage, 'player') title = self._html_search_regex( - r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title') - description = self._html_search_regex( - r'(?s)<div itemprop="description">(.+?)</div>', webpage, 'description', fatal=False) + [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'], + webpage, 'title') + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage, 'description') thumbnail = self._og_search_thumbnail(webpage) - timestamp = parse_iso8601(self._search_regex( - r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False)) - duration = parse_duration(self._html_search_regex( - r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False)) + upload_date = unified_strdate(self._html_search_meta( + 'dateCreated', webpage, 'upload date')) return { - 'id': video_id, + '_type': 'url_transparent', + 'url': compat_urlparse.urljoin(url, '/%s' % player), 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, + 'upload_date': upload_date, + } + + +class SportBoxEmbedIE(InfoExtractor): + _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://news.sportbox.ru/vdl/player/ci/211355', + 'info_dict': { + 'id': '211355', + 'ext': 'mp4', + 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + hls = self._search_regex( + r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]", + webpage, 'hls file') + + formats = self._extract_m3u8_formats(hls, video_id, 'mp4') + + title = self._search_regex( + r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title') + + thumbnail = self._search_regex( + r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"', + webpage, 'thumbnail', default=None) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, 'formats': formats, } diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 057ef5251..a9927f6e2 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -5,8 +5,8 @@ import re from .common import InfoExtractor from ..utils import ( - compat_urllib_request, parse_iso8601, + sanitized_Request, ) @@ -36,10 +36,12 @@ class SportDeutschlandIE(InfoExtractor): 'upload_date': '20140825', 'description': 'md5:60a20536b57cee7d9a4ec005e8687504', 'timestamp': 1408976060, + 'duration': 2732, 'title': 'Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen: Herren Einzel, Wei Lee vs. Keun Lee', 'thumbnail': 're:^https?://.*\.jpg$', 'view_count': int, 'categories': ['Li-Ning Badminton WM 2014'], + } }] @@ -48,16 +50,16 @@ class SportDeutschlandIE(InfoExtractor): video_id = mobj.group('id') sport_id = mobj.group('sport') - api_url = 'http://splink.tv/api/permalinks/%s/%s' % ( + api_url = 'http://proxy.vidibusdynamic.net/sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( sport_id, video_id) - req = compat_urllib_request.Request(api_url, headers={ + req = sanitized_Request(api_url, headers={ 'Accept': 'application/vnd.vidibus.v2.html+json', 'Referer': url, }) data = self._download_json(req, video_id) - categories = list(data.get('section', {}).get('tags', {}).values()) asset = data['asset'] + categories = [data['section']['title']] formats = [] smil_url = asset['video'] @@ -68,10 +70,12 @@ class SportDeutschlandIE(InfoExtractor): smil_doc = self._download_xml( smil_url, video_id, note='Downloading SMIL metadata') - base_url = smil_doc.find('./head/meta').attrib['base'] + base_url_el = smil_doc.find('./head/meta') + if base_url_el: + base_url = base_url_el.attrib['base'] formats.extend([{ 'format_id': 'rmtp', - 'url': base_url, + 'url': base_url if base_url_el else n.attrib['src'], 'play_path': n.attrib['src'], 'ext': 'flv', 'preference': -100, @@ -88,6 +92,7 @@ class SportDeutschlandIE(InfoExtractor): 'title': asset['title'], 'thumbnail': asset.get('image'), 'description': asset.get('teaser'), + 'duration': asset.get('duration'), 'categories': categories, 'view_count': asset.get('views'), 'rtmp_live': asset.get('live'), diff --git a/youtube_dl/extractor/srf.py b/youtube_dl/extractor/srf.py new file mode 100644 index 000000000..16e1bf2d6 --- /dev/null +++ b/youtube_dl/extractor/srf.py @@ -0,0 +1,120 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from .common import InfoExtractor +from ..utils import ( + determine_ext, + parse_iso8601, + xpath_text, +) + + +class SrfIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.srf\.ch/play(?:er)?/(?:tv|radio)/[^/]+/(?P<media_type>video|audio)/(?P<display_id>[^?]+)\?id=|tp\.srgssr\.ch/p/flash\?urn=urn:srf:ais:video:)(?P<id>[0-9a-f\-]{36})' + _TESTS = [{ + 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'md5': '4cd93523723beff51bb4bee974ee238d', + 'info_dict': { + 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'display_id': 'snowden-beantragt-asyl-in-russland', + 'ext': 'm4v', + 'upload_date': '20130701', + 'title': 'Snowden beantragt Asyl in Russland', + 'timestamp': 1372713995, + } + }, { + # No Speichern (Save) button + 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', + 'md5': 'd97e236e80d1d24729e5d0953d276a4f', + 'info_dict': { + 'id': '677f5829-e473-4823-ac83-a1087fe97faa', + 'display_id': 'jaguar-xk120-shadow-und-tornado-dampflokomotive', + 'ext': 'flv', + 'upload_date': '20130710', + 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', + 'timestamp': 1373493600, + }, + }, { + 'url': 'http://www.srf.ch/play/radio/hoerspielarchiv-srf-musikwelle/audio/saegel-ohni-wind-von-jakob-stebler?id=415bf3d3-6429-4de7-968d-95866e37cfbc', + 'md5': '', + 'info_dict': { + 'id': '415bf3d3-6429-4de7-968d-95866e37cfbc', + 'display_id': 'saegel-ohni-wind-von-jakob-stebler', + 'ext': 'mp3', + 'upload_date': '20080518', + 'title': '«Sägel ohni Wind» von Jakob Stebler', + 'timestamp': 1211112000, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { + 'url': 'http://www.srf.ch/player/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'only_matching': True, + }, { + 'url': 'https://tp.srgssr.ch/p/flash?urn=urn:srf:ais:video:28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + media_type = mobj.group('media_type') + display_id = mobj.group('display_id') or video_id + + video_data = self._download_xml( + 'http://il.srgssr.ch/integrationlayer/1.0/ue/srf/%s/play/%s.xml' % (media_type, video_id), + display_id) + + title = xpath_text( + video_data, './AssetMetadatas/AssetMetadata/title', fatal=True) + thumbnails = [{ + 'url': s.text + } for s in video_data.findall('.//ImageRepresentation/url')] + timestamp = parse_iso8601(xpath_text(video_data, './createdDate')) + # The <duration> field in XML is different from the exact duration, skipping + + formats = [] + for item in video_data.findall('./Playlists/Playlist') + video_data.findall('./Downloads/Download'): + for url_node in item.findall('url'): + quality = url_node.attrib['quality'] + full_url = url_node.text + original_ext = determine_ext(full_url).lower() + format_id = '%s-%s' % (quality, item.attrib['protocol']) + if original_ext == 'f4m': + formats.extend(self._extract_f4m_formats( + full_url + '?hdcore=3.4.0', display_id, f4m_id=format_id)) + elif original_ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + full_url, display_id, 'mp4', m3u8_id=format_id)) + else: + formats.append({ + 'url': full_url, + 'ext': original_ext, + 'format_id': format_id, + 'quality': 0 if 'HD' in quality else -1, + 'preference': 1, + }) + + self._sort_formats(formats) + + subtitles = {} + subtitles_data = video_data.find('Subtitles') + if subtitles_data is not None: + subtitles_list = [{ + 'url': sub.text, + 'ext': determine_ext(sub.text), + } for sub in subtitles_data] + if subtitles_list: + subtitles['de'] = subtitles_list + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'title': title, + 'thumbnails': thumbnails, + 'timestamp': timestamp, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index 666a7dcc8..74d01183f 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -1,17 +1,18 @@ # encoding: utf-8 from __future__ import unicode_literals -import json - -from .common import InfoExtractor -from ..utils import js_to_json +from .ard import ARDMediathekIE +from ..utils import ( + ExtractorError, + get_element_by_attribute, +) -class SRMediathekIE(InfoExtractor): - IE_DESC = 'Süddeutscher Rundfunk' +class SRMediathekIE(ARDMediathekIE): + IE_DESC = 'Saarländischer Rundfunk' _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455', 'info_dict': { 'id': '28455', @@ -20,24 +21,36 @@ class SRMediathekIE(InfoExtractor): 'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ', 'thumbnail': 're:^https?://.*\.jpg$', }, - } + 'skip': 'no longer available', + }, { + 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=37682', + 'info_dict': { + 'id': '37682', + 'ext': 'mp4', + 'title': 'Love, Cakes and Rock\'n\'Roll', + 'description': 'md5:18bf9763631c7d326c22603681e1123d', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'] + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - murls = json.loads(js_to_json(self._search_regex( - r'var mediaURLs\s*=\s*(.*?);\n', webpage, 'video URLs'))) - formats = [{'url': murl} for murl in murls] - self._sort_formats(formats) + if '>Der gewünschte Beitrag ist leider nicht mehr verfügbar.<' in webpage: + raise ExtractorError('Video %s is no longer available' % video_id, expected=True) - title = json.loads(js_to_json(self._search_regex( - r'var mediaTitles\s*=\s*(.*?);\n', webpage, 'title')))[0] - - return { + media_collection_url = self._search_regex( + r'data-mediacollection-ardplayer="([^"]+)"', webpage, 'media collection url') + info = self._extract_media_info(media_collection_url, webpage, video_id) + info.update({ 'id': video_id, - 'title': title, - 'formats': formats, + 'title': get_element_by_attribute('class', 'ardplayer-title', webpage), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), - } + }) + return info diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/ssa.py new file mode 100644 index 000000000..13101c714 --- /dev/null +++ b/youtube_dl/extractor/ssa.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + unescapeHTML, + parse_duration, +) + + +class SSAIE(InfoExtractor): + _VALID_URL = r'http://ssa\.nls\.uk/film/(?P<id>\d+)' + _TEST = { + 'url': 'http://ssa.nls.uk/film/3561', + 'info_dict': { + 'id': '3561', + 'ext': 'flv', + 'title': 'SHETLAND WOOL', + 'description': 'md5:c5afca6871ad59b4271e7704fe50ab04', + 'duration': 900, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + streamer = self._search_regex( + r"'streamer'\s*,\S*'(rtmp[^']+)'", webpage, 'streamer') + play_path = self._search_regex( + r"'file'\s*,\s*'([^']+)'", webpage, 'file').rpartition('.')[0] + + def search_field(field_name, fatal=False): + return self._search_regex( + r'<span\s+class="field_title">%s:</span>\s*<span\s+class="field_content">([^<]+)</span>' % field_name, + webpage, 'title', fatal=fatal) + + title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]') + description = unescapeHTML(search_field('Description')) + duration = parse_duration(search_field('Running time')) + thumbnail = self._search_regex( + r"'image'\s*,\s*'([^']+)'", webpage, 'thumbnails', fatal=False) + + return { + 'id': video_id, + 'url': streamer, + 'play_path': play_path, + 'ext': 'flv', + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py new file mode 100644 index 000000000..d5c852f52 --- /dev/null +++ b/youtube_dl/extractor/stitcher.py @@ -0,0 +1,81 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, + unescapeHTML, +) + + +class StitcherIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)' + _TESTS = [{ + 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', + 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940', + 'info_dict': { + 'id': '40789481', + 'ext': 'mp3', + 'title': 'Machine Learning Mastery and Cancer Clusters', + 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3', + 'duration': 1604, + 'thumbnail': 're:^https?://.*\.jpg', + }, + }, { + 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', + 'info_dict': { + 'id': '40846275', + 'display_id': 'the-rare-hourlong-comedy-plus', + 'ext': 'mp3', + 'title': "The CW's 'Crazy Ex-Girlfriend'", + 'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17', + 'duration': 2235, + 'thumbnail': 're:^https?://.*\.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # escaped title + 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true', + 'only_matching': True, + }, { + 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + audio_id = mobj.group('id') + display_id = mobj.group('display_id') or audio_id + + webpage = self._download_webpage(url, display_id) + + episode = self._parse_json( + js_to_json(self._search_regex( + r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')), + display_id)['config']['episode'] + + title = unescapeHTML(episode['title']) + formats = [{ + 'url': episode[episode_key], + 'ext': determine_ext(episode[episode_key]) or 'mp3', + 'vcodec': 'none', + } for episode_key in ('episodeURL',) if episode.get(episode_key)] + description = self._search_regex( + r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False) + duration = int_or_none(episode.get('duration')) + thumbnail = episode.get('episodeImage') + + return { + 'id': audio_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py index c1178f26d..77841b946 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/youtube_dl/extractor/streamcloud.py @@ -2,13 +2,10 @@ from __future__ import unicode_literals import re -import time from .common import InfoExtractor -from ..utils import ( - compat_urllib_parse, - compat_urllib_request, -) +from ..compat import compat_urllib_parse +from ..utils import sanitized_Request class StreamcloudIE(InfoExtractor): @@ -40,12 +37,11 @@ class StreamcloudIE(InfoExtractor): ''', orig_webpage) post = compat_urllib_parse.urlencode(fields) - self.to_screen('%s: Waiting for timeout' % video_id) - time.sleep(12) + self._sleep(12, video_id) headers = { b'Content-Type': b'application/x-www-form-urlencoded', } - req = compat_urllib_request.Request(url, post, headers) + req = sanitized_Request(url, post, headers) webpage = self._download_webpage( req, video_id, note='Downloading video page ...') diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 73efe9542..d3d2b7eb7 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -1,18 +1,28 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import re -import json +import hashlib +import time from .common import InfoExtractor from ..utils import ( int_or_none, - compat_str, + sanitized_Request, ) +def _get_api_key(api_path): + if api_path.endswith('?'): + api_path = api_path[:-1] + + api_key = 'fb5f58a820353bd7095de526253c14fd' + a = '{0:}{1:}{2:}'.format(api_key, api_path, int(round(time.time() / 24 / 3600))) + return hashlib.md5(a.encode('ascii')).hexdigest() + + class StreamCZIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<videoid>.+)' + _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)' + _API_URL = 'http://www.stream.cz/API' _TESTS = [{ 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti', @@ -21,61 +31,66 @@ class StreamCZIE(InfoExtractor): 'id': '765767', 'ext': 'mp4', 'title': 'Peklo na talíři: Éčka pro děti', - 'description': 'md5:49ace0df986e95e331d0fe239d421519', - 'thumbnail': 'http://im.stream.cz/episode/52961d7e19d423f8f06f0100', + 'description': 'Taška s grónskou pomazánkou a další pekelnosti ZDE', + 'thumbnail': 're:^http://im.stream.cz/episode/52961d7e19d423f8f06f0100', 'duration': 256, }, }, { 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka', - 'md5': '246272e753e26bbace7fcd9deca0650c', + 'md5': 'e54a254fb8b871968fd8403255f28589', 'info_dict': { 'id': '10002447', 'ext': 'mp4', 'title': 'Kancelář Blaník: Tři roky pro Mazánka', - 'description': 'md5:9177695a8b756a0a8ab160de4043b392', - 'thumbnail': 'http://im.stream.cz/episode/537f838c50c11f8d21320000', + 'description': 'md5:3862a00ba7bf0b3e44806b544032c859', + 'thumbnail': 're:^http://im.stream.cz/episode/537f838c50c11f8d21320000', 'duration': 368, }, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('videoid') + video_id = self._match_id(url) + api_path = '/episode/%s' % video_id - webpage = self._download_webpage(url, video_id) - - data = self._html_search_regex(r'Stream\.Data\.Episode\((.+?)\);', webpage, 'stream data') - - jsonData = json.loads(data) + req = sanitized_Request(self._API_URL + api_path) + req.add_header('Api-Password', _get_api_key(api_path)) + data = self._download_json(req, video_id) formats = [] - for video in jsonData['instances']: - for video_format in video['instances']: - format_id = video_format['quality'] - - if format_id == '240p': - quality = 0 - elif format_id == '360p': - quality = 1 - elif format_id == '480p': - quality = 2 - elif format_id == '720p': - quality = 3 - + for quality, video in enumerate(data['video_qualities']): + for f in video['formats']: + typ = f['type'].partition('/')[2] + qlabel = video.get('quality_label') formats.append({ - 'format_id': '%s-%s' % (video_format['type'].split('/')[1], format_id), - 'url': video_format['source'], + 'format_note': '%s-%s' % (qlabel, typ) if qlabel else typ, + 'format_id': '%s-%s' % (typ, f['quality']), + 'url': f['source'], + 'height': int_or_none(f['quality'].rstrip('p')), 'quality': quality, }) - self._sort_formats(formats) + image = data.get('image') + if image: + thumbnail = self._proto_relative_url( + image.replace('{width}', '1240').replace('{height}', '697'), + scheme='http:', + ) + else: + thumbnail = None + + stream = data.get('_embedded', {}).get('stream:show', {}).get('name') + if stream: + title = '%s: %s' % (stream, data['name']) + else: + title = data['name'] + return { - 'id': compat_str(jsonData['episode_id']), - 'title': self._og_search_title(webpage), - 'thumbnail': jsonData['episode_image_original_url'].replace('//', 'http://'), + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, 'formats': formats, - 'description': self._og_search_description(webpage), - 'duration': int_or_none(jsonData['duration']), - 'view_count': int_or_none(jsonData['stats_total']), + 'description': data.get('web_site_text'), + 'duration': int_or_none(data.get('duration')), + 'view_count': int_or_none(data.get('views')), } diff --git a/youtube_dl/extractor/streetvoice.py b/youtube_dl/extractor/streetvoice.py new file mode 100644 index 000000000..6a57fa60a --- /dev/null +++ b/youtube_dl/extractor/streetvoice.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import unified_strdate + + +class StreetVoiceIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://streetvoice.com/skippylu/songs/94440/', + 'md5': '15974627fc01a29e492c98593c2fd472', + 'info_dict': { + 'id': '94440', + 'ext': 'mp3', + 'filesize': 4167053, + 'title': '輸', + 'description': 'Crispy脆樂團 - 輸', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 260, + 'upload_date': '20091018', + 'uploader': 'Crispy脆樂團', + 'uploader_id': '627810', + } + }, { + 'url': 'http://tw.streetvoice.com/skippylu/songs/94440/', + 'only_matching': True, + }] + + def _real_extract(self, url): + song_id = self._match_id(url) + + song = self._download_json( + 'http://streetvoice.com/music/api/song/%s' % song_id, song_id) + + title = song['name'] + author = song['musician']['name'] + + return { + 'id': song_id, + 'url': song['file'], + 'filesize': song.get('size'), + 'title': title, + 'description': '%s - %s' % (author, title), + 'thumbnail': self._proto_relative_url(song.get('image'), 'http:'), + 'duration': song.get('length'), + 'upload_date': unified_strdate(song.get('created_at')), + 'uploader': author, + 'uploader_id': compat_str(song['musician']['id']), + } diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py deleted file mode 100644 index 59a51268d..000000000 --- a/youtube_dl/extractor/subtitles.py +++ /dev/null @@ -1,99 +0,0 @@ -from __future__ import unicode_literals -from .common import InfoExtractor - -from ..compat import compat_str -from ..utils import ( - ExtractorError, -) - - -class SubtitlesInfoExtractor(InfoExtractor): - @property - def _have_to_download_any_subtitles(self): - return any([self._downloader.params.get('writesubtitles', False), - self._downloader.params.get('writeautomaticsub')]) - - def _list_available_subtitles(self, video_id, webpage): - """ outputs the available subtitles for the video """ - sub_lang_list = self._get_available_subtitles(video_id, webpage) - auto_captions_list = self._get_available_automatic_caption(video_id, webpage) - sub_lang = ",".join(list(sub_lang_list.keys())) - self.to_screen('%s: Available subtitles for video: %s' % - (video_id, sub_lang)) - auto_lang = ",".join(auto_captions_list.keys()) - self.to_screen('%s: Available automatic captions for video: %s' % - (video_id, auto_lang)) - - def extract_subtitles(self, video_id, webpage): - """ - returns {sub_lang: sub} ,{} if subtitles not found or None if the - subtitles aren't requested. - """ - if not self._have_to_download_any_subtitles: - return None - available_subs_list = {} - if self._downloader.params.get('writeautomaticsub', False): - available_subs_list.update(self._get_available_automatic_caption(video_id, webpage)) - if self._downloader.params.get('writesubtitles', False): - available_subs_list.update(self._get_available_subtitles(video_id, webpage)) - - if not available_subs_list: # error, it didn't get the available subtitles - return {} - if self._downloader.params.get('allsubtitles', False): - sub_lang_list = available_subs_list - else: - if self._downloader.params.get('subtitleslangs', False): - requested_langs = self._downloader.params.get('subtitleslangs') - elif 'en' in available_subs_list: - requested_langs = ['en'] - else: - requested_langs = [list(available_subs_list.keys())[0]] - - sub_lang_list = {} - for sub_lang in requested_langs: - if sub_lang not in available_subs_list: - self._downloader.report_warning('no closed captions found in the specified language "%s"' % sub_lang) - continue - sub_lang_list[sub_lang] = available_subs_list[sub_lang] - - subtitles = {} - for sub_lang, url in sub_lang_list.items(): - subtitle = self._request_subtitle_url(sub_lang, url) - if subtitle: - subtitles[sub_lang] = subtitle - return subtitles - - def _download_subtitle_url(self, sub_lang, url): - return self._download_webpage(url, None, note=False) - - def _request_subtitle_url(self, sub_lang, url): - """ makes the http request for the subtitle """ - try: - sub = self._download_subtitle_url(sub_lang, url) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) - return - if not sub: - self._downloader.report_warning('Did not fetch video subtitles') - return - return sub - - def _get_available_subtitles(self, video_id, webpage): - """ - returns {sub_lang: url} or {} if not available - Must be redefined by the subclasses - """ - - # By default, allow implementations to simply pass in the result - assert isinstance(webpage, dict), \ - '_get_available_subtitles not implemented' - return webpage - - def _get_available_automatic_caption(self, video_id, webpage): - """ - returns {sub_lang: url} or {} if not available - Must be redefined by the subclasses that support automatic captions, - otherwise it will return {} - """ - self._downloader.report_warning('Automatic Captions not supported by this server') - return {} diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index 263f09b46..e527aa971 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -28,27 +28,31 @@ class SunPornoIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<title>([^<]+)', webpage, 'title') - description = self._html_search_meta('description', webpage, 'description') + title = self._html_search_regex( + r'([^<]+)', webpage, 'title') + description = self._html_search_meta( + 'description', webpage, 'description') thumbnail = self._html_search_regex( r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) duration = parse_duration(self._search_regex( - r'Duration:\s*(\d+:\d+)\s*<', webpage, 'duration', fatal=False)) + r'itemprop="duration">\s*(\d+:\d+)\s*<', + webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( - r'class="views">\s*(\d+)\s*<', webpage, 'view count', fatal=False)) + r'class="views">(?: