diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f8ab29631..32c2fd84c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,15 +16,15 @@ So please elaborate on what feature you are requesting, or what bug you want to If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a commiter myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. -For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. +For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -If your server has multiple IPs or you suspect censorship, adding --call-home may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). +If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). **Site support requests must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. ### Are you using the latest version? -Before reporting any issue, type youtube-dl -U. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. +Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. ### Is the issue already documented? @@ -124,7 +124,7 @@ If you want to add support for a new site, you can follow this quick list (assum } ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. 8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: diff --git a/README.md b/README.md index 80152071d..cf4aebf3d 100644 --- a/README.md +++ b/README.md @@ -49,110 +49,220 @@ which means you can modify it, redistribute it or use it however you like. # OPTIONS -h, --help Print this help text and exit --version Print program version and exit - -U, --update Update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed) - -i, --ignore-errors Continue on download errors, for example to skip unavailable videos in a playlist - --abort-on-error Abort downloading of further videos (in the playlist or the command line) if an error occurs + -U, --update Update this program to latest version. Make + sure that you have sufficient permissions + (run with sudo if needed) + -i, --ignore-errors Continue on download errors, for example to + skip unavailable videos in a playlist + --abort-on-error Abort downloading of further videos (in the + playlist or the command line) if an error + occurs --dump-user-agent Display the current browser identification --list-extractors List all supported extractors - --extractor-descriptions Output descriptions of all supported extractors - --force-generic-extractor Force extraction to use the generic extractor - --default-search PREFIX Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". - Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The - default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching. - --ignore-config Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: Do not read the user configuration - in ~/.config/youtube-dl/config (%APPDATA%/youtube-dl/config.txt on Windows) - --flat-playlist Do not extract the videos of a playlist, only list them. + --extractor-descriptions Output descriptions of all supported + extractors + --force-generic-extractor Force extraction to use the generic + extractor + --default-search PREFIX Use this prefix for unqualified URLs. For + example "gvsearch2:" downloads two videos + from google videos for youtube-dl "large + apple". Use the value "auto" to let + youtube-dl guess ("auto_warning" to emit a + warning when guessing). "error" just throws + an error. The default value "fixup_error" + repairs broken URLs, but emits an error if + this is not possible instead of searching. + --ignore-config Do not read configuration files. When given + in the global configuration file /etc + /youtube-dl.conf: Do not read the user + configuration in ~/.config/youtube- + dl/config (%APPDATA%/youtube-dl/config.txt + on Windows) + --flat-playlist Do not extract the videos of a playlist, + only list them. --no-color Do not emit color codes in output ## Network Options: - --proxy URL Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection + --proxy URL Use the specified HTTP/HTTPS proxy. Pass in + an empty string (--proxy "") for direct + connection --socket-timeout SECONDS Time to wait before giving up, in seconds - --source-address IP Client-side IP address to bind to (experimental) - -4, --force-ipv4 Make all connections via IPv4 (experimental) - -6, --force-ipv6 Make all connections via IPv6 (experimental) - --cn-verification-proxy URL Use this proxy to verify the IP address for some Chinese sites. The default proxy specified by --proxy (or none, if the options is - not present) is used for the actual downloading. (experimental) + --source-address IP Client-side IP address to bind to + (experimental) + -4, --force-ipv4 Make all connections via IPv4 + (experimental) + -6, --force-ipv6 Make all connections via IPv6 + (experimental) + --cn-verification-proxy URL Use this proxy to verify the IP address for + some Chinese sites. The default proxy + specified by --proxy (or none, if the + options is not present) is used for the + actual downloading. (experimental) ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) --playlist-end NUMBER Playlist video to end at (default is last) - --playlist-items ITEM_SPEC Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" - if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will - download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13. - --match-title REGEX Download only matching titles (regex or caseless sub-string) - --reject-title REGEX Skip download for matching titles (regex or caseless sub-string) + --playlist-items ITEM_SPEC Playlist video items to download. Specify + indices of the videos in the playlist + separated by commas like: "--playlist-items + 1,2,5,8" if you want to download videos + indexed 1, 2, 5, 8 in the playlist. You can + specify range: "--playlist-items + 1-3,7,10-13", it will download the videos + at index 1, 2, 3, 7, 10, 11, 12 and 13. + --match-title REGEX Download only matching titles (regex or + caseless sub-string) + --reject-title REGEX Skip download for matching titles (regex or + caseless sub-string) --max-downloads NUMBER Abort after downloading NUMBER files - --min-filesize SIZE Do not download any videos smaller than SIZE (e.g. 50k or 44.6m) - --max-filesize SIZE Do not download any videos larger than SIZE (e.g. 50k or 44.6m) + --min-filesize SIZE Do not download any videos smaller than + SIZE (e.g. 50k or 44.6m) + --max-filesize SIZE Do not download any videos larger than SIZE + (e.g. 50k or 44.6m) --date DATE Download only videos uploaded in this date - --datebefore DATE Download only videos uploaded on or before this date (i.e. inclusive) - --dateafter DATE Download only videos uploaded on or after this date (i.e. inclusive) - --min-views COUNT Do not download any videos with less than COUNT views - --max-views COUNT Do not download any videos with more than COUNT views - --match-filter FILTER Generic video filter (experimental). Specify any key (see help for -o for a list of available keys) to match if the key is present, - !key to check if the key is not present,key > NUMBER (like "comment_count > 12", also works with >=, <, <=, !=, =) to compare against - a number, and & to require multiple matches. Values which are not known are excluded unless you put a question mark (?) after the - operator.For example, to only match videos that have been liked more than 100 times and disliked less than 50 times (or the dislike - functionality is not available at the given service), but who also have a description, use --match-filter "like_count > 100 & + --datebefore DATE Download only videos uploaded on or before + this date (i.e. inclusive) + --dateafter DATE Download only videos uploaded on or after + this date (i.e. inclusive) + --min-views COUNT Do not download any videos with less than + COUNT views + --max-views COUNT Do not download any videos with more than + COUNT views + --match-filter FILTER Generic video filter (experimental). + Specify any key (see help for -o for a list + of available keys) to match if the key is + present, !key to check if the key is not + present,key > NUMBER (like "comment_count > + 12", also works with >=, <, <=, !=, =) to + compare against a number, and & to require + multiple matches. Values which are not + known are excluded unless you put a + question mark (?) after the operator.For + example, to only match videos that have + been liked more than 100 times and disliked + less than 50 times (or the dislike + functionality is not available at the given + service), but who also have a description, + use --match-filter "like_count > 100 & dislike_count \youtube-dl.conf`. For example, with the following configration file youtube-dl will always extract the audio, not copy the mtime and use proxy: +You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime and use a proxy: ``` --extract-audio --no-mtime --proxy 127.0.0.1:3128 ``` -You can use `--ignore-config` if you want to disable configuration file for a particular youtube-dl run. +You can use `--ignore-config` if you want to disable the configuration file for a particular youtube-dl run. ### Authentication with `.netrc` file ### -You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in shell command history. You can achieve this using [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on per extractor basis. For that you will need to create `.netrc` file in your `$HOME` and restrict permissions to read/write by you only: +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on per extractor basis. For that you will need to create a`.netrc` file in your `$HOME` and restrict permissions to read/write by you only: ``` touch $HOME/.netrc chmod a-rwx,u+rw $HOME/.netrc @@ -262,13 +429,13 @@ For example: machine youtube login myaccount@gmail.com password my_youtube_password machine twitch login my_twitch_account_name password my_twitch_password ``` -To activate authentication with `.netrc` file you should pass `--netrc` to youtube-dl or place it in [configuration file](#configuration). +To activate authentication with the `.netrc` file you should pass `--netrc` to youtube-dl or place it in the [configuration file](#configuration). -On Windows you may also need to setup `%HOME%` environment variable manually. +On Windows you may also need to setup the `%HOME%` environment variable manually. # OUTPUT TEMPLATE -The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parenthesis, followed by a lowercase S. Allowed names are: +The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a lowercase S. Allowed names are: - `id`: The sequence will be replaced by the video identifier. - `url`: The sequence will be replaced by the video URL. @@ -296,18 +463,18 @@ youtube-dl_test_video_.mp4 # A simple file name # FORMAT SELECTION -By default youtube-dl tries to download the best quality, but sometimes you may want to download other format. +By default youtube-dl tries to download the best quality, but sometimes you may want to download in a different format. The simplest case is requesting a specific format, for example `-f 22`. You can get the list of available formats using `--list-formats`, you can also use a file extension (currently it supports aac, m4a, mp3, mp4, ogg, wav, webm) or the special names `best`, `bestvideo`, `bestaudio` and `worst`. If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`. You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f +` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`. Format selectors can also be grouped using parentheses, for example if you want to download the best mp4 and webm formats with a height lower than 480 you can use `-f '(mp4,webm)[height<480]'`. -Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. +Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading the best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. -If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl. +If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download the best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl. # VIDEO SELECTION -Videos can be filtered by their upload date using the options `--date`, `--datebefore` or `--dateafter`, they accept dates in two formats: +Videos can be filtered by their upload date using the options `--date`, `--datebefore` or `--dateafter`. They accept dates in two formats: - Absolute dates: Dates in the format `YYYYMMDD`. - Relative dates: Dates in the format `(now|today)[+-][0-9](day|week|month|year)(s)?` @@ -321,7 +488,7 @@ $ youtube-dl --dateafter now-6months # Download only the videos uploaded on January 1, 1970 $ youtube-dl --date 19700101 -$ # will only download the videos uploaded in the 200x decade +$ # Download only the videos uploaded in the 200x decade $ youtube-dl --dateafter 20000101 --datebefore 20091231 ``` @@ -333,7 +500,7 @@ If you've followed [our manual installation instructions](http://rg3.github.io/y If you have used pip, a simple `sudo pip install -U youtube-dl` is sufficient to update. -If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distributions serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. +If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. As a last resort, you can also uninstall the version installed by your package manager and follow our manual installation instructions. For that, remove the distribution's package, with a line like @@ -359,7 +526,7 @@ If you have installed youtube-dl with a package manager, pip, setup.py or a tarb By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, the only option out of `-citw` that is regularly useful is `-i`. -### Can you please put the -b option back? +### Can you please put the `-b` option back? Most people asking this question are not aware that youtube-dl now defaults to downloading the highest available quality as reported by YouTube, which will be 1080p or 720p in some cases, so you no longer need the `-b` option. For some specific videos, maybe YouTube does not report them to be available in a specific high quality format you're interested in. In that case, simply request it with the `-f` option and youtube-dl will try to download it. @@ -371,13 +538,13 @@ Apparently YouTube requires you to pass a CAPTCHA test if you download too much. Once the video is fully downloaded, use any video player, such as [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). -### I extracted a video URL with -g, but it does not play on another machine / in my webbrowser. +### I extracted a video URL with `-g`, but it does not play on another machine / in my webbrowser. It depends a lot on the service. In many cases, requests for the video (to download/play it) must come from the same IP address and with the same cookies. Use the `--cookies` option to write the required cookies into a file, and advise your downloader to read cookies from that file. Some sites also require a common user agent to be used, use `--dump-user-agent` to see the one in use by youtube-dl. It may be beneficial to use IPv6; in some cases, the restrictions are only applied to IPv4. Some services (sometimes only for a subset of videos) do not restrict the video URL by IP address, cookie, or user-agent, but these are the exception rather than the rule. -Please bear in mind that some URL protocols are **not** supported by browsers out of the box, including RTMP. If you are using -g, your own downloader must support these as well. +Please bear in mind that some URL protocols are **not** supported by browsers out of the box, including RTMP. If you are using `-g`, your own downloader must support these as well. If you want to play the video on a machine that is not running youtube-dl, you can relay the video content from the machine that runs youtube-dl. You can use `-o -` to let youtube-dl stream a video to stdout, or simply allow the player to download the files written by youtube-dl in turn. @@ -391,7 +558,7 @@ YouTube requires an additional signature since September 2012 which is not suppo ### Video URL contains an ampersand and I'm getting some strange output `[1] 2839` or `'v' is not recognized as an internal or external command` ### -That's actually the output from your shell. Since ampersand is one of the special shell characters it's interpreted by shell preventing you from passing the whole URL to youtube-dl. To disable your shell from interpreting the ampersands (or any other special characters) you have to either put the whole URL in quotes or escape them with a backslash (which approach will work depends on your shell). +That's actually the output from your shell. Since ampersand is one of the special shell characters it's interpreted by the shell preventing you from passing the whole URL to youtube-dl. To disable your shell from interpreting the ampersands (or any other special characters) you have to either put the whole URL in quotes or escape them with a backslash (which approach will work depends on your shell). For example if your URL is https://www.youtube.com/watch?t=4&v=BaW_jenozKc you should end up with following command: @@ -451,9 +618,9 @@ Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the opt ### How do I pass cookies to youtube-dl? -Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. Note that cookies file must be in Mozilla/Netscape format and the first line of cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in cookies file and convert newlines if necessary to correspond your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. +Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. -Passing cookies to youtube-dl is a good way to workaround login when particular extractor does not implement it explicitly. +Passing cookies to youtube-dl is a good way to workaround login when a particular extractor does not implement it explicitly. ### Can you add support for this anime video site, or site which shows current movies for free? @@ -553,7 +720,7 @@ If you want to add support for a new site, you can follow this quick list (assum } ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. 8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: @@ -643,15 +810,15 @@ So please elaborate on what feature you are requesting, or what bug you want to If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a commiter myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. -For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. +For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -If your server has multiple IPs or you suspect censorship, adding --call-home may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). +If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). **Site support requests must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. ### Are you using the latest version? -Before reporting any issue, type youtube-dl -U. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. +Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. ### Is the issue already documented? diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index 7ece37754..776e6556e 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -8,6 +8,35 @@ import re ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) README_FILE = os.path.join(ROOT_DIR, 'README.md') + +def filter_options(readme): + ret = '' + in_options = False + for line in readme.split('\n'): + if line.startswith('# '): + if line[2:].startswith('OPTIONS'): + in_options = True + else: + in_options = False + + if in_options: + if line.lstrip().startswith('-'): + option, description = re.split(r'\s{2,}', line.lstrip()) + split_option = option.split(' ') + + if not split_option[-1].startswith('-'): # metavar + option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]]) + + # Pandoc's definition_lists. See http://pandoc.org/README.html + # for more information. + ret += '\n%s\n: %s\n' % (option, description) + else: + ret += line.lstrip() + '\n' + else: + ret += line + '\n' + + return ret + with io.open(README_FILE, encoding='utf-8') as f: readme = f.read() @@ -26,6 +55,8 @@ readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme) readme = re.sub(r'\s+youtube-dl \[OPTIONS\] URL \[URL\.\.\.\]', '', readme) readme = PREFIX + readme +readme = filter_options(readme) + if sys.version_info < (3, 0): print(readme.encode('utf-8')) else: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index fa83b68ad..47f7da86d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -81,6 +81,7 @@ - **CBSSports** - **CeskaTelevize** - **channel9**: Channel 9 + - **Chaturbate** - **Chilloutzone** - **chirbit** - **chirbit:profile** @@ -150,6 +151,7 @@ - **Escapist** - **ESPN** (Currently broken) - **EsriVideo** + - **Europa** - **EveryonesMixtape** - **exfm**: ex.fm - **ExpoTV** @@ -157,6 +159,7 @@ - **facebook** - **faz.net** - **fc2** + - **Fczenit** - **fernsehkritik.tv** - **Firstpost** - **FiveTV** @@ -263,6 +266,9 @@ - **Libsyn** - **life:embed** - **lifenews**: LIFE | NEWS + - **limelight** + - **limelight:channel** + - **limelight:channel_list** - **LiveLeak** - **livestream** - **livestream:original** @@ -276,7 +282,6 @@ - **Malemotion** - **MDR** - **media.ccc.de** - - **MegaVideoz** - **metacafe** - **Metacritic** - **Mgoon** diff --git a/test/helper.py b/test/helper.py index cb6eec8d9..bdd7acca4 100644 --- a/test/helper.py +++ b/test/helper.py @@ -89,66 +89,81 @@ def gettestcases(include_onlymatching=False): md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() -def expect_info_dict(self, got_dict, expected_dict): +def expect_value(self, got, expected, field): + if isinstance(expected, compat_str) and expected.startswith('re:'): + match_str = expected[len('re:'):] + match_rex = re.compile(match_str) + + self.assertTrue( + isinstance(got, compat_str), + 'Expected a %s object, but got %s for field %s' % ( + compat_str.__name__, type(got).__name__, field)) + self.assertTrue( + match_rex.match(got), + 'field %s (value: %r) should match %r' % (field, got, match_str)) + elif isinstance(expected, compat_str) and expected.startswith('startswith:'): + start_str = expected[len('startswith:'):] + self.assertTrue( + isinstance(got, compat_str), + 'Expected a %s object, but got %s for field %s' % ( + compat_str.__name__, type(got).__name__, field)) + self.assertTrue( + got.startswith(start_str), + 'field %s (value: %r) should start with %r' % (field, got, start_str)) + elif isinstance(expected, compat_str) and expected.startswith('contains:'): + contains_str = expected[len('contains:'):] + self.assertTrue( + isinstance(got, compat_str), + 'Expected a %s object, but got %s for field %s' % ( + compat_str.__name__, type(got).__name__, field)) + self.assertTrue( + contains_str in got, + 'field %s (value: %r) should contain %r' % (field, got, contains_str)) + elif isinstance(expected, type): + self.assertTrue( + isinstance(got, expected), + 'Expected type %r for field %s, but got value %r of type %r' % (expected, field, got, type(got))) + elif isinstance(expected, dict) and isinstance(got, dict): + expect_dict(self, got, expected) + elif isinstance(expected, list) and isinstance(got, list): + self.assertEqual( + len(expected), len(got), + 'Expect a list of length %d, but got a list of length %d for field %s' % ( + len(expected), len(got), field)) + for index, (item_got, item_expected) in enumerate(zip(got, expected)): + type_got = type(item_got) + type_expected = type(item_expected) + self.assertEqual( + type_expected, type_got, + 'Type mismatch for list item at index %d for field %s, expected %r, got %r' % ( + index, field, type_expected, type_got)) + expect_value(self, item_got, item_expected, field) + else: + if isinstance(expected, compat_str) and expected.startswith('md5:'): + got = 'md5:' + md5(got) + elif isinstance(expected, compat_str) and expected.startswith('mincount:'): + self.assertTrue( + isinstance(got, (list, dict)), + 'Expected field %s to be a list or a dict, but it is of type %s' % ( + field, type(got).__name__)) + expected_num = int(expected.partition(':')[2]) + assertGreaterEqual( + self, len(got), expected_num, + 'Expected %d items in field %s, but only got %d' % (expected_num, field, len(got))) + return + self.assertEqual( + expected, got, + 'Invalid value for field %s, expected %r, got %r' % (field, expected, got)) + + +def expect_dict(self, got_dict, expected_dict): for info_field, expected in expected_dict.items(): - if isinstance(expected, compat_str) and expected.startswith('re:'): - got = got_dict.get(info_field) - match_str = expected[len('re:'):] - match_rex = re.compile(match_str) + got = got_dict.get(info_field) + expect_value(self, got, expected, info_field) - self.assertTrue( - isinstance(got, compat_str), - 'Expected a %s object, but got %s for field %s' % ( - compat_str.__name__, type(got).__name__, info_field)) - self.assertTrue( - match_rex.match(got), - 'field %s (value: %r) should match %r' % (info_field, got, match_str)) - elif isinstance(expected, compat_str) and expected.startswith('startswith:'): - got = got_dict.get(info_field) - start_str = expected[len('startswith:'):] - self.assertTrue( - isinstance(got, compat_str), - 'Expected a %s object, but got %s for field %s' % ( - compat_str.__name__, type(got).__name__, info_field)) - self.assertTrue( - got.startswith(start_str), - 'field %s (value: %r) should start with %r' % (info_field, got, start_str)) - elif isinstance(expected, compat_str) and expected.startswith('contains:'): - got = got_dict.get(info_field) - contains_str = expected[len('contains:'):] - self.assertTrue( - isinstance(got, compat_str), - 'Expected a %s object, but got %s for field %s' % ( - compat_str.__name__, type(got).__name__, info_field)) - self.assertTrue( - contains_str in got, - 'field %s (value: %r) should contain %r' % (info_field, got, contains_str)) - elif isinstance(expected, type): - got = got_dict.get(info_field) - self.assertTrue(isinstance(got, expected), - 'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got))) - else: - if isinstance(expected, compat_str) and expected.startswith('md5:'): - got = 'md5:' + md5(got_dict.get(info_field)) - elif isinstance(expected, compat_str) and expected.startswith('mincount:'): - got = got_dict.get(info_field) - self.assertTrue( - isinstance(got, (list, dict)), - 'Expected field %s to be a list or a dict, but it is of type %s' % ( - info_field, type(got).__name__)) - expected_num = int(expected.partition(':')[2]) - assertGreaterEqual( - self, len(got), expected_num, - 'Expected %d items in field %s, but only got %d' % ( - expected_num, info_field, len(got) - ) - ) - continue - else: - got = got_dict.get(info_field) - self.assertEqual(expected, got, - 'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) +def expect_info_dict(self, got_dict, expected_dict): + expect_dict(self, got_dict, expected_dict) # Check for the presence of mandatory fields if got_dict.get('_type') not in ('playlist', 'multi_video'): for key in ('id', 'url', 'title', 'ext'): diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index be8d12997..2a00d09a5 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -35,10 +35,14 @@ class TestInfoExtractor(unittest.TestCase): + + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') + self.assertEqual(ie._og_search_video_url(html, default=None), None) + self.assertEqual(ie._og_search_property('foobar', html), 'Foo') def test_html_search_meta(self): ie = self.ie diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a33db4da2..963840f6f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1238,13 +1238,20 @@ class YoutubeDL(object): except (ValueError, OverflowError, OSError): pass + subtitles = info_dict.get('subtitles') + if subtitles: + for _, subtitle in subtitles.items(): + for subtitle_format in subtitle: + if 'ext' not in subtitle_format: + subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() + if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') - self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles') + self.list_subtitles(info_dict['id'], subtitles, 'subtitles') return info_dict['requested_subtitles'] = self.process_subtitles( - info_dict['id'], info_dict.get('subtitles'), + info_dict['id'], subtitles, info_dict.get('automatic_captions')) # We now pick which formats have to be downloaded diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 1ff42d94b..192e1c515 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -416,26 +416,32 @@ if hasattr(shutil, 'get_terminal_size'): # Python >= 3.3 else: _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines']) - def compat_get_terminal_size(): - columns = compat_getenv('COLUMNS', None) + def compat_get_terminal_size(fallback=(80, 24)): + columns = compat_getenv('COLUMNS') if columns: columns = int(columns) else: columns = None - lines = compat_getenv('LINES', None) + lines = compat_getenv('LINES') if lines: lines = int(lines) else: lines = None - try: - sp = subprocess.Popen( - ['stty', 'size'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = sp.communicate() - lines, columns = map(int, out.split()) - except Exception: - pass + if columns is None or lines is None or columns <= 0 or lines <= 0: + try: + sp = subprocess.Popen( + ['stty', 'size'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = sp.communicate() + _lines, _columns = map(int, out.split()) + except Exception: + _columns, _lines = _terminal_size(*fallback) + + if columns is None or columns <= 0: + columns = _columns + if lines is None or lines <= 0: + lines = _lines return _terminal_size(columns, lines) try: diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 97e755d4b..29a4500d3 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -325,7 +325,7 @@ class FileDownloader(object): ) # Check file already present - if filename != '-' and nooverwrites_and_exists or continuedl_and_exists: + if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists): self.report_file_already_downloaded(filename) self._hook_progress({ 'filename': filename, diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 7d19bb808..f1d219ba9 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -105,7 +105,7 @@ class RtmpFD(FileDownloader): protocol = info_dict.get('rtmp_protocol', None) real_time = info_dict.get('rtmp_real_time', False) no_resume = info_dict.get('no_resume', False) - continue_dl = info_dict.get('continuedl', True) + continue_dl = self.params.get('continuedl', True) self.report_destination(filename) tmpfilename = self.temp_name(filename) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a73a1317e..462717b1e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -76,6 +76,7 @@ from .cbssports import CBSSportsIE from .ccc import CCCIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE +from .chaturbate import ChaturbateIE from .chilloutzone import ChilloutzoneIE from .chirbit import ( ChirbitIE, @@ -158,6 +159,7 @@ from .eroprofile import EroProfileIE from .escapist import EscapistIE from .espn import ESPNIE from .esri import EsriVideoIE +from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE from .expotv import ExpoTVIE @@ -165,6 +167,7 @@ from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE +from .fczenit import FczenitIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE @@ -294,6 +297,11 @@ from .lifenews import ( LifeNewsIE, LifeEmbedIE, ) +from .limelight import ( + LimelightMediaIE, + LimelightChannelIE, + LimelightChannelListIE, +) from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, @@ -311,7 +319,6 @@ from .macgamestore import MacGameStoreIE from .mailru import MailRuIE from .malemotion import MalemotionIE from .mdr import MDRIE -from .megavideoz import MegaVideozIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 4327c2f61..130afe791 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, float_or_none, xpath_text, @@ -40,7 +41,8 @@ class AdultSwimIE(InfoExtractor): 'id': 'rQxZvXQ4ROaSOqq-or2Mow', 'title': 'Rick and Morty - Pilot', 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " - } + }, + 'skip': 'This video is only available for registered users', }, { 'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/', 'playlist': [ @@ -123,7 +125,6 @@ class AdultSwimIE(InfoExtractor): else: collections = bootstrapped_data['show']['collections'] collection, video_info = self.find_collection_containing_video(collections, episode_path) - # Video wasn't found in the collections, let's try `slugged_video`. if video_info is None: if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: @@ -133,7 +134,15 @@ class AdultSwimIE(InfoExtractor): show = bootstrapped_data['show'] show_title = show['title'] - segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] + stream = video_info.get('stream') + clips = [stream] if stream else video_info.get('clips') + if not clips: + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. You may want to use --cookies.' + if video_info.get('auth') is True else 'Unable to find stream or clips', + expected=True) + segment_ids = [clip['videoPlaybackID'] for clip in clips] episode_id = video_info['id'] episode_title = video_info['title'] @@ -142,7 +151,7 @@ class AdultSwimIE(InfoExtractor): entries = [] for part_num, segment_id in enumerate(segment_ids): - segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=mobile' % segment_id + segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id segment_title = '%s - %s' % (show_title, episode_title) if len(segment_ids) > 1: @@ -158,17 +167,30 @@ class AdultSwimIE(InfoExtractor): formats = [] file_els = idoc.findall('.//files/file') or idoc.findall('./files/file') + unique_urls = [] + unique_file_els = [] for file_el in file_els: + media_url = file_el.text + if not media_url or determine_ext(media_url) == 'f4m': + continue + if file_el.text not in unique_urls: + unique_urls.append(file_el.text) + unique_file_els.append(file_el) + + for file_el in unique_file_els: bitrate = file_el.attrib.get('bitrate') ftype = file_el.attrib.get('type') - - formats.append({ - 'format_id': '%s_%s' % (bitrate, ftype), - 'url': file_el.text.strip(), - # The bitrate may not be a number (for example: 'iphone') - 'tbr': int(bitrate) if bitrate.isdigit() else None, - 'quality': 1 if ftype == 'hd' else -1 - }) + media_url = file_el.text + if determine_ext(media_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + media_url, segment_title, 'mp4', 'm3u8_native', preference=0, m3u8_id='hls')) + else: + formats.append({ + 'format_id': '%s_%s' % (bitrate, ftype), + 'url': file_el.text.strip(), + # The bitrate may not be a number (for example: 'iphone') + 'tbr': int(bitrate) if bitrate.isdigit() else None, + }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 576f03b5b..f68dc3236 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -13,53 +13,53 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P[^/]+)/(?P[^/]+)' _TESTS = [{ - "url": "http://trailers.apple.com/trailers/wb/manofsteel/", + 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', 'info_dict': { 'id': 'manofsteel', }, - "playlist": [ + 'playlist': [ { - "md5": "d97a8e575432dbcb81b7c3acb741f8a8", - "info_dict": { - "id": "manofsteel-trailer4", - "ext": "mov", - "duration": 111, - "title": "Trailer 4", - "upload_date": "20130523", - "uploader_id": "wb", + 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8', + 'info_dict': { + 'id': 'manofsteel-trailer4', + 'ext': 'mov', + 'duration': 111, + 'title': 'Trailer 4', + 'upload_date': '20130523', + 'uploader_id': 'wb', }, }, { - "md5": "b8017b7131b721fb4e8d6f49e1df908c", - "info_dict": { - "id": "manofsteel-trailer3", - "ext": "mov", - "duration": 182, - "title": "Trailer 3", - "upload_date": "20130417", - "uploader_id": "wb", + 'md5': 'b8017b7131b721fb4e8d6f49e1df908c', + 'info_dict': { + 'id': 'manofsteel-trailer3', + 'ext': 'mov', + 'duration': 182, + 'title': 'Trailer 3', + 'upload_date': '20130417', + 'uploader_id': 'wb', }, }, { - "md5": "d0f1e1150989b9924679b441f3404d48", - "info_dict": { - "id": "manofsteel-trailer", - "ext": "mov", - "duration": 148, - "title": "Trailer", - "upload_date": "20121212", - "uploader_id": "wb", + 'md5': 'd0f1e1150989b9924679b441f3404d48', + 'info_dict': { + 'id': 'manofsteel-trailer', + 'ext': 'mov', + 'duration': 148, + 'title': 'Trailer', + 'upload_date': '20121212', + 'uploader_id': 'wb', }, }, { - "md5": "5fe08795b943eb2e757fa95cb6def1cb", - "info_dict": { - "id": "manofsteel-teaser", - "ext": "mov", - "duration": 93, - "title": "Teaser", - "upload_date": "20120721", - "uploader_id": "wb", + 'md5': '5fe08795b943eb2e757fa95cb6def1cb', + 'info_dict': { + 'id': 'manofsteel-teaser', + 'ext': 'mov', + 'duration': 93, + 'title': 'Teaser', + 'upload_date': '20120721', + 'uploader_id': 'wb', }, }, ] diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 505877b77..c1ef8051d 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -10,6 +10,8 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, + int_or_none, ) @@ -52,11 +54,11 @@ class BandcampIE(InfoExtractor): ext, abr_str = format_id.split('-', 1) formats.append({ 'format_id': format_id, - 'url': format_url, + 'url': self._proto_relative_url(format_url, 'http:'), 'ext': ext, 'vcodec': 'none', 'acodec': ext, - 'abr': int(abr_str), + 'abr': int_or_none(abr_str), }) self._sort_formats(formats) @@ -65,7 +67,7 @@ class BandcampIE(InfoExtractor): 'id': compat_str(data['id']), 'title': data['title'], 'formats': formats, - 'duration': float(data['duration']), + 'duration': float_or_none(data.get('duration')), } else: raise ExtractorError('No free songs found') @@ -93,8 +95,8 @@ class BandcampIE(InfoExtractor): final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url') # If we could correctly generate the .rand field the url would be # in the "download_url" key - final_url = self._search_regex( - r'"retry_url":"(.*?)"', final_url_webpage, 'final video URL') + final_url = self._proto_relative_url(self._search_regex( + r'"retry_url":"(.+?)"', final_url_webpage, 'final video URL'), 'http:') return { 'id': video_id, diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index cc2f6fed2..1b3a33e4e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -11,6 +11,8 @@ from ..utils import ( int_or_none, parse_duration, parse_iso8601, + remove_end, + unescapeHTML, ) from ..compat import compat_HTTPError @@ -28,6 +30,14 @@ class BBCCoUkIE(InfoExtractor): 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', ] + _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection' + _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' + + _NAMESPACES = ( + _MEDIASELECTION_NS, + _EMP_PLAYLIST_NS, + ) + _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -193,6 +203,7 @@ class BBCCoUkIE(InfoExtractor): def _extract_connection(self, connection, programme_id): formats = [] + kind = connection.get('kind') protocol = connection.get('protocol') supplier = connection.get('supplier') if protocol == 'http': @@ -218,7 +229,7 @@ class BBCCoUkIE(InfoExtractor): else: formats.append({ 'url': href, - 'format_id': supplier, + 'format_id': supplier or kind or protocol, }) elif protocol == 'rtmp': application = connection.get('application', 'ondemand') @@ -238,16 +249,24 @@ class BBCCoUkIE(InfoExtractor): return formats def _extract_items(self, playlist): - return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item') + return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) + + def _findall_ns(self, element, xpath): + elements = [] + for ns in self._NAMESPACES: + elements.extend(element.findall(xpath % ns)) + return elements def _extract_medias(self, media_selection): - error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error') + error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS) + if error is None: + media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS) if error is not None: raise BBCCoUkIE.MediaSelectionError(error.get('id')) - return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media') + return self._findall_ns(media_selection, './{%s}media') def _extract_connections(self, media): - return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection') + return self._findall_ns(media, './{%s}connection') def _extract_video(self, media, programme_id): formats = [] @@ -261,13 +280,14 @@ class BBCCoUkIE(InfoExtractor): conn_formats = self._extract_connection(connection, programme_id) for format in conn_formats: format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), 'width': width, 'height': height, 'vbr': vbr, 'vcodec': vcodec, 'filesize': file_size, }) + if service: + format['format_id'] = '%s_%s' % (service, format['format_id']) formats.extend(conn_formats) return formats @@ -382,7 +402,7 @@ class BBCCoUkIE(InfoExtractor): url, playlist_id, 'Downloading legacy playlist XML') def _extract_from_legacy_playlist(self, playlist, playlist_id): - no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') + no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS) if no_items is not None: reason = no_items.get('reason') if reason == 'preAvailability': @@ -399,8 +419,9 @@ class BBCCoUkIE(InfoExtractor): kind = item.get('kind') if kind != 'programme' and kind != 'radioProgramme': continue - title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text - description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text + title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text + description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) + description = description_el.text if description_el is not None else None def get_programme_id(item): def get_from_attributes(item): @@ -409,16 +430,18 @@ class BBCCoUkIE(InfoExtractor): if value and re.match(r'^[pb][\da-z]{7}$', value): return value get_from_attributes(item) - mediator = item.find('./{http://bbc.co.uk/2008/emp/playlist}mediator') + mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS) if mediator is not None: return get_from_attributes(mediator) programme_id = get_programme_id(item) duration = int_or_none(item.get('duration')) - # TODO: programme_id can be None and media items can be incorporated right inside - # playlist's item (e.g. http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) - # as f4m and m3u8 - formats, subtitles = self._download_media_selector(programme_id) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + else: + formats, subtitles = self._process_media_selector(item, playlist_id) + programme_id = playlist_id return programme_id, title, description, duration, formats, subtitles @@ -470,6 +493,9 @@ class BBCIE(BBCCoUkIE): _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' _MEDIASELECTOR_URLS = [ + # Provides HQ HLS streams but fails with geolocation in some cases when it's + # even not geo restricted at all + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', # Provides more formats, namely direct mp4 links, but fails on some videos with # notukerror for non UK (?) users (e.g. # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) @@ -479,8 +505,7 @@ class BBCIE(BBCCoUkIE): ] _TESTS = [{ - # article with multiple videos embedded with data-media-meta containing - # playlist.sxml, externalId and no direct video links + # article with multiple videos embedded with data-playable containing vpids 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', @@ -489,7 +514,7 @@ class BBCIE(BBCCoUkIE): }, 'playlist_count': 2, }, { - # article with multiple videos embedded with data-media-meta (more videos) + # article with multiple videos embedded with data-playable (more videos) 'url': 'http://www.bbc.com/news/business-28299555', 'info_dict': { 'id': 'business-28299555', @@ -500,6 +525,7 @@ class BBCIE(BBCCoUkIE): 'skip': 'Save time', }, { # article with multiple videos embedded with `new SMP()` + # broken 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', 'info_dict': { 'id': '3662a707-0af9-3149-963f-47bea720b460', @@ -507,12 +533,13 @@ class BBCIE(BBCCoUkIE): }, 'playlist_count': 18, }, { - # single video embedded with mediaAssetPage.init() + # single video embedded with data-playable containing vpid 'url': 'http://www.bbc.com/news/world-europe-32041533', 'info_dict': { 'id': 'p02mprgb', 'ext': 'mp4', 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'md5:2868290467291b37feda7863f7a83f54', 'duration': 47, 'timestamp': 1427219242, 'upload_date': '20150324', @@ -522,15 +549,14 @@ class BBCIE(BBCCoUkIE): 'skip_download': True, } }, { - # article with single video embedded with data-media-meta containing - # direct video links (for now these are extracted) and playlist.xml (with - # media items as f4m and m3u8 - currently unsupported) + # article with single video embedded with data-playable containing XML playlist + # with direct video links as progressiveDownloadUrl (for now these are extracted) + # and playlist with f4m and m3u8 as streamingUrl 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', 'info_dict': { 'id': '150615_telabyad_kentin_cogu', 'ext': 'mp4', 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", - 'duration': 47, 'timestamp': 1434397334, 'upload_date': '20150615', }, @@ -538,13 +564,12 @@ class BBCIE(BBCCoUkIE): 'skip_download': True, } }, { - # single video embedded with mediaAssetPage.init() (regional section) + # single video embedded with data-playable containing XML playlists (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'info_dict': { 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', - 'duration': 87, 'timestamp': 1434713142, 'upload_date': '20150619', }, @@ -586,27 +611,34 @@ class BBCIE(BBCCoUkIE): 'ext': 'mp4', 'title': 'Hyundai Santa Fe Sport: Rock star', 'description': 'md5:b042a26142c4154a6e472933cf20793d', - 'timestamp': 1368473503, - 'upload_date': '20130513', + 'timestamp': 1415867444, + 'upload_date': '20141113', }, 'params': { # rtmp download 'skip_download': True, } }, { - # single video with playlist.sxml URL + # single video with playlist.sxml URL in playlist param 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', 'ext': 'mp4', 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:398fca0e2e701c609d726e034fa1fc89', 'duration': 140, }, 'params': { # rtmp download 'skip_download': True, } + }, { + # article with multiple videos embedded with playlist.sxml in playlist param + 'url': 'http://www.bbc.com/sport/0/football/34475836', + 'info_dict': { + 'id': '34475836', + 'title': 'What Liverpool can expect from Klopp', + }, + 'playlist_count': 3, }, { # single video with playlist URL from weather section 'url': 'http://www.bbc.com/weather/features/33601775', @@ -648,40 +680,107 @@ class BBCIE(BBCCoUkIE): return [], [] + def _extract_from_playlist_sxml(self, url, playlist_id, timestamp): + programme_id, title, description, duration, formats, subtitles = \ + self._process_legacy_playlist_url(url, playlist_id) + self._sort_formats(formats) + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - timestamp = parse_iso8601(self._search_regex( - [r'"datePublished":\s*"([^"]+)', - r']+property="article:published_time"[^>]+content="([^"]+)"', - r'itemprop="datePublished"[^>]+datetime="([^"]+)"'], - webpage, 'date', default=None)) + timestamp = None + playlist_title = None + playlist_description = None - # single video with playlist.sxml URL (e.g. http://www.bbc.com/sport/0/football/3365340ng) - playlist = self._search_regex( - r']+name="playlist"[^>]+value="([^"]+)"', - webpage, 'playlist', default=None) - if playlist: - programme_id, title, description, duration, formats, subtitles = \ - self._process_legacy_playlist_url(playlist, playlist_id) - self._sort_formats(formats) - return { - 'id': programme_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'subtitles': subtitles, - } + ld = self._parse_json( + self._search_regex( + r'(?s)', + webpage, 'ld json', default='{}'), + playlist_id, fatal=False) + if ld: + timestamp = parse_iso8601(ld.get('datePublished')) + playlist_title = ld.get('headline') + playlist_description = ld.get('articleBody') + + if not timestamp: + timestamp = parse_iso8601(self._search_regex( + [r']+property="article:published_time"[^>]+content="([^"]+)"', + r'itemprop="datePublished"[^>]+datetime="([^"]+)"', + r'"datePublished":\s*"([^"]+)'], + webpage, 'date', default=None)) + + entries = [] + + # article with multiple videos embedded with playlist.sxml (e.g. + # http://www.bbc.com/sport/0/football/34475836) + playlists = re.findall(r']+name="playlist"[^>]+value="([^"]+)"', webpage) + if playlists: + entries = [ + self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) + for playlist_url in playlists] + + # news article with multiple videos embedded with data-playable + data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage) + if data_playables: + for _, data_playable_json in data_playables: + data_playable = self._parse_json( + unescapeHTML(data_playable_json), playlist_id, fatal=False) + if not data_playable: + continue + settings = data_playable.get('settings', {}) + if settings: + # data-playable with video vpid in settings.playlistObject.items (e.g. + # http://www.bbc.com/news/world-us-canada-34473351) + playlist_object = settings.get('playlistObject', {}) + if playlist_object: + items = playlist_object.get('items') + if items and isinstance(items, list): + title = playlist_object['title'] + description = playlist_object.get('summary') + duration = int_or_none(items[0].get('duration')) + programme_id = items[0].get('vpid') + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + }) + else: + # data-playable without vpid but with a playlist.sxml URLs + # in otherSettings.playlist (e.g. + # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) + playlist = data_playable.get('otherSettings', {}).get('playlist', {}) + if playlist: + entries.append(self._extract_from_playlist_sxml( + playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) + + if entries: + playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News') + playlist_description = playlist_description or self._og_search_description(webpage, default=None) + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( [r'data-video-player-vpid="([\da-z]{8})"', r']+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], webpage, 'vpid', default=None) + if programme_id: formats, subtitles = self._download_media_selector(programme_id) self._sort_formats(formats) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index b38057f2f..e6c928699 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -1,65 +1,67 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) class BeegIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P\d+)' _TEST = { 'url': 'http://beeg.com/5416503', - 'md5': '1bff67111adb785c51d1b42959ec10e5', + 'md5': '46c384def73b33dbc581262e5ee67cef', 'info_dict': { 'id': '5416503', 'ext': 'mp4', 'title': 'Sultry Striptease', - 'description': 'md5:6db3c6177972822aaba18652ff59c773', - 'categories': list, # NSFW - 'thumbnail': 're:https?://.*\.jpg$', + 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2', + 'timestamp': 1391813355, + 'upload_date': '20140207', + 'duration': 383, + 'tags': list, 'age_limit': 18, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - quality_arr = self._search_regex( - r'(?s)var\s+qualityArr\s*=\s*{\s*(.+?)\s*}', webpage, 'quality formats') - - formats = [{ - 'url': fmt[1], - 'format_id': fmt[0], - 'height': int(fmt[0][:-1]), - } for fmt in re.findall(r"'([^']+)'\s*:\s*'([^']+)'", quality_arr)] + video = self._download_json( + 'http://beeg.com/api/v1/video/%s' % video_id, video_id) + formats = [] + for format_id, video_url in video.items(): + height = self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None) + if not height: + continue + formats.append({ + 'url': self._proto_relative_url(video_url.replace('{DATA_MARKERS}', ''), 'http:'), + 'format_id': format_id, + 'height': int(height), + }) self._sort_formats(formats) - title = self._html_search_regex( - r'([^<]+)\s*-\s*beeg\.?', webpage, 'title') + title = video['title'] + video_id = video.get('id') or video_id + display_id = video.get('code') + description = video.get('desc') - description = self._html_search_regex( - r']+?videoId="(\d+)"', webpage, 'video id') + [r']+?videoId=(["\'])(?P\d+)', r'id=["\']canal_video_player(?P\d+)'], + webpage, 'video id', group='id') info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) doc = self._download_xml(info_url, video_id, 'Downloading video XML') diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 3dfc24f5b..c74553dcf 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -3,7 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + parse_filesize, + qualities, +) class Channel9IE(InfoExtractor): @@ -28,7 +32,7 @@ class Channel9IE(InfoExtractor): 'title': 'Developer Kick-Off Session: Stuff We Love', 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 'duration': 4576, - 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', + 'thumbnail': 're:http://.*\.jpg', 'session_code': 'KOS002', 'session_day': 'Day 1', 'session_room': 'Arena 1A', @@ -44,31 +48,29 @@ class Channel9IE(InfoExtractor): 'title': 'Self-service BI with Power BI - nuclear testing', 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 'duration': 1540, - 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', + 'thumbnail': 're:http://.*\.jpg', 'authors': ['Mike Wilmot'], }, + }, + { + # low quality mp4 is best + 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'info_dict': { + 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'ext': 'mp4', + 'title': 'Ranges for the Standard Library', + 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', + 'duration': 5646, + 'thumbnail': 're:http://.*\.jpg', + }, + 'params': { + 'skip_download': True, + }, } ] _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - # Sorted by quality - _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] - - def _restore_bytes(self, formatted_size): - if not formatted_size: - return 0 - m = re.match(r'^(?P\d+(?:\.\d+)?)\s+(?P[a-zA-Z]+)', formatted_size) - if not m: - return 0 - units = m.group('units') - try: - exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper()) - except ValueError: - return 0 - size = float(m.group('size')) - return int(size * (1024 ** exponent)) - def _formats_from_html(self, html): FORMAT_REGEX = r''' (?x) @@ -78,16 +80,20 @@ class Channel9IE(InfoExtractor):

File\s+size

\s*(?P.*?)\s* )? # File size part may be missing ''' - # Extract known formats + quality = qualities(( + 'MP3', 'MP4', + 'Low Quality WMV', 'Low Quality MP4', + 'Mid Quality WMV', 'Mid Quality MP4', + 'High Quality WMV', 'High Quality MP4')) formats = [{ 'url': x.group('url'), 'format_id': x.group('quality'), 'format_note': x.group('note'), 'format': '%s (%s)' % (x.group('quality'), x.group('note')), - 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate - 'preference': self._known_formats.index(x.group('quality')), + 'filesize_approx': parse_filesize(x.group('filesize')), + 'quality': quality(x.group('quality')), 'vcodec': 'none' if x.group('note') == 'Audio only' else None, - } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] + } for x in list(re.finditer(FORMAT_REGEX, html))] self._sort_formats(formats) @@ -158,7 +164,7 @@ class Channel9IE(InfoExtractor): def _extract_session_day(self, html): m = re.search(r'
  • \s*(?P[^<]+)\s*
  • ', html) - return m.group('day') if m is not None else None + return m.group('day').strip() if m is not None else None def _extract_session_room(self, html): m = re.search(r'
  • \s*(?P.+?)\s*
  • ', html) @@ -224,12 +230,12 @@ class Channel9IE(InfoExtractor): if contents is None: return contents - authors = self._extract_authors(html) + if len(contents) > 1: + raise ExtractorError('Got more than one entry') + result = contents[0] + result['authors'] = self._extract_authors(html) - for content in contents: - content['authors'] = authors - - return contents + return result def _extract_session(self, html, content_path): contents = self._extract_content(html, content_path) diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py new file mode 100644 index 000000000..0b67ba67d --- /dev/null +++ b/youtube_dl/extractor/chaturbate.py @@ -0,0 +1,50 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class ChaturbateIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.chaturbate.com/siswet19/', + 'info_dict': { + 'id': 'siswet19', + 'ext': 'mp4', + 'title': 're:^siswet19 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'age_limit': 18, + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://en.chaturbate.com/siswet19/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + m3u8_url = self._search_regex( + r'src=(["\'])(?Phttp.+?\.m3u8.*?)\1', webpage, + 'playlist', default=None, group='url') + + if not m3u8_url: + error = self._search_regex( + r']+class=(["\'])desc_span\1[^>]*>(?P[^<]+)', + webpage, 'error', group='error') + raise ExtractorError(error, expected=True) + + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + + return { + 'id': video_id, + 'title': self._live_title(video_id), + 'thumbnail': 'https://cdn-s.highwebmedia.com/uHK3McUtGCG3SMFcd4ZJsRv8/roomimage/%s.jpg' % video_id, + 'age_limit': self._rta_search(webpage), + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 91ebb0ce5..3e4bd10b6 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -151,12 +151,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): mobj = re.match(self._VALID_URL, url) if mobj.group('shortname'): - if mobj.group('shortname') in ('tds', 'thedailyshow'): - url = 'http://thedailyshow.cc.com/full-episodes/' - else: - url = 'http://thecolbertreport.cc.com/full-episodes/' - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - assert mobj is not None + return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes') if mobj.group('clip'): if mobj.group('videotitle'): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2fe0d5d37..a0c4af92f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -39,6 +39,7 @@ from ..utils import ( RegexNotFoundError, sanitize_filename, unescapeHTML, + unified_strdate, url_basename, xpath_text, xpath_with_ns, @@ -164,6 +165,7 @@ class InfoExtractor(object): with the "ext" entry and one of: * "data": The subtitles file contents * "url": A URL pointing to the subtitles file + "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions duration: Length of the video in seconds, as an integer. @@ -643,8 +645,9 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' - property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) + content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))' + property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -869,13 +872,18 @@ class InfoExtractor(object): time.sleep(timeout) def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, - transform_source=lambda s: fix_xml_ampersands(s).strip()): + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) - transform_source=transform_source) + transform_source=transform_source, + fatal=fatal) + + if manifest is False: + return manifest formats = [] manifest_version = '1.0' @@ -896,7 +904,10 @@ class InfoExtractor(object): # may differ leading to inability to resolve the format by requested # bitrate in f4m downloader if determine_ext(manifest_url) == 'f4m': - formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id)) + f4m_formats = self._extract_f4m_formats( + manifest_url, video_id, preference, f4m_id, fatal=fatal) + if f4m_formats: + formats.extend(f4m_formats) continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ @@ -1044,6 +1055,7 @@ class InfoExtractor(object): video_id = os.path.splitext(url_basename(smil_url))[0] title = None description = None + upload_date = None for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): name = meta.attrib.get('name') content = meta.attrib.get('content') @@ -1053,11 +1065,22 @@ class InfoExtractor(object): title = content elif not description and name in ('description', 'abstract'): description = content + elif not upload_date and name == 'date': + upload_date = unified_strdate(content) + + thumbnails = [{ + 'id': image.get('type'), + 'url': image.get('src'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')] return { 'id': video_id, 'title': title or video_id, 'description': description, + 'upload_date': upload_date, + 'thumbnails': thumbnails, 'formats': formats, 'subtitles': subtitles, } @@ -1084,7 +1107,7 @@ class InfoExtractor(object): if not src: continue - bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) filesize = int_or_none(video.get('size') or video.get('fileSize')) width = int_or_none(video.get('width')) height = int_or_none(video.get('height')) @@ -1116,8 +1139,10 @@ class InfoExtractor(object): src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) if proto == 'm3u8' or src_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, ext or 'mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) continue if src_ext == 'f4m': @@ -1129,10 +1154,12 @@ class InfoExtractor(object): } f4m_url += '&' if '?' in f4m_url else '?' f4m_url += compat_urllib_parse.urlencode(f4m_params) - formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) + f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) continue - if src_url.startswith('http'): + if src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url, diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py index 4fb178165..dedb810a0 100644 --- a/youtube_dl/extractor/criterion.py +++ b/youtube_dl/extractor/criterion.py @@ -27,9 +27,7 @@ class CriterionIE(InfoExtractor): final_url = self._search_regex( r'so.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url') title = self._og_search_title(webpage) - description = self._html_search_regex( - r'', - webpage, 'video description') + description = self._html_search_meta('description', webpage) thumbnail = self._search_regex( r'so.addVariable\("thumbnailURL", "(.+?)"\)\;', webpage, 'thumbnail url') diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 2d90b2224..80a05cfee 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -119,7 +119,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): webpage, 'comment count', fatal=False)) player_v5 = self._search_regex( - r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', + [r'buildPlayer\(({.+?})\);', r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);'], webpage, 'player v5', default=None) if player_v5: player = self._parse_json(player_v5, video_id) diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py index 4ea37ebd9..e4180701d 100644 --- a/youtube_dl/extractor/engadget.py +++ b/youtube_dl/extractor/engadget.py @@ -10,7 +10,7 @@ from ..utils import ( class EngadgetIE(InfoExtractor): _VALID_URL = r'''(?x)https?://www.engadget.com/ - (?:video/5min/(?P\d+)| + (?:video(?:/5min)?/(?P\d+)| [\d/]+/.*?) ''' diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py new file mode 100644 index 000000000..adc43919e --- /dev/null +++ b/youtube_dl/extractor/europa.py @@ -0,0 +1,93 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + orderedSet, + parse_duration, + qualities, + unified_strdate, + xpath_text +) + + +class EuropaIE(InfoExtractor): + _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P[A-Za-z0-9-]+)' + _TESTS = [{ + 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', + 'md5': '574f080699ddd1e19a675b0ddf010371', + 'info_dict': { + 'id': 'I107758', + 'ext': 'mp4', + 'title': 'TRADE - Wikileaks on TTIP', + 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20150811', + 'duration': 34, + 'view_count': int, + 'formats': 'mincount:3', + } + }, { + 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786', + 'only_matching': True, + }, { + 'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + playlist = self._download_xml( + 'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=%s' % video_id, video_id) + + def get_item(type_, preference): + items = {} + for item in playlist.findall('./info/%s/item' % type_): + lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None) + if lang and label: + items[lang] = label.strip() + for p in preference: + if items.get(p): + return items[p] + + query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + preferred_lang = query.get('sitelang', ('en', ))[0] + + preferred_langs = orderedSet((preferred_lang, 'en', 'int')) + + title = get_item('title', preferred_langs) or video_id + description = get_item('description', preferred_langs) + thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail') + upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) + duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) + view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) + + language_preference = qualities(preferred_langs[::-1]) + + formats = [] + for file_ in playlist.findall('./files/file'): + video_url = xpath_text(file_, './url') + if not video_url: + continue + lang = xpath_text(file_, './lg') + formats.append({ + 'url': video_url, + 'format_id': lang, + 'format_note': xpath_text(file_, './lglabel'), + 'language_preference': language_preference(lang) + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnmail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'formats': formats + } diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py index a38b773e8..1585a03bb 100644 --- a/youtube_dl/extractor/expotv.py +++ b/youtube_dl/extractor/expotv.py @@ -33,20 +33,27 @@ class ExpoTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) player_key = self._search_regex( r'[0-9]+)' + _TEST = { + 'url': 'http://fc-zenit.ru/video/gl6785/', + 'md5': '458bacc24549173fe5a5aa29174a5606', + 'info_dict': { + 'id': '6785', + 'ext': 'mp4', + 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_title = self._html_search_regex(r'
    ([^<]+)', webpage, 'title') + + bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL') + bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw) + + formats = [{ + 'url': furl, + 'tbr': tbr, + } for furl, tbr in bitrates] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 157094e8c..2955965d9 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -2,11 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( - compat_str, compat_urllib_parse, + compat_parse_qs, + compat_urllib_parse_urlparse, + compat_urlparse, ) from ..utils import ( ExtractorError, + parse_duration, + replace_extension, ) @@ -28,6 +32,7 @@ class FiveMinIE(InfoExtractor): 'id': '518013791', 'ext': 'mp4', 'title': 'iPad Mini with Retina Display Review', + 'duration': 177, }, }, { @@ -38,9 +43,52 @@ class FiveMinIE(InfoExtractor): 'id': '518086247', 'ext': 'mp4', 'title': 'How to Make a Next-Level Fruit Salad', + 'duration': 184, }, }, ] + _ERRORS = { + 'ErrorVideoNotExist': 'We\'re sorry, but the video you are trying to watch does not exist.', + 'ErrorVideoNoLongerAvailable': 'We\'re sorry, but the video you are trying to watch is no longer available.', + 'ErrorVideoRejected': 'We\'re sorry, but the video you are trying to watch has been removed.', + 'ErrorVideoUserNotGeo': 'We\'re sorry, but the video you are trying to watch cannot be viewed from your current location.', + 'ErrorVideoLibraryRestriction': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', + 'ErrorExposurePermission': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', + } + _QUALITIES = { + 1: { + 'width': 640, + 'height': 360, + }, + 2: { + 'width': 854, + 'height': 480, + }, + 4: { + 'width': 1280, + 'height': 720, + }, + 8: { + 'width': 1920, + 'height': 1080, + }, + 16: { + 'width': 640, + 'height': 360, + }, + 32: { + 'width': 854, + 'height': 480, + }, + 64: { + 'width': 1280, + 'height': 720, + }, + 128: { + 'width': 640, + 'height': 360, + }, + } def _real_extract(self, url): video_id = self._match_id(url) @@ -59,26 +107,36 @@ class FiveMinIE(InfoExtractor): 'https://syn.5min.com/handlers/SenseHandler.ashx?' + query, video_id) if not response['success']: - err_msg = response['errorMessage'] - if err_msg == 'ErrorVideoUserNotGeo': - msg = 'Video not available from your location' - else: - msg = 'Aol said: %s' % err_msg - raise ExtractorError(msg, expected=True, video_id=video_id) + raise ExtractorError( + '%s said: %s' % ( + self.IE_NAME, + self._ERRORS.get(response['errorMessage'], response['errorMessage'])), + expected=True) info = response['binding'][0] - second_id = compat_str(int(video_id[:-2]) + 1) formats = [] - for quality, height in [(1, 320), (2, 480), (4, 720), (8, 1080)]: - if any(r['ID'] == quality for r in info['Renditions']): + parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs( + compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0]) + for rendition in info['Renditions']: + if rendition['RenditionType'] == 'm3u8': + formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls')) + elif rendition['RenditionType'] == 'aac': + continue + else: + rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) + quality = self._QUALITIES.get(rendition['ID'], {}) formats.append({ - 'format_id': compat_str(quality), - 'url': 'http://avideos.5min.com/%s/%s/%s_%s.mp4' % (second_id[-3:], second_id, video_id, quality), - 'height': height, + 'format_id': '%s-%d' % (rendition['RenditionType'], rendition['ID']), + 'url': rendition_url, + 'width': quality.get('width'), + 'height': quality.get('height'), }) + self._sort_formats(formats) return { 'id': video_id, 'title': info['Title'], + 'thumbnail': info.get('ThumbURL'), + 'duration': parse_duration(info.get('Duration')), 'formats': formats, } diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index 3bb4f6239..fb6d108c0 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -46,10 +46,10 @@ class FourTubeIE(InfoExtractor): thumbnail = self._html_search_meta('thumbnailUrl', webpage) uploader_id = self._html_search_regex( r'', - webpage, 'uploader id') + webpage, 'uploader id', fatal=False) uploader = self._html_search_regex( r'', - webpage, 'uploader') + webpage, 'uploader', fatal=False) categories_html = self._search_regex( r'(?s)>\s*Categories / Tags\s*.*?
      (.*?)
    ', @@ -68,13 +68,24 @@ class FourTubeIE(InfoExtractor): webpage, 'like count', fatal=False)) duration = parse_duration(self._html_search_meta('duration', webpage)) - params_js = self._search_regex( - r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)', - webpage, 'initialization parameters' - ) - params = self._parse_json('[%s]' % params_js, video_id) - media_id = params[0] - sources = ['%s' % p for p in params[2]] + media_id = self._search_regex( + r']+data-id=(["\'])(?P\d+)\1[^>]+data-quality=', webpage, + 'media id', default=None, group='id') + sources = [ + quality + for _, quality in re.findall(r']+data-quality=(["\'])(.+?)\1', webpage)] + if not (media_id and sources): + player_js = self._download_webpage( + self._search_regex( + r']id=(["\'])playerembed\1[^>]+src=(["\'])(?P.+?)\2', + webpage, 'player JS', group='url'), + video_id, 'Downloading player JS') + params_js = self._search_regex( + r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)', + player_js, 'initialization parameters') + params = self._parse_json('[%s]' % params_js, video_id) + media_id = params[0] + sources = ['%s' % p for p in params[2]] token_url = 'http://tkn.4tube.com/{0}/desktop/{1}'.format( media_id, '+'.join(sources)) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ce1ab3820..0e53cb154 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -191,7 +191,7 @@ class IqiyiIE(InfoExtractor): 'vid': video_id, 'vinfo': 1, 'tm': tm, - 'enc': self.md5_text((enc_key + tail)[1:64:2] + tail), + 'enc': self.md5_text(enc_key + tail), 'qyid': _uuid, 'tn': random.random(), 'um': 0, @@ -205,7 +205,9 @@ class IqiyiIE(InfoExtractor): def get_enc_key(self, swf_url, video_id): # TODO: automatic key extraction - enc_key = 'eac64f22daf001da6ba9aa8da4d501508bbe90a4d4091fea3b0582a85b38c2cc' # last update at 2015-09-23-23 for Zombie::bite + # last update at 2015-10-10 for Zombie::bite + # '7239670519b6ac209a0bee4ef0446a6b24894b8ac2751506e42116212a0d0272e505'[2:66][1::2] + enc_key = '97596c0abee04ab49ba25564161ad225' return enc_key def _real_extract(self, url): diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 1df084d87..eef7daa29 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -28,7 +28,7 @@ class JeuxVideoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) title = mobj.group(1) webpage = self._download_webpage(url, title) - title = self._html_search_meta('name', webpage) + title = self._html_search_meta('name', webpage) or self._og_search_title(webpage) config_url = self._html_search_regex( r'data-src="(/contenu/medias/video.php.*?)"', webpage, 'config URL') diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py new file mode 100644 index 000000000..fb03dd527 --- /dev/null +++ b/youtube_dl/extractor/limelight.py @@ -0,0 +1,229 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, +) + + +class LimelightBaseIE(InfoExtractor): + _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' + _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json' + + def _call_playlist_service(self, item_id, method, fatal=True): + return self._download_json( + self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method), + item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal) + + def _call_api(self, organization_id, item_id, method): + return self._download_json( + self._API_URL % (organization_id, self._API_PATH, item_id, method), + item_id, 'Downloading API %s JSON' % method) + + def _extract(self, item_id, pc_method, mobile_method, meta_method): + pc = self._call_playlist_service(item_id, pc_method) + metadata = self._call_api(pc['orgId'], item_id, meta_method) + mobile = self._call_playlist_service(item_id, mobile_method, fatal=False) + return pc, mobile, metadata + + def _extract_info(self, streams, mobile_urls, properties): + video_id = properties['media_id'] + formats = [] + + for stream in streams: + stream_url = stream.get('url') + if not stream_url: + continue + if '.f4m' in stream_url: + formats.extend(self._extract_f4m_formats(stream_url, video_id)) + else: + fmt = { + 'url': stream_url, + 'abr': float_or_none(stream.get('audioBitRate')), + 'vbr': float_or_none(stream.get('videoBitRate')), + 'fps': float_or_none(stream.get('videoFrameRate')), + 'width': int_or_none(stream.get('videoWidthInPixels')), + 'height': int_or_none(stream.get('videoHeightInPixels')), + 'ext': determine_ext(stream_url) + } + rtmp = re.search(r'^(?Prtmpe?://[^/]+/(?P.+))/(?Pmp4:.+)$', stream_url) + if rtmp: + format_id = 'rtmp' + if stream.get('videoBitRate'): + format_id += '-%d' % int_or_none(stream['videoBitRate']) + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': format_id, + }) + formats.append(fmt) + + for mobile_url in mobile_urls: + media_url = mobile_url.get('mobileUrl') + if not media_url: + continue + format_id = mobile_url.get('targetMediaPlatform') + if determine_ext(media_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + preference=-1, m3u8_id=format_id)) + else: + formats.append({ + 'url': media_url, + 'format_id': format_id, + 'preference': -1, + }) + + self._sort_formats(formats) + + title = properties['title'] + description = properties.get('description') + timestamp = int_or_none(properties.get('publish_date') or properties.get('create_date')) + duration = float_or_none(properties.get('duration_in_milliseconds'), 1000) + filesize = int_or_none(properties.get('total_storage_in_bytes')) + categories = [properties.get('category')] + tags = properties.get('tags', []) + thumbnails = [{ + 'url': thumbnail['url'], + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + } for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')] + + subtitles = {} + for caption in properties.get('captions', {}): + lang = caption.get('language_code') + subtitles_url = caption.get('url') + if lang and subtitles_url: + subtitles[lang] = [{ + 'url': subtitles_url, + }] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'timestamp': timestamp, + 'duration': duration, + 'filesize': filesize, + 'categories': categories, + 'tags': tags, + 'thumbnails': thumbnails, + 'subtitles': subtitles, + } + + +class LimelightMediaIE(LimelightBaseIE): + IE_NAME = 'limelight' + _VALID_URL = r'(?:limelight:media:|http://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P[a-z0-9]{32})' + _TESTS = [{ + 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86', + 'info_dict': { + 'id': '3ffd040b522b4485b6d84effc750cd86', + 'ext': 'flv', + 'title': 'HaP and the HB Prince Trailer', + 'description': 'md5:8005b944181778e313d95c1237ddb640', + 'thumbnail': 're:^https?://.*\.jpeg$', + 'duration': 144.23, + 'timestamp': 1244136834, + 'upload_date': '20090604', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + # video with subtitles + 'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335', + 'info_dict': { + 'id': 'a3e00274d4564ec4a9b29b9466432335', + 'ext': 'flv', + 'title': '3Play Media Overview Video', + 'description': '', + 'thumbnail': 're:^https?://.*\.jpeg$', + 'duration': 78.101, + 'timestamp': 1338929955, + 'upload_date': '20120605', + 'subtitles': 'mincount:9', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }] + _PLAYLIST_SERVICE_PATH = 'media' + _API_PATH = 'media' + + def _real_extract(self, url): + video_id = self._match_id(url) + + pc, mobile, metadata = self._extract( + video_id, 'getPlaylistByMediaId', 'getMobilePlaylistByMediaId', 'properties') + + return self._extract_info( + pc['playlistItems'][0].get('streams', []), + mobile['mediaList'][0].get('mobileUrls', []) if mobile else [], + metadata) + + +class LimelightChannelIE(LimelightBaseIE): + IE_NAME = 'limelight:channel' + _VALID_URL = r'(?:limelight:channel:|http://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P[a-z0-9]{32})' + _TEST = { + 'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082', + 'info_dict': { + 'id': 'ab6a524c379342f9b23642917020c082', + 'title': 'Javascript Sample Code', + }, + 'playlist_mincount': 3, + } + _PLAYLIST_SERVICE_PATH = 'channel' + _API_PATH = 'channels' + + def _real_extract(self, url): + channel_id = self._match_id(url) + + pc, mobile, medias = self._extract( + channel_id, 'getPlaylistByChannelId', + 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', 'media') + + entries = [ + self._extract_info( + pc['playlistItems'][i].get('streams', []), + mobile['mediaList'][i].get('mobileUrls', []) if mobile else [], + medias['media_list'][i]) + for i in range(len(medias['media_list']))] + + return self.playlist_result(entries, channel_id, pc['title']) + + +class LimelightChannelListIE(LimelightBaseIE): + IE_NAME = 'limelight:channel_list' + _VALID_URL = r'(?:limelight:channel_list:|http://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P[a-z0-9]{32})' + _TEST = { + 'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b', + 'info_dict': { + 'id': '301b117890c4465c8179ede21fd92e2b', + 'title': 'Website - Hero Player', + }, + 'playlist_mincount': 2, + } + _PLAYLIST_SERVICE_PATH = 'channel_list' + + def _real_extract(self, url): + channel_list_id = self._match_id(url) + + channel_list = self._call_playlist_service(channel_list_id, 'getMobileChannelListById') + + entries = [ + self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel') + for channel in channel_list['channelList']] + + return self.playlist_result(entries, channel_list_id, channel_list['title']) diff --git a/youtube_dl/extractor/megavideoz.py b/youtube_dl/extractor/megavideoz.py deleted file mode 100644 index af7ff07ea..000000000 --- a/youtube_dl/extractor/megavideoz.py +++ /dev/null @@ -1,56 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - xpath_text, -) - - -class MegaVideozIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?megavideoz\.eu/video/(?P[^/]+)(?:/(?P[^/]+))?' - _TEST = { - 'url': 'http://megavideoz.eu/video/WM6UB919XMXH/SMPTE-Universal-Film-Leader', - 'info_dict': { - 'id': '48723', - 'display_id': 'SMPTE-Universal-Film-Leader', - 'ext': 'mp4', - 'title': 'SMPTE Universal Film Leader', - 'thumbnail': 're:https?://.*?\.jpg', - 'duration': 10.93, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - webpage = self._download_webpage(url, display_id) - - if any(p in webpage for p in ('>Video Not Found<', '>404 Error<')): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - config = self._download_xml( - self._search_regex( - r"var\s+cnf\s*=\s*'([^']+)'", webpage, 'cnf url'), - display_id) - - video_url = xpath_text(config, './file', 'video url', fatal=True) - title = xpath_text(config, './title', 'title', fatal=True) - thumbnail = xpath_text(config, './image', 'thumbnail') - duration = float_or_none(xpath_text(config, './duration', 'duration')) - video_id = xpath_text(config, './mediaid', 'video id') or video_id - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration - } diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 925967753..1f5fc2145 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -10,7 +10,6 @@ from ..compat import ( ) from ..utils import ( ExtractorError, - clean_html, ) @@ -46,11 +45,11 @@ class NaverIE(InfoExtractor): m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"', webpage) if m_id is None: - m_error = re.search( - r'(?s)
    \s*(?:)?\s*

    (?P.+?)

    \s*
    ', - webpage) - if m_error: - raise ExtractorError(clean_html(m_error.group('msg')), expected=True) + error = self._html_search_regex( + r'(?s)
    \s*(?:)?\s*

    (?P.+?)

    \s*
    ', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) raise ExtractorError('couldn\'t extract vid and key') vid = m_id.group(1) key = m_id.group(2) diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index c10784f6b..d1688457f 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -126,7 +126,8 @@ class AppleDailyIE(NextMediaIE): 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'md5:23c0aac567dc08c9c16a3161a2c2e3cd', 'upload_date': '20150128', - } + }, + 'skip': 'redirect to http://www.appledaily.com.tw/animation/', }, { # No thumbnail 'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003673/', @@ -140,10 +141,19 @@ class AppleDailyIE(NextMediaIE): }, 'expected_warnings': [ 'video thumbnail', - ] + ], + 'skip': 'redirect to http://www.appledaily.com.tw/animation/', }, { 'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/', - 'only_matching': True, + 'md5': 'eaa20e6b9df418c912d7f5dec2ba734d', + 'info_dict': { + 'id': '35770334', + 'ext': 'mp4', + 'title': '咖啡占卜測 XU裝熟指數', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:7b859991a6a4fedbdf3dd3b66545c748', + 'upload_date': '20140417', + }, }] _URL_PATTERN = r'\{url: \'(.+)\'\}' diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 55dc6107d..200874d68 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -107,6 +107,20 @@ class NFLIE(InfoExtractor): 'timestamp': 1442618809, 'upload_date': '20150918', }, + }, { + # lowercase data-contentid + 'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7', + 'info_dict': { + 'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2', + 'ext': 'mp4', + 'title': 'Tomlin looks ahead to Ravens on a short week', + 'description': 'md5:32f3f7b139f43913181d5cbb24ecad75', + 'timestamp': 1443459651, + 'upload_date': '20150928', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood', 'only_matching': True, @@ -151,7 +165,7 @@ class NFLIE(InfoExtractor): group='config')) # For articles, the id in the url is not the video id video_id = self._search_regex( - r'(?:]+data-contentId\s*=\s*|contentId\s*:\s*)(["\'])(?P.+?)\1', + r'(?:]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P.+?)\1', webpage, 'video id', default=video_id, group='id') config = self._download_json(config_url, video_id, 'Downloading player config') url_template = NFLIE.prepend_host( diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index c8257719f..b0bdffc4e 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -167,8 +167,8 @@ class NowTVIE(InfoExtractor): 'app': app, 'play_path': 'mp4:%s' % play_path, 'ext': 'flv', - 'page_url': url, - 'player_url': 'http://rtl-now.rtl.de/includes/nc_player.swf', + 'page_url': 'http://rtlnow.rtl.de', + 'player_url': 'http://cdn.static-fra.de/now/vodplayer.swf', 'tbr': int_or_none(item.get('bitrate')), }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index d066a96db..8ac38a174 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( ExtractorError, float_or_none, @@ -49,7 +50,7 @@ class NRKIE(InfoExtractor): if data['usageRights']['isGeoBlocked']: raise ExtractorError( - 'NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', + 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', expected=True) video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81' @@ -196,20 +197,6 @@ class NRKTVIE(InfoExtractor): } ] - def _debug_print(self, txt): - if self._downloader.params.get('verbose', False): - self.to_screen('[debug] %s' % txt) - - def _get_subtitles(self, subtitlesurl, video_id, baseurl): - url = "%s%s" % (baseurl, subtitlesurl) - self._debug_print('%s: Subtitle url: %s' % (video_id, url)) - captions = self._download_xml( - url, video_id, 'Downloading subtitles') - lang = captions.get('lang', 'no') - return {lang: [ - {'ext': 'ttml', 'url': url}, - ]} - def _extract_f4m(self, manifest_url, video_id): return self._extract_f4m_formats( manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds') @@ -218,7 +205,7 @@ class NRKTVIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') part_id = mobj.group('part_id') - baseurl = mobj.group('baseurl') + base_url = mobj.group('baseurl') webpage = self._download_webpage(url, video_id) @@ -278,11 +265,14 @@ class NRKTVIE(InfoExtractor): self._sort_formats(formats) subtitles_url = self._html_search_regex( - r'data-subtitlesurl[ ]*=[ ]*"([^"]+)"', - webpage, 'subtitle URL', default=None) - subtitles = None + r'data-subtitlesurl\s*=\s*(["\'])(?P.+?)\1', + webpage, 'subtitle URL', default=None, group='url') + subtitles = {} if subtitles_url: - subtitles = self.extract_subtitles(subtitles_url, video_id, baseurl) + subtitles['no'] = [{ + 'ext': 'ttml', + 'url': compat_urlparse.urljoin(base_url, subtitles_url), + }] return { 'id': video_id, diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 683c81de3..3448736a2 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -108,12 +108,12 @@ class PBSIE(InfoExtractor): { 'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/', 'info_dict': { - 'id': '2280706814', + 'id': '2276541483', 'display_id': 'player', 'ext': 'mp4', - 'title': 'American Experience - Death and the Civil War', + 'title': 'American Experience - Death and the Civil War, Chapter 1', 'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.', - 'duration': 6705, + 'duration': 682, 'thumbnail': 're:^https?://.*\.jpg$', }, 'params': { @@ -134,8 +134,33 @@ class PBSIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + 'skip': 'Expired', + }, + { + # Video embedded in iframe containing angle brackets as attribute's value (e.g. + # "