# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import math
import netrc
import os
import random
import re
import socket
import sys
import time

from ..compat import (
    compat_cookiejar,
    compat_cookies,
    compat_etree_fromstring,
    compat_getpass,
    compat_http_client,
    compat_integer_types,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    orderedSet,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media (DASH, hls, hds)
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either an url
                                 or a path. If an url is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appearing on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", "description", "uploader",
    "uploader_id", "uploader_url" attributes with the same semantics as videos
    (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return compat_str(m.group('id'))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it in a list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self._downloader.params.get('geo_bypass_country', None) and
                self._GEO_BYPASS and
                self._downloader.params.get('geo_bypass', True) and
                not self._x_forwarded_for_ip and
                countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if isinstance(expected_status, compat_integer_types):
            return err.code == expected_status
        elif isinstance(expected_status, (list, tuple)):
            return err.code in expected_status
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            assert False

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
                'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as an xml.etree.ElementTree.Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))
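
    # Illustrative combination (hypothetical regex and page variable):
    # extractors commonly feed _search_regex output into _parse_json, e.g.
    #     data = self._parse_json(
    #         self._search_regex(
    #             r'window\.__DATA__\s*=\s*({.+?});', webpage, 'data',
    #             default='{}'),
    #         video_id, transform_source=js_to_json, fatal=False)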

    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        raise GeoRestrictedError(msg, countries=countries)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info
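
    # e.g. (illustrative): deferring an embedded video to another extractor:
    #     return self.url_result(youtube_url, ie='Youtube')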

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
        return video_info
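
    # e.g. (illustrative): wrapping per-video url_result() dicts:
    #     entries = [self.url_result(u) for u in video_urls]
    #     return self.playlist_result(entries, playlist_id, 'All episodes')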

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
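
    # e.g. (illustrative): a mandatory field and an optional one:
    #     title = self._search_regex(
    #         r'<h1[^>]*>([^<]+)</h1>', webpage, 'title')
    #     duration = self._search_regex(
    #         r'data-duration="(\d+)"', webpage, 'duration', default=None)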

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        are available, look in the netrc file using the netrc_machine or
        _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get(username_option) is not None:
            username = downloader_params[username_option]
            password = downloader_params[password_option]
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld = self._search_regex(
            JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
        default = kwargs.get('default', NO_DEFAULT)
        if not json_ld:
            return default if default is not NO_DEFAULT else {}
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
2016-07-09 04:28:04 +08:00
def _json_ld ( self , json_ld , video_id , fatal = True , expected_type = None ) :
2016-01-16 02:36:02 +08:00
if isinstance ( json_ld , compat_str ) :
json_ld = self . _parse_json ( json_ld , video_id , fatal = fatal )
if not json_ld :
return { }
info = { }
2016-08-06 00:14:32 +08:00
if not isinstance ( json_ld , ( list , tuple , dict ) ) :
return info
if isinstance ( json_ld , dict ) :
json_ld = [ json_ld ]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = is_e.get('interactionType')
                if not isinstance(interaction_type, compat_str):
                    continue
                interaction_count = int_or_none(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count
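        # Illustrative example (assumption): an interactionStatistic entry like
        #   {"@type": "InteractionCounter",
        #    "interactionType": "http://schema.org/LikeAction",
        #    "userInteractionCount": 5}
        # maps 'LikeAction' through INTERACTION_TYPE_MAP and sets
        # info['like_count'] = 5; already-set counters are never overwritten.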

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': e.get('contentUrl'),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema\.org/?$', e.get('@context')):
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type in ('TVEpisode', 'Episode'):
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    continue
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                break
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not input:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs
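    # Illustrative example (assumption, not in the original source): for
    #   <input type="hidden" name="csrf_token" value="abc123">
    #   <input type="text" name="query" value="ignored">
    # _hidden_inputs() returns {'csrf_token': 'abc123'} -- only hidden and
    # submit inputs carrying both a name (or id) and a value are kept.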

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    def _sort_formats(self, formats, field_preference=None):
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
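        # Illustrative note (assumption, not in the original source): because
        # _formats_key returns a tuple, sorting is lexicographic, so earlier
        # fields dominate later ones: with equal preference and
        # language_preference, a format with quality=2 always outranks one
        # with quality=1, regardless of tbr, filesize or resolution.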

    def _check_formats(self, formats, video_id):
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)

    @staticmethod
    def _remove_duplicate_formats(formats):
        format_urls = set()
        unique_formats = []
        for f in formats:
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats

    def _is_valid_url(self, url, video_id, item='video', headers={}):
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
            return True
        except ExtractorError as e:
            if isinstance(e.cause, compat_urllib_error.URLError):
                self.to_screen(
                    '%s: %s URL is invalid, skipping' % (video_id, item))
                return False
            raise

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url
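    # Illustrative example (assumption, not in the original source): with
    # default preferences,
    #   _proto_relative_url('//example.com/v.mp4') -> 'https://example.com/v.mp4'
    # while absolute URLs and None pass through unchanged.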

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)

    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None):
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal)

        if manifest is False:
            return []

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats

    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
        return {
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 100 if preference else -100,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }

    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False):
        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)

        if res is False:
            return []

        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

        return self._parse_m3u8_formats(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, m3u8_id=m3u8_id, live=live)

    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211
        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]
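        # Illustrative distinction (assumption, not part of the original
        # source): a master playlist looks like
        #   #EXTM3U
        #   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
        #   low/index.m3u8
        # while a media playlist carries #EXT-X-TARGETDURATION plus the actual
        # segment URIs, which is why it was returned above as a single format.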

        groups = {}
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Although the specification does not mention the NAME attribute
            # for the EXT-X-STREAM-INF tag, it may still sometimes be present
            # (see [1] or the vidio test in
            # TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
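                # Illustrative example (assumption, not in the original
                # source): a Unified Streaming Platform URL such as
                #   .../video.ism/video.m3u8?...audio=128000-video=2400000...
                # yields abr=128.0 and vbr=2400.0 via the regex above.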
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected: for example, [2]
                # contains an EXT-X-STREAM-INF tag that references an AUDIO
                # rendition group but has no CODECS and, despite referencing
                # an audio group, represents a complete (with audio and video)
                # format. So, for such cases we will ignore references to
                # rendition groups and treat them as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats

    @staticmethod
    def _xpath_ns(path, namespace=None):
        if not namespace:
            return path
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
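    # Illustrative example (assumption, not in the original source):
    #   _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
    # expands to './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta',
    # i.e. the Clark notation that ElementTree expects for namespaced lookups.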

    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)

        if smil is False:
            assert not fatal
            return []

        namespace = self._parse_smil_namespace(smil)

        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)

    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        if smil is False:
            return {}
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)

    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)

    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _parse_smil_namespace(self, smil):
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)

    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats

    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        urls = []
        subtitles = {}
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src or src in urls:
                continue
            urls.append(src)
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
                'url': src,
                'ext': ext,
            })
        return subtitles

    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
        xspf = self._download_xml(
            xspf_url, playlist_id, 'Downloading xspf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        if xspf is False:
            return []
        return self._parse_xspf(
            xspf, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))
2015-08-09 21:07:18 +08:00
2018-03-18 03:46:50 +08:00
def _parse_xspf ( self , xspf_doc , playlist_id , xspf_url = None , xspf_base_url = None ) :
2015-08-09 21:07:18 +08:00
NS_MAP = {
' xspf ' : ' http://xspf.org/ns/0/ ' ,
' s1 ' : ' http://static.streamone.nl/player/ns/0 ' ,
}
entries = [ ]
2018-03-18 03:46:50 +08:00
for track in xspf_doc . findall ( xpath_with_ns ( ' ./xspf:trackList/xspf:track ' , NS_MAP ) ) :
2015-08-09 21:07:18 +08:00
title = xpath_text (
2015-08-09 21:18:50 +08:00
track , xpath_with_ns ( ' ./xspf:title ' , NS_MAP ) , ' title ' , default = playlist_id )
2015-08-09 21:07:18 +08:00
description = xpath_text (
track , xpath_with_ns ( ' ./xspf:annotation ' , NS_MAP ) , ' description ' )
thumbnail = xpath_text (
track , xpath_with_ns ( ' ./xspf:image ' , NS_MAP ) , ' thumbnail ' )
duration = float_or_none (
xpath_text ( track , xpath_with_ns ( ' ./xspf:duration ' , NS_MAP ) , ' duration ' ) , 1000 )
2018-03-18 03:46:50 +08:00
formats = [ ]
for location in track . findall ( xpath_with_ns ( ' ./xspf:location ' , NS_MAP ) ) :
format_url = urljoin ( xspf_base_url , location . text )
if not format_url :
continue
formats . append ( {
' url ' : format_url ,
' manifest_url ' : xspf_url ,
' format_id ' : location . get ( xpath_with_ns ( ' s1:label ' , NS_MAP ) ) ,
' width ' : int_or_none ( location . get ( xpath_with_ns ( ' s1:width ' , NS_MAP ) ) ) ,
' height ' : int_or_none ( location . get ( xpath_with_ns ( ' s1:height ' , NS_MAP ) ) ) ,
} )
2015-08-09 21:07:18 +08:00
self . _sort_formats ( formats )
entries . append ( {
' id ' : playlist_id ,
' title ' : title ,
' description ' : description ,
' thumbnail ' : thumbnail ,
' duration ' : duration ,
' formats ' : formats ,
} )
return entries

    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
        res = self._download_xml_handle(
            mpd_url, video_id,
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
            fatal=fatal)
        if res is False:
            return []
        mpd_doc, urlh = res
        mpd_base_url = base_url(urlh.geturl())

        return self._parse_mpd_formats(
            mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
            formats_dict=formats_dict, mpd_url=mpd_url)

    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        if mpd_doc.get('type') == 'dynamic':
            return []

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        def _add_ns(path):
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements; we only extract those that are
            # relevant for us.
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        ms_info['total_number'] = 0
                        ms_info['s'] = []
                        for s in s_e:
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
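                        # Illustrative example (assumption, not in the
                        # original source): a timeline entry
                        #   <S t="0" d="4000" r="2"/>
                        # contributes 1 + r = 3 segments of duration 4000 (in
                        # @timescale units), so total_number becomes 3.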
                start_number = source.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                if segment_urls_e:
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            else:
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                    if media:
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                    if initialization:
                        ms_info['initialization'] = initialization
                    else:
                        extract_Initialization(segment_template)
            return ms_info

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        formats = []
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
                'start_number': 1,
                'timescale': 1,
            })
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                if is_drm_protected(adaptation_set):
                    continue
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                        continue
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                        pass
                    elif content_type in ('video', 'audio'):
                        base_url = ''
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                                    break
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                                mpd_base_url += '/'
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                        f = {
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'url': base_url,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                        }
                        f.update(parse_codecs(representation_attrib.get('codecs')))
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
                            tmpl = representation_ms_info[template_name]
                            # First of all, % characters outside $...$ templates
                            # must be escaped by doubling for proper processing
                            # by % operator string formatting used further (see
                            # https://github.com/rg3/youtube-dl/issues/16867).
                            t = ''
                            in_template = False
                            for c in tmpl:
                                t += c
                                if c == '$':
                                    in_template = not in_template
                                elif c == '%' and not in_template:
                                    t += c
                            # Next, $...$ templates are translated to their
                            # %(...) counterparts to be used with % operator
                            t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            t = t.replace('$$', '$')
                            return t
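                        # Illustrative example (assumption, not in the original
                        # source; 'video1' is a hypothetical representation id):
                        # a media template 'seg-$RepresentationID$-$Number%05d$.m4s'
                        # becomes 'seg-video1-%(Number)05d.m4s', so that
                        # (template % {'Number': 3}) yields 'seg-video1-00003.m4s'.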
                        # @initialization is a regular template like @media one
                        # so it should be handled just the same way (see
                        # https://github.com/rg3/youtube-dl/issues/11605)
                        if 'initialization' in representation_ms_info:
                            initialization_template = prepare_template(
                                'initialization',
                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                                # $Time$ shall not be included for @initialization thus
                                # only $Bandwidth$ remains
                                ('Bandwidth', ))
                            representation_ms_info['initialization_url'] = initialization_template % {
                                'Bandwidth': bandwidth,
                            }

                        def location_key(location):
                            return 'url' if re.match(r'^https?://', location) else 'path'

                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                            media_location_key = location_key(media_template)

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    media_location_key: media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    },
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                            else:
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_time = 0
                                segment_d = None
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    }
                                    representation_ms_info['fragments'].append({
                                        media_location_key: segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                    })

                                for num, s in enumerate(representation_ms_info['s']):
                                    segment_time = s.get('t') or segment_time
                                    segment_d = s['d']
                                    add_segment_url()
                                    segment_number += 1
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                        add_segment_url()
                                        segment_number += 1
                                    segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # No media template
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            fragments = []
                            segment_index = 0
                            timescale = representation_ms_info['timescale']
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                for r in range(s.get('r', 0) + 1):
                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                    fragments.append({
                                        location_key(segment_uri): segment_uri,
                                        'duration': duration,
                                    })
                                    segment_index += 1
                            representation_ms_info['fragments'] = fragments
                        elif 'segment_urls' in representation_ms_info:
                            # Segment URLs with no SegmentTimeline
                            # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                            # https://github.com/rg3/youtube-dl/pull/14844
                            fragments = []
                            segment_duration = float_or_none(
                                representation_ms_info['segment_duration'],
                                representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                            for segment_url in representation_ms_info['segment_urls']:
                                fragment = {
                                    location_key(segment_url): segment_url,
                                }
                                if segment_duration:
                                    fragment['duration'] = segment_duration
                                fragments.append(fragment)
                            representation_ms_info['fragments'] = fragments
                        # NB: MPD manifest may contain direct URLs to unfragmented media.
                        # No fragments key is present in this case.
                        if 'fragments' in representation_ms_info:
                            f.update({
                                'fragment_base_url': base_url,
                                'fragments': [],
                                'protocol': 'http_dash_segments',
                            })
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                f['fragments'].append({location_key(initialization_url): initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                        # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
                        # is not necessarily unique within a Period thus formats with
                        # the same `format_id` are quite possible. There are numerous examples
                        # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
                        # https://github.com/rg3/youtube-dl/issues/13919)
                        full_info = formats_dict.get(representation_id, {}).copy()
                        full_info.update(f)
                        formats.append(full_info)
                    else:
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        return formats

    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
        res = self._download_xml_handle(
            ism_url, video_id,
            note=note or 'Downloading ISM manifest',
            errnote=errnote or 'Failed to download ISM manifest',
            fatal=fatal)
        if res is False:
            return []
        ism_doc, urlh = res

        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)

    def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
            return []

        duration = int(ism_doc.attrib['Duration'])
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

        formats = []
        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio'):
                continue
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            for track in stream.findall('QualityLevel'):
                fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                    continue
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))

                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

                fragments = []
                fragment_ctx = {
                    'time': 0,
                }
                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        try:
                            next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
                        except IndexError:
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                        fragments.append({
                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        })
                        fragment_ctx['time'] += fragment_ctx['duration']
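                # Illustrative example (assumption, not in the original
                # source): <c t="0" d="20000000" r="3"/> with the default
                # 10000000 timescale expands to three 2-second fragments; when
                # @d is missing, the duration is derived from the next
                # fragment's @t (or from the total duration for the last one).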

                format_id = []
                if ism_id:
                    format_id.append(ism_id)
                if stream_name:
                    format_id.append(stream_name)
                format_id.append(compat_str(tbr))

                formats.append({
                    'format_id': '-'.join(format_id),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    '_download_params': {
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
        return formats

    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
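        """
        Extract formats and subtitles from <video> and <audio> tags (and
        their amp-video/amp-audio AMP counterparts) found in webpage,
        following nested <source> and <track> children. Returns a list of
        media_info dicts, each with 'formats', 'subtitles' and 'thumbnail'.
        """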
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # res attribute is not standard but seen several times
                        # in the wild
                        f.update({
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries

    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
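        """
        Extract both HDS and HLS formats from an Akamai streaming URL,
        deriving the f4m and m3u8 manifest URLs from each other; hosts may
        override the host used per protocol (keys: 'hds', 'hls').
        """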
        formats = []

        hdcore_sign = 'hdcore=3.7.0'
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
        return formats

    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
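        """
        Probe the HLS, HDS, DASH, RTMP and RTSP renditions that a Wowza
        Streaming Engine exposes for a single stream. Protocols listed in
        skip_protocols are not probed. E.g. (hypothetical URL):
            self._extract_wowza_formats(
                'http://example.com/live/stream/playlist.m3u8', video_id,
                skip_protocols=['dash'])
        """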
        query = compat_urlparse.urlparse(url).query
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        mobj = re.search(
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
        formats = []

        def manifest_url(manifest):
            m_url = '%s/%s' % (http_base_url, manifest)
            if query:
                m_url += '?%s' % query
            return m_url

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': '%s:%s' % (protocol, url_base),
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats

    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
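        """
        Find a jwplayer(...).setup(...) call in webpage and return its
        options object parsed as a dict, or None if absent or unparsable.
        """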
        mobj = re.search(
            r'(?s)jwplayer\((?P<quote>[\'"])[^\'"]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
            webpage)
        if mobj:
            try:
                jwplayer_data = self._parse_json(mobj.group('options'),
                                                 video_id=video_id,
                                                 transform_source=transform_source)
            except ExtractorError:
                pass
            else:
                if isinstance(jwplayer_data, dict):
                    return jwplayer_data

    def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
        jwplayer_data = self._find_jwplayer_data(
            webpage, video_id, transform_source=js_to_json)
        return self._parse_jwplayer_data(
            jwplayer_data, video_id, *args, **kwargs)

    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
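        """
        Convert a JWPlayer setup options dict into an info dict, or into a
        playlist result when the options describe multiple playlist items;
        legacy flattened layouts are normalized first.
        """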
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)

    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
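        """
        Convert a JWPlayer 'sources' list into format dicts, delegating
        HLS, DASH and SMIL sources to the corresponding extractors and
        handling plain HTTP(S) and RTMP sources inline; duplicate source
        URLs are skipped.
        """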
        urls = []
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = self._proto_relative_url(source.get('file'))
            if not source_url:
                continue
            if base_url:
                source_url = compat_urlparse.urljoin(base_url, source_url)
            if source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats

    def _live_title(self, name):
        """ Generate the title for a live video """
        now = datetime.datetime.now()
        now_str = now.strftime('%Y-%m-%d %H:%M')
        return name + ' ' + now_str

    def _int(self, v, name, fatal=False, **kwargs):
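        """ Like int_or_none(), but warn or raise (fatal=True) on failure """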
        res = int_or_none(v, **kwargs)
        if res is None:
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            if fatal:
                raise ExtractorError(msg)
            else:
                self._downloader.report_warning(msg)
        return res

    def _float(self, v, name, fatal=False, **kwargs):
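        """ Like float_or_none(), but warn or raise (fatal=True) on failure """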
        res = float_or_none(v, **kwargs)
        if res is None:
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            if fatal:
                raise ExtractorError(msg)
            else:
                self._downloader.report_warning(msg)
        return res

    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                    path='/', secure=False, discard=False, rest={}, **kwargs):
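        """ Add a cookie for domain to the downloader's cookiejar """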
        cookie = compat_cookiejar.Cookie(
            0, name, value, port, port is not None, domain, True,
            domain.startswith('.'), path, True, secure, expire_time,
            discard, None, None, rest)
        self._downloader.cookiejar.set_cookie(cookie)

    def _get_cookies(self, url):
        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
        req = sanitized_Request(url)
        self._downloader.cookiejar.add_cookie_header(req)
        return compat_cookies.SimpleCookie(req.get_header('Cookie'))

    def get_testcases(self, include_onlymatching=False):
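        """
        Yield this extractor's test cases from _TEST/_TESTS, tagging each
        with the extractor's name; 'only_matching' tests are skipped unless
        include_onlymatching is True.
        """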
        t = getattr(self, '_TEST', None)
        if t:
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = [t]
        else:
            tests = getattr(self, '_TESTS', [])
        for t in tests:
            if not include_onlymatching and t.get('only_matching', False):
                continue
            t['name'] = type(self).__name__[:-len('IE')]
            yield t

    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
                return True
            any_restricted = any_restricted or is_restricted
        return not any_restricted

    def extract_subtitles(self, *args, **kwargs):
        if (self._downloader.params.get('writesubtitles', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
        return {}

    def _get_subtitles(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    @staticmethod
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
        return ret

    @classmethod
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
        return ret

    def extract_automatic_captions(self, *args, **kwargs):
        if (self._downloader.params.get('writeautomaticsub', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
        return {}

    def _get_automatic_captions(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    def mark_watched(self, *args, **kwargs):
        if (self._downloader.params.get('mark_watched', False) and
                (self._get_login_info()[0] is not None or
                    self._downloader.params.get('cookiefile') is not None)):
            self._mark_watched(*args, **kwargs)

    def _mark_watched(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    def geo_verification_headers(self):
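        """
        Return extra HTTP headers (currently only Ytdl-request-proxy) to
        send when checking geo-restricted content through the configured
        geo_verification_proxy.
        """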
        headers = {}
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
        return headers

    def _generic_id(self, url):
        return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])

    def _generic_title(self, url):
        return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])

    def _cf_solve_challenge(self, body, domain):
        '''
        Solve a Cloudflare IUAM challenge.
        @param <String> body   HTML of the challenge page
        @param <String> domain result of `compat_urlparse.urlparse().netloc`
        Original code from: https://github.com/Anorov/cloudflare-scrape/blob/master/cfscrape/__init__.py#L112-L149
        '''
        try:
            js = re.search(r"setTimeout\(function\(\){\s+(var s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n", body).group(1)
        except Exception:
            raise ExtractorError("Unable to identify Cloudflare IUAM Javascript on website.")
        js = re.sub(r"a\.value = (.+\+ t\.length).+", r"\1", js)
        js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js).replace("t.length", str(len(domain)))
        # Strip characters that could be used to exit the string context
        # These characters are not currently used in Cloudflare's arithmetic snippet
        js = re.sub(r"[\n\\']", "", js)
        if "toFixed" not in js:
            raise ExtractorError("Error parsing Cloudflare IUAM Javascript challenge.")
        # Use vm.runInNewContext to safely evaluate code
        # The sandboxed code cannot use the Node.js standard library
        js = "console.log(require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000}));" % js

        import subprocess
        try:
            result = subprocess.check_output(["node", "-e", js]).strip()
        except OSError as e:
            if e.errno == 2:
                raise ExtractorError(
                    "Missing Node.js runtime. Node is required and must be in the PATH "
                    "(check with `node -v`). Your Node binary may be called `nodejs` rather "
                    "than `node`, in which case you may need to run "
                    "`apt-get install nodejs-legacy` on some Debian-based systems. "
                    "(Please read the cfscrape README's Dependencies section: "
                    "https://github.com/Anorov/cloudflare-scrape#dependencies.)")
            raise
        except Exception:
            self.to_screen("Error executing Cloudflare IUAM Javascript.")
            raise
        try:
            float(result)
        except Exception:
            raise ExtractorError("Cloudflare IUAM challenge returned unexpected answer.")
        return result

    def has_cf_challenge(self, html):
        return '/cdn-cgi/l/chk_jschl' in html

    def cf_solve_and_download_webpage(self, html, download_url):
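        """
        If html contains a Cloudflare IUAM challenge, solve it, submit the
        answer and return the resulting webpage; return False when no
        challenge is present.
        """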
        if not self.has_cf_challenge(html):
            return False
        parsed_url = compat_urlparse.urlparse(download_url)
        domain = parsed_url.netloc
        submit_url = '%s://%s/cdn-cgi/l/chk_jschl' % (parsed_url.scheme, domain)
        form_data = self._form_hidden_inputs('challenge-form', html)
        form_data['jschl_answer'] = self._cf_solve_challenge(html, domain)
        self._sleep(5, None, 'Solving Cloudflare challenge (5s)')
        return self._download_webpage(
            submit_url, None, 'Sending Cloudflare challenge',
            'Wrong Cloudflare challenge', query=form_data)


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """ Get a specified number of results for a query """
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY