2015-10-18 17:07:48 +08:00
# coding: utf-8
2015-06-28 03:22:25 +08:00
from __future__ import unicode_literals
import re
from . common import InfoExtractor
from . . compat import compat_urllib_request
from . . utils import (
float_or_none ,
2015-10-18 17:11:55 +08:00
xpath_text ,
2015-10-18 18:04:13 +08:00
remove_end ,
2015-11-12 02:13:42 +08:00
int_or_none ,
ExtractorError ,
2015-06-28 03:22:25 +08:00
)
class TwitterCardIE(InfoExtractor):
    """Extractor for Twitter card pages (``twitter.com/i/cards/tfw/v1/<id>``)."""

    IE_NAME = 'twitter:card'
    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
            'md5': '4fa26a35f9d1bf4b646590ba8e84be19',
            'info_dict': {
                'id': '560070183650213889',
                'ext': 'mp4',
                'title': 'TwitterCard',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 30.033,
            }
        },
        {
            'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
            'md5': '7ee2a553b63d1bccba97fbed97d9e1c8',
            'info_dict': {
                'id': '623160978427936768',
                'ext': 'mp4',
                'title': 'TwitterCard',
                'thumbnail': r're:^https?://.*\.jpg',
                'duration': 80.155,
            },
        },
        {
            'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
            'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814',
            'info_dict': {
                'id': 'dq4Oj5quskI',
                'ext': 'mp4',
                'title': 'Ubuntu 11.10 Overview',
                'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/',
                'upload_date': '20111013',
                'uploader': 'OMG! Ubuntu!',
                'uploader_id': 'omgubuntu',
            },
        }
    ]

    def _real_extract(self, url):
        """Extract the media behind a Twitter card page.

        The card page is downloaded once per User-Agent because different
        formats are served for different User-Agents.  Per downloaded page:

        * an embedded YouTube iframe short-circuits to a ``url_result``
          delegated to the ``Youtube`` extractor;
        * otherwise the ``data-player-config`` attribute is parsed as JSON;
          a config without ``playlist`` but with ``vmapUrl`` yields a single
          format taken from the VMAP XML's ``MediaFile`` element, while a
          config with ``playlist`` yields the first playlist entry's source.

        Returns an info dict with ``id``, ``title``, ``thumbnail``,
        ``duration`` and ``formats``.

        Raises ExtractorError if the VMAP document has no ``MediaFile`` text.
        """
        video_id = self._match_id(url)

        # Different formats served for different User-Agents
        USER_AGENTS = [
            'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',  # mp4
            'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',  # webm
        ]

        config = None
        formats = []
        for user_agent in USER_AGENTS:
            request = compat_urllib_request.Request(url)
            request.add_header('User-Agent', user_agent)
            webpage = self._download_webpage(request, video_id)

            # Dots are escaped so that only the literal YouTube host matches.
            youtube_url = self._html_search_regex(
                r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"',
                webpage, 'youtube iframe', default=None)
            if youtube_url:
                return self.url_result(youtube_url, 'Youtube')

            config = self._parse_json(self._html_search_regex(
                r'data-player-config="([^"]+)"', webpage, 'data player config'),
                video_id)

            if 'playlist' not in config:
                if 'vmapUrl' in config:
                    vmap_data = self._download_xml(config['vmapUrl'], video_id)
                    video_url = xpath_text(vmap_data, './/MediaFile')
                    if not video_url:
                        # Previously this path crashed with AttributeError
                        # on ``None.strip()``; report a proper extractor
                        # error instead.
                        raise ExtractorError(
                            'Unable to find MediaFile in VMAP data')
                    formats.append({
                        'url': video_url.strip(),
                    })
                    break   # same video regardless of UA
                continue

            video_url = config['playlist'][0]['source']

            f = {
                'url': video_url,
            }

            # Dimensions, when present, are encoded in the URL path
            # as ".../<width>x<height>/...".
            m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
            if m:
                f.update({
                    'width': int(m.group('width')),
                    'height': int(m.group('height')),
                })
            formats.append(f)
        self._sort_formats(formats)

        # Metadata comes from the config of the last page that was parsed.
        thumbnail = config.get('posterImageUrl')
        duration = float_or_none(config.get('duration'))

        return {
            'id': video_id,
            'title': 'TwitterCard',
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }
2015-07-22 05:38:40 +08:00
2015-10-18 17:16:57 +08:00
class TwitterIE(InfoExtractor):
    """Extractor for tweet status pages (``twitter.com/<user>/status/<id>``)."""

    IE_NAME = 'twitter'
    _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P<user_id>[^/]+)/status/(?P<id>\d+)'
    _TEMPLATE_URL = 'https://twitter.com/%s/status/%s'

    _TESTS = [{
        'url': 'https://twitter.com/freethenipple/status/643211948184596480',
        'md5': 'db6612ec5d03355953c3ca9250c97e5e',
        'info_dict': {
            'id': '643211948184596480',
            'ext': 'mp4',
            'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 12.922,
            'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"',
            'uploader': 'FREE THE NIPPLE',
            'uploader_id': 'freethenipple',
        },
    }, {
        'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
        'md5': 'f36dcd5fb92bf7057f155e7d927eeb42',
        'info_dict': {
            'id': '657991469417025536',
            'ext': 'mp4',
            'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai',
            'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"',
            'thumbnail': r're:^https?://.*\.png',
            'uploader': 'Gifs',
            'uploader_id': 'giphz',
        },
    }, {
        'url': 'https://twitter.com/starwars/status/665052190608723968',
        'md5': '39b7199856dee6cd4432e72c74bc69d4',
        'info_dict': {
            'id': '665052190608723968',
            'ext': 'mp4',
            'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.',
            'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."',
            'uploader_id': 'starwars',
            'uploader': 'Star Wars',
        },
    }]

    def _real_extract(self, url):
        """Extract the video or animated GIF attached to a tweet.

        The canonical status page (built from ``_TEMPLATE_URL``) is
        downloaded and its Open Graph title/description provide the uploader
        name, title and description.  If the page references a Twitter card,
        a ``url_transparent`` result pointing at the ``TwitterCard``
        extractor is returned; otherwise an ``animated-gif`` ``<video>`` tag
        is looked up and returned as a direct URL.

        Raises ExtractorError when neither a card nor an animated GIF is
        found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        user_id = mobj.group('user_id')
        twid = mobj.group('id')

        webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid)

        # The og:title presumably reads "<name> on Twitter" (see the expected
        # 'uploader' values in _TESTS); drop that suffix.
        username = remove_end(self._og_search_title(webpage), ' on Twitter')

        # Trim surrounding whitespace, flatten newlines, then drop the curly
        # quotes wrapping the tweet text.  The first strip() previously took
        # an empty/blank argument and so did not remove general whitespace.
        title = description = self._og_search_description(webpage).strip().replace('\n', ' ').strip('“”')

        # strip 'https -_t.co_BJYgOjSeGA' junk from filenames
        title = re.sub(r'\s+(https?://[^ ]+)', '', title)

        info = {
            'uploader_id': user_id,
            'uploader': username,
            'webpage_url': url,
            'description': '%s on Twitter: "%s"' % (username, description),
            'title': username + ' - ' + title,
        }

        card_id = self._search_regex(
            r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url', default=None)
        if card_id:
            card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id
            # Delegate to the card extractor while keeping the metadata
            # collected above.
            info.update({
                '_type': 'url_transparent',
                'ie_key': 'TwitterCard',
                'url': card_url,
            })
            return info

        # NOTE(review): each optional attribute group follows a greedy
        # ``[^>]+``, so height/width/poster may never actually be captured
        # -- confirm against real markup before relying on them.
        mobj = re.search(r'''(?x)
            <video[^>]+class="animated-gif"[^>]+
                (?:data-height="(?P<height>\d+)")?[^>]+
                (?:data-width="(?P<width>\d+)")?[^>]+
                (?:poster="(?P<poster>[^"]+)")?[^>]*>\s*
                <source[^>]+video-src="(?P<url>[^"]+)"
        ''', webpage)

        if mobj:
            info.update({
                'id': twid,
                'url': mobj.group('url'),
                'height': int_or_none(mobj.group('height')),
                'width': int_or_none(mobj.group('width')),
                'thumbnail': mobj.group('poster'),
            })
            return info

        raise ExtractorError('There\'s no video in this tweet.')