# 2015-06-27 12:11:23 +08:00
from __future__ import unicode_literals
import re
from . common import InfoExtractor
from . . utils import str_to_int
class MovieFapIE(InfoExtractor):
    """Extractor for video pages on moviefap.com."""

    _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<name>[a-z-_]+)'

    _TESTS = [{
        'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html',
        'md5': 'fa56683e291fc80635907168a743c9ad',
        'info_dict': {
            'id': 'e5da0d3edce5404418f5',
            'ext': 'flv',
            'title': 'Jeune Couple Russe',
            'description': 'Amateur',
            'thumbnail': 'http://pic.moviefap.com/thumbs/e5/949-18l.jpg',
            'uploader_id': 'whiskeyjar',
            'display_id': 'jeune-couple-russe'
        }
    }, {
        'url': 'http://www.moviefap.com/videos/3080837f6712355015c2/busty-british-blonde-takes-backdoor-in-fake-taxi.html',
        'md5': 'bedef72cb23d27a20755fc430a6d7a0e',
        'info_dict': {
            'id': '3080837f6712355015c2',
            'ext': 'mp4',
            'title': 'Busty British blonde takes backdoor in fake taxi',
            'description': 'Big boobs British blonde flashing in fake taxi then giving titsjob and rimjob in the back seat before getting big cock up her tight ass',
            'thumbnail': 'http://img.moviefap.com/a16:9w990r/thumbs/30/322021-18l.jpg',
            'uploader_id': 'momcikoper',
            'display_id': 'busty-british-blonde-takes-backdoor-in-fake-taxi'
        }
    }]

    @staticmethod
    def __get_thumbnail_data(xml):
        """
        Constructs a list of video thumbnails from timeline preview images.

        :param xml: the information XML document to parse
        :return: a list of dicts with 'url', 'width' and 'height' keys, one
                 per preview image index; empty when the timeline data is
                 missing or incomplete
        """
        timeline = xml.find('timeline')
        if timeline is None:
            # not all videos have the data - ah well
            return []

        # findtext() yields None for a missing child instead of raising
        # AttributeError the way find(...).text does
        pattern = timeline.findtext('imagePattern')
        first = timeline.findtext('imageFirst')
        last = timeline.findtext('imageLast')
        if not (pattern and first and last):
            # without the URL pattern and the index range nothing can be built
            return []

        # dimensions are informational only; leave them unset when absent
        width = timeline.findtext('imageWidth')
        height = timeline.findtext('imageHeight')
        width = str_to_int(width) if width else None
        height = str_to_int(height) if height else None

        # the '#' placeholder in the pattern is replaced by each image index
        return [{
            'url': pattern.replace('#', str(index)),
            'width': width,
            'height': height
        } for index in range(str_to_int(first), str_to_int(last) + 1)]

    def _real_extract(self, url):
        """
        Builds the info dict for a single video page.

        Only the ID, the title and the media location are treated as
        mandatory; every other field is best-effort and is left as None
        (or omitted) when the page does not provide it.

        :param url: the video page URL (expected to match _VALID_URL)
        :return: the info dict describing the video
        """
        # one match gives both the video ID and the URL slug
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        display_id = mobj.group('name')

        # retrieve the page HTML
        webpage = self._download_webpage(url, video_id)

        # find the URL of the XML document detailing video download URLs
        info_url = self._html_search_regex(
            r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters')

        # download that XML
        xml = self._download_xml(info_url, video_id)

        def optional(pattern, name):
            # fatal=False: a missing optional field must not abort extraction
            return self._html_search_regex(pattern, webpage, name, fatal=False)

        view_count = optional(r'<br>Views <strong>([0-9]+)</strong>', 'view_count')
        comment_count = optional(r'<span id="comCount">([0-9]+)</span>', 'comment_count')

        try:
            average_rating = float(
                optional(r'Current Rating<br> <strong>(.*?)</strong>', 'average_rating'))
        except (TypeError, ValueError):
            # rating not found (None) or not numeric
            average_rating = None

        # split on commas and trim, so both "a,b" and "a, b" give clean names
        categories = optional(r'</div>\s*(.*?)\s*<br>', 'categories')
        if categories:
            categories = [name.strip() for name in categories.split(',') if name.strip()]
        else:
            categories = None

        # create dictionary of properties we know so far, or can find easily
        info = {
            'id': video_id,
            'title': self._html_search_regex(
                r'<div id="view_title"><h1>(.*?)</h1>', webpage, 'title'),
            'display_id': display_id,
            'thumbnails': self.__get_thumbnail_data(xml),
            'thumbnail': xml.findtext('startThumb') or None,
            'description': optional(r'name="description" value="(.*?)"', 'description'),
            'uploader_id': optional(r'name="username" value="(.*?)"', 'uploader_id'),
            'view_count': str_to_int(view_count) if view_count is not None else None,
            'average_rating': average_rating,
            'comment_count': str_to_int(comment_count) if comment_count is not None else None,
            'age_limit': 18,
            'categories': categories
        }

        # only set the canonical page URL when the page actually provides one
        webpage_url = optional(r'name="link" value="(.*?)"', 'webpage_url')
        if webpage_url:
            info['webpage_url'] = webpage_url

        # find and add the format; fall back to a guess when it is not given
        info['ext'] = xml.findtext('videoConfig/type') or 'flv'

        # work out the video URL(s)
        video_link = xml.findtext('videoLink')
        if video_link:
            # single format available
            info['url'] = video_link
        else:
            # multiple formats available
            # N.B. formats are already in ascending order of quality
            info['formats'] = [{
                'url': item.findtext('videoLink'),
                'resolution': item.findtext('res')  # 480p etc.
            } for item in xml.findall('quality/item') if item.findtext('videoLink')]

        return info