# coding: utf-8
#
# Tudou extractor, reconstructed from the whitespace-collapsed git patch
# 01639fdcfafa32805de90468d5e6f71dc194fa53 (alexoner, Mon, 7 Oct 2013):
#   "parse tudou url like
#    http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html"
# Target file: youtube_dl/extractor/tudou.py

import re
import json

from time import time
from random import randint

from .common import InfoExtractor


class TudouIE(InfoExtractor):
    """Extract videos from tudou.com listplay/programs/albumplay pages.

    listplay and programs pages are resolved through Tudou's own segment
    service (v2.tudou.com).  albumplay pages are expected to carry a Youku
    ``vcode``; those are resolved through Youku's playlist API instead.
    """

    # Capture groups used by _real_extract:
    #   2 - "listplay/<list>" or "programs/view" prefix (None for albums)
    #   4 - "albumplay/<album>" prefix (None for listplay/programs)
    #   5 - last path component (still carries a trailing ".html", if any)
    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(((?:listplay|programs)/(?:view|(.+?)))|((?:albumplay)(?:/[^/]*)))/(?:([^/]+)|([^/]+))(?:\.html)?'
    _TEST = {
        u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
        u'file': u'159448201.f4v',
        u'md5': u'140a49ed444bd22f93330985d8475fcb',
        u'info_dict': {
            u"title": u"卡马乔国足开大脚长传冲吊集锦"
        }
    }

    # Youku stream types in order of preference, paired with the file type
    # that has to be requested for them in the getFlvPath URL.
    _YOUKU_STREAM_TYPES = (('hd2', 'flv'), ('mp4', 'mp4'), ('flv', 'flv'))
    # Alphabet that Youku shuffles (seeded per video) to encode file ids.
    _YOUKU_ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890"

    def _url_for_id(self, id, quality=None):
        """Return the media URL of the Tudou segment *id*.

        *quality*, when given, is appended to the info request as the
        ``hd`` argument.
        """
        info_url = "http://v2.tudou.com/f?id=" + str(id)
        if quality:
            info_url += '&hd' + quality
        webpage = self._download_webpage(info_url, id, "Opening the info webpage")
        # NOTE(review): the pasted patch shows this pattern as '>(.+?)', which
        # can only ever capture a single character.  The closing '</f>'
        # delimiter appears to have been stripped as markup; confirm against
        # an actual v2.tudou.com response.
        return self._html_search_regex('>(.+?)</f>', webpage, 'video url')

    def get_page(self, url, video_id=None):
        """Download *url* and return its decoded text.

        The request goes through the extractor's own ``_download_webpage``
        instead of a hand-rolled ``urllib2`` call, which only exists on
        Python 2.  *video_id* is only used to label the download in the
        output; the URL itself is used when it is omitted.
        """
        label = url if video_id is None else video_id
        return self._download_webpage(url, label)

    def isyouku(self, url):
        """Look for a Youku video code in the page at *url*.

        Returns a ``(vcode, title)`` tuple, or None when the page carries no
        ``vcode``.  When no title can be found the vcode is used as title so
        that the result is still usable.
        """
        webpage = self.get_page(url)
        vcode_match = re.search(r'vcode\s*[:=]\s*\'([^\']+)\'', webpage)
        if vcode_match is None:
            return None
        vcode = vcode_match.group(1)
        title_match = re.search(r'kw\s*[:=]\s*[\'\"]([^\']+?)[\'\"]', webpage)
        title = vcode if title_match is None else title_match.group(1)
        return (vcode, title)

    def downloadYouku_by_id(self, videoId, title):
        """Build one info dict per segment of the Youku video *videoId*.

        Returns a list of dicts with the keys ``id``, ``url``, ``ext``,
        ``title`` and ``thumbnail``; the list is empty when Youku reports no
        segments.
        """
        info = self.get_youkuinfo(videoId)
        parts = self.find_video(info, None)
        if not parts:
            self.to_screen(u'%s: no Youku segments found' % videoId)
            return []

        # Every URL carries the file type as ".../st/<type>/...".
        ext = str(re.search(r'/st/([^/]+)/', parts[0][0]).group(1))
        return [{
            # A bare segment index would be shared by every video, so the
            # Youku video code is made part of the id.
            'id': '%s_%d' % (videoId, index),
            'url': part_url,
            'ext': ext,
            'title': title,
            'thumbnail': None,
        } for index, (part_url, _size) in enumerate(parts)]

    def get_youkuinfo(self, videoId):
        """Fetch and decode Youku's playlist JSON for *videoId*."""
        playlist_url = ('http://v.youku.com/player/getPlayList/VideoIDS/' +
                        videoId + '/timezone/+08/version/5/source/out/Sc/2')
        return json.loads(self.get_page(playlist_url, videoId))

    @classmethod
    def _youku_alphabet(cls, seed):
        """Return the Youku alphabet shuffled according to *seed*."""
        source = list(cls._YOUKU_ALPHABET)
        mixed = []
        while source:
            seed = (seed * 211 + 30031) & 0xFFFF
            # seed < 2**16, so the index always falls inside `source`.
            mixed.append(source.pop(seed * len(source) >> 16))
        return ''.join(mixed)

    def find_video(self, info, stream_type=None):
        """Compute the segment URLs described by the playlist *info*.

        *stream_type* is one of 'hd2', 'mp4' or 'flv'; when it is omitted the
        first of these (in that order) that the video offers is used.

        Returns a list of ``(url, size)`` tuples, one per segment.
        Raises NotImplementedError when the video offers none of the known
        stream types and ValueError when an unknown *stream_type* is given.
        """
        data = info['data'][0]
        segs = data['segs']
        file_types = dict(self._YOUKU_STREAM_TYPES)

        if not stream_type:
            for candidate, _file_type in self._YOUKU_STREAM_TYPES:
                if candidate in segs:
                    stream_type = candidate
                    break
            else:
                raise NotImplementedError(
                    'no supported Youku stream type among %r' % (sorted(segs),))
        if stream_type not in file_types:
            raise ValueError(
                'unsupported Youku stream type: %r' % (stream_type,))
        file_type = file_types[stream_type]

        # The file id is a list of indexes into the shuffled alphabet.
        mixed = self._youku_alphabet(data['seed'])
        ids = data['streamfileids'][stream_type].split('*')[:-1]
        vid = ''.join(mixed[int(i)] for i in ids)

        sid = '%s%s%s' % (int(time() * 1000), randint(1000, 1999), randint(1000, 9999))

        urls = []
        for seg in segs[stream_type]:
            # The two-digit hex segment number replaces characters 8-9 of
            # the file id.
            no = '%02x' % int(seg['no'])
            url = 'http://f.youku.com/player/getFlvPath/sid/%s_%s/st/%s/fileid/%s%s%s?K=%s&ts=%s' % (
                sid, no, file_type, vid[:8], no.upper(), vid[10:],
                seg['k'], seg['seconds'])
            urls.append((url, int(seg['size'])))
        return urls

    def _real_extract(self, url):
        """Return the list of info dicts for *url*, one per video part.

        Returns None for an albumplay page that carries no Youku video code.
        """
        mobj = re.match(self._VALID_URL, url)

        if mobj.group(2) is None:
            # albumplay URL: the video is expected to be served by Youku.
            youku = self.isyouku(url)
            if youku is None:
                self.to_screen(u'Not transferring to Youku')
                return None
            vcode, title = youku
            return self.downloadYouku_by_id(vcode, title)

        # Group 2 is only the "listplay/<list>" prefix; the video itself is
        # named by the last path component.
        video_id = re.sub(r'\.html$', '', mobj.group(5))
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(",kw:\"(.+)\"", webpage)
        if title_match is not None:
            title = title_match.group(1)
        else:
            # Let the framework report a missing title.
            title = self._search_regex(",kw: \'(.+)\'", webpage, 'title')

        thumbnail_match = (re.search(",pic: \'(.+?)\'", webpage) or
                           re.search(",pic:\"(.+?)\"", webpage))
        thumbnail_url = None
        if thumbnail_match is not None:
            thumbnail_url = thumbnail_match.group(1)

        segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
        segments = json.loads(segs_json)
        # It looks like the keys are the arguments that have to be passed as
        # the hd field in the request url, we pick the higher
        quality = max(segments)
        parts = segments[quality]
        len_parts = len(parts)
        if len_parts > 1:
            self.to_screen(u'%s: found %s parts' % (video_id, len_parts))

        result = []
        for part in parts:
            part_id = part['k']
            final_url = self._url_for_id(part_id, quality)
            ext = (final_url.split('?')[0]).split('.')[-1]
            result.append({
                'id': part_id,
                'url': final_url,
                'ext': ext,
                'title': title,
                'thumbnail': thumbnail_url,
            })
        return result