1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-01-02 21:02:58 +08:00

[hark] get the song info in JSON and extract more information.

This commit is contained in:
Jaime Marquínez Ferrándiz 2013-08-27 10:25:38 +02:00
parent 069d098f84
commit 2a7b4da9b2

View File

@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import re import re
import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import determine_ext from ..utils import determine_ext
@@ -12,24 +13,25 @@ class HarkIE(InfoExtractor):
u'file': u'mmbzyhkgny.mp3', u'file': u'mmbzyhkgny.mp3',
u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', u'md5': u'6783a58491b47b92c7c1af5a77d4cbee',
u'info_dict': { u'info_dict': {
u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ", u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013",
u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.',
u'duration': 11,
} }
} }
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1) video_id = mobj.group(1)
embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id) json_url = "http://www.hark.com/clips/%s.json" %(video_id)
webpage = self._download_webpage(embed_url, video_id) info_json = self._download_webpage(json_url, video_id)
info = json.loads(info_json)
final_url = self._search_regex(r'src="(.+?).mp3"', final_url = info['url']
webpage, 'video url')+'.mp3'
title = self._html_search_regex(r'<title>(.+?)</title>',
webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace(
'Sound Clip , Quote, MP3, and Ringtone - Hark','')
return {'id': video_id, return {'id': video_id,
'url' : final_url, 'url' : final_url,
'title': title, 'title': info['name'],
'ext': determine_ext(final_url), 'ext': determine_ext(final_url),
'description': info['description'],
'thumbnail': info['image_original'],
'duration': info['duration'],
} }