1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-01-20 12:22:53 +08:00

Merge remote-tracking branch 'origin/master'

This commit is contained in:
Philipp Hagemeister 2013-09-16 03:32:53 +02:00
commit ef66b0c6ef
25 changed files with 323 additions and 131 deletions

View File

@ -32,9 +32,9 @@ tests = [
# 83 # 83
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",
".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"), ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"),
# 82 - vflZK4ZYR 2013/08/23 # 82 - vflGNjMhJ 2013/09/12
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<",
"wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>.<"), ".>/?;}[<=+-(*&^%$#@!MNBVCXeASDFGHKLPOqUYTREWQ0987654321mnbvcxzasdfghjklpoiuytrIwZ"),
# 81 - vflLC8JvQ 2013/07/25 # 81 - vflLC8JvQ 2013/07/25
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.",
"C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"),

View File

@ -36,6 +36,7 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668 self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668
self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube']) self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])
def test_youtube_channel_matching(self): def test_youtube_channel_matching(self):
assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])

View File

@ -40,6 +40,7 @@ class TestDailymotionSubtitles(unittest.TestCase):
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792')
def test_allsubtitles(self): def test_allsubtitles(self):
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(len(subtitles.keys()), 5) self.assertEqual(len(subtitles.keys()), 5)
@ -54,6 +55,7 @@ class TestDailymotionSubtitles(unittest.TestCase):
self.assertTrue(len(subtitles.keys()) == 0) self.assertTrue(len(subtitles.keys()) == 0)
def test_nosubtitles(self): def test_nosubtitles(self):
self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv'
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(len(subtitles), 0) self.assertEqual(len(subtitles), 0)

View File

@ -8,7 +8,7 @@ import json
import os import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE, UstreamChannelIE, SoundcloudUserIE
from youtube_dl.utils import * from youtube_dl.utils import *
from helper import FakeYDL from helper import FakeYDL
@ -34,5 +34,21 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['title'], u'Vimeo Tributes') self.assertEqual(result['title'], u'Vimeo Tributes')
self.assertTrue(len(result['entries']) > 24) self.assertTrue(len(result['entries']) > 24)
def test_ustream_channel(self):
    """Extract a Ustream channel and check it yields the expected playlist."""
    extractor = UstreamChannelIE(FakeYDL())
    playlist = extractor.extract(
        'http://www.ustream.tv/channel/young-americans-for-liberty')
    self.assertIsPlaylist(playlist)
    self.assertEqual(playlist['id'], u'5124905')
    # Lower bound only: the test expects at least 11 entries.
    entry_count = len(playlist['entries'])
    self.assertTrue(entry_count >= 11)
def test_soundcloud_user(self):
    """Extract a SoundCloud user page and check it yields the expected playlist."""
    extractor = SoundcloudUserIE(FakeYDL())
    playlist = extractor.extract('https://soundcloud.com/the-concept-band')
    self.assertIsPlaylist(playlist)
    self.assertEqual(playlist['id'], u'9615865')
    # Lower bound only: the test expects at least 12 entries.
    entry_count = len(playlist['entries'])
    self.assertTrue(entry_count >= 12)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -11,13 +11,16 @@ import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
#from youtube_dl.utils import htmlentity_transform #from youtube_dl.utils import htmlentity_transform
from youtube_dl.utils import timeconvert from youtube_dl.utils import (
from youtube_dl.utils import sanitize_filename timeconvert,
from youtube_dl.utils import unescapeHTML sanitize_filename,
from youtube_dl.utils import orderedSet unescapeHTML,
from youtube_dl.utils import DateRange orderedSet,
from youtube_dl.utils import unified_strdate DateRange,
from youtube_dl.utils import find_xpath_attr unified_strdate,
find_xpath_attr,
get_meta_content,
)
if sys.version_info < (3, 0): if sys.version_info < (3, 0):
_compat_str = lambda b: b.decode('unicode-escape') _compat_str = lambda b: b.decode('unicode-escape')
@ -127,5 +130,16 @@ class TestUtil(unittest.TestCase):
self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1]) self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2]) self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
def test_meta_parser(self):
    """Check get_meta_content against a small <head> fixture.

    The fixture has one <meta> with name before content (double quotes,
    escaped ampersand) and one with content before name (single quotes,
    self-closing tag).
    """
    testhtml = u'''
<head>
<meta name="description" content="foo &amp; bar">
<meta content='Plato' name='author'/>
</head>
'''
    expectations = (
        ('description', u'foo & bar'),
        ('author', 'Plato'),
    )
    for meta_name, wanted in expectations:
        self.assertEqual(get_meta_content(meta_name, testhtml), wanted)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -41,6 +41,7 @@ class TestYoutubeSubtitles(unittest.TestCase):
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')
def test_youtube_allsubtitles(self): def test_youtube_allsubtitles(self):
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(len(subtitles.keys()), 13) self.assertEqual(len(subtitles.keys()), 13)
@ -66,6 +67,7 @@ class TestYoutubeSubtitles(unittest.TestCase):
self.assertTrue(subtitles['it'] is not None) self.assertTrue(subtitles['it'] is not None)
def test_youtube_nosubtitles(self): def test_youtube_nosubtitles(self):
self.url = 'sAjKT8FhjI8' self.url = 'sAjKT8FhjI8'
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(len(subtitles), 0) self.assertEqual(len(subtitles), 0)

View File

@ -74,6 +74,7 @@ class YoutubeDL(object):
writesubtitles: Write the video subtitles to a file writesubtitles: Write the video subtitles to a file
writeautomaticsub: Write the automatic subtitles to a file writeautomaticsub: Write the automatic subtitles to a file
allsubtitles: Downloads all the subtitles of the video allsubtitles: Downloads all the subtitles of the video
(requires writesubtitles or writeautomaticsub)
listsubtitles: Lists all available subtitles for the video listsubtitles: Lists all available subtitles for the video
subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
subtitleslangs: List of languages of the subtitles to download subtitleslangs: List of languages of the subtitles to download
@ -492,13 +493,14 @@ class YoutubeDL(object):
self.report_writedescription(descfn) self.report_writedescription(descfn)
with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
descfile.write(info_dict['description']) descfile.write(info_dict['description'])
except (KeyError, TypeError):
self.report_warning(u'There\'s no description to write.')
except (OSError, IOError): except (OSError, IOError):
self.report_error(u'Cannot write description file ' + descfn) self.report_error(u'Cannot write description file ' + descfn)
return return
subtitles_are_requested = any([self.params.get('writesubtitles', False), subtitles_are_requested = any([self.params.get('writesubtitles', False),
self.params.get('writeautomaticsub'), self.params.get('writeautomaticsub')])
self.params.get('allsubtitles', False)])
if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
# subtitles download errors are already managed as troubles in relevant IE # subtitles download errors are already managed as troubles in relevant IE

View File

@ -533,6 +533,11 @@ def _real_main(argv=None):
else: else:
date = DateRange(opts.dateafter, opts.datebefore) date = DateRange(opts.dateafter, opts.datebefore)
# --all-sub automatically sets --write-sub if --write-auto-sub is not given
# this was the old behaviour if only --all-sub was given.
if opts.allsubtitles and (opts.writeautomaticsub == False):
opts.writesubtitles = True
if sys.version_info < (3,): if sys.version_info < (3,):
# In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems) # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems)
if opts.outtmpl is not None: if opts.outtmpl is not None:

View File

@ -52,6 +52,7 @@ from .jeuxvideo import JeuxVideoIE
from .jukebox import JukeboxIE from .jukebox import JukeboxIE
from .justintv import JustinTVIE from .justintv import JustinTVIE
from .kankan import KankanIE from .kankan import KankanIE
from .kickstarter import KickStarterIE
from .keek import KeekIE from .keek import KeekIE
from .liveleak import LiveLeakIE from .liveleak import LiveLeakIE
from .livestream import LivestreamIE from .livestream import LivestreamIE
@ -81,7 +82,8 @@ from .sina import SinaIE
from .slashdot import SlashdotIE from .slashdot import SlashdotIE
from .slideshare import SlideshareIE from .slideshare import SlideshareIE
from .sohu import SohuIE from .sohu import SohuIE
from .soundcloud import SoundcloudIE, SoundcloudSetIE from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
from .southparkstudios import SouthParkStudiosIE
from .spiegel import SpiegelIE from .spiegel import SpiegelIE
from .stanfordoc import StanfordOpenClassroomIE from .stanfordoc import StanfordOpenClassroomIE
from .statigram import StatigramIE from .statigram import StatigramIE
@ -96,7 +98,7 @@ from .tudou import TudouIE
from .tumblr import TumblrIE from .tumblr import TumblrIE
from .tutv import TutvIE from .tutv import TutvIE
from .unistra import UnistraIE from .unistra import UnistraIE
from .ustream import UstreamIE from .ustream import UstreamIE, UstreamChannelIE
from .vbox7 import Vbox7IE from .vbox7 import Vbox7IE
from .veehd import VeeHDIE from .veehd import VeeHDIE
from .veoh import VeohIE from .veoh import VeohIE

View File

@ -46,6 +46,8 @@ class ArchiveOrgIE(InfoExtractor):
for fn,fdata in data['files'].items() for fn,fdata in data['files'].items()
if 'Video' in fdata['format']] if 'Video' in fdata['format']]
formats.sort(key=lambda fdata: fdata['file_size']) formats.sort(key=lambda fdata: fdata['file_size'])
for f in formats:
f['ext'] = determine_ext(f['url'])
info = { info = {
'_type': 'video', '_type': 'video',
@ -61,7 +63,6 @@ class ArchiveOrgIE(InfoExtractor):
info['thumbnail'] = thumbnail info['thumbnail'] = thumbnail
# TODO: Remove when #980 has been merged # TODO: Remove when #980 has been merged
info['url'] = formats[-1]['url'] info.update(formats[-1])
info['ext'] = determine_ext(formats[-1]['url'])
return info return info

View File

@ -1,3 +1,4 @@
# encoding: utf-8
import re import re
import xml.etree.ElementTree import xml.etree.ElementTree
@ -5,24 +6,29 @@ from .common import InfoExtractor
from ..utils import unified_strdate from ..utils import unified_strdate
class CanalplusIE(InfoExtractor): class CanalplusIE(InfoExtractor):
_VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P<id>\d+)' _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))'
_VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
IE_NAME = u'canalplus.fr' IE_NAME = u'canalplus.fr'
_TEST = { _TEST = {
u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861', u'url': u'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470',
u'file': u'889861.flv', u'file': u'922470.flv',
u'md5': u'590a888158b5f0d6832f84001fbf3e99',
u'info_dict': { u'info_dict': {
u'title': u'Le Petit Journal 20/06/13 - La guerre des drone', u'title': u'Zapping - 26/08/13',
u'upload_date': u'20130620', u'description': u'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013',
u'upload_date': u'20130826',
},
u'params': {
u'skip_download': True,
}, },
u'skip': u'Requires rtmpdump'
} }
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') video_id = mobj.group('id')
if video_id is None:
webpage = self._download_webpage(url, mobj.group('path'))
video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
info_url = self._VIDEO_INFO_TEMPLATE % video_id info_url = self._VIDEO_INFO_TEMPLATE % video_id
info_page = self._download_webpage(info_url,video_id, info_page = self._download_webpage(info_url,video_id,
u'Downloading video info') u'Downloading video info')
@ -43,4 +49,6 @@ class CanalplusIE(InfoExtractor):
'ext': 'flv', 'ext': 'flv',
'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
'thumbnail': media.find('IMAGES/GRAND').text, 'thumbnail': media.find('IMAGES/GRAND').text,
'description': infos.find('DESCRIPTION').text,
'view_count': int(infos.find('NB_VUES').text),
} }

View File

@ -54,6 +54,7 @@ class DreiSatIE(InfoExtractor):
'width': int(fe.find('./width').text), 'width': int(fe.find('./width').text),
'height': int(fe.find('./height').text), 'height': int(fe.find('./height').text),
'url': fe.find('./url').text, 'url': fe.find('./url').text,
'ext': determine_ext(fe.find('./url').text),
'filesize': int(fe.find('./filesize').text), 'filesize': int(fe.find('./filesize').text),
'video_bitrate': int(fe.find('./videoBitrate').text), 'video_bitrate': int(fe.find('./videoBitrate').text),
'3sat_qualityname': fe.find('./quality').text, '3sat_qualityname': fe.find('./quality').text,
@ -79,7 +80,6 @@ class DreiSatIE(InfoExtractor):
} }
# TODO: Remove when #980 has been merged # TODO: Remove when #980 has been merged
info['url'] = formats[-1]['url'] info.update(formats[-1])
info['ext'] = determine_ext(formats[-1]['url'])
return info return info

View File

@ -21,7 +21,7 @@ class FunnyOrDieIE(InfoExtractor):
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(r'type: "video/mp4", src: "(.*?)"', video_url = self._search_regex(r'type="video/mp4" src="(.*?)"',
webpage, u'video URL', flags=re.DOTALL) webpage, u'video URL', flags=re.DOTALL)
info = { info = {

View File

@ -14,7 +14,7 @@ class GameSpotIE(InfoExtractor):
u"file": u"6410818.mp4", u"file": u"6410818.mp4",
u"md5": u"b2a30deaa8654fcccd43713a6b6a4825", u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
u"info_dict": { u"info_dict": {
u"title": u"Arma III - Community Guide: SITREP I", u"title": u"Arma 3 - Community Guide: SITREP I",
u"upload_date": u"20130627", u"upload_date": u"20130627",
} }
} }

View File

@ -40,7 +40,8 @@ class GooglePlusIE(InfoExtractor):
self.report_extraction(video_id) self.report_extraction(video_id)
# Extract update date # Extract update date
upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', upload_date = self._html_search_regex(
['title="Timestamp">(.*?)</a>', r'<a.+?class="g-M.+?>(.+?)</a>'],
webpage, u'upload date', fatal=False) webpage, u'upload date', fatal=False)
if upload_date: if upload_date:
# Convert timestring to a format suitable for filename # Convert timestring to a format suitable for filename

View File

@ -0,0 +1,37 @@
import re
from .common import InfoExtractor
class KickStarterIE(InfoExtractor):
    """Extractor for the video referenced by a Kickstarter project page.

    The video URL is read from the page's ``data-video`` attribute and the
    title from the HTML ``<title>`` element.
    """

    # Require at least one digit so the captured video id is never empty.
    _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>\d+)/.*'
    _TEST = {
        u"url": u"https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location",
        u"file": u"1404461844.mp4",
        u"md5": u"c81addca81327ffa66c642b5d8b08cab",
        u"info_dict": {
            u"title": u"Intersection: The Story of Josh Grant by Kyle Cowling",
        },
    }

    def _real_extract(self, url):
        """Download the project page and return a one-element list of info dicts.

        Each dict carries ``id``, ``url``, ``title`` and ``ext``.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')
        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'data-video="(.*?)">',
            webpage_src, u'video URL')
        # The extension is guessed from the URL text; anything that does not
        # mention mp4 is treated as flv.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        page_title = self._html_search_regex(r"<title>(.*?)</title>",
            webpage_src, u'title')
        # rpartition() returns an empty head when the separator is absent, so
        # only drop the site suffix when it is actually present; otherwise
        # keep the whole page title instead of ending up with ''.
        head, suffix, _ = page_title.rpartition(u'\u2014 Kickstarter')
        video_title = (head if suffix else page_title).strip()

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': ext,
        }]

View File

@ -5,34 +5,27 @@ import socket
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
compat_http_client, compat_http_client,
compat_str,
compat_urllib_error, compat_urllib_error,
compat_urllib_request, compat_urllib_request,
unified_strdate,
ExtractorError,
) )
class MixcloudIE(InfoExtractor): class MixcloudIE(InfoExtractor):
_WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
IE_NAME = u'mixcloud' IE_NAME = u'mixcloud'
def report_download_json(self, file_id): _TEST = {
"""Report JSON download.""" u'url': u'http://www.mixcloud.com/dholbach/cryptkeeper/',
self.to_screen(u'Downloading json') u'file': u'dholbach-cryptkeeper.mp3',
u'info_dict': {
def get_urls(self, jsonData, fmt, bitrate='best'): u'title': u'Cryptkeeper',
"""Get urls from 'audio_formats' section in json""" u'description': u'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
try: u'uploader': u'Daniel Holbach',
bitrate_list = jsonData[fmt] u'uploader_id': u'dholbach',
if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: u'upload_date': u'20111115',
bitrate = max(bitrate_list) # select highest },
}
url_list = jsonData[fmt][bitrate]
except TypeError: # we have no bitrate info.
url_list = jsonData[fmt]
return url_list
def check_urls(self, url_list): def check_urls(self, url_list):
"""Returns 1st active url from list""" """Returns 1st active url from list"""
@ -45,71 +38,32 @@ class MixcloudIE(InfoExtractor):
return None return None
def _print_formats(self, formats):
print('Available formats:')
for fmt in formats.keys():
for b in formats[fmt]:
try:
ext = formats[fmt][b][0]
print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
except TypeError: # we have no bitrate info
ext = formats[fmt][0]
print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
break
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
# extract uploader & filename from url
uploader = mobj.group(1).decode('utf-8')
file_id = uploader + "-" + mobj.group(2).decode('utf-8')
# construct API request uploader = mobj.group(1)
file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' cloudcast_name = mobj.group(2)
# retrieve .json file with links to files track_id = '-'.join((uploader, cloudcast_name))
request = compat_urllib_request.Request(file_url) api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name)
try: webpage = self._download_webpage(url, track_id)
self.report_download_json(file_url) json_data = self._download_webpage(api_url, track_id,
jsonData = compat_urllib_request.urlopen(request).read() u'Downloading cloudcast info')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: info = json.loads(json_data)
raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
# parse JSON preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url')
json_data = json.loads(jsonData) song_url = preview_url.replace('/previews/', '/cloudcasts/originals/')
player_url = json_data['player_swf_url'] template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
formats = dict(json_data['audio_formats']) final_song_url = self.check_urls(template_url % i for i in range(30))
req_format = self._downloader.params.get('format', None) return {
'id': track_id,
if self._downloader.params.get('listformats', None): 'title': info['name'],
self._print_formats(formats) 'url': final_song_url,
return 'ext': 'mp3',
'description': info['description'],
if req_format is None or req_format == 'best': 'thumbnail': info['pictures'].get('extra_large'),
for format_param in formats.keys(): 'uploader': info['user']['name'],
url_list = self.get_urls(formats, format_param) 'uploader_id': info['user']['username'],
# check urls 'upload_date': unified_strdate(info['created_time']),
file_url = self.check_urls(url_list) 'view_count': info['play_count'],
if file_url is not None: }
break # got it!
else:
if req_format not in formats:
raise ExtractorError(u'Format is not available')
url_list = self.get_urls(formats, req_format)
file_url = self.check_urls(url_list)
format_param = req_format
return [{
'id': file_id.decode('utf-8'),
'url': file_url.decode('utf-8'),
'uploader': uploader.decode('utf-8'),
'upload_date': None,
'title': json_data['name'],
'ext': file_url.split('.')[-1].decode('utf-8'),
'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
'thumbnail': json_data['thumbnail_url'],
'description': json_data['description'],
'player_url': player_url.decode('utf-8'),
}]

View File

@ -1,10 +1,12 @@
import json import json
import re import re
import itertools
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
compat_str, compat_str,
compat_urlparse, compat_urlparse,
compat_urllib_parse,
ExtractorError, ExtractorError,
unified_strdate, unified_strdate,
@ -53,9 +55,10 @@ class SoundcloudIE(InfoExtractor):
def _resolv_url(cls, url): def _resolv_url(cls, url):
return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
def _extract_info_dict(self, info, full_title=None): def _extract_info_dict(self, info, full_title=None, quiet=False):
video_id = info['id'] video_id = info['id']
name = full_title or video_id name = full_title or video_id
if quiet == False:
self.report_extraction(name) self.report_extraction(name)
thumbnail = info['artwork_url'] thumbnail = info['artwork_url']
@ -198,3 +201,41 @@ class SoundcloudSetIE(SoundcloudIE):
'id': info['id'], 'id': info['id'],
'title': info['title'], 'title': info['title'],
} }
class SoundcloudUserIE(SoundcloudIE):
    """Extractor that turns a SoundCloud user page into a playlist of tracks."""

    # The dot in the host name is escaped so that only soundcloud.com matches
    # (an unescaped '.' would also accept e.g. "soundcloudXcom").
    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
    IE_NAME = u'soundcloud:user'

    # it's in tests/test_playlists.py
    _TEST = None

    def _real_extract(self, url):
        """Resolve the user, then page through their tracks.

        Returns a playlist dict whose ``id`` is the user id as a string,
        whose ``title`` is the username and whose ``entries`` are the info
        dicts built by ``_extract_info_dict`` for every track.
        """
        mobj = re.match(self._VALID_URL, url)
        uploader = mobj.group('user')

        # Normalise to the canonical profile URL before asking the resolver.
        url = 'http://soundcloud.com/%s/' % uploader
        resolv_url = self._resolv_url(url)
        user_json = self._download_webpage(resolv_url, uploader,
            u'Downloading user info')
        user = json.loads(user_json)

        # Assumes the tracks endpoint serves 50 items per request; a shorter
        # page is taken as the last one.
        page_size = 50
        tracks = []
        for page in itertools.count():
            data = compat_urllib_parse.urlencode({
                'offset': page * page_size,
                'client_id': self._CLIENT_ID,
            })
            tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data
            response = self._download_webpage(tracks_url, uploader,
                u'Downloading tracks page %s' % (page + 1))
            new_tracks = json.loads(response)
            # quiet=True suppresses the per-track extraction report.
            tracks.extend(self._extract_info_dict(track, quiet=True)
                          for track in new_tracks)
            if len(new_tracks) < page_size:
                break

        return {
            '_type': 'playlist',
            'id': compat_str(user['id']),
            'title': user['username'],
            'entries': tracks,
        }

View File

@ -0,0 +1,34 @@
import re
from .mtv import MTVIE, _media_xml_tag
class SouthParkStudiosIE(MTVIE):
    """Extractor for southparkstudios.com clips, built on top of MTVIE.

    The clip page is scraped for an ``mgid:`` identifier, which is then
    handed to ``_get_videos_info``.
    """

    IE_NAME = u'southparkstudios.com'
    _VALID_URL = r'https?://www\.southparkstudios\.com/clips/(?P<id>\d+)'

    _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'

    _TEST = {
        u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',
        u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4',
        u'info_dict': {
            u'title': u'Bat Daded',
            u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.',
        },
    }

    # Overwrite MTVIE properties we don't want
    _TESTS = []

    def _get_thumbnail_url(self, uri, itemdoc):
        """Return the ``url`` attribute of the item's media group thumbnail."""
        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
        return itemdoc.find(search_path).attrib['url']

    def _real_extract(self, url):
        """Fetch the clip page, locate its mgid and return the videos info."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # The dot is escaped so only the literal call "swfobject.embedSWF("
        # is accepted as the anchor for the mgid.
        mgid = self._search_regex(r'swfobject\.embedSWF\(".*?(mgid:.*?)"',
            webpage, u'mgid')
        return self._get_videos_info(mgid)

View File

@ -10,8 +10,7 @@ class SubtitlesInfoExtractor(InfoExtractor):
@property @property
def _have_to_download_any_subtitles(self): def _have_to_download_any_subtitles(self):
return any([self._downloader.params.get('writesubtitles', False), return any([self._downloader.params.get('writesubtitles', False),
self._downloader.params.get('writeautomaticsub'), self._downloader.params.get('writeautomaticsub')])
self._downloader.params.get('allsubtitles', False)])
def _list_available_subtitles(self, video_id, webpage=None): def _list_available_subtitles(self, video_id, webpage=None):
""" outputs the available subtitles for the video """ """ outputs the available subtitles for the video """
@ -34,7 +33,7 @@ class SubtitlesInfoExtractor(InfoExtractor):
available_subs_list = {} available_subs_list = {}
if self._downloader.params.get('writeautomaticsub', False): if self._downloader.params.get('writeautomaticsub', False):
available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage)) available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage))
if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): if self._downloader.params.get('writesubtitles', False):
available_subs_list.update(self._get_available_subtitles(video_id)) available_subs_list.update(self._get_available_subtitles(video_id))
if not available_subs_list: # error, it didn't get the available subtitles if not available_subs_list: # error, it didn't get the available subtitles

View File

@ -52,6 +52,7 @@ class TriluliluIE(InfoExtractor):
{ {
'format': fnode.text, 'format': fnode.text,
'url': video_url_template % fnode.text, 'url': video_url_template % fnode.text,
'ext': fnode.text.partition('-')[0]
} }
for fnode in format_doc.findall('./formats/format') for fnode in format_doc.findall('./formats/format')
@ -67,7 +68,6 @@ class TriluliluIE(InfoExtractor):
} }
# TODO: Remove when #980 has been merged # TODO: Remove when #980 has been merged
info['url'] = formats[-1]['url'] info.update(formats[-1])
info['ext'] = formats[-1]['format'].partition('-')[0]
return info return info

View File

@ -1,6 +1,11 @@
import json
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
compat_urlparse,
get_meta_content,
)
class UstreamIE(InfoExtractor): class UstreamIE(InfoExtractor):
@ -43,3 +48,25 @@ class UstreamIE(InfoExtractor):
'thumbnail': thumbnail, 'thumbnail': thumbnail,
} }
return info return info
class UstreamChannelIE(InfoExtractor):
    """Extractor that turns a Ustream channel page into a playlist."""

    _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)'
    IE_NAME = u'ustream:channel'

    def _real_extract(self, url):
        """Collect the channel's video ids and return them as a playlist.

        The channel id is read from the page's ``ustream:channel_id`` meta
        tag; the ids are gathered by following the ``nextUrl`` links of the
        socialstream JSON pages until one is empty.
        """
        m = re.match(self._VALID_URL, url)
        slug = m.group('slug')
        webpage = self._download_webpage(url, slug)
        channel_id = get_meta_content('ustream:channel_id', webpage)

        BASE = 'http://www.ustream.tv'
        next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id
        video_ids = []
        while next_url:
            page_url = compat_urlparse.urljoin(BASE, next_url)
            reply = json.loads(self._download_webpage(page_url, channel_id))
            # Stop at the attribute's closing quote. A greedy '.*' would run
            # on to the last '"' of the line and swallow neighbouring
            # attributes into the id.
            video_ids.extend(
                re.findall(r'data-content-id="(\d[^"]*)"', reply['data']))
            next_url = reply['nextUrl']

        urls = ['http://www.ustream.tv/recorded/' + vid for vid in video_ids]
        url_entries = [self.url_result(eurl, 'Ustream') for eurl in urls]
        return self.playlist_result(url_entries, channel_id)

View File

@ -27,7 +27,7 @@ class XHamsterIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') video_id = mobj.group('id')
mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id mrss_url = 'http://xhamster.com/movies/%s/.html?hd' % video_id
webpage = self._download_webpage(mrss_url, video_id) webpage = self._download_webpage(mrss_url, video_id)
mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)

View File

@ -139,7 +139,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
( (
(?:https?://)? # http(s):// (optional) (?:https?://)? # http(s):// (optional)
(?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains tube\.majestyc\.net/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls (?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID: (?: # the various things that can precede the ID:
(?:(?:v|embed|e)/) # v/ or embed/ or e/ (?:(?:v|embed|e)/) # v/ or embed/ or e/
@ -434,7 +435,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
elif len(s) == 83: elif len(s) == 83:
return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
elif len(s) == 82: elif len(s) == 82:
return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82] return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
elif len(s) == 81: elif len(s) == 81:
return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
elif len(s) == 80: elif len(s) == 80:

View File

@ -249,7 +249,17 @@ def htmlentity_transform(matchobj):
return (u'&%s;' % entity) return (u'&%s;' % entity)
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser base class that remembers the document it was fed.

    Subclasses implement the handle_* callbacks; callers push a whole
    document through the parser with loads().
    """

    def __init__(self):
        # This was spelled "__init", which Python name-mangles into a
        # private method that is never invoked on construction, so
        # self.html was left undefined until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store html on the instance, feed it to the parser and close it."""
        self.html = html
        self.feed(html)
        self.close()
class AttrParser(BaseHTMLParser):
"""Modified HTMLParser that isolates a tag with the specified attribute""" """Modified HTMLParser that isolates a tag with the specified attribute"""
def __init__(self, attribute, value): def __init__(self, attribute, value):
self.attribute = attribute self.attribute = attribute
@ -257,10 +267,9 @@ class AttrParser(compat_html_parser.HTMLParser):
self.result = None self.result = None
self.started = False self.started = False
self.depth = {} self.depth = {}
self.html = None
self.watch_startpos = False self.watch_startpos = False
self.error_count = 0 self.error_count = 0
compat_html_parser.HTMLParser.__init__(self) BaseHTMLParser.__init__(self)
def error(self, message): def error(self, message):
if self.error_count > 10 or self.started: if self.error_count > 10 or self.started:
@ -269,11 +278,6 @@ class AttrParser(compat_html_parser.HTMLParser):
self.error_count += 1 self.error_count += 1
self.goahead(1) self.goahead(1)
def loads(self, html):
self.html = html
self.feed(html)
self.close()
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
attrs = dict(attrs) attrs = dict(attrs)
if self.started: if self.started:
@ -334,6 +338,38 @@ def get_element_by_attribute(attribute, value, html):
pass pass
return parser.get_result() return parser.get_result()
class MetaParser(BaseHTMLParser):
    """
    HTMLParser subclass that records the content attribute of a <meta>
    tag whose name attribute equals the requested name.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        # Only <meta> start tags are of interest; every other tag is ignored.
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                # A later matching tag overwrites an earlier one.
                self.result = attributes.get('content')

    def get_result(self):
        """Return the recorded content value, or None if nothing matched."""
        return self.result
def get_meta_content(name, html):
    """
    Look up the meta tag whose name attribute is `name` in `html` and
    return its content attribute (None when no such tag was seen).
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was collected before the parse error.
        pass
    return meta_parser.get_result()
def clean_html(html): def clean_html(html):
"""Clean an HTML snippet into a readable string""" """Clean an HTML snippet into a readable string"""
@ -664,7 +700,16 @@ def unified_strdate(date_str):
date_str = date_str.replace(',',' ') date_str = date_str.replace(',',' ')
# %z (UTC offset) is only supported in python>=3.2 # %z (UTC offset) is only supported in python>=3.2
date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M'] format_expressions = [
'%d %B %Y',
'%B %d %Y',
'%b %d %Y',
'%Y-%m-%d',
'%d/%m/%Y',
'%Y/%m/%d %H:%M:%S',
'%d.%m.%Y %H:%M',
'%Y-%m-%dT%H:%M:%SZ',
]
for expression in format_expressions: for expression in format_expressions:
try: try:
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')