mirror of
https://github.com/l1ving/youtube-dl
synced 2025-01-05 04:52:57 +08:00
Merge remote-tracking branch 'origin/master'
This commit is contained in:
commit
ef66b0c6ef
@ -32,9 +32,9 @@ tests = [
|
|||||||
# 83
|
# 83
|
||||||
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",
|
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",
|
||||||
".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"),
|
".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"),
|
||||||
# 82 - vflZK4ZYR 2013/08/23
|
# 82 - vflGNjMhJ 2013/09/12
|
||||||
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<",
|
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<",
|
||||||
"wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>.<"),
|
".>/?;}[<=+-(*&^%$#@!MNBVCXeASDFGHKLPOqUYTREWQ0987654321mnbvcxzasdfghjklpoiuytrIwZ"),
|
||||||
# 81 - vflLC8JvQ 2013/07/25
|
# 81 - vflLC8JvQ 2013/07/25
|
||||||
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.",
|
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.",
|
||||||
"C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"),
|
"C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"),
|
||||||
|
@ -36,6 +36,7 @@ class TestAllURLsMatching(unittest.TestCase):
|
|||||||
self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668
|
self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668
|
||||||
self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
|
self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
|
||||||
self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
|
self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
|
||||||
|
self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])
|
||||||
|
|
||||||
def test_youtube_channel_matching(self):
|
def test_youtube_channel_matching(self):
|
||||||
assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
|
assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
|
||||||
|
@ -40,6 +40,7 @@ class TestDailymotionSubtitles(unittest.TestCase):
|
|||||||
subtitles = self.getSubtitles()
|
subtitles = self.getSubtitles()
|
||||||
self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792')
|
self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792')
|
||||||
def test_allsubtitles(self):
|
def test_allsubtitles(self):
|
||||||
|
self.DL.params['writesubtitles'] = True
|
||||||
self.DL.params['allsubtitles'] = True
|
self.DL.params['allsubtitles'] = True
|
||||||
subtitles = self.getSubtitles()
|
subtitles = self.getSubtitles()
|
||||||
self.assertEqual(len(subtitles.keys()), 5)
|
self.assertEqual(len(subtitles.keys()), 5)
|
||||||
@ -54,6 +55,7 @@ class TestDailymotionSubtitles(unittest.TestCase):
|
|||||||
self.assertTrue(len(subtitles.keys()) == 0)
|
self.assertTrue(len(subtitles.keys()) == 0)
|
||||||
def test_nosubtitles(self):
|
def test_nosubtitles(self):
|
||||||
self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv'
|
self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv'
|
||||||
|
self.DL.params['writesubtitles'] = True
|
||||||
self.DL.params['allsubtitles'] = True
|
self.DL.params['allsubtitles'] = True
|
||||||
subtitles = self.getSubtitles()
|
subtitles = self.getSubtitles()
|
||||||
self.assertEqual(len(subtitles), 0)
|
self.assertEqual(len(subtitles), 0)
|
||||||
|
@ -8,7 +8,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE
|
from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE, UstreamChannelIE, SoundcloudUserIE
|
||||||
from youtube_dl.utils import *
|
from youtube_dl.utils import *
|
||||||
|
|
||||||
from helper import FakeYDL
|
from helper import FakeYDL
|
||||||
@ -34,5 +34,21 @@ class TestPlaylists(unittest.TestCase):
|
|||||||
self.assertEqual(result['title'], u'Vimeo Tributes')
|
self.assertEqual(result['title'], u'Vimeo Tributes')
|
||||||
self.assertTrue(len(result['entries']) > 24)
|
self.assertTrue(len(result['entries']) > 24)
|
||||||
|
|
||||||
|
def test_ustream_channel(self):
|
||||||
|
dl = FakeYDL()
|
||||||
|
ie = UstreamChannelIE(dl)
|
||||||
|
result = ie.extract('http://www.ustream.tv/channel/young-americans-for-liberty')
|
||||||
|
self.assertIsPlaylist(result)
|
||||||
|
self.assertEqual(result['id'], u'5124905')
|
||||||
|
self.assertTrue(len(result['entries']) >= 11)
|
||||||
|
|
||||||
|
def test_soundcloud_user(self):
|
||||||
|
dl = FakeYDL()
|
||||||
|
ie = SoundcloudUserIE(dl)
|
||||||
|
result = ie.extract('https://soundcloud.com/the-concept-band')
|
||||||
|
self.assertIsPlaylist(result)
|
||||||
|
self.assertEqual(result['id'], u'9615865')
|
||||||
|
self.assertTrue(len(result['entries']) >= 12)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
@ -11,13 +11,16 @@ import os
|
|||||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
#from youtube_dl.utils import htmlentity_transform
|
#from youtube_dl.utils import htmlentity_transform
|
||||||
from youtube_dl.utils import timeconvert
|
from youtube_dl.utils import (
|
||||||
from youtube_dl.utils import sanitize_filename
|
timeconvert,
|
||||||
from youtube_dl.utils import unescapeHTML
|
sanitize_filename,
|
||||||
from youtube_dl.utils import orderedSet
|
unescapeHTML,
|
||||||
from youtube_dl.utils import DateRange
|
orderedSet,
|
||||||
from youtube_dl.utils import unified_strdate
|
DateRange,
|
||||||
from youtube_dl.utils import find_xpath_attr
|
unified_strdate,
|
||||||
|
find_xpath_attr,
|
||||||
|
get_meta_content,
|
||||||
|
)
|
||||||
|
|
||||||
if sys.version_info < (3, 0):
|
if sys.version_info < (3, 0):
|
||||||
_compat_str = lambda b: b.decode('unicode-escape')
|
_compat_str = lambda b: b.decode('unicode-escape')
|
||||||
@ -127,5 +130,16 @@ class TestUtil(unittest.TestCase):
|
|||||||
self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
|
self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
|
||||||
self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
|
self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
|
||||||
|
|
||||||
|
def test_meta_parser(self):
|
||||||
|
testhtml = u'''
|
||||||
|
<head>
|
||||||
|
<meta name="description" content="foo & bar">
|
||||||
|
<meta content='Plato' name='author'/>
|
||||||
|
</head>
|
||||||
|
'''
|
||||||
|
get_meta = lambda name: get_meta_content(name, testhtml)
|
||||||
|
self.assertEqual(get_meta('description'), u'foo & bar')
|
||||||
|
self.assertEqual(get_meta('author'), 'Plato')
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
@ -41,6 +41,7 @@ class TestYoutubeSubtitles(unittest.TestCase):
|
|||||||
subtitles = self.getSubtitles()
|
subtitles = self.getSubtitles()
|
||||||
self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')
|
self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')
|
||||||
def test_youtube_allsubtitles(self):
|
def test_youtube_allsubtitles(self):
|
||||||
|
self.DL.params['writesubtitles'] = True
|
||||||
self.DL.params['allsubtitles'] = True
|
self.DL.params['allsubtitles'] = True
|
||||||
subtitles = self.getSubtitles()
|
subtitles = self.getSubtitles()
|
||||||
self.assertEqual(len(subtitles.keys()), 13)
|
self.assertEqual(len(subtitles.keys()), 13)
|
||||||
@ -66,6 +67,7 @@ class TestYoutubeSubtitles(unittest.TestCase):
|
|||||||
self.assertTrue(subtitles['it'] is not None)
|
self.assertTrue(subtitles['it'] is not None)
|
||||||
def test_youtube_nosubtitles(self):
|
def test_youtube_nosubtitles(self):
|
||||||
self.url = 'sAjKT8FhjI8'
|
self.url = 'sAjKT8FhjI8'
|
||||||
|
self.DL.params['writesubtitles'] = True
|
||||||
self.DL.params['allsubtitles'] = True
|
self.DL.params['allsubtitles'] = True
|
||||||
subtitles = self.getSubtitles()
|
subtitles = self.getSubtitles()
|
||||||
self.assertEqual(len(subtitles), 0)
|
self.assertEqual(len(subtitles), 0)
|
||||||
|
@ -74,6 +74,7 @@ class YoutubeDL(object):
|
|||||||
writesubtitles: Write the video subtitles to a file
|
writesubtitles: Write the video subtitles to a file
|
||||||
writeautomaticsub: Write the automatic subtitles to a file
|
writeautomaticsub: Write the automatic subtitles to a file
|
||||||
allsubtitles: Downloads all the subtitles of the video
|
allsubtitles: Downloads all the subtitles of the video
|
||||||
|
(requires writesubtitles or writeautomaticsub)
|
||||||
listsubtitles: Lists all available subtitles for the video
|
listsubtitles: Lists all available subtitles for the video
|
||||||
subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
|
subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
|
||||||
subtitleslangs: List of languages of the subtitles to download
|
subtitleslangs: List of languages of the subtitles to download
|
||||||
@ -492,13 +493,14 @@ class YoutubeDL(object):
|
|||||||
self.report_writedescription(descfn)
|
self.report_writedescription(descfn)
|
||||||
with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
|
with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
|
||||||
descfile.write(info_dict['description'])
|
descfile.write(info_dict['description'])
|
||||||
|
except (KeyError, TypeError):
|
||||||
|
self.report_warning(u'There\'s no description to write.')
|
||||||
except (OSError, IOError):
|
except (OSError, IOError):
|
||||||
self.report_error(u'Cannot write description file ' + descfn)
|
self.report_error(u'Cannot write description file ' + descfn)
|
||||||
return
|
return
|
||||||
|
|
||||||
subtitles_are_requested = any([self.params.get('writesubtitles', False),
|
subtitles_are_requested = any([self.params.get('writesubtitles', False),
|
||||||
self.params.get('writeautomaticsub'),
|
self.params.get('writeautomaticsub')])
|
||||||
self.params.get('allsubtitles', False)])
|
|
||||||
|
|
||||||
if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
|
if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
|
||||||
# subtitles download errors are already managed as troubles in relevant IE
|
# subtitles download errors are already managed as troubles in relevant IE
|
||||||
|
@ -533,6 +533,11 @@ def _real_main(argv=None):
|
|||||||
else:
|
else:
|
||||||
date = DateRange(opts.dateafter, opts.datebefore)
|
date = DateRange(opts.dateafter, opts.datebefore)
|
||||||
|
|
||||||
|
# --all-sub automatically sets --write-sub if --write-auto-sub is not given
|
||||||
|
# this was the old behaviour if only --all-sub was given.
|
||||||
|
if opts.allsubtitles and (opts.writeautomaticsub == False):
|
||||||
|
opts.writesubtitles = True
|
||||||
|
|
||||||
if sys.version_info < (3,):
|
if sys.version_info < (3,):
|
||||||
# In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems)
|
# In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems)
|
||||||
if opts.outtmpl is not None:
|
if opts.outtmpl is not None:
|
||||||
|
@ -52,6 +52,7 @@ from .jeuxvideo import JeuxVideoIE
|
|||||||
from .jukebox import JukeboxIE
|
from .jukebox import JukeboxIE
|
||||||
from .justintv import JustinTVIE
|
from .justintv import JustinTVIE
|
||||||
from .kankan import KankanIE
|
from .kankan import KankanIE
|
||||||
|
from .kickstarter import KickStarterIE
|
||||||
from .keek import KeekIE
|
from .keek import KeekIE
|
||||||
from .liveleak import LiveLeakIE
|
from .liveleak import LiveLeakIE
|
||||||
from .livestream import LivestreamIE
|
from .livestream import LivestreamIE
|
||||||
@ -81,7 +82,8 @@ from .sina import SinaIE
|
|||||||
from .slashdot import SlashdotIE
|
from .slashdot import SlashdotIE
|
||||||
from .slideshare import SlideshareIE
|
from .slideshare import SlideshareIE
|
||||||
from .sohu import SohuIE
|
from .sohu import SohuIE
|
||||||
from .soundcloud import SoundcloudIE, SoundcloudSetIE
|
from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
|
||||||
|
from .southparkstudios import SouthParkStudiosIE
|
||||||
from .spiegel import SpiegelIE
|
from .spiegel import SpiegelIE
|
||||||
from .stanfordoc import StanfordOpenClassroomIE
|
from .stanfordoc import StanfordOpenClassroomIE
|
||||||
from .statigram import StatigramIE
|
from .statigram import StatigramIE
|
||||||
@ -96,7 +98,7 @@ from .tudou import TudouIE
|
|||||||
from .tumblr import TumblrIE
|
from .tumblr import TumblrIE
|
||||||
from .tutv import TutvIE
|
from .tutv import TutvIE
|
||||||
from .unistra import UnistraIE
|
from .unistra import UnistraIE
|
||||||
from .ustream import UstreamIE
|
from .ustream import UstreamIE, UstreamChannelIE
|
||||||
from .vbox7 import Vbox7IE
|
from .vbox7 import Vbox7IE
|
||||||
from .veehd import VeeHDIE
|
from .veehd import VeeHDIE
|
||||||
from .veoh import VeohIE
|
from .veoh import VeohIE
|
||||||
|
@ -46,6 +46,8 @@ class ArchiveOrgIE(InfoExtractor):
|
|||||||
for fn,fdata in data['files'].items()
|
for fn,fdata in data['files'].items()
|
||||||
if 'Video' in fdata['format']]
|
if 'Video' in fdata['format']]
|
||||||
formats.sort(key=lambda fdata: fdata['file_size'])
|
formats.sort(key=lambda fdata: fdata['file_size'])
|
||||||
|
for f in formats:
|
||||||
|
f['ext'] = determine_ext(f['url'])
|
||||||
|
|
||||||
info = {
|
info = {
|
||||||
'_type': 'video',
|
'_type': 'video',
|
||||||
@ -61,7 +63,6 @@ class ArchiveOrgIE(InfoExtractor):
|
|||||||
info['thumbnail'] = thumbnail
|
info['thumbnail'] = thumbnail
|
||||||
|
|
||||||
# TODO: Remove when #980 has been merged
|
# TODO: Remove when #980 has been merged
|
||||||
info['url'] = formats[-1]['url']
|
info.update(formats[-1])
|
||||||
info['ext'] = determine_ext(formats[-1]['url'])
|
|
||||||
|
|
||||||
return info
|
return info
|
@ -1,3 +1,4 @@
|
|||||||
|
# encoding: utf-8
|
||||||
import re
|
import re
|
||||||
import xml.etree.ElementTree
|
import xml.etree.ElementTree
|
||||||
|
|
||||||
@ -5,24 +6,29 @@ from .common import InfoExtractor
|
|||||||
from ..utils import unified_strdate
|
from ..utils import unified_strdate
|
||||||
|
|
||||||
class CanalplusIE(InfoExtractor):
|
class CanalplusIE(InfoExtractor):
|
||||||
_VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P<id>\d+)'
|
_VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))'
|
||||||
_VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
|
_VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
|
||||||
IE_NAME = u'canalplus.fr'
|
IE_NAME = u'canalplus.fr'
|
||||||
|
|
||||||
_TEST = {
|
_TEST = {
|
||||||
u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861',
|
u'url': u'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470',
|
||||||
u'file': u'889861.flv',
|
u'file': u'922470.flv',
|
||||||
u'md5': u'590a888158b5f0d6832f84001fbf3e99',
|
|
||||||
u'info_dict': {
|
u'info_dict': {
|
||||||
u'title': u'Le Petit Journal 20/06/13 - La guerre des drone',
|
u'title': u'Zapping - 26/08/13',
|
||||||
u'upload_date': u'20130620',
|
u'description': u'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013',
|
||||||
|
u'upload_date': u'20130826',
|
||||||
|
},
|
||||||
|
u'params': {
|
||||||
|
u'skip_download': True,
|
||||||
},
|
},
|
||||||
u'skip': u'Requires rtmpdump'
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
mobj = re.match(self._VALID_URL, url)
|
mobj = re.match(self._VALID_URL, url)
|
||||||
video_id = mobj.group('id')
|
video_id = mobj.group('id')
|
||||||
|
if video_id is None:
|
||||||
|
webpage = self._download_webpage(url, mobj.group('path'))
|
||||||
|
video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
|
||||||
info_url = self._VIDEO_INFO_TEMPLATE % video_id
|
info_url = self._VIDEO_INFO_TEMPLATE % video_id
|
||||||
info_page = self._download_webpage(info_url,video_id,
|
info_page = self._download_webpage(info_url,video_id,
|
||||||
u'Downloading video info')
|
u'Downloading video info')
|
||||||
@ -43,4 +49,6 @@ class CanalplusIE(InfoExtractor):
|
|||||||
'ext': 'flv',
|
'ext': 'flv',
|
||||||
'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
|
'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
|
||||||
'thumbnail': media.find('IMAGES/GRAND').text,
|
'thumbnail': media.find('IMAGES/GRAND').text,
|
||||||
|
'description': infos.find('DESCRIPTION').text,
|
||||||
|
'view_count': int(infos.find('NB_VUES').text),
|
||||||
}
|
}
|
||||||
|
@ -54,6 +54,7 @@ class DreiSatIE(InfoExtractor):
|
|||||||
'width': int(fe.find('./width').text),
|
'width': int(fe.find('./width').text),
|
||||||
'height': int(fe.find('./height').text),
|
'height': int(fe.find('./height').text),
|
||||||
'url': fe.find('./url').text,
|
'url': fe.find('./url').text,
|
||||||
|
'ext': determine_ext(fe.find('./url').text),
|
||||||
'filesize': int(fe.find('./filesize').text),
|
'filesize': int(fe.find('./filesize').text),
|
||||||
'video_bitrate': int(fe.find('./videoBitrate').text),
|
'video_bitrate': int(fe.find('./videoBitrate').text),
|
||||||
'3sat_qualityname': fe.find('./quality').text,
|
'3sat_qualityname': fe.find('./quality').text,
|
||||||
@ -79,7 +80,6 @@ class DreiSatIE(InfoExtractor):
|
|||||||
}
|
}
|
||||||
|
|
||||||
# TODO: Remove when #980 has been merged
|
# TODO: Remove when #980 has been merged
|
||||||
info['url'] = formats[-1]['url']
|
info.update(formats[-1])
|
||||||
info['ext'] = determine_ext(formats[-1]['url'])
|
|
||||||
|
|
||||||
return info
|
return info
|
@ -21,7 +21,7 @@ class FunnyOrDieIE(InfoExtractor):
|
|||||||
video_id = mobj.group('id')
|
video_id = mobj.group('id')
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
video_url = self._search_regex(r'type: "video/mp4", src: "(.*?)"',
|
video_url = self._search_regex(r'type="video/mp4" src="(.*?)"',
|
||||||
webpage, u'video URL', flags=re.DOTALL)
|
webpage, u'video URL', flags=re.DOTALL)
|
||||||
|
|
||||||
info = {
|
info = {
|
||||||
|
@ -14,7 +14,7 @@ class GameSpotIE(InfoExtractor):
|
|||||||
u"file": u"6410818.mp4",
|
u"file": u"6410818.mp4",
|
||||||
u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
|
u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
|
||||||
u"info_dict": {
|
u"info_dict": {
|
||||||
u"title": u"Arma III - Community Guide: SITREP I",
|
u"title": u"Arma 3 - Community Guide: SITREP I",
|
||||||
u"upload_date": u"20130627",
|
u"upload_date": u"20130627",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -40,7 +40,8 @@ class GooglePlusIE(InfoExtractor):
|
|||||||
self.report_extraction(video_id)
|
self.report_extraction(video_id)
|
||||||
|
|
||||||
# Extract update date
|
# Extract update date
|
||||||
upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
|
upload_date = self._html_search_regex(
|
||||||
|
['title="Timestamp">(.*?)</a>', r'<a.+?class="g-M.+?>(.+?)</a>'],
|
||||||
webpage, u'upload date', fatal=False)
|
webpage, u'upload date', fatal=False)
|
||||||
if upload_date:
|
if upload_date:
|
||||||
# Convert timestring to a format suitable for filename
|
# Convert timestring to a format suitable for filename
|
||||||
|
37
youtube_dl/extractor/kickstarter.py
Normal file
37
youtube_dl/extractor/kickstarter.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
from .common import InfoExtractor
|
||||||
|
|
||||||
|
|
||||||
|
class KickStarterIE(InfoExtractor):
|
||||||
|
_VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>\d*)/.*'
|
||||||
|
_TEST = {
|
||||||
|
u"url": u"https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location",
|
||||||
|
u"file": u"1404461844.mp4",
|
||||||
|
u"md5": u"c81addca81327ffa66c642b5d8b08cab",
|
||||||
|
u"info_dict": {
|
||||||
|
u"title": u"Intersection: The Story of Josh Grant by Kyle Cowling",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
def _real_extract(self, url):
|
||||||
|
m = re.match(self._VALID_URL, url)
|
||||||
|
video_id = m.group('id')
|
||||||
|
webpage_src = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
|
video_url = self._search_regex(r'data-video="(.*?)">',
|
||||||
|
webpage_src, u'video URL')
|
||||||
|
if 'mp4' in video_url:
|
||||||
|
ext = 'mp4'
|
||||||
|
else:
|
||||||
|
ext = 'flv'
|
||||||
|
video_title = self._html_search_regex(r"<title>(.*?)</title>",
|
||||||
|
webpage_src, u'title').rpartition(u'\u2014 Kickstarter')[0].strip()
|
||||||
|
|
||||||
|
results = [{
|
||||||
|
'id': video_id,
|
||||||
|
'url': video_url,
|
||||||
|
'title': video_title,
|
||||||
|
'ext': ext,
|
||||||
|
}]
|
||||||
|
return results
|
@ -5,34 +5,27 @@ import socket
|
|||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
compat_http_client,
|
compat_http_client,
|
||||||
compat_str,
|
|
||||||
compat_urllib_error,
|
compat_urllib_error,
|
||||||
compat_urllib_request,
|
compat_urllib_request,
|
||||||
|
unified_strdate,
|
||||||
ExtractorError,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class MixcloudIE(InfoExtractor):
|
class MixcloudIE(InfoExtractor):
|
||||||
_WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
|
|
||||||
_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
|
_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
|
||||||
IE_NAME = u'mixcloud'
|
IE_NAME = u'mixcloud'
|
||||||
|
|
||||||
def report_download_json(self, file_id):
|
_TEST = {
|
||||||
"""Report JSON download."""
|
u'url': u'http://www.mixcloud.com/dholbach/cryptkeeper/',
|
||||||
self.to_screen(u'Downloading json')
|
u'file': u'dholbach-cryptkeeper.mp3',
|
||||||
|
u'info_dict': {
|
||||||
def get_urls(self, jsonData, fmt, bitrate='best'):
|
u'title': u'Cryptkeeper',
|
||||||
"""Get urls from 'audio_formats' section in json"""
|
u'description': u'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
|
||||||
try:
|
u'uploader': u'Daniel Holbach',
|
||||||
bitrate_list = jsonData[fmt]
|
u'uploader_id': u'dholbach',
|
||||||
if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
|
u'upload_date': u'20111115',
|
||||||
bitrate = max(bitrate_list) # select highest
|
},
|
||||||
|
}
|
||||||
url_list = jsonData[fmt][bitrate]
|
|
||||||
except TypeError: # we have no bitrate info.
|
|
||||||
url_list = jsonData[fmt]
|
|
||||||
return url_list
|
|
||||||
|
|
||||||
def check_urls(self, url_list):
|
def check_urls(self, url_list):
|
||||||
"""Returns 1st active url from list"""
|
"""Returns 1st active url from list"""
|
||||||
@ -45,71 +38,32 @@ class MixcloudIE(InfoExtractor):
|
|||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _print_formats(self, formats):
|
|
||||||
print('Available formats:')
|
|
||||||
for fmt in formats.keys():
|
|
||||||
for b in formats[fmt]:
|
|
||||||
try:
|
|
||||||
ext = formats[fmt][b][0]
|
|
||||||
print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
|
|
||||||
except TypeError: # we have no bitrate info
|
|
||||||
ext = formats[fmt][0]
|
|
||||||
print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
|
|
||||||
break
|
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
mobj = re.match(self._VALID_URL, url)
|
mobj = re.match(self._VALID_URL, url)
|
||||||
if mobj is None:
|
|
||||||
raise ExtractorError(u'Invalid URL: %s' % url)
|
|
||||||
# extract uploader & filename from url
|
|
||||||
uploader = mobj.group(1).decode('utf-8')
|
|
||||||
file_id = uploader + "-" + mobj.group(2).decode('utf-8')
|
|
||||||
|
|
||||||
# construct API request
|
uploader = mobj.group(1)
|
||||||
file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
|
cloudcast_name = mobj.group(2)
|
||||||
# retrieve .json file with links to files
|
track_id = '-'.join((uploader, cloudcast_name))
|
||||||
request = compat_urllib_request.Request(file_url)
|
api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name)
|
||||||
try:
|
webpage = self._download_webpage(url, track_id)
|
||||||
self.report_download_json(file_url)
|
json_data = self._download_webpage(api_url, track_id,
|
||||||
jsonData = compat_urllib_request.urlopen(request).read()
|
u'Downloading cloudcast info')
|
||||||
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
|
info = json.loads(json_data)
|
||||||
raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
|
|
||||||
|
|
||||||
# parse JSON
|
preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url')
|
||||||
json_data = json.loads(jsonData)
|
song_url = preview_url.replace('/previews/', '/cloudcasts/originals/')
|
||||||
player_url = json_data['player_swf_url']
|
template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
|
||||||
formats = dict(json_data['audio_formats'])
|
final_song_url = self.check_urls(template_url % i for i in range(30))
|
||||||
|
|
||||||
req_format = self._downloader.params.get('format', None)
|
return {
|
||||||
|
'id': track_id,
|
||||||
if self._downloader.params.get('listformats', None):
|
'title': info['name'],
|
||||||
self._print_formats(formats)
|
'url': final_song_url,
|
||||||
return
|
'ext': 'mp3',
|
||||||
|
'description': info['description'],
|
||||||
if req_format is None or req_format == 'best':
|
'thumbnail': info['pictures'].get('extra_large'),
|
||||||
for format_param in formats.keys():
|
'uploader': info['user']['name'],
|
||||||
url_list = self.get_urls(formats, format_param)
|
'uploader_id': info['user']['username'],
|
||||||
# check urls
|
'upload_date': unified_strdate(info['created_time']),
|
||||||
file_url = self.check_urls(url_list)
|
'view_count': info['play_count'],
|
||||||
if file_url is not None:
|
}
|
||||||
break # got it!
|
|
||||||
else:
|
|
||||||
if req_format not in formats:
|
|
||||||
raise ExtractorError(u'Format is not available')
|
|
||||||
|
|
||||||
url_list = self.get_urls(formats, req_format)
|
|
||||||
file_url = self.check_urls(url_list)
|
|
||||||
format_param = req_format
|
|
||||||
|
|
||||||
return [{
|
|
||||||
'id': file_id.decode('utf-8'),
|
|
||||||
'url': file_url.decode('utf-8'),
|
|
||||||
'uploader': uploader.decode('utf-8'),
|
|
||||||
'upload_date': None,
|
|
||||||
'title': json_data['name'],
|
|
||||||
'ext': file_url.split('.')[-1].decode('utf-8'),
|
|
||||||
'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
|
|
||||||
'thumbnail': json_data['thumbnail_url'],
|
|
||||||
'description': json_data['description'],
|
|
||||||
'player_url': player_url.decode('utf-8'),
|
|
||||||
}]
|
|
||||||
|
@ -1,10 +1,12 @@
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
import itertools
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
compat_str,
|
compat_str,
|
||||||
compat_urlparse,
|
compat_urlparse,
|
||||||
|
compat_urllib_parse,
|
||||||
|
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
unified_strdate,
|
unified_strdate,
|
||||||
@ -53,10 +55,11 @@ class SoundcloudIE(InfoExtractor):
|
|||||||
def _resolv_url(cls, url):
|
def _resolv_url(cls, url):
|
||||||
return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
|
return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
|
||||||
|
|
||||||
def _extract_info_dict(self, info, full_title=None):
|
def _extract_info_dict(self, info, full_title=None, quiet=False):
|
||||||
video_id = info['id']
|
video_id = info['id']
|
||||||
name = full_title or video_id
|
name = full_title or video_id
|
||||||
self.report_extraction(name)
|
if quiet == False:
|
||||||
|
self.report_extraction(name)
|
||||||
|
|
||||||
thumbnail = info['artwork_url']
|
thumbnail = info['artwork_url']
|
||||||
if thumbnail is not None:
|
if thumbnail is not None:
|
||||||
@ -198,3 +201,41 @@ class SoundcloudSetIE(SoundcloudIE):
|
|||||||
'id': info['id'],
|
'id': info['id'],
|
||||||
'title': info['title'],
|
'title': info['title'],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class SoundcloudUserIE(SoundcloudIE):
|
||||||
|
_VALID_URL = r'https?://(www\.)?soundcloud.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
|
||||||
|
IE_NAME = u'soundcloud:user'
|
||||||
|
|
||||||
|
# it's in tests/test_playlists.py
|
||||||
|
_TEST = None
|
||||||
|
|
||||||
|
def _real_extract(self, url):
|
||||||
|
mobj = re.match(self._VALID_URL, url)
|
||||||
|
uploader = mobj.group('user')
|
||||||
|
|
||||||
|
url = 'http://soundcloud.com/%s/' % uploader
|
||||||
|
resolv_url = self._resolv_url(url)
|
||||||
|
user_json = self._download_webpage(resolv_url, uploader,
|
||||||
|
u'Downloading user info')
|
||||||
|
user = json.loads(user_json)
|
||||||
|
|
||||||
|
tracks = []
|
||||||
|
for i in itertools.count():
|
||||||
|
data = compat_urllib_parse.urlencode({'offset': i*50,
|
||||||
|
'client_id': self._CLIENT_ID,
|
||||||
|
})
|
||||||
|
tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data
|
||||||
|
response = self._download_webpage(tracks_url, uploader,
|
||||||
|
u'Downloading tracks page %s' % (i+1))
|
||||||
|
new_tracks = json.loads(response)
|
||||||
|
tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks)
|
||||||
|
if len(new_tracks) < 50:
|
||||||
|
break
|
||||||
|
|
||||||
|
return {
|
||||||
|
'_type': 'playlist',
|
||||||
|
'id': compat_str(user['id']),
|
||||||
|
'title': user['username'],
|
||||||
|
'entries': tracks,
|
||||||
|
}
|
||||||
|
34
youtube_dl/extractor/southparkstudios.py
Normal file
34
youtube_dl/extractor/southparkstudios.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
from .mtv import MTVIE, _media_xml_tag
|
||||||
|
|
||||||
|
|
||||||
|
class SouthParkStudiosIE(MTVIE):
    """Extractor for clips hosted on southparkstudios.com.

    Builds on MTVIE: the clip page is only used to find the ``mgid``
    identifier, which is then handed to the inherited ``_get_videos_info``.
    """
    IE_NAME = u'southparkstudios.com'
    _VALID_URL = r'https?://www\.southparkstudios\.com/clips/(?P<id>\d+)'

    _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'

    _TEST = {
        u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',
        u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4',
        u'info_dict': {
            u'title': u'Bat Daded',
            u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.',
        },
    }

    # MTVIE's own test list must not be inherited by this extractor
    _TESTS = []

    def _get_thumbnail_url(self, uri, itemdoc):
        """Return the ``url`` attribute of the group/thumbnail element of itemdoc.

        ``uri`` is accepted for signature compatibility but is not used here.
        """
        thumbnail_path = '%s/%s' % (
            _media_xml_tag('group'),
            _media_xml_tag('thumbnail'),
        )
        thumbnail_node = itemdoc.find(thumbnail_path)
        return thumbnail_node.attrib['url']

    def _real_extract(self, url):
        """Download the clip page, locate its mgid and return the videos info."""
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, video_id)
        # The mgid is embedded in the first argument of the swfobject call
        mgid = self._search_regex(
            r'swfobject.embedSWF\(".*?(mgid:.*?)"', page, u'mgid')
        return self._get_videos_info(mgid)
|
@ -10,8 +10,7 @@ class SubtitlesInfoExtractor(InfoExtractor):
|
|||||||
@property
|
@property
|
||||||
def _have_to_download_any_subtitles(self):
|
def _have_to_download_any_subtitles(self):
|
||||||
return any([self._downloader.params.get('writesubtitles', False),
|
return any([self._downloader.params.get('writesubtitles', False),
|
||||||
self._downloader.params.get('writeautomaticsub'),
|
self._downloader.params.get('writeautomaticsub')])
|
||||||
self._downloader.params.get('allsubtitles', False)])
|
|
||||||
|
|
||||||
def _list_available_subtitles(self, video_id, webpage=None):
|
def _list_available_subtitles(self, video_id, webpage=None):
|
||||||
""" outputs the available subtitles for the video """
|
""" outputs the available subtitles for the video """
|
||||||
@ -34,7 +33,7 @@ class SubtitlesInfoExtractor(InfoExtractor):
|
|||||||
available_subs_list = {}
|
available_subs_list = {}
|
||||||
if self._downloader.params.get('writeautomaticsub', False):
|
if self._downloader.params.get('writeautomaticsub', False):
|
||||||
available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage))
|
available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage))
|
||||||
if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
|
if self._downloader.params.get('writesubtitles', False):
|
||||||
available_subs_list.update(self._get_available_subtitles(video_id))
|
available_subs_list.update(self._get_available_subtitles(video_id))
|
||||||
|
|
||||||
if not available_subs_list: # error, it didn't get the available subtitles
|
if not available_subs_list: # error, it didn't get the available subtitles
|
||||||
|
@ -52,6 +52,7 @@ class TriluliluIE(InfoExtractor):
|
|||||||
{
|
{
|
||||||
'format': fnode.text,
|
'format': fnode.text,
|
||||||
'url': video_url_template % fnode.text,
|
'url': video_url_template % fnode.text,
|
||||||
|
'ext': fnode.text.partition('-')[0]
|
||||||
}
|
}
|
||||||
|
|
||||||
for fnode in format_doc.findall('./formats/format')
|
for fnode in format_doc.findall('./formats/format')
|
||||||
@ -67,7 +68,6 @@ class TriluliluIE(InfoExtractor):
|
|||||||
}
|
}
|
||||||
|
|
||||||
# TODO: Remove when #980 has been merged
|
# TODO: Remove when #980 has been merged
|
||||||
info['url'] = formats[-1]['url']
|
info.update(formats[-1])
|
||||||
info['ext'] = formats[-1]['format'].partition('-')[0]
|
|
||||||
|
|
||||||
return info
|
return info
|
||||||
|
@ -1,6 +1,11 @@
|
|||||||
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
|
from ..utils import (
|
||||||
|
compat_urlparse,
|
||||||
|
get_meta_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class UstreamIE(InfoExtractor):
|
class UstreamIE(InfoExtractor):
|
||||||
@ -43,3 +48,25 @@ class UstreamIE(InfoExtractor):
|
|||||||
'thumbnail': thumbnail,
|
'thumbnail': thumbnail,
|
||||||
}
|
}
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
class UstreamChannelIE(InfoExtractor):
    """Extractor that turns a ustream.tv channel page into a playlist.

    The channel id is read from the page's ``ustream:channel_id`` meta tag,
    then the paginated "socialstream" JSON endpoint is followed until it
    stops advertising a next page.
    """
    _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)'
    IE_NAME = u'ustream:channel'

    def _real_extract(self, url):
        """Return a playlist result with one url entry per recorded video."""
        m = re.match(self._VALID_URL, url)
        slug = m.group('slug')
        webpage = self._download_webpage(url, slug)
        channel_id = get_meta_content('ustream:channel_id', webpage)

        BASE = 'http://www.ustream.tv'
        next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id
        video_ids = []
        # Each reply names the following page in 'nextUrl'; a falsy value
        # ends the pagination.
        while next_url:
            page_url = compat_urlparse.urljoin(BASE, next_url)
            reply = json.loads(self._download_webpage(page_url, channel_id))
            # Match only the digits of the id. The previous pattern used a
            # greedy '.*' and could swallow everything up to the last double
            # quote on the same line.
            # NOTE(review): assumes reply['data'] is an HTML fragment and that
            # ids are purely numeric - confirm against a live response.
            video_ids.extend(re.findall(r'data-content-id="(\d+)"', reply['data']))
            next_url = reply['nextUrl']

        url_entries = [
            self.url_result('http://www.ustream.tv/recorded/' + vid, 'Ustream')
            for vid in video_ids
        ]
        return self.playlist_result(url_entries, channel_id)
|
||||||
|
@ -27,7 +27,7 @@ class XHamsterIE(InfoExtractor):
|
|||||||
mobj = re.match(self._VALID_URL, url)
|
mobj = re.match(self._VALID_URL, url)
|
||||||
|
|
||||||
video_id = mobj.group('id')
|
video_id = mobj.group('id')
|
||||||
mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
|
mrss_url = 'http://xhamster.com/movies/%s/.html?hd' % video_id
|
||||||
webpage = self._download_webpage(mrss_url, video_id)
|
webpage = self._download_webpage(mrss_url, video_id)
|
||||||
|
|
||||||
mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
|
mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
|
||||||
|
@ -139,7 +139,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
|
|||||||
(
|
(
|
||||||
(?:https?://)? # http(s):// (optional)
|
(?:https?://)? # http(s):// (optional)
|
||||||
(?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
|
(?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
|
||||||
tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
|
tube\.majestyc\.net/|
|
||||||
|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
|
||||||
(?:.*?\#/)? # handle anchor (#/) redirect urls
|
(?:.*?\#/)? # handle anchor (#/) redirect urls
|
||||||
(?: # the various things that can precede the ID:
|
(?: # the various things that can precede the ID:
|
||||||
(?:(?:v|embed|e)/) # v/ or embed/ or e/
|
(?:(?:v|embed|e)/) # v/ or embed/ or e/
|
||||||
@ -434,7 +435,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
|
|||||||
elif len(s) == 83:
|
elif len(s) == 83:
|
||||||
return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
|
return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
|
||||||
elif len(s) == 82:
|
elif len(s) == 82:
|
||||||
return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82]
|
return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
|
||||||
elif len(s) == 81:
|
elif len(s) == 81:
|
||||||
return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
|
return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
|
||||||
elif len(s) == 80:
|
elif len(s) == 80:
|
||||||
|
@ -249,7 +249,17 @@ def htmlentity_transform(matchobj):
|
|||||||
return (u'&%s;' % entity)
|
return (u'&%s;' % entity)
|
||||||
|
|
||||||
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
|
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
|
||||||
class AttrParser(compat_html_parser.HTMLParser):
|
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser base class that remembers the document it was given.

    Subclasses implement the ``handle_*`` callbacks; callers parse a whole
    document in one go through ``loads()``.
    """
    def __init__(self):
        # This used to be spelled ``__init``, which Python name-mangles into
        # an ordinary private method: it never ran as the constructor, so
        # ``self.html`` did not exist until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        # The HTML most recently passed to loads(); None before the first call
        self.html = None

    def loads(self, html):
        """Store html on the parser, feed it in full and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
|
||||||
|
|
||||||
|
class AttrParser(BaseHTMLParser):
|
||||||
"""Modified HTMLParser that isolates a tag with the specified attribute"""
|
"""Modified HTMLParser that isolates a tag with the specified attribute"""
|
||||||
def __init__(self, attribute, value):
|
def __init__(self, attribute, value):
|
||||||
self.attribute = attribute
|
self.attribute = attribute
|
||||||
@ -257,10 +267,9 @@ class AttrParser(compat_html_parser.HTMLParser):
|
|||||||
self.result = None
|
self.result = None
|
||||||
self.started = False
|
self.started = False
|
||||||
self.depth = {}
|
self.depth = {}
|
||||||
self.html = None
|
|
||||||
self.watch_startpos = False
|
self.watch_startpos = False
|
||||||
self.error_count = 0
|
self.error_count = 0
|
||||||
compat_html_parser.HTMLParser.__init__(self)
|
BaseHTMLParser.__init__(self)
|
||||||
|
|
||||||
def error(self, message):
|
def error(self, message):
|
||||||
if self.error_count > 10 or self.started:
|
if self.error_count > 10 or self.started:
|
||||||
@ -269,11 +278,6 @@ class AttrParser(compat_html_parser.HTMLParser):
|
|||||||
self.error_count += 1
|
self.error_count += 1
|
||||||
self.goahead(1)
|
self.goahead(1)
|
||||||
|
|
||||||
def loads(self, html):
|
|
||||||
self.html = html
|
|
||||||
self.feed(html)
|
|
||||||
self.close()
|
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def handle_starttag(self, tag, attrs):
|
||||||
attrs = dict(attrs)
|
attrs = dict(attrs)
|
||||||
if self.started:
|
if self.started:
|
||||||
@ -334,6 +338,38 @@ def get_element_by_attribute(attribute, value, html):
|
|||||||
pass
|
pass
|
||||||
return parser.get_result()
|
return parser.get_result()
|
||||||
|
|
||||||
|
class MetaParser(BaseHTMLParser):
    """
    HTMLParser variant that picks out the content attribute of the meta tag
    whose name attribute equals the requested name. If several tags match,
    the last one seen wins.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.result = None
        self.content = None
        self.name = name

    def handle_starttag(self, tag, attrs):
        # Only <meta> tags are of interest; everything else is ignored
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                self.result = attributes.get('content')

    def get_result(self):
        """Return the content found so far, or None if nothing matched."""
        return self.result
|
||||||
|
|
||||||
|
def get_meta_content(name, html):
    """
    Parse html with a MetaParser and return the content attribute of the
    meta tag whose name attribute is name (None when no such tag was seen).
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was collected before the parse error
        pass
    return meta_parser.get_result()
|
||||||
|
|
||||||
|
|
||||||
def clean_html(html):
|
def clean_html(html):
|
||||||
"""Clean an HTML snippet into a readable string"""
|
"""Clean an HTML snippet into a readable string"""
|
||||||
@ -664,7 +700,16 @@ def unified_strdate(date_str):
|
|||||||
date_str = date_str.replace(',',' ')
|
date_str = date_str.replace(',',' ')
|
||||||
# %z (UTC offset) is only supported in python>=3.2
|
# %z (UTC offset) is only supported in python>=3.2
|
||||||
date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
|
date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
|
||||||
format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
|
format_expressions = [
|
||||||
|
'%d %B %Y',
|
||||||
|
'%B %d %Y',
|
||||||
|
'%b %d %Y',
|
||||||
|
'%Y-%m-%d',
|
||||||
|
'%d/%m/%Y',
|
||||||
|
'%Y/%m/%d %H:%M:%S',
|
||||||
|
'%d.%m.%Y %H:%M',
|
||||||
|
'%Y-%m-%dT%H:%M:%SZ',
|
||||||
|
]
|
||||||
for expression in format_expressions:
|
for expression in format_expressions:
|
||||||
try:
|
try:
|
||||||
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
|
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
|
||||||
|
Loading…
Reference in New Issue
Block a user