# Origin: git patch 1d0141565db37e26ece7be815d96ceb32127c733
#   From: alex
#   Date: Wed, 15 Apr 2015 18:13:16 -0400
#   Subject: [PATCH] [gfycat] Add new extractor
#
# The patch creates youtube_dl/extractor/gfycat.py (this module) and registers
# it in youtube_dl/extractor/__init__.py by adding, between the `.generic` and
# `.giantbomb` imports:
#
#     from .gfycat import GfycatIE
from __future__ import unicode_literals

import re
import json

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    js_to_json,
    mimetype2ext,
    ExtractorError,
)


class GfycatIE(InfoExtractor):
    """Information extractor for gfycat.com pages and direct media links.

    The URL may carry a ``.gif``/``.webm``/``.mp4`` suffix; when it does, the
    format with that extension is given the highest preference.
    """

    _VALID_URL = r'https?://(?:\w+\.)?gfycat\.com/(?P<id>[a-zA-Z]+)(?:\.(?P<ext>gif|webm|mp4))?'
    _TESTS = [{
        'url': 'http://gfycat.com/RequiredUnkemptBuzzard',
        'info_dict': {
            'id': 'RequiredUnkemptBuzzard',
            'title': 'Headshot!',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://giant.gfycat.com/RequiredUnkemptBuzzard.gif',
        'info_dict': {
            'id': 'RequiredUnkemptBuzzard',
            'title': 'Headshot!',
            'ext': 'gif',
        },
    }]

    # Extensions probed on the page.  The position in this tuple is the
    # default format preference, so the last entry wins unless the URL asked
    # for a specific extension.
    _FORMAT_EXTS = ('webm', 'gif', 'mp4')

    # Preference assigned to the format whose extension was requested in the
    # URL; larger than any index into _FORMAT_EXTS.
    _REQUESTED_PREFERENCE = 1000

    def _gfy_int(self, key, webpage, name):
        """Return the quoted integer assigned to *key* in *webpage*, or None.

        A missing value is reported as a warning (non-fatal) rather than
        aborting the extraction.
        """
        return int_or_none(self._search_regex(
            r'%s[\s=]*"(\d+)"' % key, webpage, name, fatal=False))

    def _extract_formats(self, webpage, requested_ext, width, height,
                         framerate, duration):
        """Build the format list from the ``gfy<Ext>Url``/``gfy<Ext>Size`` values.

        Extensions whose URL is absent from the page are skipped, so the
        returned list may be empty.
        """
        formats = []
        for rank, ext in enumerate(self._FORMAT_EXTS):
            prefix = 'gfy' + ext.title()  # gfyWebm / gfyGif / gfyMp4
            media_url = self._search_regex(
                r'%sUrl[\s=]*"(.*?)"' % prefix, webpage,
                '%s URL' % ext, default=None)
            if not media_url:
                continue

            filesize = int_or_none(self._search_regex(
                r'%sSize[\s=]*"(\d+)"' % prefix, webpage,
                '%s size' % ext, default=None))

            # vbr is expressed in KBit/s: bytes -> bits, per second, / 1000.
            vbr = None
            if filesize and duration:
                vbr = filesize * 8 / duration / 1000

            if ext == requested_ext:
                preference = self._REQUESTED_PREFERENCE
            else:
                preference = rank

            formats.append({
                'format_id': ext,
                'url': self._proto_relative_url(media_url),
                'ext': ext,
                'acodec': 'none',
                'width': width,
                'height': height,
                'fps': framerate,
                'vbr': vbr,
                'filesize': filesize,
                'preference': preference,
                # NOTE(review): the original patch forces this User-Agent for
                # media downloads; presumably the host serves browsers
                # differently -- confirm before changing.
                'http_headers': {
                    'User-Agent': 'youtube-dl (like wget)',
                },
            })
        return formats

    def _real_extract(self, url):
        """Download the gfycat page for *url* and return its info dict.

        Raises ExtractorError (expected) when the page exposes no media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        requested_ext = mobj.group('ext')  # None when the URL has no suffix

        # Always scrape the canonical page, whatever subdomain/suffix was given.
        webpage = self._download_webpage(
            'http://gfycat.com/' + video_id, video_id)

        width = self._gfy_int('gfyWidth', webpage, 'width')
        height = self._gfy_int('gfyHeight', webpage, 'height')
        framerate = self._gfy_int('gfyFrameRate', webpage, 'framerate')
        frames = self._gfy_int('gfyNumFrames', webpage, 'frames')
        views = self._gfy_int('gfyViews', webpage, 'views')

        # A title is mandatory for the info dict, so fall back to the id when
        # the page has none (or an empty one).
        title = self._search_regex(
            r'class="gfyTitle">([^<]*)', webpage, 'title',
            fatal=False) or video_id

        # Either value may be missing or zero; float() avoids Python 2
        # integer division truncating the result.
        duration = None
        if frames and framerate:
            duration = float(frames) / framerate

        formats = self._extract_formats(
            webpage, requested_ext, width, height, framerate, duration)
        if not formats:
            raise ExtractorError(
                'No sources found for gfycat %s. be sure to link to the page with the embed on it.' % video_id,
                expected=True)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'duration': duration,
            'view_count': views,
        }