From a93652070dc27dfee8e57cccae2709b21a643f20 Mon Sep 17 00:00:00 2001 From: zhengxin Date: Fri, 16 Oct 2015 17:24:19 +0800 Subject: [PATCH 1/4] add imooc --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/imooc.py | 49 ++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 youtube_dl/extractor/imooc.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 462717b1e..e1b7dd151 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -243,6 +243,7 @@ from .imgur import ( ImgurIE, ImgurAlbumIE, ) +from .imooc import ImoocVideoIE from .ina import InaIE from .indavideo import ( IndavideoIE, diff --git a/youtube_dl/extractor/imooc.py b/youtube_dl/extractor/imooc.py new file mode 100644 index 000000000..c2c295206 --- /dev/null +++ b/youtube_dl/extractor/imooc.py @@ -0,0 +1,49 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_request, +) + +class ImoocVideoIE(InfoExtractor): + _VALID_URL = r'http://www.imooc.com/video/(?P[0-9]+)' + _TEST = { + 'url': 'http://www.imooc.com/video/5454', + 'md5': '03a0f36327721551fce08776fe8f70f1', + 'info_dict': { + 'id': '5454', + 'ext': 'mp4', + 'title': '3-1 网络环境查看命令', + } + } + + # _ANDROID_USER_AGENT = 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5' + # _ANDROID_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)' + def _real_extract(self, url): + video_id = self._match_id(url) + # android_req = compat_urllib_request.Request(url) + # android_req.add_header('User-Agent', self._ANDROID_USER_AGENT) + # webpage = self._download_webpage(android_req, video_id, fatal=False) + webpage = self._download_webpage(url, video_id) + + print webpage + + title = self._search_regex(r'var videoTitle = (.+?)', webpage, 'title') + # title = self._search_regex(r'(.+?)', webpage, 'title') + + # url = self._search_regex(r'property="(.+?)"', webpage, 'url'); + url = self._search_regex(r'webkit-playsinline src="(.+?)"', webpage, 'url') + # url = self._html_search_regex(r'', webpage, 'url') + # + # description = self._html_search_regex( + # r'(?s)
.*?
]*>(.*?)
', + # webpage, 'description', fatal=False) + return { + 'id': video_id, + 'title': title, + 'url': url + } \ No newline at end of file From 5db87589da42ec7016addac2a3e017c7a83c3996 Mon Sep 17 00:00:00 2001 From: zhengxin Date: Tue, 20 Oct 2015 10:45:06 +0800 Subject: [PATCH 2/4] [ImoocVideoIE] Add new extractor --- youtube_dl/extractor/imooc.py | 37 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/imooc.py b/youtube_dl/extractor/imooc.py index c2c295206..f42f67873 100644 --- a/youtube_dl/extractor/imooc.py +++ b/youtube_dl/extractor/imooc.py @@ -11,37 +11,36 @@ from ..compat import ( class ImoocVideoIE(InfoExtractor): _VALID_URL = r'http://www.imooc.com/video/(?P[0-9]+)' - _TEST = { + _TESTS = [ + { + 'url': 'http://www.imooc.com/video/6511', + 'md5': '756ca7b6e934aedee496e208f290bff3', + 'info_dict': { + 'id': '6511', + 'ext': 'mp4', + 'title': 'Bash变量与变量分类'} + }, + { 'url': 'http://www.imooc.com/video/5454', - 'md5': '03a0f36327721551fce08776fe8f70f1', + 'md5': '1feb8b14a07f5272b400b271292cc1f6', 'info_dict': { 'id': '5454', 'ext': 'mp4', - 'title': '3-1 网络环境查看命令', + 'title': '网络环境查看命令', } } + ] - # _ANDROID_USER_AGENT = 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5' - # _ANDROID_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)' def _real_extract(self, url): video_id = self._match_id(url) - # android_req = compat_urllib_request.Request(url) - # android_req.add_header('User-Agent', self._ANDROID_USER_AGENT) - # webpage = self._download_webpage(android_req, video_id, fatal=False) - webpage = self._download_webpage(url, video_id) - print webpage + json_url = 'http://www.imooc.com/course/ajaxmediainfo/?mid=%s&mode=flash' % video_id + data = self._download_json(json_url, video_id, 'Downloading video formats') - title = self._search_regex(r'var videoTitle = (.+?)', webpage, 'title') - # title = self._search_regex(r'(.+?)', webpage, 'title') + url = data['data']['result']['mpath'][0] + + title = data['data']['result']['name'] - # url = self._search_regex(r'property="(.+?)"', webpage, 'url'); - url = self._search_regex(r'webkit-playsinline src="(.+?)"', webpage, 'url') - # url = self._html_search_regex(r'', webpage, 'url') - # - # description = self._html_search_regex( - # r'(?s)
.*?
]*>(.*?)
', - # webpage, 'description', fatal=False) return { 'id': video_id, 'title': title, From c1a0b92c85cbe256a0fd214a83b430562f2fd64a Mon Sep 17 00:00:00 2001 From: zhengxin Date: Tue, 20 Oct 2015 11:24:47 +0800 Subject: [PATCH 3/4] update: docs/supportedsites.md --- docs/supportedsites.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index dc0354095..dba8210c5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -220,6 +220,7 @@ - **imdb:list**: Internet Movie Database lists - **Imgur** - **ImgurAlbum** + - **Imooc**: 慕课网 - **Ina** - **Indavideo** - **IndavideoEmbed** From bd33694259ce7b32c78663cefdc9a8d139a00dc1 Mon Sep 17 00:00:00 2001 From: zhengxin Date: Tue, 20 Oct 2015 16:54:05 +0800 Subject: [PATCH 4/4] add error throwing --- youtube_dl/extractor/imooc.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/imooc.py b/youtube_dl/extractor/imooc.py index f42f67873..eeef44d73 100644 --- a/youtube_dl/extractor/imooc.py +++ b/youtube_dl/extractor/imooc.py @@ -8,6 +8,10 @@ from ..compat import ( compat_urllib_parse_unquote, compat_urllib_request, ) +from ..utils import ( + ExtractorError, + HEADRequest, +) class ImoocVideoIE(InfoExtractor): _VALID_URL = r'http://www.imooc.com/video/(?P[0-9]+)' @@ -37,9 +41,24 @@ class ImoocVideoIE(InfoExtractor): json_url = 'http://www.imooc.com/course/ajaxmediainfo/?mid=%s&mode=flash' % video_id data = self._download_json(json_url, video_id, 'Downloading video formats') - url = data['data']['result']['mpath'][0] + if data['result'] == 0: + urls = data['data']['result']['mpath'] + title = data['data']['result']['name'] + + for i, url in enumerate(urls): + req = HEADRequest(url) + res = self._request_webpage( + req, video_id, note='Testing video URL %d' % i, errnote=False) + if res: + break + else: + raise ExtractorError('No working video URLs found') + + else: + print data['msg'] + raise ValueError(data['msg']) + - title = data['data']['result']['name'] return { 'id': video_id,