From dfd751fb4f1c163932f1529532840e30f2df953e Mon Sep 17 00:00:00 2001 From: kaspi Date: Sat, 17 Oct 2015 23:27:03 -0400 Subject: [PATCH 1/5] [NPR] new extractor for NPR.org --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/npr.py | 72 ++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 youtube_dl/extractor/npr.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 462717b1e..b774588b8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -422,6 +422,7 @@ from .npo import ( VPROIE, WNLIE ) +from .npr import NprIE from .nrk import ( NRKIE, NRKPlaylistIE, diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py new file mode 100644 index 000000000..26a0f9bf1 --- /dev/null +++ b/youtube_dl/extractor/npr.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os.path +import re + +from ..compat import compat_urllib_parse_unquote +from ..utils import url_basename +from .common import InfoExtractor + +class NprIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?npr\.org/player/v2/mediaPlayer.html?.*id=(?P[0-9]+)' + _TEST = { + 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=445367719', + 'md5' : '458bacc24549173fe5a5aa29174a5606', + 'info_dict': { + 'id': '445367719', + 'ext': 'mp4', + 'title': 'VEGA INTL. Night School' + } +} + + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage_url = 'http://www.npr.org/player/v2/mediaPlayer.html?id=' + video_id + webpage = self._download_webpage(webpage_url, video_id) + key = 'MDAzMzQ2MjAyMDEyMzk4MTU1MDg3ZmM3MQ010' + xml_url = 'http://api.npr.org/query?id=%s&apiKey=%s' % (video_id, key) + + config = self._download_xml(xml_url,video_id, note='Downloading XML') + + audio = config.findall('./list/story/audio[@type="standard"]') + if not audio: + # audio type is primary + audio = config.findall('./list/story/audio[@type="primary"]') + + regex = ('.//*[@type="mp3"]','.//*[@type="m3u"]','.//format/wm','.//format/threegp','.//format/mp4','.//format/hls','.//format/mediastream') + album_title = config.find('.//albumTitle') + + if not album_title: + album_title = config.find('./list/story/title').text + else: + album_title = album_title.text + + print(album_title) + format = [] + entries = [] + for song in audio: + song_title = song.find('title').text + song_id = song.get('id') + song_duration = song.find('duration').text + + for r in regex: + t = song.find(r) + if t is not None: + format.append({'format': t.get('type', t.tag), + 'url' : t.text}) + + entries.append({ "title":song_title, + "id":song_id, + "duration": str(int(song_duration) / 60) +":"+ str(int(song_duration) % 60) , + "formats":format}) + format = [] + + return { + '_type': 'playlist', + 'id' : video_id, + 'title' : album_title, + 'entries': entries + } \ No newline at end of file From 0c2176b3d15a0914c867c8b5a99f26230c2ee26c Mon Sep 17 00:00:00 2001 From: kaspi Date: Sat, 17 Oct 2015 23:45:36 -0400 Subject: [PATCH 2/5] removed md5 from _TEST --- youtube_dl/extractor/npr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index 26a0f9bf1..d389e30ba 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -12,7 +12,6 @@ class NprIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?npr\.org/player/v2/mediaPlayer.html?.*id=(?P[0-9]+)' _TEST = { 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=445367719', - 'md5' : '458bacc24549173fe5a5aa29174a5606', 'info_dict': { 'id': '445367719', 'ext': 'mp4', From 0ed276dceb0c1b75af09dec5c1839bb492ed9cde Mon Sep 17 00:00:00 2001 From: kaspi Date: Fri, 23 Oct 2015 00:57:12 -0400 Subject: [PATCH 3/5] moved from xml data to json --- youtube_dl/extractor/npr.py | 74 ++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index d389e30ba..0e4146c05 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -27,45 +27,45 @@ class NprIE(InfoExtractor): webpage = self._download_webpage(webpage_url, video_id) key = 'MDAzMzQ2MjAyMDEyMzk4MTU1MDg3ZmM3MQ010' xml_url = 'http://api.npr.org/query?id=%s&apiKey=%s' % (video_id, key) + json_url = 'http://api.npr.org/query?id=%s&apiKey=%s&format=json' % (video_id, key) - config = self._download_xml(xml_url,video_id, note='Downloading XML') - - audio = config.findall('./list/story/audio[@type="standard"]') - if not audio: - # audio type is primary - audio = config.findall('./list/story/audio[@type="primary"]') - - regex = ('.//*[@type="mp3"]','.//*[@type="m3u"]','.//format/wm','.//format/threegp','.//format/mp4','.//format/hls','.//format/mediastream') - album_title = config.find('.//albumTitle') - - if not album_title: - album_title = config.find('./list/story/title').text - else: - album_title = album_title.text - - print(album_title) - format = [] + formats = [] entries = [] - for song in audio: - song_title = song.find('title').text - song_id = song.get('id') - song_duration = song.find('duration').text - for r in regex: - t = song.find(r) - if t is not None: - format.append({'format': t.get('type', t.tag), - 'url' : t.text}) + config = self._download_json(json_url, video_id) - entries.append({ "title":song_title, - "id":song_id, - "duration": str(int(song_duration) / 60) +":"+ str(int(song_duration) % 60) , - "formats":format}) - format = [] + content = config["list"]["story"] - return { - '_type': 'playlist', - 'id' : video_id, - 'title' : album_title, - 'entries': entries - } \ No newline at end of file + album_title = config["list"]["story"][0]['song'][0]['album']['albumTitle'] + print album_title['$text'] + + for key in content: + if "audio" in key: + for x in key['audio']: + if x['type'] == 'standard': + playlist = True + song_duration = x["duration"]['$text'] + song_title = x["title"]["$text"] + song_id = x["id"] + + for k in x["format"]: + if type(x["format"][k]) is list: + for z in x["format"][k]: + formats.append({ 'format': z['type'], + 'url' : z['$text'] + }) + else: + formats.append({ 'format': k, + 'url' : x["format"][k]['$text'] + }) + + entries.append({ "title":song_title, + "id":song_id, + "duration": song_duration , + "formats":formats}) + formats = [] + + return { '_type': 'playlist', + 'id' : video_id, + 'title' : album_title, + 'entries': entries } \ No newline at end of file From bb15ee2c051c0386f59bff45c8a0908d044712b0 Mon Sep 17 00:00:00 2001 From: kaspi Date: Fri, 23 Oct 2015 01:03:52 -0400 Subject: [PATCH 4/5] test --- youtube_dl/extractor/npr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index 0e4146c05..73a8cc3b5 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -68,4 +68,4 @@ class NprIE(InfoExtractor): return { '_type': 'playlist', 'id' : video_id, 'title' : album_title, - 'entries': entries } \ No newline at end of file + 'entries': entries } From 190e684e18fbe360b820736248eb48ad6ac6fd10 Mon Sep 17 00:00:00 2001 From: kaspi Date: Fri, 23 Oct 2015 22:42:03 -0400 Subject: [PATCH 5/5] changed _TEST url to one that will not expire, so tests would not be failing --- youtube_dl/extractor/npr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index 73a8cc3b5..a823bc096 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -11,11 +11,11 @@ from .common import InfoExtractor class NprIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?npr\.org/player/v2/mediaPlayer.html?.*id=(?P[0-9]+)' _TEST = { - 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=445367719', + 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=449974205', 'info_dict': { - 'id': '445367719', + 'id': '449974205', 'ext': 'mp4', - 'title': 'VEGA INTL. Night School' + 'title': 'New Music From Beach House, Chairlift, CMJ Discoveries And More' } }