1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-03-28 05:50:01 +08:00

[franceculture] improve extractor

This commit is contained in:
flatgreen 2015-12-27 15:30:45 +01:00
parent 607d65fbce
commit bd7b1dd250
2 changed files with 52 additions and 5 deletions

View File

@ -198,7 +198,10 @@ from .fourtube import FourTubeIE
from .foxgay import FoxgayIE from .foxgay import FoxgayIE
from .foxnews import FoxNewsIE from .foxnews import FoxNewsIE
from .foxsports import FoxSportsIE from .foxsports import FoxSportsIE
from .franceculture import FranceCultureIE from .franceculture import (
FranceCultureIE,
FranceCultureUrlIE,
)
from .franceinter import FranceInterIE from .franceinter import FranceInterIE
from .francetv import ( from .francetv import (
PluzzIE, PluzzIE,

View File

@ -1,6 +1,8 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_urlparse, compat_urlparse,
@ -22,14 +24,14 @@ class FranceCultureIE(InfoExtractor):
'alt_title': 'Carnet nomade | 13-14', 'alt_title': 'Carnet nomade | 13-14',
'vcodec': 'none', 'vcodec': 'none',
'upload_date': '20140301', 'upload_date': '20140301',
'thumbnail': r're:^http://www\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$', 'thumbnail': r're:^http://static\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$',
'description': 'startswith:Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche des « geeks », une enquête menée aux Etats', 'description': 'startswith:Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche',
'timestamp': 1393700400, 'timestamp': 1393700400,
} }
} }
def _real_extract(self, url): def _extract_infos_from_player(self, url, video_id):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
video_path = self._search_regex( video_path = self._search_regex(
@ -42,6 +44,9 @@ class FranceCultureIE(InfoExtractor):
r'<a id="player".*?>\s+<img src="([^"]+)"', r'<a id="player".*?>\s+<img src="([^"]+)"',
webpage, 'thumbnail', fatal=False) webpage, 'thumbnail', fatal=False)
display_id = self._search_regex(
r'<span class="path-diffusion">emission-(.*?)</span>', webpage, 'display_id')
title = self._html_search_regex( title = self._html_search_regex(
r'<span class="title-diffusion">(.*?)</span>', webpage, 'title') r'<span class="title-diffusion">(.*?)</span>', webpage, 'title')
alt_title = self._html_search_regex( alt_title = self._html_search_regex(
@ -66,4 +71,43 @@ class FranceCultureIE(InfoExtractor):
'alt_title': alt_title, 'alt_title': alt_title,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'description': description, 'description': description,
'display_id': display_id
} }
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_infos_from_player(url, video_id)
class FranceCultureUrlIE(FranceCultureIE):
    """Extractor for franceculture.fr ``emission-*`` programme pages.

    Such a page does not carry the audio itself; it links to the player
    with an anchor of the form
    ``<a class="rf-player-open" href="/player/reecouter?play=5093239">``.
    The link is resolved against the page URL and the actual extraction is
    delegated to ``FranceCultureIE._extract_infos_from_player``.
    """
    _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emission-(?P<id>[^?#]+)'
    _TEST = {
        'url': 'http://www.franceculture.fr/emission-les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13',
        'info_dict': {
            'title': 'Jean-Gabriel Périot, cinéaste',
            'alt_title': 'Les Carnets de la création',
            'id': '5093239',
            'display_id': 'les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13',
            'ext': 'mp3',
            'timestamp': 1444762500,
            'upload_date': '20151013',
            'description': 'startswith:Aujourd\'hui dans "Les carnets de la création", le cinéaste',
        }
    }

    def _real_extract(self, url):
        """Locate the player link on the programme page and extract from it.

        Returns the info dict built by ``_extract_infos_from_player``, or
        ``None`` (after printing a notice) when the page has no player
        link, i.e. there is no audio to download.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # default=None makes a missing player link a normal, non-fatal
        # outcome without relying on a magic sentinel string.
        player_path = self._html_search_regex(
            r'<a class="rf-player-open".*?href="([^"]+)"',
            webpage, 'video path', default=None)
        if player_path is None:
            self.to_screen('no player : no sound in this page.')
            return None

        # Go through the extractor's regex helper so that a link without a
        # numeric ``play`` parameter is reported as an extraction failure
        # instead of crashing with AttributeError on a None match object.
        video_id = self._search_regex(
            r'play=([0-9]+)', player_path, 'video id')

        # The href is site-relative; resolve it against the page URL.
        player_url = compat_urlparse.urljoin(url, player_path)
        return self._extract_infos_from_player(player_url, video_id)