From 46561ecd6c3df3a4695f188d8998968972151e73 Mon Sep 17 00:00:00 2001 From: frinkelpi Date: Sat, 22 Oct 2016 20:59:54 +0200 Subject: [PATCH 1/2] [CanalU] Add new extractor --- youtube_dl/extractor/canalu.py | 73 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 74 insertions(+) create mode 100644 youtube_dl/extractor/canalu.py diff --git a/youtube_dl/extractor/canalu.py b/youtube_dl/extractor/canalu.py new file mode 100644 index 000000000..645f9b5a2 --- /dev/null +++ b/youtube_dl/extractor/canalu.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + month_by_name, + unescapeHTML +) +from re import DOTALL + + +class CanalUIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?canal-u\.tv/video/(?P.*)' + _TESTS = [ + { + 'url': 'https://www.canal-u.tv/video/ecole_normale_superieure_de_lyon/gouvernement.3118', + 'md5': '9c185d26b232c3c06d805c0d639af254', + 'info_dict': { + 'id': 'ecole_normale_superieure_de_lyon/gouvernement.3118', + 'ext': 'mp4', + 'duration': 600, + 'creator': 'SENELLART Michel', + 'title': 'Gouvernement', + 'description': 'Les essentiels : La philo par les mots - Gouvernement', + 'thumbnail': 'https://www.canal-u.tv/media/images/groupe_ens_lsh/gouvernement_3118/vignette.les.essentiels.jpg', + 'release_date': '20071015'} + }, + { + 'url': 'https://www.canal-u.tv/video/ecole_normale_superieure_de_lyon/les_competences_en_situation_d_apprentissage.20850', + 'md5': 'f06aab78bf60c2a2340a733c18a5ef10', + 'info_dict': { + 'id': 'ecole_normale_superieure_de_lyon/les_competences_en_situation_d_apprentissage.20850', + 'ext': 'mp4', + 'duration': 360, + 'creator': 'COULET Jean-Claude', + 'title': 'Les compétences en situation d\'apprentissage', + 'description': 'Cette capsule présente comment on peut décliner la notion de compétence,\r dans les situations pédagogiques, en donnant un sens précis aux \r concepts de situation, tâche, et activité. Elle ouvre des pistes de \r réflexion sur l\'articulation de ces notions dans les situations \r d\'éducation et de formation.', + 'thumbnail': 'https://www.canal-u.tv/media/images/groupe_ens_lsh/les.comp.tences.en.situation.d.apprentissage_20850/craies.jpg', + 'release_date': '20151215', + } + }] + + def _real_extract(self, url): + video = {} + video_id = self._match_id(url) + video['id'] = video_id + webpage = self._download_webpage(url, video_id) + + video['title'] = self._og_search_title(webpage) + video['url'] = self._html_search_regex(r'file: "(.*?\.mp4)",', webpage, 'url') + video['ext'] = 'mp4' + + # Thumbnail + video['thumbnail'] = self._og_search_thumbnail(webpage, default=None) + # Description + description_regex = r'
.*?

\s*(.*?)\s*

.*?
' + video['description'] = self._html_search_regex(description_regex, webpage, 'description', flags=DOTALL, default=None) + # Other fields + for field in [ + ['duration', 'Durée du programme', '(\d+) min'], + ['creator', 'Auteur\(s\)', '(.*?)'], + ['release_date', 'Date de réalisation', '(.*?)'], + ]: + regex = r'
{0} : {1}
'.format(field[1], field[2]) + video[field[0]] = self._html_search_regex(regex, webpage, field[0], flags=DOTALL, default=None) + # Duration + video['duration'] = int_or_none(video['duration'], invscale=60) + # Release date + date = video['release_date'].split(' ') + video["release_date"] = "{0}{1}{2}".format(date[2], month_by_name(unescapeHTML(date[1]).lower(), 'fr'), date[0]) + + return video diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a693f8c56..0fd84f343 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -128,6 +128,7 @@ from .camdemy import ( ) from .camwithher import CamWithHerIE from .canalplus import CanalplusIE +from .canalu import CanalUIE from .canalc2 import Canalc2IE from .canvas import CanvasIE from .carambatv import ( From ab3b80d92a4652025f74be01dee8ee189e30996c Mon Sep 17 00:00:00 2001 From: frinkelpi Date: Sat, 17 Dec 2016 14:30:44 +0100 Subject: [PATCH 2/2] Remove low importance comments --- youtube_dl/extractor/canalu.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/extractor/canalu.py b/youtube_dl/extractor/canalu.py index 645f9b5a2..07bf42294 100644 --- a/youtube_dl/extractor/canalu.py +++ b/youtube_dl/extractor/canalu.py @@ -51,12 +51,9 @@ class CanalUIE(InfoExtractor): video['url'] = self._html_search_regex(r'file: "(.*?\.mp4)",', webpage, 'url') video['ext'] = 'mp4' - # Thumbnail video['thumbnail'] = self._og_search_thumbnail(webpage, default=None) - # Description description_regex = r'
.*?

\s*(.*?)\s*

.*?
' video['description'] = self._html_search_regex(description_regex, webpage, 'description', flags=DOTALL, default=None) - # Other fields for field in [ ['duration', 'Durée du programme', '(\d+) min'], ['creator', 'Auteur\(s\)', '(.*?)'], @@ -64,9 +61,7 @@ class CanalUIE(InfoExtractor): ]: regex = r'
{0} : {1}
'.format(field[1], field[2]) video[field[0]] = self._html_search_regex(regex, webpage, field[0], flags=DOTALL, default=None) - # Duration video['duration'] = int_or_none(video['duration'], invscale=60) - # Release date date = video['release_date'].split(' ') video["release_date"] = "{0}{1}{2}".format(date[2], month_by_name(unescapeHTML(date[1]).lower(), 'fr'), date[0])