From 5333bea24f70d141da33b1e6290323538b66ee7a Mon Sep 17 00:00:00 2001
From: Frederic Bournival
Date: Sun, 19 Apr 2020 17:13:54 -0400
Subject: [PATCH] First implementation for the TV5UnisCa extractor

---
 youtube_dl/extractor/extractors.py |  1 +
 youtube_dl/extractor/tv5unisca.py  | 99 ++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)
 create mode 100644 youtube_dl/extractor/tv5unisca.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index e407ab3d9..8a24e8c95 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1181,6 +1181,7 @@ from .tv2dk import (
 from .tv2hu import TV2HuIE
 from .tv4 import TV4IE
 from .tv5mondeplus import TV5MondePlusIE
+from .tv5unisca import TV5UnisCaIE
 from .tva import TVAIE
 from .tvanouvelles import (
     TVANouvellesIE,
diff --git a/youtube_dl/extractor/tv5unisca.py b/youtube_dl/extractor/tv5unisca.py
new file mode 100644
index 000000000..4f126d25e
--- /dev/null
+++ b/youtube_dl/extractor/tv5unisca.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    get_element_by_id,
+    try_get,
+    url_or_none,
+)
+
+
+class TV5UnisCaIE(InfoExtractor):
+    """Information extractor for video pages on tv5unis.ca.
+
+    Metadata (JSON-LD) and media URLs are both read from the
+    ``props.apolloState`` mapping of the JSON document embedded in the
+    page's ``__NEXT_DATA__`` element.
+    """
+
+    IE_DESC = 'TV5UNISCA'
+    # Everything after /videos/, up to a query string or fragment, is the id.
+    _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/(?P<id>[^?#]+)'
+    # TODO: add at least one test URL.
+    _TESTS = []
+    _GEO_BYPASS = False
+
+    # apolloState key of the entry holding the page's JSON-LD. The numeric
+    # id is matched generically instead of being pinned to one value.
+    _METADATA_KEY_RE = r'\$ArtisanBlocksPageMetaData:\d+\.blockConfiguration\.pageMetaDataConfiguration$'
+    # apolloState keys of the entries describing a video's encodings.
+    _ENCODING_KEY_RE = r'\$Video:\d+\.encodings\.'
+
+    def _real_extract(self, format_url):
+        """Return the info dict for the video page at ``format_url``.
+
+        Despite its name, ``format_url`` is the page URL matched by
+        ``_VALID_URL``, not the URL of a media format.
+
+        Raises ExtractorError if the embedded page data or its JSON-LD
+        metadata cannot be found.
+        """
+        display_id = self._match_id(format_url)
+        webpage = self._download_webpage(format_url, display_id)
+
+        next_data = get_element_by_id('__NEXT_DATA__', webpage)
+        if not next_data:
+            raise ExtractorError(
+                'Unable to find __NEXT_DATA__', video_id=display_id)
+        apollo_state = try_get(
+            self._parse_json(next_data, display_id),
+            lambda x: x['props']['apolloState'], dict)
+        if not apollo_state:
+            raise ExtractorError(
+                'Unable to find apolloState', video_id=display_id)
+
+        json_ld = None
+        formats = []
+        for key, entry in apollo_state.items():
+            if not isinstance(entry, dict):
+                continue
+            if re.match(self._METADATA_KEY_RE, key):
+                # Keep the first metadata entry that carries JSON-LD.
+                json_ld = json_ld or entry.get('jsonLd')
+                continue
+            if not re.match(self._ENCODING_KEY_RE, key):
+                continue
+            media_url = url_or_none(entry.get('url'))
+            if not media_url:
+                continue
+            # The manifest types are mutually exclusive, hence elif.
+            if media_url.endswith('.m3u8'):
+                formats.extend(self._extract_m3u8_formats(
+                    media_url, display_id, 'mp4',
+                    entry_protocol='m3u8_native', m3u8_id='hls',
+                    fatal=False))
+            elif media_url.endswith('.ism/manifest'):
+                formats.extend(self._extract_ism_formats(
+                    media_url, display_id, ism_id='mss', fatal=False))
+            elif media_url.endswith('.mp4'):
+                formats.append({
+                    'url': media_url,
+                    'format_id': 'http',
+                })
+
+        if not json_ld:
+            raise ExtractorError(
+                'Unable to find JSON-LD metadata', video_id=display_id)
+        # Fails with a clear error when no format was found and orders the
+        # formats from worst to best, as the info dict contract requires.
+        self._sort_formats(formats)
+
+        info_dict = self._json_ld(json_ld, display_id)
+        info_dict['id'] = info_dict['display_id'] = display_id
+        info_dict['formats'] = formats
+
+        return info_dict