From 4b99952e6cf9f767ec6dd5ff0ce89864a52fd703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20H=C3=B6pfl?= <daniel@hoepfl.de> Date: Wed, 13 Feb 2019 16:29:43 +0100 Subject: [PATCH 1/3] Fixes #18906: Fixes title extraction for vivo.sx. --- youtube_dl/extractor/shared.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 931a0f70e..4326fc820 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -1,6 +1,11 @@ from __future__ import unicode_literals -from .common import InfoExtractor +import re + +from .common import ( + InfoExtractor, + unescapeHTML +) from ..compat import compat_b64decode from ..utils import ( ExtractorError, @@ -22,8 +27,7 @@ class SharedBaseIE(InfoExtractor): video_url = self._extract_video_url(webpage, video_id, url) - title = compat_b64decode(self._html_search_meta( - 'full:title', webpage, 'title')).decode('utf-8') + title = self._extract_title(webpage) filesize = int_or_none(self._html_search_meta( 'full:size', webpage, 'file size', fatal=False)) @@ -35,6 +39,10 @@ class SharedBaseIE(InfoExtractor): 'title': title, } + def _extract_title(self, webpage): + return compat_b64decode(self._html_search_meta( + 'full:title', webpage, 'title')).decode('utf-8') + class SharedIE(SharedBaseIE): IE_DESC = 'shared.sx' @@ -86,6 +94,14 @@ class VivoIE(SharedBaseIE): }, } + def _extract_title(self, webpage): + data_title = self._search_regex( + r'data-name\s*=\s*(["\'])(?P<title>(?:.(?!\1))*.)\1', webpage, + 'title', default=None, group='title') + if data_title: + return unescapeHTML(re.sub(r"\.[a-z0-9]{3,4}$", "", data_title)) + return None + def _extract_video_url(self, webpage, video_id, *args): def decode_url(encoded_url): return compat_b64decode(encoded_url).decode('utf-8') From 0b0b0cf4d7f2da56ea810359ee4a42e48786fe15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20H=C3=B6pfl?= <daniel@hoepfl.de> Date: Fri, 15 Feb 2019 07:28:24 +0100 
Subject: [PATCH 2/3] Improved code based on review. --- youtube_dl/extractor/shared.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 4326fc820..67d83b7e7 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -2,16 +2,15 @@ from __future__ import unicode_literals import re -from .common import ( - InfoExtractor, - unescapeHTML -) +from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( ExtractorError, int_or_none, url_or_none, urlencode_postdata, + unescapeHTML, + extract_attributes, ) @@ -95,12 +94,10 @@ class VivoIE(SharedBaseIE): } def _extract_title(self, webpage): - data_title = self._search_regex( - r'data-name\s*=\s*(["\'])(?P<title>(?:.(?!\1))*.)\1', webpage, - 'title', default=None, group='title') - if data_title: - return unescapeHTML(re.sub(r"\.[a-z0-9]{3,4}$", "", data_title)) - return None + stream_content = extract_attributes(self._search_regex( + r'(<div[^>]+class="[^"]*stream-content[^"]*"[^>]*>)', + webpage, 'stream-content element')) + return stream_content['data-name'] or self._og_search_title(webpage) def _extract_video_url(self, webpage, video_id, *args): def decode_url(encoded_url): From 2ff2b5ff79d0ad03ad5c88fc6509f3975f00eb8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20H=C3=B6pfl?= <daniel@hoepfl.de> Date: Thu, 28 Feb 2019 19:00:17 +0100 Subject: [PATCH 3/3] - Returned to regex to extract title. 
--- youtube_dl/extractor/shared.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 67d83b7e7..eade8fd9e 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -10,7 +10,6 @@ from ..utils import ( url_or_none, urlencode_postdata, unescapeHTML, - extract_attributes, ) @@ -94,10 +93,12 @@ class VivoIE(SharedBaseIE): } def _extract_title(self, webpage): - stream_content = extract_attributes(self._search_regex( - r'(<div[^>]+class="[^"]*stream-content[^"]*"[^>]*>)', - webpage, 'stream-content element')) - return stream_content['data-name'] or self._og_search_title(webpage) + data_title = self._search_regex( + r'data-name\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', webpage, + 'title', default=None, group='title') + if data_title: + return unescapeHTML(re.sub(r"\.[a-z0-9]{3,4}$", "", data_title)) + return self._og_search_title(webpage) def _extract_video_url(self, webpage, video_id, *args): def decode_url(encoded_url):