From d26a533272c55957edcb13163fbe91788769b1b3 Mon Sep 17 00:00:00 2001 From: AlexAplin Date: Thu, 14 Jun 2018 23:22:49 -0400 Subject: [PATCH 1/3] [Iwara] Add new metadata extractors --- youtube_dl/extractor/iwara.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index a7514fc80..92f8bc969 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -5,8 +5,13 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse from ..utils import ( int_or_none, + str_to_int, mimetype2ext, remove_end, + clean_html, + get_element_by_class, + get_element_by_id, + unified_strdate, ) @@ -20,6 +25,12 @@ class IwaraIE(InfoExtractor): 'ext': 'mp4', 'title': '【MMD R-18】ガールフレンド carry_me_off', 'age_limit': 18, + 'upload_date': '20150828', + 'uploader': 'Reimu丨Action', + 'description': '禁止转载\n=acfun=\n=bilibili=\n=youtube=\n=stage=\n=motion=\n=camera=\n=dress=', + 'comment_count': int, + 'like_count': int, + 'view_count': int, }, }, { 'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO', @@ -71,6 +82,23 @@ class IwaraIE(InfoExtractor): title = remove_end(self._html_search_regex( r'([^<]+)', webpage, 'title'), ' | Iwara') + upload_date = unified_strdate(self._html_search_regex( + r'作成日:(\d{4}-\d{2}-\d{2})', webpage, 'upload_date', fatal=False)) + + uploader = get_element_by_class('username', webpage) + + description = clean_html(get_element_by_class('field-type-text-with-summary', webpage)) + + comments_id = get_element_by_id('comments', webpage) + comment_count = int_or_none(self._html_search_regex( + r'([\d,]+)', get_element_by_class('title', comments_id), 'comment_count', fatal=False)) + + node_views_class = get_element_by_class('node-views', webpage) + like_count = str_to_int(self._html_search_regex( + r'glyphicon-heart[^>]+><\/i>\s+([\d,]+)', node_views_class, 'like_count', fatal=False)) + view_count = str_to_int(self._html_search_regex( + r'glyphicon-eye-open[^>]+><\/i>\s+([\d,]+)', node_views_class, 'view_count', fatal=False)) + formats = [] for a_format in video_data: format_id = a_format.get('resolution') @@ -92,4 +120,10 @@ class IwaraIE(InfoExtractor): 'title': title, 'age_limit': age_limit, 'formats': formats, + 'upload_date': upload_date, + 'uploader': uploader, + 'description': description, + 'comment_count': comment_count, + 'like_count': like_count, + 'view_count': view_count, } From cdde4e4764993cbbe466f4c589d87f3e63afb40a Mon Sep 17 00:00:00 2001 From: AlexAplin Date: Fri, 15 Jun 2018 17:20:37 -0400 Subject: [PATCH 2/3] Safer extraction --- youtube_dl/extractor/iwara.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index 92f8bc969..38a0727f5 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -85,19 +85,26 @@ class IwaraIE(InfoExtractor): upload_date = unified_strdate(self._html_search_regex( r'作成日:(\d{4}-\d{2}-\d{2})', webpage, 'upload_date', fatal=False)) - uploader = get_element_by_class('username', webpage) + node_info_class = get_element_by_class('node-info', webpage) + if node_info_class is not None: + uploader = self._html_search_regex( + r']+', node_info_class, 'uploader', fatal=False) description = clean_html(get_element_by_class('field-type-text-with-summary', webpage)) comments_id = get_element_by_id('comments', webpage) - comment_count = int_or_none(self._html_search_regex( - r'([\d,]+)', get_element_by_class('title', comments_id), 'comment_count', fatal=False)) + if comments_id is not None: + comments_header = get_element_by_class('title', comments_id) + if comments_header is not None: + comment_count = int_or_none(self._html_search_regex( + r'([\d,]+)', get_element_by_class('title', comments_id), 'comment_count', fatal=False)) node_views_class = get_element_by_class('node-views', webpage) - like_count = str_to_int(self._html_search_regex( - r'glyphicon-heart[^>]+><\/i>\s+([\d,]+)', node_views_class, 'like_count', fatal=False)) - view_count = str_to_int(self._html_search_regex( - r'glyphicon-eye-open[^>]+><\/i>\s+([\d,]+)', node_views_class, 'view_count', fatal=False)) + if node_views_class is not None: + like_count = str_to_int(self._html_search_regex( + r'glyphicon-heart[^>]+>\s+([\d,]+)', node_views_class, 'like_count', fatal=False)) + view_count = str_to_int(self._html_search_regex( + r'glyphicon-eye-open[^>]+>\s+([\d,]+)', node_views_class, 'view_count', fatal=False)) formats = [] for a_format in video_data: From 3431b53c4c8930d19d26e9ca6f97d5b5724426e2 Mon Sep 17 00:00:00 2001 From: AlexAplin Date: Sun, 17 Jun 2018 00:53:49 -0400 Subject: [PATCH 3/3] Use variable --- youtube_dl/extractor/iwara.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index 38a0727f5..4038de15a 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -97,7 +97,7 @@ class IwaraIE(InfoExtractor): comments_header = get_element_by_class('title', comments_id) if comments_header is not None: comment_count = int_or_none(self._html_search_regex( - r'([\d,]+)', get_element_by_class('title', comments_id), 'comment_count', fatal=False)) + r'([\d,]+)', comments_header, 'comment_count', fatal=False)) node_views_class = get_element_by_class('node-views', webpage) if node_views_class is not None: