From 10cab648ca999b8a82b5f95579d90131a823dbe5 Mon Sep 17 00:00:00 2001 From: BrutuZ Date: Thu, 12 Mar 2020 01:15:29 -0300 Subject: [PATCH 01/10] [hanime] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/hanime.py | 71 ++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 youtube_dl/extractor/hanime.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 64d1fa251..3fd5ae986 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -420,6 +420,7 @@ from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE from .groupon import GrouponIE from .hbo import HBOIE +from .hanime import HanimeIE from .hearthisat import HearThisAtIE from .heise import HeiseIE from .hellporno import HellPornoIE diff --git a/youtube_dl/extractor/hanime.py b/youtube_dl/extractor/hanime.py new file mode 100644 index 000000000..f06066e21 --- /dev/null +++ b/youtube_dl/extractor/hanime.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import clean_html + + +class HanimeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hanime\.tv/videos/hentai/(?P<id>.+)(?:\?playlist_id=\w+)?'
+ _TEST = { + 'url': 'https://hanime.tv/videos/hentai/kuroinu-1', + 'info_dict': { + 'id': '33964', + 'display_id': 'kuroinu-1', + 'title': 'Kuroinu 1', + 'description': 'md5:37d5bb20d4a0834bd147bc1bac588a0b', + 'thumbnail': r're:^https?://.*\.jpg$', + 'release_date': '20120127', + 'upload_date': '20140509', + 'creator': 'Magin Label', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': list, + 'censored': True, + 'ext': 'mp4', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_slug = self._match_id(url) + + webpage = self._download_webpage(url, video_slug) + page_json = self._html_search_regex(r'window.__NUXT__=(.+?);<\/script>', webpage, 'Inline JSON') + page_json = self._parse_json(page_json, video_slug).get('state').get('data').get('video').get('hentai_video') + api_json = self._download_json( + 'https://members.hanime.tv/api/v3/videos_manifests/%s' % video_slug, + video_slug, + 'API Call', headers={'X-Directive': 'api'} + ) + api_json = api_json.get('videos_manifest').get('servers')[0].get('streams') + + title = page_json.get('name') or api_json.get[0].get('video_stream_group_id') + tags = [t.get('text') for t in page_json.get('hentai_tags')] + + video_id = str(api_json[0].get('id')) + playlist_url = api_json[0].get('url') or api_json[1].get('url') + formats = self._extract_m3u8_formats(playlist_url, video_slug, 'mp4') + return { + 'id': video_id, + 'display_id': video_slug, + 'title': title, + 'description': clean_html(page_json.get('description')).strip(), + 'thumbnails': [ + {'id': 'Cover', 'url': page_json.get('cover_url')}, + {'id': 'Poster', 'url': page_json.get('poster_url')} + ], + 'release_date': page_json.get('released_at').replace('-', '')[:8], + 'upload_date': page_json.get('created_at').replace('-', '')[:8], + 'creator': page_json.get('brand'), + 'view_count': page_json.get('views'), + 'like_count': page_json.get('likes'), + 'dislike_count': 
page_json.get('dislikes'), + 'tags': tags, + 'censored': page_json.get('is_censored'), + 'formats': formats, + } From 99821416d491b8707bb993755f1f2472055be0b7 Mon Sep 17 00:00:00 2001 From: BrutuZ Date: Thu, 12 Mar 2020 02:51:37 -0300 Subject: [PATCH 02/10] Parse all resolutions --- youtube_dl/extractor/hanime.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/hanime.py b/youtube_dl/extractor/hanime.py index f06066e21..eea6a5511 100644 --- a/youtube_dl/extractor/hanime.py +++ b/youtube_dl/extractor/hanime.py @@ -41,15 +41,24 @@ class HanimeIE(InfoExtractor): 'https://members.hanime.tv/api/v3/videos_manifests/%s' % video_slug, video_slug, 'API Call', headers={'X-Directive': 'api'} - ) - api_json = api_json.get('videos_manifest').get('servers')[0].get('streams') + ).get('videos_manifest').get('servers')[0].get('streams') title = page_json.get('name') or api_json.get[0].get('video_stream_group_id') tags = [t.get('text') for t in page_json.get('hentai_tags')] video_id = str(api_json[0].get('id')) - playlist_url = api_json[0].get('url') or api_json[1].get('url') - formats = self._extract_m3u8_formats(playlist_url, video_slug, 'mp4') + formats = [] + for f in api_json: + format = { + 'width': int(f.get('width')), + 'height': int(f.get('height')), + 'filesize_approx': f.get('filesize_mbs') * 1000000, + 'ext': 'mp4', + 'url': f.get('url') or 'https://hanime.tv/api/v1/m3u8s/%s.m3u8' % f.get('id'), + } + formats.append(format) + formats.reverse() + return { 'id': video_id, 'display_id': video_slug, From 91a186a870394b781036cb6869d4367a358154cf Mon Sep 17 00:00:00 2001 From: BrutuZ Date: Thu, 12 Mar 2020 15:23:38 -0300 Subject: [PATCH 03/10] Added m3u8 to format list with https protocol Calculate TBR from Filesize and Duration, if provided Use parsing and conversion functions --- youtube_dl/extractor/hanime.py | 67 ++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 23 deletions(-) diff --git 
a/youtube_dl/extractor/hanime.py b/youtube_dl/extractor/hanime.py index eea6a5511..05c23f225 100644 --- a/youtube_dl/extractor/hanime.py +++ b/youtube_dl/extractor/hanime.py @@ -2,11 +2,20 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import clean_html +from ..utils import ( + clean_html, + parse_filesize, + float_or_none, + int_or_none, + unified_strdate, + str_or_none, + url_or_none, + parse_duration, +) class HanimeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hanime\.tv/videos/hentai/(?P<id>.+)(?:\?playlist_id=\w+)?' + _VALID_URL = r'https?://(?:www\.)?hanime\.tv/videos/hentai/(?P<id>.+)(?:\?playlist_id=.+)?' _TEST = { 'url': 'https://hanime.tv/videos/hentai/kuroinu-1', 'info_dict': { @@ -22,7 +31,7 @@ class HanimeIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'tags': list, - 'censored': True, + 'censored': 'True', 'ext': 'mp4', }, 'params': { @@ -40,41 +49,53 @@ class HanimeIE(InfoExtractor): api_json = self._download_json( 'https://members.hanime.tv/api/v3/videos_manifests/%s' % video_slug, video_slug, - 'API Call', headers={'X-Directive': 'api'} - ).get('videos_manifest').get('servers')[0].get('streams') + 'API Call', headers={'X-Directive': 'api'}).get('videos_manifest').get('servers')[0].get('streams') title = page_json.get('name') or api_json.get[0].get('video_stream_group_id') tags = [t.get('text') for t in page_json.get('hentai_tags')] - video_id = str(api_json[0].get('id')) formats = [] for f in api_json: - format = { - 'width': int(f.get('width')), - 'height': int(f.get('height')), - 'filesize_approx': f.get('filesize_mbs') * 1000000, + item_url = url_or_none(f.get('url')) or url_or_none('https://hanime.tv/api/v1/m3u8s/%s.m3u8' % f.get('id')) + format = [{ + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'filesize_approx': parse_filesize('%sMb' % f.get('filesize_mbs')), + 'protocol': 'm3u8', + 'format_id': 'mp4-%sp' % f.get('height'), + 'tbr': 
float_or_none(float_or_none(f.get('filesize_mbs'), invscale=8388), int_or_none(f.get('duration_in_ms'), 1000)), 'ext': 'mp4', - 'url': f.get('url') or 'https://hanime.tv/api/v1/m3u8s/%s.m3u8' % f.get('id'), - } - formats.append(format) + 'url': item_url, + }, { + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'protocol': 'https', + 'format_id': 'm3u8-%sp' % f.get('height'), + 'format_note': '~8-50.00Kib', + 'ext': 'm3u8', + 'url': item_url, + }] + for i in format: + formats.append(i) formats.reverse() return { - 'id': video_id, + 'id': str_or_none(api_json[0].get('id')), 'display_id': video_slug, 'title': title, 'description': clean_html(page_json.get('description')).strip(), 'thumbnails': [ - {'id': 'Cover', 'url': page_json.get('cover_url')}, - {'id': 'Poster', 'url': page_json.get('poster_url')} + {'preference': 0, 'id': 'Poster', 'url': url_or_none(page_json.get('poster_url'))}, + {'preference': 1, 'id': 'Cover', 'url': url_or_none(page_json.get('cover_url'))}, ], - 'release_date': page_json.get('released_at').replace('-', '')[:8], - 'upload_date': page_json.get('created_at').replace('-', '')[:8], - 'creator': page_json.get('brand'), - 'view_count': page_json.get('views'), - 'like_count': page_json.get('likes'), - 'dislike_count': page_json.get('dislikes'), + 'release_date': unified_strdate(page_json.get('released_at')), + 'upload_date': unified_strdate(page_json.get('created_at')), + 'creator': str_or_none(page_json.get('brand')), + 'view_count': int_or_none(page_json.get('views')), + 'like_count': int_or_none(page_json.get('likes')), + 'dislike_count': int_or_none(page_json.get('dislikes')), + 'duration': parse_duration('%sms' % f.get('duration_in_ms')), 'tags': tags, - 'censored': page_json.get('is_censored'), + 'censored': str_or_none(page_json.get('is_censored')), 'formats': formats, } From 80dc340b82281e5b1745e143c660d9d211b5ea07 Mon Sep 17 00:00:00 2001 From: BrutuZ Date: Thu, 12 Mar 2020 15:24:33 -0300 Subject: [PATCH 
04/10] Add ZeroDivisionError to exception list on int_or_none and float_or_none --- youtube_dl/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 38262bee4..80d824f81 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3524,7 +3524,7 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): return default try: return int(v) * invscale // scale - except (ValueError, TypeError): + except (ValueError, TypeError, ZeroDivisionError): return default @@ -3546,7 +3546,7 @@ def float_or_none(v, scale=1, invscale=1, default=None): return default try: return float(v) * invscale / scale - except (ValueError, TypeError): + except (ValueError, TypeError, ZeroDivisionError): return default From ef753bc223ee1b4490ba14c9839b77a16b050ede Mon Sep 17 00:00:00 2001 From: BrutuZ Date: Thu, 12 Mar 2020 20:44:45 -0300 Subject: [PATCH 05/10] Revert ZeroDivisionError exception --- youtube_dl/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 80d824f81..38262bee4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3524,7 +3524,7 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): return default try: return int(v) * invscale // scale - except (ValueError, TypeError, ZeroDivisionError): + except (ValueError, TypeError): return default @@ -3546,7 +3546,7 @@ def float_or_none(v, scale=1, invscale=1, default=None): return default try: return float(v) * invscale / scale - except (ValueError, TypeError, ZeroDivisionError): + except (ValueError, TypeError): return default From 9aaf20b0ed975f44aae283b887ce0041d11be970 Mon Sep 17 00:00:00 2001 From: BrutuZ Date: Thu, 12 Mar 2020 21:24:00 -0300 Subject: [PATCH 06/10] Requested changes --- youtube_dl/extractor/hanime.py | 58 +++++++++++++++------------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git 
a/youtube_dl/extractor/hanime.py b/youtube_dl/extractor/hanime.py index 05c23f225..8f1c81e9c 100644 --- a/youtube_dl/extractor/hanime.py +++ b/youtube_dl/extractor/hanime.py @@ -7,10 +7,13 @@ from ..utils import ( parse_filesize, float_or_none, int_or_none, + parse_iso8601, unified_strdate, str_or_none, - url_or_none, parse_duration, + sanitize_url, + compat_str, + try_get, ) @@ -26,12 +29,12 @@ class HanimeIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'release_date': '20120127', 'upload_date': '20140509', + 'timestamp': 1399624976, 'creator': 'Magin Label', 'view_count': int, 'like_count': int, 'dislike_count': int, 'tags': list, - 'censored': 'True', 'ext': 'mp4', }, 'params': { @@ -42,60 +45,51 @@ class HanimeIE(InfoExtractor): def _real_extract(self, url): video_slug = self._match_id(url) - - webpage = self._download_webpage(url, video_slug) - page_json = self._html_search_regex(r'window.__NUXT__=(.+?);<\/script>', webpage, 'Inline JSON') - page_json = self._parse_json(page_json, video_slug).get('state').get('data').get('video').get('hentai_video') + page_json = self._html_search_regex(r'