[minhateca] Add extractor (Fixes #4094)

2024-12-21 18:12:52 +08:00 · 2014-12-04 17:02:05 +01:00 · 2014-12-04 17:02:05 +01:00 · 4349c07dd7
commit 4349c07dd7
parent 9776bc7f57
4 changed files with 78 additions and 2 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -376,6 +376,7 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(parse_filesize('2 MiB'), 2097152)
        self.assertEqual(parse_filesize('5 GB'), 5000000000)
        self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
        self.assertEqual(parse_filesize('1,24 KB'), 1240)
 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -217,6 +217,7 @@ from .mdr import MDRIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .mgoon import MgoonIE
 from .minhateca import MinhatecaIE
 from .ministrygrid import MinistryGridIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mitele import MiTeleIE
--- a/youtube_dl/extractor/minhateca.py
+++ b/youtube_dl/extractor/minhateca.py
@@ -0,0 +1,71 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..compat import (
    compat_urllib_parse,
    compat_urllib_request,
 )
 from ..utils import (
    int_or_none,
    parse_filesize,
 )
 class MinhatecaIE(InfoExtractor):
    """Information extractor for files hosted on minhateca.com.br.

    The file page is downloaded first; the request verification token found
    in it is then POSTed together with the file id to the site's
    ``License/Download`` action, whose JSON answer carries the media URL in
    its ``redirectUrl`` field.
    """

    # The numeric id is the run of digits between the last comma of the path
    # and the following dot (the start of the file extension).
    _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P<id>[0-9]+)\.'
    _TEST = {
        'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)',
        'info_dict': {
            'id': '125848331',
            'ext': 'mp4',
            'title': 'youtube-dl test video',
            # Raw string: same value as before, but without relying on the
            # invalid "\." escape sequence in a normal string literal.
            'thumbnail': r're:^https?://.*\.jpg$',
            'filesize_approx': 1530000,
            'duration': 9,
            'view_count': int,
        }
    }

    def _real_extract(self, url):
        """Return the info dict for the file page at ``url``.

        The returned dict has the keys ``id``, ``url``, ``title``, ``ext``,
        ``filesize_approx``, ``duration``, ``view_count`` and ``thumbnail``.
        The request token, the JSON metadata and the page title are looked
        up with the default (fatal) settings; file size, duration and view
        count are searched with ``fatal=False``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The download action only answers when the anti-forgery token taken
        # from the page is sent along with the file id.
        token = self._html_search_regex(
            r'<input name="__RequestVerificationToken".*?value="([^"]+)"',
            webpage, 'request token')
        token_data = [
            ('fileId', video_id),
            ('__RequestVerificationToken', token),
        ]
        # urlencode() yields text, but urllib.request on Python 3 only
        # accepts bytes as POST data. The percent-encoded form is plain
        # ASCII, so encoding it is lossless (and harmless on Python 2).
        post_data = compat_urllib_parse.urlencode(token_data).encode('ascii')
        req = compat_urllib_request.Request(
            'http://minhateca.com.br/action/License/Download',
            data=post_data)
        req.add_header('Content-Type', 'application/x-www-form-urlencoded')
        data = self._download_json(
            req, video_id, note='Downloading metadata')
        video_url = data['redirectUrl']

        # The <h1> holds "<title>.<ext>"; split on the last dot only.
        title_str = self._html_search_regex(
            r'<h1.*?>(.*?)</h1>', webpage, 'title')
        title, dot, ext = title_str.rpartition('.')
        if not dot:
            # No dot at all: rpartition() returns ('', '', title_str), which
            # would give an empty title and the whole title as extension.
            # Keep the full string as title and leave the extension unset.
            title, ext = title_str, None

        filesize_approx = parse_filesize(self._html_search_regex(
            r'<p class="fileSize">(.*?)</p>',
            webpage, 'file size approximation', fatal=False))
        # The character classes accept both the "fileLength" and the
        # "fileLenght" spelling of the CSS class.
        duration = int_or_none(self._html_search_regex(
            r'(?s)<p class="fileLeng[ht][th]">.*?([0-9]+)\s*s',
            webpage, 'duration', fatal=False))
        view_count = int_or_none(self._html_search_regex(
            r'<p class="downloadsCounter">([0-9]+)</p>',
            webpage, 'view count', fatal=False))

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': ext,
            'filesize_approx': filesize_approx,
            'duration': duration,
            'view_count': view_count,
            'thumbnail': self._og_search_thumbnail(webpage),
        }
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1090,11 +1090,14 @@ def parse_filesize(s):
    }
    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
-    m = re.match(r'(?P<num>[0-9]+(?:\.[0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None
-    return int(float(m.group('num')) * _UNIT_TABLE[m.group('unit')])
+    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
 def get_term_width():