[egghead:lesson] detect dash and m3u8 URLs

2025-01-24 02:52:51 +08:00 · 2017-10-03 02:09:12 +02:00 · 2017-10-03 02:09:12 +02:00 · e9ed3309fb
commit e9ed3309fb
parent 9e71f88105
4 changed files with 71 additions and 30 deletions
--- a/1
+++ b/1
@ -231,3 +231,4 @@ John Dong
 Tatsuyuki Ishi
 Daniel Weber
 Kay Bouché
 mk-pmb
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -540,6 +540,7 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(parse_duration('87 Min.'), 5220)
        self.assertEqual(parse_duration('PT1H0.040S'), 3600.04)
        self.assertEqual(parse_duration('PT00H03M30SZ'), 210)
        self.assertEqual(parse_duration('P0Y0M0DT0H2M4.567S'), 124.567)
    def test_fix_xml_ampersands(self):
        self.assertEqual(
--- a/youtube_dl/extractor/egghead.py
+++ b/youtube_dl/extractor/egghead.py
@ -9,7 +9,46 @@ from ..utils import (
 )
-class EggheadCourseIE(InfoExtractor):
+class EggheadShared():
    def extract_lesson_metadata(self, lesson):
        info = {
            'title': lesson.get('title'),
            'description': lesson.get('summary'),
            'thumbnail': lesson.get('thumb_nail'),
            'timestamp': unified_timestamp(lesson.get('published_at')),
            'duration': int_or_none(lesson.get('duration')),
            'view_count': int_or_none(lesson.get('plays_count')),
            'tags': try_get(lesson, lambda x: x['tag_list'], list),
        }
        def find_id_and_dlurl():
            vid_id = lesson.get('wistia_id')
            if vid_id:
                return {'ie_key': 'Wistia', '_type': 'url_transparent',
                        'id': vid_id, 'url': 'wistia:' + vid_id}
            self.report_warning('Cannot find an proper ID, will use lesson name URL slug')
            vid_id = self._html_search_regex(
                r'^https?://egghead\.io/lessons/([A-Za-z0-9][A-Za-z0-9-]*)$',
                lesson.get('http_url'),
                'lesson name URL part as ID of last resort',
                group=1)
            mu = lesson.get('media_urls')
            if mu:
                src = mu.get('dash_url')
                if src:
                    return {'id': vid_id, 'formats': self._extract_mpd_formats(src, vid_id)}
                src = mu.get('hls_url')
                if src:
                    return {'id': vid_id, 'formats': self._extract_m3u8_formats(src, vid_id, entry_protocol='m3u8_native', m3u8_id='hls')}
            raise NotImplementedError('Unable to detect download URL')
        info.update(find_id_and_dlurl())
        return info
 class EggheadCourseIE(InfoExtractor, EggheadShared):
    IE_DESC = 'egghead.io course'
    IE_NAME = 'egghead:course'
    _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)'
@ -25,22 +64,16 @@ class EggheadCourseIE(InfoExtractor):
    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        course = self._download_json(
-            'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id)
+            'https://egghead.io/api/v1/series/' + playlist_id, playlist_id)
-
+        entries = [self.extract_lesson_metadata(lesson)
-        entries = [
+                   for lesson in course['lessons']]
            self.url_result(
                'wistia:%s' % lesson['wistia_id'], ie='Wistia',
                video_id=lesson['wistia_id'], video_title=lesson.get('title'))
            for lesson in course['lessons'] if lesson.get('wistia_id')]
        return self.playlist_result(
            entries, playlist_id, course.get('title'),
            course.get('description'))
-class EggheadLessonIE(InfoExtractor):
+class EggheadLessonIE(InfoExtractor, EggheadShared):
    IE_DESC = 'egghead.io lesson'
    IE_NAME = 'egghead:lesson'
    _VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)'
@ -65,20 +98,6 @@ class EggheadLessonIE(InfoExtractor):
    def _real_extract(self, url):
        lesson_id = self._match_id(url)
        lesson = self._download_json(
-            'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id)
+            'https://egghead.io/api/v1/lessons/' + lesson_id, lesson_id)
-
+        return self.extract_lesson_metadata(lesson)
        return {
            '_type': 'url_transparent',
            'ie_key': 'Wistia',
            'url': 'wistia:%s' % lesson['wistia_id'],
            'id': lesson['wistia_id'],
            'title': lesson.get('title'),
            'description': lesson.get('summary'),
            'thumbnail': lesson.get('thumb_nail'),
            'timestamp': unified_timestamp(lesson.get('published_at')),
            'duration': int_or_none(lesson.get('duration')),
            'view_count': int_or_none(lesson.get('plays_count')),
            'tags': try_get(lesson, lambda x: x['tag_list'], list),
        }
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -1830,15 +1830,30 @@ def parse_duration(s):
    s = s.strip()
    days, hours, mins, secs, ms = [None] * 5
-    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
+    m = re.match(r'''(?x)
        (?:
            (?:
                (?:
                  (?P<days>[0-9]+):
                )?
                (?P<hours>[0-9]+):
            )?
            (?P<mins>[0-9]+):
        )?
        (?P<secs>[0-9]+)
        (?P<ms>\.[0-9]+)?
        Z?$''', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        m = re.match(
-            r'''(?ix)(?:P?T)?
+            r'''(?ix)P?T?
                (?:0Y)?
                (?:0M)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
@ -1851,7 +1866,12 @@ def parse_duration(s):
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
-            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
+            m = re.match(r'''(?ix)
                (?:
                    (?P<hours>[0-9.]+)\s*(?:hours?)
                    |(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*
                )
                Z?$''', s)
            if m:
                hours, mins = m.groups()
            else: