[egghead:lesson] detect dash and m3u8 URLs

2025-01-24 02:42:54 +08:00 · 2017-10-03 02:09:12 +02:00 · 2017-10-03 02:09:12 +02:00 · e9ed3309fb
commit e9ed3309fb
parent 9e71f88105
4 changed files with 71 additions and 30 deletions
--- a/1
+++ b/1
@@ -231,3 +231,4 @@ John Dong
 Tatsuyuki Ishi
 Daniel Weber
 Kay Bouché
+mk-pmb
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -540,6 +540,7 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(parse_duration('87 Min.'), 5220)
        self.assertEqual(parse_duration('PT1H0.040S'), 3600.04)
        self.assertEqual(parse_duration('PT00H03M30SZ'), 210)
+        self.assertEqual(parse_duration('P0Y0M0DT0H2M4.567S'), 124.567)

    def test_fix_xml_ampersands(self):
        self.assertEqual(
--- a/youtube_dl/extractor/egghead.py
+++ b/youtube_dl/extractor/egghead.py
@@ -9,7 +9,46 @@ from ..utils import (
 )


-class EggheadCourseIE(InfoExtractor):
+class EggheadShared():
+    def extract_lesson_metadata(self, lesson):
+        info = {
+            'title': lesson.get('title'),
+            'description': lesson.get('summary'),
+            'thumbnail': lesson.get('thumb_nail'),
+            'timestamp': unified_timestamp(lesson.get('published_at')),
+            'duration': int_or_none(lesson.get('duration')),
+            'view_count': int_or_none(lesson.get('plays_count')),
+            'tags': try_get(lesson, lambda x: x['tag_list'], list),
+        }
+
+        def find_id_and_dlurl():
+            vid_id = lesson.get('wistia_id')
+            if vid_id:
+                return {'ie_key': 'Wistia', '_type': 'url_transparent',
+                        'id': vid_id, 'url': 'wistia:' + vid_id}
+
+            self.report_warning('Cannot find an proper ID, will use lesson name URL slug')
+            vid_id = self._html_search_regex(
+                r'^https?://egghead\.io/lessons/([A-Za-z0-9][A-Za-z0-9-]*)$',
+                lesson.get('http_url'),
+                'lesson name URL part as ID of last resort',
+                group=1)
+
+            mu = lesson.get('media_urls')
+            if mu:
+                src = mu.get('dash_url')
+                if src:
+                    return {'id': vid_id, 'formats': self._extract_mpd_formats(src, vid_id)}
+                src = mu.get('hls_url')
+                if src:
+                    return {'id': vid_id, 'formats': self._extract_m3u8_formats(src, vid_id, entry_protocol='m3u8_native', m3u8_id='hls')}
+            raise NotImplementedError('Unable to detect download URL')
+        info.update(find_id_and_dlurl())
+
+        return info
+
+
+class EggheadCourseIE(InfoExtractor, EggheadShared):
    IE_DESC = 'egghead.io course'
    IE_NAME = 'egghead:course'
    _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)'
@@ -25,22 +64,16 @@ class EggheadCourseIE(InfoExtractor):

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
-
        course = self._download_json(
-            'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id)
-
-        entries = [
-            self.url_result(
-                'wistia:%s' % lesson['wistia_id'], ie='Wistia',
-                video_id=lesson['wistia_id'], video_title=lesson.get('title'))
-            for lesson in course['lessons'] if lesson.get('wistia_id')]
-
+            'https://egghead.io/api/v1/series/' + playlist_id, playlist_id)
+        entries = [self.extract_lesson_metadata(lesson)
+                   for lesson in course['lessons']]
        return self.playlist_result(
            entries, playlist_id, course.get('title'),
            course.get('description'))


-class EggheadLessonIE(InfoExtractor):
+class EggheadLessonIE(InfoExtractor, EggheadShared):
    IE_DESC = 'egghead.io lesson'
    IE_NAME = 'egghead:lesson'
    _VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)'
@@ -65,20 +98,6 @@ class EggheadLessonIE(InfoExtractor):

    def _real_extract(self, url):
        lesson_id = self._match_id(url)
-
        lesson = self._download_json(
-            'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id)
-
-        return {
-            '_type': 'url_transparent',
-            'ie_key': 'Wistia',
-            'url': 'wistia:%s' % lesson['wistia_id'],
-            'id': lesson['wistia_id'],
-            'title': lesson.get('title'),
-            'description': lesson.get('summary'),
-            'thumbnail': lesson.get('thumb_nail'),
-            'timestamp': unified_timestamp(lesson.get('published_at')),
-            'duration': int_or_none(lesson.get('duration')),
-            'view_count': int_or_none(lesson.get('plays_count')),
-            'tags': try_get(lesson, lambda x: x['tag_list'], list),
-        }
+            'https://egghead.io/api/v1/lessons/' + lesson_id, lesson_id)
+        return self.extract_lesson_metadata(lesson)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1830,15 +1830,30 @@ def parse_duration(s):
    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
-    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
+    m = re.match(r'''(?x)
+        (?:
+            (?:
+                (?:
+                  (?P<days>[0-9]+):
+                )?
+                (?P<hours>[0-9]+):
+            )?
+            (?P<mins>[0-9]+):
+        )?
+        (?P<secs>[0-9]+)
+        (?P<ms>\.[0-9]+)?
+        Z?$''', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        m = re.match(
-            r'''(?ix)(?:P?T)?
+            r'''(?ix)P?T?
+                (?:0Y)?
+                (?:0M)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
+                T?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
@@ -1851,7 +1866,12 @@ def parse_duration(s):
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
-            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
+            m = re.match(r'''(?ix)
+                (?:
+                    (?P<hours>[0-9.]+)\s*(?:hours?)
+                    |(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*
+                )
+                Z?$''', s)
            if m:
                hours, mins = m.groups()
            else: