From 89474a43177ce79466e98308ec3ce715601f1cdb Mon Sep 17 00:00:00 2001
From: remitamine <remitamine@gmail.com>
Date: Wed, 9 Sep 2015 12:17:40 +0100
Subject: [PATCH 1/3] [extractor/generic] extract multiple formats for HTML5
 video tags

---
 youtube_dl/extractor/generic.py | 54 +++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index ec748ed9f..ebefca674 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1797,7 +1797,9 @@ class GenericIE(InfoExtractor):
                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
         if not found:
             # HTML5 video
-            found = re.findall(r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
+            found = re.findall(r'(?s)<(?:video|audio)[^>]*>(.*?)</(?:video|audio)>', webpage)
+            if found:
+                found = [re.findall(r'(?s)<source[^>]*src=["\']([^"\']+)["\'][^>]*>', video) for video in found]
         if not found:
             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
             found = re.search(
@@ -1820,33 +1822,41 @@ class GenericIE(InfoExtractor):
             raise UnsupportedError(url)
 
         entries = []
-        for video_url in found:
-            video_url = compat_urlparse.urljoin(url, video_url)
-            video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
-
-            # Sometimes, jwplayer extraction will result in a YouTube URL
-            if YoutubeIE.suitable(video_url):
-                entries.append(self.url_result(video_url, 'Youtube'))
-                continue
+        for video_urls in found:
+            if isinstance(video_urls, str):
+                video_urls = [video_urls]
 
+            video_id = compat_urllib_parse_unquote(os.path.basename(url))
             # here's a fun little line of code for you:
             video_id = os.path.splitext(video_id)[0]
 
-            ext = determine_ext(video_url)
-            if ext == 'smil':
+            formats = []
+            for video_url in video_urls:
+                video_url = compat_urlparse.urljoin(url, video_url)
+
+                # Sometimes, jwplayer extraction will result in a YouTube URL
+                if YoutubeIE.suitable(video_url):
+                    entries.append(self.url_result(video_url, 'Youtube'))
+                    continue
+
+                ext = determine_ext(video_url)
+                if ext == 'smil':
+                    entries.append({
+                        'id': video_id,
+                        'formats': self._extract_smil_formats(video_url, video_id),
+                        'uploader': video_uploader,
+                        'title': video_title,
+                        'age_limit': age_limit,
+                    })
+                elif ext == 'xspf':
+                    return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
+                else:
+                    formats.append({'url': video_url})
+
+            if formats:
                 entries.append({
                     'id': video_id,
-                    'formats': self._extract_smil_formats(video_url, video_id),
-                    'uploader': video_uploader,
-                    'title': video_title,
-                    'age_limit': age_limit,
-                })
-            elif ext == 'xspf':
-                return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
-            else:
-                entries.append({
-                    'id': video_id,
-                    'url': video_url,
+                    'formats': formats,
                     'uploader': video_uploader,
                     'title': video_title,
                     'age_limit': age_limit,

From 355e63eba1fba34786da6da90aec49e30d2cd107 Mon Sep 17 00:00:00 2001
From: remitamine <remitamine@gmail.com>
Date: Wed, 9 Sep 2015 13:27:12 +0100
Subject: [PATCH 2/3] [extractor/generic] use compat_str

---
 youtube_dl/extractor/generic.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index ebefca674..08c5fdb2c 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -12,6 +12,7 @@ from ..compat import (
     compat_urllib_request,
     compat_urlparse,
     compat_xml_parse_error,
+    compat_str,
 )
 from ..utils import (
     determine_ext,
@@ -1823,7 +1824,7 @@ class GenericIE(InfoExtractor):
 
         entries = []
         for video_urls in found:
-            if isinstance(video_urls, str):
+            if isinstance(video_urls, compat_str):
                 video_urls = [video_urls]
 
             video_id = compat_urllib_parse_unquote(os.path.basename(url))

From 60d563393b98e942a6976821f9f9ff47c549d77d Mon Sep 17 00:00:00 2001
From: remitamine <remitamine@gmail.com>
Date: Thu, 10 Sep 2015 01:50:19 +0100
Subject: [PATCH 3/3] [extractor/generic] add support for HTML5
 subtitles,inline src video and poster extraction

---
 youtube_dl/extractor/generic.py | 152 +++++++++++++++++++++++++-------
 1 file changed, 118 insertions(+), 34 deletions(-)

diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 08c5fdb2c..0dbf5de42 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1013,7 +1013,40 @@ class GenericIE(InfoExtractor):
                 'ext': 'mp4',
                 'title': 'cinemasnob',
             },
-        }
+        },
+        # HTML5 Videos with multiple formats
+        {
+            'url': 'https://commons.wikimedia.org/wiki/Big_Buck_Bunny',
+            'info_dict': {
+                'id': 'Big Buck Bunny - Wikimedia Commons',
+                'title': 'Big Buck Bunny - Wikimedia Commons',
+            },
+            'playlist': [{
+                'md5': '78467f74f821d12f22843647a9017e1a',
+                'info_dict': {
+                    'id': 'Big_Buck_Bunny_small',
+                    'ext': 'webm',
+                    'title': 'Big_Buck_Bunny_small (1)',
+                    'uploader': 'commons.wikimedia.org',
+                },
+            }, {
+                'md5': 'efab0fd5dfe10767df1ff5d923adc1d5',
+                'info_dict': {
+                    'id': 'Big_Buck_Bunny_medium.ogv.480p',
+                    'ext': 'webm',
+                    'title': 'Big_Buck_Bunny_medium.ogv.480p (2)',
+                    'uploader': 'commons.wikimedia.org',
+                },
+            }, {
+                'md5': '57495cddd8213e107e9227ed738bd26b',
+                'info_dict': {
+                    'id': 'Big_Buck_Bunny_8_seconds_bird_clip.ogv.720p',
+                    'ext': 'webm',
+                    'title': 'Big_Buck_Bunny_8_seconds_bird_clip.ogv.720p (3)',
+                    'uploader': 'commons.wikimedia.org',
+                },
+            }],
+        },
     ]
 
     def report_following_redirect(self, new_url):
@@ -1797,10 +1830,50 @@ class GenericIE(InfoExtractor):
             if m_video_type is not None:
                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
         if not found:
-            # HTML5 video
-            found = re.findall(r'(?s)<(?:video|audio)[^>]*>(.*?)</(?:video|audio)>', webpage)
+            # HTML5 media(video or audio)
+            found = re.findall(r'(?s)<(video|audio)([^>]*)>(.*?)</(?:video|audio)>', webpage)
             if found:
-                found = [re.findall(r'(?s)<source[^>]*src=["\']([^"\']+)["\'][^>]*>', video) for video in found]
+                def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'):
+                    attributes = re.findall(attributes_regex, attributes_str)
+                    attributes_dict = {}
+                    if attributes:
+                        attributes_dict = {attribute_name: attribute_value for (attribute_name, attribute_value) in attributes}
+                    return attributes_dict
+
+                def absolute_url(video_url):
+                    return compat_urlparse.urljoin(url, video_url)
+
+                entries = []
+                for (media_type, media_attributes, media_content) in found:
+                    video_info = {'formats': [],'subtitles': {}}
+                    if media_attributes:
+                        media_attributes = extract_attributes(media_attributes)
+                        src = media_attributes.get('src')
+                        if src:
+                            video_info['formats'].append({'url': absolute_url(src)})
+                        video_info['thumbnail'] = media_attributes.get('poster')
+                    if media_content:
+                        tags = re.findall(r'(?s)<(source|track)([^>]*)>', media_content)
+                        for (tag_type, tag_attributes) in tags:
+                            if tag_type == 'source':
+                                format_info = {}
+                                source_attributes = extract_attributes(tag_attributes, r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+|[^"\']+codecs\s*=\s*["\'][^"\']+["\'])["\']')
+                                src = source_attributes.get('src')
+                                if src:
+                                    video_info['formats'].append({'url': absolute_url(src)})
+                                # TODO: extract mime and codecs info
+                            if tag_type == 'track':
+                                track_attributes = extract_attributes(tag_attributes)
+                                kind = track_attributes.get('kind')
+                                if not kind or kind == 'subtitles':
+                                    src = track_attributes.get('src')
+                                    if src:
+                                        key = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
+                                        video_info['subtitles'][key] = [{'url': absolute_url(src), 'ext': determine_ext(src)}]
+                    if video_info['formats']:
+                        entries.append(video_info)
+                if entries:
+                    found = entries
         if not found:
             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
             found = re.search(
@@ -1822,46 +1895,55 @@ class GenericIE(InfoExtractor):
         if not found:
             raise UnsupportedError(url)
 
+        def extract_filename_from_url(url):
+            filename = compat_urllib_parse_unquote(os.path.basename(url))
+            # here's a fun little line of code for you:
+            filename = os.path.splitext(filename)[0]
+            return filename
+
         entries = []
         for video_urls in found:
-            if isinstance(video_urls, compat_str):
-                video_urls = [video_urls]
+            video_info = {'formats': []}
+            if isinstance(video_urls, dict):
+                video_info = video_urls
+                video_id = extract_filename_from_url(video_info['formats'][0]['url'])
+            else:
+                if isinstance(video_urls, compat_str):
+                    video_urls = [video_urls]
+                video_id = extract_filename_from_url(video_urls[0])
 
-            video_id = compat_urllib_parse_unquote(os.path.basename(url))
-            # here's a fun little line of code for you:
-            video_id = os.path.splitext(video_id)[0]
+                for video_url in video_urls:
+                    video_url = compat_urlparse.urljoin(url, video_url)
 
-            formats = []
-            for video_url in video_urls:
-                video_url = compat_urlparse.urljoin(url, video_url)
+                    # Sometimes, jwplayer extraction will result in a YouTube URL
+                    if YoutubeIE.suitable(video_url):
+                        entries.append(self.url_result(video_url, 'Youtube'))
+                        continue
 
-                # Sometimes, jwplayer extraction will result in a YouTube URL
-                if YoutubeIE.suitable(video_url):
-                    entries.append(self.url_result(video_url, 'Youtube'))
-                    continue
+                    ext = determine_ext(video_url)
+                    if ext == 'smil':
+                        entries.append({
+                            'id': video_id,
+                            'formats': self._extract_smil_formats(video_url, video_id),
+                            'uploader': video_uploader,
+                            'title': video_title,
+                            'age_limit': age_limit,
+                        })
+                    elif ext == 'xspf':
+                        return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
+                    else:
+                        video_info['formats'].append({'url': video_url})
 
-                ext = determine_ext(video_url)
-                if ext == 'smil':
-                    entries.append({
-                        'id': video_id,
-                        'formats': self._extract_smil_formats(video_url, video_id),
-                        'uploader': video_uploader,
-                        'title': video_title,
-                        'age_limit': age_limit,
-                    })
-                elif ext == 'xspf':
-                    return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
-                else:
-                    formats.append({'url': video_url})
-
-            if formats:
-                entries.append({
+            if video_info['formats']:
+                if len(video_info['formats']) > 1:
+                    self._sort_formats(video_info['formats'])
+                video_info.update({
                     'id': video_id,
-                    'formats': formats,
                     'uploader': video_uploader,
-                    'title': video_title,
+                    'title': video_id,
                     'age_limit': age_limit,
                 })
+                entries.append(video_info)
 
         if len(entries) == 1:
             return entries[0]
@@ -1873,4 +1955,6 @@ class GenericIE(InfoExtractor):
             return {
                 '_type': 'playlist',
                 'entries': entries,
+                'id': video_title,
+                'title': video_title,
             }