[fragment,hls,f4m,dash,ism] improve fragment downloading

- resume immediately - no need to concatenate segments and decrypt them on every resume - no need to save temp files for segments and for hls downloader: - no need to download keys for segments that already downloaded
2024-12-22 19:52:53 +08:00 · 2016-06-28 18:07:50 +01:00 · 2016-06-28 18:07:50 +01:00 · 75a2485407
commit 75a2485407
parent 58f6ab72ed
7 changed files with 111 additions and 123 deletions
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@ -327,21 +327,22 @@ class FileDownloader(object):
            os.path.exists(encodeFilename(filename))
        )
-        continuedl_and_exists = (
+        if not hasattr(filename, 'write'):
-            self.params.get('continuedl', True) and
+            continuedl_and_exists = (
-            os.path.isfile(encodeFilename(filename)) and
+                self.params.get('continuedl', True) and
-            not self.params.get('nopart', False)
+                os.path.isfile(encodeFilename(filename)) and
-        )
+                not self.params.get('nopart', False)
            )
-        # Check file already present
+            # Check file already present
-        if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists):
+            if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists):
-            self.report_file_already_downloaded(filename)
+                self.report_file_already_downloaded(filename)
-            self._hook_progress({
+                self._hook_progress({
-                'filename': filename,
+                    'filename': filename,
-                'status': 'finished',
+                    'status': 'finished',
-                'total_bytes': os.path.getsize(encodeFilename(filename)),
+                    'total_bytes': os.path.getsize(encodeFilename(filename)),
-            })
+                })
-            return True
+                return True
        min_sleep_interval = self.params.get('sleep_interval')
        if min_sleep_interval:
--- a/youtube_dl/downloader/dash.py
+++ b/youtube_dl/downloader/dash.py
@ -1,13 +1,7 @@
 from __future__ import unicode_literals
 import os
 from .fragment import FragmentFD
 from ..compat import compat_urllib_error
 from ..utils import (
    sanitize_open,
    encodeFilename,
 )
 class DashSegmentsFD(FragmentFD):
@ -28,31 +22,24 @@ class DashSegmentsFD(FragmentFD):
        self._prepare_and_start_frag_download(ctx)
        segments_filenames = []
        fragment_retries = self.params.get('fragment_retries', 0)
        skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
-        def process_segment(segment, tmp_filename, num):
+        frag_index = 0
-            segment_url = segment['url']
+        for i, segment in enumerate(segments):
-            segment_name = 'Frag%d' % num
+            frag_index += 1
-            target_filename = '%s-%s' % (tmp_filename, segment_name)
+            if frag_index <= ctx['frag_index']:
                continue
            # In DASH, the first segment contains necessary headers to
            # generate a valid MP4 file, so always abort for the first segment
-            fatal = num == 0 or not skip_unavailable_fragments
+            fatal = i == 0 or not skip_unavailable_fragments
            count = 0
            while count <= fragment_retries:
                try:
-                    success = ctx['dl'].download(target_filename, {
+                    success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
                        'url': segment_url,
                        'http_headers': info_dict.get('http_headers'),
                    })
                    if not success:
                        return False
-                    down, target_sanitized = sanitize_open(target_filename, 'rb')
+                    self._append_fragment(ctx, frag_content)
                    ctx['dest_stream'].write(down.read())
                    down.close()
                    segments_filenames.append(target_sanitized)
                    break
                except compat_urllib_error.HTTPError as err:
                    # YouTube may often return 404 HTTP error for a fragment causing the
@ -63,22 +50,14 @@ class DashSegmentsFD(FragmentFD):
                    # HTTP error.
                    count += 1
                    if count <= fragment_retries:
-                        self.report_retry_fragment(err, segment_name, count, fragment_retries)
+                        self.report_retry_fragment(err, frag_index, count, fragment_retries)
            if count > fragment_retries:
                if not fatal:
-                    self.report_skip_fragment(segment_name)
+                    self.report_skip_fragment(frag_index)
-                    return True
+                    continue
                self.report_error('giving up after %s fragment retries' % fragment_retries)
                return False
            return True
        for i, segment in enumerate(segments):
            if not process_segment(segment, ctx['tmpfilename'], i):
                return False
        self._finish_frag_download(ctx)
        for segment_file in segments_filenames:
            os.remove(encodeFilename(segment_file))
        return True
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@ -3,7 +3,6 @@ from __future__ import division, unicode_literals
 import base64
 import io
 import itertools
 import os
 import time
 from .fragment import FragmentFD
@ -16,9 +15,7 @@ from ..compat import (
    compat_struct_unpack,
 )
 from ..utils import (
    encodeFilename,
    fix_xml_ampersands,
    sanitize_open,
    xpath_text,
 )
@ -366,17 +363,21 @@ class F4mFD(FragmentFD):
        dest_stream = ctx['dest_stream']
-        write_flv_header(dest_stream)
+        if ctx['complete_frags_downloaded_bytes'] == 0:
-        if not live:
+            write_flv_header(dest_stream)
-            write_metadata_tag(dest_stream, metadata)
+            if not live:
                write_metadata_tag(dest_stream, metadata)
        base_url_parsed = compat_urllib_parse_urlparse(base_url)
        self._start_frag_download(ctx)
-        frags_filenames = []
+        frag_index = 0
        while fragments_list:
            seg_i, frag_i = fragments_list.pop(0)
            frag_index += 1
            if frag_index <= ctx['frag_index']:
                continue
            name = 'Seg%d-Frag%d' % (seg_i, frag_i)
            query = []
            if base_url_parsed.query:
@ -386,17 +387,10 @@ class F4mFD(FragmentFD):
            if info_dict.get('extra_param_to_segment_url'):
                query.append(info_dict['extra_param_to_segment_url'])
            url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query))
            frag_filename = '%s-%s' % (ctx['tmpfilename'], name)
            try:
-                success = ctx['dl'].download(frag_filename, {
+                success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict)
                    'url': url_parsed.geturl(),
                    'http_headers': info_dict.get('http_headers'),
                })
                if not success:
                    return False
                (down, frag_sanitized) = sanitize_open(frag_filename, 'rb')
                down_data = down.read()
                down.close()
                reader = FlvReader(down_data)
                while True:
                    try:
@ -411,12 +405,8 @@ class F4mFD(FragmentFD):
                            break
                        raise
                    if box_type == b'mdat':
-                        dest_stream.write(box_data)
+                        self._append_fragment(ctx, box_data)
                        break
                if live:
                    os.remove(encodeFilename(frag_sanitized))
                else:
                    frags_filenames.append(frag_sanitized)
            except (compat_urllib_error.HTTPError, ) as err:
                if live and (err.code == 404 or err.code == 410):
                    # We didn't keep up with the live window. Continue
@ -436,7 +426,4 @@ class F4mFD(FragmentFD):
        self._finish_frag_download(ctx)
        for frag_file in frags_filenames:
            os.remove(encodeFilename(frag_file))
        return True
--- a/youtube_dl/downloader/fragment.py
+++ b/youtube_dl/downloader/fragment.py
@ -2,6 +2,7 @@ from __future__ import division, unicode_literals
 import os
 import time
 import io
 from .common import FileDownloader
 from .http import HttpFD
@ -10,6 +11,7 @@ from ..utils import (
    encodeFilename,
    sanitize_open,
    sanitized_Request,
    compat_str,
 )
@ -30,13 +32,13 @@ class FragmentFD(FileDownloader):
                        Skip unavailable fragments (DASH and hlsnative only)
    """
-    def report_retry_fragment(self, err, fragment_name, count, retries):
+    def report_retry_fragment(self, err, frag_index, count, retries):
        self.to_screen(
-            '[download] Got server HTTP error: %s. Retrying fragment %s (attempt %d of %s)...'
+            '[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s)...'
-            % (error_to_compat_str(err), fragment_name, count, self.format_retries(retries)))
+            % (error_to_compat_str(err), frag_index, count, self.format_retries(retries)))
-    def report_skip_fragment(self, fragment_name):
+    def report_skip_fragment(self, frag_index):
-        self.to_screen('[download] Skipping fragment %s...' % fragment_name)
+        self.to_screen('[download] Skipping fragment %d...' % frag_index)
    def _prepare_url(self, info_dict, url):
        headers = info_dict.get('http_headers')
@ -46,6 +48,25 @@ class FragmentFD(FileDownloader):
        self._prepare_frag_download(ctx)
        self._start_frag_download(ctx)
    def _download_fragment(self, ctx, frag_url, info_dict, headers=None):
        down = io.BytesIO()
        success = ctx['dl'].download(down, {
            'url': frag_url,
            'http_headers': headers or info_dict.get('http_headers'),
        })
        if not success:
            return False, None
        frag_content = down.getvalue()
        down.close()
        return True, frag_content
    def _append_fragment(self, ctx, frag_content):
        ctx['dest_stream'].write(frag_content)
        if not (ctx.get('live') or ctx['tmpfilename'] == '-'):
            frag_index_stream, _ = sanitize_open(ctx['tmpfilename'] + '.fragindex', 'w')
            frag_index_stream.write(compat_str(ctx['frag_index']))
            frag_index_stream.close()
    def _prepare_frag_download(self, ctx):
        if 'live' not in ctx:
            ctx['live'] = False
@ -66,11 +87,26 @@ class FragmentFD(FileDownloader):
            }
        )
        tmpfilename = self.temp_name(ctx['filename'])
-        dest_stream, tmpfilename = sanitize_open(tmpfilename, 'wb')
+        open_mode = 'wb'
        resume_len = 0
        frag_index = 0
        # Establish possible resume length
        if os.path.isfile(encodeFilename(tmpfilename)):
            open_mode = 'ab'
            resume_len = os.path.getsize(encodeFilename(tmpfilename))
            if os.path.isfile(encodeFilename(tmpfilename + '.fragindex')):
                frag_index_stream, _ = sanitize_open(tmpfilename + '.fragindex', 'r')
                frag_index = int(frag_index_stream.read())
                frag_index_stream.close()
        dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode)
        ctx.update({
            'dl': dl,
            'dest_stream': dest_stream,
            'tmpfilename': tmpfilename,
            'frag_index': frag_index,
            # Total complete fragments downloaded so far in bytes
            'complete_frags_downloaded_bytes': resume_len,
        })
    def _start_frag_download(self, ctx):
@ -79,8 +115,8 @@ class FragmentFD(FileDownloader):
        # hook
        state = {
            'status': 'downloading',
-            'downloaded_bytes': 0,
+            'downloaded_bytes': ctx['complete_frags_downloaded_bytes'],
-            'frag_index': 0,
+            'frag_index': ctx['frag_index'],
            'frag_count': total_frags,
            'filename': ctx['filename'],
            'tmpfilename': ctx['tmpfilename'],
@ -89,8 +125,6 @@ class FragmentFD(FileDownloader):
        start = time.time()
        ctx.update({
            'started': start,
            # Total complete fragments downloaded so far in bytes
            'complete_frags_downloaded_bytes': 0,
            # Amount of fragment's bytes downloaded by the time of the previous
            # frag progress hook invocation
            'prev_frag_downloaded_bytes': 0,
@ -111,6 +145,7 @@ class FragmentFD(FileDownloader):
            if s['status'] == 'finished':
                state['frag_index'] += 1
                ctx['frag_index'] = state['frag_index']
                state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes']
                ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes']
                ctx['prev_frag_downloaded_bytes'] = 0
@ -132,6 +167,8 @@ class FragmentFD(FileDownloader):
    def _finish_frag_download(self, ctx):
        ctx['dest_stream'].close()
        if os.path.isfile(encodeFilename(ctx['tmpfilename'] + '.fragindex')):
            os.remove(encodeFilename(ctx['tmpfilename'] + '.fragindex'))
        elapsed = time.time() - ctx['started']
        self.try_rename(ctx['tmpfilename'], ctx['filename'])
        fsize = os.path.getsize(encodeFilename(ctx['filename']))
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 import os.path
 import re
 import binascii
 try:
@ -18,8 +17,6 @@ from ..compat import (
    compat_struct_pack,
 )
 from ..utils import (
    encodeFilename,
    sanitize_open,
    parse_m3u8_attributes,
    update_url_query,
 )
@ -103,17 +100,18 @@ class HlsFD(FragmentFD):
        media_sequence = 0
        decrypt_info = {'METHOD': 'NONE'}
        byte_range = {}
-        frags_filenames = []
+        frag_index = 0
        for line in s.splitlines():
            line = line.strip()
            if line:
                if not line.startswith('#'):
                    frag_index += 1
                    if frag_index <= ctx['frag_index']:
                        continue
                    frag_url = (
                        line
                        if re.match(r'^https?://', line)
                        else compat_urlparse.urljoin(man_url, line))
                    frag_name = 'Frag%d' % i
                    frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name)
                    if extra_query:
                        frag_url = update_url_query(frag_url, extra_query)
                    count = 0
@ -122,15 +120,10 @@ class HlsFD(FragmentFD):
                        headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'])
                    while count <= fragment_retries:
                        try:
-                            success = ctx['dl'].download(frag_filename, {
+                            success, frag_content = self._download_fragment(
-                                'url': frag_url,
+                                ctx, frag_url, info_dict, headers)
                                'http_headers': headers,
                            })
                            if not success:
                                return False
                            down, frag_sanitized = sanitize_open(frag_filename, 'rb')
                            frag_content = down.read()
                            down.close()
                            break
                        except compat_urllib_error.HTTPError as err:
                            # Unavailable (possibly temporary) fragments may be served.
@ -139,28 +132,29 @@ class HlsFD(FragmentFD):
                            # https://github.com/rg3/youtube-dl/issues/10448).
                            count += 1
                            if count <= fragment_retries:
-                                self.report_retry_fragment(err, frag_name, count, fragment_retries)
+                                self.report_retry_fragment(err, frag_index, count, fragment_retries)
                    if count > fragment_retries:
                        if skip_unavailable_fragments:
                            i += 1
                            media_sequence += 1
-                            self.report_skip_fragment(frag_name)
+                            self.report_skip_fragment(frag_index)
                            continue
                        self.report_error(
                            'giving up after %s fragment retries' % fragment_retries)
                        return False
                    if decrypt_info['METHOD'] == 'AES-128':
                        iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
                        decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(decrypt_info['URI']).read()
                        frag_content = AES.new(
                            decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
-                    ctx['dest_stream'].write(frag_content)
+                    self._append_fragment(ctx, frag_content)
                    frags_filenames.append(frag_sanitized)
                    # We only download the first fragment during the test
                    if test:
                        break
                    i += 1
                    media_sequence += 1
                elif line.startswith('#EXT-X-KEY'):
                    decrypt_url = decrypt_info.get('URI')
                    decrypt_info = parse_m3u8_attributes(line[11:])
                    if decrypt_info['METHOD'] == 'AES-128':
                        if 'IV' in decrypt_info:
@ -170,7 +164,8 @@ class HlsFD(FragmentFD):
                                man_url, decrypt_info['URI'])
                        if extra_query:
                            decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
-                        decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read()
+                        if decrypt_url != decrypt_info['URI']:
                            decrypt_info['KEY'] = None
                elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
                    media_sequence = int(line[22:])
                elif line.startswith('#EXT-X-BYTERANGE'):
@ -183,7 +178,4 @@ class HlsFD(FragmentFD):
        self._finish_frag_download(ctx)
        for frag_file in frags_filenames:
            os.remove(encodeFilename(frag_file))
        return True
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@ -20,10 +20,14 @@ from ..utils import (
 class HttpFD(FileDownloader):
-    def real_download(self, filename, info_dict):
+    def real_download(self, filename_or_stream, info_dict):
        url = info_dict['url']
-        tmpfilename = self.temp_name(filename)
+        filename = filename_or_stream
        stream = None
        if hasattr(filename_or_stream, 'write'):
            stream = filename_or_stream
            filename = '-'
        tmpfilename = self.temp_name(filename)
        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
--- a/youtube_dl/downloader/ism.py
+++ b/youtube_dl/downloader/ism.py
@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 import os
 import time
 import struct
 import binascii
@ -8,10 +7,6 @@ import io
 from .fragment import FragmentFD
 from ..compat import compat_urllib_error
 from ..utils import (
    sanitize_open,
    encodeFilename,
 )
 u8 = struct.Struct(b'>B')
@ -225,16 +220,15 @@ class IsmFD(FragmentFD):
        self._prepare_and_start_frag_download(ctx)
        segments_filenames = []
        fragment_retries = self.params.get('fragment_retries', 0)
        skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
        track_written = False
        frag_index = 0
        for i, segment in enumerate(segments):
-            segment_url = segment['url']
+            frag_index += 1
-            segment_name = 'Frag%d' % i
+            if frag_index <= ctx['frag_index']:
-            target_filename = '%s-%s' % (ctx['tmpfilename'], segment_name)
+                continue
            count = 0
            while count <= fragment_retries:
                try:
@ -242,33 +236,27 @@ class IsmFD(FragmentFD):
                        'url': segment_url,
                        'http_headers': info_dict.get('http_headers'),
                    })
                    success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
                    if not success:
                        return False
                    down, target_sanitized = sanitize_open(target_filename, 'rb')
                    down_data = down.read()
                    if not track_written:
-                        tfhd_data = extract_box_data(down_data, [b'moof', b'traf', b'tfhd'])
+                        tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])
                        info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0]
                        write_piff_header(ctx['dest_stream'], info_dict['_download_params'])
                        track_written = True
-                    ctx['dest_stream'].write(down_data)
+                    self._append_fragment(ctx, frag_content)
                    down.close()
                    segments_filenames.append(target_sanitized)
                    break
                except compat_urllib_error.HTTPError as err:
                    count += 1
                    if count <= fragment_retries:
-                        self.report_retry_fragment(err, segment_name, count, fragment_retries)
+                        self.report_retry_fragment(err, frag_index, count, fragment_retries)
            if count > fragment_retries:
                if skip_unavailable_fragments:
-                    self.report_skip_fragment(segment_name)
+                    self.report_skip_fragment(frag_index)
                    continue
                self.report_error('giving up after %s fragment retries' % fragment_retries)
                return False
        self._finish_frag_download(ctx)
        for segment_file in segments_filenames:
            os.remove(encodeFilename(segment_file))
        return True