Fixed bug #15024 in extractor/twitter.py; see gist: https://gist.github.com/mwattsun/a5eeb63087c8281066242321f0511185

2025-02-08 18:22:52 +08:00 · 2020-04-03 13:24:54 -07:00 · 2020-04-03 13:24:54 -07:00 · 46b50f0784
commit 46b50f0784
parent 049c0486bb
1 changed files with 76 additions and 0 deletions
--- a/youtube_dl/extractor/twitter.py
+++ b/youtube_dl/extractor/twitter.py
@ -1,7 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import random
 import re
 import sys
 import string
 from .common import InfoExtractor
 from ..compat import (
@ -400,6 +403,7 @@ class TwitterIE(TwitterBaseIE):
        uploader = user.get('name')
        if uploader:
            title = '%s - %s' % (uploader, title)
        title = universal_filename(title)
        uploader_id = user.get('screen_name')
        tags = []
@ -596,3 +600,75 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
        info['formats'] = self._extract_pscp_m3u8_formats(
            m3u8_url, broadcast_id, m3u8_id, state, width, height)
        return info
 # Longest title, in characters, that universal_filename() will return.
 max_len = 140

 # Text type on both Python 2 (unicode) and Python 3 (str).
 _TEXT_TYPE = type(u'')

 # ASCII control characters (including DEL) and characters that are illegal
 # in file names on common file systems.
 _UNSAFE_CHARS_RE = re.compile(r'[\x00-\x1f\x7f/<>:"|\\?*]')

 # Runs of spaces; once control characters are removed, the space is the only
 # ASCII whitespace character left.
 _SPACE_RUN_RE = re.compile(r' {2,}')

 # Device names reserved on Windows, alone or followed by an extension
 # (e.g. "NUL" and "nul.txt").
 _RESERVED_NAME_RE = re.compile(
     r'(?:COM[0-9]|LPT[0-9]|CLOCK\$|CON|PRN|AUX|NUL)(?:\..*)?$',
     re.IGNORECASE)


 def _random_filename(length=64):
     """Return a random name made of ``length`` lowercase ASCII letters."""
     letters = string.ascii_lowercase
     return ''.join(random.choice(letters) for _ in range(length))


 def universal_filename(sutf8):
     """Turn a tweet-derived title into a short, ASCII-only, safe file name.

     Since Twitter raised the tweet length from 140 to 280 characters,
     titles derived from tweet text (emojis, many hashtags) can produce file
     names that are too long or contain characters a file system rejects.

     The title is processed as follows:

     - every non-ASCII character is dropped;
     - control characters and the characters ``/ < > : " | \\ ? *`` are
       removed, and runs of spaces are collapsed into a single space;
     - leading and trailing spaces and dots are stripped;
     - the result is cut to at most ``max_len`` characters and stripped
       again on the right, so truncation cannot leave a trailing space
       or dot.

     :param sutf8: the title as a text string (``str`` on Python 3,
         ``unicode`` on Python 2).
     :return: the cleaned title. If ``sutf8`` is not a text string, if
         nothing is left after cleaning, or if the result is a reserved
         device name such as ``CON`` or ``nul.txt`` (checked
         case-insensitively), a random name of 64 lowercase letters is
         returned instead, so the result is never empty.
     """
     if not isinstance(sutf8, _TEXT_TYPE) or not sutf8:
         return _random_filename()

     try:
         # 'ignore' drops everything outside ASCII; the remaining bytes
         # decode losslessly back to text.
         name = sutf8.encode('ascii', 'ignore').decode('ascii')
     except UnicodeError:
         # Not expected with the 'ignore' handler; kept as a safety net
         # because the original code reported failures on Python 2.7.
         return _random_filename()

     name = _UNSAFE_CHARS_RE.sub('', name)
     name = _SPACE_RUN_RE.sub(' ', name)
     # Strip only after the unsafe characters are gone, otherwise input such
     # as "title ?" would keep its trailing space.
     name = name.strip(' .')
     # max_len is read at call time so callers may adjust the module value.
     name = name[:max_len].rstrip(' .')

     if not name or _RESERVED_NAME_RE.match(name):
         return _random_filename()
     return name