From fcb08be29cc14f44ababbaba419750707a0f39fa Mon Sep 17 00:00:00 2001 From: hodayabu <44240078+hodayabu@users.noreply.github.com> Date: Tue, 16 Jun 2020 16:20:45 +0300 Subject: [PATCH] Wrong title facebook crawled videos before fresh (#337) * timestamp conditions fix * title and thumbnail fix * pr fix * title and thumbnail fix * timestamp order Co-authored-by: bhodaya --- youtube_dl/extractor/facebook.py | 50 ++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index b13bf8e51..eba2edfef 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -442,35 +442,31 @@ class FacebookIE(InfoExtractor): if s: return lowercase_escape(s) - uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or \ self._search_regex(r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',default=None) or \ _lowercase_escape(self._search_regex(r'\"ownerName\":"(.+?)"', tahoe_data.secondary, 'uploader_id', fatal=False)) or \ self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \ self._og_search_title(webpage, default=None) + timestamp = self._resolve_timestamp(webpage, tahoe_data) timestamp = parse_iso8601(timestamp) - - if timestamp == None and webpage.find('Paid Partnership') == -1 or\ - (timestamp == None and webpage.find('Paid Partnership') > -1 and - 'cookiefile' in self._downloader.params): - - regex_search_result_date_time = self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary, 'timestamp', default=None)\ - or self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.primary, 'timestamp', default=None)\ - or self._search_regex(r'data-utime=\\\"(\d+)\\\"', webpage,'timestamp', default=None)\ - or self._search_regex(r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)\ - or self._search_regex(r']+data-utime=["\'](\d+)', tahoe_data.secondary, 'timestamp', default=None)\ - or self._search_regex(r']+data-utime=["\'](\d+)', tahoe_data.primary, 'timestamp', default=None) - - regex_search_result_publish_time = self._search_regex(r'publish_time":([\d]+)', webpage, 'timestamp', default=None)\ - or self._search_regex(r'publish_time":([\d]+)', tahoe_data.primary, 'timestamp', default=None)\ - or self._search_regex(r'publish_time":([\d]+)', tahoe_data.secondary, 'timestamp', default=None) - + if timestamp is None and webpage.find('Paid Partnership') == -1 or \ + (timestamp is None and webpage.find('Paid Partnership') > -1 and 'cookiefile' in self._downloader.params): + regex_search_result_date_time = self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary, 'timestamp', default=None) \ + or self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.primary, 'timestamp', default=None)\ + or self._search_regex(r'data-utime=\\\"(\d+)\\\"', webpage, 'timestamp', default=None)\ + or self._search_regex(r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)\ + or self._search_regex(r']+data-utime=["\'](\d+)', tahoe_data.secondary, 'timestamp', default=None)\ + or self._search_regex(r']+data-utime=["\'](\d+)', tahoe_data.primary, 'timestamp', default=None) + regex_search_result_publish_time = self._search_regex(r'publish_time":([\d]+)', webpage, 'timestamp', default=None) \ + or self._search_regex(r'publish_time":([\d]+)', tahoe_data.primary, 'timestamp', default=None) \ + or self._search_regex(r'publish_time":([\d]+)', tahoe_data.secondary, 'timestamp', default=None) timestamp = int_or_none(regex_search_result_date_time) or int_or_none(regex_search_result_publish_time) uploader_id = self._resolve_uploader_id(webpage, tahoe_data) - thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) + thumbnail = self._resolve_thumbnail(webpage, tahoe_data) + if is_live: view_count = parse_count( self._search_regex(r'viewerCount:([\d]+)', webpage, 'views', fatal=False) or \ @@ -673,16 +669,16 @@ class FacebookIE(InfoExtractor): video_title = self._html_search_regex( r']*class="uiHeaderTitle"[^>]*>([^<]*)', webpage, 'title', default=None) - if not video_title: + if not self._valid_video_title(video_title): video_title = self._html_search_regex( r'(?s)(.*?)', webpage, 'alternative title', default=None) - if not video_title: + if not self._valid_video_title(video_title): video_title = self._og_search_title(webpage, default=None) - if not video_title: + if not self._valid_video_title(video_title): video_title = self._html_search_meta( 'description', webpage, 'title', default=None) - if not video_title: + if not self._valid_video_title(video_title): values = re.findall(r'videoTitle"\s*:\s*"(.*?)"', tahoe_data.secondary) if values: video_title = values[-1] @@ -792,6 +788,16 @@ class FacebookIE(InfoExtractor): timestamp = parse_iso8601(timestamp) return timestamp + def _resolve_thumbnail(self, webpage, tahoe_data): + thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) + if not thumbnail: + thumbnail = self._search_regex(r'"subtitles_src":"(.+?")', tahoe_data.primary, 'thumbnail', fatal=False) + return thumbnail + + def _valid_video_title(self, video_title): + return video_title and not u'Log In or Sign Up to View' in video_title + + class FacebookTahoeData: def __init__(self, extractor, page, video_id):