[ie/youtube:tab] Fix flat thumbnails extraction for shorts (#15331)

Closes #15329 Authored by: bashonly
[ie/parti] Fix extractors (#15319)
2025-12-17 19:32:24 +01:00 · 2025-12-15 22:37:25 +00:00 · 2025-12-13 20:00:56 +01:00 · 2025-12-12 22:25:45 +00:00 · 2025-12-12 20:52:09 +00:00 · 2025-12-09 19:05:12 +00:00
6 changed files with 63 additions and 59 deletions
--- a/test/helper.py
+++ b/test/helper.py
@@ -261,7 +261,7 @@ def sanitize_got_info_dict(got_dict):
 def expect_info_dict(self, got_dict, expected_dict):
    ALLOWED_KEYS_SORT_ORDER = (
        # NB: Keep in sync with the docstring of extractor/common.py
-        'id', 'ext', 'direct', 'display_id', 'title', 'alt_title', 'description', 'media_type',
+        'ie_key', 'url', 'id', 'ext', 'direct', 'display_id', 'title', 'alt_title', 'description', 'media_type',
        'uploader', 'uploader_id', 'uploader_url', 'channel', 'channel_id', 'channel_url', 'channel_is_verified',
        'channel_follower_count', 'comment_count', 'view_count', 'concurrent_view_count',
        'like_count', 'dislike_count', 'repost_count', 'average_rating', 'age_limit', 'duration', 'thumbnail', 'heatmap',
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@@ -279,7 +279,7 @@ class ArchiveOrgIE(InfoExtractor):
                    'url': 'https://archive.org/' + track['file'].lstrip('/'),
                }

-        metadata = self._download_json('http://archive.org/metadata/' + identifier, identifier)
+        metadata = self._download_json(f'https://archive.org/metadata/{identifier}', identifier)
        m = metadata['metadata']
        identifier = m['identifier']

--- a/yt_dlp/extractor/parti.py
+++ b/yt_dlp/extractor/parti.py
@@ -6,7 +6,10 @@ from ..utils.traversal import traverse_obj
 class PartiBaseIE(InfoExtractor):
    def _call_api(self, path, video_id, note=None):
        return self._download_json(
-            f'https://api-backend.parti.com/parti_v2/profile/{path}', video_id, note)
+            f'https://prod-api.parti.com/parti_v2/profile/{path}', video_id, note, headers={
+                'Origin': 'https://parti.com',
+                'Referer': 'https://parti.com/',
+            })


 class PartiVideoIE(PartiBaseIE):
@@ -20,7 +23,7 @@ class PartiVideoIE(PartiBaseIE):
            'title': 'NOW LIVE ',
            'upload_date': '20250327',
            'categories': ['Gaming'],
-            'thumbnail': 'https://assets.parti.com/351424_eb9e5250-2821-484a-9c5f-ca99aa666c87.png',
+            'thumbnail': 'https://media.parti.com/351424_eb9e5250-2821-484a-9c5f-ca99aa666c87.png',
            'channel': 'ItZTMGG',
            'timestamp': 1743044379,
        },
@@ -34,7 +37,7 @@ class PartiVideoIE(PartiBaseIE):
        return {
            'id': video_id,
            'formats': self._extract_m3u8_formats(
-                urljoin('https://watch.parti.com', data['livestream_recording']), video_id, 'mp4'),
+                urljoin('https://media.parti.com/', data['livestream_recording']), video_id, 'mp4'),
            **traverse_obj(data, {
                'title': ('event_title', {str}),
                'channel': ('user_name', {str}),
@@ -47,32 +50,27 @@ class PartiVideoIE(PartiBaseIE):

 class PartiLivestreamIE(PartiBaseIE):
    IE_NAME = 'parti:livestream'
-    _VALID_URL = r'https?://(?:www\.)?parti\.com/creator/(?P<service>[\w]+)/(?P<id>[\w/-]+)'
+    _VALID_URL = r'https?://(?:www\.)?parti\.com/(?!video/)(?P<id>[\w/-]+)'
    _TESTS = [{
-        'url': 'https://parti.com/creator/parti/Capt_Robs_Adventures',
+        'url': 'https://parti.com/247CryptoTracker',
        'info_dict': {
-            'id': 'Capt_Robs_Adventures',
            'ext': 'mp4',
+            'id': '247CryptoTracker',
+            'description': 'md5:a78051f3d7e66e6a64c6b1eaf59fd364',
            'title': r"re:I'm Live on Parti \d{4}-\d{2}-\d{2} \d{2}:\d{2}",
-            'view_count': int,
-            'thumbnail': r're:https://assets\.parti\.com/.+\.png',
-            'timestamp': 1743879776,
-            'upload_date': '20250405',
+            'thumbnail': r're:https://media\.parti\.com/stream-screenshots/.+\.png',
            'live_status': 'is_live',
        },
        'params': {'skip_download': 'm3u8'},
-    }, {
-        'url': 'https://parti.com/creator/discord/sazboxgaming/0',
-        'only_matching': True,
    }]

    def _real_extract(self, url):
-        service, creator_slug = self._match_valid_url(url).group('service', 'id')
+        creator_slug = self._match_id(url)

        encoded_creator_slug = creator_slug.replace('/', '%23')
        creator_id = self._call_api(
-            f'get_user_by_social_media/{service}/{encoded_creator_slug}',
-            creator_slug, note='Fetching user ID')
+            f'user_id_from_name/{encoded_creator_slug}',
+            creator_slug, note='Fetching user ID')['user_id']

        data = self._call_api(
            f'get_livestream_channel_info/{creator_id}', creator_id,
@@ -85,11 +83,7 @@ class PartiLivestreamIE(PartiBaseIE):

        return {
            'id': creator_slug,
-            'formats': self._extract_m3u8_formats(
-                channel_info['playback_url'], creator_slug, live=True, query={
-                    'token': channel_info['playback_auth_token'],
-                    'player_version': '1.17.0',
-                }),
+            'formats': self._extract_m3u8_formats(channel_info['playback_url'], creator_slug, live=True),
            'is_live': True,
            **traverse_obj(data, {
                'title': ('livestream_event_info', 'event_name', {str}),
--- a/yt_dlp/extractor/pornhub.py
+++ b/yt_dlp/extractor/pornhub.py
@@ -24,6 +24,7 @@ from ..utils import (
    url_or_none,
    urlencode_postdata,
 )
+from ..utils.traversal import find_elements, traverse_obj


 class PornHubBaseIE(InfoExtractor):
@@ -137,23 +138,24 @@ class PornHubIE(PornHubBaseIE):
    _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)']
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
-        'md5': 'a6391306d050e4547f62b3f485dd9ba9',
+        'md5': '4d4a4e9178b655776f86cf89ecaf0edf',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
-            'uploader': 'Babes',
+            'uploader': 'BABES-COM',
+            'uploader_id': '/users/babes-com',
            'upload_date': '20130628',
            'timestamp': 1372447216,
            'duration': 361,
            'view_count': int,
            'like_count': int,
-            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
            'cast': list,
+            'thumbnail': r're:https?://.+',
        },
    }, {
        # non-ASCII title
@@ -480,13 +482,6 @@ class PornHubIE(PornHubBaseIE):
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

-        def extract_list(meta_key):
-            div = self._search_regex(
-                rf'(?s)<div[^>]+\bclass=["\'].*?\b{meta_key}Wrapper[^>]*>(.+?)</div>',
-                webpage, meta_key, default=None)
-            if div:
-                return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]
-
        info = self._search_json_ld(webpage, video_id, default={})
        # description provided in JSON-LD is irrelevant
        info['description'] = None
@@ -505,9 +500,11 @@ class PornHubIE(PornHubBaseIE):
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
-            'tags': extract_list('tags'),
-            'categories': extract_list('categories'),
-            'cast': extract_list('pornstars'),
+            **traverse_obj(webpage, {
+                'tags': ({find_elements(attr='data-label', value='tag')}, ..., {clean_html}),
+                'categories': ({find_elements(attr='data-label', value='category')}, ..., {clean_html}),
+                'cast': ({find_elements(attr='data-label', value='pornstar')}, ..., {clean_html}),
+            }),
            'subtitles': subtitles,
        }, info)

--- a/yt_dlp/extractor/telecinco.py
+++ b/yt_dlp/extractor/telecinco.py
@@ -6,20 +6,21 @@ from ..networking.exceptions import HTTPError
 from ..utils import (
    ExtractorError,
    clean_html,
+    extract_attributes,
    int_or_none,
    join_nonempty,
    str_or_none,
-    traverse_obj,
    update_url,
    url_or_none,
 )
+from ..utils.traversal import traverse_obj


 class TelecincoBaseIE(InfoExtractor):
    def _parse_content(self, content, url):
-        video_id = content['dataMediaId']
+        video_id = content['dataMediaId'][1]
        config = self._download_json(
-            content['dataConfig'], video_id, 'Downloading config JSON')
+            content['dataConfig'][1], video_id, 'Downloading config JSON')
        services = config['services']
        caronte = self._download_json(services['caronte'], video_id)
        if traverse_obj(caronte, ('dls', 0, 'drm', {bool})):
@@ -57,9 +58,9 @@ class TelecincoBaseIE(InfoExtractor):
            'id': video_id,
            'title': traverse_obj(config, ('info', 'title', {str})),
            'formats': formats,
-            'thumbnail': (traverse_obj(content, ('dataPoster', {url_or_none}))
+            'thumbnail': (traverse_obj(content, ('dataPoster', 1, {url_or_none}))
                          or traverse_obj(config, 'poster', 'imageUrl', expected_type=url_or_none)),
-            'duration': traverse_obj(content, ('dataDuration', {int_or_none})),
+            'duration': traverse_obj(content, ('dataDuration', 1, {int_or_none})),
            'http_headers': headers,
        }

@@ -137,30 +138,45 @@ class TelecincoIE(TelecincoBaseIE):
        'url': 'http://www.cuatro.com/chesterinlove/a-carta/chester-chester_in_love-chester_edu_2_2331030022.html',
        'only_matching': True,
    }]
+    _ASTRO_ISLAND_RE = re.compile(r'<astro-island\b[^>]+>')

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id, impersonate=True)
-        article = self._search_json(
-            r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=',
-            webpage, 'article', display_id)['article']
-        description = traverse_obj(article, ('leadParagraph', {clean_html}, filter))

-        if article.get('editorialType') != 'VID':
+        props_list = traverse_obj(webpage, (
+            {self._ASTRO_ISLAND_RE.findall}, ...,
+            {extract_attributes}, 'props', {json.loads}))
+
+        description = traverse_obj(props_list, (..., 'leadParagraph', 1, {clean_html}, any, filter))
+        main_content = traverse_obj(props_list, (..., ('content', ('articleData', 1, 'opening')), 1, {dict}, any))
+
+        if traverse_obj(props_list, (..., 'editorialType', 1, {str}, any)) != 'VID':  # e.g. 'ART'
            entries = []

-            for p in traverse_obj(article, ((('opening', all), 'body'), lambda _, v: v['content'])):
-                content = p['content']
-                type_ = p.get('type')
-                if type_ == 'paragraph' and isinstance(content, str):
+            for p in traverse_obj(props_list, (..., 'articleData', 1, ('opening', ('body', 1, ...)), 1, {dict})):
+                type_ = traverse_obj(p, ('type', 1, {str}))
+                content = traverse_obj(p, ('content', 1, {str} if type_ == 'paragraph' else {dict}))
+                if not content:
+                    continue
+                if type_ == 'paragraph':
                    description = join_nonempty(description, content, delim='')
-                elif type_ == 'video' and isinstance(content, dict):
+                elif type_ == 'video':
                    entries.append(self._parse_content(content, url))
+                else:
+                    self.report_warning(
+                        f'Skipping unsupported content type "{type_}"', display_id, only_once=True)

            return self.playlist_result(
-                entries, str_or_none(article.get('id')),
-                traverse_obj(article, ('title', {str})), clean_html(description))
+                entries,
+                traverse_obj(props_list, (..., 'id', 1, {int}, {str_or_none}, any)) or display_id,
+                traverse_obj(main_content, ('dataTitle', 1, {str})),
+                clean_html(description))

-        info = self._parse_content(article['opening']['content'], url)
+        if not main_content:
+            raise ExtractorError('Unable to extract main content from webpage')
+
+        info = self._parse_content(main_content, url)
        info['description'] = description
+
        return info
--- a/yt_dlp/extractor/youtube/_tab.py
+++ b/yt_dlp/extractor/youtube/_tab.py
@@ -382,7 +382,8 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
                        ('accessibilityText', {lambda x: re.fullmatch(r'(.+), (?:[\d,.]+(?:[KM]| million)?|No) views? - play Short', x)}, 1)), any),
                    'view_count': ('overlayMetadata', 'secondaryText', 'content', {parse_count}),
                }),
-                thumbnails=self._extract_thumbnails(renderer, 'thumbnail', final_key='sources'))
+                thumbnails=self._extract_thumbnails(
+                    renderer, ('thumbnailViewModel', 'thumbnailViewModel', 'image'), final_key='sources'))
            return

    def _video_entry(self, video_renderer):
@@ -1585,7 +1586,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
        'playlist_count': 50,
        'expected_warnings': ['YouTube Music is not directly supported'],
    }, {
-        # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test
        'note': 'unlisted single video playlist',
        'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQLfIN0MMgp0wVV6MP3bM4_',
        'info_dict': {
@@ -1885,8 +1885,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
        'playlist_mincount': 30,
    }, {
        # Shorts url result in shorts tab
-        # TODO: Fix channel id extraction
-        # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test
        'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts',
        'info_dict': {
            'id': 'UCiu-3thuViMebBjw_5nWYrA',
@@ -1915,7 +1913,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
        'params': {'extract_flat': True},
    }, {
        # Live video status should be extracted
-        # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test
        'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/live',
        'info_dict': {
            'id': 'UCQvWX73GQygcwXOTSf_VDVg',
Author	SHA1	Message	Date
bashonly	ff61bef041	[ie/youtube:tab] Fix flat thumbnails extraction for shorts (#15331) Closes #15329 Authored by: bashonly	2025-12-15 22:37:25 +00:00
sepro	04f2ec4b97	[ie/parti] Fix extractors (#15319) Authored by: seproDev	2025-12-13 20:00:56 +01:00
0x∅	b6f24745bf	[ie/telecinco] Fix extractor (#15311) Closes #15240 Authored by: 0xvd, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>	2025-12-12 22:25:45 +00:00
norepro	f2ee2a46fc	[ie/pornhub] Optimize metadata extraction (#15231) Closes #14621 Authored by: norepro	2025-12-12 20:52:09 +00:00
bashonly	5f37f67d37	[ie/archive.org] Fix metadata extraction (#15286) Closes #15280 Authored by: bashonly	2025-12-09 19:05:12 +00:00