Compare commits

...

3 Commits

Author SHA1 Message Date
MellowKyler
4920420f85 Misc. cleanup 2024-11-02 14:13:54 -05:00
MellowKyler
786b40ebf9 Remove language field and add playlist test 2024-11-02 12:26:42 -05:00
MellowKyler
9196bd2029 seproDev redesign
- Support at:// URIs
- Remove comment extraction for now
- Directly implement DID document lookup
- Restructure for generic extraction method
- Modify several info_dict fields
2024-11-02 08:29:34 -05:00

View File

@ -1,62 +1,70 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import int_or_none, mimetype2ext, parse_iso8601, traverse_obj, url_or_none from ..utils import (
ExtractorError,
format_field,
int_or_none,
mimetype2ext,
orderedSet,
parse_iso8601,
truncate_string,
update_url_query,
url_basename,
url_or_none,
variadic,
)
from ..utils.traversal import traverse_obj
class BlueskyIE(InfoExtractor): class BlueskyIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:bsky\.app|main\.bsky\.dev)/profile/(?P<handle>[^/]+)/post/(?P<id>[0-9a-zA-Z]+)' _VALID_URL = [
r'https?://(?:www\.)?(?:bsky\.app|main\.bsky\.dev)/profile/(?P<handle>[\w.:%-]+)/post/(?P<id>\w+)',
r'at://(?P<handle>[\w.:%-]+)/app\.bsky\.feed\.post/(?P<id>\w+)',
]
_TESTS = [{ _TESTS = [{
'url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g', 'url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g',
'md5': '375539c1930ab05d15585ed772ab54fd', 'md5': '375539c1930ab05d15585ed772ab54fd',
'info_dict': { 'info_dict': {
'id': '3l4omssdl632g', 'id': '3l4omssdl632g',
'ext': 'mp4', 'ext': 'mp4',
'title': str, 'uploader': 'Blu3Blu3Lilith',
'upload_date': '20240921', 'uploader_id': 'blu3blue.bsky.social',
'description': 'OMG WE HAVE VIDEOS NOW',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'uploader': str,
'channel': 'blu3blue.bsky.social',
'uploader_id': 'did:plc:pzdr5ylumf7vmvwasrpr5bf2',
'channel_id': 'did:plc:pzdr5ylumf7vmvwasrpr5bf2',
'uploader_url': 'https://bsky.app/profile/blu3blue.bsky.social', 'uploader_url': 'https://bsky.app/profile/blu3blue.bsky.social',
'channel_id': 'did:plc:pzdr5ylumf7vmvwasrpr5bf2',
'channel_url': 'https://bsky.app/profile/did:plc:pzdr5ylumf7vmvwasrpr5bf2', 'channel_url': 'https://bsky.app/profile/did:plc:pzdr5ylumf7vmvwasrpr5bf2',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'OMG WE HAVE VIDEOS NOW',
'description': 'OMG WE HAVE VIDEOS NOW',
'upload_date': '20240921',
'timestamp': 1726940605, 'timestamp': 1726940605,
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
'comment_count': int, 'comment_count': int,
'webpage_url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g', 'tags': [],
'tags': 'count:1',
'comments': 'mincount:29',
'age_limit': 0,
}, },
'params': {'getcomments': True},
}, { }, {
'url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g', 'url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g',
'md5': 'b9e344fdbce9f2852c668a97efefb105', 'md5': 'b9e344fdbce9f2852c668a97efefb105',
'info_dict': { 'info_dict': {
'id': '3l3vgf77uco2g', 'id': '3l3vgf77uco2g',
'ext': 'mp4', 'ext': 'mp4',
'title': str,
'upload_date': '20240911',
'description': r're:Bluesky now has video!',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'alt_title': 'Bluesky video feature announcement',
'uploader': 'Bluesky', 'uploader': 'Bluesky',
'channel': 'bsky.app', 'uploader_id': 'bsky.app',
'uploader_id': 'did:plc:z72i7hdynmk6r22z27h6tvur',
'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur',
'uploader_url': 'https://bsky.app/profile/bsky.app', 'uploader_url': 'https://bsky.app/profile/bsky.app',
'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur',
'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur', 'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky now has video! Update your app to versi...',
'alt_title': 'Bluesky video feature announcement',
'description': r're:Bluesky now has video!',
'upload_date': '20240911',
'timestamp': 1726074716, 'timestamp': 1726074716,
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
'comment_count': int, 'comment_count': int,
'webpage_url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g', 'tags': [],
'tags': ['en', 'pt'],
'subtitles': { 'subtitles': {
'en': 'mincount:1', 'en': 'mincount:1',
}, },
'age_limit': 0,
}, },
}, { }, {
'url': 'https://main.bsky.dev/profile/souris.moe/post/3l4qhp7bcs52c', 'url': 'https://main.bsky.dev/profile/souris.moe/post/3l4qhp7bcs52c',
@ -64,24 +72,20 @@ class BlueskyIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '3l4qhp7bcs52c', 'id': '3l4qhp7bcs52c',
'ext': 'mp4', 'ext': 'mp4',
'title': str, 'uploader': 'souris',
'upload_date': '20240922', 'uploader_id': 'souris.moe',
'description': '',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'uploader': str,
'channel': 'souris.moe',
'uploader_id': 'did:plc:tj7g244gl5v6ai6cm4f4wlqp',
'channel_id': 'did:plc:tj7g244gl5v6ai6cm4f4wlqp',
'uploader_url': 'https://bsky.app/profile/souris.moe', 'uploader_url': 'https://bsky.app/profile/souris.moe',
'channel_id': 'did:plc:tj7g244gl5v6ai6cm4f4wlqp',
'channel_url': 'https://bsky.app/profile/did:plc:tj7g244gl5v6ai6cm4f4wlqp', 'channel_url': 'https://bsky.app/profile/did:plc:tj7g244gl5v6ai6cm4f4wlqp',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky video #3l4qhp7bcs52c',
'description': '',
'upload_date': '20240922',
'timestamp': 1727003838, 'timestamp': 1727003838,
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
'comment_count': int, 'comment_count': int,
'webpage_url': 'https://main.bsky.dev/profile/souris.moe/post/3l4qhp7bcs52c', 'tags': [],
'tags': 'count:1',
'subtitles': 'count:0',
'age_limit': 0,
}, },
}, { }, {
'url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e', 'url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e',
@ -89,24 +93,20 @@ class BlueskyIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '3l3w4tnezek2e', 'id': '3l3w4tnezek2e',
'ext': 'mp4', 'ext': 'mp4',
'title': str, 'uploader': 'clean',
'upload_date': '20240911', 'uploader_id': 'de1.pds.tentacle.expert',
'description': '',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'uploader': str,
'channel': 'de1.pds.tentacle.expert',
'uploader_id': 'did:web:de1.tentacle.expert',
'channel_id': 'did:web:de1.tentacle.expert',
'uploader_url': 'https://bsky.app/profile/de1.pds.tentacle.expert', 'uploader_url': 'https://bsky.app/profile/de1.pds.tentacle.expert',
'channel_id': 'did:web:de1.tentacle.expert',
'channel_url': 'https://bsky.app/profile/did:web:de1.tentacle.expert', 'channel_url': 'https://bsky.app/profile/did:web:de1.tentacle.expert',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky video #3l3w4tnezek2e',
'description': '',
'upload_date': '20240911',
'timestamp': 1726098823, 'timestamp': 1726098823,
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
'comment_count': int, 'comment_count': int,
'webpage_url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e', 'tags': [],
'tags': 'count:1',
'subtitles': 'count:0',
'age_limit': 0,
}, },
}, { }, {
'url': 'https://bsky.app/profile/yunayuispink.bsky.social/post/3l7gqcfes742o', 'url': 'https://bsky.app/profile/yunayuispink.bsky.social/post/3l7gqcfes742o',
@ -114,81 +114,78 @@ class BlueskyIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'XxK3t_5V3ao', 'id': 'XxK3t_5V3ao',
'ext': 'webm', 'ext': 'webm',
'uploader': 'yunayu',
'uploader_id': '@yunayuispink', 'uploader_id': '@yunayuispink',
'live_status': 'not_live', 'uploader_url': 'https://www.youtube.com/@yunayuispink',
'view_count': int, 'channel': 'yunayu',
'channel_id': 'UCPLvXnHa7lTyNoR_dGsU14w',
'channel_url': 'https://www.youtube.com/channel/UCPLvXnHa7lTyNoR_dGsU14w', 'channel_url': 'https://www.youtube.com/channel/UCPLvXnHa7lTyNoR_dGsU14w',
'thumbnail': 'https://i.ytimg.com/vi_webp/XxK3t_5V3ao/maxresdefault.webp', 'thumbnail': 'https://i.ytimg.com/vi_webp/XxK3t_5V3ao/maxresdefault.webp',
'description': r're:Have a good goodx10000day',
'title': '5min vs 5hours drawing',
'availability': 'public',
'live_status': 'not_live',
'playable_in_embed': True,
'upload_date': '20241026', 'upload_date': '20241026',
'uploader_url': 'https://www.youtube.com/@yunayuispink', 'timestamp': 1729967784,
'description': 'md5:7d474e6ab76a88c84eb0f294e18ed828', 'duration': 321,
'age_limit': 0,
'like_count': int,
'view_count': int,
'comment_count': int,
'channel_follower_count': int,
'categories': ['Entertainment'], 'categories': ['Entertainment'],
'tags': [], 'tags': [],
'title': '5min vs 5hours drawing',
'duration': 321,
'uploader': 'yunayu',
'channel_follower_count': int,
'channel': 'yunayu',
'playable_in_embed': True,
'timestamp': 1729967784,
'like_count': int,
'channel_id': 'UCPLvXnHa7lTyNoR_dGsU14w',
'availability': 'public',
'age_limit': 0,
'comment_count': int,
}, },
'add_ie': ['Youtube'], 'add_ie': ['Youtube'],
'params': {'getcomments': True},
}, { }, {
'url': 'https://bsky.app/profile/endshark.bsky.social/post/3jzxjkcemae2m', 'url': 'https://bsky.app/profile/endshark.bsky.social/post/3jzxjkcemae2m',
'md5': 'd5c8fbc8f72b9f6ef160c150c420bb55', 'md5': 'd5c8fbc8f72b9f6ef160c150c420bb55',
'info_dict': { 'info_dict': {
'id': '222792849', 'id': '222792849',
'ext': 'mp3', 'ext': 'mp3',
'track': 'Forward to the End',
'thumbnail': 'https://f4.bcbits.com/img/a2507705510_5.jpg',
'album': 'Hari Nezumi [EP]',
'uploader_id': 'laserbatx',
'uploader': 'LASERBAT', 'uploader': 'LASERBAT',
'duration': 228.571, 'uploader_id': 'laserbatx',
'album_artists': ['LASERBAT'],
'timestamp': 1682276040.0,
'uploader_url': 'https://laserbatx.bandcamp.com', 'uploader_url': 'https://laserbatx.bandcamp.com',
'artists': ['LASERBAT'],
'album_artists': ['LASERBAT'],
'album': 'Hari Nezumi [EP]',
'track': 'Forward to the End',
'title': 'LASERBAT - Forward to the End',
'thumbnail': 'https://f4.bcbits.com/img/a2507705510_5.jpg',
'duration': 228.571,
'track_id': '222792849', 'track_id': '222792849',
'release_date': '20230423', 'release_date': '20230423',
'upload_date': '20230423', 'upload_date': '20230423',
'timestamp': 1682276040.0,
'release_timestamp': 1682276040.0, 'release_timestamp': 1682276040.0,
'track_number': 1, 'track_number': 1,
'artists': ['LASERBAT'],
'title': 'LASERBAT - Forward to the End',
}, },
'add_ie': ['Bandcamp'], 'add_ie': ['Bandcamp'],
}, { }, {
'url': 'https://bsky.app/profile/dannybhoix.bsky.social/post/3l6oe5mtr2c2j', 'url': 'https://bsky.app/profile/dannybhoix.bsky.social/post/3l6oe5mtr2c2j',
'md5': 'b9e344fdbce9f2852c668a97efefb105', 'md5': 'b9e344fdbce9f2852c668a97efefb105',
'info_dict': { 'info_dict': {
'id': '3l6oe5mtr2c2j', 'id': '3l3vgf77uco2g',
'ext': 'mp4', 'ext': 'mp4',
'description': 'this looks like a 2012 announcement video. i love it.', 'uploader': 'Bluesky',
'uploader_url': 'https://bsky.app/profile/dannybhoix.bsky.social', 'uploader_id': 'bsky.app',
'uploader': 'Danny', 'uploader_url': 'https://bsky.app/profile/bsky.app',
'title': str, 'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur',
'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'Bluesky now has video! Update your app to versi...',
'alt_title': 'Bluesky video feature announcement',
'description': r're:Bluesky now has video!',
'upload_date': '20240911',
'timestamp': 1726074716,
'like_count': int,
'repost_count': int, 'repost_count': int,
'comment_count': int, 'comment_count': int,
'channel': 'dannybhoix.bsky.social', 'tags': [],
'timestamp': 1729130330,
'uploader_id': 'did:plc:ng7fhshaed7assvhkq7cxxnw',
'upload_date': '20241017',
'channel_url': 'https://bsky.app/profile/did:plc:ng7fhshaed7assvhkq7cxxnw',
'tags': ['en'],
'like_count': int,
'channel_id': 'did:plc:ng7fhshaed7assvhkq7cxxnw',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'alt_title': 'Bluesky video feature announcement',
'subtitles': { 'subtitles': {
'en': 'mincount:1', 'en': 'mincount:1',
}, },
'age_limit': 0,
}, },
}, { }, {
'url': 'https://bsky.app/profile/alt.bun.how/post/3l7rdfxhyds2f', 'url': 'https://bsky.app/profile/alt.bun.how/post/3l7rdfxhyds2f',
@ -196,167 +193,202 @@ class BlueskyIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '3l7rdfxhyds2f', 'id': '3l7rdfxhyds2f',
'ext': 'mp4', 'ext': 'mp4',
'channel_url': 'https://bsky.app/profile/did:plc:7x6rtuenkuvxq3zsvffp2ide',
'timestamp': 1730332128,
'channel_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide',
'upload_date': '20241030',
'channel': 'alt.bun.how',
'uploader_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide',
'description': 'crazy that i look like this tbh',
'comment_count': int,
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'uploader_url': 'https://bsky.app/profile/alt.bun.how',
'tags': ['en', 'sexual', 'sexual'],
'like_count': int,
'title': 'cinnamon on Bluesky',
'uploader': 'cinnamon', 'uploader': 'cinnamon',
'uploader_id': 'alt.bun.how',
'uploader_url': 'https://bsky.app/profile/alt.bun.how',
'channel_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide',
'channel_url': 'https://bsky.app/profile/did:plc:7x6rtuenkuvxq3zsvffp2ide',
'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
'title': 'crazy that i look like this tbh',
'description': 'crazy that i look like this tbh',
'upload_date': '20241030',
'timestamp': 1730332128,
'like_count': int,
'repost_count': int, 'repost_count': int,
'comment_count': int,
'tags': ['sexual'],
'age_limit': 18, 'age_limit': 18,
}, },
}, {
'url': 'at://did:plc:ia76kvnndjutgedggx2ibrem/app.bsky.feed.post/3l6zrz6zyl2dr',
'md5': '71b0eb6d85d03145e6af6642c7fc6d78',
'info_dict': {
'id': '3l6zrz6zyl2dr',
'ext': 'mp4',
'uploader': 'mary🐇',
'uploader_id': 'mary.my.id',
'uploader_url': 'https://bsky.app/profile/mary.my.id',
'channel_id': 'did:plc:ia76kvnndjutgedggx2ibrem',
'channel_url': 'https://bsky.app/profile/did:plc:ia76kvnndjutgedggx2ibrem',
'thumbnail': 'https://video.bsky.app/watch/did%3Aplc%3Aia76kvnndjutgedggx2ibrem/bafkreiasced5yaaodnspwgmuwvbxhoghbkq7iibhl3ftjgupq3brmgfwkm/thumbnail.jpg',
'title': 'Bluesky video #3l6zrz6zyl2dr',
'alt_title': '',
'description': '',
'upload_date': '20241021',
'timestamp': 1729523172,
'like_count': int,
'repost_count': int,
'comment_count': int,
'tags': [],
},
}, {
'url': 'https://bsky.app/profile/purpleicetea.bsky.social/post/3l7gv55dc2o2w',
'info_dict': {
'id': '3l7gv55dc2o2w',
},
'playlist': [{
'info_dict': {
'id': '3l7gv55dc2o2w',
'ext': 'mp4',
'upload_date': '20241026',
'description': 'One of my favorite videos',
'comment_count': int,
'uploader_url': 'https://bsky.app/profile/purpleicetea.bsky.social',
'uploader': 'Purple.Ice.Tea',
'thumbnail': 'https://video.bsky.app/watch/did%3Aplc%3Abjh5ffwya5f53dfy47dezuwx/bafkreicaldzyr6yr26ex4gyr6z2gbwvi53iolrcsqfipqqs3ieb2olcmiu/thumbnail.jpg',
'channel_url': 'https://bsky.app/profile/did:plc:bjh5ffwya5f53dfy47dezuwx',
'like_count': int,
'channel_id': 'did:plc:bjh5ffwya5f53dfy47dezuwx',
'repost_count': int,
'timestamp': 1729973202,
'tags': [],
'uploader_id': 'purpleicetea.bsky.social',
'title': 'One of my favorite videos',
},
}, {
'info_dict': {
'id': '3l77u64l7le2e',
'ext': 'mp4',
'title': 'hearing people on twitter say that bluesky isn\'...',
'like_count': int,
'uploader_id': 'thafnine.net',
'uploader_url': 'https://bsky.app/profile/thafnine.net',
'upload_date': '20241024',
'channel_url': 'https://bsky.app/profile/did:plc:6ttyq36rhiyed7wu3ws7dmqj',
'description': 'md5:dba7d54d6df7e5b79a24d7b287dafbe9',
'tags': [],
'alt_title': 'md5:9b1ee1937fb3d1a81e932f9ec14d560e',
'uploader': 'T9',
'channel_id': 'did:plc:6ttyq36rhiyed7wu3ws7dmqj',
'thumbnail': 'https://video.bsky.app/watch/did%3Aplc%3A6ttyq36rhiyed7wu3ws7dmqj/bafkreih4xf5k22urq5kytjbnlj2djnzmlrqnvotnglmoy35mkpobrwtota/thumbnail.jpg',
'timestamp': 1729731642,
'comment_count': int,
'repost_count': int,
},
}],
}] }]
_BLOB_URL_TMPL = '%s/xrpc/com.atproto.sync.getBlob'
def _get_comments(self, meta): def _get_service_endpoint(self, did, video_id):
yield from self.traverse_replies(meta, traverse_obj(meta, ('post', 'uri'), default='')) if did.startswith('did:web:'):
url = f'https://{did[8:]}/.well-known/did.json'
def traverse_replies(self, thread_node, root_uri): else:
post_uri = traverse_obj(thread_node, ('post', 'uri')) url = f'https://plc.directory/{did}'
if post_uri != root_uri:
post = thread_node.get('post')
parent_uri = traverse_obj(post, ('record', 'reply', 'parent', 'uri'))
author_handle = traverse_obj(post, ('author', 'handle'))
author_did = traverse_obj(post, ('author', 'did'), default='')
yield {
'id': post_uri,
**traverse_obj(post, {
'text': ('record', 'text'),
'timestamp': ('record', 'createdAt', {parse_iso8601}),
'author': ('author', 'displayName'),
'author_thumbnail': ('author', 'avatar', {url_or_none}),
}),
'parent': 'root' if parent_uri == root_uri else parent_uri,
'like_count': post.get('likeCount'),
'author_id': author_did,
'author_url': f'https://bsky.app/profile/{author_handle}',
'author_is_uploader': author_did in root_uri,
}
if replies := thread_node.get('replies'):
for reply in replies:
yield from self.traverse_replies(reply, root_uri)
if parent := thread_node.get('parent'):
yield from self.traverse_replies(parent, root_uri)
def get_service_endpoint(self, did, video_id):
services = self._download_json( services = self._download_json(
f'https://resolver.identity.foundation/1.0/identifiers/{did}', url, video_id, 'Fetching service endpoint', 'Falling back to bsky.social', fatal=False)
video_id, fatal=False).get('service') or [] return traverse_obj(
for service in services: services, ('service', lambda _, x: x['type'] == 'AtprotoPersonalDataServer',
if service.get('type') == 'AtprotoPersonalDataServer': 'serviceEndpoint', {url_or_none}, any)) or 'https://bsky.social'
return service.get('serviceEndpoint')
return 'https://bsky.social'
def _real_extract(self, url): def _real_extract(self, url):
handle, video_id = self._match_valid_url(url).groups() handle, video_id = self._match_valid_url(url).group('handle', 'id')
getcomments = self.get_param('getcomments', False) post = self._download_json(
meta = self._download_json(
'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread', 'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
video_id, headers={'Content-Type': 'application/json'}, query={ video_id, query={
'uri': f'at://{handle}/app.bsky.feed.post/{video_id}', 'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
'depth': 1000 if getcomments else 0, 'depth': 0,
'parentHeight': 1000 if getcomments else 0, 'parentHeight': 0,
})['thread'] })['thread']['post']
post = meta.get('post')
did = traverse_obj(post, ('author', 'did')) entries = []
record_embed = traverse_obj(post, ('record', 'embed', ('media', None)), get_all=False) # app.bsky.embed.video.view/app.bsky.embed.external.view
post_type = record_embed.get('$type') if record_embed else None entries.extend(self._extract_videos(post, video_id))
quoted_post = traverse_obj(post, ('embed', 'record', ('record', None)), get_all=False) # app.bsky.embed.recordWithMedia.view
quoted_type = traverse_obj(quoted_post, ('value', 'embed', ('media', None), '$type'), get_all=False) entries.extend(self._extract_videos(
quoted_media = traverse_obj(quoted_post, ('embeds', 0, ('media', None)), get_all=False) post, video_id, embed_path=('embed', 'media'), record_subpath=('embed', 'media')))
# app.bsky.embed.record.view
if nested_post := traverse_obj(post, ('embed', 'record', ('record', None), {dict}, any)):
entries.extend(self._extract_videos(
nested_post, video_id, embed_path=('embeds', 0), record_path='value'))
if post_type == 'app.bsky.embed.external': if not entries:
return self.url_result(traverse_obj( raise ExtractorError('No video could be found in this post', expected=True)
post, ('embed', ('media', None), 'external', 'uri'), get_all=False) if len(entries) == 1:
or traverse_obj(record_embed, ('external', 'uri'))) return entries[0]
elif post_type == 'app.bsky.embed.video': return self.playlist_result(entries, video_id)
formats, subs = self._extract_m3u8_formats_and_subtitles(
traverse_obj(post, ('embed', ('media', None), 'playlist'), get_all=False), @staticmethod
video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False, def _build_profile_url(path):
note='Downloading m3u8 information', errnote='Unable to download m3u8 information') return format_field(path, None, 'https://bsky.app/profile/%s', default=None)
if blob_cid := traverse_obj(record_embed, ('video', 'ref', '$link'), ('video', 'cid')):
endpoint = self.get_service_endpoint(did, video_id) def _extract_videos(self, root, video_id, embed_path='embed', record_path='record', record_subpath='embed'):
formats.append({ embed_path = variadic(embed_path, (str, bytes, dict, set))
'format_id': 'blob', record_path = variadic(record_path, (str, bytes, dict, set))
'url': f'{endpoint}/xrpc/com.atproto.sync.getBlob?did={did}&cid={blob_cid}', record_subpath = variadic(record_subpath, (str, bytes, dict, set))
**traverse_obj(record_embed, {
'ext': ('video', 'mimeType', {mimetype2ext}), entries = []
'width': ('aspectRatio', 'width', {int_or_none}), if external_uri := traverse_obj(root, (
'height': ('aspectRatio', 'height', {int_or_none}), ((*record_path, *record_subpath), embed_path), 'external', 'uri', {url_or_none}, any)):
'filesize': ('video', 'size', {int_or_none}), entries.append(self.url_result(external_uri))
}), if playlist := traverse_obj(root, (*embed_path, 'playlist', {url_or_none})):
}) formats, subtitles = self._extract_m3u8_formats_and_subtitles(
video_info = { playlist, video_id, 'mp4', m3u8_id='hls', fatal=False)
'formats': formats,
'subtitles': subs,
**traverse_obj(post, {
'thumbnail': ('embed', 'thumbnail', {url_or_none}),
'alt_title': ('embed', 'alt'),
}),
}
elif quoted_type == 'app.bsky.embed.external':
return self.url_result(traverse_obj(quoted_media, ('external', 'uri')))
elif quoted_type == 'app.bsky.embed.video':
formats, subs = self._extract_m3u8_formats_and_subtitles(
quoted_media.get('playlist'), video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False,
note='Downloading m3u8 information', errnote='Unable to download m3u8 information')
if blob_cid := quoted_media.get('cid'):
quoted_did = traverse_obj(quoted_post, ('author', 'did'))
quoted_embed = traverse_obj(quoted_post, ('value', 'embed', ('media', None)), get_all=False)
endpoint = self.get_service_endpoint(quoted_did, video_id)
formats.append({
'format_id': 'blob',
'url': f'{endpoint}/xrpc/com.atproto.sync.getBlob?did={quoted_did}&cid={blob_cid}',
**traverse_obj(quoted_embed, {
'ext': ('video', 'mimeType', {mimetype2ext}),
'width': ('aspectRatio', 'width', {int_or_none}),
'height': ('aspectRatio', 'height', {int_or_none}),
'filesize': ('video', 'size', {int_or_none}),
}),
})
video_info = {
'formats': formats,
'subtitles': subs,
'thumbnail': url_or_none(quoted_media.get('thumbnail')),
'alt_title': quoted_embed.get('alt') or quoted_media.get('alt'),
}
else: else:
self.raise_no_formats('No video could be found in this post', expected=True) return entries
handle = traverse_obj(post, ('author', 'handle')) video_cid = traverse_obj(
uploader = traverse_obj(post, ('author', 'displayName')) or handle root, (*embed_path, 'cid', {str}),
(*record_path, *record_subpath, 'video', 'ref', '$link', {str}))
did = traverse_obj(root, ('author', 'did', {str}))
tags = traverse_obj(post, ('record', 'langs'), default=[]) if did and video_cid:
if label_list := post.get('labels'): endpoint = self._get_service_endpoint(did, video_id)
tags.extend(label.get('val') for label in label_list)
return { formats.append({
'format_id': 'blob',
'url': update_url_query(
self._BLOB_URL_TMPL % endpoint, {'did': did, 'cid': video_cid}),
**traverse_obj(root, (*embed_path, 'aspectRatio', {
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
})),
**traverse_obj(root, (*record_path, *record_subpath, 'video', {
'filesize': ('size', {int_or_none}),
'ext': ('mimeType', {mimetype2ext}),
})),
})
for sub_data in traverse_obj(root, (
*record_path, *record_subpath, 'captions', lambda _, v: v['file']['ref']['$link'])):
subtitles.setdefault(sub_data.get('lang') or 'und', []).append({
'url': update_url_query(
self._BLOB_URL_TMPL % endpoint, {'did': did, 'cid': sub_data['file']['ref']['$link']}),
'ext': traverse_obj(sub_data, ('file', 'mimeType', {mimetype2ext})),
})
entries.append({
'id': video_id, 'id': video_id,
'title': f'{uploader} on Bluesky', 'formats': formats,
**video_info, 'subtitles': subtitles,
'uploader': uploader, **traverse_obj(root, {
'channel': handle, 'id': ('uri', {url_basename}),
'uploader_id': did, 'thumbnail': (*embed_path, 'thumbnail', {url_or_none}),
'channel_id': did, 'alt_title': (*embed_path, 'alt', {str}),
'uploader_url': f'https://bsky.app/profile/{handle}', 'uploader': ('author', 'displayName', {str}),
'channel_url': f'https://bsky.app/profile/{did}', 'uploader_id': ('author', 'handle', {str}),
'like_count': post.get('likeCount'), 'uploader_url': ('author', 'handle', {self._build_profile_url}),
'repost_count': post.get('repostCount'), 'channel_id': ('author', 'did', {str}),
'comment_count': post.get('replyCount'), 'channel_url': ('author', 'did', {self._build_profile_url}),
'tags': tags, 'like_count': ('likeCount', {int_or_none}),
'age_limit': 18 if {'sexual', 'porn', 'graphic-media'}.intersection(tags) else 0, 'repost_count': ('repostCount', {int_or_none}),
'__post_extractor': self.extract_comments(meta), 'comment_count': ('replyCount', {int_or_none}),
**traverse_obj(post, { 'timestamp': ('indexedAt', {parse_iso8601}),
'timestamp': ('record', 'createdAt', {parse_iso8601}), 'tags': ('labels', ..., 'val', {str}, all, {orderedSet}),
'description': ('record', 'text'), 'age_limit': (
'labels', ..., 'val', {lambda x: 18 if x in ('sexual', 'porn', 'graphic-media') else None}, any),
'description': (*record_path, 'text', {str}, any),
'title': (*record_path, 'text', {lambda x: truncate_string(x, 50)}),
}), }),
} })
return entries