diff --git a/yt_dlp/extractor/ninaprotocol.py b/yt_dlp/extractor/ninaprotocol.py
index 15372e0838..fd7bbf6d05 100644
--- a/yt_dlp/extractor/ninaprotocol.py
+++ b/yt_dlp/extractor/ninaprotocol.py
@@ -1,21 +1,18 @@
+import re
+import json
 from .common import InfoExtractor
 
-
 class NinaProtocolIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?ninaprotocol\.com/releases/(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?ninaprotocol\.com/releases/(?P<id>[a-zA-Z0-9\-]+)'
+
     _TESTS = [{
-        'url': ' https://www.ninaprotocol.com/releases/3xl-nina-label-mix-014',
-        'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
+        'url': 'https://www.ninaprotocol.com/releases/3xl-nina-label-mix-014',
+        'md5': 'TODO: md5 sum of the first 10241 bytes of the audio file (use --test)',
         'info_dict': {
-            'id': '1',
+            'id': '3xl-nina-label-mix-014',
             'ext': 'mp3',
             'title': '3XL - Nina Label Mix 014',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            # TODO more properties, either as:
-            # * A value
-            # * MD5 checksum; start the string with md5:
-            # * A regular expression; start the string with re:
-            # * Any Python type, e.g. int or float
+            # Add the thumbnail regex extraction here
         }
     }]
 
@@ -23,13 +20,38 @@ class NinaProtocolIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        # TODO more code goes here, for example ...
-        title = self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title')
+        # If the title is not within <h1> tags, adjust the regex below.
+        title = self._html_search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', default=None)
+
+        if not title:
+            self.report_warning(f'Could not extract title for {video_id}')
+            title = video_id  # Use a default title if none is found
+
+        # Extract JSON-like data within JavaScript
+        json_str = self._search_regex(
+            r'self\.__next_f\.push\(\[1,"24:\[\\"(.+?)\\"\]\]"\)',
+            webpage, 'JSON data', fatal=False)
+
+        # Parse JSON data if found
+        audio_url = None
+        if json_str:
+            try:
+                # Clean up the JSON string and load it
+                json_str = re.sub(r'\\u003c|\\u003e|\\u0026', '', json_str)
+                json_data = json.loads(f'[{json_str}]')  # Wrap in array brackets to form valid JSON
+                # Navigate through the JSON structure to find the audio URL
+                audio_url = json_data[0].get('animation_url')
+            except json.JSONDecodeError:
+                self.report_warning('Could not parse JSON data for audio URL.')
+
+        # Extract thumbnail
+        thumbnail = self._html_search_regex(
+            r'<img[^>]+src="([^"]+)"[^>]*alt="[^"]*"', webpage, 'thumbnail', fatal=False)
 
         return {
             'id': video_id,
             'title': title,
-            'description': self._og_search_description(webpage),
-            'uploader': self._search_regex(r'<div[^>]+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False),
-            # TODO more properties (see yt_dlp/extractor/common.py)
+            'url': audio_url,
+            'thumbnail': thumbnail,
+            # Add additional properties as needed
         }
\ No newline at end of file