[NinaProtocol] Updated code to a working implementation. There is a minor bug, however, and any help with it would be appreciated.

2024-11-13 20:53:06 +00:00 · 2023-12-13 18:03:04 -05:00 · 2023-12-13 18:03:04 -05:00 · b8671868f7
commit b8671868f7
parent 21bda14aaa
1 changed files with 38 additions and 16 deletions
--- a/yt_dlp/extractor/ninaprotocol.py
+++ b/yt_dlp/extractor/ninaprotocol.py
@ -1,21 +1,18 @@
+import re
+import json
 from .common import InfoExtractor

-
 class NinaProtocolIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?ninaprotocol\.com/releases/(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?ninaprotocol\.com/releases/(?P<id>[a-zA-Z0-9\-]+)'
+
    _TESTS = [{
-        'url': ' https://www.ninaprotocol.com/releases/3xl-nina-label-mix-014',
-        'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
+        'url': 'https://www.ninaprotocol.com/releases/3xl-nina-label-mix-014',
+        'md5': 'TODO: md5 sum of the first 10241 bytes of the audio file (use --test)',
        'info_dict': {
-            'id': '1',
+            'id': '3xl-nina-label-mix-014',
            'ext': 'mp3',
            'title': '3XL - Nina Label Mix 014',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            # TODO more properties, either as:
-            # * A value
-            # * MD5 checksum; start the string with md5:
-            # * A regular expression; start the string with re:
-            # * Any Python type, e.g. int or float
+            # Add the thumbnail regex extraction here
        }
    }]

@ -23,13 +20,38 @@ class NinaProtocolIE(InfoExtractor):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

-        # TODO more code goes here, for example ...
-        title = self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title')
+        # The title is expected inside <div class="title">; adjust the regex below if the markup differs.
+        title = self._html_search_regex(r'<div class="title">([^<]+)</div>', webpage, 'title', default=None)
+
+        if not title:
+            self.report_warning(f'Could not extract title for {video_id}')
+            title = video_id  # Use a default title if none is found
+
+        # Extract JSON-like data within JavaScript
+        json_str = self._search_regex(
+        r'self\.__next_f\.push\(\[1,"24:\[\\"(.+?)\\"\]\]"\)',
+        webpage, 'JSON data', fatal=False)
+
+        # Parse JSON data if found
+        audio_url = None
+        if json_str:
+            try:
+                # Clean up the JSON string and load it
+                json_str = re.sub(r'\\u003c|\\u003e|\\u0026', '', json_str) 
+                json_data = json.loads(f'[{json_str}]')  # Wrap in array brackets to form valid JSON
+                # Navigate through the JSON structure to find the audio URL
+                audio_url = json_data[0].get('animation_url')
+            except json.JSONDecodeError:
+                self.report_warning('Could not parse JSON data for audio URL.')
+
+        # Extract thumbnail
+        thumbnail = self._html_search_regex(
+            r'<img[^>]+src="([^"]+)"[^>]*alt="[^"]*"', webpage, 'thumbnail', fatal=False)

        return {
            'id': video_id,
            'title': title,
-            'description': self._og_search_description(webpage),
-            'uploader': self._search_regex(r'<div[^>]+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False),
-            # TODO more properties (see yt_dlp/extractor/common.py)
+            'url': audio_url,
+            'thumbnail': thumbnail,
+            # Add additional properties as needed
        }