[NinaProtocol] Updated the code to a working implementation. There is still a minor bug, however, and any help with it would be appreciated.

This commit is contained in:
Abhay Walia 2023-12-13 18:03:04 -05:00
parent 21bda14aaa
commit b8671868f7

View File

@ -1,21 +1,18 @@
import re
import json
from .common import InfoExtractor
class NinaProtocolIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ninaprotocol\.com/releases/(?P<id>[0-9]+)'
_VALID_URL = r'https?://(?:www\.)?ninaprotocol\.com/releases/(?P<id>[a-zA-Z0-9\-]+)'
_TESTS = [{
'url': ' https://www.ninaprotocol.com/releases/3xl-nina-label-mix-014',
'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
'url': 'https://www.ninaprotocol.com/releases/3xl-nina-label-mix-014',
'md5': 'TODO: md5 sum of the first 10241 bytes of the audio file (use --test)',
'info_dict': {
'id': '1',
'id': '3xl-nina-label-mix-014',
'ext': 'mp3',
'title': '3XL - Nina Label Mix 014',
'thumbnail': r're:^https?://.*\.jpg$',
# TODO more properties, either as:
# * A value
# * MD5 checksum; start the string with md5:
# * A regular expression; start the string with re:
# * Any Python type, e.g. int or float
# Add the thumbnail regex extraction here
}
}]
@ -23,13 +20,38 @@ class NinaProtocolIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
# TODO more code goes here, for example ...
title = self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title')
# If the title is not within <h1> tags, adjust the regex below.
title = self._html_search_regex(r'<div class="title">([^<]+)</div>', webpage, 'title', default=None)
if not title:
self.report_warning(f'Could not extract title for {video_id}')
title = video_id # Use a default title if none is found
# Extract JSON-like data within JavaScript
json_str = self._search_regex(
r'self\.__next_f\.push\(\[1,"24:\[\\"(.+?)\\"\]\]"\)',
webpage, 'JSON data', fatal=False)
# Parse JSON data if found
audio_url = None
if json_str:
try:
# Clean up the JSON string and load it
json_str = re.sub(r'\\u003c|\\u003e|\\u0026', '', json_str)
json_data = json.loads(f'[{json_str}]') # Wrap in array brackets to form valid JSON
# Navigate through the JSON structure to find the audio URL
audio_url = json_data[0].get('animation_url')
except json.JSONDecodeError:
self.report_warning('Could not parse JSON data for audio URL.')
# Extract thumbnail
thumbnail = self._html_search_regex(
r'<img[^>]+src="([^"]+)"[^>]*alt="[^"]*"', webpage, 'thumbnail', fatal=False)
return {
'id': video_id,
'title': title,
'description': self._og_search_description(webpage),
'uploader': self._search_regex(r'<div[^>]+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False),
# TODO more properties (see yt_dlp/extractor/common.py)
'url': audio_url,
'thumbnail': thumbnail,
# Add additional properties as needed
}