[dispeak] Add new extractor

Both GDCVault and GPUTechConf use the DigitalSpeaking service.
This commit is contained in:
Yen Chi Hsuan 2016-04-21 19:36:33 +08:00
parent 99ef96f84c
commit ec59d657e7
No known key found for this signature in database
GPG Key ID: 3FDDD575826C5C30
4 changed files with 123 additions and 99 deletions

View File

@ -0,0 +1,111 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_duration,
remove_end,
xpath_element,
xpath_text,
)
class DigitalSpeakingIE(InfoExtractor):
    """Extractor for DigitalSpeaking XML media descriptions.

    The input URL points directly at an XML document whose <metadata>
    element describes either progressive MP4 renditions (<MBRVideos>) or
    RTMP/FLV streams (<akamaiHost>, <audios>, <slideVideo>,
    <speakerVideo>). MP4 is preferred; FLV is only used when no MP4
    rendition is listed.
    """

    # Dots are escaped so that they only match literal dots, the repeated
    # path-segment group is non-capturing, and both http and https are
    # accepted.
    _VALID_URL = r'https?://evt\.dispeak\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'

    _TEST = {
        # From http://evt.dispeak.com/ubm/gdc/sf16/xml/840376_BQRC.xml
        'url': 'http://evt.dispeak.com/ubm/gdc/sf16/xml/840376_BQRC.xml',
        'md5': 'a8efb6c31ed06ca8739294960b2dbabd',
        'info_dict': {
            'id': '840376_BQRC',
            'ext': 'mp4',
            'title': 'Tenacious Design and The Interface of \'Destiny\'',
        },
    }

    def _parse_mp4(self, metadata):
        """Build progressive MP4 formats from the <MBRVideos> entries.

        :param metadata: the <metadata> XML element.
        :returns: a list of format dicts ('url' and 'vbr'), or None when
            the metadata lists no <MBRVideo> element, in which case the
            caller falls back to the FLV streams.
        :raises: whatever ``xpath_text(..., fatal=True)`` raises when an
            <MBRVideo> has no <streamName>.
        """
        video_root = None

        # First choice: scheme and host of the absolute <mp4video> URL.
        mp4_video = xpath_text(metadata, './mp4video', default=None)
        if mp4_video is not None:
            mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video)
            # A value that is not an absolute URL is ignored instead of
            # crashing on ``None.group``; the fallbacks below then apply.
            if mobj:
                video_root = mobj.group('root')

        # Second choice: the host announced in <httpHost>.
        if video_root is None:
            http_host = xpath_text(metadata, 'httpHost', default=None)
            if http_host:
                video_root = 'http://%s/' % http_host

        if video_root is None:
            # Hard-coded in http://evt.dispeak.com/ubm/gdc/sf16/custom/player2.js
            # Works for GPUTechConf, too
            video_root = 'http://s3-2u.digitallyspeaking.com/'

        formats = metadata.findall('./MBRVideos/MBRVideo')
        if not formats:
            return None

        video_formats = []
        for a_format in formats:
            stream_name = xpath_text(a_format, 'streamName', fatal=True)
            # Stream names normally look like "mp4:<path>"; drop the
            # prefix when present and use the name as-is otherwise.
            video_path = re.sub(r'^mp4:', '', stream_name)
            video_formats.append({
                'url': video_root + video_path,
                'vbr': int_or_none(xpath_text(a_format, 'bitrate')),
            })
        return video_formats

    def _parse_flv(self, metadata):
        """Build RTMP/FLV formats: audio tracks, slide video, speaker video.

        :param metadata: the <metadata> XML element.
        :returns: a list of format dicts in the order audio tracks (if
            any), slides, speaker.
        :raises: whatever ``xpath_text(..., fatal=True)`` raises when
            <akamaiHost>, <slideVideo> or <speakerVideo> is missing.
        """
        akamai_url = xpath_text(metadata, './akamaiHost', fatal=True)
        # Every stream is served from the same RTMP application; only the
        # play path differs.
        rtmp_url = 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url

        formats = []

        audios = metadata.find('./audios')
        if audios is not None:
            for audio in audios:
                formats.append({
                    'url': rtmp_url,
                    'play_path': remove_end(audio.get('url'), '.flv'),
                    'ext': 'flv',
                    'vcodec': 'none',
                    'format_id': audio.get('code'),
                })

        slide_video_path = xpath_text(metadata, './slideVideo', fatal=True)
        formats.append({
            'url': rtmp_url,
            'play_path': remove_end(slide_video_path, '.flv'),
            'ext': 'flv',
            'format_note': 'slide deck video',
            'quality': -2,
            'preference': -2,
            'format_id': 'slides',
        })

        speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True)
        formats.append({
            'url': rtmp_url,
            'play_path': remove_end(speaker_video_path, '.flv'),
            'ext': 'flv',
            'format_note': 'speaker video',
            'quality': -1,
            'preference': -1,
            'format_id': 'speaker',
        })
        return formats

    def _real_extract(self, url):
        """Download the XML description at *url* and return the info dict."""
        video_id = self._match_id(url)

        xml_description = self._download_xml(url, video_id)
        metadata = xpath_element(xml_description, 'metadata')

        # Prefer the MP4 renditions; None means none were listed.
        video_formats = self._parse_mp4(metadata)
        if video_formats is None:
            video_formats = self._parse_flv(metadata)

        return {
            'id': video_id,
            'formats': video_formats,
            'title': xpath_text(metadata, 'title', fatal=True),
            'duration': parse_duration(xpath_text(metadata, 'endTime')),
            'creator': xpath_text(metadata, 'speaker'),
        }

View File

@ -197,6 +197,7 @@ from .dump import DumpIE
from .dumpert import DumpertIE from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE from .discovery import DiscoveryIE
from .dispeak import DigitalSpeakingIE
from .dropbox import DropboxIE from .dropbox import DropboxIE
from .dw import ( from .dw import (
DWIE, DWIE,

View File

@ -4,7 +4,6 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
remove_end,
HEADRequest, HEADRequest,
sanitized_Request, sanitized_Request,
urlencode_postdata, urlencode_postdata,
@ -64,66 +63,6 @@ class GDCVaultIE(InfoExtractor):
}, },
] ]
# Removed GDCVaultIE helper: builds a list of MP4 format dicts ('url', 'vbr')
# from the ./metadata/MBRVideos/MBRVideo elements of the XML description.
# Returns None when no MBRVideo element is found.
def _parse_mp4(self, xml_description):
video_formats = []
video_root = None
# Derive the download root (scheme + host) from the <mp4video> URL, if any.
mp4_video = xml_description.find('./metadata/mp4video')
if mp4_video is not None:
# NOTE(review): mobj is used without a None check; a non-matching value
# would raise AttributeError on .group().
mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text)
video_root = mobj.group('root')
if video_root is None:
# Hard-coded in http://evt.dispeak.com/ubm/gdc/sf16/custom/player2.js
video_root = 'http://s3-2u.digitallyspeaking.com/'
formats = xml_description.findall('./metadata/MBRVideos/MBRVideo')
if not formats:
return None
# NOTE(review): the loop variable shadows the builtin `format`.
for format in formats:
# The stream name is expected to look like "mp4:<path>"; the path is
# appended to the root to form the URL.
mobj = re.match(r'mp4\:(?P<path>.*)', format.find('streamName').text)
url = video_root + mobj.group('path')
# NOTE(review): int() raises if <bitrate> is missing or not numeric.
vbr = format.find('bitrate').text
video_formats.append({
'url': url,
'vbr': int(vbr),
})
return video_formats
# Removed GDCVaultIE helper: builds a list of RTMP/FLV format dicts from the
# XML description, in the order audio tracks (if any), slide deck video,
# speaker video. All entries share the same RTMP URL and differ by play_path.
def _parse_flv(self, xml_description):
formats = []
# NOTE(review): .text is read without checking that <akamaiHost> exists.
akamai_url = xml_description.find('./metadata/akamaiHost').text
# Optional audio-only tracks: one format per child of <audios>.
audios = xml_description.find('./metadata/audios')
if audios is not None:
for audio in audios:
formats.append({
'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
# The play path is the 'url' attribute with a trailing '.flv' removed.
'play_path': remove_end(audio.get('url'), '.flv'),
'ext': 'flv',
'vcodec': 'none',
'format_id': audio.get('code'),
})
# Slide deck video: given lower quality/preference (-2) than the speaker video.
slide_video_path = xml_description.find('./metadata/slideVideo').text
formats.append({
'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
'play_path': remove_end(slide_video_path, '.flv'),
'ext': 'flv',
'format_note': 'slide deck video',
'quality': -2,
'preference': -2,
'format_id': 'slides',
})
# Speaker video: quality/preference -1.
speaker_video_path = xml_description.find('./metadata/speakerVideo').text
formats.append({
'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
'play_path': remove_end(speaker_video_path, '.flv'),
'ext': 'flv',
'format_note': 'speaker video',
'quality': -1,
'preference': -1,
'format_id': 'speaker',
})
return formats
def _login(self, webpage_url, display_id): def _login(self, webpage_url, display_id):
(username, password) = self._get_login_info() (username, password) = self._get_login_info()
if username is None or password is None: if username is None or password is None:
@ -199,17 +138,10 @@ class GDCVaultIE(InfoExtractor):
r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>',
start_page, 'xml filename') start_page, 'xml filename')
xml_description = self._download_xml(
'%s/xml/%s' % (xml_root, xml_name), display_id)
video_title = xml_description.find('./metadata/title').text
video_formats = self._parse_mp4(xml_description)
if video_formats is None:
video_formats = self._parse_flv(xml_description)
return { return {
'_type': 'url_transparent',
'id': video_id, 'id': video_id,
'display_id': display_id, 'display_id': display_id,
'title': video_title, 'url': '%s/xml/%s' % (xml_root, xml_name),
'formats': video_formats, 'ie': 'DigitalSpeaking',
} }

View File

@ -2,12 +2,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
xpath_element,
xpath_text,
int_or_none,
parse_duration,
)
class GPUTechConfIE(InfoExtractor): class GPUTechConfIE(InfoExtractor):
@ -27,29 +21,15 @@ class GPUTechConfIE(InfoExtractor):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
root_path = self._search_regex(r'var\s+rootPath\s*=\s*"([^"]+)', webpage, 'root path', 'http://evt.dispeak.com/nvidia/events/gtc15/') root_path = self._search_regex(
xml_file_id = self._search_regex(r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id') r'var\s+rootPath\s*=\s*"([^"]+)', webpage, 'root path',
default='http://evt.dispeak.com/nvidia/events/gtc15/')
doc = self._download_xml('%sxml/%s.xml' % (root_path, xml_file_id), video_id) xml_file_id = self._search_regex(
r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id')
metadata = xpath_element(doc, 'metadata')
http_host = xpath_text(metadata, 'httpHost', 'http host', True)
mbr_videos = xpath_element(metadata, 'MBRVideos')
formats = []
for mbr_video in mbr_videos.findall('MBRVideo'):
stream_name = xpath_text(mbr_video, 'streamName')
if stream_name:
formats.append({
'url': 'http://%s/%s' % (http_host, stream_name.replace('mp4:', '')),
'tbr': int_or_none(xpath_text(mbr_video, 'bitrate')),
})
self._sort_formats(formats)
return { return {
'_type': 'url_transparent',
'id': video_id, 'id': video_id,
'title': xpath_text(metadata, 'title'), 'url': '%sxml/%s.xml' % (root_path, xml_file_id),
'duration': parse_duration(xpath_text(metadata, 'endTime')), 'ie': 'DigitalSpeaking',
'creator': xpath_text(metadata, 'speaker'),
'formats': formats,
} }