From f171bc8b59ecf4560dd4076be56570a4f090d519 Mon Sep 17 00:00:00 2001 From: Purdea Andrei Date: Tue, 28 Jul 2015 18:14:06 +0300 Subject: [PATCH 1/5] [youtube] save keywords in info JSON when --write-info-json is used --- youtube_dl/extractor/youtube.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0e411bfb65..15e327ec8b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1072,6 +1072,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_categories = None + m = re.findall(r''''"]+?)['"]?\s*>''' + , video_webpage, re.DOTALL | re.IGNORECASE); + video_tags = ", ".join(m) # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: @@ -1259,6 +1262,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': video_title, 'thumbnail': video_thumbnail, 'description': video_description, + 'tags' : video_tags, 'categories': video_categories, 'subtitles': video_subtitles, 'automatic_captions': automatic_captions, From a6f774e9015995393a086273df8db1d7b0c098c4 Mon Sep 17 00:00:00 2001 From: Purdea Andrei Date: Tue, 28 Jul 2015 18:29:13 +0300 Subject: [PATCH 2/5] [youtube]: tags key in info JSON is now a list --- youtube_dl/extractor/youtube.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 15e327ec8b..c0fafbfd5c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1072,9 +1072,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_categories = None - m = re.findall(r''''"]+?)['"]?\s*>''' + video_tags = re.findall(r''''"]+?)['"]?\s*>''' , video_webpage, re.DOTALL | re.IGNORECASE); - video_tags = ", ".join(m) # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: From 5316bf7487b608b7c085950ff2fb0444f2c36dc0 Mon Sep 17 00:00:00 2001 
From: Purdea Andrei Date: Tue, 28 Jul 2015 18:30:42 +0300 Subject: [PATCH 3/5] Documented tags as a possible dict key --- youtube_dl/extractor/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 14b9b4fe23..a227aeb9cf 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -187,6 +187,7 @@ class InfoExtractor(object): specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. + tags: A list of keywords attached to the video. Unless mentioned otherwise, the fields should be Unicode strings. From 864f24bd2c0cf9bde034812a2049c3750c1bb05c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 29 Jul 2015 03:43:03 +0600 Subject: [PATCH 4/5] [extractor/common] Add _meta_regex and clarify tags field --- youtube_dl/extractor/common.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a227aeb9cf..d54866d1f7 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -181,13 +181,13 @@ class InfoExtractor(object): by YoutubeDL if it's missing) categories: A list of categories that the video falls in, for example ["Sports", "Berlin"] + tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] is_live: True, False, or None (=unknown). Whether this video is a live stream that goes on instead of a fixed-length video. start_time: Time in seconds where the reproduction should start, as specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. - tags: A list of keywords attached to the video. Unless mentioned otherwise, the fields should be Unicode strings. 
@@ -631,6 +631,12 @@ class InfoExtractor(object): template % (content_re, property_re), ] + @staticmethod + def _meta_regex(prop): + return r'''(?isx)<meta + (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) + [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) + def _og_search_property(self, prop, html, name=None, **kargs): if name is None: name = 'OpenGraph %s' % prop @@ -661,9 +667,7 @@ class InfoExtractor(object): if display_name is None: display_name = name return self._html_search_regex( - r'''(?isx)<meta - (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) - [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name), + self._meta_regex(name), html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): From 000b6b5ae5cc214906effe4ac5b78b579bc7db70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 29 Jul 2015 03:43:32 +0600 Subject: [PATCH 5/5] [youtube] Improve tags extraction and add test --- youtube_dl/extractor/youtube.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c0fafbfd5c..4c449fd741 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -329,6 +329,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], + 'tags': ['youtube-dl'], 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -343,7 +344,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. 
Charli XCX) [OFFICIAL VIDEO]', - 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f', + 'description': 'md5:782e8651347686cba06e58f71ab51773', + 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', + 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', + 'iconic ep', 'iconic', 'love', 'it'], 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', } @@ -1072,8 +1076,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_categories = None - video_tags = re.findall(r''''"]+?)['"]?\s*>''' - , video_webpage, re.DOTALL | re.IGNORECASE); + video_tags = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: @@ -1261,8 +1267,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': video_title, 'thumbnail': video_thumbnail, 'description': video_description, - 'tags' : video_tags, 'categories': video_categories, + 'tags': video_tags, 'subtitles': video_subtitles, 'automatic_captions': automatic_captions, 'duration': video_duration,