Compare commits

...

19 Commits

Author SHA1 Message Date
Sergey M․
a3ccd6bd11 release 2017.03.24 2017-03-24 00:24:23 +07:00
Sergey M․
7963b6cba8 [ChangeLog] Actualize 2017-03-24 00:19:58 +07:00
Sergey M․
bea7af6947 [channel9] Remove expired comment and sort imports 2017-03-23 23:58:12 +07:00
Sergey M․
a5d783f525 [channel9] Extract more formats 2017-03-23 23:47:43 +07:00
Remita Amine
d0572557c2 [ninecninemedia] remove mp4 url extraction request 2017-03-23 13:53:07 +01:00
Remita Amine
52d5ecabd5 [bellmedia] add support for etalk.ca(closes #12447) 2017-03-23 13:52:45 +01:00
Remita Amine
b0f7f21cb9 [channel9] fix extraction(closes #11323) 2017-03-23 09:22:37 +01:00
Sergey M․
579c99a284 [cloudy] Fix extraction (closes #12525) 2017-03-22 23:48:06 +07:00
Remita Amine
ca5ed022e9 [hbo] add support for free episode urls and new formats extraction(closes #12519) 2017-03-22 17:28:53 +01:00
Sergey M․
391d076d7c [condenast] Fix extraction and style (closes #12526) 2017-03-22 23:22:14 +07:00
Sergey M․
c183e14f89 [viu] Relax _VALID_URL (closes #12529) 2017-03-22 22:26:59 +07:00
Sergey M․
093dad9e25 release 2017.03.22 2017-03-22 02:36:50 +07:00
Sergey M․
e8686e51d7 [ChangeLog] Actualize 2017-03-22 02:35:09 +07:00
Sergey M․
8e5a7c5e67 [pluralsight] Omit module title from video title (closes #12506) 2017-03-22 02:28:04 +07:00
Sergey M․
e1e35d1ac6 [pornhub] Improve extraction and style (closes #12515) 2017-03-22 01:59:27 +07:00
Throaway
21fbf0f955 [pornhub] Decode obfuscated video URL (closes #12470) 2017-03-22 01:51:45 +07:00
John Hawkinson
97952bdb78 [generic] Add test for Senate ISVP iframe embed 2017-03-22 01:12:14 +08:00
John Hawkinson
8a8cc339b6 [senateisvp] Allow https URL scheme for embeds 2017-03-20 23:35:13 +08:00
Vijay Singh
957f453429 [Openload.co] Fixed Extraction
They did it again. just a minor change though. here's quick fix
2017-03-20 16:15:00 +08:00
16 changed files with 364 additions and 344 deletions

View File

@@ -6,8 +6,8 @@
---
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.20*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.20**
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.24**
### Before submitting an *issue* make sure you have:
- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
[debug] youtube-dl version 2017.03.20
[debug] youtube-dl version 2017.03.24
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}

View File

@@ -1,3 +1,23 @@
version 2017.03.24
Extractors
- [9c9media] Remove mp4 URL extraction request
+ [bellmedia] Add support for etalk.ca and space.ca (#12447)
* [channel9] Fix extraction (#11323)
* [cloudy] Fix extraction (#12525)
+ [hbo] Add support for free episode URLs and new formats extraction (#12519)
* [condenast] Fix extraction and style (#12526)
* [viu] Relax URL regular expression (#12529)
version 2017.03.22
Extractors
- [pluralsight] Omit module title from video title (#12506)
* [pornhub] Decode obfuscated video URL (#12470, #12515)
* [senateisvp] Allow https URL scheme for embeds (#12512)
version 2017.03.20
Core

View File

@@ -312,8 +312,8 @@
- **GPUTechConf**
- **Groupon**
- **Hark**
- **HBO**
- **HBOEpisode**
- **hbo**
- **hbo:episode**
- **HearThisAt**
- **Heise**
- **HellPorno**

View File

@@ -21,10 +21,11 @@ class BellMediaIE(InfoExtractor):
animalplanet|
bravo|
mtv|
space
space|
etalk
)\.ca|
much\.com
)/.*?(?:\bvid=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
)/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
_TESTS = [{
'url': 'http://www.ctv.ca/video/player?vid=706966',
'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0',
@@ -58,6 +59,9 @@ class BellMediaIE(InfoExtractor):
}, {
'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430',
'only_matching': True,
}, {
'url': 'http://www.etalk.ca/video?videoid=663455',
'only_matching': True,
}]
_DOMAINS = {
'thecomedynetwork': 'comedy',
@@ -65,6 +69,7 @@ class BellMediaIE(InfoExtractor):
'sciencechannel': 'discsci',
'investigationdiscovery': 'invdisc',
'animalplanet': 'aniplan',
'etalk': 'ctv',
}
def _real_extract(self, url):

View File

@@ -4,62 +4,62 @@ import re
from .common import InfoExtractor
from ..utils import (
clean_html,
ExtractorError,
parse_filesize,
int_or_none,
parse_iso8601,
qualities,
unescapeHTML,
)
class Channel9IE(InfoExtractor):
'''
Common extractor for channel9.msdn.com.
The type of provided URL (video or playlist) is determined according to
meta Search.PageType from web page HTML rather than URL itself, as it is
not always possible to do.
'''
IE_DESC = 'Channel 9'
IE_NAME = 'channel9'
_VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
_VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
_TESTS = [{
'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
'md5': '32083d4eaf1946db6d454313f44510ca',
'info_dict': {
'id': 'Events/TechEd/Australia/2013/KOS002',
'ext': 'mp4',
'id': '6c413323-383a-49dc-88f9-a22800cab024',
'ext': 'wmv',
'title': 'Developer Kick-Off Session: Stuff We Love',
'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
'duration': 4576,
'thumbnail': r're:http://.*\.jpg',
'thumbnail': r're:https?://.*\.jpg',
'timestamp': 1377717420,
'upload_date': '20130828',
'session_code': 'KOS002',
'session_day': 'Day 1',
'session_room': 'Arena 1A',
'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
'Mads Kristensen'],
'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
},
}, {
'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
'info_dict': {
'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
'ext': 'mp4',
'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
'ext': 'wmv',
'title': 'Self-service BI with Power BI - nuclear testing',
'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
'duration': 1540,
'thumbnail': r're:http://.*\.jpg',
'thumbnail': r're:https?://.*\.jpg',
'timestamp': 1386381991,
'upload_date': '20131207',
'authors': ['Mike Wilmot'],
},
}, {
# low quality mp4 is best
'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
'info_dict': {
'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
'ext': 'mp4',
'title': 'Ranges for the Standard Library',
'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
'duration': 5646,
'thumbnail': r're:http://.*\.jpg',
'thumbnail': r're:https?://.*\.jpg',
'upload_date': '20150930',
'timestamp': 1443640735,
},
'params': {
'skip_download': True,
@@ -70,7 +70,7 @@ class Channel9IE(InfoExtractor):
'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
'title': 'Channel 9',
},
'playlist_count': 2,
'playlist_mincount': 100,
}, {
'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
'only_matching': True,
@@ -81,189 +81,6 @@ class Channel9IE(InfoExtractor):
_RSS_URL = 'http://channel9.msdn.com/%s/RSS'
def _formats_from_html(self, html):
FORMAT_REGEX = r'''
(?x)
<a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
<span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
(?:<div\s+class="popup\s+rounded">\s*
<h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
</div>)? # File size part may be missing
'''
quality = qualities((
'MP3', 'MP4',
'Low Quality WMV', 'Low Quality MP4',
'Mid Quality WMV', 'Mid Quality MP4',
'High Quality WMV', 'High Quality MP4'))
formats = [{
'url': x.group('url'),
'format_id': x.group('quality'),
'format_note': x.group('note'),
'format': '%s (%s)' % (x.group('quality'), x.group('note')),
'filesize_approx': parse_filesize(x.group('filesize')),
'quality': quality(x.group('quality')),
'vcodec': 'none' if x.group('note') == 'Audio only' else None,
} for x in list(re.finditer(FORMAT_REGEX, html))]
self._sort_formats(formats)
return formats
def _extract_title(self, html):
title = self._html_search_meta('title', html, 'title')
if title is None:
title = self._og_search_title(html)
TITLE_SUFFIX = ' (Channel 9)'
if title is not None and title.endswith(TITLE_SUFFIX):
title = title[:-len(TITLE_SUFFIX)]
return title
def _extract_description(self, html):
DESCRIPTION_REGEX = r'''(?sx)
<div\s+class="entry-content">\s*
<div\s+id="entry-body">\s*
(?P<description>.+?)\s*
</div>\s*
</div>
'''
m = re.search(DESCRIPTION_REGEX, html)
if m is not None:
return m.group('description')
return self._html_search_meta('description', html, 'description')
def _extract_duration(self, html):
m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
def _extract_slides(self, html):
m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
return m.group('slidesurl') if m is not None else None
def _extract_zip(self, html):
m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
return m.group('zipurl') if m is not None else None
def _extract_avg_rating(self, html):
m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
return float(m.group('avgrating')) if m is not None else 0
def _extract_rating_count(self, html):
m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
def _extract_view_count(self, html):
m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
def _extract_comment_count(self, html):
m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
def _fix_count(self, count):
return int(str(count).replace(',', '')) if count is not None else None
def _extract_authors(self, html):
m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
if m is None:
return None
return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
def _extract_session_code(self, html):
m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
return m.group('code') if m is not None else None
def _extract_session_day(self, html):
m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
return m.group('day').strip() if m is not None else None
def _extract_session_room(self, html):
m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
return m.group('room') if m is not None else None
def _extract_session_speakers(self, html):
return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
def _extract_content(self, html, content_path):
# Look for downloadable content
formats = self._formats_from_html(html)
slides = self._extract_slides(html)
zip_ = self._extract_zip(html)
# Nothing to download
if len(formats) == 0 and slides is None and zip_ is None:
self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
return
# Extract meta
title = self._extract_title(html)
description = self._extract_description(html)
thumbnail = self._og_search_thumbnail(html)
duration = self._extract_duration(html)
avg_rating = self._extract_avg_rating(html)
rating_count = self._extract_rating_count(html)
view_count = self._extract_view_count(html)
comment_count = self._extract_comment_count(html)
common = {
'_type': 'video',
'id': content_path,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'avg_rating': avg_rating,
'rating_count': rating_count,
'view_count': view_count,
'comment_count': comment_count,
}
result = []
if slides is not None:
d = common.copy()
d.update({'title': title + '-Slides', 'url': slides})
result.append(d)
if zip_ is not None:
d = common.copy()
d.update({'title': title + '-Zip', 'url': zip_})
result.append(d)
if len(formats) > 0:
d = common.copy()
d.update({'title': title, 'formats': formats})
result.append(d)
return result
def _extract_entry_item(self, html, content_path):
contents = self._extract_content(html, content_path)
if contents is None:
return contents
if len(contents) > 1:
raise ExtractorError('Got more than one entry')
result = contents[0]
result['authors'] = self._extract_authors(html)
return result
def _extract_session(self, html, content_path):
contents = self._extract_content(html, content_path)
if contents is None:
return contents
session_meta = {
'session_code': self._extract_session_code(html),
'session_day': self._extract_session_day(html),
'session_room': self._extract_session_room(html),
'session_speakers': self._extract_session_speakers(html),
}
for content in contents:
content.update(session_meta)
return self.playlist_result(contents)
def _extract_list(self, video_id, rss_url=None):
if not rss_url:
rss_url = self._RSS_URL % video_id
@@ -274,9 +91,7 @@ class Channel9IE(InfoExtractor):
return self.playlist_result(entries, video_id, title_text)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
content_path = mobj.group('contentpath')
rss = mobj.group('rss')
content_path, rss = re.match(self._VALID_URL, url).groups()
if rss:
return self._extract_list(content_path, url)
@@ -284,17 +99,158 @@ class Channel9IE(InfoExtractor):
webpage = self._download_webpage(
url, content_path, 'Downloading web page')
page_type = self._search_regex(
r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
webpage, 'page type', default=None, group='pagetype')
if page_type:
if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content
return self._extract_entry_item(webpage, content_path)
elif page_type == 'Session': # Event session page, may contain downloadable content
return self._extract_session(webpage, content_path)
elif page_type == 'Event':
return self._extract_list(content_path)
episode_data = self._search_regex(
r"data-episode='([^']+)'", webpage, 'episode data', default=None)
if episode_data:
episode_data = self._parse_json(unescapeHTML(
episode_data), content_path)
content_id = episode_data['contentId']
is_session = '/Sessions(' in episode_data['api']
content_url = 'https://channel9.msdn.com/odata' + episode_data['api']
if is_session:
content_url += '?$expand=Speakers'
else:
raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
else: # Assuming list
content_url += '?$expand=Authors'
content_data = self._download_json(content_url, content_id)
title = content_data['Title']
QUALITIES = (
'mp3',
'wmv', 'mp4',
'wmv-low', 'mp4-low',
'wmv-mid', 'mp4-mid',
'wmv-high', 'mp4-high',
)
quality_key = qualities(QUALITIES)
def quality(quality_id, format_url):
return (len(QUALITIES) if '_Source.' in format_url
else quality_key(quality_id))
formats = []
urls = set()
SITE_QUALITIES = {
'MP3': 'mp3',
'MP4': 'mp4',
'Low Quality WMV': 'wmv-low',
'Low Quality MP4': 'mp4-low',
'Mid Quality WMV': 'wmv-mid',
'Mid Quality MP4': 'mp4-mid',
'High Quality WMV': 'wmv-high',
'High Quality MP4': 'mp4-high',
}
formats_select = self._search_regex(
r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage,
'formats select', default=None)
if formats_select:
for mobj in re.finditer(
r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<',
formats_select):
format_url = mobj.group('url')
if format_url in urls:
continue
urls.add(format_url)
format_id = mobj.group('format')
quality_id = SITE_QUALITIES.get(format_id, format_id)
formats.append({
'url': format_url,
'format_id': quality_id,
'quality': quality(quality_id, format_url),
'vcodec': 'none' if quality_id == 'mp3' else None,
})
API_QUALITIES = {
'VideoMP4Low': 'mp4-low',
'VideoWMV': 'wmv-mid',
'VideoMP4Medium': 'mp4-mid',
'VideoMP4High': 'mp4-high',
'VideoWMVHQ': 'wmv-hq',
}
for format_id, q in API_QUALITIES.items():
q_url = content_data.get(format_id)
if not q_url or q_url in urls:
continue
urls.add(q_url)
formats.append({
'url': q_url,
'format_id': q,
'quality': quality(q, q_url),
})
self._sort_formats(formats)
slides = content_data.get('Slides')
zip_file = content_data.get('ZipFile')
if not formats and not slides and not zip_file:
raise ExtractorError(
'None of recording, slides or zip are available for %s' % content_path)
subtitles = {}
for caption in content_data.get('Captions', []):
caption_url = caption.get('Url')
if not caption_url:
continue
subtitles.setdefault(caption.get('Language', 'en'), []).append({
'url': caption_url,
'ext': 'vtt',
})
common = {
'id': content_id,
'title': title,
'description': clean_html(content_data.get('Description') or content_data.get('Body')),
'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
'timestamp': parse_iso8601(content_data.get('PublishedDate')),
'avg_rating': int_or_none(content_data.get('Rating')),
'rating_count': int_or_none(content_data.get('RatingCount')),
'view_count': int_or_none(content_data.get('Views')),
'comment_count': int_or_none(content_data.get('CommentCount')),
'subtitles': subtitles,
}
if is_session:
speakers = []
for s in content_data.get('Speakers', []):
speaker_name = s.get('FullName')
if not speaker_name:
continue
speakers.append(speaker_name)
common.update({
'session_code': content_data.get('Code'),
'session_room': content_data.get('Room'),
'session_speakers': speakers,
})
else:
authors = []
for a in content_data.get('Authors', []):
author_name = a.get('DisplayName')
if not author_name:
continue
authors.append(author_name)
common['authors'] = authors
contents = []
if slides:
d = common.copy()
d.update({'title': title + '-Slides', 'url': slides})
contents.append(d)
if zip_file:
d = common.copy()
d.update({'title': title + '-Zip', 'url': zip_file})
contents.append(d)
if formats:
d = common.copy()
d.update({'title': title, 'formats': formats})
contents.append(d)
return self.playlist_result(contents)
else:
return self._extract_list(content_path)

View File

@@ -1,97 +1,56 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_HTTPError,
)
from ..utils import (
ExtractorError,
HEADRequest,
remove_end,
str_to_int,
unified_strdate,
)
class CloudyIE(InfoExtractor):
_IE_DESC = 'cloudy.ec'
_VALID_URL = r'''(?x)
https?://(?:www\.)?cloudy\.ec/
(?:v/|embed\.php\?id=)
(?P<id>[A-Za-z0-9]+)
'''
_EMBED_URL = 'http://www.cloudy.ec/embed.php?id=%s'
_API_URL = 'http://www.cloudy.ec/api/player.api.php'
_MAX_TRIES = 2
_TEST = {
_VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'https://www.cloudy.ec/v/af511e2527aac',
'md5': '5cb253ace826a42f35b4740539bedf07',
'md5': '29832b05028ead1b58be86bf319397ca',
'info_dict': {
'id': 'af511e2527aac',
'ext': 'flv',
'ext': 'mp4',
'title': 'Funny Cats and Animals Compilation june 2013',
'upload_date': '20130913',
'view_count': int,
}
}
def _extract_video(self, video_id, file_key, error_url=None, try_num=0):
if try_num > self._MAX_TRIES - 1:
raise ExtractorError('Unable to extract video URL', expected=True)
form = {
'file': video_id,
'key': file_key,
}
if error_url:
form.update({
'numOfErrors': try_num,
'errorCode': '404',
'errorUrl': error_url,
})
player_data = self._download_webpage(
self._API_URL, video_id, 'Downloading player data', query=form)
data = compat_parse_qs(player_data)
try_num += 1
if 'error' in data:
raise ExtractorError(
'%s error: %s' % (self.IE_NAME, ' '.join(data['error_msg'])),
expected=True)
title = data.get('title', [None])[0]
if title:
title = remove_end(title, '&asdasdas').strip()
video_url = data.get('url', [None])[0]
if video_url:
try:
self._request_webpage(HEADRequest(video_url), video_id, 'Checking video URL')
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in [404, 410]:
self.report_warning('Invalid video URL, requesting another', video_id)
return self._extract_video(video_id, file_key, video_url, try_num)
return {
'id': video_id,
'url': video_url,
'title': title,
}
}, {
'url': 'http://www.cloudy.ec/embed.php?autoplay=1&id=af511e2527aac',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id = self._match_id(url)
url = self._EMBED_URL % video_id
webpage = self._download_webpage(url, video_id)
webpage = self._download_webpage(
'http://www.cloudy.ec/embed.php?id=%s' % video_id, video_id)
file_key = self._search_regex(
[r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'],
webpage, 'file_key')
info = self._parse_html5_media_entries(url, webpage, video_id)[0]
return self._extract_video(video_id, file_key)
webpage = self._download_webpage(
'https://www.cloudy.ec/v/%s' % video_id, video_id, fatal=False)
if webpage:
info.update({
'title': self._search_regex(
r'<h\d[^>]*>([^<]+)<', webpage, 'title'),
'upload_date': unified_strdate(self._search_regex(
r'>Published at (\d{4}-\d{1,2}-\d{1,2})', webpage,
'upload date', fatal=False)),
'view_count': str_to_int(self._search_regex(
r'([\d,.]+) views<', webpage, 'view count', fatal=False)),
})
if not info.get('title'):
info['title'] = video_id
info['id'] = video_id
return info

View File

@@ -9,13 +9,14 @@ from ..compat import (
compat_urlparse,
)
from ..utils import (
orderedSet,
remove_end,
extract_attributes,
mimetype2ext,
determine_ext,
extract_attributes,
int_or_none,
js_to_json,
mimetype2ext,
orderedSet,
parse_iso8601,
remove_end,
)
@@ -66,6 +67,16 @@ class CondeNastIE(InfoExtractor):
'upload_date': '20130314',
'timestamp': 1363219200,
}
}, {
'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series',
'info_dict': {
'id': '58d1865bfd2e6126e2000015',
'ext': 'mp4',
'title': 'The Only True Surprise? Trumps an Idiot',
'uploader': 'gq',
'upload_date': '20170321',
'timestamp': 1490126427,
},
}, {
# JS embed
'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js',
@@ -114,26 +125,33 @@ class CondeNastIE(InfoExtractor):
})
video_id = query['videoId']
video_info = None
info_page = self._download_webpage(
info_page = self._download_json(
'http://player.cnevids.com/player/video.js',
video_id, 'Downloading video info', query=query, fatal=False)
video_id, 'Downloading video info', fatal=False, query=query)
if info_page:
video_info = self._parse_json(self._search_regex(
r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video']
else:
video_info = info_page.get('video')
if not video_info:
info_page = self._download_webpage(
'http://player.cnevids.com/player/loader.js',
video_id, 'Downloading loader info', query=query)
video_info = self._parse_json(self._search_regex(
r'var\s+video\s*=\s*({.+?});', info_page, 'video info'), video_id)
video_info = self._parse_json(
self._search_regex(
r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'),
video_id, transform_source=js_to_json)['video']
title = video_info['title']
formats = []
for fdata in video_info.get('sources', [{}])[0]:
for fdata in video_info['sources']:
src = fdata.get('src')
if not src:
continue
ext = mimetype2ext(fdata.get('type')) or determine_ext(src)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
src, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
continue
quality = fdata.get('quality')
formats.append({
'format_id': ext + ('-%s' % quality if quality else ''),
@@ -169,7 +187,6 @@ class CondeNastIE(InfoExtractor):
path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))
url_type = 'embed'
self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site])
webpage = self._download_webpage(url, item_id)
if url_type == 'series':

View File

@@ -1542,6 +1542,17 @@ class GenericIE(InfoExtractor):
'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm',
'only_matching': True,
},
{
# Senate ISVP iframe https
'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security',
'md5': 'fb8c70b0b515e5037981a2492099aab8',
'info_dict': {
'id': 'govtaff020316',
'ext': 'mp4',
'title': 'Integrated Senate Video Player',
},
'add_ie': [SenateISVPIE.ie_key()],
},
# {
# # TODO: find another test
# # http://schema.org/VideoObject

View File

@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
xpath_text,
xpath_element,
@@ -14,14 +15,26 @@ from ..utils import (
class HBOBaseIE(InfoExtractor):
_FORMATS_INFO = {
'pro7': {
'width': 1280,
'height': 720,
},
'1920': {
'width': 1280,
'height': 720,
},
'pro6': {
'width': 768,
'height': 432,
},
'640': {
'width': 768,
'height': 432,
},
'pro5': {
'width': 640,
'height': 360,
},
'highwifi': {
'width': 640,
'height': 360,
@@ -78,6 +91,17 @@ class HBOBaseIE(InfoExtractor):
formats.extend(self._extract_m3u8_formats(
video_url.replace('.tar', '/base_index_w8.m3u8'),
video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
elif source.tag == 'hls':
# #EXT-X-BYTERANGE is not supported by native hls downloader
# and ffmpeg (#10955)
# formats.extend(self._extract_m3u8_formats(
# video_url.replace('.tar', '/base_index.m3u8'),
# video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
continue
elif source.tag == 'dash':
formats.extend(self._extract_mpd_formats(
video_url.replace('.tar', '/manifest.mpd'),
video_id, mpd_id='dash', fatal=False))
else:
format_info = self._FORMATS_INFO.get(source.tag, {})
formats.append({
@@ -112,10 +136,11 @@ class HBOBaseIE(InfoExtractor):
class HBOIE(HBOBaseIE):
IE_NAME = 'hbo'
_VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839',
'md5': '1c33253f0c7782142c993c0ba62a8753',
'md5': '2c6a6bc1222c7e91cb3334dad1746e5a',
'info_dict': {
'id': '1437839',
'ext': 'mp4',
@@ -131,11 +156,12 @@ class HBOIE(HBOBaseIE):
class HBOEpisodeIE(HBOBaseIE):
_VALID_URL = r'https?://(?:www\.)?hbo\.com/(?!video)([^/]+/)+video/(?P<id>[0-9a-z-]+)\.html'
IE_NAME = 'hbo:episode'
_VALID_URL = r'https?://(?:www\.)?hbo\.com/(?P<path>(?!video)(?:(?:[^/]+/)+video|watch-free-episodes)/(?P<id>[0-9a-z-]+))(?:\.html)?'
_TESTS = [{
'url': 'http://www.hbo.com/girls/episodes/5/52-i-love-you-baby/video/ep-52-inside-the-episode.html?autoplay=true',
'md5': '689132b253cc0ab7434237fc3a293210',
'md5': '61ead79b9c0dfa8d3d4b07ef4ac556fb',
'info_dict': {
'id': '1439518',
'display_id': 'ep-52-inside-the-episode',
@@ -147,16 +173,19 @@ class HBOEpisodeIE(HBOBaseIE):
}, {
'url': 'http://www.hbo.com/game-of-thrones/about/video/season-5-invitation-to-the-set.html?autoplay=true',
'only_matching': True,
}, {
'url': 'http://www.hbo.com/watch-free-episodes/last-week-tonight-with-john-oliver',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
path, display_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id)
content = self._download_json(
'http://www.hbo.com/api/content/' + path, display_id)['content']
video_id = self._search_regex(
r'(?P<q1>[\'"])videoId(?P=q1)\s*:\s*(?P<q2>[\'"])(?P<video_id>\d+)(?P=q2)',
webpage, 'video ID', group='video_id')
video_id = compat_str((content.get('parsed', {}).get(
'common:FullBleedVideo', {}) or content['selectedEpisode'])['videoId'])
info_dict = self._extract_from_id(video_id)
info_dict['display_id'] = display_id

View File

@@ -34,12 +34,6 @@ class NineCNineMediaStackIE(NineCNineMediaBaseIE):
formats.extend(self._extract_f4m_formats(
stack_base_url + 'f4m', stack_id,
f4m_id='hds', fatal=False))
mp4_url = self._download_webpage(stack_base_url + 'pd', stack_id, fatal=False)
if mp4_url:
formats.append({
'url': mp4_url,
'format_id': 'mp4',
})
self._sort_formats(formats)
return {

View File

@@ -110,7 +110,7 @@ class OpenloadIE(InfoExtractor):
elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60:
i = int(C, 10)
h += 1
index = H % 12
index = H % 7
A = hashMap[index]
i ^= 213

View File

@@ -40,7 +40,7 @@ class PluralsightIE(PluralsightBaseIE):
'info_dict': {
'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04',
'ext': 'mp4',
'title': 'Management of SQL Server - Demo Monitoring',
'title': 'Demo Monitoring',
'duration': 338,
},
'skip': 'Requires pluralsight account credentials',
@@ -187,7 +187,7 @@ class PluralsightIE(PluralsightBaseIE):
if not clip:
raise ExtractorError('Unable to resolve clip')
title = '%s - %s' % (module['title'], clip['title'])
title = clip['title']
QUALITIES = {
'low': {'width': 640, 'height': 480},

View File

@@ -1,7 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
import functools
import itertools
import operator
# import os
import re
@@ -18,6 +20,7 @@ from ..utils import (
js_to_json,
orderedSet,
# sanitized_Request,
remove_quotes,
str_to_int,
)
# from ..aes import (
@@ -129,9 +132,32 @@ class PornHubIE(InfoExtractor):
tv_webpage = dl_webpage('tv')
video_url = self._search_regex(
r'<video[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//.+?)\1', tv_webpage,
'video url', group='url')
assignments = self._search_regex(
r'(var.+?mediastring.+?)</script>', tv_webpage,
'encoded url').split(';')
js_vars = {}
def parse_js_value(inp):
inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
if '+' in inp:
inps = inp.split('+')
return functools.reduce(
operator.concat, map(parse_js_value, inps))
inp = inp.strip()
if inp in js_vars:
return js_vars[inp]
return remove_quotes(inp)
for assn in assignments:
assn = assn.strip()
if not assn:
continue
assn = re.sub(r'var\s+', '', assn)
vname, value = assn.split('=', 1)
js_vars[vname] = parse_js_value(value)
video_url = js_vars['mediastring']
title = self._search_regex(
r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)

View File

@@ -89,7 +89,7 @@ class SenateISVPIE(InfoExtractor):
@staticmethod
def _search_iframe_url(webpage):
mobj = re.search(
r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
webpage)
if mobj:
return mobj.group('url')

View File

@@ -44,7 +44,7 @@ class ViuBaseIE(InfoExtractor):
class ViuIE(ViuBaseIE):
_VALID_URL = r'(?:viu:|https?://www\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)'
_VALID_URL = r'(?:viu:|https?://[^/]+\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.viu.com/en/media/1116705532?containerId=playlist-22168059',
'info_dict': {
@@ -69,6 +69,9 @@ class ViuIE(ViuBaseIE):
'skip_download': 'm3u8 download',
},
'skip': 'Geo-restricted to Indonesia',
}, {
'url': 'https://india.viu.com/en/media/1126286865',
'only_matching': True,
}]
def _real_extract(self, url):

View File

@@ -1,3 +1,3 @@
from __future__ import unicode_literals
__version__ = '2017.03.20'
__version__ = '2017.03.24'