Merge pull request #2884 from tk0miya/refactor_intersphinx

Refactor intersphinx: Use requests package
This commit is contained in:
Takeshi KOMIYA 2016-08-27 16:43:54 +09:00 committed by GitHub
commit 07e30b279b
4 changed files with 113 additions and 168 deletions

View File

@ -14,10 +14,7 @@ import socket
import codecs
import threading
from os import path
import warnings
import pkg_resources
import requests
from requests.exceptions import HTTPError
from six.moves import queue
from six.moves.urllib.parse import unquote
@ -38,32 +35,7 @@ from sphinx.builders import Builder
from sphinx.util import encode_uri
from sphinx.util.console import purple, red, darkgreen, darkgray, \
darkred, turquoise
try:
pkg_resources.require(['requests[security]'])
except pkg_resources.DistributionNotFound:
import ssl
if not getattr(ssl, 'HAS_SNI', False):
# don't complain on each url processed about the SSL issue
requests.packages.urllib3.disable_warnings(
requests.packages.urllib3.exceptions.InsecurePlatformWarning)
warnings.warn(
'Some links may return broken results due to being unable to '
'check the Server Name Indication (SNI) in the returned SSL cert '
'against the hostname in the url requested. Recommended to '
'install "requests[security]" as a dependency or upgrade to '
'a python version with SNI support (Python 3 and Python 2.7.9+).'
)
except pkg_resources.UnknownExtra:
warnings.warn(
'Some links may return broken results due to being unable to '
'check the Server Name Indication (SNI) in the returned SSL cert '
'against the hostname in the url requested. Recommended to '
'install requests-2.4.1+.'
)
requests_user_agent = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) '
'Gecko/20100101 Firefox/25.0')]
from sphinx.util.requests import requests, useragent_header
class AnchorCheckParser(HTMLParser):
@ -118,7 +90,7 @@ class CheckExternalLinksBuilder(Builder):
open(path.join(self.outdir, 'output.txt'), 'w').close()
self.session = requests.Session()
self.session.headers = dict(requests_user_agent)
self.session.headers = dict(useragent_header)
# create queues and worker threads
self.wqueue = queue.Queue()

View File

@ -34,7 +34,6 @@ from os import path
import re
from six import iteritems, string_types
from six.moves.urllib import request
from six.moves.urllib.parse import urlsplit, urlunsplit
from docutils import nodes
from docutils.utils import relative_path
@ -42,17 +41,9 @@ from docutils.utils import relative_path
import sphinx
from sphinx.locale import _
from sphinx.builders.html import INVENTORY_FILENAME
from sphinx.util.requests import requests, useragent_header
default_handlers = [request.ProxyHandler(), request.HTTPRedirectHandler(),
request.HTTPHandler()]
try:
default_handlers.append(request.HTTPSHandler)
except AttributeError:
pass
default_opener = request.build_opener(*default_handlers)
UTF8StreamReader = codecs.lookup('utf-8')[2]
@ -125,6 +116,14 @@ def read_inventory_v2(f, uri, join, bufsize=16*1024):
return invdata
def read_inventory(f, uri, join, bufsize=16*1024):
line = f.readline().rstrip().decode('utf-8')
if line == '# Sphinx inventory version 1':
return read_inventory_v1(f, uri, join)
elif line == '# Sphinx inventory version 2':
return read_inventory_v2(f, uri, join, bufsize=bufsize)
def _strip_basic_auth(url):
"""Returns *url* with basic auth credentials removed. Also returns the
basic auth username and password if they're present in *url*.
@ -136,27 +135,14 @@ def _strip_basic_auth(url):
:param url: url which may or may not contain basic auth credentials
:type url: ``str``
:return: 3-``tuple`` of:
* (``str``) -- *url* with any basic auth creds removed
* (``str`` or ``NoneType``) -- basic auth username or ``None`` if basic
auth username not given
* (``str`` or ``NoneType``) -- basic auth password or ``None`` if basic
auth password not given
:rtype: ``tuple``
:return: *url* with any basic auth creds removed
:rtype: ``str``
"""
url_parts = urlsplit(url)
username = url_parts.username
password = url_parts.password
frags = list(url_parts)
frags = list(urlsplit(url))
# swap out "user[:pass]@hostname" for "hostname"
if url_parts.port:
frags[1] = "%s:%s" % (url_parts.hostname, url_parts.port)
else:
frags[1] = url_parts.hostname
url = urlunsplit(frags)
return (url, username, password)
if '@' in frags[1]:
frags[1] = frags[1].split('@')[1]
return urlunsplit(frags)
def _read_from_url(url):
@ -175,48 +161,35 @@ def _read_from_url(url):
:return: data read from resource described by *url*
:rtype: ``file``-like object
"""
url, username, password = _strip_basic_auth(url)
if username is not None and password is not None:
# case: url contains basic auth creds
password_mgr = request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, url, username, password)
handler = request.HTTPBasicAuthHandler(password_mgr)
opener = request.build_opener(*(default_handlers + [handler]))
else:
opener = default_opener
return opener.open(url)
r = requests.get(url, stream=True, headers=dict(useragent_header))
r.raise_for_status()
r.raw.url = r.url
return r.raw
def _get_safe_url(url):
"""Gets version of *url* with basic auth passwords obscured. This function
returns results suitable for printing and logging.
E.g.: https://user:12345@example.com => https://user:********@example.com
.. note::
The number of astrisks is invariant in the length of the basic auth
password, so minimal information is leaked.
E.g.: https://user:12345@example.com => https://user@example.com
:param url: a url
:type url: ``str``
:return: *url* with password obscured
:return: *url* with password removed
:rtype: ``str``
"""
safe_url = url
url, username, _ = _strip_basic_auth(url)
if username is not None:
# case: url contained basic auth creds; obscure password
url_parts = urlsplit(url)
safe_netloc = '{0}@{1}'.format(username, url_parts.hostname)
# replace original netloc w/ obscured version
frags = list(url_parts)
frags[1] = safe_netloc
safe_url = urlunsplit(frags)
parts = urlsplit(url)
if parts.username is None:
return url
else:
frags = list(parts)
if parts.port:
frags[1] = '{0}@{1}:{2}'.format(parts.username, parts.hostname, parts.port)
else:
frags[1] = '{0}@{1}'.format(parts.username, parts.hostname)
return safe_url
return urlunsplit(frags)
def fetch_inventory(app, uri, inv):
@ -226,8 +199,7 @@ def fetch_inventory(app, uri, inv):
localuri = '://' not in uri
if not localuri:
# case: inv URI points to remote resource; strip any existing auth
uri, _, _ = _strip_basic_auth(uri)
join = localuri and path.join or posixpath.join
uri = _strip_basic_auth(uri)
try:
if '://' in inv:
f = _read_from_url(inv)
@ -238,25 +210,19 @@ def fetch_inventory(app, uri, inv):
'%s: %s' % (inv, err.__class__, err))
return
try:
if hasattr(f, 'geturl'):
newinv = f.geturl()
if hasattr(f, 'url'):
newinv = f.url
if inv != newinv:
app.info('intersphinx inventory has moved: %s -> %s' % (inv, newinv))
if uri in (inv, path.dirname(inv), path.dirname(inv) + '/'):
uri = path.dirname(newinv)
line = f.readline().rstrip().decode('utf-8')
try:
if line == '# Sphinx inventory version 1':
invdata = read_inventory_v1(f, uri, join)
elif line == '# Sphinx inventory version 2':
invdata = read_inventory_v2(f, uri, join)
else:
raise ValueError
f.close()
except ValueError:
f.close()
raise ValueError('unknown or unsupported inventory version')
with f:
try:
join = localuri and path.join or posixpath.join
invdata = read_inventory(f, uri, join)
except ValueError:
raise ValueError('unknown or unsupported inventory version')
except Exception as err:
app.warn('intersphinx inventory %r not readable due to '
'%s: %s' % (inv, err.__class__.__name__, err))

43
sphinx/util/requests.py Normal file
View File

@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
"""
sphinx.util.requests
~~~~~~~~~~~~~~~~~~~~
Simple requests package loader
:copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
from __future__ import absolute_import
import requests
import warnings
import pkg_resources
# try to load requests[security]
try:
pkg_resources.require(['requests[security]'])
except pkg_resources.DistributionNotFound:
import ssl
if not getattr(ssl, 'HAS_SNI', False):
# don't complain on each url processed about the SSL issue
requests.packages.urllib3.disable_warnings(
requests.packages.urllib3.exceptions.InsecurePlatformWarning)
warnings.warn(
'Some links may return broken results due to being unable to '
'check the Server Name Indication (SNI) in the returned SSL cert '
'against the hostname in the url requested. Recommended to '
'install "requests[security]" as a dependency or upgrade to '
'a python version with SNI support (Python 3 and Python 2.7.9+).'
)
except pkg_resources.UnknownExtra:
warnings.warn(
'Some links may return broken results due to being unable to '
'check the Server Name Indication (SNI) in the returned SSL cert '
'against the hostname in the url requested. Recommended to '
'install requests-2.4.1+.'
)
useragent_header = [('User-agent',
'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0')]

View File

@ -17,7 +17,7 @@ from six import BytesIO
from docutils import nodes
from sphinx import addnodes
from sphinx.ext.intersphinx import read_inventory_v1, read_inventory_v2, \
from sphinx.ext.intersphinx import read_inventory, \
load_mappings, missing_reference, _strip_basic_auth, _read_from_url, \
_get_safe_url, fetch_inventory, INVENTORY_FILENAME
@ -49,8 +49,7 @@ a term including:colon std:term -1 glossary.html#term-a-term-including-colon -
def test_read_inventory_v1():
f = BytesIO(inventory_v1)
f.readline()
invdata = read_inventory_v1(f, '/util', posixpath.join)
invdata = read_inventory(f, '/util', posixpath.join)
assert invdata['py:module']['module'] == \
('foo', '1.0', '/util/foo.html#module-module', '-')
assert invdata['py:class']['module.cls'] == \
@ -59,13 +58,11 @@ def test_read_inventory_v1():
def test_read_inventory_v2():
f = BytesIO(inventory_v2)
f.readline()
invdata1 = read_inventory_v2(f, '/util', posixpath.join)
invdata1 = read_inventory(f, '/util', posixpath.join)
# try again with a small buffer size to test the chunking algorithm
f = BytesIO(inventory_v2)
f.readline()
invdata2 = read_inventory_v2(f, '/util', posixpath.join, bufsize=5)
invdata2 = read_inventory(f, '/util', posixpath.join, bufsize=5)
assert invdata1 == invdata2
@ -84,47 +81,47 @@ def test_read_inventory_v2():
@with_app()
@mock.patch('sphinx.ext.intersphinx.read_inventory_v2')
@mock.patch('sphinx.ext.intersphinx.read_inventory')
@mock.patch('sphinx.ext.intersphinx._read_from_url')
def test_fetch_inventory_redirection(app, status, warning, _read_from_url, read_inventory_v2):
def test_fetch_inventory_redirection(app, status, warning, _read_from_url, read_inventory):
_read_from_url().readline.return_value = '# Sphinx inventory version 2'.encode('utf-8')
# same uri and inv, not redirected
_read_from_url().geturl.return_value = 'http://hostname/' + INVENTORY_FILENAME
_read_from_url().url = 'http://hostname/' + INVENTORY_FILENAME
fetch_inventory(app, 'http://hostname/', 'http://hostname/' + INVENTORY_FILENAME)
assert 'intersphinx inventory has moved' not in status.getvalue()
assert read_inventory_v2.call_args[0][1] == 'http://hostname/'
assert read_inventory.call_args[0][1] == 'http://hostname/'
# same uri and inv, redirected
status.seek(0)
status.truncate(0)
_read_from_url().geturl.return_value = 'http://hostname/new/' + INVENTORY_FILENAME
_read_from_url().url = 'http://hostname/new/' + INVENTORY_FILENAME
fetch_inventory(app, 'http://hostname/', 'http://hostname/' + INVENTORY_FILENAME)
assert status.getvalue() == ('intersphinx inventory has moved: '
'http://hostname/%s -> http://hostname/new/%s\n' %
(INVENTORY_FILENAME, INVENTORY_FILENAME))
assert read_inventory_v2.call_args[0][1] == 'http://hostname/new'
assert read_inventory.call_args[0][1] == 'http://hostname/new'
# different uri and inv, not redirected
status.seek(0)
status.truncate(0)
_read_from_url().geturl.return_value = 'http://hostname/new/' + INVENTORY_FILENAME
_read_from_url().url = 'http://hostname/new/' + INVENTORY_FILENAME
fetch_inventory(app, 'http://hostname/', 'http://hostname/new/' + INVENTORY_FILENAME)
assert 'intersphinx inventory has moved' not in status.getvalue()
assert read_inventory_v2.call_args[0][1] == 'http://hostname/'
assert read_inventory.call_args[0][1] == 'http://hostname/'
# different uri and inv, redirected
status.seek(0)
status.truncate(0)
_read_from_url().geturl.return_value = 'http://hostname/other/' + INVENTORY_FILENAME
_read_from_url().url = 'http://hostname/other/' + INVENTORY_FILENAME
fetch_inventory(app, 'http://hostname/', 'http://hostname/new/' + INVENTORY_FILENAME)
assert status.getvalue() == ('intersphinx inventory has moved: '
'http://hostname/new/%s -> http://hostname/other/%s\n' %
(INVENTORY_FILENAME, INVENTORY_FILENAME))
assert read_inventory_v2.call_args[0][1] == 'http://hostname/'
assert read_inventory.call_args[0][1] == 'http://hostname/'
@with_app()
@ -233,64 +230,23 @@ class TestStripBasicAuth(unittest.TestCase):
"""basic auth creds stripped from URL containing creds"""
url = 'https://user:12345@domain.com/project/objects.inv'
expected = 'https://domain.com/project/objects.inv'
actual_url, actual_username, actual_password = _strip_basic_auth(url)
self.assertEqual(expected, actual_url)
self.assertEqual('user', actual_username)
self.assertEqual('12345', actual_password)
actual = _strip_basic_auth(url)
self.assertEqual(expected, actual)
def test_no_auth(self):
"""url unchanged if param doesn't contain basic auth creds"""
url = 'https://domain.com/project/objects.inv'
expected = 'https://domain.com/project/objects.inv'
actual_url, actual_username, actual_password = _strip_basic_auth(url)
self.assertEqual(expected, actual_url)
self.assertEqual(None, actual_username)
self.assertEqual(None, actual_password)
actual = _strip_basic_auth(url)
self.assertEqual(expected, actual)
def test_having_port(self):
"""basic auth creds correctly stripped from URL containing creds even if URL
contains port"""
url = 'https://user:12345@domain.com:8080/project/objects.inv'
expected = 'https://domain.com:8080/project/objects.inv'
actual_url, actual_username, actual_password = _strip_basic_auth(url)
self.assertEqual(expected, actual_url)
self.assertEqual('user', actual_username)
self.assertEqual('12345', actual_password)
@mock.patch('six.moves.urllib.request.HTTPBasicAuthHandler')
@mock.patch('six.moves.urllib.request.HTTPPasswordMgrWithDefaultRealm')
@mock.patch('six.moves.urllib.request.build_opener')
def test_readfromurl_authed(m_build_opener, m_HTTPPasswordMgrWithDefaultRealm,
m_HTTPBasicAuthHandler):
# read from URL containing basic auth creds
password_mgr = mock.Mock()
m_HTTPPasswordMgrWithDefaultRealm.return_value = password_mgr
url = 'https://user:12345@domain.com/project/objects.inv'
_read_from_url(url)
m_HTTPPasswordMgrWithDefaultRealm.assert_called_once_with()
password_mgr.add_password.assert_called_with(
None, 'https://domain.com/project/objects.inv', 'user', '12345')
@mock.patch('six.moves.urllib.request.HTTPBasicAuthHandler')
@mock.patch('six.moves.urllib.request.HTTPPasswordMgrWithDefaultRealm')
@mock.patch('sphinx.ext.intersphinx.default_opener')
def test_readfromurl_unauthed(m_default_opener, m_HTTPPasswordMgrWithDefaultRealm,
m_HTTPBasicAuthHandler):
# read from URL without auth creds
password_mgr = mock.Mock()
m_HTTPPasswordMgrWithDefaultRealm.return_value = password_mgr
url = 'https://domain.com/project/objects.inv'
_read_from_url(url)
# assert password manager not created
assert m_HTTPPasswordMgrWithDefaultRealm.call_args is None
# assert no password added to the password manager
assert password_mgr.add_password.call_args is None
actual = _strip_basic_auth(url)
self.assertEqual(expected, actual)
def test_getsafeurl_authed():
@ -301,6 +257,14 @@ def test_getsafeurl_authed():
assert expected == actual
def test_getsafeurl_authed_having_port():
"""_get_safe_url() with a url with basic auth having port"""
url = 'https://user:12345@domain.com:8080/project/objects.inv'
expected = 'https://user@domain.com:8080/project/objects.inv'
actual = _get_safe_url(url)
assert expected == actual
def test_getsafeurl_unauthed():
"""_get_safe_url() with a url without basic auth"""
url = 'https://domain.com/project/objects.inv'