Fix #2988: linkcheck: retry with a GET request if the HEAD request is denied

This commit is contained in:
Takeshi KOMIYA 2016-10-04 11:04:58 +09:00
parent 72b76ab6d7
commit ce7fea9a35
4 changed files with 12 additions and 13 deletions

View File

@@ -15,6 +15,7 @@ Bugs fixed
* #2810: Problems with pdflatex in an Italian document
* Use ``latex_elements.papersize`` to specify papersize of LaTeX in Makefile
* #2988: linkcheck: retry with a GET request if the HEAD request is denied

Documentation
-------------

View File

@@ -84,14 +84,12 @@ class CheckExternalLinksBuilder(Builder):
self.good = set()
self.broken = {}
self.redirected = {}
self.headers = dict(useragent_header)
# set a timeout for non-responding servers
socket.setdefaulttimeout(5.0)
# create output file
open(path.join(self.outdir, 'output.txt'), 'w').close()
self.session = requests.Session()
self.session.headers = dict(useragent_header)
# create queues and worker threads
self.wqueue = queue.Queue()
self.rqueue = queue.Queue()
@@ -129,23 +127,23 @@ class CheckExternalLinksBuilder(Builder):
# Read the whole document and see if #anchor exists
# (Anchors starting with ! are ignored since they are
# commonly used for dynamic pages)
response = self.session.get(req_url, stream=True, **kwargs)
response = requests.get(req_url, stream=True, headers=self.headers,
**kwargs)
found = check_anchor(response, unquote(anchor))
if not found:
raise Exception("Anchor '%s' not found" % anchor)
else:
try:
# try a HEAD request, which should be easier on
# try a HEAD request first, which should be easier on
# the server and the network
response = self.session.head(req_url, **kwargs)
response = requests.head(req_url, headers=self.headers, **kwargs)
response.raise_for_status()
except HTTPError as err:
if err.response.status_code not in (403, 405):
raise
# retry with GET if that fails, some servers
# don't like HEAD requests and reply with 403 or 405
response = self.session.get(req_url, stream=True, **kwargs)
# retry with GET request if that fails, some servers
# don't like HEAD requests.
response = requests.get(req_url, stream=True, headers=self.headers,
**kwargs)
response.raise_for_status()
except HTTPError as err:
if err.response.status_code == 401:

View File

@@ -39,5 +39,5 @@ except pkg_resources.UnknownExtra:
'install requests-2.4.1+.'
)
useragent_header = [('User-agent',
useragent_header = [('User-Agent',
'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0')]

View File

@@ -66,7 +66,7 @@ def test_build_all():
)
with mock.patch('sphinx.builders.linkcheck.requests') as requests:
requests.Session().head = request_session_head
requests.head = request_session_head
# note: no 'html' - if it's ok with dirhtml it's ok with html
for buildername in ['dirhtml', 'singlehtml', 'latex', 'texinfo', 'pickle',