diff --git a/CHANGES b/CHANGES index 9bb479cb7..f4a109124 100644 --- a/CHANGES +++ b/CHANGES @@ -15,6 +15,7 @@ Bugs fixed * #2810: Problems with pdflatex in an Italian document * Use ``latex_elements.papersize`` to specify papersize of LaTeX in Makefile +* #2988: linkcheck: retry with GET request if denied HEAD request Documentation ------------- diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 371114106..e1eeb396c 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -84,14 +84,12 @@ class CheckExternalLinksBuilder(Builder): self.good = set() self.broken = {} self.redirected = {} + self.headers = dict(useragent_header) # set a timeout for non-responding servers socket.setdefaulttimeout(5.0) # create output file open(path.join(self.outdir, 'output.txt'), 'w').close() - self.session = requests.Session() - self.session.headers = dict(useragent_header) - # create queues and worker threads self.wqueue = queue.Queue() self.rqueue = queue.Queue() @@ -129,23 +127,23 @@ class CheckExternalLinksBuilder(Builder): # Read the whole document and see if #anchor exists # (Anchors starting with ! are ignored since they are # commonly used for dynamic pages) - response = self.session.get(req_url, stream=True, **kwargs) + response = requests.get(req_url, stream=True, headers=self.headers, + **kwargs) found = check_anchor(response, unquote(anchor)) if not found: raise Exception("Anchor '%s' not found" % anchor) else: try: - # try a HEAD request, which should be easier on + # try a HEAD request first, which should be easier on # the server and the network - response = self.session.head(req_url, **kwargs) + response = requests.head(req_url, headers=self.headers, **kwargs) response.raise_for_status() except HTTPError as err: - if err.response.status_code not in (403, 405): - raise - # retry with GET if that fails, some servers - # don't like HEAD requests and reply with 403 or 405 - response = self.session.get(req_url, stream=True, **kwargs) + # retry with GET request if that fails, some servers + # don't like HEAD requests. + response = requests.get(req_url, stream=True, headers=self.headers, + **kwargs) response.raise_for_status() except HTTPError as err: if err.response.status_code == 401: diff --git a/sphinx/util/requests.py b/sphinx/util/requests.py index 095bf33e6..9bd8f251c 100644 --- a/sphinx/util/requests.py +++ b/sphinx/util/requests.py @@ -39,5 +39,5 @@ except pkg_resources.UnknownExtra: 'install requests-2.4.1+.' ) -useragent_header = [('User-agent', +useragent_header = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0')] diff --git a/tests/test_build.py b/tests/test_build.py index 82569074d..27a99461b 100644 --- a/tests/test_build.py +++ b/tests/test_build.py @@ -66,7 +66,7 @@ def test_build_all(): ) with mock.patch('sphinx.builders.linkcheck.requests') as requests: - requests.Session().head = request_session_head + requests.head = request_session_head # note: no 'html' - if it's ok with dirhtml it's ok with html for buildername in ['dirhtml', 'singlehtml', 'latex', 'texinfo', 'pickle',