#!/usr/bin/env python3
|
|
|
|
#
|
|
|
|
# This library is free software; you can redistribute it and/or
|
|
|
|
# modify it under the terms of the GNU Lesser General Public
|
|
|
|
# License as published by the Free Software Foundation; either
|
|
|
|
# version 2.1 of the License, or (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This library is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
# Lesser General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU Lesser General Public
|
|
|
|
# License along with this library. If not, see
|
|
|
|
# <http://www.gnu.org/licenses/>.
|
|
|
|
#
|
|
|
|
# Check that external references between documentation HTML files are not broken.
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import os
|
|
|
|
import argparse
|
|
|
|
import re
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
|
|
|
|
ns = {'html': 'http://www.w3.org/1999/xhtml'}
|
|
|
|
externallinks = []
|
|
|
|
|
|
|
|
|
|
|
|
def get_file_list(prefix):
    """Recursively collect the HTML files to check.

    :param prefix: directory to walk
    :returns: list of full paths to every ``.html`` file under *prefix*,
              excluding the 404 page
    """
    filelist = []

    for root, _dirs, files in os.walk(prefix):
        for fname in files:
            # a plain suffix test is all the old '\.html$' regex did
            if not fname.endswith('.html'):
                continue

            # the 404 page doesn't play well
            if '404.html' in fname:
                continue

            filelist.append(os.path.join(root, fname))

    return filelist
|
|
|
|
|
|
|
|
|
|
|
|
# loads an XHTML and extracts all anchors, local and remote links for the one file
def process_file(filename):
    """Parse one XHTML file and extract its anchors and link targets.

    :param filename: path of the XHTML file to parse
    :returns: tuple ``(anchors, targets)`` where *anchors* is a list of
              ``filename`` / ``filename#id`` strings defined by the file and
              *targets* is a list of ``(filename, docname, targetfull,
              target)`` tuples for every local cross-file link.
              External (``://``) links are accumulated into the module-level
              ``externallinks`` list as a side effect.
    """
    tree = ET.parse(filename)
    root = tree.getroot()

    # prefer the original source document name when the page records one
    docname = root.get('data-sourcedoc')
    if not docname:
        docname = filename

    # the file itself is a valid link target
    anchors = [filename]
    targets = []

    # loop-invariant: every relative href is resolved against this directory
    dirname = os.path.dirname(filename)

    for elem in root.findall('.//html:a', ns):
        target = elem.get('href')
        an = elem.get('id')

        if an:
            anchors.append(filename + '#' + an)

        if target:
            if '://' in target:
                externallinks.append(target)
            elif target[0] != '#' and 'mailto:' not in target:
                targetfull = os.path.normpath(os.path.join(dirname, target))

                targets.append((filename, docname, targetfull, target))

    # anchors generated from section ids: older docutils emit
    # "<div class='section'", modern docutils emit a <section element
    for query in ('.//html:div/[@class=\'section\']', './/html:section'):
        for elem in root.findall(query, ns):
            an = elem.get('id')

            if an:
                anchors.append(filename + '#' + an)

    return (anchors, targets)
|
|
|
|
|
|
|
|
|
|
|
|
def process_all(filelist):
    """Parse every file in *filelist* and merge the per-file results.

    :param filelist: iterable of XHTML file paths
    :returns: tuple ``(targets, anchors)`` concatenated across all files
    """
    all_anchors = []
    all_targets = []

    for path in filelist:
        file_anchors, file_targets = process_file(path)

        all_targets += file_targets
        all_anchors += file_anchors

    return (all_targets, all_anchors)
|
|
|
|
|
|
|
|
|
|
|
|
def check_targets(targets, anchors):
    """Report every link whose target anchor does not exist.

    :param targets: list of ``(filename, docname, targetfull, target)``
                    tuples as produced by ``process_file``
    :param anchors: list of all known ``file`` / ``file#id`` anchors
    :returns: True when at least one broken link was found (errors are
              printed to stdout), False otherwise
    """
    # membership tests dominate the runtime for a large doc tree; a set
    # makes each lookup O(1) instead of scanning the anchor list
    known = set(anchors)

    errors = [(docname, targetorig)
              for _, docname, target, targetorig in targets
              if target not in known]

    if errors:
        errors.sort()

        for file, target in errors:
            print(f'ERROR: \'{file}\': broken link to: \'{target}\'')

        return True

    return False
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Entry point: walk the web root, then check or dump references.

    Exits with status 1 when broken internal links are found, 0 otherwise.
    With ``--external``, prints the sorted, de-duplicated external links
    instead of checking anything.
    """
    parser = argparse.ArgumentParser(description='HTML reference checker')
    parser.add_argument('--webroot', required=True,
                        help='path to the web root')
    parser.add_argument('--external', action="store_true",
                        help='print external references instead')

    args = parser.parse_args()

    files = get_file_list(os.path.abspath(args.webroot))

    targets, anchors = process_all(files)

    if args.external:
        # print each distinct external URL once, in sorted order
        prev = None
        externallinks.sort()
        for ext in externallinks:
            if ext != prev:
                print(ext)

            prev = ext
    else:
        if check_targets(targets, anchors):
            sys.exit(1)

    sys.exit(0)


if __name__ == '__main__':
    main()
|