Check both that image files are referenced from our pages and that pages
reference existing images.
The mode for dumping external references now also dumps images.
'--ignore-images' can be used repeatedly to suppress errors for specific
images.
Signed-off-by: Peter Krempa <pkrempa(a)redhat.com>
---
scripts/check-html-references.py | 101 ++++++++++++++++++++++++++-----
1 file changed, 87 insertions(+), 14 deletions(-)
diff --git a/scripts/check-html-references.py b/scripts/check-html-references.py
index 4f08feab59..788622a2d0 100755
--- a/scripts/check-html-references.py
+++ b/scripts/check-html-references.py
@@ -24,25 +24,32 @@ import xml.etree.ElementTree as ET
ns = {'html': 'http://www.w3.org/1999/xhtml'}
externallinks = []
+externalimages = []
def get_file_list(prefix):
filelist = []
+ imagelist = []
+ imageformats = ['.jpg', '.svg', '.png']
for root, dir, files in os.walk(prefix):
for file in files:
- if not re.search('\\.html$', file):
- continue
+ ext = os.path.splitext(file)[1]
- # the 404 page doesn't play well
- if '404.html' in file:
- continue
+ if ext == '.html':
+ # the 404 page doesn't play well
+ if '404.html' in file:
+ continue
+
+ filelist.append(os.path.join(root, file))
- filelist.append(os.path.join(root, file))
+ elif ext in imageformats:
+ imagelist.append(os.path.join(root, file))
filelist.sort()
+ imagelist.sort()
- return filelist
+ return filelist, imagelist
# loads an XHTML and extracts all anchors, local and remote links for the one file
@@ -50,12 +57,14 @@ def process_file(filename):
tree = ET.parse(filename)
root = tree.getroot()
docname = root.get('data-sourcedoc')
+ dirname = os.path.dirname(filename)
if not docname:
docname = filename
anchors = [filename]
targets = []
+ images = []
for elem in root.findall('.//html:a', ns):
target = elem.get('href')
@@ -68,7 +77,6 @@ def process_file(filename):
if re.search('://', target):
externallinks.append(target)
elif target[0] != '#' and 'mailto:' not in target:
- dirname = os.path.dirname(filename)
targetfull = os.path.normpath(os.path.join(dirname, target))
targets.append((filename, docname, targetfull, target))
@@ -87,20 +95,33 @@ def process_file(filename):
if an:
anchors.append(filename + '#' + an)
- return (anchors, targets)
+ # find local images
+ for elem in root.findall('.//html:img', ns):
+ src = elem.get('src')
+
+ if src:
+ if re.search('://', src):
+ externalimages.append(src)
+ else:
+ imagefull = os.path.normpath(os.path.join(dirname, src))
+ images.append((imagefull, docname))
+
+ return (anchors, targets, images)
def process_all(filelist):
anchors = []
targets = []
+ images = []
for file in filelist:
- anchor, target = process_file(file)
+ anchor, target, image = process_file(file)
targets = targets + target
anchors = anchors + anchor
+ images = images + image
- return (targets, anchors)
+ return (targets, anchors, images)
def check_targets(targets, anchors):
@@ -163,6 +184,46 @@ def check_usage(targets, files, entrypoint):
return fail
+# checks that images present in the directory are being used and also that
+# pages link to existing images. For favicons, which are not referenced from
+# the '.html' files there's a builtin set of exceptions.
+def check_images(usedimages, imagefiles, ignoreimages):
+ favicons = [
+ 'android-chrome-192x192.png',
+ 'android-chrome-256x256.png',
+ 'apple-touch-icon.png',
+ 'favicon-16x16.png',
+ 'favicon-32x32.png',
+ 'mstile-150x150.png',
+ ]
+ fail = False
+
+ if ignoreimages:
+ favicons = favicons + ignoreimages
+
+ for usedimage, docname in usedimages:
+ if usedimage not in imagefiles:
+            print(f'ERROR: \'{docname}\' references image \'{usedimage}\' not among images')
+ fail = True
+
+ for imagefile in imagefiles:
+ used = False
+
+ if imagefile in (usedimage[0] for usedimage in usedimages):
+ used = True
+ else:
+ for favicon in favicons:
+ if favicon in imagefile:
+ used = True
+ break
+
+ if not used:
+            print(f'ERROR: Image \'{imagefile}\' is not used by any page')
+ fail = True
+
+ return fail
+
+
parser = argparse.ArgumentParser(description='HTML reference checker')
parser.add_argument('--webroot', required=True,
help='path to the web root')
@@ -170,14 +231,16 @@ parser.add_argument('--entrypoint',
default="index.html",
help='file name of web entry point relative to --webroot')
parser.add_argument('--external', action="store_true",
help='print external references instead')
+parser.add_argument('--ignore-images', action='append',
+ help='paths to images that should be considered as used')
args = parser.parse_args()
-files = get_file_list(os.path.abspath(args.webroot))
+files, imagefiles = get_file_list(os.path.abspath(args.webroot))
entrypoint = os.path.join(os.path.abspath(args.webroot), args.entrypoint)
-targets, anchors = process_all(files)
+targets, anchors, usedimages = process_all(files)
fail = False
@@ -186,7 +249,14 @@ if args.external:
externallinks.sort()
for ext in externallinks:
if ext != prev:
- print(ext)
+ print(f'link: {ext}')
+
+ prev = ext
+
+ externalimages.sort()
+ for ext in externalimages:
+ if ext != prev:
+ print(f'image: {ext}')
prev = ext
else:
@@ -196,6 +266,9 @@ else:
if check_usage(targets, files, entrypoint):
fail = True
+ if check_images(usedimages, imagefiles, args.ignore_images):
+ fail = True
+
if fail:
sys.exit(1)
--
2.39.1