Check both that image files are referenced from our pages and that pages
reference existing images.
The mode for dumping external references now also dumps images.
'--ignore-images' can be used repeatedly to suppress errors for specific
images.
Signed-off-by: Peter Krempa <pkrempa(a)redhat.com>
---
scripts/check-html-references.py | 101 ++++++++++++++++++++++++++-----
1 file changed, 87 insertions(+), 14 deletions(-)
diff --git a/scripts/check-html-references.py b/scripts/check-html-references.py
index 4f08feab59..788622a2d0 100755
--- a/scripts/check-html-references.py
+++ b/scripts/check-html-references.py
@@ -24,25 +24,32 @@ import xml.etree.ElementTree as ET
ns = {'html': 'http://www.w3.org/1999/xhtml'}
externallinks = []
+externalimages = []
def get_file_list(prefix):
filelist = []
+ imagelist = []
+ imageformats = ['.jpg', '.svg', '.png']
for root, dir, files in os.walk(prefix):
for file in files:
- if not re.search('\\.html$', file):
- continue
+ ext = os.path.splitext(file)[1]
- # the 404 page doesn't play well
- if '404.html' in file:
- continue
+ if ext == '.html':
+ # the 404 page doesn't play well
+ if '404.html' in file:
+ continue
+
+ filelist.append(os.path.join(root, file))
- filelist.append(os.path.join(root, file))
+ elif ext in imageformats:
+ imagelist.append(os.path.join(root, file))
filelist.sort()
+ imagelist.sort()
- return filelist
+ return filelist, imagelist
# loads an XHTML and extracts all anchors, local and remote links for the one file
@@ -50,12 +57,14 @@ def process_file(filename):
tree = ET.parse(filename)
root = tree.getroot()
docname = root.get('data-sourcedoc')
+ dirname = os.path.dirname(filename)
if not docname:
docname = filename
anchors = [filename]
targets = []
+ images = []
for elem in root.findall('.//html:a', ns):
target = elem.get('href')
@@ -68,7 +77,6 @@ def process_file(filename):
if re.search('://', target):
externallinks.append(target)
elif target[0] != '#' and 'mailto:' not in target:
- dirname = os.path.dirname(filename)
targetfull = os.path.normpath(os.path.join(dirname, target))
targets.append((filename, docname, targetfull, target))
@@ -87,20 +95,33 @@ def process_file(filename):
if an:
anchors.append(filename + '#' + an)
- return (anchors, targets)
+ # find local images
+ for elem in root.findall('.//html:img', ns):
+ src = elem.get('src')
+
+ if src:
+ if re.search('://', src):
+ externalimages.append(src)
+ else:
+ imagefull = os.path.normpath(os.path.join(dirname, src))
+ images.append((imagefull, docname))
+
+ return (anchors, targets, images)
def process_all(filelist):
anchors = []
targets = []
+ images = []
for file in filelist:
- anchor, target = process_file(file)
+ anchor, target, image = process_file(file)
targets = targets + target
anchors = anchors + anchor
+ images = images + image
- return (targets, anchors)
+ return (targets, anchors, images)
def check_targets(targets, anchors):
@@ -163,6 +184,46 @@ def check_usage(targets, files, entrypoint):
return fail
+# checks that images present in the directory are being used and also that
+# pages link to existing images. For favicons, which are not referenced from
+# the '.html' files there's a builtin set of exceptions.
+def check_images(usedimages, imagefiles, ignoreimages):
+ favicons = [
+ 'android-chrome-192x192.png',
+ 'android-chrome-256x256.png',
+ 'apple-touch-icon.png',
+ 'favicon-16x16.png',
+ 'favicon-32x32.png',
+ 'mstile-150x150.png',
+ ]
+ fail = False
+
+ if ignoreimages:
+ favicons = favicons + ignoreimages
+
+ for usedimage, docname in usedimages:
+ if usedimage not in imagefiles:
+            print(f'ERROR: \'{docname}\' references image \'{usedimage}\' not among images')
+ fail = True
+
+ for imagefile in imagefiles:
+ used = False
+
+ if imagefile in (usedimage[0] for usedimage in usedimages):
+ used = True
+ else:
+ for favicon in favicons:
+ if favicon in imagefile:
+ used = True
+ break
+
+ if not used:
+            print(f'ERROR: Image \'{imagefile}\' is not used by any page')
+ fail = True
+
+ return fail
+
+
parser = argparse.ArgumentParser(description='HTML reference checker')
parser.add_argument('--webroot', required=True,
help='path to the web root')
@@ -170,14 +231,16 @@ parser.add_argument('--entrypoint',
default="index.html",
help='file name of web entry point relative to --webroot')
parser.add_argument('--external', action="store_true",
help='print external references instead')
+parser.add_argument('--ignore-images', action='append',
+ help='paths to images that should be considered as used')
args = parser.parse_args()
-files = get_file_list(os.path.abspath(args.webroot))
+files, imagefiles = get_file_list(os.path.abspath(args.webroot))
entrypoint = os.path.join(os.path.abspath(args.webroot), args.entrypoint)
-targets, anchors = process_all(files)
+targets, anchors, usedimages = process_all(files)
fail = False
@@ -186,7 +249,14 @@ if args.external:
externallinks.sort()
for ext in externallinks:
if ext != prev:
- print(ext)
+ print(f'link: {ext}')
+
+ prev = ext
+
+ externalimages.sort()
+ for ext in externalimages:
+ if ext != prev:
+ print(f'image: {ext}')
prev = ext
else:
@@ -196,6 +266,9 @@ else:
if check_usage(targets, files, entrypoint):
fail = True
+ if check_images(usedimages, imagefiles, args.ignore_images):
+ fail = True
+
if fail:
sys.exit(1)
--
2.39.1