[libvirt PATCH v3 0/3] ci: Check spelling

This is a wrapper for codespell [1], a spell checker for source code. Codespell does not compare words to a dictionary, but rather works by checking words against a list of common typos, making it produce fewer false positives than other solutions. The script in this patch works around the lack of per-directory ignore lists and some oddities regarding capitalization in ignore lists. [1] (https://github.com/codespell-project/codespell/) V1: https://listman.redhat.com/archives/libvir-list/2021-October/msg00015.html V2: https://listman.redhat.com/archives/libvir-list/2022-January/msg00382.html Changes since V2: * Added meson integration * Add option to ignore files untracked by git * Pre-filter .po files to reduce run time Regards, Tim Tim Wiederhake (3): scripts: Check spelling meson: Add spell checking ci: Add spell checking .codespellrc | 2 + .gitlab-ci.yml | 13 +++- build-aux/meson.build | 16 +++++ meson.build | 1 + meson_options.txt | 1 + scripts/check-spelling.py | 135 ++++++++++++++++++++++++++++++++++++++ scripts/meson.build | 1 + 7 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 .codespellrc create mode 100755 scripts/check-spelling.py -- 2.31.1

This is a wrapper for codespell [1], a spell checker for source code. Codespell does not compare words to a dictionary, but rather works by checking words against a list of common typos, making it produce fewer false positives than other solutions. The script in this patch works around the lack of per-directory ignore lists and some oddities regarding capitalization in ignore lists. The ".codespellrc" file is used to coarsly filter out translation and git files, as scanning those makes up for roughly 50% of the run time otherwise. [1] (https://github.com/codespell-project/codespell/) Signed-off-by: Tim Wiederhake <twiederh@redhat.com> --- .codespellrc | 2 + scripts/check-spelling.py | 135 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 .codespellrc create mode 100755 scripts/check-spelling.py diff --git a/.codespellrc b/.codespellrc new file mode 100644 index 0000000000..0c45be445b --- /dev/null +++ b/.codespellrc @@ -0,0 +1,2 @@ +[codespell] +skip = .git/,*.po diff --git a/scripts/check-spelling.py b/scripts/check-spelling.py new file mode 100755 index 0000000000..ce3e7d89f0 --- /dev/null +++ b/scripts/check-spelling.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 + +import argparse +import re +import subprocess +import os + + +IGNORE_LIST = [ + # ignore this script + ("scripts/check-spelling.py", []), + + # 3rd-party: keycodemapdb + ("src/keycodemapdb/", []), + + # 3rd-party: VirtualBox SDK + ("src/vbox/vbox_CAPI", []), + + # 3rd-party: qemu + ("tests/qemucapabilitiesdata/caps_", []), + + # other + ("", ["msdos", "MSDOS", "wan", "WAN", "hda", "HDA", "inout"]), + ("NEWS.rst", "crashers"), + ("docs/gitdm/companies/others", "Archiv"), + ("docs/glib-adoption.rst", "preferrable"), + ("docs/js/main.js", "whats"), + ("examples/polkit/libvirt-acl.rules", ["userA", "userB", "userC"]), + ("src/libvirt-domain.c", "PTD"), + ("src/libxl/libxl_logger.c", "purposedly"), + ("src/nwfilter/nwfilter_dhcpsnoop.c", "ether"), + ("src/nwfilter/nwfilter_ebiptables_driver.c", "parm"), + ("src/nwfilter/nwfilter_learnipaddr.c", "ether"), + ("src/qemu/qemu_agent.c", "crypted"), + ("src/qemu/qemu_agent.h", "crypted"), + ("src/qemu/qemu_process.c", "wee"), + ("src/security/apparmor/libvirt-lxc", "devic"), + ("src/security/apparmor/libvirt-qemu", "readby"), + ("src/storage_file/storage_file_probe.c", "conectix"), + ("src/util/virnetdevmacvlan.c", "calld"), + ("src/util/virtpm.c", "parm"), + ("tests/qemuagenttest.c", "IST"), + ("tests/storagepoolxml2xml", "cant"), + ("tests/sysinfodata/", "sie"), + ("tests/testutils.c", "nIn"), + ("tests/vircgroupdata/ovirt-node-6.6.mounts", "hald"), + ("tests/virhostcpudata/", "sie"), + ("tools/virt-host-validate-common.c", "sie"), +] + + +def ignore(filename, linenumber, word, suggestion): + if len(word) <= 2: + return True + + for f, w in IGNORE_LIST: + if not filename.startswith(f): + continue + if word in w or not w: + return True + return False + + +def main(): + line_pattern = re.compile("^(.*):(.*): (.*) ==> (.*)$") + output_template = "(\"{0}\", \"{2}\"),\t# line {1}, \"{3}\"?" + + parser = argparse.ArgumentParser(description="Check spelling") + parser.add_argument( + "dir", + help="Path to source directory. " + "Defaults to parent directory of this script", + type=os.path.realpath, + nargs='?') + parser.add_argument( + "-i", + "--ignore", + help="File to ignore. Can be specified more than once", + metavar="FILE", + default=list(), + action="append") + parser.add_argument( + "--ignore-untracked", + help="Ignore all files not tracked by git", + action="store_true") + args = parser.parse_args() + + if not args.dir: + args.dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + + if args.ignore_untracked: + args.ignore.extend(subprocess.check_output( + ["git", "-C", args.dir, "ls-files", "--others"], + universal_newlines=True).split("\n")) + + try: + process = subprocess.run( + [ + "codespell", + args.dir, + "--config", + os.path.join(args.dir, ".codespellrc")], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) + except FileNotFoundError: + exit("error: codespell not found") + if process.returncode not in (0, 65): + exit("error: unexpected returncode %s" % process.returncode) + + if process.stderr: + exit("error: unexpected output to stderr: \"%s\"" % process.stderr) + + findings = 0 + for line in process.stdout.split("\n"): + line = line.strip().replace(args.dir, "").lstrip("/") + if not line: + continue + + match = line_pattern.match(line) + if not match: + exit("error: unexpected line: \"%s\"" % line) + + if match.group(1) in args.ignore or ignore(*match.groups()): + continue + + print(output_template.format(*match.groups())) + findings += 1 + + if findings: + exit("error: %s spelling errors" % findings) + + +if __name__ == "__main__": + main() -- 2.31.1

On Fri, Jan 21, 2022 at 10:41:48 +0100, Tim Wiederhake wrote:
This is a wrapper for codespell [1], a spell checker for source code. Codespell does not compare words to a dictionary, but rather works by checking words against a list of common typos, making it produce fewer false positives than other solutions.
The script in this patch works around the lack of per-directory ignore lists and some oddities regarding capitalization in ignore lists. The ".codespellrc" file is used to coarsly filter out translation and git files, as scanning those makes up for roughly 50% of the run time otherwise.
[1] (https://github.com/codespell-project/codespell/)
Signed-off-by: Tim Wiederhake <twiederh@redhat.com> --- .codespellrc | 2 + scripts/check-spelling.py | 135 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 .codespellrc create mode 100755 scripts/check-spelling.py
diff --git a/.codespellrc b/.codespellrc new file mode 100644 index 0000000000..0c45be445b --- /dev/null +++ b/.codespellrc @@ -0,0 +1,2 @@ +[codespell] +skip = .git/,*.po
This doesn't seem to work for me, after patch 2/3 I'm getting the following:
MALLOC_PERTURB_=230 /bin/python3 /home/pipo/libvirt/scripts/check-spelling.py --ignore-untracked ――――――――――――――――――――――――――――――――――――― ✀ ――――――――――――――――――――――――――――――――――――― stdout: (".git/logs/HEAD", "lenght"), # line 187, "length"? (".git/logs/HEAD", "programatic"), # line 424, "programmatic"? (".git/logs/HEAD", "programatic"), # line 570, "programmatic"? (".git/logs/HEAD", "programatic"), # line 583, "programmatic"? (".git/logs/HEAD", "programatic"), # line 714, "programmatic"? (".git/logs/HEAD", "programatic"), # line 727, "programmatic"? (".git/logs/HEAD", "programatic"), # line 802, "programmatic"? (".git/logs/HEAD", "programatic"), # line 810, "programmatic"? (".git/logs/HEAD", "programatic"), # line 816, "programmatic"? (".git/logs/HEAD", "programatic"), # line 935, "programmatic"? (".git/logs/HEAD", "programatic"), # line 944, "programmatic"? (".git/logs/HEAD", "programatic"), # line 1005, "programmatic"? (".git/logs/HEAD", "programatic"), # line 1007, "programmatic"? (".git/logs/HEAD", "programatic"), # line 1008, "programmatic"? (".git/logs/HEAD", "programatic"), # line 1009, "programmatic"? (".git/logs/HEAD", "programatic"), # line 1014, "programmatic"?
... (I'll be elaborating a bit more on the exclude pattern later after playing with codespell for a bit).
diff --git a/scripts/check-spelling.py b/scripts/check-spelling.py new file mode 100755 index 0000000000..ce3e7d89f0 --- /dev/null +++ b/scripts/check-spelling.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 + +import argparse +import re +import subprocess +import os + + +IGNORE_LIST = [ + # ignore this script + ("scripts/check-spelling.py", []), + + # 3rd-party: keycodemapdb + ("src/keycodemapdb/", []), + + # 3rd-party: VirtualBox SDK + ("src/vbox/vbox_CAPI", []), + + # 3rd-party: qemu + ("tests/qemucapabilitiesdata/caps_", []), + + # other + ("", ["msdos", "MSDOS", "wan", "WAN", "hda", "HDA", "inout"]),
These can be replaced by --ignore-words-list msdos,wan,hda,inout
+ ("NEWS.rst", "crashers"),
IMO this can be reworded.
+ ("docs/gitdm/companies/others", "Archiv"), + ("docs/glib-adoption.rst", "preferrable"),
This too.
+ ("docs/js/main.js", "whats"),
Skip all of docs/js?
+ ("examples/polkit/libvirt-acl.rules", ["userA", "userB", "userC"]), + ("src/libvirt-domain.c", "PTD"), + ("src/libxl/libxl_logger.c", "purposedly"), + ("src/nwfilter/nwfilter_dhcpsnoop.c", "ether"), + ("src/nwfilter/nwfilter_ebiptables_driver.c", "parm"), + ("src/nwfilter/nwfilter_learnipaddr.c", "ether"), + ("src/qemu/qemu_agent.c", "crypted"), + ("src/qemu/qemu_agent.h", "crypted"), + ("src/qemu/qemu_process.c", "wee"),
This too.
+ ("src/security/apparmor/libvirt-lxc", "devic"), + ("src/security/apparmor/libvirt-qemu", "readby"), + ("src/storage_file/storage_file_probe.c", "conectix"), + ("src/util/virnetdevmacvlan.c", "calld"), + ("src/util/virtpm.c", "parm"), + ("tests/qemuagenttest.c", "IST"), + ("tests/storagepoolxml2xml", "cant"), + ("tests/sysinfodata/", "sie"),
This is technically 3rd party.
+ ("tests/testutils.c", "nIn"), + ("tests/vircgroupdata/ovirt-node-6.6.mounts", "hald"),
This is also 3rd party `
+ ("tests/virhostcpudata/", "sie"),
This too.
+ ("tools/virt-host-validate-common.c", "sie"), +] + + +def ignore(filename, linenumber, word, suggestion): + if len(word) <= 2: + return True + + for f, w in IGNORE_LIST: + if not filename.startswith(f): + continue + if word in w or not w: + return True + return False + + +def main(): + line_pattern = re.compile("^(.*):(.*): (.*) ==> (.*)$") + output_template = "(\"{0}\", \"{2}\"),\t# line {1}, \"{3}\"?" + + parser = argparse.ArgumentParser(description="Check spelling") + parser.add_argument( + "dir", + help="Path to source directory. " + "Defaults to parent directory of this script", + type=os.path.realpath, + nargs='?') + parser.add_argument( + "-i", + "--ignore", + help="File to ignore. Can be specified more than once", + metavar="FILE", + default=list(), + action="append")
This is IMO pointless to have as an option, as we have an internal list of ignore files and user it's not used with
+ parser.add_argument( + "--ignore-untracked", + help="Ignore all files not tracked by git", + action="store_true")
IMO this option should not exist at all, there's no good reason to check files which are not tracked by git.
+ args = parser.parse_args() + + if not args.dir: + args.dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + + if args.ignore_untracked: + args.ignore.extend(subprocess.check_output( + ["git", "-C", args.dir, "ls-files", "--others"], + universal_newlines=True).split("\n")) + + try: + process = subprocess.run( + [ + "codespell", + args.dir, + "--config", + os.path.join(args.dir, ".codespellrc")], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) + except FileNotFoundError: + exit("error: codespell not found") + if process.returncode not in (0, 65): + exit("error: unexpected returncode %s" % process.returncode) + + if process.stderr: + exit("error: unexpected output to stderr: \"%s\"" % process.stderr) + + findings = 0 + for line in process.stdout.split("\n"): + line = line.strip().replace(args.dir, "").lstrip("/") + if not line: + continue + + match = line_pattern.match(line) + if not match: + exit("error: unexpected line: \"%s\"" % line) + + if match.group(1) in args.ignore or ignore(*match.groups()): + continue + + print(output_template.format(*match.groups())) + findings += 1 + + if findings: + exit("error: %s spelling errors" % findings) +
I have a rather substantial problem with runtime of this contraption on my box a pure 'ninja test' takes a bit below 4 seconds ... real 0m3.604s user 0m34.187s sys 0m10.662s Now a pure codespell invocation with the arguments used here takes more than 4,2 seconds. In case it's run as part of the test suite it doubles the execution time. That is IMO not acceptable. We should be using the 'skip' pattern a bit more, either through generating a config file or commandline arguments. This will also allow to ignore the untracked files directly, by putting them into the ignore pattern, rather than filtering them out. I also propose to ignore all of 'tests/' there's simply too much noise and this would waste too many cpu cycles. With the following ignore pattern: codespell -L msdos,inout,hda,wan -S .git,po,tests,vbox_CAPI*,*.js,polkit,*.Dockerfile,keycodemapdb,cpu_map The analysis is now down to 1.4 seconds from 4.2 and also the invocation via ninja test is now bearable again. IMO it's not worth to fiddle with tests/ to try to include the source files and the time saving is immense. The above exclude pattern also fixes what I've complained above. codespell is matching either full absolute paths or a subdirectory name but it can't contain a '/' if it's relative.

If "codespell" is installed, use it to find spelling mistakes. Enabled by default, disable with "meson -Dspellcheck=disabled". Signed-off-by: Tim Wiederhake <twiederh@redhat.com> --- build-aux/meson.build | 16 ++++++++++++++++ meson.build | 1 + meson_options.txt | 1 + scripts/meson.build | 1 + 4 files changed, 19 insertions(+) diff --git a/build-aux/meson.build b/build-aux/meson.build index e491bdeebc..96fa694d26 100644 --- a/build-aux/meson.build +++ b/build-aux/meson.build @@ -67,3 +67,19 @@ if git ) endforeach endif + + +if get_option('spellcheck').auto() + use_spellcheck = git and codespell_prog.found() +else + use_spellcheck = get_option('spellcheck').enabled() +endif + +if use_spellcheck + test( + 'check-spelling', + python3_prog, + args: [ check_spelling_prog.path(), '--ignore-untracked' ], + suite: 'syntax-check', + ) +endif diff --git a/meson.build b/meson.build index 70843afcd5..0f0a065247 100644 --- a/meson.build +++ b/meson.build @@ -812,6 +812,7 @@ endforeach optional_programs = [ 'augparse', + 'codespell', 'dmidecode', 'ebtables', 'flake8', diff --git a/meson_options.txt b/meson_options.txt index 5b43cdbd6b..b3aabdad35 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -10,6 +10,7 @@ option('rpath', type: 'feature', value: 'auto', description: 'whether to include option('docdir', type: 'string', value: '', description: 'documentation installation directory') option('docs', type: 'feature', value: 'auto', description: 'whether to generate documentation') option('tests', type: 'feature', value: 'auto', description: 'whether to build tests') +option('spellcheck', type: 'feature', value: 'auto', description: 'whether to check spelling') # build dependencies options diff --git a/scripts/meson.build b/scripts/meson.build index 421e3d2acd..735c963c38 100644 --- a/scripts/meson.build +++ b/scripts/meson.build @@ -7,6 +7,7 @@ scripts = [ 'check-drivername.py', 'check-file-access.py', 'check-remote-protocol.py', + 'check-spelling.py', 'check-symfile.py', 'check-symsorting.py', 'dtrace2systemtap.py', -- 2.31.1

Allow the spell checking job to fail to not have false-positives fail the entire build. Signed-off-by: Tim Wiederhake <twiederh@redhat.com> --- .gitlab-ci.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6ba11a0431..8f43889067 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -82,11 +82,22 @@ codestyle: before_script: - *script_variables script: - - meson setup build --werror || (cat build/meson-logs/meson-log.txt && exit 1) + - meson setup build --werror -Dspellcheck=disabled || (cat build/meson-logs/meson-log.txt && exit 1) - ninja -C build libvirt-pot-dep - meson test -C build --suite syntax-check --no-rebuild --print-errorlogs +spellcheck: + stage: sanity_checks + image: $CI_REGISTRY_IMAGE/ci-fedora-35:latest + allow_failure: true + needs: + - x86_64-fedora-35-container + before_script: + - *script_variables + script: + - scripts/check-spelling.py + # This artifact published by this job is downloaded to push to Weblate # for translation usage: # https://gitlab.com/libvirt/libvirt/-/jobs/artifacts/master/download?job=potf... -- 2.31.1
participants (2)
-
Peter Krempa
-
Tim Wiederhake