[libvirt PATCH 0/3] Invalidate the cpu flags cache on changes of kernel command line

The kernel command line can contain settings affecting the availability of
cpu features, e.g. "tsx=on". This series adds the kernel command line to the
cpu flags cache and declares the cache invalid if the current kernel command
line differs.

Fixes https://bugzilla.redhat.com/show_bug.cgi?id=1953389.

Tim Wiederhake (3):
  virQEMUCaps: Add kernel command line
  virQEMUCapsCachePriv: Add kernel command line
  qemu: Invalidate capabilities cache on kernel command line mismatch

 src/qemu/qemu_capabilities.c | 34 ++++++++++++++++++++++++++++++++--
 src/qemu/qemu_capspriv.h     |  3 ++-
 tests/qemucapsprobe.c        |  2 +-
 3 files changed, 35 insertions(+), 4 deletions(-)

-- 
2.31.1

[libvirt PATCH 1/3] virQEMUCaps: Add kernel command line

The kernel command line can contain settings affecting the availability
of cpu flags. Add the kernel command line to the capabilities cache.

Signed-off-by: Tim Wiederhake <twiederh@redhat.com>
---
 src/qemu/qemu_capabilities.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index 9558938866..21b7e8050b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -709,6 +709,7 @@ struct _virQEMUCaps {
     char *hostCPUSignature;
     char *package;
     char *kernelVersion;
+    char *kernelCmdline;
 
     virArch arch;
 
@@ -1973,6 +1974,7 @@ virQEMUCaps *virQEMUCapsNewCopy(virQEMUCaps *qemuCaps)
 
     ret->package = g_strdup(qemuCaps->package);
     ret->kernelVersion = g_strdup(qemuCaps->kernelVersion);
+    ret->kernelCmdline = g_strdup(qemuCaps->kernelCmdline);
 
     ret->arch = qemuCaps->arch;
 
@@ -2024,6 +2026,7 @@ void virQEMUCapsDispose(void *obj)
 
     g_free(qemuCaps->package);
     g_free(qemuCaps->kernelVersion);
+    g_free(qemuCaps->kernelCmdline);
 
     g_free(qemuCaps->binary);
     g_free(qemuCaps->hostCPUSignature);
@@ -4321,6 +4324,12 @@ virQEMUCapsLoadCache(virArch hostArch,
         goto cleanup;
     }
 
+    if (virXPathBoolean("boolean(./kernelCmdline)", ctxt) > 0) {
+        qemuCaps->kernelCmdline = virXPathString("string(./kernelCmdline)", ctxt);
+        if (!qemuCaps->kernelCmdline)
+            goto cleanup;
+    }
+
     if (!(str = virXPathString("string(./arch)", ctxt))) {
         virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                        _("missing arch in QEMU capabilities cache"));
@@ -4636,6 +4645,10 @@ virQEMUCapsFormatCache(virQEMUCaps *qemuCaps)
         virBufferAsprintf(&buf, "<kernelVersion>%s</kernelVersion>\n",
                           qemuCaps->kernelVersion);
 
+    if (qemuCaps->kernelCmdline)
+        virBufferAsprintf(&buf, "<kernelCmdline>%s</kernelCmdline>\n",
+                          qemuCaps->kernelCmdline);
+
     virBufferAsprintf(&buf, "<arch>%s</arch>\n",
                       virArchToString(qemuCaps->arch));
 
@@ -5495,6 +5508,7 @@ virQEMUCapsNewForBinaryInternal(virArch hostArch,
 
     qemuCaps->microcodeVersion = microcodeVersion;
     qemuCaps->kernelVersion = g_strdup(kernelVersion);
+    qemuCaps->kernelCmdline = NULL;
 
     qemuCaps->kvmSupportsNesting = virQEMUCapsKVMSupportsNesting();
 
-- 
2.31.1

[libvirt PATCH 2/3] virQEMUCapsCachePriv: Add kernel command line

Signed-off-by: Tim Wiederhake <twiederh@redhat.com>
---
 src/qemu/qemu_capabilities.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index 21b7e8050b..c2a885351f 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -4096,6 +4096,7 @@ struct _virQEMUCapsCachePriv {
     virArch hostArch;
     unsigned int microcodeVersion;
     char *kernelVersion;
+    char *kernelCmdline;
     char *hostCPUSignature;
 
     /* cache whether /dev/kvm is usable as runUid:runGuid */
@@ -4112,6 +4113,7 @@ virQEMUCapsCachePrivFree(void *privData)
 
     g_free(priv->libDir);
     g_free(priv->kernelVersion);
+    g_free(priv->kernelCmdline);
     g_free(priv->hostCPUSignature);
     g_free(priv);
 }
@@ -5649,6 +5651,9 @@ virQEMUCapsCacheNew(const char *libDir,
     if (uname(&uts) == 0)
         priv->kernelVersion = g_strdup_printf("%s %s", uts.release, uts.version);
 
+    if (virFileReadValueString(&priv->kernelCmdline, "/proc/cmdline") < 0)
+        goto error;
+
 cleanup:
     VIR_FREE(capsCacheDir);
     return cache;
-- 
2.31.1
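For reference, virFileReadValueString() here simply reads the whole of
/proc/cmdline into a newly allocated string and strips the trailing newline.
A rough stand-alone sketch of the same probe (read_cmdline() is a made-up
name for illustration, not a libvirt helper, and the 4 KiB buffer is an
assumption):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-alone equivalent of the probe added above:
 * read the whole of /proc/cmdline into a malloc'ed string. */
static char *
read_cmdline(void)
{
    FILE *fp = fopen("/proc/cmdline", "r");
    char buf[4096];   /* assumes the command line fits in 4 KiB */
    size_t len;

    if (!fp)
        return NULL;

    len = fread(buf, 1, sizeof(buf) - 1, fp);
    fclose(fp);
    buf[len] = '\0';

    /* strip the trailing newline so cached and live values compare equal */
    if (len > 0 && buf[len - 1] == '\n')
        buf[len - 1] = '\0';

    return strdup(buf);
}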

[libvirt PATCH 3/3] qemu: Invalidate capabilities cache on kernel command line mismatch

See https://bugzilla.redhat.com/show_bug.cgi?id=1953389.

Signed-off-by: Tim Wiederhake <twiederh@redhat.com>
---
 src/qemu/qemu_capabilities.c | 17 ++++++++++++++---
 src/qemu/qemu_capspriv.h     |  3 ++-
 tests/qemucapsprobe.c        |  2 +-
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index c2a885351f..abf161227f 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -4984,6 +4984,15 @@ virQEMUCapsIsValid(void *data,
         return false;
     }
 
+    if (STRNEQ_NULLABLE(priv->kernelCmdline, qemuCaps->kernelCmdline)) {
+        VIR_DEBUG("Outdated capabilities for '%s': kernel command line "
+                  "changed ('%s' vs '%s')",
+                  qemuCaps->binary,
+                  priv->kernelCmdline,
+                  qemuCaps->kernelCmdline);
+        return false;
+    }
+
     kvmSupportsNesting = virQEMUCapsKVMSupportsNesting();
     if (kvmSupportsNesting != qemuCaps->kvmSupportsNesting) {
         VIR_DEBUG("Outdated capabilities for '%s': kvm kernel nested "
@@ -5460,7 +5469,8 @@ virQEMUCapsNewForBinaryInternal(virArch hostArch,
                                 gid_t runGid,
                                 const char *hostCPUSignature,
                                 unsigned int microcodeVersion,
-                                const char *kernelVersion)
+                                const char *kernelVersion,
+                                const char *kernelCmdline)
 {
     virQEMUCaps *qemuCaps;
     struct stat sb;
@@ -5510,7 +5520,7 @@ virQEMUCapsNewForBinaryInternal(virArch hostArch,
 
     qemuCaps->microcodeVersion = microcodeVersion;
     qemuCaps->kernelVersion = g_strdup(kernelVersion);
-    qemuCaps->kernelCmdline = NULL;
+    qemuCaps->kernelCmdline = g_strdup(kernelCmdline);
 
     qemuCaps->kvmSupportsNesting = virQEMUCapsKVMSupportsNesting();
 
@@ -5537,7 +5547,8 @@ virQEMUCapsNewData(const char *binary,
                                            priv->runGid,
                                            priv->hostCPUSignature,
                                            virHostCPUGetMicrocodeVersion(priv->hostArch),
-                                           priv->kernelVersion);
+                                           priv->kernelVersion,
+                                           priv->kernelCmdline);
 }
 
 
diff --git a/src/qemu/qemu_capspriv.h b/src/qemu/qemu_capspriv.h
index a54a22685e..80ceb1425f 100644
--- a/src/qemu/qemu_capspriv.h
+++ b/src/qemu/qemu_capspriv.h
@@ -35,7 +35,8 @@ virQEMUCapsNewForBinaryInternal(virArch hostArch,
                                 gid_t runGid,
                                 const char *hostCPUSignature,
                                 unsigned int microcodeVersion,
-                                const char *kernelVersion);
+                                const char *kernelVersion,
+                                const char *kernelCmdline);
 
 int virQEMUCapsLoadCache(virArch hostArch,
                          virQEMUCaps *qemuCaps,
diff --git a/tests/qemucapsprobe.c b/tests/qemucapsprobe.c
index bfa8ae8db7..76c18f0dcd 100644
--- a/tests/qemucapsprobe.c
+++ b/tests/qemucapsprobe.c
@@ -79,7 +79,7 @@ main(int argc, char **argv)
         return EXIT_FAILURE;
 
     if (!(caps = virQEMUCapsNewForBinaryInternal(VIR_ARCH_NONE, argv[1], "/tmp",
-                                                 -1, -1, NULL, 0, NULL)))
+                                                 -1, -1, NULL, 0, NULL, NULL)))
         return EXIT_FAILURE;
 
     host = virArchFromHost();
-- 
2.31.1

On Thu, Aug 05, 2021 at 03:36:37PM +0200, Tim Wiederhake wrote:
The kernel command line can contain settings affecting the availability of cpu features, eg. "tsx=on". This series adds the kernel command line to the cpu flags cache and declares the cache invalid if the current kernel command line differs.
Multiple things can change the CPU features: kernel version, microcode
version, bios settings, kernel command line. We've been playing whack-a-mole
in cache invalidation for ages, adding ever more criteria for things which
have side effects on the CPU features available.

Running the CPUID instruction is cheap. Could we directly query the set of
host CPUID leaves we care about, and compare that, and potentially even get
rid of some of the other checks we have?

Regards,
Daniel
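As a point of reference, reading a handful of host CPUID leaves directly
really is only a few instructions; a minimal sketch using the compiler's
<cpuid.h> intrinsics (the leaves queried here are just examples, not the set
libvirt would actually need to hash and cache):

#include <stdio.h>
#include <cpuid.h>

int
main(void)
{
    unsigned int eax, ebx, ecx, edx;

    /* leaf 1: basic feature flags in ECX/EDX */
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        printf("leaf 0x1: ecx=0x%08x edx=0x%08x\n", ecx, edx);

    /* leaf 7, subleaf 0: structured extended features (HLE/RTM, i.e. TSX,
     * are reported in EBX of this leaf) */
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        printf("leaf 0x7: ebx=0x%08x ecx=0x%08x edx=0x%08x\n", ebx, ecx, edx);

    return 0;
}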

On Thu, Aug 05, 2021 at 14:50:51 +0100, Daniel P. Berrangé wrote:
On Thu, Aug 05, 2021 at 03:36:37PM +0200, Tim Wiederhake wrote:
The kernel command line can contain settings affecting the availability of cpu features, eg. "tsx=on". This series adds the kernel command line to the cpu flags cache and declares the cache invalid if the current kernel command line differs.
Multiple things can change the CPU features. kernel version, microcode version, bios settings change, kernel command line. We've been playing whack-a-mole in cache invalidation for ages adding ever more criteria for things which have side effects on CPU features available.
Running the CPUID instruction is cheap. Could we directly query the set of host CPUID leaves we care about, and compare that, and potentially even get rid of some of the other checks we have?
I guess it could help in some cases, but we wouldn't be able to drop some of
the existing checks anyway. The settings usually do not result in the CPU
dropping a particular bit from CPUID; the feature just becomes unusable,
reporting a failure when used. So the settings would only be reflected in
what features QEMU can enable on the host. Although checking CPUID might be
enough for TSX, checking the command line is helpful in other cases.

I'm afraid the only 100% correct solution would be to stop caching CPU data
at all and always probe QEMU (for host CPU model expansion only, and drop
all the cache if the set of features changes), but this might be quite
expensive. On the other hand, we would need to do it only for KVM, which
means a single probe (or two, for 32b vs 64b binaries) on a host.

Jirka

On Fri, Aug 06, 2021 at 05:07:45PM +0200, Jiri Denemark wrote:
On Thu, Aug 05, 2021 at 14:50:51 +0100, Daniel P. Berrangé wrote:
On Thu, Aug 05, 2021 at 03:36:37PM +0200, Tim Wiederhake wrote:
The kernel command line can contain settings affecting the availability of cpu features, eg. "tsx=on". This series adds the kernel command line to the cpu flags cache and declares the cache invalid if the current kernel command line differs.
Multiple things can change the CPU features. kernel version, microcode version, bios settings change, kernel command line. We've been playing whack-a-mole in cache invalidation for ages adding ever more criteria for things which have side effects on CPU features available.
Running the CPUID instruction is cheap. Could we directly query the set of host CPUID leaves we care about, and compare that, and potentially even get rid of some of the other checks we have?
I guess it could help in some cases, but we wouldn't be able to drop some of the existing checks anyway. Because the settings usually do not result in the CPU dropping a particular bit from CPUID, the feature just becomes unusable by reporting a failure when used. So the settings would only be reflected in what features QEMU can enable on the host. Although checking CPUID might be enough for TSX, checking the command line is helpful in other cases.
Would that be reflected by the answer to KVM_GET_SUPPORTED_CPUID which is
the intersection of physical CPUID and what KVM is actually willing to
enable? That ioctl would be cheap too.

Regards,
Daniel
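For illustration, the ioctl in question is issued on the /dev/kvm fd itself;
a minimal sketch, assuming a fixed entry count of 256 is enough (a real
caller would retry with a larger array on E2BIG, and would hash the result
rather than print it):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

int
main(void)
{
    int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
    struct kvm_cpuid2 *cpuid;
    unsigned int i;

    if (kvm < 0)
        return 1;

    /* 256 entries is an arbitrary upper bound for this sketch */
    cpuid = calloc(1, sizeof(*cpuid) + 256 * sizeof(cpuid->entries[0]));
    cpuid->nent = 256;

    if (ioctl(kvm, KVM_GET_SUPPORTED_CPUID, cpuid) < 0) {
        perror("KVM_GET_SUPPORTED_CPUID");
        return 1;
    }

    /* the returned leaves could be stored alongside the capabilities cache
     * and compared on the next daemon start */
    for (i = 0; i < cpuid->nent; i++)
        printf("func=0x%08x idx=0x%08x eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x\n",
               cpuid->entries[i].function, cpuid->entries[i].index,
               cpuid->entries[i].eax, cpuid->entries[i].ebx,
               cpuid->entries[i].ecx, cpuid->entries[i].edx);

    free(cpuid);
    close(kvm);
    return 0;
}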

On Fri, Aug 06, 2021 at 18:12:21 +0100, Daniel P. Berrangé wrote:
On Fri, Aug 06, 2021 at 05:07:45PM +0200, Jiri Denemark wrote:
On Thu, Aug 05, 2021 at 14:50:51 +0100, Daniel P. Berrangé wrote:
On Thu, Aug 05, 2021 at 03:36:37PM +0200, Tim Wiederhake wrote:
The kernel command line can contain settings affecting the availability of cpu features, eg. "tsx=on". This series adds the kernel command line to the cpu flags cache and declares the cache invalid if the current kernel command line differs.
Multiple things can change the CPU features. kernel version, microcode version, bios settings change, kernel command line. We've been playing whack-a-mole in cache invalidation for ages adding ever more criteria for things which have side effects on CPU features available.
Running the CPUID instruction is cheap. Could we directly query the set of host CPUID leaves we care about, and compare that, and potentially even get rid of some of the other checks we have?
I guess it could help in some cases, but we wouldn't be able to drop some of the existing checks anyway. Because the settings usually do not result in the CPU dropping a particular bit from CPUID, the feature just becomes unusable by reporting a failure when used. So the settings would only be reflected in what features QEMU can enable on the host. Although checking CPUID might be enough for TSX, checking the command line is helpful in other cases.
Would that be reflected by the answer to KVM_GET_SUPPORTED_CPUID which is the intersection of physical CPUID and what KVM is actually willing to enable? That ioctl would be cheap too.
I don't know, to be honest. I guess it should work unless QEMU does some
additional processing/filtering of the results it gets from KVM.

Eduardo, do you know if KVM_GET_SUPPORTED_CPUID would be sufficient to check
any configuration changes (bios settings, kernel command line, module
options, ...) that affect usable CPU features?

Jirka

On Wed, Aug 11, 2021 at 4:43 AM Jiri Denemark <jdenemar@redhat.com> wrote:
On Fri, Aug 06, 2021 at 18:12:21 +0100, Daniel P. Berrangé wrote:
On Fri, Aug 06, 2021 at 05:07:45PM +0200, Jiri Denemark wrote:
On Thu, Aug 05, 2021 at 14:50:51 +0100, Daniel P. Berrangé wrote:
On Thu, Aug 05, 2021 at 03:36:37PM +0200, Tim Wiederhake wrote:
The kernel command line can contain settings affecting the availability of cpu features, eg. "tsx=on". This series adds the kernel command line to the cpu flags cache and declares the cache invalid if the current kernel command line differs.
Multiple things can change the CPU features. kernel version, microcode version, bios settings change, kernel command line. We've been playing whack-a-mole in cache invalidation for ages adding ever more criteria for things which have side effects on CPU features available.
Running the CPUID instruction is cheap. Could we directly query the set of host CPUID leaves we care about, and compare that, and potentially even get rid of some of the other checks we have?
I guess it could help in some cases, but we wouldn't be able to drop some of the existing checks anyway. Because the settings usually do not result in the CPU dropping a particular bit from CPUID, the feature just becomes unusable by reporting a failure when used. So the settings would only be reflected in what features QEMU can enable on the host. Although checking CPUID might be enough for TSX, checking the command line is helpful in other cases.
Would that be reflected by the answer to KVM_GET_SUPPORTED_CPUID which is the intersection of physical CPUID and what KVM is actually willing to enable? That ioctl would be cheap too.
I don't know, to be honest. I guess it should work unless QEMU does some additional processing/filtering of the results it gets from KVM.
Eduardo, do you know if KVM_GET_SUPPORTED_CPUID would be sufficient to check any configuration changes (bios settings, kernel command line, module options, ...) that affect usable CPU features?
GET_SUPPORTED_CPUID is supposed to be enough to cover all kernel-side
factors (including host CPUID flags, kernel and module options, bios
settings). However, I would call KVM_GET_MSRS (the system ioctl, not the
VCPU ioctl) to be extra safe. Some features are available only if KVM
supports MSRs required for them, and some features are exposed to guests
through bits in some MSRs.

That would mean KVM_GET_SUPPORTED_CPUID + KVM_GET_MSRS + QEMU binary
identifier (hash or date/time?) should be sufficient today. The problem here
is the word "today": we never know what kind of extra KVM capabilities new
features might require.

Wouldn't it be easier to simply invalidate the cache every time libvirtd is
restarted? If libvirtd keeps /dev/kvm open all the time, this would also
cover features affected by KVM module reloads.

-- 
Eduardo
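For illustration, the system-scoped MSR query mentioned above is a two-step
affair: KVM_GET_MSR_FEATURE_INDEX_LIST enumerates the feature MSRs KVM knows
about, and the system-level KVM_GET_MSRS then reads their values. A rough
sketch, assuming a small fixed bound on the list size (a real caller would
handle E2BIG and partially returned entries):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

#define MAX_FEATURE_MSRS 64   /* arbitrary bound for this sketch */

int
main(void)
{
    int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
    struct kvm_msr_list *list;
    struct kvm_msrs *msrs;
    unsigned int i;

    if (kvm < 0)
        return 1;

    /* step 1: which MSRs expose feature bits? */
    list = calloc(1, sizeof(*list) + MAX_FEATURE_MSRS * sizeof(list->indices[0]));
    list->nmsrs = MAX_FEATURE_MSRS;
    if (ioctl(kvm, KVM_GET_MSR_FEATURE_INDEX_LIST, list) < 0)
        return 1;

    /* step 2: read their values via the system-scoped KVM_GET_MSRS */
    msrs = calloc(1, sizeof(*msrs) + list->nmsrs * sizeof(msrs->entries[0]));
    msrs->nmsrs = list->nmsrs;
    for (i = 0; i < list->nmsrs; i++)
        msrs->entries[i].index = list->indices[i];

    if (ioctl(kvm, KVM_GET_MSRS, msrs) < 0)
        return 1;

    for (i = 0; i < msrs->nmsrs; i++)
        printf("msr 0x%08x = 0x%016llx\n",
               msrs->entries[i].index,
               (unsigned long long)msrs->entries[i].data);

    free(list);
    free(msrs);
    close(kvm);
    return 0;
}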

On Wed, Aug 11, 2021 at 09:33:08AM -0400, Eduardo Habkost wrote:
On Wed, Aug 11, 2021 at 4:43 AM Jiri Denemark <jdenemar@redhat.com> wrote:
On Fri, Aug 06, 2021 at 18:12:21 +0100, Daniel P. Berrangé wrote:
On Fri, Aug 06, 2021 at 05:07:45PM +0200, Jiri Denemark wrote:
On Thu, Aug 05, 2021 at 14:50:51 +0100, Daniel P. Berrangé wrote:
On Thu, Aug 05, 2021 at 03:36:37PM +0200, Tim Wiederhake wrote:
The kernel command line can contain settings affecting the availability of cpu features, eg. "tsx=on". This series adds the kernel command line to the cpu flags cache and declares the cache invalid if the current kernel command line differs.
Multiple things can change the CPU features. kernel version, microcode version, bios settings change, kernel command line. We've been playing whack-a-mole in cache invalidation for ages adding ever more criteria for things which have side effects on CPU features available.
Running the CPUID instruction is cheap. Could we directly query the set of host CPUID leaves we care about, and compare that, and potentially even get rid of some of the other checks we have?
I guess it could help in some cases, but we wouldn't be able to drop some of the existing checks anyway. Because the settings usually do not result in the CPU dropping a particular bit from CPUID, the feature just becomes unusable by reporting a failure when used. So the settings would only be reflected in what features QEMU can enable on the host. Although checking CPUID might be enough for TSX, checking the command line is helpful in other cases.
Would that be reflected by the answer to KVM_GET_SUPPORTED_CPUID which is the intersection of physical CPUID and what KVM is actually willing to enable? That ioctl would be cheap too.
I don't know, to be honest. I guess it should work unless QEMU does some additional processing/filtering of the results it gets from KVM.
Eduardo, do you know if KVM_GET_SUPPORTED_CPUID would be sufficient to check any configuration changes (bios settings, kernel command line, module options, ...) that affect usable CPU features?
GET_SUPPORTED_CPUID is supposed to be enough to cover all kernel-side factors (including host CPUID flags, kernel and module options, bios settings). However, I would call KVM_GET_MSRS (the system ioctl, not the VCPU ioctl) to be extra safe. Some features are available only if KVM supports MSRs required for them, and some features are exposed to guests through bits in some MSRs.
That would mean KVM_GET_SUPPORTED_CPUID + KVM_GET_MSRS + QEMU binary identifier (hash or date/time?) should be sufficient today. The problem here is the word "today": we never know what kind of extra KVM capabilities new features might require.
Wouldn't it be easier to simply invalidate the cache every time libvirtd is restarted? If libvirtd keeps /dev/kvm open all the time, this would also cover features affected by KVM module reloads.
Invalidating the cache on every restart defeats the very purpose of having a
cache in the first place. Probing for capabilities slows startup of the
daemon, and that is what required the introduction of a cache.

Regards,
Daniel

On Wed, Aug 11, 2021 at 9:42 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
On Wed, Aug 11, 2021 at 09:33:08AM -0400, Eduardo Habkost wrote:
Wouldn't it be easier to simply invalidate the cache every time libvirtd is restarted? If libvirtd keeps /dev/kvm open all the time, this would also cover features affected by KVM module reloads.
Invalidating the cache on every restart defeats the very purpose of having a cache in the first place. Probing for capabilities slows startup of the daemon and that is what required introduction of a cache.
Can't libvirtd query CPU capabilities only when clients ask for that
information the first time?

-- 
Eduardo

On Wed, Aug 11, 2021 at 09:46:17AM -0400, Eduardo Habkost wrote:
On Wed, Aug 11, 2021 at 9:42 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
On Wed, Aug 11, 2021 at 09:33:08AM -0400, Eduardo Habkost wrote:
Wouldn't it be easier to simply invalidate the cache every time libvirtd is restarted? If libvirtd keeps /dev/kvm open all the time, this would also cover features affected by KVM module reloads.
Invalidating the cache on every restart defeats the very purpose of having a cache in the first place. Probing for capabilities slows startup of the daemon and that is what required introduction of a cache.
Can't libvirtd query CPU capabilities only when clients ask for that information the first time?
Anything is technically possible, but that's a significant change from what
we would do today. It would still need caching and an invalidation strategy,
because we don't want such probing to be in the startup path of every VM
spawned.

Regards,
Daniel

On Wed, Aug 11, 2021 at 9:53 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
On Wed, Aug 11, 2021 at 09:46:17AM -0400, Eduardo Habkost wrote:
On Wed, Aug 11, 2021 at 9:42 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
On Wed, Aug 11, 2021 at 09:33:08AM -0400, Eduardo Habkost wrote:
Wouldn't it be easier to simply invalidate the cache every time libvirtd is restarted? If libvirtd keeps /dev/kvm open all the time, this would also cover features affected by KVM module reloads.
Invalidating the cache on every restart defeats the very purpose of having a cache in the first place. Probing for capabilities slows startup of the daemon and that is what required introduction of a cache.
Can't libvirtd query CPU capabilities only when clients ask for that information the first time?
Anything is technically possible, but that's a significant change from what we would do today. It would still need caching and an invalidation strategy, because we don't want such probing to be in the startup path of every VM spawned.
Welp. If the goal is a short-term solution to the game of whack-a-mole, I'd
say GET_SUPPORTED_CPUID + KVM_GET_MSRS would be good enough to avoid having
to enumerate every factor that affects availability of features on the KVM
side.

-- 
Eduardo