[libvirt PATCH v2 0/3] qemu: Add support for /dev/userfaultfd

Jiri Denemark (3): qemu: Add support for /dev/userfaultfd build: Add userfaultfd_sysctl build option spec: Disable with_userfaultfd_sysctl on Fedora and RHEL-9 libvirt.spec.in | 16 +++++++++++ meson.build | 8 ++++++ meson_options.txt | 1 + src/qemu/meson.build | 2 +- src/qemu/qemu.conf.in | 3 +- src/qemu/qemu_cgroup.c | 1 + src/qemu/qemu_process.c | 38 +++++++++++++++++++++++++ src/qemu/qemu_security.c | 45 ++++++++++++++++++++++++++++++ src/qemu/qemu_security.h | 5 ++++ src/qemu/test_libvirtd_qemu.aug.in | 1 + 10 files changed, 118 insertions(+), 2 deletions(-) -- 2.43.0

/dev/userfaultfd device is preferred over userfaultfd syscall for post-copy migrations. Unless qemu driver is configured to disable mount namespace or to forbid access to /dev/userfaultfd in cgroup_device_acl, we will copy it to the limited /dev filesystem QEMU will have access to and label it appropriately. So in the default configuration post-copy migration will be allowed even without enabling vm.unprivileged_userfaultfd sysctl. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/qemu/qemu.conf.in | 3 +- src/qemu/qemu_cgroup.c | 1 + src/qemu/qemu_process.c | 38 +++++++++++++++++++++++++ src/qemu/qemu_security.c | 45 ++++++++++++++++++++++++++++++ src/qemu/qemu_security.h | 5 ++++ src/qemu/test_libvirtd_qemu.aug.in | 1 + 6 files changed, 92 insertions(+), 1 deletion(-) diff --git a/src/qemu/qemu.conf.in b/src/qemu/qemu.conf.in index 34025a02ef..f406df8749 100644 --- a/src/qemu/qemu.conf.in +++ b/src/qemu/qemu.conf.in @@ -565,7 +565,8 @@ #cgroup_device_acl = [ # "/dev/null", "/dev/full", "/dev/zero", # "/dev/random", "/dev/urandom", -# "/dev/ptmx", "/dev/kvm" +# "/dev/ptmx", "/dev/kvm", +# "/dev/userfaultfd" #] # # RDMA migration requires the following extra files to be added to the list: diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 47402b3750..5a5ba763a0 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -41,6 +41,7 @@ const char *const defaultDeviceACL[] = { "/dev/null", "/dev/full", "/dev/zero", "/dev/random", "/dev/urandom", "/dev/ptmx", "/dev/kvm", + "/dev/userfaultfd", NULL, }; #define DEVICE_PTY_MAJOR 136 diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 0a6c18a671..6e51d6586b 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2882,6 +2882,40 @@ qemuProcessStartManagedPRDaemon(virDomainObj *vm) } +static int +qemuProcessAllowPostCopyMigration(virDomainObj *vm) +{ + qemuDomainObjPrivate *priv = vm->privateData; + virQEMUDriver *driver = priv->driver; + g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver); + const char *const *devices = (const char *const *) cfg->cgroupDeviceACL; + const char *uffd = "/dev/userfaultfd"; + int rc; + + if (!virFileExists(uffd)) { + VIR_DEBUG("%s is not supported by the host", uffd); + return 0; + } + + if (!devices) + devices = defaultDeviceACL; + + if (!g_strv_contains(devices, uffd)) { + VIR_DEBUG("%s is not allowed by device ACL", uffd); + return 0; + } + + VIR_DEBUG("Labeling %s in mount namespace", uffd); + if ((rc = qemuSecurityDomainSetMountNSPathLabel(driver, vm, uffd)) < 0) + return -1; + + if (rc == 1) + VIR_DEBUG("Mount namespace is not enabled, leaving %s as is", uffd); + + return 0; +} + + static int qemuProcessInitPasswords(virQEMUDriver *driver, virDomainObj *vm, @@ -7802,6 +7836,10 @@ qemuProcessLaunch(virConnectPtr conn, qemuProcessStartManagedPRDaemon(vm) < 0) goto cleanup; + VIR_DEBUG("Setting up permissions to allow post-copy migration"); + if (qemuProcessAllowPostCopyMigration(vm) < 0) + goto cleanup; + VIR_DEBUG("Setting domain security labels"); if (qemuSecuritySetAllLabel(driver, vm, diff --git a/src/qemu/qemu_security.c b/src/qemu/qemu_security.c index 8bcef14d08..4aaa863ae9 100644 --- a/src/qemu/qemu_security.c +++ b/src/qemu/qemu_security.c @@ -615,6 +615,51 @@ qemuSecurityDomainRestorePathLabel(virQEMUDriver *driver, } +/** + * qemuSecurityDomainSetMountNSPathLabel: + * + * Label given path in mount namespace. If mount namespace is not enabled, + * nothing is labeled at all. + * + * Because the label is only applied in mount namespace, there's no need to + * restore it. + * + * Returns 0 on success, + * 1 when mount namespace is not enabled, + * -1 on error. + */ +int +qemuSecurityDomainSetMountNSPathLabel(virQEMUDriver *driver, + virDomainObj *vm, + const char *path) +{ + int ret = -1; + + if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT)) { + VIR_DEBUG("Not labeling '%s': mount namespace disabled for domain '%s'", + path, vm->def->name); + return 1; + } + + if (virSecurityManagerTransactionStart(driver->securityManager) < 0) + goto cleanup; + + if (virSecurityManagerDomainSetPathLabel(driver->securityManager, + vm->def, path, false) < 0) + goto cleanup; + + if (virSecurityManagerTransactionCommit(driver->securityManager, + vm->pid, false) < 0) + goto cleanup; + + ret = 0; + + cleanup: + virSecurityManagerTransactionAbort(driver->securityManager); + return ret; +} + + /** * qemuSecurityCommandRun: * @driver: the QEMU driver diff --git a/src/qemu/qemu_security.h b/src/qemu/qemu_security.h index 10f11771b4..41da33debc 100644 --- a/src/qemu/qemu_security.h +++ b/src/qemu/qemu_security.h @@ -110,6 +110,11 @@ int qemuSecurityDomainRestorePathLabel(virQEMUDriver *driver, virDomainObj *vm, const char *path); +int +qemuSecurityDomainSetMountNSPathLabel(virQEMUDriver *driver, + virDomainObj *vm, + const char *path); + int qemuSecurityCommandRun(virQEMUDriver *driver, virDomainObj *vm, virCommand *cmd, diff --git a/src/qemu/test_libvirtd_qemu.aug.in b/src/qemu/test_libvirtd_qemu.aug.in index e4cfde6cc7..b97e6de11e 100644 --- a/src/qemu/test_libvirtd_qemu.aug.in +++ b/src/qemu/test_libvirtd_qemu.aug.in @@ -67,6 +67,7 @@ module Test_libvirtd_qemu = { "5" = "/dev/urandom" } { "6" = "/dev/ptmx" } { "7" = "/dev/kvm" } + { "8" = "/dev/userfaultfd" } } { "save_image_format" = "raw" } { "dump_image_format" = "raw" } -- 2.43.0

This option controls whether the sysctl config for enabling unprivileged userfaultfd will be installed. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- meson.build | 8 ++++++++ meson_options.txt | 1 + src/qemu/meson.build | 2 +- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/meson.build b/meson.build index e1c70fce92..dbd9cad6df 100644 --- a/meson.build +++ b/meson.build @@ -2019,6 +2019,12 @@ elif get_option('sysctl_config').enabled() error('sysctl configuration is supported only on linux') endif +if not get_option('userfaultfd_sysctl').disabled() and conf.has('WITH_SYSCTL') + conf.set('WITH_USERFAULTFD_SYSCTL', 1) +elif get_option('userfaultfd_sysctl').enabled() + error('userfaultfd_sysctl option requires sysctl_config to be enabled') +endif + conf.set_quoted('TLS_PRIORITY', get_option('tls_priority')) @@ -2318,6 +2324,8 @@ misc_summary = { 'virt-login-shell': conf.has('WITH_LOGIN_SHELL'), 'virt-host-validate': conf.has('WITH_HOST_VALIDATE'), 'TLS priority': conf.get_unquoted('TLS_PRIORITY'), + 'sysctl config': conf.has('WITH_SYSCTL'), + 'userfaultfd sysctl': conf.has('WITH_USERFAULTFD_SYSCTL'), } summary(misc_summary, section: 'Miscellaneous', bool_yn: true, list_sep: ' ') diff --git a/meson_options.txt b/meson_options.txt index 182e28b3d1..ed91d97abf 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -108,4 +108,5 @@ option('nbdkit', type: 'feature', value: 'auto', description: 'Build nbdkit stor option('nbdkit_config_default', type: 'feature', value: 'auto', description: 'Whether to use nbdkit storage backend for network disks by default (configurable)') option('pm_utils', type: 'feature', value: 'auto', description: 'use pm-utils for power management') option('sysctl_config', type: 'feature', value: 'auto', description: 'Whether to install sysctl configs') +option('userfaultfd_sysctl', type: 'feature', value: 'auto', description: 'Whether to install sysctl config for enabling unprivileged userfaultfd') option('tls_priority', type: 'string', value: 'NORMAL', description: 'set the default TLS session priority string') diff --git a/src/qemu/meson.build b/src/qemu/meson.build index 4c3e1dee78..faea656502 100644 --- a/src/qemu/meson.build +++ b/src/qemu/meson.build @@ -202,7 +202,7 @@ if conf.has('WITH_QEMU') 'in_file': files('virtqemud.init.in'), } - if conf.has('WITH_SYSCTL') + if conf.has('WITH_USERFAULTFD_SYSCTL') install_data( 'postcopy-migration.sysctl', install_dir: prefix / 'lib' / 'sysctl.d', -- 2.43.0

On Tue, Feb 13, 2024 at 11:39:26 +0100, Jiri Denemark wrote:
This option controls whether the sysctl config for enabling unprivileged userfaultfd will be installed.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- meson.build | 8 ++++++++ meson_options.txt | 1 + src/qemu/meson.build | 2 +- 3 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/meson.build b/meson.build index e1c70fce92..dbd9cad6df 100644 --- a/meson.build +++ b/meson.build @@ -2019,6 +2019,12 @@ elif get_option('sysctl_config').enabled() error('sysctl configuration is supported only on linux') endif
+if not get_option('userfaultfd_sysctl').disabled() and conf.has('WITH_SYSCTL') + conf.set('WITH_USERFAULTFD_SYSCTL', 1) +elif get_option('userfaultfd_sysctl').enabled() + error('userfaultfd_sysctl option requires sysctl_config to be enabled') +endif + conf.set_quoted('TLS_PRIORITY', get_option('tls_priority'))
@@ -2318,6 +2324,8 @@ misc_summary = { 'virt-login-shell': conf.has('WITH_LOGIN_SHELL'), 'virt-host-validate': conf.has('WITH_HOST_VALIDATE'), 'TLS priority': conf.get_unquoted('TLS_PRIORITY'), + 'sysctl config': conf.has('WITH_SYSCTL'), + 'userfaultfd sysctl': conf.has('WITH_USERFAULTFD_SYSCTL'), } summary(misc_summary, section: 'Miscellaneous', bool_yn: true, list_sep: ' ')
diff --git a/meson_options.txt b/meson_options.txt index 182e28b3d1..ed91d97abf 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -108,4 +108,5 @@ option('nbdkit', type: 'feature', value: 'auto', description: 'Build nbdkit stor option('nbdkit_config_default', type: 'feature', value: 'auto', description: 'Whether to use nbdkit storage backend for network disks by default (configurable)') option('pm_utils', type: 'feature', value: 'auto', description: 'use pm-utils for power management') option('sysctl_config', type: 'feature', value: 'auto', description: 'Whether to install sysctl configs') +option('userfaultfd_sysctl', type: 'feature', value: 'auto', description: 'Whether to install sysctl config for enabling unprivileged userfaultfd') option('tls_priority', type: 'string', value: 'NORMAL', description: 'set the default TLS session priority string')
So this change apparently broke mingw build on Fedora: ERROR: Problem encountered: userfaultfd_sysctl option requires sysctl_config to be enabled The mingw part of libvirt.spec runs meson setup with -Dsysctl_config=disabled and does not specify -Duserfaultfd_sysctl at all, which I would expect means the same as -Duserfaultfd_sysctl=auto. But according to the error message it looks like userfaultfd_sysctl was actually enabled. I didn't explicitly test mingw builds, but I tested running meson setup with all combinations, especially with -Dsysctl_config=disabled and not specifying userfaultfd_sysctl option at all and it worked as expected: sysctl config : NO userfaultfd sysctl : NO Sure, I can easily fix this by passing -Duserfaultfd_sysctl=disable for mingw builds, but I'm wondering why the build doesn't work with just -Dsysctl_config=disabled? Jirka

On Tue, Feb 13, 2024 at 18:48:59 +0100, Jiri Denemark wrote:
On Tue, Feb 13, 2024 at 11:39:26 +0100, Jiri Denemark wrote:
This option controls whether the sysctl config for enabling unprivileged userfaultfd will be installed.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- meson.build | 8 ++++++++ meson_options.txt | 1 + src/qemu/meson.build | 2 +- 3 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/meson.build b/meson.build index e1c70fce92..dbd9cad6df 100644 --- a/meson.build +++ b/meson.build @@ -2019,6 +2019,12 @@ elif get_option('sysctl_config').enabled() error('sysctl configuration is supported only on linux') endif
+if not get_option('userfaultfd_sysctl').disabled() and conf.has('WITH_SYSCTL') + conf.set('WITH_USERFAULTFD_SYSCTL', 1) +elif get_option('userfaultfd_sysctl').enabled() + error('userfaultfd_sysctl option requires sysctl_config to be enabled') +endif + conf.set_quoted('TLS_PRIORITY', get_option('tls_priority'))
@@ -2318,6 +2324,8 @@ misc_summary = { 'virt-login-shell': conf.has('WITH_LOGIN_SHELL'), 'virt-host-validate': conf.has('WITH_HOST_VALIDATE'), 'TLS priority': conf.get_unquoted('TLS_PRIORITY'), + 'sysctl config': conf.has('WITH_SYSCTL'), + 'userfaultfd sysctl': conf.has('WITH_USERFAULTFD_SYSCTL'), } summary(misc_summary, section: 'Miscellaneous', bool_yn: true, list_sep: ' ')
diff --git a/meson_options.txt b/meson_options.txt index 182e28b3d1..ed91d97abf 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -108,4 +108,5 @@ option('nbdkit', type: 'feature', value: 'auto', description: 'Build nbdkit stor option('nbdkit_config_default', type: 'feature', value: 'auto', description: 'Whether to use nbdkit storage backend for network disks by default (configurable)') option('pm_utils', type: 'feature', value: 'auto', description: 'use pm-utils for power management') option('sysctl_config', type: 'feature', value: 'auto', description: 'Whether to install sysctl configs') +option('userfaultfd_sysctl', type: 'feature', value: 'auto', description: 'Whether to install sysctl config for enabling unprivileged userfaultfd') option('tls_priority', type: 'string', value: 'NORMAL', description: 'set the default TLS session priority string')
So this change apparently broke mingw build on Fedora:
ERROR: Problem encountered: userfaultfd_sysctl option requires sysctl_config to be enabled
The mingw part of libvirt.spec runs meson setup with -Dsysctl_config=disabled and does not specify -Duserfaultfd_sysctl at all, which I would expect means the same as -Duserfaultfd_sysctl=auto. But according to the error message it looks like userfaultfd_sysctl was actually enabled.
I didn't explicitly test mingw builds, but I tested running meson setup with all combinations, especially with -Dsysctl_config=disabled and not specifying userfaultfd_sysctl option at all and it worked as expected:
sysctl config : NO userfaultfd sysctl : NO
Sure, I can easily fix this by passing -Duserfaultfd_sysctl=disable for mingw builds, but I'm wondering why the build doesn't work with just -Dsysctl_config=disabled?
Oh I see, the %meson* RPM macros explicitly enable all auto features using --auto-features=enabled. Jirka

On a Tuesday in 2024, Jiri Denemark wrote:
On Tue, Feb 13, 2024 at 18:48:59 +0100, Jiri Denemark wrote:
I didn't explicitly test mingw builds, but I tested running meson setup with all combinations, especially with -Dsysctl_config=disabled and not specifying userfaultfd_sysctl option at all and it worked as expected:
sysctl config : NO userfaultfd sysctl : NO
Sure, I can easily fix this by passing -Duserfaultfd_sysctl=disable for mingw builds, but I'm wondering why the build doesn't work with just -Dsysctl_config=disabled?
Oh I see, the %meson* RPM macros explicitly enable all auto features using --auto-features=enabled.
That is true for %meson, but it does not seem to be for %mingw_meson where we add --auto-features=enabled to its invocation. Jano
Jirka _______________________________________________ Devel mailing list -- devel@lists.libvirt.org To unsubscribe send an email to devel-leave@lists.libvirt.org

All supported versions of Fedora and RHEL >= 9.0 support /dev/userfaultfd. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- libvirt.spec.in | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/libvirt.spec.in b/libvirt.spec.in index 8413e3c19a..6701f7b6e9 100644 --- a/libvirt.spec.in +++ b/libvirt.spec.in @@ -90,6 +90,7 @@ # Other optional features %define with_numactl 0%{!?_without_numactl:1} +%define with_userfaultfd_sysctl 0%{!?_without_userfaultfd_sysctl:1} # A few optional bits off by default, we enable later %define with_fuse 0 @@ -246,6 +247,12 @@ %define enable_werror -Dwerror=false -Dgit_werror=disabled %endif +# Fedora and RHEL-9 are new enough to support /dev/userfaultfd, which +# does not require enabling vm.unprivileged_userfaultfd sysctl. +%if 0%{?fedora} || 0%{?rhel} >= 9 + %define with_userfaultfd_sysctl 0 +%endif + %define tls_priority "@LIBVIRT,SYSTEM" # libvirt 8.1.0 stops distributing any sysconfig files. @@ -1276,6 +1283,12 @@ exit 1 %define arg_remote_mode -Dremote_default_mode=legacy %endif +%if %{with_userfaultfd_sysctl} + %define arg_userfaultfd_sysctl -Duserfaultfd_sysctl=enabled +%else + %define arg_userfaultfd_sysctl -Duserfaultfd_sysctl=disabled +%endif + %define when %(date +"%%F-%%T") %define where %(hostname) %define who %{?packager}%{!?packager:Unknown} @@ -1355,6 +1368,7 @@ export SOURCE_DATE_EPOCH=$(stat --printf='%Y' %{_specdir}/libvirt.spec) -Dqemu_moddir=%{qemu_moddir} \ -Dqemu_datadir=%{qemu_datadir} \ -Dtls_priority=%{tls_priority} \ + %{?arg_userfaultfd_sysctl} \ %{?enable_werror} \ -Dexpensive_tests=enabled \ -Dinit_script=systemd \ @@ -2211,7 +2225,9 @@ exit 0 %if %{with_qemu} %files daemon-driver-qemu %config(noreplace) %{_sysconfdir}/libvirt/virtqemud.conf + %if %{with_userfaultfd_sysctl} %config(noreplace) %{_prefix}/lib/sysctl.d/60-qemu-postcopy-migration.conf + %endif %{_datadir}/augeas/lenses/virtqemud.aug %{_datadir}/augeas/lenses/tests/test_virtqemud.aug %{_unitdir}/virtqemud.service -- 2.43.0

On a Tuesday in 2024, Jiri Denemark wrote:
Jiri Denemark (3): qemu: Add support for /dev/userfaultfd build: Add userfaultfd_sysctl build option spec: Disable with_userfaultfd_sysctl on Fedora and RHEL-9
libvirt.spec.in | 16 +++++++++++ meson.build | 8 ++++++ meson_options.txt | 1 + src/qemu/meson.build | 2 +- src/qemu/qemu.conf.in | 3 +- src/qemu/qemu_cgroup.c | 1 + src/qemu/qemu_process.c | 38 +++++++++++++++++++++++++ src/qemu/qemu_security.c | 45 ++++++++++++++++++++++++++++++ src/qemu/qemu_security.h | 5 ++++ src/qemu/test_libvirtd_qemu.aug.in | 1 + 10 files changed, 118 insertions(+), 2 deletions(-)
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano
participants (2)
-
Jiri Denemark
-
Ján Tomko