/dev/userfaultfd device is preferred over userfaultfd syscall for
post-copy migrations. Unless qemu driver is configured to disable mount
namespace or to forbid access to /dev/userfaultfd in cgroup_device_acl,
we will copy it to the limited /dev filesystem QEMU will have access to
and label it appropriately. So in the default configuration post-copy
migration will be allowed even without enabling
vm.unprivileged_userfaultfd sysctl.
Signed-off-by: Jiri Denemark <jdenemar(a)redhat.com>
---
Notes:
The question is what should we do with the
src/qemu/postcopy-migration.sysctl file which is installed by
libvirt.spec to /usr/lib/sysctl.d/60-qemu-postcopy-migration.conf by
default. The file is now useless and should ideally be removed, but only
when the host kernel is new enough to support /dev/userfaultfd
src/qemu/qemu.conf.in | 3 +-
src/qemu/qemu_cgroup.c | 1 +
src/qemu/qemu_process.c | 38 +++++++++++++++++++++++++
src/qemu/qemu_security.c | 45 ++++++++++++++++++++++++++++++
src/qemu/qemu_security.h | 5 ++++
src/qemu/test_libvirtd_qemu.aug.in | 1 +
6 files changed, 92 insertions(+), 1 deletion(-)
diff --git a/src/qemu/qemu.conf.in b/src/qemu/qemu.conf.in
index 34025a02ef..f406df8749 100644
--- a/src/qemu/qemu.conf.in
+++ b/src/qemu/qemu.conf.in
@@ -565,7 +565,8 @@
#cgroup_device_acl = [
# "/dev/null", "/dev/full", "/dev/zero",
# "/dev/random", "/dev/urandom",
-# "/dev/ptmx", "/dev/kvm"
+# "/dev/ptmx", "/dev/kvm",
+# "/dev/userfaultfd"
#]
#
# RDMA migration requires the following extra files to be added to the list:
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 47402b3750..5a5ba763a0 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -41,6 +41,7 @@ const char *const defaultDeviceACL[] = {
"/dev/null", "/dev/full", "/dev/zero",
"/dev/random", "/dev/urandom",
"/dev/ptmx", "/dev/kvm",
+ "/dev/userfaultfd",
NULL,
};
#define DEVICE_PTY_MAJOR 136
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 0a6c18a671..6e51d6586b 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -2882,6 +2882,40 @@ qemuProcessStartManagedPRDaemon(virDomainObj *vm)
}
+static int
+qemuProcessAllowPostCopyMigration(virDomainObj *vm)
+{
+ qemuDomainObjPrivate *priv = vm->privateData;
+ virQEMUDriver *driver = priv->driver;
+ g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver);
+ const char *const *devices = (const char *const *) cfg->cgroupDeviceACL;
+ const char *uffd = "/dev/userfaultfd";
+ int rc;
+
+ if (!virFileExists(uffd)) {
+ VIR_DEBUG("%s is not supported by the host", uffd);
+ return 0;
+ }
+
+ if (!devices)
+ devices = defaultDeviceACL;
+
+ if (!g_strv_contains(devices, uffd)) {
+ VIR_DEBUG("%s is not allowed by device ACL", uffd);
+ return 0;
+ }
+
+ VIR_DEBUG("Labeling %s in mount namespace", uffd);
+ if ((rc = qemuSecurityDomainSetMountNSPathLabel(driver, vm, uffd)) < 0)
+ return -1;
+
+ if (rc == 1)
+ VIR_DEBUG("Mount namespace is not enabled, leaving %s as is", uffd);
+
+ return 0;
+}
+
+
static int
qemuProcessInitPasswords(virQEMUDriver *driver,
virDomainObj *vm,
@@ -7802,6 +7836,10 @@ qemuProcessLaunch(virConnectPtr conn,
qemuProcessStartManagedPRDaemon(vm) < 0)
goto cleanup;
+ VIR_DEBUG("Setting up permissions to allow post-copy migration");
+ if (qemuProcessAllowPostCopyMigration(vm) < 0)
+ goto cleanup;
+
VIR_DEBUG("Setting domain security labels");
if (qemuSecuritySetAllLabel(driver,
vm,
diff --git a/src/qemu/qemu_security.c b/src/qemu/qemu_security.c
index 8bcef14d08..4aaa863ae9 100644
--- a/src/qemu/qemu_security.c
+++ b/src/qemu/qemu_security.c
@@ -615,6 +615,51 @@ qemuSecurityDomainRestorePathLabel(virQEMUDriver *driver,
}
+/**
+ * qemuSecurityDomainSetMountNSPathLabel:
+ *
+ * Label given path in mount namespace. If mount namespace is not enabled,
+ * nothing is labeled at all.
+ *
+ * Because the label is only applied in mount namespace, there's no need to
+ * restore it.
+ *
+ * Returns 0 on success,
+ * 1 when mount namespace is not enabled,
+ * -1 on error.
+ */
+int
+qemuSecurityDomainSetMountNSPathLabel(virQEMUDriver *driver,
+ virDomainObj *vm,
+ const char *path)
+{
+ int ret = -1;
+
+ if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT)) {
+ VIR_DEBUG("Not labeling '%s': mount namespace disabled for domain
'%s'",
+ path, vm->def->name);
+ return 1;
+ }
+
+ if (virSecurityManagerTransactionStart(driver->securityManager) < 0)
+ goto cleanup;
+
+ if (virSecurityManagerDomainSetPathLabel(driver->securityManager,
+ vm->def, path, false) < 0)
+ goto cleanup;
+
+ if (virSecurityManagerTransactionCommit(driver->securityManager,
+ vm->pid, false) < 0)
+ goto cleanup;
+
+ ret = 0;
+
+ cleanup:
+ virSecurityManagerTransactionAbort(driver->securityManager);
+ return ret;
+}
+
+
/**
* qemuSecurityCommandRun:
* @driver: the QEMU driver
diff --git a/src/qemu/qemu_security.h b/src/qemu/qemu_security.h
index 10f11771b4..41da33debc 100644
--- a/src/qemu/qemu_security.h
+++ b/src/qemu/qemu_security.h
@@ -110,6 +110,11 @@ int qemuSecurityDomainRestorePathLabel(virQEMUDriver *driver,
virDomainObj *vm,
const char *path);
+int
+qemuSecurityDomainSetMountNSPathLabel(virQEMUDriver *driver,
+ virDomainObj *vm,
+ const char *path);
+
int qemuSecurityCommandRun(virQEMUDriver *driver,
virDomainObj *vm,
virCommand *cmd,
diff --git a/src/qemu/test_libvirtd_qemu.aug.in b/src/qemu/test_libvirtd_qemu.aug.in
index e4cfde6cc7..b97e6de11e 100644
--- a/src/qemu/test_libvirtd_qemu.aug.in
+++ b/src/qemu/test_libvirtd_qemu.aug.in
@@ -67,6 +67,7 @@ module Test_libvirtd_qemu =
{ "5" = "/dev/urandom" }
{ "6" = "/dev/ptmx" }
{ "7" = "/dev/kvm" }
+ { "8" = "/dev/userfaultfd" }
}
{ "save_image_format" = "raw" }
{ "dump_image_format" = "raw" }
--
2.43.0