On 06.05.2013 22:16, Laine Stump wrote:
VFIO device assignment requires a cgroup ACL to be setup for access
to
the /dev/vfio/nn "group" device for any devices that will be assigned
to a guest. In the case of a host device that is allocated from a
pool, it was being allocated during qemuBuildCommandLine(), which is
called by qemuProcessStart() *after* the all-encompassing
qemuSetupCgroup() is called, meaning that the standard Cgroup ACL
setup wasn't catching these devices allocated from pools.
One possible solution was to manually add a single ACL down inside
qemuBuildCommandLine() when networkAllocateActualDevice() is called,
but that has two problems: 1) the function that adds the cgroup ACL
requires a virDomainObjPtr, which isn't available in
qemuBuildCommandLine(), and 2) we really shouldn't be doing network
device setup inside qemuBuildCommandLine() anyway.
Instead, I've created a new function called
qemuNetworkPrepareDevices() which is called just before
qemuPrepareHostDevices() during qemuProcessStart() (explanation of
ordering in the comments), i.e. well before the call to
qemuSetupCgroup(). To minimize code churn in a patch that will be
backported to 1.0.5-maint, qemuNetworkPrepareDevices only does
networkAllocateActualDevice() and the bare amount of setup required
for type='hostdev network devices.
Note that some of the code that was previously needed in
qemuBuildCommandLine() is no longer required when
networkAllocateActualDevice() is called earlier:
* qemuAssignDeviceHostdevAlias() is already done further down in
qemuProcessStart().
* qemuPrepareHostdevPCIDevices() is called by
qemuPrepareHostDevices() which is called after
qemuNetworkPrepareDevices() in qemuProcessStart().
This new function should be moved into a separate qemu_network.c (or
similarly named) file along with qemuPhysIfaceConnect(),
qemuNetworkIfaceConnect(), and qemuOpenVhostNet(), and expanded to call
those functions as well, then the nnets loop in qemuBuildCommandLine() should
be reduced to only build the commandline string. However, this will
require storing away an array of tapfd and vhostfd that are needed
for the commandline, so I would rather do that in a separate patch and
leave this patch at the minimum to fix the bug.
---
src/qemu/qemu_command.c | 104 ++++++++++++++++++++++++++----------------------
src/qemu/qemu_command.h | 4 +-
src/qemu/qemu_process.c | 8 ++++
3 files changed, 67 insertions(+), 49 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 144620c..5ce97e8 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -457,6 +457,58 @@ qemuOpenVhostNet(virDomainDefPtr def,
return 0;
}
+int
+qemuNetworkPrepareDevices(virDomainDefPtr def)
+{
+ int ret = -1;
+ int ii;
+
+ for (ii = 0; ii < def->nnets; ii++) {
+ virDomainNetDefPtr net = def->nets[ii];
+ int actualType;
+
+ /* If appropriate, grab a physical device from the configured
+ * network's pool of devices, or resolve bridge device name
+ * to the one defined in the network definition.
+ */
+ if (networkAllocateActualDevice(net) < 0)
+ goto cleanup;
+
+ actualType = virDomainNetGetActualType(net);
+ if (actualType == VIR_DOMAIN_NET_TYPE_HOSTDEV &&
+ net->type == VIR_DOMAIN_NET_TYPE_NETWORK) {
+ /* Each type='hostdev' network device must also have a
+ * corresponding entry in the hostdevs array. For netdevs
+ * that are hardcoded as type='hostdev', this is already
+ * done by the parser, but for those allocated from a
+ * network / determined at runtime, we need to do it
+ * separately.
+ */
+ virDomainHostdevDefPtr hostdev = virDomainNetGetActualHostdev(net);
+
+ if (virDomainHostdevFind(def, hostdev, NULL) >= 0) {
+ virReportError(VIR_ERR_INTERNAL_ERROR,
+ _("PCI device %04x:%02x:%02x.%x "
+ "allocated from network %s is already "
+ "in use by domain %s"),
+ hostdev->source.subsys.u.pci.addr.domain,
+ hostdev->source.subsys.u.pci.addr.bus,
+ hostdev->source.subsys.u.pci.addr.slot,
+ hostdev->source.subsys.u.pci.addr.function,
+ net->data.network.name,
+ def->name);
+ goto cleanup;
+ }
+ if (virDomainHostdevInsert(def, hostdev) < 0) {
+ virReportOOMError();
+ goto cleanup;
+ }
+ }
+ }
+ ret = 0;
+cleanup:
+ return ret;
+}
static int qemuDomainDeviceAliasIndex(virDomainDeviceInfoPtr info,
const char *prefix)
@@ -7106,12 +7158,15 @@ qemuBuildCommandLine(virConnectPtr conn,
char vhostfd_name[50] = "";
int vlan;
int bootindex = bootNet;
- int actualType;
+ int actualType = virDomainNetGetActualType(net);
bootNet = 0;
if (!bootindex)
bootindex = net->info.bootIndex;
I'd expect a one line comment here at least to give a reason why we are
skipping VIR_DOMAIN_NET_TYPE_HOSTDEV. Something like:
/* hostdev interfaces already handled by qemuNetworkPrepareDevices */
It's obvious now, but if we get later to this code it won't be IMO.
+ if (actualType == VIR_DOMAIN_NET_TYPE_HOSTDEV)
+ continue;
+
/* VLANs are not used with -netdev, so don't record them */
if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_NETDEV) &&
virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE))
I actually posted a patch which moves the whole block into a separate
function, which is what we should do:
https://www.redhat.com/archives/libvir-list/2013-April/msg01550.html
@@ -7119,53 +7174,6 @@ qemuBuildCommandLine(virConnectPtr conn,
else
vlan = i;
- /* If appropriate, grab a physical device from the configured
- * network's pool of devices, or resolve bridge device name
- * to the one defined in the network definition.
- */
- if (networkAllocateActualDevice(net) < 0)
- goto error;
-
- actualType = virDomainNetGetActualType(net);
- if (actualType == VIR_DOMAIN_NET_TYPE_HOSTDEV) {
- if (net->type == VIR_DOMAIN_NET_TYPE_NETWORK) {
- virDomainHostdevDefPtr hostdev = virDomainNetGetActualHostdev(net);
- virDomainHostdevDefPtr found;
- /* For a network with <forward mode='hostdev'>, there
is a need to
- * add the newly minted hostdev to the hostdevs array.
- */
- if (qemuAssignDeviceHostdevAlias(def, hostdev,
- (def->nhostdevs-1)) < 0) {
- goto error;
- }
-
- if (virDomainHostdevFind(def, hostdev, &found) < 0) {
- if (virDomainHostdevInsert(def, hostdev) < 0) {
- virReportOOMError();
- goto error;
- }
- if (qemuPrepareHostdevPCIDevices(driver, def->name,
def->uuid,
- &hostdev, 1) < 0) {
- goto error;
- }
- }
- else {
- virReportError(VIR_ERR_INTERNAL_ERROR,
- _("PCI device %04x:%02x:%02x.%x "
- "allocated from network %s is already
"
- "in use by domain %s"),
- hostdev->source.subsys.u.pci.addr.domain,
- hostdev->source.subsys.u.pci.addr.bus,
- hostdev->source.subsys.u.pci.addr.slot,
- hostdev->source.subsys.u.pci.addr.function,
- net->data.network.name,
- def->name);
- goto error;
- }
- }
- continue;
- }
-
if (actualType == VIR_DOMAIN_NET_TYPE_NETWORK ||
actualType == VIR_DOMAIN_NET_TYPE_BRIDGE) {
int tapfd = qemuNetworkIfaceConnect(def, conn, driver, net,
diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
index a706942..e690dee 100644
--- a/src/qemu/qemu_command.h
+++ b/src/qemu/qemu_command.h
@@ -1,7 +1,7 @@
/*
* qemu_command.h: QEMU command generation
*
- * Copyright (C) 2006-2012 Red Hat, Inc.
+ * Copyright (C) 2006-2013 Red Hat, Inc.
* Copyright (C) 2006 Daniel P. Berrange
*
* This library is free software; you can redistribute it and/or
@@ -161,6 +161,8 @@ int qemuOpenVhostNet(virDomainDefPtr def,
virQEMUCapsPtr qemuCaps,
int *vhostfd);
+int qemuNetworkPrepareDevices(virDomainDefPtr def);
+
/*
* NB: def->name can be NULL upon return and the caller
* *must* decide how to fill in a name in this case
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 3dd178c..d7b0aee 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -3422,6 +3422,14 @@ int qemuProcessStart(virConnectPtr conn,
goto cleanup;
}
+ /* network devices must be "prepared" before hostdevs, because
+ * setting up a network device might create a new hostdev that
+ * will need to be setup.
+ */
+ VIR_DEBUG("Preparing network devices");
Is it worth mentioning s/network/network host/?
+ if (qemuNetworkPrepareDevices(vm->def) < 0)
+ goto cleanup;
+
/* Must be run before security labelling */
VIR_DEBUG("Preparing host devices");
if (qemuPrepareHostDevices(driver, vm->def, !migrateFrom) < 0)
ACK if you add the comment. The debug message fix is optional.
Michal