[libvirt] [PATCH 00/12] Couple of host info getting improvements
The rationale is to fill in the gaps as described here: https://wiki.openstack.org/wiki/VirtDriverGuestCPUMemoryPlacement Note that these are targeted for 1.2.6. Michal Privoznik (12): Introduce virNodeHugeTLB Initial virsh exposure of virNodeHugeTLB nodeinfo: Implement nodeHugeTLB virpci: Introduce virPCIDeviceIsPCIExpress and friends nodedev: Introduce <pci-express/> to PCI devices virInterface: Expose link state & speed interface_backend_udev: Implement link speed & state interface_backend_netcf: Implement link speed & state virhostdev: Move IOMMU and VFIO funcs from qemu virCaps: Introduce IOMMU and VFIO capabilities Expose IOMMU and VFIO capabilities from virCaps nodedev: Export NUMA node locality for PCI devices daemon/remote.c | 54 ++++++ docs/formatcaps.html.in | 8 +- docs/formatnode.html.in | 25 +++ docs/schemas/capability.rng | 12 ++ docs/schemas/interface.rng | 27 +++ docs/schemas/nodedev.rng | 37 ++++ include/libvirt/libvirt.h.in | 26 +++ src/conf/capabilities.c | 50 ++++++ src/conf/capabilities.h | 11 ++ src/conf/interface_conf.c | 39 ++++- src/conf/interface_conf.h | 15 ++ src/conf/node_device_conf.c | 166 +++++++++++++++++- src/conf/node_device_conf.h | 32 +++- src/driver.h | 7 + src/interface/interface_backend_netcf.c | 99 +++++++++++ src/interface/interface_backend_udev.c | 19 ++ src/libvirt.c | 59 +++++++ src/libvirt_private.syms | 13 ++ src/libvirt_public.syms | 5 + src/lxc/lxc_conf.c | 3 + src/node_device/node_device_udev.c | 38 ++++ src/nodeinfo.c | 192 +++++++++++++++++++++ src/nodeinfo.h | 6 + src/parallels/parallels_driver.c | 3 + src/phyp/phyp_driver.c | 3 + src/qemu/qemu_capabilities.c | 3 + src/qemu/qemu_driver.c | 30 +++- src/qemu/qemu_hostdev.c | 93 ++-------- src/qemu/qemu_hostdev.h | 2 - src/remote/remote_driver.c | 48 ++++++ src/remote/remote_protocol.x | 19 +- src/remote_protocol-structs | 13 ++ src/uml/uml_conf.c | 3 + src/util/virhostdev.c | 73 ++++++++ src/util/virhostdev.h | 4 + src/util/virpci.c | 81 ++++++++- src/util/virpci.h | 8 + src/vbox/vbox_tmpl.c | 3 + tests/capabilityschemadata/caps-qemu-kvm.xml | 2 + tests/capabilityschemadata/caps-test.xml | 2 + tests/capabilityschemadata/caps-test2.xml | 2 + tests/capabilityschemadata/caps-test3.xml | 2 + tests/interfaceschemadata/bridge-no-address.xml | 1 + tests/interfaceschemadata/bridge.xml | 1 + tests/interfaceschemadata/ethernet-dhcp.xml | 1 + tests/nodedevschemadata/pci_1002_71c4.xml | 1 + tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml | 1 + .../pci_8086_4238_pcie_wireless.xml | 26 +++ tests/nodedevxml2xmltest.c | 1 + tests/xencapsdata/xen-i686-pae-hvm.xml | 2 + tests/xencapsdata/xen-i686-pae.xml | 2 + tests/xencapsdata/xen-i686.xml | 2 + tests/xencapsdata/xen-ia64-be-hvm.xml | 2 + tests/xencapsdata/xen-ia64-be.xml | 2 + tests/xencapsdata/xen-ia64-hvm.xml | 2 + tests/xencapsdata/xen-ia64.xml | 2 + tests/xencapsdata/xen-ppc64.xml | 2 + tests/xencapsdata/xen-x86_64-hvm.xml | 2 + tests/xencapsdata/xen-x86_64.xml | 2 + tools/virsh-host.c | 62 +++++++ tools/virsh.pod | 7 + 61 files changed, 1371 insertions(+), 87 deletions(-) create mode 100644 tests/nodedevschemadata/pci_8086_4238_pcie_wireless.xml -- 1.9.3
The API queries huge page info in the host and reports it back to the caller. This may be handy for management application to decide whether to run a domain with huge pages enabled or not. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- daemon/remote.c | 54 ++++++++++++++++++++++++++++++++++++++++++++ include/libvirt/libvirt.h.in | 6 +++++ src/driver.h | 7 ++++++ src/libvirt.c | 54 ++++++++++++++++++++++++++++++++++++++++++++ src/libvirt_public.syms | 5 ++++ src/remote/remote_driver.c | 48 +++++++++++++++++++++++++++++++++++++++ src/remote/remote_protocol.x | 19 +++++++++++++++- src/remote_protocol-structs | 13 +++++++++++ 8 files changed, 205 insertions(+), 1 deletion(-) diff --git a/daemon/remote.c b/daemon/remote.c index 34c96c9..95ff973 100644 --- a/daemon/remote.c +++ b/daemon/remote.c @@ -6115,6 +6115,60 @@ remoteDispatchDomainGetTime(virNetServerPtr server ATTRIBUTE_UNUSED, return rv; } +static int remoteDispatchNodeHugeTlb(virNetServerPtr server ATTRIBUTE_UNUSED, + virNetServerClientPtr client, + virNetMessagePtr msg ATTRIBUTE_UNUSED, + virNetMessageErrorPtr rerr, + remote_node_huge_tlb_args *args, + remote_node_huge_tlb_ret *ret) +{ + virTypedParameterPtr params = NULL; + int nparams = 0; + int rv = -1; + struct daemonClientPrivate *priv = virNetServerClientGetPrivateData(client); + + if (!priv->conn) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("connection not open")); + goto cleanup; + } + + if (args->nparams > REMOTE_NODE_MEMORY_PARAMETERS_MAX) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("nparams too large")); + goto cleanup; + } + if (args->nparams && VIR_ALLOC_N(params, args->nparams) < 0) + goto cleanup; + nparams = args->nparams; + + if (virNodeHugeTLB(priv->conn, args->type, params, &nparams, args->flags) < 0) + goto cleanup; + + /* In this case, we need to send back the number of parameters + * supported + */ + if (args->nparams == 0) { + ret->nparams = nparams; + goto success; + } + + if (remoteSerializeTypedParameters(params, nparams, + &ret->params.params_val, + &ret->params.params_len, + args->flags) < 0) + goto cleanup; + + + success: + rv = 0; + + cleanup: + if (rv < 0) + virNetMessageSaveError(rerr); + virTypedParamsFree(params, nparams); + return rv; +} + + /*----- Helpers. -----*/ /* get_nonnull_domain and get_nonnull_network turn an on-wire diff --git a/include/libvirt/libvirt.h.in b/include/libvirt/libvirt.h.in index 260c971..cb1cad9 100644 --- a/include/libvirt/libvirt.h.in +++ b/include/libvirt/libvirt.h.in @@ -858,6 +858,12 @@ int virNodeSetMemoryParameters(virConnectPtr conn, int nparams, unsigned int flags); +int virNodeHugeTLB(virConnectPtr conn, + int type, + virTypedParameterPtr params, + int *nparams, + unsigned int flags); + /* * node CPU map */ diff --git a/src/driver.h b/src/driver.h index 5ac89d6..4acde08 100644 --- a/src/driver.h +++ b/src/driver.h @@ -1070,6 +1070,12 @@ typedef int virTypedParameterPtr params, int nparams, unsigned int flags); +typedef int +(*virDrvNodeHugeTLB)(virConnectPtr conn, + int type, + virTypedParameterPtr params, + int *nparams, + unsigned int flags); typedef int (*virDrvNodeGetCPUMap)(virConnectPtr conn, @@ -1376,6 +1382,7 @@ struct _virDriver { virDrvDomainGetMetadata domainGetMetadata; virDrvNodeGetMemoryParameters nodeGetMemoryParameters; virDrvNodeSetMemoryParameters nodeSetMemoryParameters; + virDrvNodeHugeTLB nodeHugeTLB; virDrvNodeGetCPUMap nodeGetCPUMap; virDrvDomainFSTrim domainFSTrim; virDrvDomainSendProcessSignal domainSendProcessSignal; diff --git a/src/libvirt.c b/src/libvirt.c index f01b6dd..de20b0c 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -7561,6 +7561,60 @@ virNodeSetMemoryParameters(virConnectPtr conn, /** + * virNodeHugeTLB: + * @conn: pointer to the hypervisor connection + * @type: type + * @params: pointer to memory parameter object + * (return value, allocated by the caller) + * @nparams: pointer to number of memory parameters; input and output + * @flags: extra flags; not used yet, so callers should always pass 0 + * + * Get information about host's huge pages. On input, @nparams + * gives the size of the @params array; on output, @nparams gives + * how many slots were filled with parameter information, which + * might be less but will not exceed the input value. + * + * As a special case, calling with @params as NULL and @nparams + * as 0 on input will cause @nparams on output to contain the + * number of parameters supported by the hypervisor. The caller + * should then allocate @params array, i.e. + * (sizeof(@virTypedParameter) * @nparams) bytes and call the API + * again. See virDomainGetMemoryParameters() for an equivalent + * usage example. + * + * Returns 0 in case of success, and -1 in case of failure. + */ +int +virNodeHugeTLB(virConnectPtr conn, + int type, + virTypedParameterPtr params, + int *nparams, + unsigned int flags) +{ + VIR_DEBUG("conn=%p, type=%d, params=%p, nparams=%p, flags=%x", + conn, type, params, nparams, flags); + + virResetLastError(); + + virCheckNonNullArgGoto(nparams, error); + virCheckNonNegativeArgGoto(*nparams, error); + + if (conn->driver->nodeHugeTLB) { + int ret; + ret = conn->driver->nodeHugeTLB(conn, type, params, nparams, flags); + if (ret < 0) + goto error; + return ret; + } + virReportUnsupportedError(); + + error: + virDispatchError(conn); + return -1; +} + + +/** * virDomainGetSchedulerType: * @domain: pointer to domain object * @nparams: pointer to number of scheduler parameters, can be NULL diff --git a/src/libvirt_public.syms b/src/libvirt_public.syms index cce6bdf..0339e70 100644 --- a/src/libvirt_public.syms +++ b/src/libvirt_public.syms @@ -658,5 +658,10 @@ LIBVIRT_1.2.5 { virDomainSetTime; } LIBVIRT_1.2.3; +LIBVIRT_1.2.6 { + global: + virNodeHugeTLB; +} LIBVIRT_1.2.5; + # .... define new API here using predicted next version number .... diff --git a/src/remote/remote_driver.c b/src/remote/remote_driver.c index 85fe597..9136b78 100644 --- a/src/remote/remote_driver.c +++ b/src/remote/remote_driver.c @@ -7469,6 +7469,53 @@ remoteDomainGetTime(virDomainPtr dom, } +static int +remoteNodeHugeTLB(virConnectPtr conn, + int type, + virTypedParameterPtr params, + int *nparams, + unsigned int flags) +{ + int rv = -1; + remote_node_huge_tlb_args args; + remote_node_huge_tlb_ret ret; + struct private_data *priv = conn->privateData; + + remoteDriverLock(priv); + + args.type = type; + args.nparams = *nparams; + args.flags = flags; + + memset(&ret, 0, sizeof(ret)); + if (call(conn, priv, 0, REMOTE_PROC_NODE_HUGE_TLB, + (xdrproc_t) xdr_remote_node_huge_tlb_args, (char *) &args, + (xdrproc_t) xdr_remote_node_huge_tlb_ret, (char *) &ret) == -1) + goto done; + + if (*nparams == 0) { + *nparams = ret.nparams; + rv = 0; + goto cleanup; + } + + if (remoteDeserializeTypedParameters(ret.params.params_val, + ret.params.params_len, + REMOTE_NODE_MEMORY_PARAMETERS_MAX, + ¶ms, + nparams) < 0) + goto cleanup; + + rv = 0; + + cleanup: + xdr_free((xdrproc_t) xdr_remote_node_huge_tlb_ret, (char *) &ret); + done: + remoteDriverUnlock(priv); + return rv; +} + + /* get_nonnull_domain and get_nonnull_network turn an on-wire * (name, uuid) pair into virDomainPtr or virNetworkPtr object. * These can return NULL if underlying memory allocations fail, @@ -7805,6 +7852,7 @@ static virDriver remote_driver = { .domainFSThaw = remoteDomainFSThaw, /* 1.2.5 */ .domainGetTime = remoteDomainGetTime, /* 1.2.5 */ .domainSetTime = remoteDomainSetTime, /* 1.2.5 */ + .nodeHugeTLB = remoteNodeHugeTLB, /* 1.2.6 */ }; static virNetworkDriver network_driver = { diff --git a/src/remote/remote_protocol.x b/src/remote/remote_protocol.x index 1f9d583..4d6cc15 100644 --- a/src/remote/remote_protocol.x +++ b/src/remote/remote_protocol.x @@ -2836,6 +2836,17 @@ struct remote_node_get_memory_parameters_ret { int nparams; }; +struct remote_node_huge_tlb_args { + int type; + int nparams; + unsigned int flags; +}; + +struct remote_node_huge_tlb_ret { + remote_typed_param params<REMOTE_NODE_MEMORY_PARAMETERS_MAX>; + int nparams; +}; + struct remote_node_get_cpu_map_args { int need_map; int need_online; @@ -5338,5 +5349,11 @@ enum remote_procedure { * @generate: both * @acl: domain:set_time */ - REMOTE_PROC_DOMAIN_SET_TIME = 338 + REMOTE_PROC_DOMAIN_SET_TIME = 338, + + /** + * @generate: none + * @acl: connect:read + */ + REMOTE_PROC_NODE_HUGE_TLB = 339 }; diff --git a/src/remote_protocol-structs b/src/remote_protocol-structs index 5b22049..e3fcdea 100644 --- a/src/remote_protocol-structs +++ b/src/remote_protocol-structs @@ -2271,6 +2271,18 @@ struct remote_node_get_memory_parameters_ret { } params; int nparams; }; +struct remote_node_huge_tlb_args { + int type; + int nparams; + u_int flags; +}; +struct remote_node_huge_tlb_ret { + struct { + u_int params_len; + remote_typed_param * params_val; + } params; + int nparams; +}; struct remote_node_get_cpu_map_args { int need_map; int need_online; @@ -2802,4 +2814,5 @@ enum remote_procedure { REMOTE_PROC_DOMAIN_FSTHAW = 336, REMOTE_PROC_DOMAIN_GET_TIME = 337, REMOTE_PROC_DOMAIN_SET_TIME = 338, + REMOTE_PROC_NODE_HUGE_TLB = 339, }; -- 1.9.3
On Thu, May 29, 2014 at 10:32:35AM +0200, Michal Privoznik wrote:
/** + * virNodeHugeTLB: + * @conn: pointer to the hypervisor connection + * @type: type + * @params: pointer to memory parameter object + * (return value, allocated by the caller) + * @nparams: pointer to number of memory parameters; input and output + * @flags: extra flags; not used yet, so callers should always pass 0 + * + * Get information about host's huge pages. On input, @nparams + * gives the size of the @params array; on output, @nparams gives + * how many slots were filled with parameter information, which + * might be less but will not exceed the input value. + * + * As a special case, calling with @params as NULL and @nparams + * as 0 on input will cause @nparams on output to contain the + * number of parameters supported by the hypervisor. The caller + * should then allocate @params array, i.e. + * (sizeof(@virTypedParameter) * @nparams) bytes and call the API + * again. See virDomainGetMemoryParameters() for an equivalent + * usage example. + * + * Returns 0 in case of success, and -1 in case of failure. + */ +int +virNodeHugeTLB(virConnectPtr conn, + int type, + virTypedParameterPtr params, + int *nparams, + unsigned int flags)
What is the 'type' parameter doing ? I think in general this API needs a different design. I'd like to have an API that can request info for all page sizes on all NUMA nods in a single call. I also think the static unchanging data should be part of the cpu + NUMA info in the capabilities XML. So the API only reports info which is changing - ie the available pages. In the <cpu> element, we should report which page sizes are available for the CPU model. In the <topology> element we should report the number of pages of each size that are present in that node. We shouldn't treat huge pages separately from small pages in this respect. So as an example - CPU supports 3 page sizes 4k, 2MB, 1GB - 2 numa nodes - 3 GB of memory per numa node - First node has - 262144 * 4k pages - 2 * 1 GB pages - Second node has - 1536 * 2 MB pages This would look like this <host> <cpu> <arch>x86_64</arch> <model>Westmere</model> <vendor>Intel</vendor> <topology sockets='1' cores='6' threads='2'/> <feature name='rdtscp'/> <feature name='pdpe1gb'/> <feature name='dca'/> <feature name='pdcm'/> <feature name='xtpr'/> <pages units="KiB" size="4"/> <pages units="MiB" size="2"/> <pages units="GiB" size="1"/> </cpu> <topology> <cells num='2'> <cell id='0'> <memory unit='KiB'>3221225472</memory> <pages unit="KiB" size="4">262144</pages> <pages unit="MiB" size="2">0</pages> <pages unit="GiB" size="1">2</pages> <cpus num='4'> <cpu id='0'/> <cpu id='2'/> <cpu id='4'/> <cpu id='6'/> </cpus> </cell> <cell id='1'> <memory unit='KiB'>3221225472</memory> <pages unit="KiB" size="4">0</pages> <pages unit="MiB" size="2">1536</pages> <pages unit="GiB" size="1">2</pages> <cpus num='4'> <cpu id='1'/> <cpu id='3'/> <cpu id='5'/> <cpu id='7'/> </cpus> </cell> </cells> </topology> So then an API call to request the available pages on all nodes would look something like virNodeGetFreePages(virConnectPtr conn, unsigned int *pages, unsigned int npages, unsigned int startcell, unsigned int cellcount, unsigned long long *counts); In this API @pages - array whose elements are the page sizes to request info for @npages - number of elements in @pages @startcell - ID of first NUMA cell to request data for @cellcount - number of cells to request data for @counts - array which is @npages * @cellcount in length So if you want free count for all page sizes on all NUMA nodes you might use this as unsigned int pages[] = { 4096, 2097152, 1073741824} unsigned int npages = ARRAY_CARDINALITY(pages); unsigned int startcell = 0; unsigned int cellcount = 2; unsigned long long counts = malloc(sizeof(long long) * npages * cellcount); virNodeGetFreePages(conn, pages, npages, startcell, cellcount, counts); for (i = 0 ; i < cellcount ; i++) { fprintf(stdout, "Cell %d\n", startcell + i); for (j = 0 ; j < npages ; j++) { fprintf(stdout, " Page size=%d count=%d bytes=%llu\n", pages[j], counts[(i * npages) + j], pages[j] * counts[(i * npages) + j]); } fprintf(stderr, "\n"); } Cell 0 Page size=4096 count=300 bytes=1228800 Page size=2097152 count=0 bytes=0 Page size=1073741824 count=1 bytes=1073741824 Cell 1 Page size=4096 count=0 bytes=0 Page size=2097152 count=20 bytes=41943040 Page size=1073741824 count=0 bytes=0 Or you could request free count for one specific node, or for one specific page size. This new API would basically obsolete the existing virNodeGetCellsFreeMemory by providing something that gave you data on all pages at once, instead of only data on the smallest page size. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
On 30.05.2014 10:52, Daniel P. Berrange wrote:
On Thu, May 29, 2014 at 10:32:35AM +0200, Michal Privoznik wrote:
/** + * virNodeHugeTLB: + * @conn: pointer to the hypervisor connection + * @type: type + * @params: pointer to memory parameter object + * (return value, allocated by the caller) + * @nparams: pointer to number of memory parameters; input and output + * @flags: extra flags; not used yet, so callers should always pass 0 + * + * Get information about host's huge pages. On input, @nparams + * gives the size of the @params array; on output, @nparams gives + * how many slots were filled with parameter information, which + * might be less but will not exceed the input value. + * + * As a special case, calling with @params as NULL and @nparams + * as 0 on input will cause @nparams on output to contain the + * number of parameters supported by the hypervisor. The caller + * should then allocate @params array, i.e. + * (sizeof(@virTypedParameter) * @nparams) bytes and call the API + * again. See virDomainGetMemoryParameters() for an equivalent + * usage example. + * + * Returns 0 in case of success, and -1 in case of failure. + */ +int +virNodeHugeTLB(virConnectPtr conn, + int type, + virTypedParameterPtr params, + int *nparams, + unsigned int flags)
What is the 'type' parameter doing ?
Ah, it should be named numa_node rather than type. If type==-1, then overall statistics are returned (number of {available,free} pages accumulated across all NUMA nodes), if type >= 0, info on the specific NUMA node is returned.
I think in general this API needs a different design. I'd like to have an API that can request info for all page sizes on all NUMA nods in a single call. I also think the static unchanging data should be part of the cpu + NUMA info in the capabilities XML. So the API only reports info which is changing - ie the available pages.
The only problem is, the size of huge pages pool is not immutable. Now it's possible for 2M huge pages to be allocated dynamically: # echo 8 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages and it may be possible for 1GB too in future (what if kernel learns how to do it?). In general, the only thing that we can take unalterable for now is the default size of huge pages. And I wouldn't bet on that either.
In the <cpu> element, we should report which page sizes are available for the CPU model.
Okay.
In the <topology> element we should report the number of pages of each size that are present in that node. We shouldn't treat huge pages separately from small pages in this respect.
Well, if we do that you'll still need two API calls (and hence you'll lack atomicity): virConnectGetCapabilities() followed by virNodeGetFreePages(). On the other hand, the virNodeGetFreePages() itself is not atomic either (from system POV - other processes can jump in and allocate huge pages as libvirtd is executing the API).
So as an example
- CPU supports 3 page sizes 4k, 2MB, 1GB - 2 numa nodes - 3 GB of memory per numa node - First node has - 262144 * 4k pages - 2 * 1 GB pages - Second node has - 1536 * 2 MB pages
This would look like this
<host> <cpu> <arch>x86_64</arch> <model>Westmere</model> <vendor>Intel</vendor> <topology sockets='1' cores='6' threads='2'/> <feature name='rdtscp'/> <feature name='pdpe1gb'/> <feature name='dca'/> <feature name='pdcm'/> <feature name='xtpr'/> <pages units="KiB" size="4"/> <pages units="MiB" size="2"/> <pages units="GiB" size="1"/> </cpu>
Right. This makes sense.
<topology> <cells num='2'> <cell id='0'> <memory unit='KiB'>3221225472</memory> <pages unit="KiB" size="4">262144</pages> <pages unit="MiB" size="2">0</pages> <pages unit="GiB" size="1">2</pages> <cpus num='4'> <cpu id='0'/> <cpu id='2'/> <cpu id='4'/> <cpu id='6'/> </cpus> </cell> <cell id='1'> <memory unit='KiB'>3221225472</memory> <pages unit="KiB" size="4">0</pages> <pages unit="MiB" size="2">1536</pages> <pages unit="GiB" size="1">2</pages> <cpus num='4'> <cpu id='1'/> <cpu id='3'/> <cpu id='5'/> <cpu id='7'/> </cpus> </cell> </cells> </topology
Well, and this to some extent too. We can report the pool size, but since it may vary it's on the same level as number of free pages. What about: <pages unit='MiB' size='2' free='8' avail='8'/> That way we don't even need a new API (not saying we shouldn't implement it though - API is still better than updating capabilities each single time).
So then an API call to request the available pages on all nodes would look something like
virNodeGetFreePages(virConnectPtr conn, unsigned int *pages, unsigned int npages, unsigned int startcell, unsigned int cellcount, unsigned long long *counts);
In this API
@pages - array whose elements are the page sizes to request info for @npages - number of elements in @pages @startcell - ID of first NUMA cell to request data for @cellcount - number of cells to request data for @counts - array which is @npages * @cellcount in length
I wanted to use virTypedParam to make the API open for future addition (e.g. if somebody needs count of surplus or reserved huge pages).
So if you want free count for all page sizes on all NUMA nodes you might use this as
unsigned int pages[] = { 4096, 2097152, 1073741824} unsigned int npages = ARRAY_CARDINALITY(pages); unsigned int startcell = 0; unsigned int cellcount = 2;
unsigned long long counts = malloc(sizeof(long long) * npages * cellcount);
virNodeGetFreePages(conn, pages, npages, startcell, cellcount, counts);
for (i = 0 ; i < cellcount ; i++) { fprintf(stdout, "Cell %d\n", startcell + i); for (j = 0 ; j < npages ; j++) { fprintf(stdout, " Page size=%d count=%d bytes=%llu\n", pages[j], counts[(i * npages) + j], pages[j] * counts[(i * npages) + j]); } fprintf(stderr, "\n"); }
Cell 0 Page size=4096 count=300 bytes=1228800 Page size=2097152 count=0 bytes=0 Page size=1073741824 count=1 bytes=1073741824 Cell 1 Page size=4096 count=0 bytes=0 Page size=2097152 count=20 bytes=41943040 Page size=1073741824 count=0 bytes=0
Or you could request free count for one specific node, or for one specific page size.
This new API would basically obsolete the existing virNodeGetCellsFreeMemory by providing something that gave you data on all pages at once, instead of only data on the smallest page size.
Currently, the API I'm proposing returns tuple (PAGE_SIZE, PAGE_AVAILABLE, PAGE_FREE) repeated for each huge page size. For instance: PAGE_SIZE=2048, PAGE_AVAILABLE=8, PAGE_FREE=8 ,PAGE_SIZE=1048576, PAGE_AVAILABLE=4, PAGE_FREE=2 If we want to have atomic API, how about fitting in NUMA node number? NUMA_NODE=0, PAGE_SIZE=2048, PAGE_AVAILABLE=..., PAGE_SIZE=1048576,...\ NUMA_NODE=1, PAGE_SIZE=2048, ... So the API would then look like virNodeHugeTLB(virConnectPtr conn, virTypedParameterPtr params, int *nparams, unsigned int flags); Michal
On Fri, May 30, 2014 at 01:41:06PM +0200, Michal Privoznik wrote:
On 30.05.2014 10:52, Daniel P. Berrange wrote:
On Thu, May 29, 2014 at 10:32:35AM +0200, Michal Privoznik wrote:
/** + * virNodeHugeTLB: + * @conn: pointer to the hypervisor connection + * @type: type + * @params: pointer to memory parameter object + * (return value, allocated by the caller) + * @nparams: pointer to number of memory parameters; input and output + * @flags: extra flags; not used yet, so callers should always pass 0 + * + * Get information about host's huge pages. On input, @nparams + * gives the size of the @params array; on output, @nparams gives + * how many slots were filled with parameter information, which + * might be less but will not exceed the input value. + * + * As a special case, calling with @params as NULL and @nparams + * as 0 on input will cause @nparams on output to contain the + * number of parameters supported by the hypervisor. The caller + * should then allocate @params array, i.e. + * (sizeof(@virTypedParameter) * @nparams) bytes and call the API + * again. See virDomainGetMemoryParameters() for an equivalent + * usage example. + * + * Returns 0 in case of success, and -1 in case of failure. + */ +int +virNodeHugeTLB(virConnectPtr conn, + int type, + virTypedParameterPtr params, + int *nparams, + unsigned int flags)
What is the 'type' parameter doing ?
Ah, it should be named numa_node rather than type. If type==-1, then overall statistics are returned (number of {available,free} pages accumulated across all NUMA nodes), if type >= 0, info on the specific NUMA node is returned.
I think in general this API needs a different design. I'd like to have an API that can request info for all page sizes on all NUMA nods in a single call. I also think the static unchanging data should be part of the cpu + NUMA info in the capabilities XML. So the API only reports info which is changing - ie the available pages.
The only problem is, the size of huge pages pool is not immutable. Now it's possible for 2M huge pages to be allocated dynamically:
# echo 8 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
and it may be possible for 1GB too in future (what if kernel learns how to do it?). In general, the only thing that we can take unalterable for now is the default size of huge pages. And I wouldn't bet on that either.
Yes, you can in theory change the number of huge pages at an arbitrary time, but realistically people mostly only do it immediately at boot. With 1 GB pages is will be impossible todo it any time except immediately at boot. If you wait a little while, then memory will be too fragmented for you to be able to dynamically allocate more 1 GB pages. The same is true of 2MB pages to a lesser degree.
In the <topology> element we should report the number of pages of each size that are present in that node. We shouldn't treat huge pages separately from small pages in this respect.
Well, if we do that you'll still need two API calls (and hence you'll lack atomicity): virConnectGetCapabilities() followed by virNodeGetFreePages(). On the other hand, the virNodeGetFreePages() itself is not atomic either (from system POV - other processes can jump in and allocate huge pages as libvirtd is executing the API).
I'm not worried about atomicity wrt to free pages - no matter what you do you will always race, because there's inherantly a gap between when you check the free pages and when the VM boots. I'm more interested in the fact that the capabilities shows you the overall NUMA topology of the host, and the pages allocated per node are inherantly part of that model IMHO.
<host> <cpu> <arch>x86_64</arch> <model>Westmere</model> <vendor>Intel</vendor> <topology sockets='1' cores='6' threads='2'/> <feature name='rdtscp'/> <feature name='pdpe1gb'/> <feature name='dca'/> <feature name='pdcm'/> <feature name='xtpr'/> <pages units="KiB" size="4"/> <pages units="MiB" size="2"/> <pages units="GiB" size="1"/> </cpu>
Right. This makes sense.
<topology> <cells num='2'> <cell id='0'> <memory unit='KiB'>3221225472</memory> <pages unit="KiB" size="4">262144</pages> <pages unit="MiB" size="2">0</pages> <pages unit="GiB" size="1">2</pages> <cpus num='4'> <cpu id='0'/> <cpu id='2'/> <cpu id='4'/> <cpu id='6'/> </cpus> </cell> <cell id='1'> <memory unit='KiB'>3221225472</memory> <pages unit="KiB" size="4">0</pages> <pages unit="MiB" size="2">1536</pages> <pages unit="GiB" size="1">2</pages> <cpus num='4'> <cpu id='1'/> <cpu id='3'/> <cpu id='5'/> <cpu id='7'/> </cpus> </cell> </cells> </topology
Well, and this to some extent too. We can report the pool size, but since it may vary it's on the same level as number of free pages. What about:
<pages unit='MiB' size='2' free='8' avail='8'/>
That way we don't even need a new API (not saying we shouldn't implement it though - API is still better than updating capabilities each single time).
I'm reluctant to include the free count in the capabilities because that will encourage people to query the capabilities overly frequently which will be pretty inefficient. This is why we only provide free memory info there currently and have the separate API call. If we report the used/free count in the capabilities we're forced into refreshing this data frequently. If we only report the overall count per node, even if there's the possibility to dynamically change that, we have more potential to avoid refreshing it frequently.
So then an API call to request the available pages on all nodes would look something like
virNodeGetFreePages(virConnectPtr conn, unsigned int *pages, unsigned int npages, unsigned int startcell, unsigned int cellcount, unsigned long long *counts);
In this API
@pages - array whose elements are the page sizes to request info for @npages - number of elements in @pages @startcell - ID of first NUMA cell to request data for @cellcount - number of cells to request data for @counts - array which is @npages * @cellcount in length
I wanted to use virTypedParam to make the API open for future addition (e.g. if somebody needs count of surplus or reserved huge pages).
I'm not sure what you mean by 'reserved huge pages' - all non-4k page counts we are reporting are "reserved pages". The typed parameter APIs are really horrible to use from applications, so I think there needs to be a strong compelling reason to use that design. It makes sense with the various tunable/stats whose valid keys vary across hypervisor and are frequently extended, but I'm not really seeing a good reason for using it for memory page count reporting
Currently, the API I'm proposing returns tuple (PAGE_SIZE, PAGE_AVAILABLE, PAGE_FREE) repeated for each huge page size. For instance:
PAGE_SIZE=2048, PAGE_AVAILABLE=8, PAGE_FREE=8 ,PAGE_SIZE=1048576, PAGE_AVAILABLE=4, PAGE_FREE=2
If we want to have atomic API, how about fitting in NUMA node number?
NUMA_NODE=0, PAGE_SIZE=2048, PAGE_AVAILABLE=..., PAGE_SIZE=1048576,...\ NUMA_NODE=1, PAGE_SIZE=2048, ...
So the API would then look like
virNodeHugeTLB(virConnectPtr conn, virTypedParameterPtr params, int *nparams, unsigned int flags);
IMHO this results in a rather unpleasant API to use. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
On 30.05.2014 14:46, Daniel P. Berrange wrote:
On Fri, May 30, 2014 at 01:41:06PM +0200, Michal Privoznik wrote:
On 30.05.2014 10:52, Daniel P. Berrange wrote:
On Thu, May 29, 2014 at 10:32:35AM +0200, Michal Privoznik wrote:
/** + * virNodeHugeTLB: + * @conn: pointer to the hypervisor connection + * @type: type + * @params: pointer to memory parameter object + * (return value, allocated by the caller) + * @nparams: pointer to number of memory parameters; input and output + * @flags: extra flags; not used yet, so callers should always pass 0 + * + * Get information about host's huge pages. On input, @nparams + * gives the size of the @params array; on output, @nparams gives + * how many slots were filled with parameter information, which + * might be less but will not exceed the input value. + * + * As a special case, calling with @params as NULL and @nparams + * as 0 on input will cause @nparams on output to contain the + * number of parameters supported by the hypervisor. The caller + * should then allocate @params array, i.e. + * (sizeof(@virTypedParameter) * @nparams) bytes and call the API + * again. See virDomainGetMemoryParameters() for an equivalent + * usage example. + * + * Returns 0 in case of success, and -1 in case of failure. + */ +int +virNodeHugeTLB(virConnectPtr conn, + int type, + virTypedParameterPtr params, + int *nparams, + unsigned int flags)
What is the 'type' parameter doing ?
Ah, it should be named numa_node rather than type. If type==-1, then overall statistics are returned (number of {available,free} pages accumulated across all NUMA nodes), if type >= 0, info on the specific NUMA node is returned.
I think in general this API needs a different design. I'd like to have an API that can request info for all page sizes on all NUMA nods in a single call. I also think the static unchanging data should be part of the cpu + NUMA info in the capabilities XML. So the API only reports info which is changing - ie the available pages.
The only problem is, the size of huge pages pool is not immutable. Now it's possible for 2M huge pages to be allocated dynamically:
# echo 8 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
and it may be possible for 1GB too in future (what if kernel learns how to do it?). In general, the only thing that we can take unalterable for now is the default size of huge pages. And I wouldn't bet on that either.
Yes, you can in theory change the number of huge pages at an arbitrary time, but realistically people mostly only do it immediately at boot. With 1 GB pages is will be impossible todo it any time except immediately at boot. If you wait a little while, then memory will be too fragmented for you to be able to dynamically allocate more 1 GB pages. The same is true of 2MB pages to a lesser degree.
IMO no. Processes never ever see physical address (PA). All they see are virtual addresses (VA). So there is possibility for kernel to rearrange physical memory without effect on the processes in order to gain bigger segments of free memory. The only problem are DMA transfers which access PA directly but that can be resolved by reserving first X {M,G}B of RAM for the transfers. And kernel is doing something similar already - Kernel SamePage Merging (KSM). On the other hand, I don't think huge page pool size is going to be changed too often. Especially when compared to huge page allocation in a userspace processes. So whenever mgmt application changes the pool size, it knows it has to refetch capabilities. Well, if we document it :-)
In the <topology> element we should report the number of pages of each size that are present in that node. We shouldn't treat huge pages separately from small pages in this respect.
Well, if we do that you'll still need two API calls (and hence you'll lack atomicity): virConnectGetCapabilities() followed by virNodeGetFreePages(). On the other hand, the virNodeGetFreePages() itself is not atomic either (from system POV - other processes can jump in and allocate huge pages as libvirtd is executing the API).
I'm not worried about atomicity wrt to free pages - no matter what you do you will always race, because there's inherantly a gap between when you check the free pages and when the VM boots. I'm more interested in the fact that the capabilities shows you the overall NUMA topology of the host, and the pages allocated per node are inherantly part of that model IMHO.
<host> <cpu> <arch>x86_64</arch> <model>Westmere</model> <vendor>Intel</vendor> <topology sockets='1' cores='6' threads='2'/> <feature name='rdtscp'/> <feature name='pdpe1gb'/> <feature name='dca'/> <feature name='pdcm'/> <feature name='xtpr'/> <pages units="KiB" size="4"/> <pages units="MiB" size="2"/> <pages units="GiB" size="1"/> </cpu>
Right. This makes sense.
<topology> <cells num='2'> <cell id='0'> <memory unit='KiB'>3221225472</memory> <pages unit="KiB" size="4">262144</pages> <pages unit="MiB" size="2">0</pages> <pages unit="GiB" size="1">2</pages> <cpus num='4'> <cpu id='0'/> <cpu id='2'/> <cpu id='4'/> <cpu id='6'/> </cpus> </cell> <cell id='1'> <memory unit='KiB'>3221225472</memory> <pages unit="KiB" size="4">0</pages> <pages unit="MiB" size="2">1536</pages> <pages unit="GiB" size="1">2</pages> <cpus num='4'> <cpu id='1'/> <cpu id='3'/> <cpu id='5'/> <cpu id='7'/> </cpus> </cell> </cells> </topology
Well, and this to some extent too. We can report the pool size, but since it may vary it's on the same level as number of free pages. What about:
<pages unit='MiB' size='2' free='8' avail='8'/>
That way we don't even need a new API (not saying we shouldn't implement it though - API is still better than updating capabilities each single time).
I'm reluctant to include the free count in the capabilities because that will encourage people to query the capabilities overly frequently which will be pretty inefficient. This is why we only provide free memory info there currently and have the separate API call.
If we report the used/free count in the capabilities we're forced into refreshing this data frequently. If we only report the overall count per node, even if there's the possibility to dynamically change that, we have more potential to avoid refreshing it frequently.
So then an API call to request the available pages on all nodes would look something like
virNodeGetFreePages(virConnectPtr conn, unsigned int *pages, unsigned int npages, unsigned int startcell, unsigned int cellcount, unsigned long long *counts);
In this API
@pages - array whose elements are the page sizes to request info for @npages - number of elements in @pages @startcell - ID of first NUMA cell to request data for @cellcount - number of cells to request data for @counts - array which is @npages * @cellcount in length
I wanted to use virTypedParam to make the API open for future addition (e.g. if somebody needs count of surplus or reserved huge pages).
I'm not sure what you mean by 'reserved huge pages' - all non-4k page counts we are reporting are "reserved pages".
I mean any of /sys/kernel/mm/hugepages/hugepages-2048kB/*. But we can invent virNodeGetResvHugepages() for that. Michal
On Fri, May 30, 2014 at 03:42:01PM +0200, Michal Privoznik wrote:
On 30.05.2014 14:46, Daniel P. Berrange wrote:
On Fri, May 30, 2014 at 01:41:06PM +0200, Michal Privoznik wrote:
On 30.05.2014 10:52, Daniel P. Berrange wrote:
On Thu, May 29, 2014 at 10:32:35AM +0200, Michal Privoznik wrote:
/** + * virNodeHugeTLB: + * @conn: pointer to the hypervisor connection + * @type: type + * @params: pointer to memory parameter object + * (return value, allocated by the caller) + * @nparams: pointer to number of memory parameters; input and output + * @flags: extra flags; not used yet, so callers should always pass 0 + * + * Get information about host's huge pages. On input, @nparams + * gives the size of the @params array; on output, @nparams gives + * how many slots were filled with parameter information, which + * might be less but will not exceed the input value. + * + * As a special case, calling with @params as NULL and @nparams + * as 0 on input will cause @nparams on output to contain the + * number of parameters supported by the hypervisor. The caller + * should then allocate @params array, i.e. + * (sizeof(@virTypedParameter) * @nparams) bytes and call the API + * again. See virDomainGetMemoryParameters() for an equivalent + * usage example. + * + * Returns 0 in case of success, and -1 in case of failure. + */ +int +virNodeHugeTLB(virConnectPtr conn, + int type, + virTypedParameterPtr params, + int *nparams, + unsigned int flags)
What is the 'type' parameter doing ?
Ah, it should be named numa_node rather than type. If type==-1, then overall statistics are returned (number of {available,free} pages accumulated across all NUMA nodes), if type >= 0, info on the specific NUMA node is returned.
I think in general this API needs a different design. I'd like to have an API that can request info for all page sizes on all NUMA nods in a single call. I also think the static unchanging data should be part of the cpu + NUMA info in the capabilities XML. So the API only reports info which is changing - ie the available pages.
The only problem is, the size of huge pages pool is not immutable. Now it's possible for 2M huge pages to be allocated dynamically:
# echo 8 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
and it may be possible for 1GB too in future (what if kernel learns how to do it?). In general, the only thing that we can take unalterable for now is the default size of huge pages. And I wouldn't bet on that either.
Yes, you can in theory change the number of huge pages at an arbitrary time, but realistically people mostly only do it immediately at boot. With 1 GB pages is will be impossible todo it any time except immediately at boot. If you wait a little while, then memory will be too fragmented for you to be able to dynamically allocate more 1 GB pages. The same is true of 2MB pages to a lesser degree.
IMO no. Processes never ever see physical address (PA). All they see are virtual addresses (VA). So there is possibility for kernel to rearrange physical memory without effect on the processes in order to gain bigger segments of free memory.
Applications aren't typically the problem - it is the kernels' own data structures that often cannot be moved at all, so over time they will cause free physical RAM regions to be very fragmented. It is a real problem with huge page usage which will be an order of magnitude worse for 1GB size pages. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
The API is exposed as 'hugepage' command. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- tools/virsh-host.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ tools/virsh.pod | 7 ++++++ 2 files changed, 69 insertions(+) diff --git a/tools/virsh-host.c b/tools/virsh-host.c index cac6086..e71a341 100644 --- a/tools/virsh-host.c +++ b/tools/virsh-host.c @@ -644,6 +644,62 @@ cmdHostname(vshControl *ctl, const vshCmd *cmd ATTRIBUTE_UNUSED) } /* + * "hugepages" command + */ +static const vshCmdInfo info_hugepages[] = { + {.name = "help", + .data = N_("print hugepages info") + }, + {.name = "desc", + .data = N_("print hugepages info") + }, + {.name = NULL} +}; + +static const vshCmdOptDef opts_hugepages[] = { + {.name = "node", + .type = VSH_OT_INT, + .help = N_("NUMA node") + }, + {.name = NULL} +}; + +static bool +cmdHugepages(vshControl *ctl, const vshCmd *cmd) +{ + virTypedParameterPtr params = NULL; + int nparams = 0; + int node = -1; /* Overall stats */ + unsigned int flags = 0; + bool ret = false; + size_t i; + + if (vshCommandOptInt(cmd, "node", &node) < 0) + return false; + + if (virNodeHugeTLB(ctl->conn, node, NULL, &nparams, flags) < 0) + goto cleanup; + + params = vshCalloc(ctl, nparams, sizeof(*params)); + + if (virNodeHugeTLB(ctl->conn, node, params, &nparams, flags) < 0) + goto cleanup; + + vshPrint(ctl, _("Supported huge page sizes:\n")); + for (i = 0; i < nparams; i++) { + char *str = vshGetTypedParamValue(ctl, ¶ms[i]); + vshPrint(ctl, "\t%-15s\t%s\n", params[i].field, str); + VIR_FREE(str); + } + + ret = 0; + cleanup: + virTypedParamsFree(params, nparams); + return ret; +} + + +/* * "uri" command */ static const vshCmdInfo info_uri[] = { @@ -964,6 +1020,12 @@ const vshCmdDef hostAndHypervisorCmds[] = { .info = info_hostname, .flags = 0 }, + {.name = "hugepages", + .handler = cmdHugepages, + .opts = opts_hugepages, + .info = info_hugepages, + .flags = 0 + }, {.name = "maxvcpus", .handler = cmdMaxvcpus, .opts = opts_maxvcpus, diff --git a/tools/virsh.pod b/tools/virsh.pod index 02671b4..5ba591e 100644 --- a/tools/virsh.pod +++ b/tools/virsh.pod @@ -279,6 +279,13 @@ Prints the hypervisor canonical URI, can be useful in shell mode. Print the hypervisor hostname. +=item B<hugepages> [I<node>] + +Print information on host huge pages. By default overall statistics are printed. +To narrow down selection to a single NUMA node use pass it's number to I<node>. +As a special case, if NUMA node number equals to value of -1, the aggregated +stats are printed out. + =item B<sysinfo> Print the XML representation of the hypervisor sysinfo, if available. -- 1.9.3
At the same time, in the qemu driver this is exposed as qemuNodeHugeTLB. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- include/libvirt/libvirt.h.in | 20 +++++ src/libvirt.c | 13 +++- src/libvirt_private.syms | 1 + src/nodeinfo.c | 177 +++++++++++++++++++++++++++++++++++++++++++ src/nodeinfo.h | 5 ++ src/qemu/qemu_driver.c | 14 ++++ 6 files changed, 226 insertions(+), 4 deletions(-) diff --git a/include/libvirt/libvirt.h.in b/include/libvirt/libvirt.h.in index cb1cad9..c492129 100644 --- a/include/libvirt/libvirt.h.in +++ b/include/libvirt/libvirt.h.in @@ -858,6 +858,26 @@ int virNodeSetMemoryParameters(virConnectPtr conn, int nparams, unsigned int flags); +/* VIR_NODE_HUGE_PAGE_SIZE: + * + * Macro for typed parameter that represents a huge page size. + * The size is expressed in kibibytes, so for example 2MB is + * expressed as 2048. */ +# define VIR_NODE_HUGE_PAGE_SIZE "hugepage_size" + +/* VIR_NODE_HUGE_PAGE_AVAILABLE: + * + * Macro for typed parameter that represents the size of the pool + * of huge pages of certain size. It's the sum of allocated plus + * free. */ +# define VIR_NODE_HUGE_PAGE_AVAILABLE "hugepage_available" + +/* VIR_NODE_HUGE_PAGE_FREE: + * + * Macro for typed parameter that represents the number of huge + * pages in the pool that are not yet allocated. */ +# define VIR_NODE_HUGE_PAGE_FREE "hugepage_free" + int virNodeHugeTLB(virConnectPtr conn, int type, virTypedParameterPtr params, diff --git a/src/libvirt.c b/src/libvirt.c index de20b0c..ab7c347 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -7569,10 +7569,15 @@ virNodeSetMemoryParameters(virConnectPtr conn, * @nparams: pointer to number of memory parameters; input and output * @flags: extra flags; not used yet, so callers should always pass 0 * - * Get information about host's huge pages. On input, @nparams - * gives the size of the @params array; on output, @nparams gives - * how many slots were filled with parameter information, which - * might be less but will not exceed the input value. + * Get information about host's huge pages. The parameters are + * positioned. That is, they form a tuple starting with + * VIR_NODE_HUGE_PAGE_SIZE followed by other values (e.g. + * VIR_NODE_HUGE_PAGE_AVAILABLE and VIR_NODE_HUGE_PAGE_FREE). + * Next tuple again starts with the size parameter. On input, + * @nparams gives the size of the @params array; on output, + * @nparams gives how many slots were filled with parameter + * information, which might be less but will not exceed the + * input value. * * As a special case, calling with @params as NULL and @nparams * as 0 on input will cause @nparams on output to contain the diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index cb635cd..1d97c89 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -874,6 +874,7 @@ nodeGetFreeMemory; nodeGetInfo; nodeGetMemoryParameters; nodeGetMemoryStats; +nodeHugeTLB; nodeSetMemoryParameters; diff --git a/src/nodeinfo.c b/src/nodeinfo.c index 56f2b02..be63a30 100644 --- a/src/nodeinfo.c +++ b/src/nodeinfo.c @@ -1880,3 +1880,180 @@ nodeGetFreeMemory(void) return freeMem; } + +static int +nodeHugePageDumpInfo(const char *prefix, + const char *name, + unsigned long long *page_avail, + unsigned long long *page_free) +{ + int ret = -1; + char *buf = NULL; + char *path = NULL; + char *end; + + if (virAsprintf(&path, "%s/%s/nr_hugepages", prefix, name) < 0) + goto cleanup; + + if (virFileReadAll(path, 1024, &buf) < 0) { + virReportSystemError(errno, + _("unable to read %s"), + path); + goto cleanup; + } + + if (virStrToLong_ull(buf, &end, 10, page_avail) < 0 || + *end != '\n') { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to parse: %s"), + buf); + goto cleanup; + } + + VIR_FREE(path); + VIR_FREE(buf); + + if (virAsprintf(&path, "%s/%s/free_hugepages", prefix, name) < 0) + goto cleanup; + + if (virFileReadAll(path, 1024, &buf) < 0) { + virReportSystemError(errno, + _("unable to read %s"), + path); + goto cleanup; + } + + if (virStrToLong_ull(buf, &end, 10, page_free) < 0 || + *end != '\n') { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to parse: %s"), + buf); + goto cleanup; + } + + ret = 0; + cleanup: + VIR_FREE(path); + VIR_FREE(buf); + return ret; +} + +#define HUGEPAGES_PREFIX "hugepages-" + +static int +nodeHugePageSupported(const char *prefix, + virTypedParameterPtr params, + int *nparams) +{ + int ret = -1; + DIR *dir = NULL; + struct dirent *entry = NULL; + size_t i = 0; + + if (!(dir = opendir(prefix))) { + virReportSystemError(errno, + _("failed to open path: %s"), + prefix); + return ret; + } + + while (virDirRead(dir, &entry, prefix) > 0) { + const char *page_name = entry->d_name; + unsigned long long page_size, page_avail, page_free; + char *end; + + /* Just to give you a hint, we're dealing with this: + * hugepages-2048kB/ or hugepages-1048576kB/ */ + if (!STRPREFIX(entry->d_name, HUGEPAGES_PREFIX)) + continue; + + page_name += strlen(HUGEPAGES_PREFIX); + + if (virStrToLong_ull(page_name, &end, 10, &page_size) < 0 || + STRCASENEQ(end, "kB")) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to parse %s"), + entry->d_name); + goto cleanup; + } + + if (nodeHugePageDumpInfo(prefix, entry->d_name, + &page_avail, &page_free) < 0) + goto cleanup; + + if (!params) { + (*nparams) += 3; + } else { + if (i < *nparams && + virTypedParameterAssign(¶ms[i], + VIR_NODE_HUGE_PAGE_SIZE, + VIR_TYPED_PARAM_ULLONG, + page_size) < 0) + goto cleanup; + i++; + + if (i < *nparams && + virTypedParameterAssign(¶ms[i], + VIR_NODE_HUGE_PAGE_AVAILABLE, + VIR_TYPED_PARAM_ULLONG, + page_avail) < 0) + goto cleanup; + i++; + + if (i < *nparams && + virTypedParameterAssign(¶ms[i], + VIR_NODE_HUGE_PAGE_FREE, + VIR_TYPED_PARAM_ULLONG, + page_free) < 0) + goto cleanup; + i++; + } + } + + if (i) + *nparams = i; + + ret = 0; + + cleanup: + closedir(dir); + return ret; +} + +#define HUGEPAGES_HOST_PREFIX "/sys/kernel/mm/hugepages/" +#define HUGEPAGES_NUMA_PREFIX "/sys/devices/system/node/" + +static int +nodeHugePageInfo(int node, + virTypedParameterPtr params, + int *nparams) +{ + int ret = -1; + char *path = NULL; + + if (virAsprintf(&path, HUGEPAGES_NUMA_PREFIX "node%d/hugepages/", node) < 0) + goto cleanup; + + if (nodeHugePageSupported(path, params, nparams) < 0) + goto cleanup; + + ret = 0; + cleanup: + VIR_FREE(path); + return ret; +} + +int nodeHugeTLB(int type, + virTypedParameterPtr params, + int *nparams, + unsigned int flags) +{ + int ret = -1; + + virCheckFlags(0, ret); + + if (type == -1) + return nodeHugePageSupported(HUGEPAGES_HOST_PREFIX, params, nparams); + + return nodeHugePageInfo(type, params, nparams); +} diff --git a/src/nodeinfo.h b/src/nodeinfo.h index c81fcbb..625de39 100644 --- a/src/nodeinfo.h +++ b/src/nodeinfo.h @@ -53,6 +53,11 @@ int nodeSetMemoryParameters(virTypedParameterPtr params, int nparams, unsigned int flags); +int nodeHugeTLB(int type, + virTypedParameterPtr params, + int *nparams, + unsigned int flags); + int nodeGetCPUMap(unsigned char **cpumap, unsigned int *online, unsigned int flags); diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 9becc0a..aa7deb3 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -16690,6 +16690,19 @@ qemuDomainSetTime(virDomainPtr dom, return ret; } +static int +qemuNodeHugeTLB(virConnectPtr conn, + int type, + virTypedParameterPtr params, + int *nparams, + unsigned int flags) +{ + if (virNodeHugeTlbEnsureACL(conn) < 0) + return -1; + + return nodeHugeTLB(type, params, nparams, flags); +} + static int qemuDomainFSFreeze(virDomainPtr dom, @@ -16975,6 +16988,7 @@ static virDriver qemuDriver = { .domainFSThaw = qemuDomainFSThaw, /* 1.2.5 */ .domainGetTime = qemuDomainGetTime, /* 1.2.5 */ .domainSetTime = qemuDomainSetTime, /* 1.2.5 */ + .nodeHugeTLB = qemuNodeHugeTLB, /* 1.2.6 */ }; -- 1.9.3
These functions will handle PCIe devices and their link capabilities to query some info about it. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 3 ++ src/util/virpci.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++- src/util/virpci.h | 8 +++++ 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 1d97c89..649da16 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1690,6 +1690,8 @@ virPCIDeviceFree; virPCIDeviceGetDriverPathAndName; virPCIDeviceGetIOMMUGroupDev; virPCIDeviceGetIOMMUGroupList; +virPCIDeviceGetLinkCap; +virPCIDeviceGetLinkSta; virPCIDeviceGetManaged; virPCIDeviceGetName; virPCIDeviceGetRemoveSlot; @@ -1698,6 +1700,7 @@ virPCIDeviceGetStubDriver; virPCIDeviceGetUnbindFromStub; virPCIDeviceGetUsedBy; virPCIDeviceIsAssignable; +virPCIDeviceIsPCIExpress; virPCIDeviceListAdd; virPCIDeviceListAddCopy; virPCIDeviceListCount; diff --git a/src/util/virpci.c b/src/util/virpci.c index e0f2344..8ad28b8 100644 --- a/src/util/virpci.c +++ b/src/util/virpci.c @@ -130,7 +130,13 @@ struct _virPCIDeviceList { /* PCIe20 7.8.3 Device Capabilities Register (Offset 04h) */ #define PCI_EXP_DEVCAP 0x4 /* Device capabilities */ -#define PCI_EXP_DEVCAP_FLR (1<<28) /* Function Level Reset */ +#define PCI_EXP_LNKCAP 0xc /* Link Capabilities */ +#define PCI_EXP_LNKSTA 0x12 /* Link Status */ +#define PCI_EXP_DEVCAP_FLR (1<<28) /* Function Level Reset */ +#define PCI_EXP_LNKCAP_SPEED 0x0000f /* Maximum Link Speed */ +#define PCI_EXP_LNKCAP_WIDTH 0x003f0 /* Maximum Link Width */ +#define PCI_EXP_LNKSTA_SPEED 0x000f /* Negotiated Link Speed */ +#define PCI_EXP_LNKSTA_WIDTH 0x03f0 /* Negotiated Link Width */ /* Header type 1 BR12 3.2 PCI-to-PCI Bridge Configuration Space Header Format */ #define PCI_PRIMARY_BUS 0x18 /* BR12 3.2.5.2 Primary bus number */ @@ -2750,3 +2756,76 @@ virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path ATTRIBUTE_UNUSED, return -1; } #endif /* __linux__ */ + +int +virPCIDeviceIsPCIExpress(virPCIDevicePtr dev) +{ + int fd; + int ret = -1; + + if ((fd = virPCIDeviceConfigOpen(dev, true)) < 0) + return ret; + + if (virPCIDeviceInit(dev, fd) < 0) + goto cleanup; + + ret = dev->pcie_cap_pos != 0; + + cleanup: + virPCIDeviceConfigClose(dev, fd); + return ret; +} + +int +virPCIDeviceGetLinkCap(virPCIDevicePtr dev, + int *port, + unsigned int *speed, + unsigned int *width) +{ + uint32_t t; + int fd; + int ret = -1; + + if ((fd = virPCIDeviceConfigOpen(dev, true) < 0)) + return ret; + + if (virPCIDeviceInit(dev, fd) < 0) + goto cleanup; + + t = virPCIDeviceRead32(dev, fd, dev->pcie_cap_pos + PCI_EXP_LNKCAP); + + *port = t >> 0x18; + *speed = t & PCI_EXP_LNKCAP_SPEED; + *width = (t & PCI_EXP_LNKCAP_WIDTH) >> 4; + ret = 0; + + cleanup: + virPCIDeviceConfigClose(dev, fd); + return ret; +} + +int +virPCIDeviceGetLinkSta(virPCIDevicePtr dev, + unsigned int *speed, + unsigned int *width) +{ + uint32_t t; + int fd; + int ret = -1; + + if ((fd = virPCIDeviceConfigOpen(dev, true) < 0)) + return ret; + + if (virPCIDeviceInit(dev, fd) < 0) + goto cleanup; + + t = virPCIDeviceRead32(dev, fd, dev->pcie_cap_pos + PCI_EXP_LNKSTA); + + *speed = t & PCI_EXP_LNKSTA_SPEED; + *width = (t & PCI_EXP_LNKSTA_WIDTH) >> 4; + ret = 0; + + cleanup: + virPCIDeviceConfigClose(dev, fd); + return ret; +} diff --git a/src/util/virpci.h b/src/util/virpci.h index 20ffe54..0a608c2 100644 --- a/src/util/virpci.h +++ b/src/util/virpci.h @@ -176,4 +176,12 @@ int virPCIDeviceGetDriverPathAndName(virPCIDevicePtr dev, char **path, char **name); +int virPCIDeviceIsPCIExpress(virPCIDevicePtr dev); +int virPCIDeviceGetLinkCap(virPCIDevicePtr dev, + int *port, + unsigned int *speed, + unsigned int *width); +int virPCIDeviceGetLinkSta(virPCIDevicePtr dev, + unsigned int *speed, + unsigned int *width); #endif /* __VIR_PCI_H__ */ -- 1.9.3
This new element is there to represent PCI-Express capabilities of a PCI devices, like link speed, number of lanes, etc. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- docs/formatnode.html.in | 19 ++++ docs/schemas/nodedev.rng | 26 +++++ src/conf/node_device_conf.c | 123 ++++++++++++++++++++- src/conf/node_device_conf.h | 31 +++++- src/node_device/node_device_udev.c | 31 ++++++ .../pci_8086_4238_pcie_wireless.xml | 26 +++++ tests/nodedevxml2xmltest.c | 1 + 7 files changed, 254 insertions(+), 3 deletions(-) create mode 100644 tests/nodedevschemadata/pci_8086_4238_pcie_wireless.xml diff --git a/docs/formatnode.html.in b/docs/formatnode.html.in index b424c96..46ec2bc 100644 --- a/docs/formatnode.html.in +++ b/docs/formatnode.html.in @@ -110,6 +110,21 @@ have a list of <code>address</code> subelements, one for each VF on this PF. </dd> + <dt><code>pci-express</code></dt> + <dd> + This optional element contains information on PCI Express part of + the device. For example, it can contain a child element + <code>link</code> which addresses the PCI Express device's link. + While a device has it's own capabilities + (<code>validity='cap'</code>), the actual run time capabilities + are negotiated on the device initialization + (<code>validity='sta'</code>). The <code>link</code> element then + contains three attributes: <code>port</code> which says in which + port is the device plugged in, <code>speed</code> (in + GigaTransfers per second) and <code>width</code> for the number + of lanes used. Since the port can't be negotiated, it's not + exposed in <code>./pci-express/link/[@validity='sta']</code>. + </dd> </dl> </dd> <dt><code>usb_device</code></dt> @@ -291,6 +306,10 @@ <address domain='0x0000' bus='0x02' slot='0x00' function='0x0'/> <address domain='0x0000' bus='0x02' slot='0x00' function='0x1'/> </iommuGroup> + <pci-express> + <link validity='cap' port='1' speed='2.5' width='1'/> + <link validity='sta' speed='2.5' width='1'/> + </pci-express> </capability> </device> </pre> diff --git a/docs/schemas/nodedev.rng b/docs/schemas/nodedev.rng index 81ab4d4..79e8fd2 100644 --- a/docs/schemas/nodedev.rng +++ b/docs/schemas/nodedev.rng @@ -158,6 +158,32 @@ </element> </optional> + <optional> + <element name='pci-express'> + <zeroOrMore> + <element name='link'> + <attribute name='validity'> + <choice> + <value>cap</value> + <value>sta</value> + </choice> + </attribute> + <optional> + <attribute name='port'> + <ref name='unsignedInt'/> + </attribute> + </optional> + <optional> + <attribute name='speed'> + </attribute> + </optional> + <attribute name='width'> + <ref name='unsignedInt'/> + </attribute> + </element> + </zeroOrMore> + </element> + </optional> </define> <define name='capusbdev'> diff --git a/src/conf/node_device_conf.c b/src/conf/node_device_conf.c index e65b5e4..70634cc 100644 --- a/src/conf/node_device_conf.c +++ b/src/conf/node_device_conf.c @@ -58,6 +58,9 @@ VIR_ENUM_IMPL(virNodeDevNetCap, VIR_NODE_DEV_CAP_NET_LAST, "80203", "80211") +VIR_ENUM_IMPL(virPCIELinkSpeed, VIR_PCIE_LINK_SPEED_LAST, + "", "2.5", "5", "8") + static int virNodeDevCapsDefParseString(const char *xpath, xmlXPathContextPtr ctxt, @@ -218,6 +221,35 @@ void virNodeDeviceObjRemove(virNodeDeviceObjListPtr devs, } } +static void +virPCIELinkFormat(virBufferPtr buf, + virPCIELinkPtr lnk, + const char *attrib) +{ + virBufferAsprintf(buf, "<link validity='%s'", attrib); + if (lnk->port >= 0) + virBufferAsprintf(buf, " port='%d'", lnk->port); + if (lnk->speed) + virBufferAsprintf(buf, " speed='%s'", + virPCIELinkSpeedTypeToString(lnk->speed)); + virBufferAsprintf(buf, " width='%d'", lnk->width); + virBufferAddLit(buf, "/>\n"); +} + +static void +virPCIEDeviceInfoFormat(virBufferPtr buf, + virPCIEDeviceInfoPtr info) +{ + virBufferAddLit(buf, "<pci-express>\n"); + virBufferAdjustIndent(buf, 2); + + virPCIELinkFormat(buf, &info->link_cap, "cap"); + virPCIELinkFormat(buf, &info->link_sta, "sta"); + + virBufferAdjustIndent(buf, -2); + virBufferAddLit(buf, "</pci-express>\n"); +} + char *virNodeDeviceDefFormat(const virNodeDeviceDef *def) { virBuffer buf = VIR_BUFFER_INITIALIZER; @@ -346,6 +378,8 @@ char *virNodeDeviceDefFormat(const virNodeDeviceDef *def) virBufferAdjustIndent(&buf, -2); virBufferAddLit(&buf, "</iommuGroup>\n"); } + if (data->pci_dev.flags & VIR_NODE_DEV_CAP_FLAG_PCIE) + virPCIEDeviceInfoFormat(&buf, data->pci_dev.pci_express); break; case VIR_NODE_DEV_CAP_USB_DEV: virBufferAsprintf(&buf, "<bus>%d</bus>\n", data->usb_dev.bus); @@ -1043,6 +1077,87 @@ virNodeDevCapPCIDevIommuGroupParseXML(xmlXPathContextPtr ctxt, return ret; } +static int +virNodeDevCapPCIDevPCIExpressParseLinkXML(xmlXPathContextPtr ctxt, + xmlNodePtr linkNode, + virPCIELinkPtr lnk) +{ + xmlNodePtr origNode = ctxt->node; + int ret = -1, speed; + char *speedStr = NULL, *portStr = NULL; + + ctxt->node = linkNode; + + if (virXPathUInt("number(./@width)", ctxt, &lnk->width) < 0) { + virReportError(VIR_ERR_XML_DETAIL, "%s", + _("mandatory attribute 'width' is missing or malformed")); + goto cleanup; + } + + if ((speedStr = virXPathString("string(./@speed)", ctxt))) { + if ((speed = virPCIELinkSpeedTypeFromString(speedStr)) < 0) { + virReportError(VIR_ERR_XML_DETAIL, + _("malformed 'speed' attribute: %s"), + speedStr); + goto cleanup; + } + lnk->speed = speed; + } + + if ((portStr = virXPathString("string(./@port)", ctxt))) { + if (virStrToLong_i(portStr, NULL, 10, &lnk->port) < 0) { + virReportError(VIR_ERR_XML_DETAIL, + _("malformed 'port' attribute: %s"), + portStr); + goto cleanup; + } + } else { + lnk->port = -1; + } + + ret = 0; + cleanup: + VIR_FREE(portStr); + VIR_FREE(speedStr); + ctxt->node = origNode; + return ret; +} + +static int +virNodeDevCapPCIDevPCIExpressParseXML(xmlXPathContextPtr ctxt, + xmlNodePtr pciExpressNode, + union _virNodeDevCapData *data) +{ + xmlNodePtr lnk, origNode = ctxt->node; + int ret = -1; + virPCIEDeviceInfoPtr pci_express = NULL; + + ctxt->node = pciExpressNode; + + if (VIR_ALLOC(pci_express) < 0) + goto cleanup; + + if ((lnk = virXPathNode("./link[@validity='cap']", ctxt)) && + virNodeDevCapPCIDevPCIExpressParseLinkXML(ctxt, lnk, + &pci_express->link_cap) < 0) + goto cleanup; + + if ((lnk = virXPathNode("./link[@validity='sta']", ctxt)) && + virNodeDevCapPCIDevPCIExpressParseLinkXML(ctxt, lnk, + &pci_express->link_sta) < 0) + goto cleanup; + + + data->pci_dev.pci_express = pci_express; + pci_express = NULL; + data->pci_dev.flags |= VIR_NODE_DEV_CAP_FLAG_PCIE; + ret = 0; + cleanup: + VIR_FREE(pci_express); + ctxt->node = origNode; + return ret; +} + static int virNodeDevCapPCIDevParseXML(xmlXPathContextPtr ctxt, @@ -1050,7 +1165,7 @@ virNodeDevCapPCIDevParseXML(xmlXPathContextPtr ctxt, xmlNodePtr node, union _virNodeDevCapData *data) { - xmlNodePtr orignode, iommuGroupNode; + xmlNodePtr orignode, iommuGroupNode, pciExpress; int ret = -1; orignode = ctxt->node; @@ -1101,6 +1216,12 @@ virNodeDevCapPCIDevParseXML(xmlXPathContextPtr ctxt, goto out; } } + + if ((pciExpress = virXPathNode("./pci-express[1]", ctxt))) { + if (virNodeDevCapPCIDevPCIExpressParseXML(ctxt, pciExpress, data) < 0) + goto out; + } + ret = 0; out: ctxt->node = orignode; diff --git a/src/conf/node_device_conf.h b/src/conf/node_device_conf.h index 50e6805..563bf6a 100644 --- a/src/conf/node_device_conf.h +++ b/src/conf/node_device_conf.h @@ -75,10 +75,36 @@ typedef enum { } virNodeDevSCSIHostCapFlags; typedef enum { - VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION = (1 << 0), - VIR_NODE_DEV_CAP_FLAG_PCI_VIRTUAL_FUNCTION = (1 << 1), + VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION = (1 << 0), + VIR_NODE_DEV_CAP_FLAG_PCI_VIRTUAL_FUNCTION = (1 << 1), + VIR_NODE_DEV_CAP_FLAG_PCIE = (1 << 2), } virNodeDevPCICapFlags; +typedef enum { + VIR_PCIE_LINK_SPEED_NA = 0, + VIR_PCIE_LINK_SPEED_25, + VIR_PCIE_LINK_SPEED_5, + VIR_PCIE_LINK_SPEED_8, + VIR_PCIE_LINK_SPEED_LAST +} virPCIELinkSpeed; + +VIR_ENUM_DECL(virPCIELinkSpeed) + +typedef struct _virPCIELink virPCIELink; +typedef virPCIELink *virPCIELinkPtr; +struct _virPCIELink { + int port; + virPCIELinkSpeed speed; + unsigned int width; +}; + +typedef struct _virPCIEDeviceInfo virPCIEDeviceInfo; +typedef virPCIEDeviceInfo *virPCIEDeviceInfoPtr; +struct _virPCIEDeviceInfo { + virPCIELink link_cap; /* PCIe device link capabilities */ + virPCIELink link_sta; /* Actually negotiated capabilities */ +}; + typedef struct _virNodeDevCapsDef virNodeDevCapsDef; typedef virNodeDevCapsDef *virNodeDevCapsDefPtr; struct _virNodeDevCapsDef { @@ -115,6 +141,7 @@ struct _virNodeDevCapsDef { virPCIDeviceAddressPtr *iommuGroupDevices; size_t nIommuGroupDevices; unsigned int iommuGroupNumber; + virPCIEDeviceInfoPtr pci_express; } pci_dev; struct { unsigned int bus; diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index 9a951d9..9ed869b 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -425,6 +425,8 @@ static int udevProcessPCI(struct udev_device *device, const char *syspath = NULL; union _virNodeDevCapData *data = &def->caps->data; virPCIDeviceAddress addr; + virPCIEDeviceInfoPtr pci_express = NULL; + virPCIDevicePtr pciDev = NULL; int tmpGroup, ret = -1; char *p; int rc; @@ -522,9 +524,38 @@ static int udevProcessPCI(struct udev_device *device, data->pci_dev.iommuGroupNumber = tmpGroup; } + if (!(pciDev = virPCIDeviceNew(data->pci_dev.domain, + data->pci_dev.bus, + data->pci_dev.slot, + data->pci_dev.function))) + goto out; + + if (virPCIDeviceIsPCIExpress(pciDev) > 0) { + if (VIR_ALLOC(pci_express) < 0) + goto out; + + if (virPCIDeviceGetLinkCap(pciDev, + &pci_express->link_cap.port, + &pci_express->link_cap.speed, + &pci_express->link_cap.width) < 0) + goto out; + + if (virPCIDeviceGetLinkSta(pciDev, + &pci_express->link_sta.speed, + &pci_express->link_sta.width) < 0) + goto out; + + pci_express->link_sta.port = -1; /* PCIe can't negotiate port. Yet :) */ + data->pci_dev.flags |= VIR_NODE_DEV_CAP_FLAG_PCIE; + data->pci_dev.pci_express = pci_express; + pci_express = NULL; + } + ret = 0; out: + virPCIDeviceFree(pciDev); + VIR_FREE(pci_express); return ret; } diff --git a/tests/nodedevschemadata/pci_8086_4238_pcie_wireless.xml b/tests/nodedevschemadata/pci_8086_4238_pcie_wireless.xml new file mode 100644 index 0000000..18172e9 --- /dev/null +++ b/tests/nodedevschemadata/pci_8086_4238_pcie_wireless.xml @@ -0,0 +1,26 @@ +<device> + <name>pci_0000_03_00_0</name> + <parent>pci_0000_00_1c_1</parent> + <capability type='pci'> + <domain>0</domain> + <bus>3</bus> + <slot>0</slot> + <function>0</function> + <product id='0x4238'>Centrino Ultimate-N 6300</product> + <vendor id='0x8086'>Intel Corporation</vendor> + <iommuGroup number='8'> + <address domain='0x0000' bus='0x00' slot='0x1c' function='0x0'/> + <address domain='0x0000' bus='0x00' slot='0x1c' function='0x1'/> + <address domain='0x0000' bus='0x00' slot='0x1c' function='0x3'/> + <address domain='0x0000' bus='0x00' slot='0x1c' function='0x4'/> + <address domain='0x0000' bus='0x03' slot='0x00' function='0x0'/> + <address domain='0x0000' bus='0x0d' slot='0x00' function='0x0'/> + <address domain='0x0000' bus='0x0d' slot='0x00' function='0x1'/> + <address domain='0x0000' bus='0x0d' slot='0x00' function='0x3'/> + </iommuGroup> + <pci-express> + <link validity='cap' port='1' speed='2.5' width='1'/> + <link validity='sta' speed='2.5' width='1'/> + </pci-express> + </capability> +</device> diff --git a/tests/nodedevxml2xmltest.c b/tests/nodedevxml2xmltest.c index 9390bf5..6d57318 100644 --- a/tests/nodedevxml2xmltest.c +++ b/tests/nodedevxml2xmltest.c @@ -88,6 +88,7 @@ mymain(void) DO_TEST("storage_serial_SATA_HTS721010G9SA00_MPCZ12Y0GNGWSE"); DO_TEST("usb_device_1d6b_1_0000_00_1d_0_if0"); DO_TEST("usb_device_1d6b_1_0000_00_1d_0"); + DO_TEST("pci_8086_4238_pcie_wireless"); return ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE; } -- 1.9.3
Currently it is not possible to determine the speed of an interface and whether a link is actually detected from the API. Orchestrating platforms want to be able to determine when the link has failed and where multiple speeds may be available which one the interface is actually connected at. This commit introduces an extension to our interface XML (without implementation to interface driver backends): <interface type='ethernet' name='eth0'> <start mode='none'/> <mac address='aa:bb:cc:dd:ee:ff'/> <link speed='1000' state='up'/> <mtu size='1492'/> ... </interface> Where @speed is negotiated link speed in Mbits per second, and state is the current NIC state (can be one of the following: "unknown", "notpresent", "down", "lowerlayerdown","testing", "dormant", "up"). Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- docs/schemas/interface.rng | 27 +++++++++++++++++ src/conf/interface_conf.c | 39 ++++++++++++++++++++++++- src/conf/interface_conf.h | 15 ++++++++++ src/libvirt_private.syms | 2 ++ tests/interfaceschemadata/bridge-no-address.xml | 1 + tests/interfaceschemadata/bridge.xml | 1 + tests/interfaceschemadata/ethernet-dhcp.xml | 1 + 7 files changed, 85 insertions(+), 1 deletion(-) diff --git a/docs/schemas/interface.rng b/docs/schemas/interface.rng index 3984b63..d980ef5 100644 --- a/docs/schemas/interface.rng +++ b/docs/schemas/interface.rng @@ -41,6 +41,7 @@ <attribute name="address"><ref name="macAddr"/></attribute> </element> </optional> + <ref name="link-speed-state"/> <!-- FIXME: Allow (some) ethtool options --> </define> @@ -271,6 +272,32 @@ </element> </define> + <define name="link-speed-state"> + <optional> + <element name="link"> + <optional> + <attribute name="speed"> + <ref name="unsignedInt"/> + </attribute> + </optional> + <optional> + <attribute name="state"> + <choice> + <value>unknown</value> + <value>notpresent</value> + <value>down</value> + <value>lowerlayerdown</value> + <value>testing</value> + <value>dormant</value> + <value>up</value> + </choice> + </attribute> + </optional> + </element> + </optional> + </define> + + <!-- Assignment of addresses to an interface, allowing for different protocols diff --git a/src/conf/interface_conf.c b/src/conf/interface_conf.c index 1f67446..227b9d9 100644 --- a/src/conf/interface_conf.c +++ b/src/conf/interface_conf.c @@ -31,6 +31,7 @@ #include "virxml.h" #include "viruuid.h" #include "virbuffer.h" +#include "virstring.h" #define VIR_FROM_THIS VIR_FROM_INTERFACE @@ -38,6 +39,13 @@ VIR_ENUM_IMPL(virInterface, VIR_INTERFACE_TYPE_LAST, "ethernet", "bridge", "bond", "vlan") +VIR_ENUM_IMPL(virInterfaceState, + VIR_INTERFACE_STATE_LAST, + "" /* value of zero means no state */, + "unknown", "notpresent", + "down", "lowerlayerdown", + "testing", "dormant", "up") + static virInterfaceDefPtr virInterfaceDefParseXML(xmlXPathContextPtr ctxt, int parentIfType); static int @@ -666,7 +674,7 @@ static virInterfaceDefPtr virInterfaceDefParseXML(xmlXPathContextPtr ctxt, int parentIfType) { virInterfaceDefPtr def; - int type; + int type, state; char *tmp; xmlNodePtr cur = ctxt->node; @@ -711,6 +719,26 @@ virInterfaceDefParseXML(xmlXPathContextPtr ctxt, int parentIfType) tmp = virXPathString("string(./mac/@address)", ctxt); if (tmp != NULL) def->mac = tmp; + + tmp = virXPathString("string(./link/@state)", ctxt); + if (tmp) { + if ((state = virInterfaceStateTypeFromString(tmp)) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("unknown link state: %s"), + tmp); + goto error; + } + def->state = state; + } + + tmp = virXPathString("string(./link/@speed)", ctxt); + if (tmp && virStrToLong_ul(tmp, NULL, 10, &def->speed) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Unable to parse link speed: %s"), + tmp); + goto error; + } + if (parentIfType == VIR_INTERFACE_TYPE_LAST) { /* only recognize these in toplevel bond interfaces */ if (virInterfaceDefParseStartMode(def, ctxt) < 0) @@ -1088,6 +1116,15 @@ virInterfaceDefDevFormat(virBufferPtr buf, const virInterfaceDef *def) virInterfaceStartmodeDefFormat(buf, def->startmode); if (def->mac != NULL) virBufferAsprintf(buf, "<mac address='%s'/>\n", def->mac); + if (def->state || def->speed) { + virBufferAddLit(buf, "<link"); + if (def->speed) + virBufferAsprintf(buf, " speed='%lu'", def->speed); + if (def->state) + virBufferAsprintf(buf, " state='%s'", + virInterfaceStateTypeToString(def->state)); + virBufferAddLit(buf, "/>\n"); + } if (def->mtu != 0) virBufferAsprintf(buf, "<mtu size='%d'/>\n", def->mtu); virInterfaceProtocolDefFormat(buf, def); diff --git a/src/conf/interface_conf.h b/src/conf/interface_conf.h index b3c92b2..c8b3f6c 100644 --- a/src/conf/interface_conf.h +++ b/src/conf/interface_conf.h @@ -84,6 +84,19 @@ typedef enum { VIR_INTERFACE_BOND_ARP_ALL, /* validate all */ } virInterfaceBondArpValid; +typedef enum { + VIR_INTERFACE_STATE_UNKNOWN = 1, + VIR_INTERFACE_STATE_NOT_PRESENT, + VIR_INTERFACE_STATE_DOWN, + VIR_INTERFACE_STATE_LOWER_LAYER_DOWN, + VIR_INTERFACE_STATE_TESTING, + VIR_INTERFACE_STATE_DORMANT, + VIR_INTERFACE_STATE_UP, + VIR_INTERFACE_STATE_LAST +} virInterfaceState; + +VIR_ENUM_DECL(virInterfaceState) + struct _virInterfaceDef; /* forward declaration required for bridge/bond */ typedef struct _virInterfaceBridgeDef virInterfaceBridgeDef; @@ -146,6 +159,8 @@ struct _virInterfaceDef { char *name; /* interface name */ unsigned int mtu; /* maximum transmit size in byte */ char *mac; /* MAC address */ + virInterfaceState state; /* link state */ + unsigned long speed; /* link speed in Mb per second */ virInterfaceStartMode startmode; /* how it is started */ diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 649da16..5e908d6 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -495,6 +495,8 @@ virInterfaceObjListFree; virInterfaceObjLock; virInterfaceObjUnlock; virInterfaceRemove; +virInterfaceStateTypeFromString; +virInterfaceStateTypeToString; # conf/netdev_bandwidth_conf.h diff --git a/tests/interfaceschemadata/bridge-no-address.xml b/tests/interfaceschemadata/bridge-no-address.xml index 7757534..68b8c94 100644 --- a/tests/interfaceschemadata/bridge-no-address.xml +++ b/tests/interfaceschemadata/bridge-no-address.xml @@ -4,6 +4,7 @@ <bridge stp='off'> <interface type='ethernet' name='eth0'> <mac address='ab:bb:cc:dd:ee:ff'/> + <link speed='1000' state='up'/> </interface> <interface type='ethernet' name='eth1'> </interface> diff --git a/tests/interfaceschemadata/bridge.xml b/tests/interfaceschemadata/bridge.xml index 2535edf..c865116 100644 --- a/tests/interfaceschemadata/bridge.xml +++ b/tests/interfaceschemadata/bridge.xml @@ -7,6 +7,7 @@ <bridge stp='off' delay='0.01'> <interface type='ethernet' name='eth0'> <mac address='ab:bb:cc:dd:ee:ff'/> + <link speed='10'/> </interface> <interface type='ethernet' name='eth1'> </interface> diff --git a/tests/interfaceschemadata/ethernet-dhcp.xml b/tests/interfaceschemadata/ethernet-dhcp.xml index fe969df..c124372 100644 --- a/tests/interfaceschemadata/ethernet-dhcp.xml +++ b/tests/interfaceschemadata/ethernet-dhcp.xml @@ -1,6 +1,7 @@ <interface type='ethernet' name='eth0'> <start mode='none'/> <mac address='aa:bb:cc:dd:ee:ff'/> + <link state='down'/> <mtu size='1492'/> <protocol family='ipv4'> <dhcp peerdns='no'/> -- 1.9.3
On Thu, May 29, 2014 at 10:32:40AM +0200, Michal Privoznik wrote:
Currently it is not possible to determine the speed of an interface and whether a link is actually detected from the API. Orchestrating platforms want to be able to determine when the link has failed and where multiple speeds may be available which one the interface is actually connected at. This commit introduces an extension to our interface XML (without implementation to interface driver backends):
<interface type='ethernet' name='eth0'> <start mode='none'/> <mac address='aa:bb:cc:dd:ee:ff'/> <link speed='1000' state='up'/> <mtu size='1492'/> ... </interface>
Where @speed is negotiated link speed in Mbits per second, and state is the current NIC state (can be one of the following: "unknown", "notpresent", "down", "lowerlayerdown","testing", "dormant", "up").
This is fine for the the <interface> objects, but it is limited in usefulness for SRIOV use cases. The <interface> objects only exist for interfaces which are configured for the host. With SRIOV passthrough some of the interfaces we're interested in are not going to be configured - they're just bare devices waiting to be given to a guest. To deal with that, we need report all this info on the node device APIs which let us list all NICs, regardless of whether they are configured on the host or not. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
On 30.05.2014 10:56, Daniel P. Berrange wrote:
On Thu, May 29, 2014 at 10:32:40AM +0200, Michal Privoznik wrote:
Currently it is not possible to determine the speed of an interface and whether a link is actually detected from the API. Orchestrating platforms want to be able to determine when the link has failed and where multiple speeds may be available which one the interface is actually connected at. This commit introduces an extension to our interface XML (without implementation to interface driver backends):
<interface type='ethernet' name='eth0'> <start mode='none'/> <mac address='aa:bb:cc:dd:ee:ff'/> <link speed='1000' state='up'/> <mtu size='1492'/> ... </interface>
Where @speed is negotiated link speed in Mbits per second, and state is the current NIC state (can be one of the following: "unknown", "notpresent", "down", "lowerlayerdown","testing", "dormant", "up").
This is fine for the the <interface> objects, but it is limited in usefulness for SRIOV use cases. The <interface> objects only exist for interfaces which are configured for the host. With SRIOV passthrough some of the interfaces we're interested in are not going to be configured - they're just bare devices waiting to be given to a guest.
I hear what you're saying, but unless a PCI device is given interface name I am afraid we can't do anything. For instance, if you have a NIC but detach it from the driver (echo ${PCI_ADDR} > /sys/bus/pci/drivers/<driver>/unbind), kernel still sees the PCI device (it's shown in lspci output for instance), but it's not configured anymore - kernel doesn't know device link state, hence it's not aware if NIC's link speed, etc. So tools like ethtool, ip, ifconfing won't show the device.
To deal with that, we need report all this info on the node device APIs which let us list all NICs, regardless of whether they are configured on the host or not.
Due the reasons written above I don't see much benefit in moving this to nodedev. Michal
On Fri, May 30, 2014 at 01:41:08PM +0200, Michal Privoznik wrote:
On 30.05.2014 10:56, Daniel P. Berrange wrote:
On Thu, May 29, 2014 at 10:32:40AM +0200, Michal Privoznik wrote:
Currently it is not possible to determine the speed of an interface and whether a link is actually detected from the API. Orchestrating platforms want to be able to determine when the link has failed and where multiple speeds may be available which one the interface is actually connected at. This commit introduces an extension to our interface XML (without implementation to interface driver backends):
<interface type='ethernet' name='eth0'> <start mode='none'/> <mac address='aa:bb:cc:dd:ee:ff'/> <link speed='1000' state='up'/> <mtu size='1492'/> ... </interface>
Where @speed is negotiated link speed in Mbits per second, and state is the current NIC state (can be one of the following: "unknown", "notpresent", "down", "lowerlayerdown","testing", "dormant", "up").
This is fine for the the <interface> objects, but it is limited in usefulness for SRIOV use cases. The <interface> objects only exist for interfaces which are configured for the host. With SRIOV passthrough some of the interfaces we're interested in are not going to be configured - they're just bare devices waiting to be given to a guest.
I hear what you're saying, but unless a PCI device is given interface name I am afraid we can't do anything. For instance, if you have a NIC but detach it from the driver (echo ${PCI_ADDR} > /sys/bus/pci/drivers/<driver>/unbind), kernel still sees the PCI device (it's shown in lspci output for instance), but it's not configured anymore - kernel doesn't know device link state, hence it's not aware if NIC's link speed, etc. So tools like ethtool, ip, ifconfing won't show the device.
IIUC, We have three classes of device 1 Devices not bound - no NIC visible in the host OS 2 Devices bound but not configured. NIC visible in host OS, but no /etc/sysconfig/networking/ifcfg-XXX file 3 Devices bound and configured. NIC visible in host OS, and has a /etc/sysconfig/networking/ifcfg-XXX file The <interface> configs only let you deal with NIC devices in class 3. The <nodedev> XML / APIs let you see NIC devices in class 2 + 3.
To deal with that, we need report all this info on the node device APIs which let us list all NICs, regardless of whether they are configured on the host or not.
Due the reasons written above I don't see much benefit in moving this to nodedev.
NB, I don't mean we should move it. I mean we should report the data in both places - nodedev and interface XML Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
On 30.05.2014 14:35, Daniel P. Berrange wrote:
On Fri, May 30, 2014 at 01:41:08PM +0200, Michal Privoznik wrote:
On 30.05.2014 10:56, Daniel P. Berrange wrote:
On Thu, May 29, 2014 at 10:32:40AM +0200, Michal Privoznik wrote:
Currently it is not possible to determine the speed of an interface and whether a link is actually detected from the API. Orchestrating platforms want to be able to determine when the link has failed and where multiple speeds may be available which one the interface is actually connected at. This commit introduces an extension to our interface XML (without implementation to interface driver backends):
<interface type='ethernet' name='eth0'> <start mode='none'/> <mac address='aa:bb:cc:dd:ee:ff'/> <link speed='1000' state='up'/> <mtu size='1492'/> ... </interface>
Where @speed is negotiated link speed in Mbits per second, and state is the current NIC state (can be one of the following: "unknown", "notpresent", "down", "lowerlayerdown","testing", "dormant", "up").
This is fine for the the <interface> objects, but it is limited in usefulness for SRIOV use cases. The <interface> objects only exist for interfaces which are configured for the host. With SRIOV passthrough some of the interfaces we're interested in are not going to be configured - they're just bare devices waiting to be given to a guest.
I hear what you're saying, but unless a PCI device is given interface name I am afraid we can't do anything. For instance, if you have a NIC but detach it from the driver (echo ${PCI_ADDR} > /sys/bus/pci/drivers/<driver>/unbind), kernel still sees the PCI device (it's shown in lspci output for instance), but it's not configured anymore - kernel doesn't know device link state, hence it's not aware if NIC's link speed, etc. So tools like ethtool, ip, ifconfing won't show the device.
IIUC, We have three classes of device
1 Devices not bound - no NIC visible in the host OS
2 Devices bound but not configured. NIC visible in host OS, but no /etc/sysconfig/networking/ifcfg-XXX file
3 Devices bound and configured. NIC visible in host OS, and has a /etc/sysconfig/networking/ifcfg-XXX file
The <interface> configs only let you deal with NIC devices in class 3.
The <nodedev> XML / APIs let you see NIC devices in class 2 + 3.
Right. The netcf world. Okay, makes sense now. In udev we have 2nd and 3rd classes merged into one. Michal
On 05/30/2014 04:42 PM, Michal Privoznik wrote:
On 30.05.2014 14:35, Daniel P. Berrange wrote:
On Fri, May 30, 2014 at 01:41:08PM +0200, Michal Privoznik wrote:
On 30.05.2014 10:56, Daniel P. Berrange wrote:
On Thu, May 29, 2014 at 10:32:40AM +0200, Michal Privoznik wrote:
Currently it is not possible to determine the speed of an interface and whether a link is actually detected from the API. Orchestrating platforms want to be able to determine when the link has failed and where multiple speeds may be available which one the interface is actually connected at. This commit introduces an extension to our interface XML (without implementation to interface driver backends):
<interface type='ethernet' name='eth0'> <start mode='none'/> <mac address='aa:bb:cc:dd:ee:ff'/> <link speed='1000' state='up'/> <mtu size='1492'/> ... </interface>
Where @speed is negotiated link speed in Mbits per second, and state is the current NIC state (can be one of the following: "unknown", "notpresent", "down", "lowerlayerdown","testing", "dormant", "up").
This is fine for the the <interface> objects, but it is limited in usefulness for SRIOV use cases. The <interface> objects only exist for interfaces which are configured for the host. With SRIOV passthrough some of the interfaces we're interested in are not going to be configured - they're just bare devices waiting to be given to a guest.
I hear what you're saying, but unless a PCI device is given interface name I am afraid we can't do anything. For instance, if you have a NIC but detach it from the driver (echo ${PCI_ADDR} > /sys/bus/pci/drivers/<driver>/unbind), kernel still sees the PCI device (it's shown in lspci output for instance), but it's not configured anymore - kernel doesn't know device link state, hence it's not aware if NIC's link speed, etc. So tools like ethtool, ip, ifconfing won't show the device.
IIUC, We have three classes of device
1 Devices not bound - no NIC visible in the host OS
2 Devices bound but not configured. NIC visible in host OS, but no /etc/sysconfig/networking/ifcfg-XXX file
3 Devices bound and configured. NIC visible in host OS, and has a /etc/sysconfig/networking/ifcfg-XXX file
The <interface> configs only let you deal with NIC devices in class 3.
The <nodedev> XML / APIs let you see NIC devices in class 2 + 3.
Right. The netcf world. Okay, makes sense now. In udev we have 2nd and 3rd classes merged into one.
Unfortunately yes. The udev backend was added for distros that didn't support netcf, and doesn't exactly fit the original semantics of the virInterface API. In particular, virConnect*Interfaces() lists the class 2 devices, which was not intended to be the case. I guess that's our fault for adding the ability to report device status into the netcf API at all - if I recall correctly, it wasn't in the initial requirements of virInterface (which were only to provide a way to *configure* host interfaces and report on their configuration), but "somebody" (I forget who, I could have been one of the guilty parties) requested that it also provide interface status, and we thought "sounds useful, what harm could it possibly do?". Then once it became an essential part of virt-manager's guest network config (in order to get a list of host interfaces the guest could connect to), distros without netcf felt the pain of missing this feature that was present in other distros, and saw the udev backend as a way to sidestep the problems of making a netcf port. It has definitely been useful, but has kind of messed up the clarity of the virinterface APIs.
In previous commit the interface XML is prepared for exporting information on NIC link speed and state. This commit implement actual logic for getting such info and writing it into XML. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/interface/interface_backend_udev.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/interface/interface_backend_udev.c b/src/interface/interface_backend_udev.c index c5353ea..c8940ce 100644 --- a/src/interface/interface_backend_udev.c +++ b/src/interface/interface_backend_udev.c @@ -1036,6 +1036,8 @@ udevGetIfaceDef(struct udev *udev, const char *name) const char *mtu_str; char *vlan_parent_dev = NULL; const char *devtype; + const char *operstate; + const char *link_speed; /* Allocate our interface definition structure */ if (VIR_ALLOC(ifacedef) < 0) @@ -1059,6 +1061,23 @@ udevGetIfaceDef(struct udev *udev, const char *name) udev_device_get_sysattr_value(dev, "address")) < 0) goto error; + operstate = udev_device_get_sysattr_value(dev, "operstate"); + if ((ifacedef->state = virInterfaceStateTypeFromString(operstate)) <= 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to parse operstate: %s"), + operstate); + goto error; + } + + link_speed = udev_device_get_sysattr_value(dev, "speed"); + if (link_speed && + virStrToLong_ul(link_speed, NULL, 10, &ifacedef->speed) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to parse speed: %s"), + link_speed); + goto error; + } + /* MTU */ mtu_str = udev_device_get_sysattr_value(dev, "mtu"); if (virStrToLong_ui(mtu_str, NULL, 10, &mtu) < 0) { -- 1.9.3
While the previous commit was pretty straightforward, things are different with netcf as it doesn't exposed the bits we need yet. However, we can work around it by fetching the info we need from SYSFS. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/interface/interface_backend_netcf.c | 99 +++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/src/interface/interface_backend_netcf.c b/src/interface/interface_backend_netcf.c index 1b9ace5..40181ef 100644 --- a/src/interface/interface_backend_netcf.c +++ b/src/interface/interface_backend_netcf.c @@ -33,6 +33,7 @@ #include "virlog.h" #include "virstring.h" #include "viraccessapicheck.h" +#include "virfile.h" #define VIR_FROM_THIS VIR_FROM_INTERFACE @@ -240,6 +241,96 @@ static struct netcf_if *interfaceDriverGetNetcfIF(struct netcf *ncf, virInterfac return iface; } +/* Okay, the following two doesn't really belong here as they dump the info + * from SYSFS rather than netcf. But netcf is not yet exposing the info we + * need, so what. */ +#define SYSFS_PREFIX "/sys/class/net/" + +static int +interfaceGetState(const char *name, + virInterfaceState *state) +{ + int ret = -1; + char *path = NULL; + char *buf = NULL; + char *tmp; + + if (virAsprintf(&path, SYSFS_PREFIX "%s/operstate", name) < 0) + goto cleanup; + + if (virFileReadAll(path, 1024, &buf) < 0) { + virReportSystemError(errno, + _("unable to read: %s"), + path); + goto cleanup; + } + + if (!(tmp = strchr(buf, '\n'))) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to parse: %s"), + buf); + goto cleanup; + } + + *tmp = '\0'; + + /* We shouldn't allow 0 here, because + * virInterfaceState enum starts from 1. */ + if ((*state = virInterfaceStateTypeFromString(buf)) <= 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to parse: %s"), + buf); + goto cleanup; + } + + ret = 0; + cleanup: + VIR_FREE(buf); + VIR_FREE(path); + return ret; +} + +static int +interfaceGetSpeed(const char *name, + unsigned long *speed) +{ + int ret = -1; + char *path = NULL; + char *buf = NULL; + char *tmp; + + if (virAsprintf(&path, SYSFS_PREFIX "%s/speed", name) < 0) + goto cleanup; + + if (virFileReadAll(path, 1024, &buf) < 0) { + /* Some devices doesn't report speed, in which case we get EINVAL */ + if (errno == EINVAL) { + ret = 0; + goto cleanup; + } + virReportSystemError(errno, + _("unable to read: %s"), + path); + goto cleanup; + } + + if (virStrToLong_ul(buf, &tmp, 10, speed) < 0 || + *tmp != '\n') { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to parse: %s"), + buf); + goto cleanup; + } + + ret = 1; + cleanup: + VIR_FREE(buf); + VIR_FREE(path); + return ret; +} + +/* Ond of sysfs hacks */ + static int netcfInterfaceObjIsActive(struct netcf_if *iface, bool *active) @@ -840,6 +931,14 @@ static char *netcfInterfaceGetXMLDesc(virInterfacePtr ifinfo, if (virInterfaceGetXMLDescEnsureACL(ifinfo->conn, ifacedef) < 0) goto cleanup; + if (interfaceGetState(ifacedef->name, &ifacedef->state) < 0) + goto cleanup; + + /* querying inteface speed makes sense only sometimes */ + if (ifacedef->state == VIR_INTERFACE_STATE_UP && + interfaceGetSpeed(ifacedef->name, &ifacedef->speed) < 0) + goto cleanup; + ret = virInterfaceDefFormat(ifacedef); if (!ret) { /* error was already reported */ -- 1.9.3
On 05/29/2014 11:32 AM, Michal Privoznik wrote:
While the previous commit was pretty straightforward, things are different with netcf as it doesn't exposed the bits we need yet.
Rather than tacking these onto the XML returned from netcf after the fact, we should instead modify netcf to directly provide the desired information in the XML returned from ncf_if_xml_state(). I can help with that. Also, note that the interface XML only returns items that can be configured when the VIR_INTERFACE_XML_INACTIVE flag is set, or when the interface isn't active. If these values are read-only, I think they should only be present when the interface is active and the INACTIVE flag isn't set (i.e. they are part of the interface status rather than config). Of course netcf has a separate API (ncf_if_status) that checks the interface IFF_UP and IFF_RUNNING flags, and uses that to determine what is returned from libvirt's virInterfaceIsActive() API. We should probably try to at least be sure that what is returned from that is enough differentiated in documentation from what is returned in this new <state> element that they aren't confused with each other (if they are in fact different in practice). And that brings up another possible issue (depending on exactly how "<link state='up'> meshes with IFF_UP|IFF_RUNNING) - within the current virInterfaceGetXMLDesc() paradigm of "interface *status* is only returned when an interface is 'active'" (which was directly copied from libvirt's behavior in the case of domains and networks), 1) it will never be possible to learn about the state of the interface when the interface is inactive (and if the cable is unplugged, it is considered inactive), and 2) it will then be a bit pointless to report the active/inactive status of the interface in the XML since it will by definition always report "active" (state='up') if the element is present at all. In order to make it useful, we'll need to modify the semantics of virInterfaceGetXMLDesc() in some way such that it is always possible to get the current status even when the interface is down; for example, possibly we could add a VIR_INTERFACE_XML_STATUS flag that would indicate to always return the status of the interface, even when it is inactive.
However, we can work around it by fetching the info we need from SYSFS.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/interface/interface_backend_netcf.c | 99 +++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+)
diff --git a/src/interface/interface_backend_netcf.c b/src/interface/interface_backend_netcf.c index 1b9ace5..40181ef 100644 --- a/src/interface/interface_backend_netcf.c +++ b/src/interface/interface_backend_netcf.c @@ -33,6 +33,7 @@ #include "virlog.h" #include "virstring.h" #include "viraccessapicheck.h" +#include "virfile.h"
#define VIR_FROM_THIS VIR_FROM_INTERFACE
@@ -240,6 +241,96 @@ static struct netcf_if *interfaceDriverGetNetcfIF(struct netcf *ncf, virInterfac return iface; }
+/* Okay, the following two doesn't really belong here as they dump the info + * from SYSFS rather than netcf. But netcf is not yet exposing the info we + * need, so what. */ +#define SYSFS_PREFIX "/sys/class/net/" + +static int +interfaceGetState(const char *name, + virInterfaceState *state) +{ + int ret = -1; + char *path = NULL; + char *buf = NULL; + char *tmp; + + if (virAsprintf(&path, SYSFS_PREFIX "%s/operstate", name) < 0) + goto cleanup; + + if (virFileReadAll(path, 1024, &buf) < 0) { + virReportSystemError(errno, + _("unable to read: %s"), + path); + goto cleanup; + } + + if (!(tmp = strchr(buf, '\n'))) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to parse: %s"), + buf); + goto cleanup; + } + + *tmp = '\0'; + + /* We shouldn't allow 0 here, because + * virInterfaceState enum starts from 1. */ + if ((*state = virInterfaceStateTypeFromString(buf)) <= 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to parse: %s"), + buf); + goto cleanup; + } + + ret = 0; + cleanup: + VIR_FREE(buf); + VIR_FREE(path); + return ret; +} + +static int +interfaceGetSpeed(const char *name, + unsigned long *speed) +{ + int ret = -1; + char *path = NULL; + char *buf = NULL; + char *tmp; + + if (virAsprintf(&path, SYSFS_PREFIX "%s/speed", name) < 0) + goto cleanup; + + if (virFileReadAll(path, 1024, &buf) < 0) { + /* Some devices doesn't report speed, in which case we get EINVAL */ + if (errno == EINVAL) { + ret = 0; + goto cleanup; + } + virReportSystemError(errno, + _("unable to read: %s"), + path); + goto cleanup; + } + + if (virStrToLong_ul(buf, &tmp, 10, speed) < 0 || + *tmp != '\n') { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to parse: %s"), + buf); + goto cleanup; + } + + ret = 1; + cleanup: + VIR_FREE(buf); + VIR_FREE(path); + return ret; +} + +/* Ond of sysfs hacks */ + static int netcfInterfaceObjIsActive(struct netcf_if *iface, bool *active) @@ -840,6 +931,14 @@ static char *netcfInterfaceGetXMLDesc(virInterfacePtr ifinfo, if (virInterfaceGetXMLDescEnsureACL(ifinfo->conn, ifacedef) < 0) goto cleanup;
+ if (interfaceGetState(ifacedef->name, &ifacedef->state) < 0) + goto cleanup; + + /* querying inteface speed makes sense only sometimes */ + if (ifacedef->state == VIR_INTERFACE_STATE_UP && + interfaceGetSpeed(ifacedef->name, &ifacedef->speed) < 0) + goto cleanup; + ret = virInterfaceDefFormat(ifacedef); if (!ret) { /* error was already reported */
The functions in question are qemuHostdevHostSupportsPassthroughLegacy and qemuHostdevHostSupportsPassthroughVFIO. At the same time the functions are renamed to match 'vir' prefix instead of 'qemu' one. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 2 ++ src/qemu/qemu_driver.c | 4 +-- src/qemu/qemu_hostdev.c | 76 ++---------------------------------------------- src/qemu/qemu_hostdev.h | 2 -- src/util/virhostdev.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/virhostdev.h | 4 +++ 6 files changed, 83 insertions(+), 78 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 5e908d6..9a942ec 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1340,6 +1340,8 @@ virHookPresent; # util/virhostdev.h +virHostdevHostSupportsPassthroughLegacy; +virHostdevHostSupportsPassthroughVFIO; virHostdevManagerGetDefault; virHostdevPCINodeDeviceDetach; virHostdevPCINodeDeviceReAttach; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index aa7deb3..d8d0e89 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -11313,8 +11313,8 @@ qemuNodeDeviceDetachFlags(virNodeDevicePtr dev, int ret = -1; virNodeDeviceDefPtr def = NULL; char *xml = NULL; - bool legacy = qemuHostdevHostSupportsPassthroughLegacy(); - bool vfio = qemuHostdevHostSupportsPassthroughVFIO(); + bool legacy = virHostdevHostSupportsPassthroughLegacy(); + bool vfio = virHostdevHostSupportsPassthroughVFIO(); virHostdevManagerPtr hostdev_mgr = driver->hostdevMgr; virCheckFlags(0, -1); diff --git a/src/qemu/qemu_hostdev.c b/src/qemu/qemu_hostdev.c index 706db0c..39dacb2 100644 --- a/src/qemu/qemu_hostdev.c +++ b/src/qemu/qemu_hostdev.c @@ -23,7 +23,6 @@ #include <config.h> -#include <dirent.h> #include <fcntl.h> #include <sys/ioctl.h> #include <errno.h> @@ -84,84 +83,13 @@ qemuUpdateActiveSCSIHostdevs(virQEMUDriverPtr driver, } -bool -qemuHostdevHostSupportsPassthroughVFIO(void) -{ - DIR *iommuDir = NULL; - struct dirent *iommuGroup = NULL; - bool ret = false; - int direrr; - - /* condition 1 - /sys/kernel/iommu_groups/ contains entries */ - if (!(iommuDir = opendir("/sys/kernel/iommu_groups/"))) - goto cleanup; - - while ((direrr = virDirRead(iommuDir, &iommuGroup, NULL)) > 0) { - /* skip ./ ../ */ - if (STRPREFIX(iommuGroup->d_name, ".")) - continue; - - /* assume we found a group */ - break; - } - - if (direrr < 0 || !iommuGroup) - goto cleanup; - /* okay, iommu is on and recognizes groups */ - - /* condition 2 - /dev/vfio/vfio exists */ - if (!virFileExists("/dev/vfio/vfio")) - goto cleanup; - - ret = true; - - cleanup: - if (iommuDir) - closedir(iommuDir); - - return ret; -} - - -#if HAVE_LINUX_KVM_H -# include <linux/kvm.h> -bool -qemuHostdevHostSupportsPassthroughLegacy(void) -{ - int kvmfd = -1; - bool ret = false; - - if ((kvmfd = open("/dev/kvm", O_RDONLY)) < 0) - goto cleanup; - -# ifdef KVM_CAP_IOMMU - if ((ioctl(kvmfd, KVM_CHECK_EXTENSION, KVM_CAP_IOMMU)) <= 0) - goto cleanup; - - ret = true; -# endif - - cleanup: - VIR_FORCE_CLOSE(kvmfd); - - return ret; -} -#else -bool -qemuHostdevHostSupportsPassthroughLegacy(void) -{ - return false; -} -#endif - - static bool qemuPrepareHostdevPCICheckSupport(virDomainHostdevDefPtr *hostdevs, size_t nhostdevs, virQEMUCapsPtr qemuCaps) { - bool supportsPassthroughKVM = qemuHostdevHostSupportsPassthroughLegacy(); - bool supportsPassthroughVFIO = qemuHostdevHostSupportsPassthroughVFIO(); + bool supportsPassthroughKVM = virHostdevHostSupportsPassthroughLegacy(); + bool supportsPassthroughVFIO = virHostdevHostSupportsPassthroughVFIO(); size_t i; /* assign defaults for hostdev passthrough */ diff --git a/src/qemu/qemu_hostdev.h b/src/qemu/qemu_hostdev.h index 05bd965..75cec9c 100644 --- a/src/qemu/qemu_hostdev.h +++ b/src/qemu/qemu_hostdev.h @@ -33,8 +33,6 @@ int qemuUpdateActiveUSBHostdevs(virQEMUDriverPtr driver, virDomainDefPtr def); int qemuUpdateActiveSCSIHostdevs(virQEMUDriverPtr driver, virDomainDefPtr def); -bool qemuHostdevHostSupportsPassthroughLegacy(void); -bool qemuHostdevHostSupportsPassthroughVFIO(void); int qemuPrepareHostdevPCIDevices(virQEMUDriverPtr driver, const char *name, const unsigned char *uuid, diff --git a/src/util/virhostdev.c b/src/util/virhostdev.c index 9dd1df2..f791a10 100644 --- a/src/util/virhostdev.c +++ b/src/util/virhostdev.c @@ -24,6 +24,7 @@ #include <config.h> +#include <dirent.h> #include <fcntl.h> #include <sys/ioctl.h> #include <sys/types.h> @@ -32,6 +33,10 @@ #include <stdlib.h> #include <stdio.h> +#if HAVE_LINUX_KVM_H +# include <linux/kvm.h> +#endif + #include "virhostdev.h" #include "viralloc.h" #include "virstring.h" @@ -1635,3 +1640,71 @@ virHostdevUpdateDomainActiveDevices(virHostdevManagerPtr mgr, return 0; } + +#if HAVE_LINUX_KVM_H +bool +virHostdevHostSupportsPassthroughLegacy(void) +{ + int kvmfd = -1; + bool ret = false; + + if ((kvmfd = open("/dev/kvm", O_RDONLY)) < 0) + goto cleanup; + +# ifdef KVM_CAP_IOMMU + if ((ioctl(kvmfd, KVM_CHECK_EXTENSION, KVM_CAP_IOMMU)) <= 0) + goto cleanup; + + ret = true; +# endif + + cleanup: + VIR_FORCE_CLOSE(kvmfd); + + return ret; +} +#else +bool +virHostdevHostSupportsPassthroughLegacy(void) +{ + return false; +} +#endif + +bool +virHostdevHostSupportsPassthroughVFIO(void) +{ + DIR *iommuDir = NULL; + struct dirent *iommuGroup = NULL; + bool ret = false; + int direrr; + + /* condition 1 - /sys/kernel/iommu_groups/ contains entries */ + if (!(iommuDir = opendir("/sys/kernel/iommu_groups/"))) + goto cleanup; + + while ((direrr = virDirRead(iommuDir, &iommuGroup, NULL)) > 0) { + /* skip ./ ../ */ + if (STRPREFIX(iommuGroup->d_name, ".")) + continue; + + /* assume we found a group */ + break; + } + + if (direrr < 0 || !iommuGroup) + goto cleanup; + /* okay, iommu is on and recognizes groups */ + + /* condition 2 - /dev/vfio/vfio exists */ + if (!virFileExists("/dev/vfio/vfio")) + goto cleanup; + + ret = true; + + cleanup: + if (iommuDir) + closedir(iommuDir); + + return ret; +} diff --git a/src/util/virhostdev.h b/src/util/virhostdev.h index 2036430..99640f5 100644 --- a/src/util/virhostdev.h +++ b/src/util/virhostdev.h @@ -155,4 +155,8 @@ int virHostdevPCINodeDeviceReset(virHostdevManagerPtr mgr, virPCIDevicePtr pci) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); +/* KVM related functions */ +bool virHostdevHostSupportsPassthroughLegacy(void); +bool virHostdevHostSupportsPassthroughVFIO(void); + #endif /* __VIR_HOSTDEV_H__ */ -- 1.9.3
There's no need to check for these two host capabilities on each device attach or detach. It's sufficient to check them on the daemon start and then just query them from virCaps when needed. Moreover, this way it's fairly simple to expose them in capabilities XML. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/conf/capabilities.c | 46 ++++++++++++++++++++++++++++++++++++++++ src/conf/capabilities.h | 11 ++++++++++ src/libvirt_private.syms | 5 +++++ src/lxc/lxc_conf.c | 3 +++ src/nodeinfo.c | 15 +++++++++++++ src/nodeinfo.h | 1 + src/parallels/parallels_driver.c | 3 +++ src/phyp/phyp_driver.c | 3 +++ src/qemu/qemu_capabilities.c | 3 +++ src/qemu/qemu_driver.c | 16 ++++++++++++-- src/qemu/qemu_hostdev.c | 21 ++++++++++++++---- src/uml/uml_conf.c | 3 +++ src/vbox/vbox_tmpl.c | 3 +++ 13 files changed, 127 insertions(+), 6 deletions(-) diff --git a/src/conf/capabilities.c b/src/conf/capabilities.c index cf474d7..9561ba3 100644 --- a/src/conf/capabilities.c +++ b/src/conf/capabilities.c @@ -1068,3 +1068,49 @@ virCapabilitiesGetCpusForNodemask(virCapsPtr caps, return ret; } + + +int +virCapabilitiesGetKVMLegacy(virCapsPtr caps, + bool *legacy) +{ + if (!caps) + return -1; + + *legacy = caps->host.legacyKVMPassthrough; + return 0; +} + +int +virCapabilitiesSetKVMLegacy(virCapsPtr caps, + bool legacy) +{ + if (!caps) + return -1; + + caps->host.legacyKVMPassthrough = legacy; + return 0; +} + + +int +virCapabilitiesGetVFIO(virCapsPtr caps, + bool *vfio) +{ + if (!caps) + return -1; + + *vfio = caps->host.VFIOPassthrough; + return 0; +} + +int +virCapabilitiesSetVFIO(virCapsPtr caps, + bool vfio) +{ + if (!caps) + return -1; + + caps->host.VFIOPassthrough = vfio; + return 0; +} diff --git a/src/conf/capabilities.h b/src/conf/capabilities.h index ba99e1a..935cd14 100644 --- a/src/conf/capabilities.h +++ b/src/conf/capabilities.h @@ -144,6 +144,9 @@ struct _virCapsHost { virCPUDefPtr cpu; unsigned char host_uuid[VIR_UUID_BUFLEN]; + + bool legacyKVMPassthrough; + bool VFIOPassthrough; }; typedef int (*virDomainDefNamespaceParse)(xmlDocPtr, xmlNodePtr, @@ -274,4 +277,12 @@ virCapabilitiesFormatXML(virCapsPtr caps); virBitmapPtr virCapabilitiesGetCpusForNodemask(virCapsPtr caps, virBitmapPtr nodemask); +int virCapabilitiesGetKVMLegacy(virCapsPtr caps, + bool *legacy); +int virCapabilitiesSetKVMLegacy(virCapsPtr caps, + bool legacy); +int virCapabilitiesGetVFIO(virCapsPtr caps, + bool *vfio); +int virCapabilitiesSetVFIO(virCapsPtr caps, + bool vfio); #endif /* __VIR_CAPABILITIES_H */ diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 9a942ec..95f1599 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -58,9 +58,13 @@ virCapabilitiesFormatXML; virCapabilitiesFreeMachines; virCapabilitiesFreeNUMAInfo; virCapabilitiesGetCpusForNodemask; +virCapabilitiesGetKVMLegacy; +virCapabilitiesGetVFIO; virCapabilitiesHostSecModelAddBaseLabel; virCapabilitiesNew; virCapabilitiesSetHostCPU; +virCapabilitiesSetKVMLegacy; +virCapabilitiesSetVFIO; # conf/cpu_conf.h @@ -866,6 +870,7 @@ virLockManagerRelease; # nodeinfo.h +nodeCapsInitKVM; nodeCapsInitNUMA; nodeGetCellsFreeMemory; nodeGetCPUBitmap; diff --git a/src/lxc/lxc_conf.c b/src/lxc/lxc_conf.c index a35a5e0..9589290 100644 --- a/src/lxc/lxc_conf.c +++ b/src/lxc/lxc_conf.c @@ -82,6 +82,9 @@ virCapsPtr virLXCDriverCapsInit(virLXCDriverPtr driver) VIR_WARN("Failed to query host NUMA topology, disabling NUMA capabilities"); } + if (nodeCapsInitKVM(caps) < 0) + VIR_WARN("Failed to query KVM info"); + if (virNodeSuspendGetTargetMask(&caps->host.powerMgmt) < 0) VIR_WARN("Failed to get host power management capabilities"); diff --git a/src/nodeinfo.c b/src/nodeinfo.c index be63a30..d5ce75b 100644 --- a/src/nodeinfo.c +++ b/src/nodeinfo.c @@ -52,6 +52,7 @@ #include "virtypedparam.h" #include "virstring.h" #include "virnuma.h" +#include "virhostdev.h" #define VIR_FROM_THIS VIR_FROM_NONE @@ -1821,6 +1822,20 @@ nodeCapsInitNUMA(virCapsPtr caps) int +nodeCapsInitKVM(virCapsPtr caps) +{ + bool legacy = virHostdevHostSupportsPassthroughLegacy(); + bool vfio = virHostdevHostSupportsPassthroughVFIO(); + + if (virCapabilitiesSetKVMLegacy(caps, legacy) < 0 || + virCapabilitiesSetVFIO(caps, vfio) < 0) + return -1; + + return 0; +} + + +int nodeGetCellsFreeMemory(unsigned long long *freeMems, int startCell, int maxCells) diff --git a/src/nodeinfo.h b/src/nodeinfo.h index 625de39..52d74fc 100644 --- a/src/nodeinfo.h +++ b/src/nodeinfo.h @@ -28,6 +28,7 @@ int nodeGetInfo(virNodeInfoPtr nodeinfo); int nodeCapsInitNUMA(virCapsPtr caps); +int nodeCapsInitKVM(virCapsPtr caps); int nodeGetCPUStats(int cpuNum, virNodeCPUStatsPtr params, diff --git a/src/parallels/parallels_driver.c b/src/parallels/parallels_driver.c index ab59599..9bd7bef 100644 --- a/src/parallels/parallels_driver.c +++ b/src/parallels/parallels_driver.c @@ -126,6 +126,9 @@ parallelsBuildCapabilities(void) if (nodeCapsInitNUMA(caps) < 0) goto error; + if (nodeCapsInitKVM(caps) < 0) + VIR_WARN("Failed to query KVM info"); + if ((guest = virCapabilitiesAddGuest(caps, "hvm", VIR_ARCH_X86_64, "parallels", diff --git a/src/phyp/phyp_driver.c b/src/phyp/phyp_driver.c index 508e4c0..43e3aea 100644 --- a/src/phyp/phyp_driver.c +++ b/src/phyp/phyp_driver.c @@ -305,6 +305,9 @@ phypCapsInit(void) ("Failed to query host NUMA topology, disabling NUMA capabilities"); } + if (nodeCapsInitKVM(caps) < 0) + VIR_WARN("Failed to query KVM info"); + if ((guest = virCapabilitiesAddGuest(caps, "linux", caps->host.arch, diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c index 08c3d04..89f38e2 100644 --- a/src/qemu/qemu_capabilities.c +++ b/src/qemu/qemu_capabilities.c @@ -939,6 +939,9 @@ virCapsPtr virQEMUCapsInit(virQEMUCapsCachePtr cache) VIR_WARN("Failed to query host NUMA topology, disabling NUMA capabilities"); } + if (nodeCapsInitKVM(caps) < 0) + VIR_WARN("Failed to query KVM info"); + if (virQEMUCapsInitCPU(caps, hostarch) < 0) VIR_WARN("Failed to get host CPU"); diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index d8d0e89..9f97b56 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -11313,12 +11313,23 @@ qemuNodeDeviceDetachFlags(virNodeDevicePtr dev, int ret = -1; virNodeDeviceDefPtr def = NULL; char *xml = NULL; - bool legacy = virHostdevHostSupportsPassthroughLegacy(); - bool vfio = virHostdevHostSupportsPassthroughVFIO(); + virCapsPtr caps = NULL; + bool legacy; + bool vfio; virHostdevManagerPtr hostdev_mgr = driver->hostdevMgr; virCheckFlags(0, -1); + if (!(caps = virQEMUDriverGetCapabilities(driver, false))) + goto cleanup; + + if (virCapabilitiesGetKVMLegacy(caps, &legacy) < 0 || + virCapabilitiesGetVFIO(caps, &vfio) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Unable to get info on KVM of VFIO")); + goto cleanup; + } + xml = virNodeDeviceGetXMLDesc(dev, 0); if (!xml) goto cleanup; @@ -11379,6 +11390,7 @@ qemuNodeDeviceDetachFlags(virNodeDevicePtr dev, virPCIDeviceFree(pci); virNodeDeviceDefFree(def); VIR_FREE(xml); + virObjectUnref(caps); return ret; } diff --git a/src/qemu/qemu_hostdev.c b/src/qemu/qemu_hostdev.c index 39dacb2..1b2ae95 100644 --- a/src/qemu/qemu_hostdev.c +++ b/src/qemu/qemu_hostdev.c @@ -84,14 +84,22 @@ qemuUpdateActiveSCSIHostdevs(virQEMUDriverPtr driver, static bool -qemuPrepareHostdevPCICheckSupport(virDomainHostdevDefPtr *hostdevs, +qemuPrepareHostdevPCICheckSupport(virCapsPtr caps, + virDomainHostdevDefPtr *hostdevs, size_t nhostdevs, virQEMUCapsPtr qemuCaps) { - bool supportsPassthroughKVM = virHostdevHostSupportsPassthroughLegacy(); - bool supportsPassthroughVFIO = virHostdevHostSupportsPassthroughVFIO(); + bool supportsPassthroughKVM; + bool supportsPassthroughVFIO; size_t i; + if (virCapabilitiesGetKVMLegacy(caps, &supportsPassthroughKVM) < 0 || + virCapabilitiesGetVFIO(caps, &supportsPassthroughVFIO) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Unable to get info on KVM of VFIO")); + return false; + } + /* assign defaults for hostdev passthrough */ for (i = 0; i < nhostdevs; i++) { virDomainHostdevDefPtr hostdev = hostdevs[i]; @@ -156,15 +164,20 @@ qemuPrepareHostdevPCIDevices(virQEMUDriverPtr driver, unsigned int flags) { int ret = -1; + virCapsPtr caps = NULL; virHostdevManagerPtr hostdev_mgr = driver->hostdevMgr; - if (!qemuPrepareHostdevPCICheckSupport(hostdevs, nhostdevs, qemuCaps)) + if (!(caps = virQEMUDriverGetCapabilities(driver, false))) + goto out; + + if (!qemuPrepareHostdevPCICheckSupport(caps, hostdevs, nhostdevs, qemuCaps)) goto out; ret = virHostdevPreparePCIDevices(hostdev_mgr, QEMU_DRIVER_NAME, name, uuid, hostdevs, nhostdevs, flags); out: + virObjectUnref(caps); return ret; } diff --git a/src/uml/uml_conf.c b/src/uml/uml_conf.c index 464d56d..6fa0546 100644 --- a/src/uml/uml_conf.c +++ b/src/uml/uml_conf.c @@ -70,6 +70,9 @@ virCapsPtr umlCapsInit(void) VIR_WARN("Failed to query host NUMA topology, disabling NUMA capabilities"); } + if (nodeCapsInitKVM(caps) < 0) + VIR_WARN("Failed to query KVM info"); + if (virNodeSuspendGetTargetMask(&caps->host.powerMgmt) < 0) VIR_WARN("Failed to get host power management capabilities"); diff --git a/src/vbox/vbox_tmpl.c b/src/vbox/vbox_tmpl.c index e124e69..5a4e170 100644 --- a/src/vbox/vbox_tmpl.c +++ b/src/vbox/vbox_tmpl.c @@ -888,6 +888,9 @@ static virCapsPtr vboxCapsInit(void) if (nodeCapsInitNUMA(caps) < 0) goto no_memory; + if (nodeCapsInitKVM(caps) < 0) + VIR_WARN("Failed to query KVM info"); + if ((guest = virCapabilitiesAddGuest(caps, "hvm", caps->host.arch, -- 1.9.3
On Thu, May 29, 2014 at 10:32:44AM +0200, Michal Privoznik wrote:
There's no need to check for these two host capabilities on each device attach or detach. It's sufficient to check them on the daemon start and then just query them from virCaps when needed. Moreover, this way it's fairly simple to expose them in capabilities XML.
Unless I'm missing something, this patch is not exposing them in the capabilities XML, as it hasn't modified the XML formatting code at all ?
diff --git a/src/conf/capabilities.c b/src/conf/capabilities.c index cf474d7..9561ba3 100644 --- a/src/conf/capabilities.c +++ b/src/conf/capabilities.c @@ -1068,3 +1068,49 @@ virCapabilitiesGetCpusForNodemask(virCapsPtr caps,
return ret; } + + +int +virCapabilitiesGetKVMLegacy(virCapsPtr caps, + bool *legacy) +{ + if (!caps) + return -1; + + *legacy = caps->host.legacyKVMPassthrough; + return 0; +} + +int +virCapabilitiesSetKVMLegacy(virCapsPtr caps, + bool legacy) +{ + if (!caps) + return -1; + + caps->host.legacyKVMPassthrough = legacy; + return 0; +} + + +int +virCapabilitiesGetVFIO(virCapsPtr caps, + bool *vfio) +{ + if (!caps) + return -1; + + *vfio = caps->host.VFIOPassthrough; + return 0; +} + +int +virCapabilitiesSetVFIO(virCapsPtr caps, + bool vfio) +{ + if (!caps) + return -1; + + caps->host.VFIOPassthrough = vfio; + return 0; +}
I'd expect this file to have modified the XML formatter. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
This piece of information may be useful for management application to decide if a domain with a device passthrough can be started on given host or not. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- Notes: I'm not very happy with the element names, but they're the best I could come up with so far. If you have any better suggestion I am all ears. docs/formatcaps.html.in | 8 +++++++- docs/schemas/capability.rng | 12 ++++++++++++ src/conf/capabilities.c | 4 ++++ tests/capabilityschemadata/caps-qemu-kvm.xml | 2 ++ tests/capabilityschemadata/caps-test.xml | 2 ++ tests/capabilityschemadata/caps-test2.xml | 2 ++ tests/capabilityschemadata/caps-test3.xml | 2 ++ tests/xencapsdata/xen-i686-pae-hvm.xml | 2 ++ tests/xencapsdata/xen-i686-pae.xml | 2 ++ tests/xencapsdata/xen-i686.xml | 2 ++ tests/xencapsdata/xen-ia64-be-hvm.xml | 2 ++ tests/xencapsdata/xen-ia64-be.xml | 2 ++ tests/xencapsdata/xen-ia64-hvm.xml | 2 ++ tests/xencapsdata/xen-ia64.xml | 2 ++ tests/xencapsdata/xen-ppc64.xml | 2 ++ tests/xencapsdata/xen-x86_64-hvm.xml | 2 ++ tests/xencapsdata/xen-x86_64.xml | 2 ++ 17 files changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/formatcaps.html.in b/docs/formatcaps.html.in index d060a5b..eb8c905 100644 --- a/docs/formatcaps.html.in +++ b/docs/formatcaps.html.in @@ -35,6 +35,8 @@ BIOS you will see</p> <suspend_disk/> <suspend_hybrid/> <power_management/> + <kvm>true</kvm> + <vfio>true</vfio> </host></span> <!-- xen-3.0-x86_64 --> @@ -78,7 +80,11 @@ BIOS you will see</p> Suspend-to-Disk (S4) and Hybrid-Suspend (a combination of S3 and S4). In case the host does not support any such feature, then an empty <power_management/> - tag will be shown. </p> + tag will be shown. Then, two elements + <code><kvm/></code> and <code><vfio/></code> + expose the fact, whether the host supports legacy device + passthrough with IOMMU cooperation or newer Virtual function + I/O.</p> <p>The second block (in blue) indicates the paravirtualization support of the Xen support, you will see the os_type of xen to indicate a paravirtual kernel, then architecture diff --git a/docs/schemas/capability.rng b/docs/schemas/capability.rng index d2d9776..3b378eb 100644 --- a/docs/schemas/capability.rng +++ b/docs/schemas/capability.rng @@ -48,6 +48,18 @@ <zeroOrMore> <ref name='secmodel'/> </zeroOrMore> + <element name='kvm'> + <choice> + <value>false</value> + <value>true</value> + </choice> + </element> + <element name='vfio'> + <choice> + <value>false</value> + <value>true</value> + </choice> + </element> </element> </define> diff --git a/src/conf/capabilities.c b/src/conf/capabilities.c index 9561ba3..a91f37b 100644 --- a/src/conf/capabilities.c +++ b/src/conf/capabilities.c @@ -901,6 +901,10 @@ virCapabilitiesFormatXML(virCapsPtr caps) virBufferAddLit(&buf, "</secmodel>\n"); } + /* KVM and VFIO features */ + virBufferAsprintf(&buf, "<kvm>%s</kvm>\n", caps->host.legacyKVMPassthrough ? "true" : "false"); + virBufferAsprintf(&buf, "<vfio>%s</vfio>\n", caps->host.VFIOPassthrough ? "true" : "false"); + virBufferAdjustIndent(&buf, -2); virBufferAddLit(&buf, "</host>\n\n"); diff --git a/tests/capabilityschemadata/caps-qemu-kvm.xml b/tests/capabilityschemadata/caps-qemu-kvm.xml index 55faa16..1a7fca9 100644 --- a/tests/capabilityschemadata/caps-qemu-kvm.xml +++ b/tests/capabilityschemadata/caps-qemu-kvm.xml @@ -28,6 +28,8 @@ <baselabel type='kvm'>system_u:system_r:svirt_t:s0</baselabel> <baselabel type='qemu'>system_u:system_r:svirt_tcg_t:s0</baselabel> </secmodel> + <kvm>false</kvm> + <vfio>false</vfio> </host> <guest> diff --git a/tests/capabilityschemadata/caps-test.xml b/tests/capabilityschemadata/caps-test.xml index 64f9bb6..72bfafc 100644 --- a/tests/capabilityschemadata/caps-test.xml +++ b/tests/capabilityschemadata/caps-test.xml @@ -36,6 +36,8 @@ </cell> </cells> </topology> + <kvm>true</kvm> + <vfio>false</vfio> </host> <guest> diff --git a/tests/capabilityschemadata/caps-test2.xml b/tests/capabilityschemadata/caps-test2.xml index a99c1b8..04c9b09 100644 --- a/tests/capabilityschemadata/caps-test2.xml +++ b/tests/capabilityschemadata/caps-test2.xml @@ -33,6 +33,8 @@ <uri_transport>tcp</uri_transport> </uri_transports> </migration_features> + <kvm>false</kvm> + <vfio>true</vfio> </host> <guest> diff --git a/tests/capabilityschemadata/caps-test3.xml b/tests/capabilityschemadata/caps-test3.xml index 7e21f85..9374077 100644 --- a/tests/capabilityschemadata/caps-test3.xml +++ b/tests/capabilityschemadata/caps-test3.xml @@ -85,6 +85,8 @@ <baselabel type='kvm'>107:107</baselabel> <baselabel type='qemu'>107:107</baselabel> </secmodel> + <kvm>true</kvm> + <vfio>true</vfio> </host> </capabilities> diff --git a/tests/xencapsdata/xen-i686-pae-hvm.xml b/tests/xencapsdata/xen-i686-pae-hvm.xml index 872e5f6..4f3c720 100644 --- a/tests/xencapsdata/xen-i686-pae-hvm.xml +++ b/tests/xencapsdata/xen-i686-pae-hvm.xml @@ -14,6 +14,8 @@ <uri_transport>xenmigr</uri_transport> </uri_transports> </migration_features> + <kvm>false</kvm> + <vfio>false</vfio> </host> <guest> diff --git a/tests/xencapsdata/xen-i686-pae.xml b/tests/xencapsdata/xen-i686-pae.xml index 3dba6eb..f24306e 100644 --- a/tests/xencapsdata/xen-i686-pae.xml +++ b/tests/xencapsdata/xen-i686-pae.xml @@ -14,6 +14,8 @@ <uri_transport>xenmigr</uri_transport> </uri_transports> </migration_features> + <kvm>false</kvm> + <vfio>false</vfio> </host> <guest> diff --git a/tests/xencapsdata/xen-i686.xml b/tests/xencapsdata/xen-i686.xml index 22d7685..b4f0a86 100644 --- a/tests/xencapsdata/xen-i686.xml +++ b/tests/xencapsdata/xen-i686.xml @@ -11,6 +11,8 @@ <uri_transport>xenmigr</uri_transport> </uri_transports> </migration_features> + <kvm>false</kvm> + <vfio>false</vfio> </host> <guest> diff --git a/tests/xencapsdata/xen-ia64-be-hvm.xml b/tests/xencapsdata/xen-ia64-be-hvm.xml index 222de1d..951dbdd 100644 --- a/tests/xencapsdata/xen-ia64-be-hvm.xml +++ b/tests/xencapsdata/xen-ia64-be-hvm.xml @@ -11,6 +11,8 @@ <uri_transport>xenmigr</uri_transport> </uri_transports> </migration_features> + <kvm>false</kvm> + <vfio>false</vfio> </host> <guest> diff --git a/tests/xencapsdata/xen-ia64-be.xml b/tests/xencapsdata/xen-ia64-be.xml index 017816c..0b90fea 100644 --- a/tests/xencapsdata/xen-ia64-be.xml +++ b/tests/xencapsdata/xen-ia64-be.xml @@ -11,6 +11,8 @@ <uri_transport>xenmigr</uri_transport> </uri_transports> </migration_features> + <kvm>false</kvm> + <vfio>false</vfio> </host> <guest> diff --git a/tests/xencapsdata/xen-ia64-hvm.xml b/tests/xencapsdata/xen-ia64-hvm.xml index 33c4946..6b1780e 100644 --- a/tests/xencapsdata/xen-ia64-hvm.xml +++ b/tests/xencapsdata/xen-ia64-hvm.xml @@ -11,6 +11,8 @@ <uri_transport>xenmigr</uri_transport> </uri_transports> </migration_features> + <kvm>false</kvm> + <vfio>false</vfio> </host> <guest> diff --git a/tests/xencapsdata/xen-ia64.xml b/tests/xencapsdata/xen-ia64.xml index 82ce965..1bb58e9 100644 --- a/tests/xencapsdata/xen-ia64.xml +++ b/tests/xencapsdata/xen-ia64.xml @@ -11,6 +11,8 @@ <uri_transport>xenmigr</uri_transport> </uri_transports> </migration_features> + <kvm>false</kvm> + <vfio>false</vfio> </host> <guest> diff --git a/tests/xencapsdata/xen-ppc64.xml b/tests/xencapsdata/xen-ppc64.xml index 91401b9..fa61351 100644 --- a/tests/xencapsdata/xen-ppc64.xml +++ b/tests/xencapsdata/xen-ppc64.xml @@ -11,6 +11,8 @@ <uri_transport>xenmigr</uri_transport> </uri_transports> </migration_features> + <kvm>false</kvm> + <vfio>false</vfio> </host> <guest> diff --git a/tests/xencapsdata/xen-x86_64-hvm.xml b/tests/xencapsdata/xen-x86_64-hvm.xml index 8de8cf4..7aece30 100644 --- a/tests/xencapsdata/xen-x86_64-hvm.xml +++ b/tests/xencapsdata/xen-x86_64-hvm.xml @@ -14,6 +14,8 @@ <uri_transport>xenmigr</uri_transport> </uri_transports> </migration_features> + <kvm>false</kvm> + <vfio>false</vfio> </host> <guest> diff --git a/tests/xencapsdata/xen-x86_64.xml b/tests/xencapsdata/xen-x86_64.xml index 0c3279b..3bbd40c 100644 --- a/tests/xencapsdata/xen-x86_64.xml +++ b/tests/xencapsdata/xen-x86_64.xml @@ -14,6 +14,8 @@ <uri_transport>xenmigr</uri_transport> </uri_transports> </migration_features> + <kvm>false</kvm> + <vfio>false</vfio> </host> <guest> -- 1.9.3
On Thu, May 29, 2014 at 10:32:45AM +0200, Michal Privoznik wrote:
This piece of information may be useful for management application to decide if a domain with a device passthrough can be started on given host or not.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> ---
Notes: I'm not very happy with the element names, but they're the best I could come up with so far. If you have any better suggestion I am all ears.
docs/formatcaps.html.in | 8 +++++++- docs/schemas/capability.rng | 12 ++++++++++++ src/conf/capabilities.c | 4 ++++ tests/capabilityschemadata/caps-qemu-kvm.xml | 2 ++ tests/capabilityschemadata/caps-test.xml | 2 ++ tests/capabilityschemadata/caps-test2.xml | 2 ++ tests/capabilityschemadata/caps-test3.xml | 2 ++ tests/xencapsdata/xen-i686-pae-hvm.xml | 2 ++ tests/xencapsdata/xen-i686-pae.xml | 2 ++ tests/xencapsdata/xen-i686.xml | 2 ++ tests/xencapsdata/xen-ia64-be-hvm.xml | 2 ++ tests/xencapsdata/xen-ia64-be.xml | 2 ++ tests/xencapsdata/xen-ia64-hvm.xml | 2 ++ tests/xencapsdata/xen-ia64.xml | 2 ++ tests/xencapsdata/xen-ppc64.xml | 2 ++ tests/xencapsdata/xen-x86_64-hvm.xml | 2 ++ tests/xencapsdata/xen-x86_64.xml | 2 ++ 17 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/docs/formatcaps.html.in b/docs/formatcaps.html.in index d060a5b..eb8c905 100644 --- a/docs/formatcaps.html.in +++ b/docs/formatcaps.html.in @@ -35,6 +35,8 @@ BIOS you will see</p> <suspend_disk/> <suspend_hybrid/> <power_management/> + <kvm>true</kvm> + <vfio>true</vfio> </host></span>
<!-- xen-3.0-x86_64 --> @@ -78,7 +80,11 @@ BIOS you will see</p> Suspend-to-Disk (S4) and Hybrid-Suspend (a combination of S3 and S4). In case the host does not support any such feature, then an empty <power_management/> - tag will be shown. </p> + tag will be shown. Then, two elements + <code><kvm/></code> and <code><vfio/></code> + expose the fact, whether the host supports legacy device + passthrough with IOMMU cooperation or newer Virtual function + I/O.</p> <p>The second block (in blue) indicates the paravirtualization support of the Xen support, you will see the os_type of xen to indicate a paravirtual kernel, then architecture diff --git a/docs/schemas/capability.rng b/docs/schemas/capability.rng index d2d9776..3b378eb 100644 --- a/docs/schemas/capability.rng +++ b/docs/schemas/capability.rng @@ -48,6 +48,18 @@ <zeroOrMore> <ref name='secmodel'/> </zeroOrMore> + <element name='kvm'> + <choice> + <value>false</value> + <value>true</value> + </choice> + </element> + <element name='vfio'> + <choice> + <value>false</value> + <value>true</value> + </choice> + </element> </element> </define>
diff --git a/src/conf/capabilities.c b/src/conf/capabilities.c index 9561ba3..a91f37b 100644 --- a/src/conf/capabilities.c +++ b/src/conf/capabilities.c @@ -901,6 +901,10 @@ virCapabilitiesFormatXML(virCapsPtr caps) virBufferAddLit(&buf, "</secmodel>\n"); }
+ /* KVM and VFIO features */ + virBufferAsprintf(&buf, "<kvm>%s</kvm>\n", caps->host.legacyKVMPassthrough ? "true" : "false"); + virBufferAsprintf(&buf, "<vfio>%s</vfio>\n", caps->host.VFIOPassthrough ? "true" : "false");
Ah, so this is missing from the previous patch. In fact the splt between this & previous patch is a bit confusing. I'd expect the previous patch to only change the src/conf/capabilities* files and generic test cases. While this patch only change the driver code and driver specific test cases. So this XML is really trying to provide a list of the valid enumeration options for the <driver> element of <hostdev> devices. This is quite a common request for many domain XML elements, so I feel we ought to do something that is not so ad-hoc here. When we get into this world of providing info about supported options for devices, we really need to be able to express this on a fine granularity that the capabilities XML allows for. For example, different emulator binaries will support different options, as will many different architectures, or even different machine types of virtualization types. So it feels like we need a new API for this, that accepts info about the machine we're trying to launch. eg char * virConnectGetEmulatorCapabilties(virConnectPtr conn, const char *emulatorbin, const char *machine, const char *virttype); NB I didn't include 'architecture' since that's implicit in the emulatorbin chosen. The 'char *' return value would be an XML schema As for the XML schema, I haven't given it huge thought, but perhaps something that loosely mirrors the XML schema is desirable. So for the <hostdev> driver type enum I could imagine starting off with: <emulatorCapabilities> <path>/usr/bin/qemyu-system-x86_64</path> <domain>kvm</domain> <arch>x86_64</arch> <machine>pc-1.0</machine> <devices> <hostdev> <driver> <enum name="driver"> <value>kvm</value> <value>vfio</value> </enum> </driver> </hostdev> </devices> </emulatorCapabilities> I would expect that the 'maxCpus' hack we stuffed into the existing capabilities XML would be added to this too eg <vcpus max="120"/> at the top level Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
On 30.05.2014 11:11, Daniel P. Berrange wrote:
On Thu, May 29, 2014 at 10:32:45AM +0200, Michal Privoznik wrote:
This piece of information may be useful for management application to decide if a domain with a device passthrough can be started on given host or not.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> ---
Notes: I'm not very happy with the element names, but they're the best I could come up with so far. If you have any better suggestion I am all ears.
docs/formatcaps.html.in | 8 +++++++- docs/schemas/capability.rng | 12 ++++++++++++ src/conf/capabilities.c | 4 ++++ tests/capabilityschemadata/caps-qemu-kvm.xml | 2 ++ tests/capabilityschemadata/caps-test.xml | 2 ++ tests/capabilityschemadata/caps-test2.xml | 2 ++ tests/capabilityschemadata/caps-test3.xml | 2 ++ tests/xencapsdata/xen-i686-pae-hvm.xml | 2 ++ tests/xencapsdata/xen-i686-pae.xml | 2 ++ tests/xencapsdata/xen-i686.xml | 2 ++ tests/xencapsdata/xen-ia64-be-hvm.xml | 2 ++ tests/xencapsdata/xen-ia64-be.xml | 2 ++ tests/xencapsdata/xen-ia64-hvm.xml | 2 ++ tests/xencapsdata/xen-ia64.xml | 2 ++ tests/xencapsdata/xen-ppc64.xml | 2 ++ tests/xencapsdata/xen-x86_64-hvm.xml | 2 ++ tests/xencapsdata/xen-x86_64.xml | 2 ++ 17 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/docs/formatcaps.html.in b/docs/formatcaps.html.in index d060a5b..eb8c905 100644 --- a/docs/formatcaps.html.in +++ b/docs/formatcaps.html.in @@ -35,6 +35,8 @@ BIOS you will see</p> <suspend_disk/> <suspend_hybrid/> <power_management/> + <kvm>true</kvm> + <vfio>true</vfio> </host></span>
<!-- xen-3.0-x86_64 --> @@ -78,7 +80,11 @@ BIOS you will see</p> Suspend-to-Disk (S4) and Hybrid-Suspend (a combination of S3 and S4). In case the host does not support any such feature, then an empty <power_management/> - tag will be shown. </p> + tag will be shown. Then, two elements + <code><kvm/></code> and <code><vfio/></code> + expose the fact, whether the host supports legacy device + passthrough with IOMMU cooperation or newer Virtual function + I/O.</p> <p>The second block (in blue) indicates the paravirtualization support of the Xen support, you will see the os_type of xen to indicate a paravirtual kernel, then architecture diff --git a/docs/schemas/capability.rng b/docs/schemas/capability.rng index d2d9776..3b378eb 100644 --- a/docs/schemas/capability.rng +++ b/docs/schemas/capability.rng @@ -48,6 +48,18 @@ <zeroOrMore> <ref name='secmodel'/> </zeroOrMore> + <element name='kvm'> + <choice> + <value>false</value> + <value>true</value> + </choice> + </element> + <element name='vfio'> + <choice> + <value>false</value> + <value>true</value> + </choice> + </element> </element> </define>
diff --git a/src/conf/capabilities.c b/src/conf/capabilities.c index 9561ba3..a91f37b 100644 --- a/src/conf/capabilities.c +++ b/src/conf/capabilities.c @@ -901,6 +901,10 @@ virCapabilitiesFormatXML(virCapsPtr caps) virBufferAddLit(&buf, "</secmodel>\n"); }
+ /* KVM and VFIO features */ + virBufferAsprintf(&buf, "<kvm>%s</kvm>\n", caps->host.legacyKVMPassthrough ? "true" : "false"); + virBufferAsprintf(&buf, "<vfio>%s</vfio>\n", caps->host.VFIOPassthrough ? "true" : "false");
Ah, so this is missing from the previous patch.
In fact the splt between this & previous patch is a bit confusing.
I'd expect the previous patch to only change the src/conf/capabilities* files and generic test cases. While this patch only change the driver code and driver specific test cases.
So this XML is really trying to provide a list of the valid enumeration options for the <driver> element of <hostdev> devices. This is quite a common request for many domain XML elements, so I feel we ought to do something that is not so ad-hoc here. When we get into this world of providing info about supported options for devices, we really need to be able to express this on a fine granularity that the capabilities XML allows for.
For example, different emulator binaries will support different options, as will many different architectures, or even different machine types of virtualization types.
So it feels like we need a new API for this, that accepts info about the machine we're trying to launch. eg
char * virConnectGetEmulatorCapabilties(virConnectPtr conn, const char *emulatorbin, const char *machine, const char *virttype);
What's this virttype argument?
NB I didn't include 'architecture' since that's implicit in the emulatorbin chosen. The 'char *' return value would be an XML schema
As for the XML schema, I haven't given it huge thought, but perhaps something that loosely mirrors the XML schema is desirable.
the XML schema? You mean the capabilities XML schema?
So for the <hostdev> driver type enum I could imagine starting off with:
<emulatorCapabilities> <path>/usr/bin/qemyu-system-x86_64</path> <domain>kvm</domain> <arch>x86_64</arch> <machine>pc-1.0</machine <devices> <hostdev> <driver>
I find this misleading. I mean, why kvm and vfio is under devices/hostdev/driver?
<enum name="driver"> <value>kvm</value> <value>vfio</value> </enum> </driver> </hostdev> </devices> </emulatorCapabilities>
I would expect that the 'maxCpus' hack we stuffed into the existing capabilities XML would be added to this too eg <vcpus max="120"/> at the top level
So let me see if I understand correctly. To represent emulatorCapabilities in memory we need another structure, say: typedef struct _virEmulatorCapabilities virEmulatorCapabilities; typedef virEmulatorCapabilities *virEmulatorCapabilitiesPtr; struct _virEmulatorCapabilities { <some values here> }; which will live in src/conf/capabilities.*. Can you shed a light on the structure internals and where it should be created? IIUC, it'll be created in the virConnectGetEmulatorCapabilties(), then formated and immediately disposed. Michal
On Wed, Jun 18, 2014 at 11:51:51AM +0200, Michal Privoznik wrote:
On 30.05.2014 11:11, Daniel P. Berrange wrote:
diff --git a/src/conf/capabilities.c b/src/conf/capabilities.c index 9561ba3..a91f37b 100644 --- a/src/conf/capabilities.c +++ b/src/conf/capabilities.c @@ -901,6 +901,10 @@ virCapabilitiesFormatXML(virCapsPtr caps) virBufferAddLit(&buf, "</secmodel>\n"); }
+ /* KVM and VFIO features */ + virBufferAsprintf(&buf, "<kvm>%s</kvm>\n", caps->host.legacyKVMPassthrough ? "true" : "false"); + virBufferAsprintf(&buf, "<vfio>%s</vfio>\n", caps->host.VFIOPassthrough ? "true" : "false");
Ah, so this is missing from the previous patch.
In fact the splt between this & previous patch is a bit confusing.
I'd expect the previous patch to only change the src/conf/capabilities* files and generic test cases. While this patch only change the driver code and driver specific test cases.
So this XML is really trying to provide a list of the valid enumeration options for the <driver> element of <hostdev> devices. This is quite a common request for many domain XML elements, so I feel we ought to do something that is not so ad-hoc here. When we get into this world of providing info about supported options for devices, we really need to be able to express this on a fine granularity that the capabilities XML allows for.
For example, different emulator binaries will support different options, as will many different architectures, or even different machine types of virtualization types.
So it feels like we need a new API for this, that accepts info about the machine we're trying to launch. eg
char * virConnectGetEmulatorCapabilties(virConnectPtr conn, const char *emulatorbin, const char *machine, const char *virttype);
What's this virttype argument?
Matches the same attribute in domain XML - <domain type='kvm|qemu|xen|uml|...'>
NB I didn't include 'architecture' since that's implicit in the emulatorbin chosen. The 'char *' return value would be an XML schema
As for the XML schema, I haven't given it huge thought, but perhaps something that loosely mirrors the XML schema is desirable.
the XML schema? You mean the capabilities XML schema?
I mean the new emulator XML schema should roughly follow the structure of the domain XML schema. so if domain XML has <domain><devices><disk>...</disk></devices></domain> then, we'd want to keep this kind of nesting when reporting info about what the <disk> element supports for its various enums.
So for the <hostdev> driver type enum I could imagine starting off with:
<emulatorCapabilities> <path>/usr/bin/qemyu-system-x86_64</path> <domain>kvm</domain> <arch>x86_64</arch> <machine>pc-1.0</machine <devices> <hostdev> <driver>
I find this misleading. I mean, why kvm and vfio is under devices/hostdev/driver?
IIUC, you were attempting to expose flags showing whether the host kernel supports vfio or kvm PCI passthrough. Apps would then consume that to decide what the <hostdev mode='subsystem' type='pci'> for values of the <driver name='kvm|vfio|...'> attribute. Of course that's only half the problem because they also need to know if the QEMU binary supports the 'vfio' scheme too. I'm saying that instead of having capabilities which represent what the host kernel supports, we should directly expose information on whether you can use the attribute in the <driver> element of the hostdev. This captures both kernel and QEMU supportability.
<enum name="driver"> <value>kvm</value> <value>vfio</value> </enum> </driver> </hostdev> </devices> </emulatorCapabilities>
I would expect that the 'maxCpus' hack we stuffed into the existing capabilities XML would be added to this too eg <vcpus max="120"/> at the top level
So let me see if I understand correctly. To represent emulatorCapabilities in memory we need another structure, say:
typedef struct _virEmulatorCapabilities virEmulatorCapabilities; typedef virEmulatorCapabilities *virEmulatorCapabilitiesPtr; struct _virEmulatorCapabilities { <some values here> };
which will live in src/conf/capabilities.*. Can you shed a light on the structure internals and where it should be created? IIUC, it'll be created in the virConnectGetEmulatorCapabilties(), then formated and immediately disposed.
Ignore the existing <capabilities> XML and its code. This should be a new XML schema that is completely separate giving information specifically related to features of the emulator in question. We'd not use the existing virCapabilities structure. Instead I'd expect that we directly populate this new schema with info from the qemuCapsPtr object. This mechanism will let us address this more general problem that both libguestfs and virt-manager have repeatedly asked for - a way to query what features QEMU/libvirt supports for guest configuration. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
A PCI device can be associated with a specific NUMA node. Later, when a guest is pinned to one NUMA node the PCI device can be assigned on different NUMA node. This makes DMA transfers travel across nodes and thus results in suboptimal performance. We should expose the NUMA node locality for PCI devices so management applications can make better decisions. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- Notes: All the machines I have tried this on had only -1 in the numa_node file. From the kernel sources it seems that this is the default, so I'm not printing the <numa/> element into the XML in this case. But I'd like to hear your opinion. docs/formatnode.html.in | 6 +++ docs/schemas/nodedev.rng | 11 ++++++ src/conf/node_device_conf.c | 43 ++++++++++++++++++++++ src/conf/node_device_conf.h | 1 + src/node_device/node_device_udev.c | 7 ++++ tests/nodedevschemadata/pci_1002_71c4.xml | 1 + tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml | 1 + 7 files changed, 70 insertions(+) diff --git a/docs/formatnode.html.in b/docs/formatnode.html.in index 46ec2bc..3671c8b 100644 --- a/docs/formatnode.html.in +++ b/docs/formatnode.html.in @@ -124,6 +124,12 @@ GigaTransfers per second) and <code>width</code> for the number of lanes used. Since the port can't be negotiated, it's not exposed in <code>./pci-express/link/[@validity='sta']</code>. + <dt><code>numa</code></dt> + <dd> + This optional element contains information on the PCI device + with respect to NUMA. For example, the optional + <code>node</code> attribute tells which NUMA node is the PCI + device associated with. </dd> </dl> </dd> diff --git a/docs/schemas/nodedev.rng b/docs/schemas/nodedev.rng index 79e8fd2..10b716f 100644 --- a/docs/schemas/nodedev.rng +++ b/docs/schemas/nodedev.rng @@ -184,6 +184,17 @@ </zeroOrMore> </element> </optional> + + <optional> + <element name='numa'> + <optional> + <attribute name='node'> + <data type='int'/> + </attribute> + </optional> + </element> + </optional> + </define> <define name='capusbdev'> diff --git a/src/conf/node_device_conf.c b/src/conf/node_device_conf.c index 70634cc..57dccc4 100644 --- a/src/conf/node_device_conf.c +++ b/src/conf/node_device_conf.c @@ -380,6 +380,9 @@ char *virNodeDeviceDefFormat(const virNodeDeviceDef *def) } if (data->pci_dev.flags & VIR_NODE_DEV_CAP_FLAG_PCIE) virPCIEDeviceInfoFormat(&buf, data->pci_dev.pci_express); + if (data->pci_dev.numa_node >= 0) + virBufferAsprintf(&buf, "<numa node='%d'/>\n", + data->pci_dev.numa_node); break; case VIR_NODE_DEV_CAP_USB_DEV: virBufferAsprintf(&buf, "<bus>%d</bus>\n", data->usb_dev.bus); @@ -554,6 +557,41 @@ char *virNodeDeviceDefFormat(const virNodeDeviceDef *def) return NULL; } +/** + * virNodeDevCapsDefParseIntOptional: + * @xpath: XPath to evaluate + * @ctxt: Context + * @value: Where to store parsed value + * @def: Node device which is parsed + * @invalid_error_fmt: error message to print on invalid format + * + * Returns: -1 on error (invalid int format under @xpath) + * 0 if @xpath was not found (@value is untouched) + * 1 on success + */ +static int +virNodeDevCapsDefParseIntOptional(const char *xpath, + xmlXPathContextPtr ctxt, + int *value, + virNodeDeviceDefPtr def, + const char *invalid_error_fmt) +{ + int ret; + int val; + + ret = virXPathInt(xpath, ctxt, &val); + if (ret < -1) { + virReportError(VIR_ERR_INTERNAL_ERROR, + invalid_error_fmt, + def->name); + return -1; + } else if (ret == -1) { + return 0; + } + *value = val; + return 1; +} + static int virNodeDevCapsDefParseULong(const char *xpath, xmlXPathContextPtr ctxt, @@ -1222,6 +1260,11 @@ virNodeDevCapPCIDevParseXML(xmlXPathContextPtr ctxt, goto out; } + if (virNodeDevCapsDefParseIntOptional("number(./numa[1]/@node)", ctxt, + &data->pci_dev.numa_node, def, + _("invalid NUMA node ID supplied for '%s'")) < 0) + goto out; + ret = 0; out: ctxt->node = orignode; diff --git a/src/conf/node_device_conf.h b/src/conf/node_device_conf.h index 563bf6a..6abab5e 100644 --- a/src/conf/node_device_conf.h +++ b/src/conf/node_device_conf.h @@ -142,6 +142,7 @@ struct _virNodeDevCapsDef { size_t nIommuGroupDevices; unsigned int iommuGroupNumber; virPCIEDeviceInfoPtr pci_express; + int numa_node; } pci_dev; struct { unsigned int bus; diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index 9ed869b..1663edc 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -495,6 +495,13 @@ static int udevProcessPCI(struct udev_device *device, goto out; } + if (udevGetIntSysfsAttr(device, + "numa_node", + &data->pci_dev.numa_node, + 10) == PROPERTY_ERROR) { + goto out; + } + if (!virPCIGetPhysicalFunction(syspath, &data->pci_dev.physical_function)) data->pci_dev.flags |= VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION; diff --git a/tests/nodedevschemadata/pci_1002_71c4.xml b/tests/nodedevschemadata/pci_1002_71c4.xml index 6de09c1..6d5d85b 100644 --- a/tests/nodedevschemadata/pci_1002_71c4.xml +++ b/tests/nodedevschemadata/pci_1002_71c4.xml @@ -8,5 +8,6 @@ <function>0</function> <product id='0x71c4'>M56GL [Mobility FireGL V5200]</product> <vendor id='0x1002'>ATI Technologies Inc</vendor> + <numa node='1'/> </capability> </device> diff --git a/tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml b/tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml index eff8932..6e1dc86 100644 --- a/tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml +++ b/tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml @@ -12,5 +12,6 @@ <address domain='0x0000' bus='0x02' slot='0x00' function='0x0'/> <address domain='0x0000' bus='0x02' slot='0x00' function='0x1'/> </iommuGroup> + <numa node='0'/> </capability> </device> -- 1.9.3
On Thu, May 29, 2014 at 10:32:46AM +0200, Michal Privoznik wrote:
A PCI device can be associated with a specific NUMA node. Later, when a guest is pinned to one NUMA node the PCI device can be assigned on different NUMA node. This makes DMA transfers travel across nodes and thus results in suboptimal performance. We should expose the NUMA node locality for PCI devices so management applications can make better decisions.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> ---
Notes: All the machines I have tried this on had only -1 in the numa_node file. From the kernel sources it seems that this is the default, so I'm not printing the <numa/> element into the XML in this case. But I'd like to hear your opinion.
Yes, I believe '-1' means that there is no NUMA locality info available for the device, so it makes sense to skip this. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
On Fri, May 30, 2014 at 10:14:17AM +0100, Daniel P. Berrange wrote:
On Thu, May 29, 2014 at 10:32:46AM +0200, Michal Privoznik wrote:
A PCI device can be associated with a specific NUMA node. Later, when a guest is pinned to one NUMA node the PCI device can be assigned on different NUMA node. This makes DMA transfers travel across nodes and thus results in suboptimal performance. We should expose the NUMA node locality for PCI devices so management applications can make better decisions.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> ---
Notes: All the machines I have tried this on had only -1 in the numa_node file. From the kernel sources it seems that this is the default, so I'm not printing the <numa/> element into the XML in this case. But I'd like to hear your opinion.
Yes, I believe '-1' means that there is no NUMA locality info available for the device, so it makes sense to skip this.
Confirmed in the kernel source include/linux/numa.h:#define NUMA_NO_NODE (-1) Is used when the ACPI tables don't specify any NUMA node for the PCI device, or when the NUMA node is not online. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- generator.py | 2 ++ libvirt-override-virConnect.py | 7 ++++ libvirt-override.c | 78 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/generator.py b/generator.py index bdac877..a35c05c 100755 --- a/generator.py +++ b/generator.py @@ -522,6 +522,8 @@ skip_function = ( 'virDomainGetTime', # overridden in virDomain.py 'virDomainSetTime', # overridden in virDomain.py + 'virNodeHugeTLB', # overridden in virConnect.py + # 'Ref' functions have no use for bindings users. "virConnectRef", "virDomainRef", diff --git a/libvirt-override-virConnect.py b/libvirt-override-virConnect.py index c228eb2..9c46e0f 100644 --- a/libvirt-override-virConnect.py +++ b/libvirt-override-virConnect.py @@ -383,3 +383,10 @@ if ret is None:raise libvirtError('virDomainCreateXMLWithFiles() failed', conn=self) __tmp = virDomain(self,_obj=ret) return __tmp + + def hugeTLB(self, node=-1, flags=0): + """Get info on huge pages. Pass node=-1 for list of supported + hugepages sizes """ + ret = libvirtmod.virNodeHugeTLB(self._o, node, flags) + if ret is None: raise libvirtError('virNodeHugeTLB() failed', conn=self) + return ret diff --git a/libvirt-override.c b/libvirt-override.c index a7a6213..1d1dc4e 100644 --- a/libvirt-override.c +++ b/libvirt-override.c @@ -7740,6 +7740,81 @@ libvirt_virDomainSetTime(PyObject *self ATTRIBUTE_UNUSED, PyObject *args) { } #endif /* LIBVIR_CHECK_VERSION(1, 2, 5) */ +#if LIBVIR_CHECK_VERSION(1, 2, 6) +static PyObject * +libvirt_virNodeHugeTLB(PyObject *self ATTRIBUTE_UNUSED, + PyObject *args) { + virConnectPtr conn; + PyObject *pyobj_conn, *py_retval = NULL; + PyObject *dict = NULL; + int node = -1; + virTypedParameterPtr params; + int nparams = 0; + unsigned int flags; + int c_retval; + ssize_t i, j; + unsigned int count = 0; + + if (!PyArg_ParseTuple(args, (char *)"Oii:virNodeHugeTLB", + &pyobj_conn, &node, &flags)) + return NULL; + conn = (virConnectPtr) PyvirConnect_Get(pyobj_conn); + + LIBVIRT_BEGIN_ALLOW_THREADS; + c_retval = virNodeHugeTLB(conn, node, NULL, &nparams, flags); + LIBVIRT_END_ALLOW_THREADS; + + if (c_retval < 0) + return VIR_PY_NONE; + + if (!nparams) + return PyDict_New(); + + if (VIR_ALLOC_N(params, nparams) < 0) + return PyErr_NoMemory(); + + LIBVIRT_BEGIN_ALLOW_THREADS; + c_retval = virNodeHugeTLB(conn, node, params, &nparams, flags); + LIBVIRT_END_ALLOW_THREADS; + + if (c_retval < 0) { + py_retval = VIR_PY_NONE; + goto cleanup; + } + + if (!(dict = PyDict_New())) + goto cleanup; + + j = i = nparams; + while (i) { + PyObject *key, *val; + + i--; + + if (STREQ(VIR_NODE_HUGE_PAGE_SIZE, params[i].field)) { + val = getPyVirTypedParameter(params + i, j - i); + key = libvirt_uintWrap(count++); + + if (!val || !key || + PyDict_SetItem(dict, key, val) < 0) { + Py_XDECREF(val); + Py_XDECREF(key); + goto cleanup; + } + j = i; + } + } + + py_retval = dict; + dict = NULL; + + cleanup: + virTypedParamsFree(params, nparams); + Py_XDECREF(dict); + return py_retval; +} +#endif /* LIBVIR_CHECK_VERSION(1, 2, 6) */ + /************************************************************************ * * * The registration stuff * @@ -7921,6 +7996,9 @@ static PyMethodDef libvirtMethods[] = { {(char *) "virDomainGetTime", libvirt_virDomainGetTime, METH_VARARGS, NULL}, {(char *) "virDomainSetTime", libvirt_virDomainSetTime, METH_VARARGS, NULL}, #endif /* LIBVIR_CHECK_VERSION(1, 2, 5) */ +#if LIBVIR_CHECK_VERSION(1, 2, 6) + {(char *) "virNodeHugeTLB", libvirt_virNodeHugeTLB, METH_VARARGS, NULL}, +#endif /* LIBVIR_CHECK_VERSION(1, 2, 6) */ {NULL, NULL, 0, NULL} }; -- 1.9.3
participants (3)
-
Daniel P. Berrange -
Laine Stump -
Michal Privoznik