[libvirt] [PATCH] Enable tuning of qemu network tap device "sndbuf" size

This is in response to a request in: https://bugzilla.redhat.com/show_bug.cgi?id=665293 In short, under heavy load, it's possible for qemu's networking to lock up due to the tap device's default 1MB sndbuf being inadequate. adding "sndbuf=0" to the qemu commandline -netdevice option will alleviate this problem (sndbuf=0 actually sets it to 0xffffffff). Because we must be able to explicitly specify "0" as a value, the standard practice of "0 means not specified" won't work here. Instead, virDomainNetDef also has a sndbuf_specified, which defaults to 0, but is set to 1 if some value was given. The sndbuf value is put inside a <tune> element of each <interface> in the domain. The intent is that further tunable settings will also be placed inside this elemnent. <interface type='network'> ... <tune> <sndbuf>0</sndbuf> ... </tune> </interface> --- Note that in qemuBuildHostNetStr() I have moved if (vhostfd && *vhostfd) { virBufferVSprintf(&buf, ",vhost=on,vhostfd=%s", vhostfd); into a newly created "if (is_tap) { ... }" block. This always should have been inside such a conditional, but none existed until now. (I can make this a separate patch if anyone wants, but it seemed so simple and obvious that I took the slothenly way out :-) Also, as with the vhost patch, I didn't get the html docs updated for this addition either. I will add both in a single followup patch next week. docs/schemas/domain.rng | 10 ++++++++++ src/conf/domain_conf.c | 31 +++++++++++++++++++++++++++++-- src/conf/domain_conf.h | 4 ++++ src/qemu/qemu_command.c | 19 +++++++++++++++++-- 4 files changed, 60 insertions(+), 4 deletions(-) diff --git a/docs/schemas/domain.rng b/docs/schemas/domain.rng index 6d0654d..e2883aa 100644 --- a/docs/schemas/domain.rng +++ b/docs/schemas/domain.rng @@ -1025,6 +1025,16 @@ <ref name="filterref-node-attributes"/> </element> </optional> + <optional> + <element name="tune"> + <optional> + <!-- size of send buffer for network tap devices --> + <element name="sndbuf"> + <ref name="unsignedInt"/> + </element> + </optional> + </element> + </optional> </interleave> </define> <define name="virtualPortProfile"> diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 04ed502..5d1b8cf 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -2280,6 +2280,7 @@ err_exit: static virDomainNetDefPtr virDomainNetDefParseXML(virCapsPtr caps, xmlNodePtr node, + xmlXPathContextPtr ctxt, int flags ATTRIBUTE_UNUSED) { virDomainNetDefPtr def; xmlNodePtr cur; @@ -2298,15 +2299,19 @@ virDomainNetDefParseXML(virCapsPtr caps, char *internal = NULL; char *devaddr = NULL; char *mode = NULL; + unsigned long sndbuf; virNWFilterHashTablePtr filterparams = NULL; virVirtualPortProfileParams virtPort; bool virtPortParsed = false; + xmlNodePtr oldnode = ctxt->node; if (VIR_ALLOC(def) < 0) { virReportOOMError(); return NULL; } + ctxt->node = node; + type = virXMLPropString(node, "type"); if (type != NULL) { if ((int)(def->type = virDomainNetTypeFromString(type)) < 0) { @@ -2593,7 +2598,13 @@ virDomainNetDefParseXML(virCapsPtr caps, } } + if (virXPathULong("string(./tune/sndbuf)", ctxt, &sndbuf) >= 0) { + def->tune.sndbuf = sndbuf; + def->tune.sndbuf_specified = 1; + } + cleanup: + ctxt->node = oldnode; VIR_FREE(macaddr); VIR_FREE(network); VIR_FREE(address); @@ -4303,6 +4314,7 @@ virDomainDeviceDefPtr virDomainDeviceDefParse(virCapsPtr caps, { xmlDocPtr xml; xmlNodePtr node; + xmlXPathContextPtr ctxt = NULL; virDomainDeviceDefPtr dev = NULL; if (!(xml = xmlReadDoc(BAD_CAST xmlStr, "device.xml", NULL, @@ -4319,6 +4331,13 @@ virDomainDeviceDefPtr virDomainDeviceDefParse(virCapsPtr caps, goto error; } + ctxt = xmlXPathNewContext(xml); + if (ctxt == NULL) { + virReportOOMError(); + goto error; + } + ctxt->node = node; + if (VIR_ALLOC(dev) < 0) { virReportOOMError(); goto error; @@ -4334,7 +4353,7 @@ virDomainDeviceDefPtr virDomainDeviceDefParse(virCapsPtr caps, goto error; } else if (xmlStrEqual(node->name, BAD_CAST "interface")) { dev->type = VIR_DOMAIN_DEVICE_NET; - if (!(dev->data.net = virDomainNetDefParseXML(caps, node, flags))) + if (!(dev->data.net = virDomainNetDefParseXML(caps, node, ctxt, flags))) goto error; } else if (xmlStrEqual(node->name, BAD_CAST "input")) { dev->type = VIR_DOMAIN_DEVICE_INPUT; @@ -4372,11 +4391,12 @@ virDomainDeviceDefPtr virDomainDeviceDefParse(virCapsPtr caps, } xmlFreeDoc(xml); - + xmlXPathFreeContext(ctxt); return dev; error: xmlFreeDoc(xml); + xmlXPathFreeContext(ctxt); VIR_FREE(dev); return NULL; } @@ -5047,6 +5067,7 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, for (i = 0 ; i < n ; i++) { virDomainNetDefPtr net = virDomainNetDefParseXML(caps, nodes[i], + ctxt, flags); if (!net) goto error; @@ -6315,6 +6336,12 @@ virDomainNetDefFormat(virBufferPtr buf, VIR_FREE(attrs); } + if (def->tune.sndbuf_specified) { + virBufferAddLit(buf, " <tune>\n"); + virBufferVSprintf(buf, " <sndbuf>%d</sndbuf>\n", def->tune.sndbuf); + virBufferAddLit(buf, " </tune>\n"); + } + if (virDomainDeviceInfoFormat(buf, &def->info, flags) < 0) return -1; diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 451ccad..2d35d68 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -346,6 +346,10 @@ struct _virDomainNetDef { virVirtualPortProfileParams virtPortProfile; } direct; } data; + struct { + int sndbuf_specified : 1; + int sndbuf; + } tune; char *ifname; virDomainDeviceInfo info; char *filter; diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 9eb54a1..add66cb 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -1587,6 +1587,7 @@ qemuBuildHostNetStr(virDomainNetDefPtr net, const char *tapfd, const char *vhostfd) { + bool is_tap = false; virBuffer buf = VIR_BUFFER_INITIALIZER; switch (net->type) { @@ -1596,6 +1597,7 @@ qemuBuildHostNetStr(virDomainNetDefPtr net, virBufferAddLit(&buf, "tap"); virBufferVSprintf(&buf, "%cfd=%s", type_sep, tapfd); type_sep = ','; + is_tap = true; break; case VIR_DOMAIN_NET_TYPE_ETHERNET: @@ -1609,6 +1611,7 @@ qemuBuildHostNetStr(virDomainNetDefPtr net, net->data.ethernet.script); type_sep = ','; } + is_tap = true; break; case VIR_DOMAIN_NET_TYPE_CLIENT: @@ -1662,8 +1665,11 @@ qemuBuildHostNetStr(virDomainNetDefPtr net, type_sep, net->info.alias); } - if (vhostfd && *vhostfd) { - virBufferVSprintf(&buf, ",vhost=on,vhostfd=%s", vhostfd); + if (is_tap) { + if (vhostfd && *vhostfd) + virBufferVSprintf(&buf, ",vhost=on,vhostfd=%s", vhostfd); + if (net->tune.sndbuf_specified) + virBufferVSprintf(&buf, ",sndbuf=%d", net->tune.sndbuf); } if (virBufferError(&buf)) { @@ -4665,6 +4671,15 @@ qemuParseCommandLineNet(virCapsPtr caps, def->backend = VIR_DOMAIN_NET_BACKEND_TYPE_QEMU; } def->backend_specified = 1; + } else if (STREQ(keywords[i], "sndbuf") && values[i]) { + if (virStrToLong_i(values[i], NULL, 10, &def->tune.sndbuf) < 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("cannot parse sndbuf size in '%s'"), val); + virDomainNetDefFree(def); + def = NULL; + goto cleanup; + } + def->tune.sndbuf_specified = 1; } } -- 1.7.3.4

On 01/12/2011 11:45 PM, Laine Stump wrote:
This is in response to a request in:
https://bugzilla.redhat.com/show_bug.cgi?id=665293
In short, under heavy load, it's possible for qemu's networking to lock up due to the tap device's default 1MB sndbuf being inadequate. adding "sndbuf=0" to the qemu commandline -netdevice option will alleviate this problem (sndbuf=0 actually sets it to 0xffffffff).
Because we must be able to explicitly specify "0" as a value, the standard practice of "0 means not specified" won't work here. Instead, virDomainNetDef also has a sndbuf_specified, which defaults to 0, but is set to 1 if some value was given.
The sndbuf value is put inside a <tune> element of each <interface> in the domain. The intent is that further tunable settings will also be placed inside this elemnent.
s/elemnent/element/
Also, as with the vhost patch, I didn't get the html docs updated for this addition either. I will add both in a single followup patch next week.
Added to my list of things to remind you about, if you forget :)
+ unsigned long sndbuf;
+ if (virXPathULong("string(./tune/sndbuf)", ctxt, &sndbuf) >= 0) { + def->tune.sndbuf = sndbuf;
Hmm, we don't have virXPathInt or virXPathUInt. Type mismatch. If sndbuf > INT_MAX, we've got issues. Should def->tune.sndbuf be unsigned? If so...
@@ -6315,6 +6336,12 @@ virDomainNetDefFormat(virBufferPtr buf, VIR_FREE(attrs); }
+ if (def->tune.sndbuf_specified) { + virBufferAddLit(buf, " <tune>\n"); + virBufferVSprintf(buf, " <sndbuf>%d</sndbuf>\n", def->tune.sndbuf);
this changes to %lu
+ struct { + int sndbuf_specified : 1; + int sndbuf;
and this line changes
@@ -1662,8 +1665,11 @@ qemuBuildHostNetStr(virDomainNetDefPtr net, type_sep, net->info.alias); }
- if (vhostfd && *vhostfd) { - virBufferVSprintf(&buf, ",vhost=on,vhostfd=%s", vhostfd); + if (is_tap) { + if (vhostfd && *vhostfd) + virBufferVSprintf(&buf, ",vhost=on,vhostfd=%s", vhostfd); + if (net->tune.sndbuf_specified) + virBufferVSprintf(&buf, ",sndbuf=%d", net->tune.sndbuf);
%lu
def->backend = VIR_DOMAIN_NET_BACKEND_TYPE_QEMU; } def->backend_specified = 1; + } else if (STREQ(keywords[i], "sndbuf") && values[i]) { + if (virStrToLong_i(values[i], NULL, 10, &def->tune.sndbuf) < 0) {
Oh, we don't have virStrToLong_ul. How odd. Maybe we should do a prerequisite patch that rounds out the type system so that both xml.h and util.h have a matching set of functions (for every type we can parse out of xml, we also have a way to parse it out of a string), so that you can pick the best type to use. At any rate, 'unsigned long' may be overkill, but I do think this should use 'unsigned int' rather than int in the in-memory representation. I'll go ahead and propose a quickie patch to fix the missing utility functions, then we can do a v2 of this on top of that with a sane type throughout. But other than the type mismatch and one spelling nit, I like the approach of this patch. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On 01/13/2011 04:21 PM, Eric Blake wrote:
On 01/12/2011 11:45 PM, Laine Stump wrote:
This is in response to a request in:
https://bugzilla.redhat.com/show_bug.cgi?id=665293
In short, under heavy load, it's possible for qemu's networking to lock up due to the tap device's default 1MB sndbuf being inadequate. adding "sndbuf=0" to the qemu commandline -netdevice option will alleviate this problem (sndbuf=0 actually sets it to 0xffffffff).
Because we must be able to explicitly specify "0" as a value, the standard practice of "0 means not specified" won't work here. Instead, virDomainNetDef also has a sndbuf_specified, which defaults to 0, but is set to 1 if some value was given.
The sndbuf value is put inside a<tune> element of each<interface> in the domain. The intent is that further tunable settings will also be placed inside this elemnent. s/elemnent/element/
Fixed.
Also, as with the vhost patch, I didn't get the html docs updated for this addition either. I will add both in a single followup patch next week. Added to my list of things to remind you about, if you forget :)
Also on my own list.
Oh, we don't have virStrToLong_ul.
Now can be changed to "we *didn't* have virStrToLong_ul" :-) Thanks! I changed all the sndbuf things to unsigned long, and have also changed sndbuf_specified to a bool in v2.

On Thu, Jan 13, 2011 at 01:45:29AM -0500, Laine Stump wrote:
This is in response to a request in:
https://bugzilla.redhat.com/show_bug.cgi?id=665293
In short, under heavy load, it's possible for qemu's networking to lock up due to the tap device's default 1MB sndbuf being inadequate. adding "sndbuf=0" to the qemu commandline -netdevice option will alleviate this problem (sndbuf=0 actually sets it to 0xffffffff).
Because we must be able to explicitly specify "0" as a value, the standard practice of "0 means not specified" won't work here. Instead, virDomainNetDef also has a sndbuf_specified, which defaults to 0, but is set to 1 if some value was given.
The sndbuf value is put inside a <tune> element of each <interface> in the domain. The intent is that further tunable settings will also be placed inside this elemnent.
<interface type='network'> ... <tune> <sndbuf>0</sndbuf> ... </tune> </interface> ---
Note that in qemuBuildHostNetStr() I have moved
if (vhostfd && *vhostfd) { virBufferVSprintf(&buf, ",vhost=on,vhostfd=%s", vhostfd);
into a newly created "if (is_tap) { ... }" block. This always should have been inside such a conditional, but none existed until now. (I can make this a separate patch if anyone wants, but it seemed so simple and obvious that I took the slothenly way out :-)
Also, as with the vhost patch, I didn't get the html docs updated for this addition either. I will add both in a single followup patch next week.
docs/schemas/domain.rng | 10 ++++++++++ src/conf/domain_conf.c | 31 +++++++++++++++++++++++++++++-- src/conf/domain_conf.h | 4 ++++ src/qemu/qemu_command.c | 19 +++++++++++++++++-- 4 files changed, 60 insertions(+), 4 deletions(-)
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 04ed502..5d1b8cf 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -2280,6 +2280,7 @@ err_exit: static virDomainNetDefPtr virDomainNetDefParseXML(virCapsPtr caps, xmlNodePtr node, + xmlXPathContextPtr ctxt, int flags ATTRIBUTE_UNUSED) { virDomainNetDefPtr def; xmlNodePtr cur; @@ -2298,15 +2299,19 @@ virDomainNetDefParseXML(virCapsPtr caps, char *internal = NULL; char *devaddr = NULL; char *mode = NULL; + unsigned long sndbuf; virNWFilterHashTablePtr filterparams = NULL; virVirtualPortProfileParams virtPort; bool virtPortParsed = false; + xmlNodePtr oldnode = ctxt->node;
if (VIR_ALLOC(def) < 0) { virReportOOMError(); return NULL; }
+ ctxt->node = node; + type = virXMLPropString(node, "type"); if (type != NULL) { if ((int)(def->type = virDomainNetTypeFromString(type)) < 0) { @@ -2593,7 +2598,13 @@ virDomainNetDefParseXML(virCapsPtr caps, } }
+ if (virXPathULong("string(./tune/sndbuf)", ctxt, &sndbuf) >= 0) { + def->tune.sndbuf = sndbuf; + def->tune.sndbuf_specified = 1; + }
This is parsing a 'long'
@@ -6315,6 +6336,12 @@ virDomainNetDefFormat(virBufferPtr buf, VIR_FREE(attrs); }
+ if (def->tune.sndbuf_specified) { + virBufferAddLit(buf, " <tune>\n"); + virBufferVSprintf(buf, " <sndbuf>%d</sndbuf>\n", def->tune.sndbuf); + virBufferAddLit(buf, " </tune>\n"); + }
But this is printing an int
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 451ccad..2d35d68 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -346,6 +346,10 @@ struct _virDomainNetDef { virVirtualPortProfileParams virtPortProfile; } direct; } data; + struct { + int sndbuf_specified : 1; + int sndbuf; + } tune;
And this is storing an int. They should really all be ints, or all be longs. Also bitfields should be 'unsigned' rather than int, otherwise you're actually storing 0 and -1 as possible values, rather than 0 & 1. Or alternatively could just use a 'bool' here. Since there's only one bitfield, padding means we're not saving any space. Regards, Daniel

On 01/14/2011 08:27 AM, Daniel P. Berrange wrote:
On Thu, Jan 13, 2011 at 01:45:29AM -0500, Laine Stump wrote:
This is in response to a request in:
https://bugzilla.redhat.com/show_bug.cgi?id=665293
In short, under heavy load, it's possible for qemu's networking to lock up due to the tap device's default 1MB sndbuf being inadequate. adding "sndbuf=0" to the qemu commandline -netdevice option will alleviate this problem (sndbuf=0 actually sets it to 0xffffffff).
Because we must be able to explicitly specify "0" as a value, the standard practice of "0 means not specified" won't work here. Instead, virDomainNetDef also has a sndbuf_specified, which defaults to 0, but is set to 1 if some value was given.
The sndbuf value is put inside a<tune> element of each<interface> in the domain. The intent is that further tunable settings will also be placed inside this elemnent.
<interface type='network'> ... <tune> <sndbuf>0</sndbuf> ... </tune> </interface> ---
Note that in qemuBuildHostNetStr() I have moved
if (vhostfd&& *vhostfd) { virBufferVSprintf(&buf, ",vhost=on,vhostfd=%s", vhostfd);
into a newly created "if (is_tap) { ... }" block. This always should have been inside such a conditional, but none existed until now. (I can make this a separate patch if anyone wants, but it seemed so simple and obvious that I took the slothenly way out :-)
Also, as with the vhost patch, I didn't get the html docs updated for this addition either. I will add both in a single followup patch next week.
docs/schemas/domain.rng | 10 ++++++++++ src/conf/domain_conf.c | 31 +++++++++++++++++++++++++++++-- src/conf/domain_conf.h | 4 ++++ src/qemu/qemu_command.c | 19 +++++++++++++++++-- 4 files changed, 60 insertions(+), 4 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 04ed502..5d1b8cf 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -2280,6 +2280,7 @@ err_exit: static virDomainNetDefPtr virDomainNetDefParseXML(virCapsPtr caps, xmlNodePtr node, + xmlXPathContextPtr ctxt, int flags ATTRIBUTE_UNUSED) { virDomainNetDefPtr def; xmlNodePtr cur; @@ -2298,15 +2299,19 @@ virDomainNetDefParseXML(virCapsPtr caps, char *internal = NULL; char *devaddr = NULL; char *mode = NULL; + unsigned long sndbuf; virNWFilterHashTablePtr filterparams = NULL; virVirtualPortProfileParams virtPort; bool virtPortParsed = false; + xmlNodePtr oldnode = ctxt->node;
if (VIR_ALLOC(def)< 0) { virReportOOMError(); return NULL; }
+ ctxt->node = node; + type = virXMLPropString(node, "type"); if (type != NULL) { if ((int)(def->type = virDomainNetTypeFromString(type))< 0) { @@ -2593,7 +2598,13 @@ virDomainNetDefParseXML(virCapsPtr caps, } }
+ if (virXPathULong("string(./tune/sndbuf)", ctxt,&sndbuf)>= 0) { + def->tune.sndbuf = sndbuf; + def->tune.sndbuf_specified = 1; + } This is parsing a 'long'
@@ -6315,6 +6336,12 @@ virDomainNetDefFormat(virBufferPtr buf, VIR_FREE(attrs); }
+ if (def->tune.sndbuf_specified) { + virBufferAddLit(buf, "<tune>\n"); + virBufferVSprintf(buf, "<sndbuf>%d</sndbuf>\n", def->tune.sndbuf); + virBufferAddLit(buf, "</tune>\n"); + } But this is printing an int
Yeah, the problem was that I needed to generate the value both with a virXPath*() function (which only had something to return a long) as well as with a conversion from string (and virStrToLong_*() could only give me an int. So either way I was out of luck. eblake came to the rescue with a patch that rounds out both function sets to do both types, so I'm going to make it an unsigned long.
Also bitfields should be 'unsigned' rather than int, otherwise you're actually storing 0 and -1 as possible values, rather than 0& 1. Or alternatively could just use a 'bool' here. Since there's only one bitfield, padding means we're not saving any space.
My first instinct was to make it a bool, but I saw cases of others using int bitfields for the same purpose, and nobody using bool; I decided to conform to existing practice. If it's okay with you to make it a bool, that's what I'll do in v2.
participants (3)
-
Daniel P. Berrange
-
Eric Blake
-
Laine Stump