These classes can borrow unused bandwidth. Basically,
only egress qdsics can have classes, therefore we can
do this kind of traffic shaping only on host's outgoing,
that is domain's incoming traffic.
---
src/lxc/lxc_process.c | 3 +-
src/network/bridge_driver.c | 3 +-
src/qemu/qemu_command.c | 3 +-
src/qemu/qemu_driver.c | 2 +-
src/util/virnetdevbandwidth.c | 93 +++++++++++++++++++++++++++++++++++++---
src/util/virnetdevbandwidth.h | 4 +-
src/util/virnetdevmacvlan.c | 2 +-
7 files changed, 97 insertions(+), 13 deletions(-)
diff --git a/src/lxc/lxc_process.c b/src/lxc/lxc_process.c
index 50c61c5..3e7fcb8 100644
--- a/src/lxc/lxc_process.c
+++ b/src/lxc/lxc_process.c
@@ -341,7 +341,8 @@ static int virLXCProcessSetupInterfaceBridged(virConnectPtr conn,
goto cleanup;
if (virNetDevBandwidthSet(net->ifname,
- virDomainNetGetActualBandwidth(net)) < 0) {
+ virDomainNetGetActualBandwidth(net),
+ false) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("cannot set bandwidth limits on %s"),
net->ifname);
diff --git a/src/network/bridge_driver.c b/src/network/bridge_driver.c
index 00cffee..58f1d2e 100644
--- a/src/network/bridge_driver.c
+++ b/src/network/bridge_driver.c
@@ -2284,7 +2284,8 @@ networkStartNetworkVirtual(struct network_driver *driver,
VIR_FORCE_CLOSE(tapfd);
}
- if (virNetDevBandwidthSet(network->def->bridge, network->def->bandwidth)
< 0) {
+ if (virNetDevBandwidthSet(network->def->bridge,
+ network->def->bandwidth, true) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("cannot set bandwidth limits on %s"),
network->def->bridge);
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 9009bd2..e10eb09 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -292,7 +292,8 @@ qemuNetworkIfaceConnect(virDomainDefPtr def,
if (tapfd >= 0 &&
virNetDevBandwidthSet(net->ifname,
- virDomainNetGetActualBandwidth(net)) < 0) {
+ virDomainNetGetActualBandwidth(net),
+ false) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("cannot set bandwidth limits on %s"),
net->ifname);
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index d449579..e6ae3fd 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -9034,7 +9034,7 @@ qemuDomainSetInterfaceParameters(virDomainPtr dom,
sizeof(*newBandwidth->out));
}
- if (virNetDevBandwidthSet(net->ifname, newBandwidth) < 0) {
+ if (virNetDevBandwidthSet(net->ifname, newBandwidth, false) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("cannot set bandwidth limits on %s"),
device);
diff --git a/src/util/virnetdevbandwidth.c b/src/util/virnetdevbandwidth.c
index 49fc425..71c272e 100644
--- a/src/util/virnetdevbandwidth.c
+++ b/src/util/virnetdevbandwidth.c
@@ -45,17 +45,21 @@ virNetDevBandwidthFree(virNetDevBandwidthPtr def)
* virNetDevBandwidthSet:
* @ifname: on which interface
* @bandwidth: rates to set (may be NULL)
+ * @hierarchical_class: whether to create hierarchical class
*
* This function enables QoS on specified interface
* and set given traffic limits for both, incoming
* and outgoing traffic. Any previous setting get
- * overwritten.
+ * overwritten. If @hierarchical_class is TRUE, create
+ * hierarchical class. It is used to guarantee minimal
+ * throughput ('floor' attribute in NIC).
*
* Return 0 on success, -1 otherwise.
*/
int
virNetDevBandwidthSet(const char *ifname,
- virNetDevBandwidthPtr bandwidth)
+ virNetDevBandwidthPtr bandwidth,
+ bool hierarchical_class)
{
int ret = -1;
virCommandPtr cmd = NULL;
@@ -71,7 +75,7 @@ virNetDevBandwidthSet(const char *ifname,
virNetDevBandwidthClear(ifname);
- if (bandwidth->in) {
+ if (bandwidth->in && bandwidth->in->average) {
if (virAsprintf(&average, "%llukbps", bandwidth->in->average)
< 0)
goto cleanup;
if (bandwidth->in->peak &&
@@ -83,15 +87,89 @@ virNetDevBandwidthSet(const char *ifname,
cmd = virCommandNew(TC);
virCommandAddArgList(cmd, "qdisc", "add", "dev",
ifname, "root",
- "handle", "1:", "htb",
"default", "1", NULL);
+ "handle", "1:", "htb",
"default",
+ hierarchical_class ? "2" : "1", NULL);
if (virCommandRun(cmd, NULL) < 0)
goto cleanup;
+ /* If we are creating a hierarchical class, all non guaranteed traffic
+ * goes to the 1:2 class which will adjust 'rate' dynamically as NICs
+ * with guaranteed throughput are plugged and unplugged. Class 1:1
+ * exists so we don't exceed the maximum limit for the network. For each
+ * NIC with guaranteed throughput a separate classid will be created.
+ * NB '1:' is just a shorter notation of '1:0'.
+ *
+ * To get a picture how this works:
+ *
+ * +-----+ +---------+ +-----------+ +-----------+ +-----+
+ * | | | qdisc | | class 1:1 | | class 1:2 | | |
+ * | NIC | | def 1:2 | | rate | | rate | | sfq |
+ * | | --> | | --> | peak | -+-> | peak | --> |
|
+ * +-----+ +---------+ +-----------+ | +-----------+ +-----+
+ * |
+ * | +-----------+ +-----+
+ * | | class 1:3 | | |
+ * | | rate | | sfq |
+ * +-> | peak | --> |
|
+ * | +-----------+ +-----+
+ * ...
+ * | +-----------+ +-----+
+ * | | class 1:n | | |
+ * | | rate | | sfq |
+ * +-> | peak | --> |
|
+ * +-----------+ +-----+
+ *
+ * After the routing decision, when is it clear a packet is to be sent
+ * via a particular NIC, it is sent to the root qdisc (queueing
+ * discipline). In this case HTB (Hierarchical Token Bucket). It has
+ * only one direct child class (with id 1:1) which shapes the overall
+ * rate that is sent through the NIC. This class has at least one child
+ * (1:2) which is meant for all non-privileged (non guaranteed) traffic
+ * from all domains. Then, for each interface with guaranteed
+ * throughput, a separate class (1:n) is created. Imagine a class is a
+ * box. Whenever a packet ends up in a class it is stored in this box
+ * until the kernel sends it, then it is removed from box. Packets are
+ * placed into boxes based on rules (filters) - e.g. depending on
+ * destination IP/MAC address. If there is no rule to be applied, the
+ * root qdisc has a default where such packets go (1:2 in this case).
+ * Packets come in over and over again and boxes get filled more and
+ * more. Imagine that kernel sends packets just once a second. So it
+ * starts to traverse through this tree. It starts with the root qdisc
+ * and through 1:1 it gets to 1:2. It sends packets up to 1:2's
'rate'.
+ * Then it moves to 1:3 and again sends packets up to 1:3's 'rate'.
The
+ * whole process is repeated until 1:n is processed. So now we have
+ * ensured each class its guaranteed bandwidth. If the sum of sent data
+ * doesn't exceed the 'rate' in 1:1 class, we can go further and
send
+ * more packets. The rest of available bandwidth is distributed to the
+ * 1:2,1:3...1:n classes by ratio of their 'rate'. As soon as the root
+ * 'rate' limit is reached or there are no more packets to send, we stop
+ * sending and wait another second. Each class has an SFQ qdisc which
+ * shuffles packets in boxes stochastically, so one sender cannot
+ * starve others.
+ *
+ * Therefore, whenever we want to plug in a new guaranteed interface, we
+ * need to create a new class and adjust the 'rate' of the 1:2 class.
+ * When unplugging we do the exact opposite - remove the associated
+ * class, and adjust the 'rate'.
+ *
+ * This description is rather long, but it is still a good idea to read
+ * it before you dig into the code.
+ */
+ if (hierarchical_class) {
+ virCommandFree(cmd);
+ cmd = virCommandNew(TC);
+ virCommandAddArgList(cmd, "class", "add",
"dev", ifname, "parent",
+ "1:", "classid", "1:1",
"htb", "rate", average,
+ "ceil", peak ? peak : average, NULL);
+ if (virCommandRun(cmd, NULL) < 0)
+ goto cleanup;
+ }
virCommandFree(cmd);
cmd = virCommandNew(TC);
virCommandAddArgList(cmd,"class", "add", "dev",
ifname, "parent",
- "1:", "classid", "1:1",
"htb", NULL);
- virCommandAddArgList(cmd, "rate", average, NULL);
+ hierarchical_class ? "1:1" : "1:",
"classid",
+ hierarchical_class ? "1:2" : "1:1",
"htb",
+ "rate", average, NULL);
if (peak)
virCommandAddArgList(cmd, "ceil", peak, NULL);
@@ -104,7 +182,8 @@ virNetDevBandwidthSet(const char *ifname,
virCommandFree(cmd);
cmd = virCommandNew(TC);
virCommandAddArgList(cmd, "qdisc", "add", "dev",
ifname, "parent",
- "1:1", "handle", "2:",
"sfq", "perturb",
+ hierarchical_class ? "1:2" : "1:1",
+ "handle", "2:", "sfq",
"perturb",
"10", NULL);
if (virCommandRun(cmd, NULL) < 0)
diff --git a/src/util/virnetdevbandwidth.h b/src/util/virnetdevbandwidth.h
index 35f8b89..d308ab2 100644
--- a/src/util/virnetdevbandwidth.h
+++ b/src/util/virnetdevbandwidth.h
@@ -42,7 +42,9 @@ struct _virNetDevBandwidth {
void virNetDevBandwidthFree(virNetDevBandwidthPtr def);
-int virNetDevBandwidthSet(const char *ifname, virNetDevBandwidthPtr bandwidth)
+int virNetDevBandwidthSet(const char *ifname,
+ virNetDevBandwidthPtr bandwidth,
+ bool hierarchical_class)
ATTRIBUTE_NONNULL(1) ATTRIBUTE_RETURN_CHECK;
int virNetDevBandwidthClear(const char *ifname)
ATTRIBUTE_NONNULL(1);
diff --git a/src/util/virnetdevmacvlan.c b/src/util/virnetdevmacvlan.c
index d8e646a..657c484 100644
--- a/src/util/virnetdevmacvlan.c
+++ b/src/util/virnetdevmacvlan.c
@@ -925,7 +925,7 @@ create_name:
rc = 0;
}
- if (virNetDevBandwidthSet(cr_ifname, bandwidth) < 0) {
+ if (virNetDevBandwidthSet(cr_ifname, bandwidth, false) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("cannot set bandwidth limits on %s"),
cr_ifname);
--
1.7.8.6