From: Vineeth Pillai <viremana(a)linux.microsoft.com>
Signed-off-by: Vineeth Pillai <viremana(a)linux.microsoft.com>
Signed-off-by: Praveen K Paladugu <prapal(a)linux.microsoft.com>
---
po/POTFILES.in | 1 +
src/ch/ch_cgroup.c | 457 ++++++++++++++++++++++++++++++++++++++++++++
src/ch/ch_cgroup.h | 45 +++++
src/ch/ch_conf.c | 2 +
src/ch/ch_conf.h | 4 +-
src/ch/ch_domain.c | 33 ++++
src/ch/ch_domain.h | 3 +-
src/ch/ch_monitor.c | 125 ++++++++++--
src/ch/ch_monitor.h | 54 +++++-
src/ch/ch_process.c | 288 +++++++++++++++++++++++++++-
src/ch/ch_process.h | 3 +
src/ch/meson.build | 2 +
12 files changed, 991 insertions(+), 26 deletions(-)
create mode 100644 src/ch/ch_cgroup.c
create mode 100644 src/ch/ch_cgroup.h
diff --git a/po/POTFILES.in b/po/POTFILES.in
index b554cf08ca..3a8db501bc 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -19,6 +19,7 @@
@SRCDIR(a)src/bhyve/bhyve_parse_command.c
@SRCDIR(a)src/bhyve/bhyve_process.c
@SRCDIR(a)src/ch/ch_conf.c
+@SRCDIR(a)src/ch/ch_cgroup.c
@SRCDIR(a)src/ch/ch_domain.c
@SRCDIR(a)src/ch/ch_driver.c
@SRCDIR(a)src/ch/ch_monitor.c
diff --git a/src/ch/ch_cgroup.c b/src/ch/ch_cgroup.c
new file mode 100644
index 0000000000..6be2184cf1
--- /dev/null
+++ b/src/ch/ch_cgroup.c
@@ -0,0 +1,457 @@
+/*
+ * ch_cgroup.c: CH cgroup management
+ *
+ * Copyright Microsoft Corp. 2020-2021
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see
+ * <
http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "ch_cgroup.h"
+#include "ch_domain.h"
+#include "ch_process.h"
+#include "vircgroup.h"
+#include "virlog.h"
+#include "viralloc.h"
+#include "virerror.h"
+#include "domain_audit.h"
+#include "domain_cgroup.h"
+#include "virscsi.h"
+#include "virstring.h"
+#include "virfile.h"
+#include "virtypedparam.h"
+#include "virnuma.h"
+#include "virdevmapper.h"
+#include "virutil.h"
+
+#define VIR_FROM_THIS VIR_FROM_CH
+
+VIR_LOG_INIT("ch.ch_cgroup");
+
+/**
+ * chSetupBlkioCgroup:
+ * @vm: domain object
+ *
+ * Apply the domain's block I/O tuning to its cgroup. When the blkio
+ * controller is missing this is only an error if the domain actually
+ * asks for blkio tuning (a weight or per-device settings).
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+chSetupBlkioCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    bool tuningRequested = vm->def->blkio.weight || vm->def->blkio.ndevices;
+
+    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO))
+        return virDomainCgroupSetupBlkio(priv->cgroup, vm->def->blkio);
+
+    /* No controller: harmless unless tuning was requested. */
+    if (!tuningRequested)
+        return 0;
+
+    virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                   _("Block I/O tuning is not available on this host"));
+    return -1;
+}
+
+
+/**
+ * chSetupMemoryCgroup:
+ * @vm: domain object
+ *
+ * Apply the domain's memory tuning to its cgroup. A missing memory
+ * controller is only an error when one of the hard, soft or swap hard
+ * limits is set.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+chSetupMemoryCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    bool limitsRequested;
+
+    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY))
+        return virDomainCgroupSetupMemtune(priv->cgroup, vm->def->mem);
+
+    limitsRequested = virMemoryLimitIsSet(vm->def->mem.hard_limit) ||
+                      virMemoryLimitIsSet(vm->def->mem.soft_limit) ||
+                      virMemoryLimitIsSet(vm->def->mem.swap_hard_limit);
+
+    if (!limitsRequested)
+        return 0;
+
+    virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                   _("Memory cgroup is not available on this host"));
+    return -1;
+}
+
+/**
+ * chSetupCpusetCgroup:
+ * @vm: domain object
+ *
+ * Enable memory migration on the domain's cpuset cgroup, if the cpuset
+ * controller is present.
+ *
+ * Returns 0 on success or when there is no cpuset controller, -1 on error.
+ */
+static int
+chSetupCpusetCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
+        virCgroupSetCpusetMemoryMigrate(priv->cgroup, true) < 0)
+        return -1;
+
+    return 0;
+}
+
+
+/**
+ * chSetupCpuCgroup:
+ * @vm: domain object
+ *
+ * Apply the configured CPU shares to the domain cgroup. Nothing is done
+ * when no shares are specified; requesting shares without a cpu
+ * controller is an error.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+chSetupCpuCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    if (!vm->def->cputune.sharesSpecified)
+        return 0;
+
+    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("CPU tuning is not available on this host"));
+        return -1;
+    }
+
+    if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
+        return -1;
+
+    return 0;
+}
+
+
+/**
+ * chInitCgroup:
+ * @vm: domain object
+ * @nnicindexes: number of entries in @nicindexes
+ * @nicindexes: NIC interface indexes handed to virCgroupNewMachine()
+ *
+ * Create the machine cgroup for @vm and store it in the domain's private
+ * data. Unprivileged drivers and hosts without cgroups are skipped. If
+ * the domain has no resource definition, one using the "/machine"
+ * partition is installed.
+ *
+ * Returns 0 on success, when cgroups are skipped, or when creation fails
+ * with an error virCgroupNewIgnoreError() accepts (priv->cgroup is then
+ * NULL); -1 on error.
+ */
+static int
+chInitCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
+
+    if (!priv->driver->privileged)
+        return 0;
+
+    if (!virCgroupAvailable())
+        return 0;
+
+    /* virCgroupFree() cannot reset our pointer. Clear it so that the
+     * return paths below never leave a freed cgroup in priv->cgroup for
+     * chSetupCgroup() or chRemoveCgroup() to use. */
+    virCgroupFree(priv->cgroup);
+    priv->cgroup = NULL;
+
+    if (!vm->def->resource) {
+        virDomainResourceDef *res;
+
+        res = g_new0(virDomainResourceDef, 1);
+
+        res->partition = g_strdup("/machine");
+
+        vm->def->resource = res;
+    }
+
+    if (vm->def->resource->partition[0] != '/') {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("Resource partition '%s' must start with '/'"),
+                       vm->def->resource->partition);
+        return -1;
+    }
+
+    if (virCgroupNewMachine(priv->machineName,
+                            "ch",
+                            vm->def->uuid,
+                            NULL,
+                            vm->pid,
+                            false,
+                            nnicindexes, nicindexes,
+                            vm->def->resource->partition,
+                            cfg->cgroupControllers,
+                            0, /* maxThreadsPerProc */
+                            &priv->cgroup) < 0) {
+        if (virCgroupNewIgnoreError())
+            return 0;
+
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * chRestoreCgroupState:
+ * @vm: domain object
+ *
+ * Best-effort restoration of the cpuset settings of the domain cgroup
+ * and of its vcpu, iothread and emulator thread sub-cgroups. Does nothing
+ * when NUMA is unavailable or the cpuset controller is missing. Failures
+ * are not propagated to the caller.
+ */
+static void
+chRestoreCgroupState(virDomainObj * vm)
+{
+    g_autofree char *mem_mask = NULL;
+    g_autofree char *nodeset = NULL;
+    virCHDomainObjPrivate *priv = vm->privateData;
+    size_t i = 0;
+
+    g_autoptr(virBitmap) all_nodes = NULL;
+    virCgroup *cgroup_temp = NULL;
+
+    if (!virNumaIsAvailable() ||
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
+        return;
+
+    if (!(all_nodes = virNumaGetHostMemoryNodeset()))
+        goto error;
+
+    if (!(mem_mask = virBitmapFormat(all_nodes)))
+        goto error;
+
+    if ((virCgroupHasEmptyTasks(priv->cgroup,
+                                VIR_CGROUP_CONTROLLER_CPUSET)) <= 0)
+        goto error;
+
+    if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
+        goto error;
+
+    for (i = 0; i < virDomainDefGetVcpusMax(vm->def); i++) {
+        virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, i);
+
+        if (!vcpu->online)
+            continue;
+
+        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i,
+                               false, &cgroup_temp) < 0 ||
+            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
+            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
+            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
+            goto cleanup;
+
+        /* Free AND reset: both pointers are released again at cleanup or
+         * scope exit, so leaving them dangling would double free when a
+         * later call fails before reassigning them. */
+        g_clear_pointer(&nodeset, g_free);
+        g_clear_pointer(&cgroup_temp, virCgroupFree);
+    }
+
+    for (i = 0; i < vm->def->niothreadids; i++) {
+        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_IOTHREAD,
+                               vm->def->iothreadids[i]->iothread_id,
+                               false, &cgroup_temp) < 0 ||
+            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
+            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
+            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
+            goto cleanup;
+
+        g_clear_pointer(&nodeset, g_free);
+        g_clear_pointer(&cgroup_temp, virCgroupFree);
+    }
+
+    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
+                           false, &cgroup_temp) < 0 ||
+        virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
+        virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
+        virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
+        goto cleanup;
+
+ cleanup:
+    virCgroupFree(cgroup_temp);
+    return;
+
+ error:
+    virResetLastError();
+    VIR_DEBUG("Couldn't restore cgroups to meaningful state");
+    goto cleanup;
+}
+
+/**
+ * chConnectCgroup:
+ * @vm: domain object
+ *
+ * Detect the existing machine cgroup of a running domain, store it in
+ * the private data and try to restore its cpuset state. Unprivileged
+ * drivers and hosts without cgroups are skipped.
+ *
+ * Returns 0 on success or when cgroups are skipped, -1 if detection fails.
+ */
+int
+chConnectCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
+
+    if (!priv->driver->privileged)
+        return 0;
+
+    if (!virCgroupAvailable())
+        return 0;
+
+    /* Reset after freeing so a failed detection below does not leave a
+     * dangling pointer behind in the private data. */
+    virCgroupFree(priv->cgroup);
+    priv->cgroup = NULL;
+
+    if (virCgroupNewDetectMachine(vm->def->name,
+                                  "ch",
+                                  vm->pid,
+                                  cfg->cgroupControllers,
+                                  priv->machineName, &priv->cgroup) < 0)
+        return -1;
+
+    chRestoreCgroupState(vm);
+    return 0;
+}
+
+/**
+ * chSetupCgroup:
+ * @vm: domain object
+ * @nnicindexes: number of entries in @nicindexes
+ * @nicindexes: NIC interface indexes forwarded to chInitCgroup()
+ *
+ * Create the domain cgroup and apply the blkio, memory, cpu and cpuset
+ * settings, in that order. The domain process must already be running.
+ *
+ * Returns 0 on success or when no cgroup was created, -1 on error.
+ */
+int
+chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    if (!vm->pid) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("Cannot setup cgroups until process is started"));
+        return -1;
+    }
+
+    if (chInitCgroup(vm, nnicindexes, nicindexes) < 0)
+        return -1;
+
+    /* chInitCgroup() may succeed without creating a cgroup. */
+    if (!priv->cgroup)
+        return 0;
+
+    if (chSetupBlkioCgroup(vm) < 0 ||
+        chSetupMemoryCgroup(vm) < 0 ||
+        chSetupCpuCgroup(vm) < 0 ||
+        chSetupCpusetCgroup(vm) < 0)
+        return -1;
+
+    return 0;
+}
+
+/**
+ * chSetupCgroupVcpuBW:
+ * @cgroup: cgroup to configure
+ * @period: CPU bandwidth period
+ * @quota: CPU bandwidth quota
+ *
+ * Thin wrapper around virCgroupSetupCpuPeriodQuota().
+ *
+ * Returns the value returned by virCgroupSetupCpuPeriodQuota().
+ */
+int
+chSetupCgroupVcpuBW(virCgroup * cgroup,
+                    unsigned long long period, long long quota)
+{
+    int rc = virCgroupSetupCpuPeriodQuota(cgroup, period, quota);
+
+    return rc;
+}
+
+
+/**
+ * chSetupCgroupCpusetCpus:
+ * @cgroup: cgroup to configure
+ * @cpumask: CPU bitmap to apply
+ *
+ * Thin wrapper around virCgroupSetupCpusetCpus().
+ *
+ * Returns the value returned by virCgroupSetupCpusetCpus().
+ */
+int
+chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask)
+{
+    int rc = virCgroupSetupCpusetCpus(cgroup, cpumask);
+
+    return rc;
+}
+
+/**
+ * chSetupGlobalCpuCgroup:
+ * @vm: domain object
+ *
+ * Apply the domain-wide CPU bandwidth (global period/quota) to the
+ * domain cgroup. Requesting bandwidth limits without a cpu controller is
+ * an error.
+ *
+ * Returns 0 on success or when there is nothing to do, -1 on error.
+ */
+int
+chSetupGlobalCpuCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    unsigned long long period = vm->def->cputune.global_period;
+    long long quota = vm->def->cputune.global_quota;
+    bool wantBandwidth = period || quota;
+    g_autofree char *mem_mask = NULL;
+    virDomainNumatuneMemMode mem_mode;
+
+    if (wantBandwidth &&
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("cgroup cpu is required for scheduler tuning"));
+        return -1;
+    }
+
+    /*
+     * Without the CPU controller no period/quota can be needed at this
+     * point, and without CPUSET as well there is nothing left to set up.
+     */
+    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
+        return 0;
+
+    /* The formatted nodeset is not used further in this function; only a
+     * formatting failure under strict numatune mode is treated as fatal. */
+    if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
+        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
+        virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
+                                            priv->autoNodeset,
+                                            &mem_mask, -1) < 0)
+        return -1;
+
+    if (wantBandwidth &&
+        chSetupCgroupVcpuBW(priv->cgroup, period, quota) < 0)
+        return -1;
+
+    return 0;
+}
+
+
+/**
+ * chRemoveCgroup:
+ * @vm: domain object
+ *
+ * Terminate the machine and remove the domain cgroup. A failure to
+ * terminate the machine is only logged.
+ *
+ * Returns 0 when the domain has no cgroup, otherwise the value returned
+ * by virCgroupRemove().
+ */
+int
+chRemoveCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    if (!priv->cgroup)
+        return 0; /* Not supported, so claim success */
+
+    if (virCgroupTerminateMachine(priv->machineName) < 0 &&
+        !virCgroupNewIgnoreError())
+        VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
+
+    return virCgroupRemove(priv->cgroup);
+}
+
+
+/**
+ * chCgroupEmulatorAllNodesDataFree:
+ * @data: rollback data to release, may be NULL
+ *
+ * Free the emulator cgroup handle, the saved memory mask and @data itself.
+ */
+static void
+chCgroupEmulatorAllNodesDataFree(chCgroupEmulatorAllNodesData * data)
+{
+    if (data) {
+        virCgroupFree(data->emulatorCgroup);
+        g_free(data->emulatorMemMask);
+        g_free(data);
+    }
+}
+
+
+/**
+ * chCgroupEmulatorAllNodesAllow:
+ * @cgroup: domain cgroup pointer
+ * @retData: filled with structure used to roll back the operation
+ *
+ * Temporarily allows all host NUMA memory nodes for the cloud hypervisor
+ * emulator thread cgroup. The previous cpuset.mems value is saved in
+ * @retData so that chCgroupEmulatorAllNodesRestore can revert it.
+ *
+ * When NUMA is unavailable or the cpuset controller is missing, nothing
+ * is changed and @retData is left untouched.
+ *
+ * Returns 0 on success -1 on error
+ */
+int
+chCgroupEmulatorAllNodesAllow(virCgroup * cgroup,
+                              chCgroupEmulatorAllNodesData ** retData)
+{
+    chCgroupEmulatorAllNodesData *saved = NULL;
+    g_autofree char *all_nodes_str = NULL;
+
+    g_autoptr(virBitmap) all_nodes = NULL;
+
+    if (!virNumaIsAvailable() ||
+        !virCgroupHasController(cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
+        return 0;
+
+    if (!(all_nodes = virNumaGetHostMemoryNodeset()))
+        return -1;
+
+    if (!(all_nodes_str = virBitmapFormat(all_nodes)))
+        return -1;
+
+    saved = g_new0(chCgroupEmulatorAllNodesData, 1);
+
+    if (virCgroupNewThread(cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
+                           false, &saved->emulatorCgroup) < 0 ||
+        virCgroupGetCpusetMems(saved->emulatorCgroup,
+                               &saved->emulatorMemMask) < 0 ||
+        virCgroupSetCpusetMems(saved->emulatorCgroup, all_nodes_str) < 0) {
+        chCgroupEmulatorAllNodesDataFree(saved);
+        return -1;
+    }
+
+    /* Ownership of the rollback data moves to the caller. */
+    *retData = saved;
+    return 0;
+}
+
+
+/**
+ * chCgroupEmulatorAllNodesRestore:
+ * @data: data structure created by chCgroupEmulatorAllNodesAllow
+ *
+ * Writes the saved memory mask back to the emulator cgroup, undoing
+ * chCgroupEmulatorAllNodesAllow, and frees @data. The last reported
+ * error is preserved across the rollback. A NULL @data is a no-op.
+ */
+void
+chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data)
+{
+    virError *savedErr;
+
+    if (data == NULL)
+        return;
+
+    virErrorPreserveLast(&savedErr);
+    virCgroupSetCpusetMems(data->emulatorCgroup, data->emulatorMemMask);
+    virErrorRestore(&savedErr);
+
+    chCgroupEmulatorAllNodesDataFree(data);
+}
diff --git a/src/ch/ch_cgroup.h b/src/ch/ch_cgroup.h
new file mode 100644
index 0000000000..0152b5477c
--- /dev/null
+++ b/src/ch/ch_cgroup.h
@@ -0,0 +1,45 @@
+/*
+ * ch_cgroup.h: CH cgroup management
+ *
+ * Copyright Microsoft Corp. 2020-2021
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see
+ * <
http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "virusb.h"
+#include "vircgroup.h"
+#include "domain_conf.h"
+#include "ch_conf.h"
+
+int chConnectCgroup(virDomainObj * vm);
+int chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes);
+int chSetupCgroupVcpuBW(virCgroup * cgroup,
+ unsigned long long period, long long quota);
+int chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask);
+int chSetupGlobalCpuCgroup(virDomainObj * vm);
+int chRemoveCgroup(virDomainObj * vm);
+
+typedef struct _chCgroupEmulatorAllNodesData chCgroupEmulatorAllNodesData;
+
+struct _chCgroupEmulatorAllNodesData { /* rollback state for chCgroupEmulatorAllNodesAllow/Restore */
+ virCgroup *emulatorCgroup; /* emulator thread cgroup obtained via virCgroupNewThread() */
+ char *emulatorMemMask; /* cpuset.mems value read before the change; written back on restore */
+};
+
+int chCgroupEmulatorAllNodesAllow(virCgroup * cgroup,
+ chCgroupEmulatorAllNodesData ** data);
+void chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data);
diff --git a/src/ch/ch_conf.c b/src/ch/ch_conf.c
index ed0fffe5d6..7f70452296 100644
--- a/src/ch/ch_conf.c
+++ b/src/ch/ch_conf.c
@@ -141,6 +141,8 @@ virCHDriverConfigNew(bool privileged)
if (!(cfg = virObjectNew(virCHDriverConfigClass)))
return NULL;
+ cfg->cgroupControllers = -1; /* Auto detect */
+
if (privileged) {
if (virGetUserID(CH_USER, &cfg->user) < 0)
return NULL;
diff --git a/src/ch/ch_conf.h b/src/ch/ch_conf.h
index 49f286f97a..19deb8e568 100644
--- a/src/ch/ch_conf.h
+++ b/src/ch/ch_conf.h
@@ -35,11 +35,13 @@ struct _virCHDriverConfig {
char *stateDir;
char *logDir;
-
+ int cgroupControllers;
uid_t user;
gid_t group;
};
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(virCHDriverConfig, virObjectUnref);
+
struct _virCHDriver
{
virMutex lock;
diff --git a/src/ch/ch_domain.c b/src/ch/ch_domain.c
index e1030800aa..d0aaeed1f4 100644
--- a/src/ch/ch_domain.c
+++ b/src/ch/ch_domain.c
@@ -326,6 +326,39 @@ chValidateDomainDeviceDef(const virDomainDeviceDef *dev,
_("Serial can only be enabled for a PTY"));
return -1;
}
+ return 0;
+}
+/**
+ * virCHDomainRefreshThreadInfo:
+ * @vm: domain object
+ *
+ * Refresh the monitor's thread list and record the thread id of every
+ * detected vcpu thread in the matching vcpu's private data. Threads
+ * whose vcpu index does not map to a vcpu of the domain are skipped.
+ * A mismatch between detected and expected vcpus is only logged.
+ *
+ * Returns 0.
+ */
+int
+virCHDomainRefreshThreadInfo(virDomainObj *vm)
+{
+    size_t maxvcpus = virDomainDefGetVcpusMax(vm->def);
+    virCHMonitorThreadInfo *info = NULL;
+    size_t nthreads;
+    size_t ncpus = 0;
+    size_t i;
+
+    nthreads = virCHMonitorGetThreadInfo(virCHDomainGetMonitor(vm),
+                                         true, &info);
+
+    for (i = 0; i < nthreads; i++) {
+        virCHDomainVcpuPrivate *vcpupriv;
+        virDomainVcpuDef *vcpu;
+        virCHMonitorCPUInfo *vcpuInfo;
+
+        if (info[i].type != virCHThreadTypeVcpu)
+            continue;
+
+        /* TODO: hotplug support */
+        vcpuInfo = &info[i].vcpuInfo;
+
+        /* The index comes from parsing a thread name, so it may not
+         * correspond to any vcpu of this domain. */
+        if (!(vcpu = virDomainDefGetVcpu(vm->def, vcpuInfo->cpuid))) {
+            VIR_WARN("Ignoring vcpu thread with unexpected index %d",
+                     vcpuInfo->cpuid);
+            continue;
+        }
+
+        vcpupriv = CH_DOMAIN_VCPU_PRIVATE(vcpu);
+        vcpupriv->tid = vcpuInfo->tid;
+        ncpus++;
+    }
+
+    /* TODO: Remove the warning when hotplug is implemented. */
+    if (ncpus != maxvcpus)
+        VIR_WARN("Mismatch in the number of cpus, expected: %zu, actual: %zu",
+                 maxvcpus, ncpus);
     return 0;
 }
diff --git a/src/ch/ch_domain.h b/src/ch/ch_domain.h
index 3ac3421015..2ce3e2cef3 100644
--- a/src/ch/ch_domain.h
+++ b/src/ch/ch_domain.h
@@ -89,7 +89,8 @@ virCHDomainObjBeginJob(virDomainObj *obj, enum virCHDomainJob job)
void
virCHDomainObjEndJob(virDomainObj *obj);
-int virCHDomainRefreshVcpuInfo(virDomainObj *vm);
+int virCHDomainRefreshThreadInfo(virDomainObj *vm);
+
pid_t virCHDomainGetVcpuPid(virDomainObj *vm, unsigned int vcpuid);
bool virCHDomainHasVcpuPids(virDomainObj *vm);
diff --git a/src/ch/ch_monitor.c b/src/ch/ch_monitor.c
index c0ae031200..095779cb3f 100644
--- a/src/ch/ch_monitor.c
+++ b/src/ch/ch_monitor.c
@@ -41,6 +41,7 @@ VIR_LOG_INIT("ch.ch_monitor");
static virClass *virCHMonitorClass;
static void virCHMonitorDispose(void *obj);
+static void virCHMonitorThreadInfoFree(virCHMonitor *mon);
static int virCHMonitorOnceInit(void)
{
@@ -571,6 +572,7 @@ static void virCHMonitorDispose(void *opaque)
virCHMonitor *mon = opaque;
VIR_DEBUG("mon=%p", mon);
+ virCHMonitorThreadInfoFree(mon);
virObjectUnref(mon->vm);
}
@@ -736,6 +738,114 @@ virCHMonitorGet(virCHMonitor *mon, const char *endpoint,
virJSONValue **response
return ret;
}
+/**
+ * virCHMonitorGetInfo:
+ * @mon: Pointer to the monitor
+ * @info: Location handed to virCHMonitorGet() to receive the response
+ *
+ * Query the URL_VM_INFO endpoint through virCHMonitorGet() and store
+ * the VM info in @info.
+ *
+ * Returns the result of virCHMonitorGet(): 0 on success.
+ */
+int
+virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
+{
+ return virCHMonitorGet(mon, URL_VM_INFO, info);
+}
+
+/**
+ * virCHMonitorThreadInfoFree:
+ * @mon: Pointer to the monitor
+ *
+ * Drop the cached thread list of @mon and reset its count to zero.
+ */
+static void
+virCHMonitorThreadInfoFree(virCHMonitor *mon)
+{
+    mon->nthreads = 0;
+    /* VIR_FREE copes with NULL, so no guard is needed. */
+    VIR_FREE(mon->threads);
+}
+
+/**
+ * virCHMonitorRefreshThreadInfo:
+ * @mon: Pointer to the monitor
+ *
+ * Rebuild the cached thread list of @mon by reading the name of every
+ * thread of the VM process from /proc/<pid>/task/<tid>/comm. Threads are
+ * classified by name: "vcpu<N>" is a vcpu thread, names starting with
+ * "_disk", "_net" or "_rng" are IO threads, anything else is an emulator
+ * thread. Threads whose name cannot be read or parsed are left out.
+ *
+ * The resulting entries are stored contiguously in mon->threads, so the
+ * first mon->nthreads elements are all valid.
+ *
+ * Returns the number of threads recorded; 0 if the thread ids of the
+ * process could not be obtained.
+ */
+static size_t
+virCHMonitorRefreshThreadInfo(virCHMonitor *mon)
+{
+    virCHMonitorThreadInfo *info = NULL;
+    g_autofree pid_t *tids = NULL;
+    virDomainObj *vm = mon->vm;
+    size_t ntids = 0;
+    size_t nfound = 0;
+    size_t i;
+
+    /* Leaves mon->threads == NULL and mon->nthreads == 0. */
+    virCHMonitorThreadInfoFree(mon);
+    if (virProcessGetPids(vm->pid, &ntids, &tids) < 0)
+        return 0;
+
+    info = g_new0(virCHMonitorThreadInfo, ntids);
+    for (i = 0; i < ntids; i++) {
+        g_autofree char *proc = NULL;
+        g_autofree char *data = NULL;
+        /* Fill the next free slot rather than slot i, so that skipped
+         * threads do not leave zeroed holes inside [0, nthreads). */
+        virCHMonitorThreadInfo *cur = &info[nfound];
+
+        proc = g_strdup_printf("/proc/%d/task/%d/comm",
+                               (int)vm->pid, (int)tids[i]);
+
+        if (virFileReadAll(proc, (1<<16), &data) < 0)
+            continue;
+
+        /* comm ends with a newline which is not part of the name. */
+        virTrimSpaces(data, NULL);
+
+        VIR_DEBUG("VM PID: %d, TID %d, COMM: %s",
+                  (int)vm->pid, (int)tids[i], data);
+        if (STRPREFIX(data, "vcpu")) {
+            int cpuid;
+            char *tmp;
+            if (virStrToLong_i(data + 4, &tmp, 0, &cpuid) < 0) {
+                VIR_WARN("Index is not specified correctly");
+                continue;
+            }
+            cur->type = virCHThreadTypeVcpu;
+            cur->vcpuInfo.tid = tids[i];
+            cur->vcpuInfo.online = true;
+            cur->vcpuInfo.cpuid = cpuid;
+            VIR_DEBUG("vcpu%d -> tid: %d", cpuid, (int)tids[i]);
+        } else if (STRPREFIX(data, "_disk") || STRPREFIX(data, "_net") ||
+                   STRPREFIX(data, "_rng")) {
+            /* Prefixes used by cloud-hypervisor for IO Threads are captured at
+             * https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/vmm/src/de... */
+            cur->type = virCHThreadTypeIO;
+            cur->ioInfo.tid = tids[i];
+            /* The size argument includes the terminating NUL, so pass the
+             * whole buffer to avoid cutting off the last character. */
+            virStrcpy(cur->ioInfo.thrName, data, sizeof(cur->ioInfo.thrName));
+        } else {
+            cur->type = virCHThreadTypeEmulator;
+            cur->emuInfo.tid = tids[i];
+            virStrcpy(cur->emuInfo.thrName, data, sizeof(cur->emuInfo.thrName));
+        }
+        nfound++;
+    }
+    mon->threads = info;
+    mon->nthreads = nfound;
+
+    return mon->nthreads;
+}
+
+/**
+ * virCHMonitorGetThreadInfo:
+ * @mon: Pointer to the monitor
+ * @refresh: Refresh thread info or not
+ * @threads: Filled with the monitor's thread array
+ *
+ * Retrieve thread info and store it in @threads. When @refresh is true
+ * the cached list is rebuilt first; otherwise the list cached by the
+ * last refresh is returned as is. The array stays owned by @mon and must
+ * not be freed by the caller.
+ *
+ * Returns the number of entries in @threads.
+ */
+size_t
+virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
+                          virCHMonitorThreadInfo **threads)
+{
+    if (refresh)
+        virCHMonitorRefreshThreadInfo(mon);
+
+    *threads = mon->threads;
+
+    /* Report the cached count as well, so that the returned length always
+     * matches the array handed out, even without a refresh. */
+    return mon->nthreads;
+}
+
int
virCHMonitorShutdownVMM(virCHMonitor *mon)
{
@@ -810,18 +920,3 @@ virCHMonitorResumeVM(virCHMonitor *mon)
{
return virCHMonitorPutNoContent(mon, URL_VM_RESUME);
}
-
-/**
- * virCHMonitorGetInfo:
- * @mon: Pointer to the monitor
- * @info: Get VM info
- *
- * Retrieve the VM info and store in @info
- *
- * Returns 0 on success.
- */
-int
-virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
-{
- return virCHMonitorGet(mon, URL_VM_INFO, info);
-}
diff --git a/src/ch/ch_monitor.h b/src/ch/ch_monitor.h
index 8ca9e17a9a..f8c3fa75e8 100644
--- a/src/ch/ch_monitor.h
+++ b/src/ch/ch_monitor.h
@@ -37,6 +37,50 @@
#define URL_VM_RESUME "vm.resume"
#define URL_VM_INFO "vm.info"
+#define VIRCH_THREAD_NAME_LEN 16
+
+typedef enum { /* classification of a VM process thread, derived from its comm name */
+ virCHThreadTypeEmulator, /* any thread that is neither a vcpu nor an IO thread */
+ virCHThreadTypeVcpu, /* thread whose name starts with "vcpu" */
+ virCHThreadTypeIO, /* thread whose name starts with "_disk", "_net" or "_rng" */
+ virCHThreadTypeMax
+} virCHThreadType;
+
+typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
+
+struct _virCHMonitorCPUInfo {
+ int cpuid; /* index parsed from the thread name after the "vcpu" prefix */
+ pid_t tid; /* thread id of the vcpu thread */
+
+ bool online; /* set to true for every vcpu thread that was detected */
+};
+
+typedef struct _virCHMonitorEmuThreadInfo virCHMonitorEmuThreadInfo;
+
+struct _virCHMonitorEmuThreadInfo {
+ char thrName[VIRCH_THREAD_NAME_LEN]; /* thread name copied from its comm file */
+ pid_t tid; /* thread id */
+};
+
+typedef struct _virCHMonitorIOThreadInfo virCHMonitorIOThreadInfo;
+
+struct _virCHMonitorIOThreadInfo {
+ char thrName[VIRCH_THREAD_NAME_LEN]; /* thread name copied from its comm file */
+ pid_t tid; /* thread id */
+};
+
+typedef struct _virCHMonitorThreadInfo virCHMonitorThreadInfo;
+
+struct _virCHMonitorThreadInfo {
+ virCHThreadType type; /* selects which union member below is filled in */
+
+ union {
+ virCHMonitorCPUInfo vcpuInfo; /* used for virCHThreadTypeVcpu */
+ virCHMonitorEmuThreadInfo emuInfo; /* used for virCHThreadTypeEmulator */
+ virCHMonitorIOThreadInfo ioInfo; /* used for virCHThreadTypeIO */
+ };
+};
+
typedef struct _virCHMonitor virCHMonitor;
struct _virCHMonitor {
@@ -49,6 +93,9 @@ struct _virCHMonitor {
pid_t pid;
virDomainObj *vm;
+
+ size_t nthreads;
+ virCHMonitorThreadInfo *threads;
};
virCHMonitor *virCHMonitorNew(virDomainObj *vm, const char *socketdir);
@@ -65,12 +112,9 @@ int virCHMonitorSuspendVM(virCHMonitor *mon);
int virCHMonitorResumeVM(virCHMonitor *mon);
int virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info);
-typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
-struct _virCHMonitorCPUInfo {
- pid_t tid;
- bool online;
-};
void virCHMonitorCPUInfoFree(virCHMonitorCPUInfo *cpus);
int virCHMonitorGetCPUInfo(virCHMonitor *mon,
virCHMonitorCPUInfo **vcpus,
size_t maxvcpus);
+size_t virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
+ virCHMonitorThreadInfo **threads);
diff --git a/src/ch/ch_process.c b/src/ch/ch_process.c
index 3b7f6fcddf..8dce737adb 100644
--- a/src/ch/ch_process.c
+++ b/src/ch/ch_process.c
@@ -26,6 +26,8 @@
#include "ch_domain.h"
#include "ch_monitor.h"
#include "ch_process.h"
+#include "ch_cgroup.h"
+#include "virnuma.h"
#include "viralloc.h"
#include "virerror.h"
#include "virjson.h"
@@ -133,6 +135,257 @@ virCHProcessUpdateInfo(virDomainObj *vm)
return 0;
}
+/**
+ * virCHProcessGetAllCpuAffinity:
+ * @cpumapRet: filled with the bitmap of online host CPUs, or NULL
+ *
+ * Fetch the online host CPU bitmap when the host supports CPU bitmaps.
+ * @cpumapRet is set to NULL when it does not.
+ *
+ * Returns 0 on success (including the unsupported case), -1 if the
+ * bitmap could not be obtained.
+ */
+static int
+virCHProcessGetAllCpuAffinity(virBitmap **cpumapRet)
+{
+    virBitmap *online = NULL;
+
+    *cpumapRet = NULL;
+
+    if (virHostCPUHasBitmap()) {
+        online = virHostCPUGetOnlineBitmap();
+        *cpumapRet = online;
+        if (!online)
+            return -1;
+    }
+
+    return 0;
+}
+
+#if defined(WITH_SCHED_GETAFFINITY) || defined(WITH_BSD_CPU_AFFINITY)
+/**
+ * virCHProcessInitCpuAffinity:
+ * @vm: domain object
+ *
+ * Set the initial CPU affinity of the VM process. The mask is derived,
+ * in order of preference, from the strict numatune nodeset (domains
+ * with at most one guest NUMA node), from the emulatorpin setting, or
+ * from the set of online host CPUs. If no mask can be determined the
+ * affinity is left unchanged.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+virCHProcessInitCpuAffinity(virDomainObj *vm)
+{
+    g_autoptr(virBitmap) affinity = NULL;
+    virDomainNumatuneMemMode numaMode;
+    virCHDomainObjPrivate *priv = vm->privateData;
+    bool strictSingleNode;
+
+    if (!vm->pid) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("Cannot setup CPU affinity until process is started"));
+        return -1;
+    }
+
+    strictSingleNode = virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
+                       virDomainNumatuneGetMode(vm->def->numa, -1, &numaMode) == 0 &&
+                       numaMode == VIR_DOMAIN_NUMATUNE_MEM_STRICT;
+
+    if (strictSingleNode) {
+        virBitmap *nodeset = NULL;
+
+        if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
+                                             priv->autoNodeset,
+                                             &nodeset,
+                                             -1) < 0)
+            return -1;
+
+        if (virNumaNodesetToCPUset(nodeset, &affinity) < 0)
+            return -1;
+    } else if (vm->def->cputune.emulatorpin) {
+        affinity = virBitmapNewCopy(vm->def->cputune.emulatorpin);
+        if (!affinity)
+            return -1;
+    } else {
+        if (virCHProcessGetAllCpuAffinity(&affinity) < 0)
+            return -1;
+    }
+
+    if (!affinity)
+        return 0;
+
+    if (virProcessSetAffinity(vm->pid, affinity, false) < 0)
+        return -1;
+
+    return 0;
+}
+#else /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
+/* Without an affinity API there is nothing to set up. */
+static int
+virCHProcessInitCpuAffinity(virDomainObj *vm G_GNUC_UNUSED)
+{
+    return 0;
+}
+#endif /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
+
+/**
+ * virCHProcessSetupPid:
+ * @vm: domain object
+ * @pid: thread to configure
+ * @nameval: kind of thread cgroup to create (vcpu, iothread, emulator)
+ * @id: index of the thread cgroup, passed to virCgroupNewThread()
+ * @cpumask: preferred CPU mask, may be NULL
+ * @period: CPU bandwidth period, 0 for none
+ * @quota: CPU bandwidth quota, 0 for none
+ * @sched: scheduler policy and priority, may be NULL
+ *
+ * This function sets resource properties (affinity, cgroups,
+ * scheduler) for any PID associated with a domain. It should be used
+ * to set up emulator PIDs as well as vCPU and I/O thread pids to
+ * ensure they are all handled the same way.
+ *
+ * The CPU mask is taken from @cpumask, else from the automatic
+ * placement cpuset, else from the domain cpumask; if none of these
+ * applies the online host CPUs are used for the affinity only.
+ * If any step fails, the thread cgroup created here is removed again.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+virCHProcessSetupPid(virDomainObj *vm,
+ pid_t pid,
+ virCgroupThreadName nameval,
+ int id,
+ virBitmap *cpumask,
+ unsigned long long period,
+ long long quota,
+ virDomainThreadSchedParam *sched)
+{
+ virCHDomainObjPrivate *priv = vm->privateData;
+ virDomainNumatuneMemMode mem_mode;
+ virCgroup *cgroup = NULL;
+ virBitmap *use_cpumask = NULL; /* mask written to the cpuset cgroup; borrowed, not freed here */
+ virBitmap *affinity_cpumask = NULL; /* mask used for virProcessSetAffinity(); borrowed */
+ g_autoptr(virBitmap) hostcpumap = NULL;
+ g_autofree char *mem_mask = NULL;
+ int ret = -1;
+
+ if ((period || quota) &&
+ !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+ _("cgroup cpu is required for scheduler tuning"));
+ goto cleanup;
+ }
+
+ /* Infer which cpumask shall be used. */
+ if (cpumask) {
+ use_cpumask = cpumask;
+ } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
+ use_cpumask = priv->autoCpuset;
+ } else if (vm->def->cpumask) {
+ use_cpumask = vm->def->cpumask;
+ } else {
+ /* we can't assume cloud-hypervisor itself is running on all pCPUs,
+ * so we need to explicitly set the spawned instance to all pCPUs. */
+ if (virCHProcessGetAllCpuAffinity(&hostcpumap) < 0)
+ goto cleanup;
+ affinity_cpumask = hostcpumap;
+ }
+
+ /*
+ * If CPU cgroup controller is not initialized here, then we need
+ * neither period nor quota settings. And if CPUSET controller is
+ * not initialized either, then there's nothing to do anyway.
+ */
+ if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
+ virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
+
+ if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0
&&
+ mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
+ virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
+ priv->autoNodeset,
+ &mem_mask, -1) < 0)
+ goto cleanup;
+
+ if (virCgroupNewThread(priv->cgroup, nameval, id, true, &cgroup) < 0)
+ goto cleanup;
+
+ if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
+ if (use_cpumask &&
+ chSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
+ goto cleanup;
+
+ if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
+ goto cleanup;
+
+ }
+
+ if ((period || quota) &&
+ chSetupCgroupVcpuBW(cgroup, period, quota) < 0)
+ goto cleanup;
+
+ /* Move the thread to the sub dir */
+ VIR_INFO("Adding pid %d to cgroup", pid);
+ if (virCgroupAddThread(cgroup, pid) < 0)
+ goto cleanup;
+
+ }
+
+ if (!affinity_cpumask) /* no host-wide fallback was chosen above */
+ affinity_cpumask = use_cpumask;
+
+ /* Setup legacy affinity. */
+ if (affinity_cpumask && virProcessSetAffinity(pid, affinity_cpumask, false)
< 0)
+ goto cleanup;
+
+ /* Set scheduler type and priority, but not for the main thread. */
+ if (sched &&
+ nameval != VIR_CGROUP_THREAD_EMULATOR &&
+ virProcessSetScheduler(pid, sched->policy, sched->priority) < 0)
+ goto cleanup;
+
+ ret = 0;
+ cleanup:
+ if (cgroup) {
+ if (ret < 0)
+ virCgroupRemove(cgroup); /* roll back the thread cgroup created above */
+ virCgroupFree(cgroup);
+ }
+
+ return ret;
+}
+
+/**
+ * virCHProcessSetupVcpu:
+ * @vm: domain object
+ * @vcpuid: id of VCPU to set defaults
+ *
+ * Applies cgroup, affinity and scheduler settings to one vCPU thread by
+ * delegating to virCHProcessSetupPid with the vCPU's own cpumask and
+ * scheduler parameters and the domain-wide period/quota. The vCPU must
+ * be online and its thread id must already have been detected.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+virCHProcessSetupVcpu(virDomainObj *vm,
+                      unsigned int vcpuid)
+{
+    pid_t vcpupid = virCHDomainGetVcpuPid(vm, vcpuid);
+    virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, vcpuid);
+    unsigned long long period = vm->def->cputune.period;
+    long long quota = vm->def->cputune.quota;
+
+    return virCHProcessSetupPid(vm, vcpupid, VIR_CGROUP_THREAD_VCPU,
+                                vcpuid, vcpu->cpumask,
+                                period, quota,
+                                &vcpu->sched);
+}
+
+/**
+ * virCHProcessSetupVcpus:
+ * @vm: domain object
+ *
+ * Apply per-vCPU tuning to every online vCPU. If vCPU thread ids are
+ * not known, tuning cannot be applied; the only check then is that no
+ * online vCPU asks for an affinity that differs from the domain's.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+virCHProcessSetupVcpus(virDomainObj *vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    unsigned int maxvcpus = virDomainDefGetVcpusMax(vm->def);
+    bool havePids;
+    size_t i;
+
+    if ((vm->def->cputune.period || vm->def->cputune.quota) &&
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("cgroup cpu is required for scheduler tuning"));
+        return -1;
+    }
+
+    havePids = virCHDomainHasVcpuPids(vm);
+
+    for (i = 0; i < maxvcpus; i++) {
+        virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, i);
+
+        if (!vcpu->online)
+            continue;
+
+        if (havePids) {
+            if (virCHProcessSetupVcpu(vm, i) < 0)
+                return -1;
+        } else if (vcpu->cpumask &&
+                   !virBitmapEqual(vm->def->cpumask, vcpu->cpumask)) {
+            /* If any CPU has custom affinity that differs from the
+             * VM default affinity, we must reject it */
+            virReportError(VIR_ERR_OPERATION_INVALID, "%s",
+                           _("cpu affinity is not supported"));
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
/**
* virCHProcessStart:
* @driver: pointer to driver structure
@@ -168,18 +421,33 @@ int virCHProcessStart(virCHDriver *driver,
}
}
+ vm->pid = priv->monitor->pid;
+ vm->def->id = vm->pid;
+ priv->machineName = virCHDomainGetMachineName(vm);
+
+ if (chSetupCgroup(vm, nnicindexes, nicindexes) < 0)
+ goto cleanup;
+
+ if (virCHProcessInitCpuAffinity(vm) < 0)
+ goto cleanup;
+
if (virCHMonitorBootVM(priv->monitor) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("failed to boot guest VM"));
goto cleanup;
}
- priv->machineName = virCHDomainGetMachineName(vm);
- vm->pid = priv->monitor->pid;
- vm->def->id = vm->pid;
+ virCHDomainRefreshThreadInfo(vm);
- virCHProcessUpdateInfo(vm);
+ VIR_DEBUG("Setting global CPU cgroup (if required)");
+ if (chSetupGlobalCpuCgroup(vm) < 0)
+ goto cleanup;
+
+ VIR_DEBUG("Setting vCPU tuning/settings");
+ if (virCHProcessSetupVcpus(vm) < 0)
+ goto cleanup;
+ virCHProcessUpdateInfo(vm);
virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
return 0;
@@ -195,6 +463,8 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
virDomainObj *vm,
virDomainShutoffReason reason)
{
+ int ret;
+ int retries = 0;
virCHDomainObjPrivate *priv = vm->privateData;
VIR_DEBUG("Stopping VM name=%s pid=%d reason=%d",
@@ -205,6 +475,16 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
priv->monitor = NULL;
}
+ retry:
+ if ((ret = chRemoveCgroup(vm)) < 0) {
+ if (ret == -EBUSY && (retries++ < 5)) {
+ g_usleep(200*1000);
+ goto retry;
+ }
+ VIR_WARN("Failed to remove cgroup for %s",
+ vm->def->name);
+ }
+
vm->pid = -1;
vm->def->id = -1;
diff --git a/src/ch/ch_process.h b/src/ch/ch_process.h
index abc4915979..800e3f4e23 100644
--- a/src/ch/ch_process.h
+++ b/src/ch/ch_process.h
@@ -29,3 +29,6 @@ int virCHProcessStart(virCHDriver *driver,
int virCHProcessStop(virCHDriver *driver,
virDomainObj *vm,
virDomainShutoffReason reason);
+
+int virCHProcessSetupVcpu(virDomainObj *vm,
+ unsigned int vcpuid);
diff --git a/src/ch/meson.build b/src/ch/meson.build
index 2b2bdda26c..0b20de56fd 100644
--- a/src/ch/meson.build
+++ b/src/ch/meson.build
@@ -1,6 +1,8 @@
ch_driver_sources = [
'ch_conf.c',
'ch_conf.h',
+ 'ch_cgroup.c',
+ 'ch_cgroup.h',
'ch_domain.c',
'ch_domain.h',
'ch_driver.c',
--
2.27.0