On Thu, Oct 20, 2022 at 16:59:09 -0500, Jonathon Jongsma wrote:
Adds the ability to monitor the nbdkit process so that we can take
action in case the child exits unexpectedly.
When the nbdkit process exits, we pause the vm, restart nbdkit, and then
resume the vm. This allows the vm to continue working in the event of a
nbdkit failure.
Eventually we may want to generalize this functionality since we may
need something similar for e.g. qemu-storage-daemon, etc.
The process is monitored with the pidfd_open() syscall if it exists
(since linux 5.3). Otherwise it resorts to checking whether the process
is alive once a second. The one-second time period was chosen somewhat
arbitrarily.
Signed-off-by: Jonathon Jongsma <jjongsma(a)redhat.com>
---
meson.build | 3 +
src/qemu/qemu_nbdkit.c | 220 ++++++++++++++++++++++++++++++++++++++++
src/qemu/qemu_nbdkit.h | 10 ++
src/qemu/qemu_process.c | 13 +++
4 files changed, 246 insertions(+)
diff --git a/meson.build b/meson.build
index e4581e74dd..b4ed170ca1 100644
--- a/meson.build
+++ b/meson.build
@@ -686,6 +686,9 @@ if host_machine.system() == 'linux'
# Check if we have new enough kernel to support BPF devices for cgroups v2
[ 'linux/bpf.h', 'BPF_PROG_QUERY' ],
[ 'linux/bpf.h', 'BPF_CGROUP_DEVICE' ],
+
+ # process management
+ [ 'sys/syscall.h', 'SYS_pidfd_open' ],
]
endif
diff --git a/src/qemu/qemu_nbdkit.c b/src/qemu/qemu_nbdkit.c
index 0a0dc5d2a4..f17fe022ec 100644
--- a/src/qemu/qemu_nbdkit.c
+++ b/src/qemu/qemu_nbdkit.c
@@ -21,9 +21,11 @@
#include <config.h>
#include <glib.h>
+#include <sys/syscall.h>
#include "vircommand.h"
#include "virerror.h"
+#include "virevent.h"
#include "virlog.h"
#include "virpidfile.h"
#include "virtime.h"
@@ -36,6 +38,7 @@
#include "qemu_nbdkit.h"
#define LIBVIRT_QEMU_NBDKITPRIV_H_ALLOW
#include "qemu_nbdkitpriv.h"
+#include "qemu_process.h"
#include "qemu_security.h"
#include <fcntl.h>
@@ -72,6 +75,13 @@ struct _qemuNbdkitCaps {
G_DEFINE_TYPE(qemuNbdkitCaps, qemu_nbdkit_caps, G_TYPE_OBJECT);
+struct _qemuNbdkitProcessPrivate {
+ int monitor;
+ virQEMUDriver *driver;
Note that 'driver' can be fetched from vm's privateData's 'driver'
field
so you don't need to have both.
+ virDomainObj *vm;
+};
+
+
enum {
PIPE_FD_READ = 0,
PIPE_FD_WRITE = 1
@@ -588,6 +598,168 @@ qemuNbdkitCapsCacheNew(const char *cachedir)
}
+static int
+qemuNbdkitProcessStartMonitor(qemuNbdkitProcess *proc,
+ virDomainObj *vm,
+ virQEMUDriver *driver);
As above. 'driver' is redundant.
+
+
+static void
+qemuNbdkitProcessHandleExit(qemuNbdkitProcess *proc)
+{
+ qemuNbdkitProcessPrivate *priv = proc->priv;
+ bool was_running = false;
+
+ VIR_DEBUG("nbdkit process %i died", proc->pid);
+
+ /* clean up resources associated with process */
+ qemuNbdkitProcessStop(proc);
+
+ if (!(priv->vm && priv->driver)) {
+ VIR_WARN("Unable to restart nbdkit -- vm and driver not set");
The function that registers this handler ensures that the required stuff
is set so this is dead code. And if not VIR_WARN is not the appropriate
action.
+ return;
+ }
+
+ VIR_DEBUG("restarting nbdkit process");
+
+ virObjectLock(priv->vm);
+ if (virDomainObjBeginJob(priv->vm, VIR_JOB_SUSPEND) < 0) {
+ VIR_WARN("can't begin job");
Same here, it's an error.
+ goto cleanup;
+ }
+
+ /* Pause domain */
+ if (virDomainObjGetState(priv->vm, NULL) == VIR_DOMAIN_RUNNING) {
+ was_running = true;
+ if (qemuProcessStopCPUs(priv->driver, priv->vm,
+ VIR_DOMAIN_PAUSED_IOERROR,
+ VIR_ASYNC_JOB_NONE) < 0)
+ goto endjob;
+ VIR_DEBUG("Paused vm while we restart nbdkit backend");
I don't think we should pause the VM here, it's not like the start of
NBDkit is slow.
Additionally after testing this, it doesn't actually even prevent the VM
from getting an I/O error after nbdkit is started back.
Just a subsequent request re-establishes the connection.
Please drop all of the VM state modification here.
+ }
+
+ if (qemuNbdkitProcessStart(proc, priv->vm, priv->driver) < 0)
+ VIR_WARN("Unable to restart nbkdit process");
+
+ if (was_running && virDomainObjIsActive(priv->vm)) {
+ if (qemuProcessStartCPUs(priv->driver, priv->vm,
+ VIR_DOMAIN_RUNNING_UNPAUSED,
+ VIR_ASYNC_JOB_NONE) < 0) {
+ VIR_WARN("Unable to resume guest CPUs after nbdkit restart");
+ goto endjob;
+ }
+ VIR_DEBUG("Resumed vm");
+ }
+ qemuNbdkitProcessStartMonitor(proc, NULL, NULL);
+
+ endjob:
+ virDomainObjEndJob(priv->vm);
+ cleanup:
+ virObjectUnlock(priv->vm);
+}
+
+
+#if WITH_DECL_SYS_PIDFD_OPEN
+static void
+qemuNbdkitProcessPidfdCb(int watch G_GNUC_UNUSED,
+ int fd,
+ int events G_GNUC_UNUSED,
+ void *opaque)
+{
+ qemuNbdkitProcess *proc = opaque;
+
+ VIR_FORCE_CLOSE(fd);
+ qemuNbdkitProcessHandleExit(proc);
+}
+#else
+static void
+qemuNbdkitProcessTimeoutCb(int timer G_GNUC_UNUSED,
+ void *opaque)
+{
+ qemuNbdkitProcess *proc = opaque;
+
+ if (virProcessKill(proc->pid, 0) < 0)
+ qemuNbdkitProcessHandleExit(proc);
+}
+#endif /* WITH_DECL_SYS_PIDFD_OPEN */
+
+
+static int
+qemuNbdkitProcessStartMonitor(qemuNbdkitProcess *proc,
+ virDomainObj *vm,
+ virQEMUDriver *driver)
+{
+ qemuNbdkitProcessPrivate *priv = proc->priv;
+#if WITH_DECL_SYS_PIDFD_OPEN
+ int pidfd;
+#endif
+
+ if (vm) {
+ virObjectRef(vm);
+
+ if (priv->vm)
+ virObjectUnref(priv->vm);
+
+ priv->vm = vm;
+ }
+
+ if (driver)
+ priv->driver = driver;
+
+ if (!(priv->vm && priv->driver)) {
+ VIR_WARN("set vm and driver before calling %s", G_STRFUNC);
Report proper error here. That also captures the function name.
+ return -1;
+ }
+
+#if WITH_DECL_SYS_PIDFD_OPEN
+ pidfd = syscall(SYS_pidfd_open, proc->pid, 0);
+ if (pidfd < 0)
+ return -1;
+
+ priv->monitor = virEventAddHandle(pidfd,
+ VIR_EVENT_HANDLE_READABLE,
+ qemuNbdkitProcessPidfdCb,
+ proc, NULL);
+#else
+ /* fall back to checking once a second */
+ priv->monitor = virEventAddTimeout(1000,
+ qemuNbdkitProcessTimeoutCb,
+ proc, NULL);
+#endif /* WITH_DECL_SYS_PIDFD_OPEN */
+
+ if (priv->monitor < 0)
+ return -1;
+
+ VIR_DEBUG("Monitoring nbdkit process %i for exit", proc->pid);
+
+ return 0;
+}
+
+
+static void
+qemuNbdkitProcessStopMonitor(qemuNbdkitProcess *proc)
+{
+ qemuNbdkitProcessPrivate *priv = proc->priv;
+
+ if (priv->monitor > 0) {
+#if WITH_DECL_SYS_PIDFD_OPEN
+ virEventRemoveHandle(priv->monitor);
+#else
+ virEventRemoveTimeout(priv->monitor);
+#endif /* WITH_DECL_SYS_PIDFD_OPEN */
+ priv->monitor = 0;
+ }
+}
+
+
+static void
+qemuNbdkitProcessPrivateFree(qemuNbdkitProcessPrivate *priv)
+{
+ virObjectUnref(priv->vm);
+ g_free(priv);
+}
+
+
static qemuNbdkitProcess *
qemuNbdkitProcessNew(virStorageSource *source,
const char *pidfile,
@@ -601,6 +773,7 @@ qemuNbdkitProcessNew(virStorageSource *source,
nbdkit->pid = -1;
nbdkit->pidfile = g_strdup(pidfile);
nbdkit->socketfile = g_strdup(socketfile);
+ nbdkit->priv = g_new0(qemuNbdkitProcessPrivate, 1);
return nbdkit;
}
@@ -627,6 +800,45 @@ qemuNbdkitProcessLoad(virStorageSource *source,
}
+static int
+qemuNbdkitStorageSourceManageProcessOne(virStorageSource *src,
+ virDomainObj *vm,
+ virQEMUDriver *driver)
+{
+ qemuDomainStorageSourcePrivate *srcPriv = QEMU_DOMAIN_STORAGE_SOURCE_PRIVATE(src);
+ qemuNbdkitProcess *nbdkit;
+
+ if (!srcPriv)
+ return 0;
+
+ nbdkit = srcPriv->nbdkitProcess;
+ if (nbdkit) {
+ nbdkit->caps = qemuGetNbdkitCaps(nbdkit->priv->driver);
On standard VM startup this will leak the existing reference assigned
via qemuDomainPrepareStorageSourceNbdkit.
+
+ if (qemuNbdkitProcessStartMonitor(nbdkit, vm, driver) < 0)
+ return -1;
+ }
+
+ return 0;
+}