[libvirt] [PATCH v6 0/9] Work around the kernel mdev uevent race in nodedev

v5 here: https://www.redhat.com/archives/libvir-list/2017-October/msg00440.html Since v5: - fixed minor nitpicks - added 3 more patches as per reviewer's suggestion to split some of the changes even more - patches {2,6,7,8,9}/9 are without any change Erik Skultety (9): nodedev: Move privileged flag from udev private data to driver's state (NEW) nodedev: udev: Introduce udevEventMonitorSanityCheck helper function (ACKed) nodedev: udev: Convert udev private data to a lockable object (ACKed with fixes) nodedev: udev: Remove driver locks from stateInitialize and (NEW) stateCleanup nodedev: udev: Unlock the private data before setting up 'system' node (NEW) nodedev: udev: Split udevEventHandleCallback in two functions (ACKed) nodedev: udev: Convert udevEventHandleThread to an actual thread (ACKed) routine util: Introduce virFileWaitForExists (ACKed) nodedev: udev: Hook up virFileWaitForAccess to work around uevent race (ACKed) src/conf/virnodedeviceobj.h | 1 + src/libvirt_private.syms | 1 + src/node_device/node_device_udev.c | 283 +++++++++++++++++++++++++++---------- src/node_device/node_device_udev.h | 3 - src/util/virfile.c | 31 ++++ src/util/virfile.h | 2 + 6 files changed, 245 insertions(+), 76 deletions(-) -- 2.13.6

Even though hal doesn't make use of it, the privileged flag is related to the daemon/driver rather than the backend actually used. While at it, get rid of some tab indentation in the driver state struct. Signed-off-by: Erik Skultety <eskultet@redhat.com> --- src/conf/virnodedeviceobj.h | 1 + src/node_device/node_device_udev.c | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/conf/virnodedeviceobj.h b/src/conf/virnodedeviceobj.h index e7c26abbd..87f908369 100644 --- a/src/conf/virnodedeviceobj.h +++ b/src/conf/virnodedeviceobj.h @@ -40,6 +40,7 @@ struct _virNodeDeviceDriverState { virNodeDeviceObjListPtr devs; /* currently-known devices */ void *privateData; /* driver-specific private data */ + bool privileged; /* whether we run in privileged mode */ /* Immutable pointer, self-locking APIs */ virObjectEventStatePtr nodeDeviceEventState; diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index f4177455c..8ea5d1e62 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -56,7 +56,6 @@ VIR_LOG_INIT("node_device.node_device_udev"); struct _udevPrivate { struct udev_monitor *udev_monitor; int watch; - bool privileged; }; @@ -447,9 +446,13 @@ udevProcessPCI(struct udev_device *device, virNodeDevCapPCIDevPtr pci_dev = &def->caps->data.pci_dev; virPCIEDeviceInfoPtr pci_express = NULL; virPCIDevicePtr pciDev = NULL; - udevPrivate *priv = driver->privateData; int ret = -1; char *p; + bool privileged; + + nodeDeviceLock(); + privileged = driver->privileged; + nodeDeviceUnlock(); if (udevGetUintProperty(device, "PCI_CLASS", &pci_dev->class, 16) < 0) goto cleanup; @@ -498,7 +501,7 @@ udevProcessPCI(struct udev_device *device, goto cleanup; /* We need to be root to read PCI device configs */ - if (priv->privileged) { + if (privileged) { if (virPCIGetHeaderType(pciDev, &pci_dev->hdrType) < 0) goto cleanup; @@ -1787,7 +1790,6 @@ nodeStateInitialize(bool privileged, return -1; priv->watch = -1; - priv->privileged = privileged; if (VIR_ALLOC(driver) < 0) { VIR_FREE(priv); @@ -1802,6 +1804,7 @@ nodeStateInitialize(bool privileged, return -1; } + driver->privileged = privileged; driver->privateData = priv; nodeDeviceLock(); -- 2.13.6

We need to perform a sanity check on the udev monitor before every use so that we know nothing has changed in the meantime. The reason for moving the code to a separate helper is to enhance readability and shift the focus on the important stuff within the udevEventHandleCallback handler. Signed-off-by: Erik Skultety <eskultet@redhat.com> --- src/node_device/node_device_udev.c | 43 ++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index 8ea5d1e62..8314b3834 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -1618,24 +1618,20 @@ udevHandleOneDevice(struct udev_device *device) } -static void -udevEventHandleCallback(int watch ATTRIBUTE_UNUSED, - int fd, - int events ATTRIBUTE_UNUSED, - void *data ATTRIBUTE_UNUSED) +static bool +udevEventMonitorSanityCheck(struct udev_monitor *udev_monitor, + int fd) { - struct udev_device *device = NULL; - struct udev_monitor *udev_monitor = DRV_STATE_UDEV_MONITOR(driver); - int udev_fd = -1; + int rc = -1; - udev_fd = udev_monitor_get_fd(udev_monitor); - if (fd != udev_fd) { + rc = udev_monitor_get_fd(udev_monitor); + if (fd != rc) { udevPrivate *priv = driver->privateData; virReportError(VIR_ERR_INTERNAL_ERROR, _("File descriptor returned by udev %d does not " "match node device file descriptor %d"), - fd, udev_fd); + fd, rc); /* this is a non-recoverable error, let's remove the handle, so that we * don't get in here again because of some spurious behaviour and report @@ -1644,21 +1640,36 @@ udevEventHandleCallback(int watch ATTRIBUTE_UNUSED, virEventRemoveHandle(priv->watch); priv->watch = -1; - goto cleanup; + return false; } + return true; +} + + +static void +udevEventHandleCallback(int watch ATTRIBUTE_UNUSED, + int fd, + int events ATTRIBUTE_UNUSED, + void *data ATTRIBUTE_UNUSED) +{ + struct udev_device *device = NULL; + struct udev_monitor *udev_monitor = NULL; + + udev_monitor = DRV_STATE_UDEV_MONITOR(driver); + + if (!udevEventMonitorSanityCheck(udev_monitor, fd)) + return; + device = udev_monitor_receive_device(udev_monitor); if (device == NULL) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("udev_monitor_receive_device returned NULL")); - goto cleanup; + return; } udevHandleOneDevice(device); - - cleanup: udev_device_unref(device); - return; } -- 2.13.6

Since there's going to be a worker thread which needs to have some data protected by a lock, the whole code would just simply get unnecessary complex, since two sets of locks would be necessary, driver lock (for udev monitor and event handle) and a mutex protecting thread-local data. Given the future thread will need to access the udev monitor socket as well, why not protect everything with a single lock, even better, by converting the driver's private data to a lockable object, we get the automatic object disposal feature for free. Signed-off-by: Erik Skultety <eskultet@redhat.com> --- src/node_device/node_device_udev.c | 134 +++++++++++++++++++++++-------------- src/node_device/node_device_udev.h | 3 - 2 files changed, 85 insertions(+), 52 deletions(-) diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index 8314b3834..a7b628153 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -53,11 +53,65 @@ VIR_LOG_INIT("node_device.node_device_udev"); # define TYPE_RAID 12 #endif -struct _udevPrivate { +typedef struct _udevEventData udevEventData; +typedef udevEventData *udevEventDataPtr; + +struct _udevEventData { + virObjectLockable parent; + struct udev_monitor *udev_monitor; int watch; }; +static virClassPtr udevEventDataClass; + +static void +udevEventDataDispose(void *obj) +{ + struct udev *udev = NULL; + udevEventDataPtr priv = obj; + + if (priv->watch != -1) + virEventRemoveHandle(priv->watch); + + if (!priv->udev_monitor) + return; + + udev = udev_monitor_get_udev(priv->udev_monitor); + udev_monitor_unref(priv->udev_monitor); + udev_unref(udev); +} + + +static int +udevEventDataOnceInit(void) +{ + if (!(udevEventDataClass = virClassNew(virClassForObjectLockable(), + "udevEventData", + sizeof(udevEventData), + udevEventDataDispose))) + return -1; + + return 0; +} + +VIR_ONCE_GLOBAL_INIT(udevEventData) + +static udevEventDataPtr +udevEventDataNew(void) +{ + udevEventDataPtr ret = NULL; + + if (udevEventDataInitialize() < 0) + return NULL; + + if (!(ret = virObjectLockableNew(udevEventDataClass))) + return NULL; + + ret->watch = -1; + return ret; +} + static bool udevHasDeviceProperty(struct udev_device *dev, @@ -1562,39 +1616,18 @@ udevPCITranslateDeinit(void) static int nodeStateCleanup(void) { - udevPrivate *priv = NULL; - struct udev_monitor *udev_monitor = NULL; - struct udev *udev = NULL; - if (!driver) return -1; nodeDeviceLock(); + virObjectUnref(driver->privateData); virObjectUnref(driver->nodeDeviceEventState); - priv = driver->privateData; - - if (priv) { - if (priv->watch != -1) - virEventRemoveHandle(priv->watch); - - udev_monitor = DRV_STATE_UDEV_MONITOR(driver); - - if (udev_monitor != NULL) { - udev = udev_monitor_get_udev(udev_monitor); - udev_monitor_unref(udev_monitor); - } - } - - if (udev != NULL) - udev_unref(udev); - virNodeDeviceObjListFree(driver->devs); nodeDeviceUnlock(); virMutexDestroy(&driver->lock); VIR_FREE(driver); - VIR_FREE(priv); udevPCITranslateDeinit(); return 0; @@ -1618,16 +1651,17 @@ udevHandleOneDevice(struct udev_device *device) } +/* the caller must be holding the udevEventData object lock prior to calling + * this function + */ static bool -udevEventMonitorSanityCheck(struct udev_monitor *udev_monitor, +udevEventMonitorSanityCheck(udevEventDataPtr priv, int fd) { int rc = -1; - rc = udev_monitor_get_fd(udev_monitor); + rc = udev_monitor_get_fd(priv->udev_monitor); if (fd != rc) { - udevPrivate *priv = driver->privateData; - virReportError(VIR_ERR_INTERNAL_ERROR, _("File descriptor returned by udev %d does not " "match node device file descriptor %d"), @@ -1653,15 +1687,19 @@ udevEventHandleCallback(int watch ATTRIBUTE_UNUSED, int events ATTRIBUTE_UNUSED, void *data ATTRIBUTE_UNUSED) { + udevEventDataPtr priv = driver->privateData; struct udev_device *device = NULL; - struct udev_monitor *udev_monitor = NULL; - udev_monitor = DRV_STATE_UDEV_MONITOR(driver); + virObjectLock(priv); - if (!udevEventMonitorSanityCheck(udev_monitor, fd)) + if (!udevEventMonitorSanityCheck(priv, fd)) { + virObjectUnlock(priv); return; + } + + device = udev_monitor_receive_device(priv->udev_monitor); + virObjectUnlock(priv); - device = udev_monitor_receive_device(udev_monitor); if (device == NULL) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("udev_monitor_receive_device returned NULL")); @@ -1678,12 +1716,13 @@ udevEventHandleCallback(int watch ATTRIBUTE_UNUSED, static void udevGetDMIData(virNodeDevCapSystemPtr syscap) { + udevEventDataPtr priv = driver->privateData; struct udev *udev = NULL; struct udev_device *device = NULL; virNodeDevCapSystemHardwarePtr hardware = &syscap->hardware; virNodeDevCapSystemFirmwarePtr firmware = &syscap->firmware; - udev = udev_monitor_get_udev(DRV_STATE_UDEV_MONITOR(driver)); + udev = udev_monitor_get_udev(priv->udev_monitor); device = udev_device_new_from_syspath(udev, DMI_DEVPATH); if (device == NULL) { @@ -1794,34 +1833,26 @@ nodeStateInitialize(bool privileged, virStateInhibitCallback callback ATTRIBUTE_UNUSED, void *opaque ATTRIBUTE_UNUSED) { - udevPrivate *priv = NULL; + udevEventDataPtr priv = NULL; struct udev *udev = NULL; - if (VIR_ALLOC(priv) < 0) + if (VIR_ALLOC(driver) < 0) return -1; - priv->watch = -1; - - if (VIR_ALLOC(driver) < 0) { - VIR_FREE(priv); - return -1; - } - if (virMutexInit(&driver->lock) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Unable to initialize mutex")); - VIR_FREE(priv); VIR_FREE(driver); return -1; } - driver->privileged = privileged; + nodeDeviceLock(); + + if (!(driver->devs = virNodeDeviceObjListNew()) || + !(priv = udevEventDataNew())) + goto unlock; + driver->privateData = priv; - nodeDeviceLock(); - - if (!(driver->devs = virNodeDeviceObjListNew())) - goto unlock; - driver->nodeDeviceEventState = virObjectEventStateNew(); if (udevPCITranslateInit(privileged) < 0) @@ -1838,8 +1869,10 @@ nodeStateInitialize(bool privileged, udev_set_log_fn(udev, (udevLogFunctionPtr) udevLogFunction); #endif + virObjectLock(priv); + priv->udev_monitor = udev_monitor_new_from_netlink(udev, "udev"); - if (priv->udev_monitor == NULL) { + if (!priv->udev_monitor) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("udev_monitor_new_from_netlink returned NULL")); goto unlock; @@ -1874,6 +1907,7 @@ nodeStateInitialize(bool privileged, if (udevSetupSystemDev() != 0) goto unlock; + virObjectUnlock(priv); nodeDeviceUnlock(); /* Populate with known devices */ @@ -1887,6 +1921,8 @@ nodeStateInitialize(bool privileged, return -1; unlock: + if (priv) + virObjectUnlock(priv); nodeDeviceUnlock(); goto cleanup; } diff --git a/src/node_device/node_device_udev.h b/src/node_device/node_device_udev.h index 9a07ab77e..f15e5204c 100644 --- a/src/node_device/node_device_udev.h +++ b/src/node_device/node_device_udev.h @@ -23,9 +23,6 @@ #include <libudev.h> #include <stdint.h> -typedef struct _udevPrivate udevPrivate; - #define SYSFS_DATA_SIZE 4096 -#define DRV_STATE_UDEV_MONITOR(ds) (((udevPrivate *)((ds)->privateData))->udev_monitor) #define DMI_DEVPATH "/sys/devices/virtual/dmi/id" #define DMI_DEVPATH_FALLBACK "/sys/class/dmi/id" -- 2.13.6

The driver locks are unnecessary here, since currently the cleanup is only called from the main daemon thread, so we can't race here. Moreover @devs and @privateData are self-lockable objects, so no problem there either. Signed-off-by: Erik Skultety <eskultet@redhat.com> --- src/node_device/node_device_udev.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index a7b628153..e0e5ba799 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -1619,13 +1619,10 @@ nodeStateCleanup(void) if (!driver) return -1; - nodeDeviceLock(); - virObjectUnref(driver->privateData); virObjectUnref(driver->nodeDeviceEventState); virNodeDeviceObjListFree(driver->devs); - nodeDeviceUnlock(); virMutexDestroy(&driver->lock); VIR_FREE(driver); @@ -1846,23 +1843,21 @@ nodeStateInitialize(bool privileged, return -1; } - nodeDeviceLock(); - if (!(driver->devs = virNodeDeviceObjListNew()) || !(priv = udevEventDataNew())) - goto unlock; + goto cleanup; driver->privateData = priv; driver->nodeDeviceEventState = virObjectEventStateNew(); if (udevPCITranslateInit(privileged) < 0) - goto unlock; + goto cleanup; udev = udev_new(); if (!udev) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("failed to create udev context")); - goto unlock; + goto cleanup; } #if HAVE_UDEV_LOGGING /* cast to get rid of missing-format-attribute warning */ @@ -1908,7 +1903,6 @@ nodeStateInitialize(bool privileged, goto unlock; virObjectUnlock(priv); - nodeDeviceUnlock(); /* Populate with known devices */ if (udevEnumerateDevices(udev) != 0) @@ -1921,9 +1915,7 @@ nodeStateInitialize(bool privileged, return -1; unlock: - if (priv) - virObjectUnlock(priv); - nodeDeviceUnlock(); + virObjectUnlock(priv); goto cleanup; } -- 2.13.6

udevSetupSystemDev only needs the udev data lock to be locked because of calling udevGetDMIData which accesses some protected structure members, but it can do that on its own just fine, no need to hold the lock the whole time. Signed-off-by: Erik Skultety <eskultet@redhat.com> --- src/node_device/node_device_udev.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index e0e5ba799..6882517e6 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -1719,6 +1719,7 @@ udevGetDMIData(virNodeDevCapSystemPtr syscap) virNodeDevCapSystemHardwarePtr hardware = &syscap->hardware; virNodeDevCapSystemFirmwarePtr firmware = &syscap->firmware; + virObjectLock(priv); udev = udev_monitor_get_udev(priv->udev_monitor); device = udev_device_new_from_syspath(udev, DMI_DEVPATH); @@ -1731,6 +1732,7 @@ udevGetDMIData(virNodeDevCapSystemPtr syscap) return; } } + virObjectUnlock(priv); if (udevGetStringSysfsAttr(device, "product_name", &syscap->product_name) < 0) @@ -1898,11 +1900,11 @@ nodeStateInitialize(bool privileged, if (priv->watch == -1) goto unlock; + virObjectUnlock(priv); + /* Create a fictional 'computer' device to root the device tree. */ if (udevSetupSystemDev() != 0) - goto unlock; - - virObjectUnlock(priv); + goto cleanup; /* Populate with known devices */ if (udevEnumerateDevices(udev) != 0) -- 2.13.6

On 10/18/2017 09:52 AM, Erik Skultety wrote:
udevSetupSystemDev only needs the udev data lock to be locked because of calling udevGetDMIData which accesses some protected structure members, but it can do that on its own just fine, no need to hold the lock the whole time.
Signed-off-by: Erik Skultety <eskultet@redhat.com> --- src/node_device/node_device_udev.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index e0e5ba799..6882517e6 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -1719,6 +1719,7 @@ udevGetDMIData(virNodeDevCapSystemPtr syscap) virNodeDevCapSystemHardwarePtr hardware = &syscap->hardware; virNodeDevCapSystemFirmwarePtr firmware = &syscap->firmware;
+ virObjectLock(priv); udev = udev_monitor_get_udev(priv->udev_monitor);
device = udev_device_new_from_syspath(udev, DMI_DEVPATH); @@ -1731,6 +1732,7 @@ udevGetDMIData(virNodeDevCapSystemPtr syscap)
virObjectUnlock(priv); Pesky return statements ;-) John
return; } } + virObjectUnlock(priv);
if (udevGetStringSysfsAttr(device, "product_name", &syscap->product_name) < 0) @@ -1898,11 +1900,11 @@ nodeStateInitialize(bool privileged, if (priv->watch == -1) goto unlock;
+ virObjectUnlock(priv); + /* Create a fictional 'computer' device to root the device tree. */ if (udevSetupSystemDev() != 0) - goto unlock; - - virObjectUnlock(priv); + goto cleanup;
/* Populate with known devices */ if (udevEnumerateDevices(udev) != 0)

On Wed, Oct 18, 2017 at 05:13:41PM -0400, John Ferlan wrote:
On 10/18/2017 09:52 AM, Erik Skultety wrote:
udevSetupSystemDev only needs the udev data lock to be locked because of calling udevGetDMIData which accesses some protected structure members, but it can do that on its own just fine, no need to hold the lock the whole time.
Signed-off-by: Erik Skultety <eskultet@redhat.com> --- src/node_device/node_device_udev.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index e0e5ba799..6882517e6 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -1719,6 +1719,7 @@ udevGetDMIData(virNodeDevCapSystemPtr syscap) virNodeDevCapSystemHardwarePtr hardware = &syscap->hardware; virNodeDevCapSystemFirmwarePtr firmware = &syscap->firmware;
+ virObjectLock(priv); udev = udev_monitor_get_udev(priv->udev_monitor);
device = udev_device_new_from_syspath(udev, DMI_DEVPATH); @@ -1731,6 +1732,7 @@ udevGetDMIData(virNodeDevCapSystemPtr syscap)
virObjectUnlock(priv);
Pesky return statements ;-)
Wow, you really must have been a sharpshooter in the previous life ;), thanks a lot. Erik

On 10/19/2017 02:54 AM, Erik Skultety wrote:
On Wed, Oct 18, 2017 at 05:13:41PM -0400, John Ferlan wrote:
On 10/18/2017 09:52 AM, Erik Skultety wrote:
udevSetupSystemDev only needs the udev data lock to be locked because of calling udevGetDMIData which accesses some protected structure members, but it can do that on its own just fine, no need to hold the lock the whole time.
Signed-off-by: Erik Skultety <eskultet@redhat.com> --- src/node_device/node_device_udev.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index e0e5ba799..6882517e6 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -1719,6 +1719,7 @@ udevGetDMIData(virNodeDevCapSystemPtr syscap) virNodeDevCapSystemHardwarePtr hardware = &syscap->hardware; virNodeDevCapSystemFirmwarePtr firmware = &syscap->firmware;
+ virObjectLock(priv); udev = udev_monitor_get_udev(priv->udev_monitor);
device = udev_device_new_from_syspath(udev, DMI_DEVPATH); @@ -1731,6 +1732,7 @@ udevGetDMIData(virNodeDevCapSystemPtr syscap)
virObjectUnlock(priv);
Pesky return statements ;-)
Wow, you really must have been a sharpshooter in the previous life ;), thanks a lot.
Erik
Sadly they seem to work better when reviewing code... They often fail me on my own code ;-) John

This patch splits udevEventHandleCallback in two (introduces udevEventHandleThread) in order to be later able to refactor the latter to actually become a normal thread which will wait some time for the kernel to create the whole sysfs tree for a device as we cannot do that in the event loop directly. Signed-off-by: Erik Skultety <eskultet@redhat.com> --- src/node_device/node_device_udev.c | 41 +++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index 6882517e6..0167ad596 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -1679,32 +1679,49 @@ udevEventMonitorSanityCheck(udevEventDataPtr priv, static void +udevEventHandleThread(void *opaque) +{ + udevEventDataPtr priv = driver->privateData; + int fd = (intptr_t) opaque; + struct udev_device *device = NULL; + + virObjectLock(priv); + + if (!udevEventMonitorSanityCheck(priv, fd)) { + virObjectUnlock(priv); + return; + } + + device = udev_monitor_receive_device(priv->udev_monitor); + virObjectUnlock(priv); + + if (device == NULL) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("udev_monitor_receive_device returned NULL")); + return; + } + + udevHandleOneDevice(device); + udev_device_unref(device); +} + + +static void udevEventHandleCallback(int watch ATTRIBUTE_UNUSED, int fd, int events ATTRIBUTE_UNUSED, void *data ATTRIBUTE_UNUSED) { udevEventDataPtr priv = driver->privateData; - struct udev_device *device = NULL; virObjectLock(priv); - if (!udevEventMonitorSanityCheck(priv, fd)) { virObjectUnlock(priv); return; } - - device = udev_monitor_receive_device(priv->udev_monitor); virObjectUnlock(priv); - if (device == NULL) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("udev_monitor_receive_device returned NULL")); - return; - } - - udevHandleOneDevice(device); - udev_device_unref(device); + udevEventHandleThread((void *)(intptr_t) fd); } -- 2.13.6

Adjust udevEventHandleThread to be a proper thread routine running in an infinite loop handling devices. The handler thread pulls all available data from the udev monitor and only then waits until a wakeup signal for new incoming data has been emitted by udevEventHandleCallback. Signed-off-by: Erik Skultety <eskultet@redhat.com> --- src/node_device/node_device_udev.c | 106 +++++++++++++++++++++++++++++-------- 1 file changed, 84 insertions(+), 22 deletions(-) diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index 0167ad596..9fa90257e 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -61,6 +61,12 @@ struct _udevEventData { struct udev_monitor *udev_monitor; int watch; + + /* Thread data */ + virThread th; + virCond threadCond; + bool threadQuit; + bool dataReady; }; static virClassPtr udevEventDataClass; @@ -80,6 +86,8 @@ udevEventDataDispose(void *obj) udev = udev_monitor_get_udev(priv->udev_monitor); udev_monitor_unref(priv->udev_monitor); udev_unref(udev); + + virCondDestroy(&priv->threadCond); } @@ -108,6 +116,11 @@ udevEventDataNew(void) if (!(ret = virObjectLockableNew(udevEventDataClass))) return NULL; + if (virCondInit(&ret->threadCond) < 0) { + virObjectUnref(ret); + return NULL; + } + ret->watch = -1; return ret; } @@ -1616,10 +1629,21 @@ udevPCITranslateDeinit(void) static int nodeStateCleanup(void) { + udevEventDataPtr priv = NULL; + if (!driver) return -1; - virObjectUnref(driver->privateData); + priv = driver->privateData; + if (priv) { + virObjectLock(priv); + priv->threadQuit = true; + virCondSignal(&priv->threadCond); + virObjectUnlock(priv); + virThreadJoin(&priv->th); + } + + virObjectUnref(priv); virObjectUnref(driver->nodeDeviceEventState); virNodeDeviceObjListFree(driver->devs); @@ -1679,30 +1703,61 @@ udevEventMonitorSanityCheck(udevEventDataPtr priv, static void -udevEventHandleThread(void *opaque) +udevEventHandleThread(void *opaque ATTRIBUTE_UNUSED) { udevEventDataPtr priv = driver->privateData; - int fd = (intptr_t) opaque; struct udev_device *device = NULL; - virObjectLock(priv); + /* continue rather than break from the loop on non-fatal errors */ + while (1) { + virObjectLock(priv); + while (!priv->dataReady && !priv->threadQuit) { + if (virCondWait(&priv->threadCond, &priv->parent.lock)) { + virReportSystemError(errno, "%s", + _("handler failed to wait on condition")); + virObjectUnlock(priv); + return; + } + } - if (!udevEventMonitorSanityCheck(priv, fd)) { + if (priv->threadQuit) { + virObjectUnlock(priv); + return; + } + + errno = 0; + device = udev_monitor_receive_device(priv->udev_monitor); virObjectUnlock(priv); - return; - } - device = udev_monitor_receive_device(priv->udev_monitor); - virObjectUnlock(priv); + if (!device) { + if (errno == 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to receive device from udev monitor")); + return; + } - if (device == NULL) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("udev_monitor_receive_device returned NULL")); - return; - } + /* POSIX allows both EAGAIN and EWOULDBLOCK to be used + * interchangeably when the read would block or timeout was fired + */ + VIR_WARNINGS_NO_WLOGICALOP_EQUAL_EXPR + if (errno != EAGAIN && errno != EWOULDBLOCK) { + VIR_WARNINGS_RESET + virReportSystemError(errno, "%s", + _("failed to receive device from udev " + "monitor")); + return; + } + + virObjectLock(priv); + priv->dataReady = false; + virObjectUnlock(priv); - udevHandleOneDevice(device); - udev_device_unref(device); + continue; + } + + udevHandleOneDevice(device); + udev_device_unref(device); + } } @@ -1715,13 +1770,14 @@ udevEventHandleCallback(int watch ATTRIBUTE_UNUSED, udevEventDataPtr priv = driver->privateData; virObjectLock(priv); - if (!udevEventMonitorSanityCheck(priv, fd)) { - virObjectUnlock(priv); - return; - } + + if (!udevEventMonitorSanityCheck(priv, fd)) + priv->threadQuit = true; + else + priv->dataReady = true; + + virCondSignal(&priv->threadCond); virObjectUnlock(priv); - - udevEventHandleThread((void *)(intptr_t) fd); } @@ -1903,6 +1959,12 @@ nodeStateInitialize(bool privileged, 128 * 1024 * 1024); #endif + if (virThreadCreate(&priv->th, true, udevEventHandleThread, NULL) < 0) { + virReportSystemError(errno, "%s", + _("failed to create udev handler thread")); + goto unlock; + } + /* We register the monitor with the event callback so we are * notified by udev of device changes before we enumerate existing * devices because libvirt will simply recreate the device if we -- 2.13.6

Since we have a number of places where we workaround timing issues with devices, attributes (files in general) not being available at the time of processing them by calling usleep in a loop for a fixed number of tries, we could as well have a utility function that would do that. Therefore we won't have to duplicate this ugly workaround even more. Signed-off-by: Erik Skultety <eskultet@redhat.com> --- src/libvirt_private.syms | 1 + src/util/virfile.c | 31 +++++++++++++++++++++++++++++++ src/util/virfile.h | 2 ++ 3 files changed, 34 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 7bd21ae23..3b5df28e5 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1756,6 +1756,7 @@ virFileStripSuffix; virFileTouch; virFileUnlock; virFileUpdatePerm; +virFileWaitForExists; virFileWrapperFdClose; virFileWrapperFdFree; virFileWrapperFdNew; diff --git a/src/util/virfile.c b/src/util/virfile.c index 7ca60052d..82cb36dbc 100644 --- a/src/util/virfile.c +++ b/src/util/virfile.c @@ -4177,3 +4177,34 @@ virFileReadValueString(char **value, const char *format, ...) VIR_FREE(str); return ret; } + + +/** + * virFileWaitForExists: + * @path: absolute path to a sysfs attribute (can be a symlink) + * @ms: how long to wait (in milliseconds) + * @tries: how many times should we try to wait for @path to become accessible + * + * Checks the existence of @path. In case the file defined by @path + * doesn't exist, we wait for it to appear in @ms milliseconds (for up to + * @tries attempts). + * + * Returns 0 on success, -1 on error, setting errno appropriately. + */ +int +virFileWaitForExists(const char *path, + size_t ms, + size_t tries) +{ + errno = 0; + + /* wait for @path to be accessible in @ms milliseconds, up to @tries */ + while (tries-- > 0 && !virFileExists(path)) { + if (tries == 0 || errno != ENOENT) + return -1; + + usleep(ms * 1000); + } + + return 0; +} diff --git a/src/util/virfile.h b/src/util/virfile.h index 21fb41b70..91d318622 100644 --- a/src/util/virfile.h +++ b/src/util/virfile.h @@ -349,6 +349,8 @@ int virFileReadValueScaledInt(unsigned long long *value, const char *format, ... int virFileReadValueString(char **value, const char *format, ...) ATTRIBUTE_FMT_PRINTF(2, 3); +int virFileWaitForExists(const char *path, size_t ms, size_t tries); + int virFileInData(int fd, int *inData, -- 2.13.6

If we find ourselves in the situation that the 'add' uevent has been fired earlier than the sysfs tree for a device was created, we should use the best-effort approach and give kernel some predetermined amount of time, thus waiting for the attributes to be ready rather than discarding the device from our device list forever. If those don't appear in the given time frame, we need to move on, since libvirt can't wait indefinitely. Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1463285 Signed-off-by: Erik Skultety <eskultet@redhat.com> --- src/node_device/node_device_udev.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index 9fa90257e..6686d4f3f 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -1188,9 +1188,23 @@ udevProcessMediatedDevice(struct udev_device *dev, char *canonicalpath = NULL; virNodeDevCapMdevPtr data = &def->caps->data.mdev; - if (virAsprintf(&linkpath, "%s/mdev_type", udev_device_get_syspath(dev)) < 0) + /* Because of a kernel uevent race, we might get the 'add' event prior to + * the sysfs tree being ready, so any attempt to access any sysfs attribute + * would result in ENOENT and us dropping the device, so let's work around + * it by waiting for the attributes to become available. + */ + + if (virAsprintf(&linkpath, "%s/mdev_type", + udev_device_get_syspath(dev)) < 0) goto cleanup; + if (virFileWaitForExists(linkpath, 1, 100) < 0) { + virReportSystemError(errno, + _("failed to wait for file '%s' to appear"), + linkpath); + goto cleanup; + } + if (virFileResolveLink(linkpath, &canonicalpath) < 0) { virReportSystemError(errno, _("failed to resolve '%s'"), linkpath); goto cleanup; -- 2.13.6

On 10/18/2017 09:52 AM, Erik Skultety wrote:
v5 here: https://www.redhat.com/archives/libvir-list/2017-October/msg00440.html
Since v5: - fixed minor nitpicks - added 3 more patches as per reviewer's suggestion to split some of the changes even more - patches {2,6,7,8,9}/9 are without any change
Erik Skultety (9): nodedev: Move privileged flag from udev private data to driver's state (NEW) nodedev: udev: Introduce udevEventMonitorSanityCheck helper function (ACKed) nodedev: udev: Convert udev private data to a lockable object (ACKed with fixes) nodedev: udev: Remove driver locks from stateInitialize and (NEW) stateCleanup nodedev: udev: Unlock the private data before setting up 'system' node (NEW) nodedev: udev: Split udevEventHandleCallback in two functions (ACKed) nodedev: udev: Convert udevEventHandleThread to an actual thread (ACKed) routine util: Introduce virFileWaitForExists (ACKed) nodedev: udev: Hook up virFileWaitForAccess to work around uevent race (ACKed)
src/conf/virnodedeviceobj.h | 1 + src/libvirt_private.syms | 1 + src/node_device/node_device_udev.c | 283 +++++++++++++++++++++++++++---------- src/node_device/node_device_udev.h | 3 - src/util/virfile.c | 31 ++++ src/util/virfile.h | 2 + 6 files changed, 245 insertions(+), 76 deletions(-)
-- 2.13.6
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list>
w/ one minor, noted adjustment in patch 5 Reviewed-by: John Ferlan <jferlan@redhat.com> (series) I ran through Coverity too... John

On Wed, Oct 18, 2017 at 05:13:49PM -0400, John Ferlan wrote:
On 10/18/2017 09:52 AM, Erik Skultety wrote:
v5 here: https://www.redhat.com/archives/libvir-list/2017-October/msg00440.html
Since v5: - fixed minor nitpicks - added 3 more patches as per reviewer's suggestion to split some of the changes even more - patches {2,6,7,8,9}/9 are without any change
Erik Skultety (9): nodedev: Move privileged flag from udev private data to driver's state (NEW) nodedev: udev: Introduce udevEventMonitorSanityCheck helper function (ACKed) nodedev: udev: Convert udev private data to a lockable object (ACKed with fixes) nodedev: udev: Remove driver locks from stateInitialize and (NEW) stateCleanup nodedev: udev: Unlock the private data before setting up 'system' node (NEW) nodedev: udev: Split udevEventHandleCallback in two functions (ACKed) nodedev: udev: Convert udevEventHandleThread to an actual thread (ACKed) routine util: Introduce virFileWaitForExists (ACKed) nodedev: udev: Hook up virFileWaitForAccess to work around uevent race (ACKed)
src/conf/virnodedeviceobj.h | 1 + src/libvirt_private.syms | 1 + src/node_device/node_device_udev.c | 283 +++++++++++++++++++++++++++---------- src/node_device/node_device_udev.h | 3 - src/util/virfile.c | 31 ++++ src/util/virfile.h | 2 + 6 files changed, 245 insertions(+), 76 deletions(-)
-- 2.13.6
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list>
w/ one minor, noted adjustment in patch 5
Reviewed-by: John Ferlan <jferlan@redhat.com> (series)
I ran through Coverity too...
Great, thanks a lot :). A just pushed the series. Erik
participants (2)
-
Erik Skultety
-
John Ferlan