[libvirt] [PATCH 0/6] Memory locking limit improvements

As noticed by Peter[1], the memory locking limit for the QEMU process is increased before assigning a VFIO device to a guest, but it might not be decreased when returning said device to the host. This series fixes this inconsistent behaviour and cleans up the code a little bit along the way. The idea is to introduce a new, smarter function called qemuDomainAdjustMaxMemLock() that does The Right Thing™ and increases the limit when required, while at the same time storing the original value. This way, when memory locking is no longer needed, it can restore it. I've tested this both on x86 and ppc64, both by removing devices that were assigned in the domain XML and devices that I had hotplugged. Patches 1-2 lay some groundwork by allowing retrieval of the memory locking limit for a process. Patches 3-4 add qemuDomainAdjustMaxMemLock() and use it where appropriate in the existing code. Patch 5 adds one more use of the function, after a PCI hostdev has been detached from the guest. Patch 6 replaces the use of Mlock with MemLock. Suggestions on how to further improve the names of those functions is very welcome, this is just a first step in the right direction. Cheers. [1] https://www.redhat.com/archives/libvir-list/2015-November/msg00642.html Andrea Bolognani (6): process: Allow virProcessPrLimit() to get current limit process: Add virProcessGetMaxMemLock() qemu: Add qemuDomainAdjustMaxMemLock() qemu: Use qemuDomainAdjustMaxMemLock() qemu: Reduce memlock limit after detaching hostdev qemu: Replace Mlock with MemLock in function names configure.ac | 2 +- src/conf/domain_conf.h | 3 +++ src/libvirt_private.syms | 1 + src/qemu/qemu_command.c | 4 ++-- src/qemu/qemu_domain.c | 56 +++++++++++++++++++++++++++++++++++++++++++--- src/qemu/qemu_domain.h | 5 +++-- src/qemu/qemu_hotplug.c | 46 ++++++++++++++++---------------------- src/util/virprocess.c | 58 +++++++++++++++++++++++++++++++++++++++++++----- src/util/virprocess.h | 2 ++ 9 files changed, 136 insertions(+), 41 deletions(-) -- 2.5.0

The prlimit() function allows both getting and setting limits for a process; expose the same functionality in our wrapper. Add the const modifier for new_limit, in accordance with the prototype for prlimit(). --- src/util/virprocess.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/util/virprocess.c b/src/util/virprocess.c index 4b18903..9b38834 100644 --- a/src/util/virprocess.c +++ b/src/util/virprocess.c @@ -725,15 +725,19 @@ int virProcessSetNamespaces(size_t nfdlist, #if HAVE_PRLIMIT static int -virProcessPrLimit(pid_t pid, int resource, struct rlimit *rlim) +virProcessPrLimit(pid_t pid, + int resource, + const struct rlimit *new_limit, + struct rlimit *old_limit) { - return prlimit(pid, resource, rlim, NULL); + return prlimit(pid, resource, new_limit, old_limit); } #elif HAVE_SETRLIMIT static int virProcessPrLimit(pid_t pid ATTRIBUTE_UNUSED, int resource ATTRIBUTE_UNUSED, - struct rlimit *rlim ATTRIBUTE_UNUSED) + const struct rlimit *new_limit ATTRIBUTE_UNUSED, + struct rlimit *old_limit ATTRIBUTE_UNUSED) { errno = ENOSYS; return -1; @@ -758,7 +762,7 @@ virProcessSetMaxMemLock(pid_t pid, unsigned long long bytes) return -1; } } else { - if (virProcessPrLimit(pid, RLIMIT_MEMLOCK, &rlim) < 0) { + if (virProcessPrLimit(pid, RLIMIT_MEMLOCK, &rlim, NULL) < 0) { virReportSystemError(errno, _("cannot limit locked memory " "of process %lld to %llu"), @@ -803,7 +807,7 @@ virProcessSetMaxProcesses(pid_t pid, unsigned int procs) return -1; } } else { - if (virProcessPrLimit(pid, RLIMIT_NPROC, &rlim) < 0) { + if (virProcessPrLimit(pid, RLIMIT_NPROC, &rlim, NULL) < 0) { virReportSystemError(errno, _("cannot limit number of subprocesses " "of process %lld to %u"), @@ -851,7 +855,7 @@ virProcessSetMaxFiles(pid_t pid, unsigned int files) return -1; } } else { - if (virProcessPrLimit(pid, RLIMIT_NOFILE, &rlim) < 0) { + if (virProcessPrLimit(pid, RLIMIT_NOFILE, &rlim, NULL) < 0) { virReportSystemError(errno, _("cannot limit number of open files " "of process %lld to %u"), -- 2.5.0

On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
The prlimit() function allows both getting and setting limits for a process; expose the same functionality in our wrapper.
Add the const modifier for new_limit, in accordance with the prototype for prlimit().
Since the current way hasn't bombed on some other compiler, I certainly hope adding const won't do so - I see no harm, but I've also know I never fully trust "other" compilers...
--- src/util/virprocess.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-)
ACK - I see no harm here (yet ;-)) John
diff --git a/src/util/virprocess.c b/src/util/virprocess.c index 4b18903..9b38834 100644 --- a/src/util/virprocess.c +++ b/src/util/virprocess.c @@ -725,15 +725,19 @@ int virProcessSetNamespaces(size_t nfdlist,
#if HAVE_PRLIMIT static int -virProcessPrLimit(pid_t pid, int resource, struct rlimit *rlim) +virProcessPrLimit(pid_t pid, + int resource, + const struct rlimit *new_limit, + struct rlimit *old_limit) { - return prlimit(pid, resource, rlim, NULL); + return prlimit(pid, resource, new_limit, old_limit); } #elif HAVE_SETRLIMIT static int virProcessPrLimit(pid_t pid ATTRIBUTE_UNUSED, int resource ATTRIBUTE_UNUSED, - struct rlimit *rlim ATTRIBUTE_UNUSED) + const struct rlimit *new_limit ATTRIBUTE_UNUSED, + struct rlimit *old_limit ATTRIBUTE_UNUSED) { errno = ENOSYS; return -1; @@ -758,7 +762,7 @@ virProcessSetMaxMemLock(pid_t pid, unsigned long long bytes) return -1; } } else { - if (virProcessPrLimit(pid, RLIMIT_MEMLOCK, &rlim) < 0) { + if (virProcessPrLimit(pid, RLIMIT_MEMLOCK, &rlim, NULL) < 0) { virReportSystemError(errno, _("cannot limit locked memory " "of process %lld to %llu"), @@ -803,7 +807,7 @@ virProcessSetMaxProcesses(pid_t pid, unsigned int procs) return -1; } } else { - if (virProcessPrLimit(pid, RLIMIT_NPROC, &rlim) < 0) { + if (virProcessPrLimit(pid, RLIMIT_NPROC, &rlim, NULL) < 0) { virReportSystemError(errno, _("cannot limit number of subprocesses " "of process %lld to %u"), @@ -851,7 +855,7 @@ virProcessSetMaxFiles(pid_t pid, unsigned int files) return -1; } } else { - if (virProcessPrLimit(pid, RLIMIT_NOFILE, &rlim) < 0) { + if (virProcessPrLimit(pid, RLIMIT_NOFILE, &rlim, NULL) < 0) { virReportSystemError(errno, _("cannot limit number of open files " "of process %lld to %u"),

This function can be used to retrieve the current locked memory limit for a process, so that the setting can be later restored. Add a configure check for getrlimit(), which we now use. --- configure.ac | 2 +- src/libvirt_private.syms | 1 + src/util/virprocess.c | 42 ++++++++++++++++++++++++++++++++++++++++++ src/util/virprocess.h | 2 ++ 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index f481c50..c2a567f 100644 --- a/configure.ac +++ b/configure.ac @@ -279,7 +279,7 @@ AC_CHECK_SIZEOF([long]) dnl Availability of various common functions (non-fatal if missing), dnl and various less common threadsafe functions AC_CHECK_FUNCS_ONCE([cfmakeraw fallocate geteuid getgid getgrnam_r \ - getmntent_r getpwuid_r getuid kill mmap newlocale posix_fallocate \ + getmntent_r getpwuid_r getrlimit getuid kill mmap newlocale posix_fallocate \ posix_memalign prlimit regexec sched_getaffinity setgroups setns \ setrlimit symlink sysctlbyname getifaddrs sched_setscheduler]) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 7e60d87..e330a16 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2022,6 +2022,7 @@ virPortAllocatorSetUsed; virProcessAbort; virProcessExitWithStatus; virProcessGetAffinity; +virProcessGetMaxMemLock; virProcessGetNamespaces; virProcessGetPids; virProcessGetStartTime; diff --git a/src/util/virprocess.c b/src/util/virprocess.c index 9b38834..b14164a 100644 --- a/src/util/virprocess.c +++ b/src/util/virprocess.c @@ -788,6 +788,48 @@ virProcessSetMaxMemLock(pid_t pid ATTRIBUTE_UNUSED, unsigned long long bytes) } #endif /* ! (HAVE_SETRLIMIT && defined(RLIMIT_MEMLOCK)) */ +#if HAVE_GETRLIMIT && defined(RLIMIT_MEMLOCK) +int +virProcessGetMaxMemLock(pid_t pid, + unsigned long long *bytes) +{ + struct rlimit rlim; + + if (!bytes) + return 0; + + if (pid == 0) { + if (getrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { + virReportSystemError(errno, + "%s", + _("cannot get locked memory limit")); + return -1; + } + } else { + if (virProcessPrLimit(pid, RLIMIT_MEMLOCK, NULL, &rlim) < 0) { + virReportSystemError(errno, + _("cannot get locked memory limit " + "of process %lld"), + (long long int) pid); + return -1; + } + } + + /* virProcessSetMaxMemLock() sets both rlim_cur and rlim_max to the + * same value, so we can retrieve just rlim_max here */ + *bytes = rlim.rlim_max; + + return 0; +} +#else /* ! (HAVE_GETRLIMIT && defined(RLIMIT_MEMLOCK)) */ +int +virProcessGetMaxMemLock(pid_t pid ATTRIBUTE_UNUSED, + unsigned long long *bytes) +{ + virReportSystemError(ENOSYS, "%s", _("Not supported on this platform")); + return -1; +} +#endif /* ! (HAVE_GETRLIMIT && defined(RLIMIT_MEMLOCK)) */ #if HAVE_SETRLIMIT && defined(RLIMIT_NPROC) int diff --git a/src/util/virprocess.h b/src/util/virprocess.h index 1768009..a7a1fe9 100644 --- a/src/util/virprocess.h +++ b/src/util/virprocess.h @@ -76,6 +76,8 @@ int virProcessSetMaxMemLock(pid_t pid, unsigned long long bytes); int virProcessSetMaxProcesses(pid_t pid, unsigned int procs); int virProcessSetMaxFiles(pid_t pid, unsigned int files); +int virProcessGetMaxMemLock(pid_t pid, unsigned long long *bytes); + /* Callback to run code within the mount namespace tied to the given * pid. This function must use only async-signal-safe functions, as * it gets run after a fork of a multi-threaded process. The return -- 2.5.0

On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
This function can be used to retrieve the current locked memory limit for a process, so that the setting can be later restored.
Add a configure check for getrlimit(), which we now use. --- configure.ac | 2 +- src/libvirt_private.syms | 1 + src/util/virprocess.c | 42 ++++++++++++++++++++++++++++++++++++++++++ src/util/virprocess.h | 2 ++ 4 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/configure.ac b/configure.ac index f481c50..c2a567f 100644 --- a/configure.ac +++ b/configure.ac @@ -279,7 +279,7 @@ AC_CHECK_SIZEOF([long]) dnl Availability of various common functions (non-fatal if missing), dnl and various less common threadsafe functions AC_CHECK_FUNCS_ONCE([cfmakeraw fallocate geteuid getgid getgrnam_r \ - getmntent_r getpwuid_r getuid kill mmap newlocale posix_fallocate \ + getmntent_r getpwuid_r getrlimit getuid kill mmap newlocale posix_fallocate \ posix_memalign prlimit regexec sched_getaffinity setgroups setns \ setrlimit symlink sysctlbyname getifaddrs sched_setscheduler])
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 7e60d87..e330a16 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2022,6 +2022,7 @@ virPortAllocatorSetUsed; virProcessAbort; virProcessExitWithStatus; virProcessGetAffinity; +virProcessGetMaxMemLock; virProcessGetNamespaces; virProcessGetPids; virProcessGetStartTime; diff --git a/src/util/virprocess.c b/src/util/virprocess.c index 9b38834..b14164a 100644 --- a/src/util/virprocess.c +++ b/src/util/virprocess.c @@ -788,6 +788,48 @@ virProcessSetMaxMemLock(pid_t pid ATTRIBUTE_UNUSED, unsigned long long bytes) } #endif /* ! (HAVE_SETRLIMIT && defined(RLIMIT_MEMLOCK)) */
+#if HAVE_GETRLIMIT && defined(RLIMIT_MEMLOCK) +int +virProcessGetMaxMemLock(pid_t pid, + unsigned long long *bytes)
Another option would be to return bytes... where at least as I read patch 3 bytes == 0 is no different than original_memlock == 0 especially if arg2 is NULL Of course, does calling getrlimit or virProcessPrLimit() potentially multiple times make sense? Does blasting those error messages each time to the log, but continuing on cause confusion? I guess what I'm thinking about is how it's eventually used in patch 3 and the concept of failure because something in here fails or perhaps the getrlimit *or* prlimit isn't supported on the platform...
+{ + struct rlimit rlim; + + if (!bytes) + return 0;
Since you return 0 here if passed a NULL bytes, then I think you'd have to do so in the other API
+ + if (pid == 0) { + if (getrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { + virReportSystemError(errno, + "%s",
This one could go on previous line - no big deal other than extra line
+ _("cannot get locked memory limit")); + return -1; + } + } else { + if (virProcessPrLimit(pid, RLIMIT_MEMLOCK, NULL, &rlim) < 0) { + virReportSystemError(errno, + _("cannot get locked memory limit " + "of process %lld"), + (long long int) pid); + return -1; + } + } + + /* virProcessSetMaxMemLock() sets both rlim_cur and rlim_max to the + * same value, so we can retrieve just rlim_max here */ + *bytes = rlim.rlim_max;
One oddball thought... what if rlim.rlim_max == 0? Is that possible? I suppose only if rlim_cur == 0 too, right? Not sure it makes sense and I didn't chase the syscall. It does say rlim_cur can be 0 and that rlim_max must be equal or higher... I'm just trying to logically follow through on the thought of how 0 is used in patch 3.
+ + return 0; +} +#else /* ! (HAVE_GETRLIMIT && defined(RLIMIT_MEMLOCK)) */ +int +virProcessGetMaxMemLock(pid_t pid ATTRIBUTE_UNUSED, + unsigned long long *bytes)
Would technically be unused if kept as is... However since the other API returns 0 when value is passed as NULL, this could too. John
+{ + virReportSystemError(ENOSYS, "%s", _("Not supported on this platform")); + return -1; +} +#endif /* ! (HAVE_GETRLIMIT && defined(RLIMIT_MEMLOCK)) */
#if HAVE_SETRLIMIT && defined(RLIMIT_NPROC) int diff --git a/src/util/virprocess.h b/src/util/virprocess.h index 1768009..a7a1fe9 100644 --- a/src/util/virprocess.h +++ b/src/util/virprocess.h @@ -76,6 +76,8 @@ int virProcessSetMaxMemLock(pid_t pid, unsigned long long bytes); int virProcessSetMaxProcesses(pid_t pid, unsigned int procs); int virProcessSetMaxFiles(pid_t pid, unsigned int files);
+int virProcessGetMaxMemLock(pid_t pid, unsigned long long *bytes); + /* Callback to run code within the mount namespace tied to the given * pid. This function must use only async-signal-safe functions, as * it gets run after a fork of a multi-threaded process. The return

On Wed, 2015-12-09 at 12:00 -0500, John Ferlan wrote:
On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
@@ -788,6 +788,48 @@ virProcessSetMaxMemLock(pid_t pid ATTRIBUTE_UNUSED, unsigned long long bytes) } #endif /* ! (HAVE_SETRLIMIT && defined(RLIMIT_MEMLOCK)) */ +#if HAVE_GETRLIMIT && defined(RLIMIT_MEMLOCK) +int +virProcessGetMaxMemLock(pid_t pid, + unsigned long long *bytes)
Another option would be to return bytes...
where at least as I read patch 3 bytes == 0 is no different than original_memlock == 0 especially if arg2 is NULL
Of course, does calling getrlimit or virProcessPrLimit() potentially multiple times make sense? Does blasting those error messages each time to the log, but continuing on cause confusion?
I guess what I'm thinking about is how it's eventually used in patch 3 and the concept of failure because something in here fails or perhaps the getrlimit *or* prlimit isn't supported on the platform...
I think ignoring failures in getting the current memory locking limit is the right thing to do in the upper layers, as that just means we won't be able to restore the original limit afterwards and that's not really a huge problem. However, at this level, we're dealing with low-level functionality and I think all failures should be reported appropriately, eg. with a negative return value and a proper error logged.
+{ + struct rlimit rlim; + + if (!bytes) + return 0;
Since you return 0 here if passed a NULL bytes, then I think you'd have to do so in the other API
What other API do you mean?
+ + if (pid == 0) { + if (getrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { + virReportSystemError(errno, + "%s",
This one could go on previous line - no big deal other than extra line
I'd rather squeeze the second and third lines together or leave it as it is, if that's the same to you.
+ _("cannot get locked memory limit")); + return -1; + } + } else { + if (virProcessPrLimit(pid, RLIMIT_MEMLOCK, NULL, &rlim) < 0) { + virReportSystemError(errno, + _("cannot get locked memory limit " + "of process %lld"), + (long long int) pid); + return -1; + } + } + + /* virProcessSetMaxMemLock() sets both rlim_cur and rlim_max to the + * same value, so we can retrieve just rlim_max here */ + *bytes = rlim.rlim_max;
One oddball thought... what if rlim.rlim_max == 0? Is that possible? I suppose only if rlim_cur == 0 too, right? Not sure it makes sense and I didn't chase the syscall. It does say rlim_cur can be 0 and that rlim_max must be equal or higher...
I'm just trying to logically follow through on the thought of how 0 is used in patch 3.
I don't know, but I don't think we should concern ourself too much with that case as virProcessSetMaxMemLock(), which will be used to restore whatever value this function returns, ignores the value 0.
+ + return 0; +} +#else /* ! (HAVE_GETRLIMIT && defined(RLIMIT_MEMLOCK)) */ +int +virProcessGetMaxMemLock(pid_t pid ATTRIBUTE_UNUSED, + unsigned long long *bytes)
Would technically be unused if kept as is... However since the other API returns 0 when value is passed as NULL, this could too.
Having a function that either fails as unimplemented or succeeds depending on its arguments doesn't feel right to me. I see, however, that all virProcessSetMax*() functions behave this way already so it makes sense to change do the same for consistency's sake. Cheers. -- Andrea Bolognani Software Engineer - Virtualization Team

On 12/10/2015 11:33 AM, Andrea Bolognani wrote:
On Wed, 2015-12-09 at 12:00 -0500, John Ferlan wrote:
On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
@@ -788,6 +788,48 @@ virProcessSetMaxMemLock(pid_t pid ATTRIBUTE_UNUSED, unsigned long long bytes) } #endif /* ! (HAVE_SETRLIMIT && defined(RLIMIT_MEMLOCK)) */
+#if HAVE_GETRLIMIT && defined(RLIMIT_MEMLOCK) +int +virProcessGetMaxMemLock(pid_t pid, + unsigned long long *bytes)
Another option would be to return bytes...
where at least as I read patch 3 bytes == 0 is no different than original_memlock == 0 especially if arg2 is NULL
Of course, does calling getrlimit or virProcessPrLimit() potentially multiple times make sense? Does blasting those error messages each time to the log, but continuing on cause confusion?
I guess what I'm thinking about is how it's eventually used in patch 3 and the concept of failure because something in here fails or perhaps the getrlimit *or* prlimit isn't supported on the platform...
I think ignoring failures in getting the current memory locking limit is the right thing to do in the upper layers, as that just means we won't be able to restore the original limit afterwards and that's not really a huge problem.
However, at this level, we're dealing with low-level functionality and I think all failures should be reported appropriately, eg. with a negative return value and a proper error logged.
+{ + struct rlimit rlim; + + if (!bytes) + return 0;
Since you return 0 here if passed a NULL bytes, then I think you'd have to do so in the other API
What other API do you mean?
in the code after this: +#else /* ! (HAVE_GETRLIMIT && defined(RLIMIT_MEMLOCK)) */ +int +virProcessGetMaxMemLock(pid_t pid ATTRIBUTE_UNUSED, + unsigned long long *bytes)
+ + if (pid == 0) { + if (getrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { + virReportSystemError(errno, + "%s",
This one could go on previous line - no big deal other than extra line
I'd rather squeeze the second and third lines together or leave it as it is, if that's the same to you.
It really doesn't matter - just seems strange to have "%s", on its own line especially when there's space on the prior line.
+ _("cannot get locked memory limit")); + return -1; + } + } else { + if (virProcessPrLimit(pid, RLIMIT_MEMLOCK, NULL, &rlim) < 0) { + virReportSystemError(errno, + _("cannot get locked memory limit " + "of process %lld"), + (long long int) pid); + return -1; + } + } + + /* virProcessSetMaxMemLock() sets both rlim_cur and rlim_max to the + * same value, so we can retrieve just rlim_max here */ + *bytes = rlim.rlim_max;
One oddball thought... what if rlim.rlim_max == 0? Is that possible? I suppose only if rlim_cur == 0 too, right? Not sure it makes sense and I didn't chase the syscall. It does say rlim_cur can be 0 and that rlim_max must be equal or higher...
I'm just trying to logically follow through on the thought of how 0 is used in patch 3.
I don't know, but I don't think we should concern ourself too much with that case as virProcessSetMaxMemLock(), which will be used to restore whatever value this function returns, ignores the value 0.
Fair enough...
+ + return 0; +} +#else /* ! (HAVE_GETRLIMIT && defined(RLIMIT_MEMLOCK)) */ +int +virProcessGetMaxMemLock(pid_t pid ATTRIBUTE_UNUSED, + unsigned long long *bytes)
Would technically be unused if kept as is... However since the other API returns 0 when value is passed as NULL, this could too.
Having a function that either fails as unimplemented or succeeds depending on its arguments doesn't feel right to me.
I see, however, that all virProcessSetMax*() functions behave this way already so it makes sense to change do the same for consistency's sake.
Who's to say which is the right way... But since we declare using 0 does nothing, then playing following the leader shouldn't hurt in this case. John

This function detects whether a domain needs RLIMIT_MEMLOCK to be set, and if so, uses an appropriate value. It also stores the original value inside the virDomainObj for the domain so that it can be later restored. --- src/conf/domain_conf.h | 3 +++ src/qemu/qemu_domain.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_domain.h | 1 + 3 files changed, 54 insertions(+) diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 8d43ee6..9e28ac9 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -2386,6 +2386,9 @@ struct _virDomainObj { void (*privateDataFreeFunc)(void *); int taint; + + unsigned long long original_memlock; /* Original RLIMIT_MEMLOCK, zero if no + * restore will be required later */ }; typedef struct _virDomainObjList virDomainObjList; diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 18513f9..51a1770 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -40,6 +40,7 @@ #include "virstoragefile.h" #include "virstring.h" #include "virthreadjob.h" +#include "virprocess.h" #include "storage/storage_driver.h" @@ -3954,3 +3955,52 @@ qemuDomainRequiresMlock(virDomainDefPtr def) return false; } + +/** + * qemuDomainAdjustMaxMemLock: + * + * @vm: domain + * + * Adjust the memory locking limit for the QEMU process associated to @vm, in + * order to comply with VFIO or architecture requirements. + * + * The limit will not be changed unless doing so is needed; the first time + * the limit is changed, the original (default) limit is stored in @vm and + * that value will be restored if qemuDomainAdjustMaxMemLock() is called once + * memory locking is no longer required. + * + * Returns: 0 on success, <0 on failure + */ +int +qemuDomainAdjustMaxMemLock(virDomainObjPtr vm) +{ + unsigned long long bytes = 0; + int ret = -1; + + if (qemuDomainRequiresMlock(vm->def)) { + /* If this is the first time adjusting the limit, save the current + * value so that we can restore it once memory locking is no longer + * required */ + if (!vm->original_memlock) { + if (virProcessGetMaxMemLock(vm->pid, &(vm->original_memlock)) < 0) + goto out; + } + bytes = qemuDomainGetMlockLimitBytes(vm->def); + } else { + /* Once memory locking is no longer required, we can restore the + * original, usually very low, limit */ + bytes = vm->original_memlock; + vm->original_memlock = 0; + } + + /* Don't do anything unless we're actually setting a limit */ + if (bytes) { + if (virProcessSetMaxMemLock(vm->pid, bytes) < 0) + goto out; + } + + ret = 0; + + out: + return ret; +} diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 271dce9..cc64df3 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -486,6 +486,7 @@ int qemuDomainUpdateCurrentMemorySize(virQEMUDriverPtr driver, unsigned long long qemuDomainGetMlockLimitBytes(virDomainDefPtr def); bool qemuDomainRequiresMlock(virDomainDefPtr def); +int qemuDomainAdjustMaxMemLock(virDomainObjPtr vm); int qemuDomainDefValidateMemoryHotplug(const virDomainDef *def, virQEMUCapsPtr qemuCaps, -- 2.5.0

On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
This function detects whether a domain needs RLIMIT_MEMLOCK to be set, and if so, uses an appropriate value.
It also stores the original value inside the virDomainObj for the domain so that it can be later restored. --- src/conf/domain_conf.h | 3 +++ src/qemu/qemu_domain.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_domain.h | 1 + 3 files changed, 54 insertions(+)
NB: This patch wouldn't "git am -3" apply for me - looks like you'll be having some merge conflicts to deal with.
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 8d43ee6..9e28ac9 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -2386,6 +2386,9 @@ struct _virDomainObj { void (*privateDataFreeFunc)(void *);
int taint; + + unsigned long long original_memlock; /* Original RLIMIT_MEMLOCK, zero if no + * restore will be required later */ };
typedef struct _virDomainObjList virDomainObjList; diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 18513f9..51a1770 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -40,6 +40,7 @@ #include "virstoragefile.h" #include "virstring.h" #include "virthreadjob.h" +#include "virprocess.h"
#include "storage/storage_driver.h"
@@ -3954,3 +3955,52 @@ qemuDomainRequiresMlock(virDomainDefPtr def)
return false; } + +/** + * qemuDomainAdjustMaxMemLock: + * + * @vm: domain + * + * Adjust the memory locking limit for the QEMU process associated to @vm, in + * order to comply with VFIO or architecture requirements. + * + * The limit will not be changed unless doing so is needed; the first time + * the limit is changed, the original (default) limit is stored in @vm and + * that value will be restored if qemuDomainAdjustMaxMemLock() is called once + * memory locking is no longer required. + * + * Returns: 0 on success, <0 on failure + */ +int +qemuDomainAdjustMaxMemLock(virDomainObjPtr vm)
It seems this function does two things... #1 Get some original value if the domain requires mlock #2 Restores the original value Not sure it's best to mix the logic...
+{ + unsigned long long bytes = 0; + int ret = -1; + + if (qemuDomainRequiresMlock(vm->def)) { + /* If this is the first time adjusting the limit, save the current + * value so that we can restore it once memory locking is no longer + * required */ + if (!vm->original_memlock) { + if (virProcessGetMaxMemLock(vm->pid, &(vm->original_memlock)) < 0) + goto out;
Failure scenario 1 - I couldn't get some original value Should this really be a failure? Could fail for 4 reasons: 1. getrlimit returns fail when pid == 0 2. getrlimit && RLIMIT_MEMLOCK not available 3. prlimit returns fail 4. prlimit not available or in the case shown by commit id '05f79a38' the oddity from a mingw build. Given how the code is written today, if arg2 was NULL then a "0" would have been returned. So if it returns 0, then that means the "ability" to reset to some initial value is not available... Thus we just return skinny, dumb, and happy rather than failing. EG no worse than today. For archs/envs that support getting the value - sure we can support resetting the value...
+ } + bytes = qemuDomainGetMlockLimitBytes(vm->def); + } else { + /* Once memory locking is no longer required, we can restore the + * original, usually very low, limit */ + bytes = vm->original_memlock;
Not sure I understand what is meant by "is no longer required" (yet - I haven't read forward)... However, if it never was required, then all the following code does nothing since original_memlock would be 0 (and nothing here or in future patches) sets it unless the domain requires it. IOW: How/why does a domain go from at one point requiring Mlock to not requiring it? It's not lock a running PPC64 domain is going to change it's arch type...
+ vm->original_memlock = 0; + } + + /* Don't do anything unless we're actually setting a limit */ + if (bytes) { + if (virProcessSetMaxMemLock(vm->pid, bytes) < 0)
Failure scenario 2 - I couldn't set a new limit. E.G - in my mind the true failure. I couldn't set a new value. John
+ goto out; + } + + ret = 0; + + out: + return ret; +} diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 271dce9..cc64df3 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -486,6 +486,7 @@ int qemuDomainUpdateCurrentMemorySize(virQEMUDriverPtr driver,
unsigned long long qemuDomainGetMlockLimitBytes(virDomainDefPtr def); bool qemuDomainRequiresMlock(virDomainDefPtr def); +int qemuDomainAdjustMaxMemLock(virDomainObjPtr vm);
int qemuDomainDefValidateMemoryHotplug(const virDomainDef *def, virQEMUCapsPtr qemuCaps,

On Wed, 2015-12-09 at 12:19 -0500, John Ferlan wrote:
On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
This function detects whether a domain needs RLIMIT_MEMLOCK to be set, and if so, uses an appropriate value.
It also stores the original value inside the virDomainObj for the domain so that it can be later restored. --- src/conf/domain_conf.h | 3 +++ src/qemu/qemu_domain.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_domain.h | 1 + 3 files changed, 54 insertions(+)
NB: This patch wouldn't "git am -3" apply for me - looks like you'll be having some merge conflicts to deal with.
Yeah, a rebase was definitely in order :)
+/** + * qemuDomainAdjustMaxMemLock: + * + * @vm: domain + * + * Adjust the memory locking limit for the QEMU process associated to @vm, in + * order to comply with VFIO or architecture requirements. + * + * The limit will not be changed unless doing so is needed; the first time + * the limit is changed, the original (default) limit is stored in @vm and + * that value will be restored if qemuDomainAdjustMaxMemLock() is called once + * memory locking is no longer required. + * + * Returns: 0 on success, <0 on failure + */ +int +qemuDomainAdjustMaxMemLock(virDomainObjPtr vm)
It seems this function does two things...
#1 Get some original value if the domain requires mlock #2 Restores the original value
Not sure it's best to mix the logic...
I believe you understand the rationale now.
+{ + unsigned long long bytes = 0; + int ret = -1; + + if (qemuDomainRequiresMlock(vm->def)) { + /* If this is the first time adjusting the limit, save the current + * value so that we can restore it once memory locking is no longer + * required */ + if (!vm->original_memlock) { + if (virProcessGetMaxMemLock(vm->pid, &(vm->original_memlock)) < 0) + goto out;
Failure scenario 1 - I couldn't get some original value
Should this really be a failure? Could fail for 4 reasons:
1. getrlimit returns fail when pid == 0 2. getrlimit && RLIMIT_MEMLOCK not available 3. prlimit returns fail 4. prlimit not available or in the case shown by commit id '05f79a38' the oddity from a mingw build.
Given how the code is written today, if arg2 was NULL then a "0" would have been returned.
So if it returns 0, then that means the "ability" to reset to some initial value is not available... Thus we just return skinny, dumb, and happy rather than failing. EG no worse than today.
For archs/envs that support getting the value - sure we can support resetting the value...
Makes sense. Moreover, I can't really imagine a case where getting the current memory limit would fail but setting a new one immediately afterwards would succeed instead.
+ } + bytes = qemuDomainGetMlockLimitBytes(vm->def); + } else { + /* Once memory locking is no longer required, we can restore the + * original, usually very low, limit */ + bytes = vm->original_memlock;
Not sure I understand what is meant by "is no longer required" (yet - I haven't read forward)... However, if it never was required, then all the following code does nothing since original_memlock would be 0 (and nothing here or in future patches) sets it unless the domain requires it.
IOW: How/why does a domain go from at one point requiring Mlock to not requiring it? It's not lock a running PPC64 domain is going to change it's arch type...
I believe this, too, is now clear to you.
+ vm->original_memlock = 0; + } + + /* Don't do anything unless we're actually setting a limit */ + if (bytes) { + if (virProcessSetMaxMemLock(vm->pid, bytes) < 0)
Failure scenario 2 - I couldn't set a new limit. E.G - in my mind the true failure. I couldn't set a new value.
Of course, this one should keep on being be a severe failure and abort the whole operation. Cheers. -- Andrea Bolognani Software Engineer - Virtualization Team

On 11/24/2015 08:56 AM, Andrea Bolognani wrote: [...]
+ } + + /* Don't do anything unless we're actually setting a limit */ + if (bytes) { + if (virProcessSetMaxMemLock(vm->pid, bytes) < 0) + goto out; + }
virProcessSetMaxMemLock aready checks: if (bytes == 0) return 0; So the "if (bytes)" is unnecessary John

On Wed, 2015-12-09 at 13:02 -0500, John Ferlan wrote:
On 11/24/2015 08:56 AM, Andrea Bolognani wrote: [...]
+ } + + /* Don't do anything unless we're actually setting a limit */ + if (bytes) { + if (virProcessSetMaxMemLock(vm->pid, bytes) < 0) + goto out; + }
virProcessSetMaxMemLock aready checks:
if (bytes == 0) return 0;
So the "if (bytes)" is unnecessary
Good catch, I will remove the check and update the comment. Cheers. -- Andrea Bolognani Software Engineer - Virtualization Team

Replace all uses of the qemuDomainRequiresMlock/virProcessSetMaxMemLock combination with the equivalent qemuDomainAdjustMaxMemLock() call. --- src/qemu/qemu_hotplug.c | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index 8804d3d..a5c134a 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1276,17 +1276,14 @@ qemuDomainAttachHostPCIDevice(virQEMUDriverPtr driver, } /* Temporarily add the hostdev to the domain definition. This is needed - * because qemuDomainRequiresMlock() and qemuDomainGetMlockLimitBytes() - * require the hostdev to be already part of the domain definition, but - * other functions like qemuAssignDeviceHostdevAlias() used below expect - * it *not* to be there. A better way to handle this would be nice */ + * because qemuDomainAdjustMaxMemLock() requires the hostdev to be already + * part of the domain definition, but other functions like + * qemuAssignDeviceHostdevAlias() used below expect it *not* to be there. + * A better way to handle this would be nice */ vm->def->hostdevs[vm->def->nhostdevs++] = hostdev; - if (qemuDomainRequiresMlock(vm->def)) { - if (virProcessSetMaxMemLock(vm->pid, - qemuDomainGetMlockLimitBytes(vm->def)) < 0) { - vm->def->hostdevs[--(vm->def->nhostdevs)] = NULL; - goto error; - } + if (qemuDomainAdjustMaxMemLock(vm) < 0) { + vm->def->hostdevs[--(vm->def->nhostdevs)] = NULL; + goto error; } vm->def->hostdevs[--(vm->def->nhostdevs)] = NULL; @@ -1772,7 +1769,6 @@ qemuDomainAttachMemory(virQEMUDriverPtr driver, virJSONValuePtr props = NULL; virObjectEventPtr event; bool fix_balloon = false; - bool mlock = false; int id; int ret = -1; @@ -1804,12 +1800,7 @@ qemuDomainAttachMemory(virQEMUDriverPtr driver, goto cleanup; } - mlock = qemuDomainRequiresMlock(vm->def); - - if (mlock && - virProcessSetMaxMemLock(vm->pid, - qemuDomainGetMlockLimitBytes(vm->def)) < 0) { - mlock = false; + if (qemuDomainAdjustMaxMemLock(vm) < 0) { virJSONValueFree(props); goto removedef; } @@ -1870,13 +1861,10 @@ qemuDomainAttachMemory(virQEMUDriverPtr driver, mem = NULL; /* reset the mlock limit */ - if (mlock) { - virErrorPtr err = virSaveLastError(); - ignore_value(virProcessSetMaxMemLock(vm->pid, - qemuDomainGetMlockLimitBytes(vm->def))); - virSetError(err); - virFreeError(err); - } + virErrorPtr err = virSaveLastError(); + ignore_value(qemuDomainAdjustMaxMemLock(vm)); + virSetError(err); + virFreeError(err); goto audit; } @@ -2970,9 +2958,7 @@ qemuDomainRemoveMemoryDevice(virQEMUDriverPtr driver, virDomainMemoryDefFree(mem); /* decrease the mlock limit after memory unplug if necessary */ - if (qemuDomainRequiresMlock(vm->def)) - ignore_value(virProcessSetMaxMemLock(vm->pid, - qemuDomainGetMlockLimitBytes(vm->def))); + ignore_value(qemuDomainAdjustMaxMemLock(vm)); return 0; } -- 2.5.0

On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
Replace all uses of the qemuDomainRequiresMlock/virProcessSetMaxMemLock combination with the equivalent qemuDomainAdjustMaxMemLock() call. --- src/qemu/qemu_hotplug.c | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-)
OK - so now I see how this is going to be used... patch 3's multi purpose MaxMemLock makes a bit more sense; however, I'm still wondering how we'd ever get to a point where the a last removal wouldn't have set the lockbytes value to anything but the original (yes, patch 5 is missing from the current removal)... Seems like perhaps the original_memlock is fixing the bug that was shown in patch 5 - that the hostdev removal didn't return our value properly. IOW: If the original_memlock adjustment was taken out, is there a case (after patch 5 is applied) where when memory locking is no longer required that the value after the removal wouldn't have been what was saved in original_memlock. Perhaps would have been easier to have added the Adjust code without original_memlock, then replace code as is done here, then add the hostdev case, then add the original value. Just trying to separate new functionality from code motion/creation of helper code.
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index 8804d3d..a5c134a 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1276,17 +1276,14 @@ qemuDomainAttachHostPCIDevice(virQEMUDriverPtr driver, }
/* Temporarily add the hostdev to the domain definition. This is needed - * because qemuDomainRequiresMlock() and qemuDomainGetMlockLimitBytes() - * require the hostdev to be already part of the domain definition, but - * other functions like qemuAssignDeviceHostdevAlias() used below expect - * it *not* to be there. A better way to handle this would be nice */ + * because qemuDomainAdjustMaxMemLock() requires the hostdev to be already + * part of the domain definition, but other functions like + * qemuAssignDeviceHostdevAlias() used below expect it *not* to be there. + * A better way to handle this would be nice */ vm->def->hostdevs[vm->def->nhostdevs++] = hostdev; - if (qemuDomainRequiresMlock(vm->def)) { - if (virProcessSetMaxMemLock(vm->pid, - qemuDomainGetMlockLimitBytes(vm->def)) < 0) { - vm->def->hostdevs[--(vm->def->nhostdevs)] = NULL; - goto error; - } + if (qemuDomainAdjustMaxMemLock(vm) < 0) { + vm->def->hostdevs[--(vm->def->nhostdevs)] = NULL; + goto error; } vm->def->hostdevs[--(vm->def->nhostdevs)] = NULL;
@@ -1772,7 +1769,6 @@ qemuDomainAttachMemory(virQEMUDriverPtr driver, virJSONValuePtr props = NULL; virObjectEventPtr event; bool fix_balloon = false; - bool mlock = false; int id; int ret = -1;
@@ -1804,12 +1800,7 @@ qemuDomainAttachMemory(virQEMUDriverPtr driver, goto cleanup; }
- mlock = qemuDomainRequiresMlock(vm->def); - - if (mlock && - virProcessSetMaxMemLock(vm->pid, - qemuDomainGetMlockLimitBytes(vm->def)) < 0) { - mlock = false; + if (qemuDomainAdjustMaxMemLock(vm) < 0) {
The current code says don't run the "reset" code if the SetMaxMemLock failed... With this change, in the removedef code you'll call the AdjustMaxMemLock regardless of whether we failed to set. That fail to set (at this point in time) could be the "GetMaxMemLock" failed or the "SetMaxMemLock" failed. The point being at removedef, how do we know if the failure was because we failed to Adjust or something else. If we failed to Adjust, then calling it again will of course more than likely fail. John
virJSONValueFree(props); goto removedef; } @@ -1870,13 +1861,10 @@ qemuDomainAttachMemory(virQEMUDriverPtr driver, mem = NULL;
/* reset the mlock limit */ - if (mlock) { - virErrorPtr err = virSaveLastError(); - ignore_value(virProcessSetMaxMemLock(vm->pid, - qemuDomainGetMlockLimitBytes(vm->def))); - virSetError(err); - virFreeError(err); - } + virErrorPtr err = virSaveLastError(); + ignore_value(qemuDomainAdjustMaxMemLock(vm)); + virSetError(err); + virFreeError(err);
goto audit; } @@ -2970,9 +2958,7 @@ qemuDomainRemoveMemoryDevice(virQEMUDriverPtr driver, virDomainMemoryDefFree(mem);
/* decrease the mlock limit after memory unplug if necessary */ - if (qemuDomainRequiresMlock(vm->def)) - ignore_value(virProcessSetMaxMemLock(vm->pid, - qemuDomainGetMlockLimitBytes(vm->def))); + ignore_value(qemuDomainAdjustMaxMemLock(vm));
return 0; }

On Wed, 2015-12-09 at 15:15 -0500, John Ferlan wrote:
On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
Replace all uses of the qemuDomainRequiresMlock/virProcessSetMaxMemLock combination with the equivalent qemuDomainAdjustMaxMemLock() call. --- src/qemu/qemu_hotplug.c | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-)
OK - so now I see how this is going to be used... patch 3's multi purpose MaxMemLock makes a bit more sense; however, I'm still wondering how we'd ever get to a point where the a last removal wouldn't have set the lockbytes value to anything but the original (yes, patch 5 is missing from the current removal)...
Seems like perhaps the original_memlock is fixing the bug that was shown in patch 5 - that the hostdev removal didn't return our value properly.
IOW: If the original_memlock adjustment was taken out, is there a case (after patch 5 is applied) where when memory locking is no longer required that the value after the removal wouldn't have been what was saved in original_memlock.
Hi John, first of all, thanks for the review. I'm in the process of going through all your comments, but I wanted to address this one first. I tried to give a somewhat detailed overview of the series in the cover letter to help reviewers understand how the single commits fit toghether, but it's always hard to balance the amount of information when you've just finished implementing something and all the details are still so completely clear in your head. Any tips on how the overview could have been more helpful are appreciated! The whole issue stems from the use of this kind of construct: if (qemuDomainRequiresMlock(vm->def)) virProcessSetMaxMemLock(vm->pid, qemuDomainGetMlockLimitBytes(vm->def)); qemuDomainRequiresMlock() is always true on ppc64, but on x86 it's true only if at least one VFIO device is currently assigned to the guest, which means it can return false both because no VFIO device has been assigned to the guest yet or because we just removed the last one. In the first case, we don't need to adjust the memory lock limit and everything is peachy; in the latter case, though, we *had* increased the memory locking limit when we assigned the first VFIO device and we should lower it now that's no longer required, but because of the code above we don't. qemuDomainAdjustMaxMemLock() solves this by storing the previous memory locking limit when first increasing it, so now we can tell whether memory locking is simply not needed or *no longer* needed, and we can behave accordingly - not doing anything in the former case, restoring the previous value in the latter.
Perhaps would have been easier to have added the Adjust code without original_memlock, then replace code as is done here, then add the hostdev case, then add the original value. Just trying to separate new functionality from code motion/creation of helper code.
It's never easy to order changes, is it? :) I can see how using the ordering you propose would make it easier to follow along, though. I will shuffle commits according to your suggestion for v2. Cheers. -- Andrea Bolognani Software Engineer - Virtualization Team

On 12/10/2015 05:43 AM, Andrea Bolognani wrote:
On Wed, 2015-12-09 at 15:15 -0500, John Ferlan wrote:
On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
Replace all uses of the qemuDomainRequiresMlock/virProcessSetMaxMemLock combination with the equivalent qemuDomainAdjustMaxMemLock() call. --- src/qemu/qemu_hotplug.c | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-)
OK - so now I see how this is going to be used... patch 3's multi purpose MaxMemLock makes a bit more sense; however, I'm still wondering how we'd ever get to a point where the a last removal wouldn't have set the lockbytes value to anything but the original (yes, patch 5 is missing from the current removal)...
Seems like perhaps the original_memlock is fixing the bug that was shown in patch 5 - that the hostdev removal didn't return our value properly.
IOW: If the original_memlock adjustment was taken out, is there a case (after patch 5 is applied) where when memory locking is no longer required that the value after the removal wouldn't have been what was saved in original_memlock.
Hi John,
first of all, thanks for the review. I'm in the process of going through all your comments, but I wanted to address this one first.
I tried to give a somewhat detailed overview of the series in the cover letter to help reviewers understand how the single commits fit toghether, but it's always hard to balance the amount of information when you've just finished implementing something and all the details are still so completely clear in your head. Any tips on how the overview could have been more helpful are appreciated!
I always appreciate extra comments and cover letters, but they are a delicate balance. It's hard when you're so embroiled in your patches to step back and take that 10,000 foot view. There's no secret sauce for that. Yes I read cover letters, but then I start reviewing patches and that cover letter can easily get paged out of short term memory. When I review I try to take patches one at a time, but I will peek ahead as well when I get curious about the why something is being done. I'll also try to provide enough context in reviews to try an "show" what I was thinking while reviewing. Seeing terse reviews of "this is wrong" or "I wouldn't do it that way" in my mind are not very helpful.
The whole issue stems from the use of this kind of construct:
if (qemuDomainRequiresMlock(vm->def)) virProcessSetMaxMemLock(vm->pid, qemuDomainGetMlockLimitBytes(vm->def));
qemuDomainRequiresMlock() is always true on ppc64, but on x86 it's true only if at least one VFIO device is currently assigned to the guest, which means it can return false both because no VFIO device has been assigned to the guest yet or because we just removed the last one.
As I got to patch 4 & 5 I think it started to sink in again that the issue was more because of x86. Once I read and thought about patch 5, then I went back to my patch 4 comments and added the extra wording around whether or not using original_memlock would be useful and of course to separate out "code motion" from "new code". Of course by that time patch 3 was already sent so I couldn't go "adjust" my comments there.
In the first case, we don't need to adjust the memory lock limit and everything is peachy; in the latter case, though, we *had* increased the memory locking limit when we assigned the first VFIO device and we should lower it now that's no longer required, but because of the code above we don't.
qemuDomainAdjustMaxMemLock() solves this by storing the previous memory locking limit when first increasing it, so now we can tell whether memory locking is simply not needed or *no longer* needed, and we can behave accordingly - not doing anything in the former case, restoring the previous value in the latter.
Perhaps would have been easier to have added the Adjust code without original_memlock, then replace code as is done here, then add the hostdev case, then add the original value. Just trying to separate new functionality from code motion/creation of helper code.
It's never easy to order changes, is it? :)
Well better than what my former job had - here's a steaming pile of changes with an explanation of the changes. Sometimes you got a good explanation of what the changes did, while other times it was "this fixes bug #NNNNN". Um, gee thanks! John
I can see how using the ordering you propose would make it easier to follow along, though. I will shuffle commits according to your suggestion for v2.
Cheers.

We increase the limit before plugging in a PCI hostdev or a memory module because some memory might need to be locked due to eg. VFIO. Of course we should do the opposite after unplugging a device: this was already the case for memory modules, but not for hostdevs. --- src/qemu/qemu_hotplug.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index a5c134a..04baeff 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -3070,6 +3070,12 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver, networkReleaseActualDevice(vm->def, net); virDomainNetDefFree(net); } + + /* QEMU might no longer need to lock as much memory, eg. we just detached + * a VFIO device, so adjust the limit here */ + if (qemuDomainAdjustMaxMemLock(vm) < 0) + VIR_WARN("Failed to adjust locked memory limit"); + ret = 0; cleanup: -- 2.5.0

On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
We increase the limit before plugging in a PCI hostdev or a memory module because some memory might need to be locked due to eg. VFIO.
Of course we should do the opposite after unplugging a device: this was already the case for memory modules, but not for hostdevs. --- src/qemu/qemu_hotplug.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index a5c134a..04baeff 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -3070,6 +3070,12 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver, networkReleaseActualDevice(vm->def, net); virDomainNetDefFree(net); } + + /* QEMU might no longer need to lock as much memory, eg. we just detached + * a VFIO device, so adjust the limit here */ + if (qemuDomainAdjustMaxMemLock(vm) < 0) + VIR_WARN("Failed to adjust locked memory limit"); + ret = 0;
cleanup:
Since the add was in qemuDomainAttachHostPCIDevice, why is the remove not in qemuDomainDetachHostPCIDevice? Doing it here would mean it's call for any host device removal - whether or not it was ever added. John

On Wed, 2015-12-09 at 15:15 -0500, John Ferlan wrote:
On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
We increase the limit before plugging in a PCI hostdev or a memory module because some memory might need to be locked due to eg. VFIO.
Of course we should do the opposite after unplugging a device: this was already the case for memory modules, but not for hostdevs. --- src/qemu/qemu_hotplug.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index a5c134a..04baeff 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -3070,6 +3070,12 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver, networkReleaseActualDevice(vm->def, net); virDomainNetDefFree(net); } + + /* QEMU might no longer need to lock as much memory, eg. we just detached + * a VFIO device, so adjust the limit here */ + if (qemuDomainAdjustMaxMemLock(vm) < 0) + VIR_WARN("Failed to adjust locked memory limit"); + ret = 0; cleanup:
Since the add was in qemuDomainAttachHostPCIDevice, why is the remove not in qemuDomainDetachHostPCIDevice? Doing it here would mean it's call for any host device removal - whether or not it was ever added.
Because qemuDomainDetachHostPCIDevice() is where we ask for the device to be removed from the guest, but we have to wait for it to actually be removed (and for the guest definition to be updated) before changing the memory locking limit. I guess I could move this code to qemuDomainDetachThisHostDevice(), but keeping it close to where the guest definition is updated looks cleaner to me. I also fail to see how a device that was never added could be removed, could you please elaborate? Cheers. -- Andrea Bolognani Software Engineer - Virtualization Team

On 12/10/2015 10:03 AM, Andrea Bolognani wrote:
On Wed, 2015-12-09 at 15:15 -0500, John Ferlan wrote:
On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
We increase the limit before plugging in a PCI hostdev or a memory module because some memory might need to be locked due to eg. VFIO.
Of course we should do the opposite after unplugging a device: this was already the case for memory modules, but not for hostdevs. --- src/qemu/qemu_hotplug.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index a5c134a..04baeff 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -3070,6 +3070,12 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver, networkReleaseActualDevice(vm->def, net); virDomainNetDefFree(net); } + + /* QEMU might no longer need to lock as much memory, eg. we just detached + * a VFIO device, so adjust the limit here */ + if (qemuDomainAdjustMaxMemLock(vm) < 0) + VIR_WARN("Failed to adjust locked memory limit"); + ret = 0;
cleanup:
Since the add was in qemuDomainAttachHostPCIDevice, why is the remove not in qemuDomainDetachHostPCIDevice? Doing it here would mean it's call for any host device removal - whether or not it was ever added.
Because qemuDomainDetachHostPCIDevice() is where we ask for the device to be removed from the guest, but we have to wait for it to actually be removed (and for the guest definition to be updated) before changing the memory locking limit.
I guess I could move this code to qemuDomainDetachThisHostDevice(), but keeping it close to where the guest definition is updated looks cleaner to me.
Yeah - this all got confusing as I was paging back and forth... "DetachDevice" and "RemoveDevice"... Then I reviewed another change for shmem and got even more confused.
I also fail to see how a device that was never added could be removed, could you please elaborate?
Since qemuDomainRemoveHostDevice could be called for any host device removal and not specifically the HostPCIDevice, I was pointing out by calling qemuDomainAdjustMaxMemLock there you could be calling it even when a HostPCIDevice wasn't being removed. It was clear to me when I wrote it ;-) John

On Thu, 2015-12-10 at 12:11 -0500, John Ferlan wrote:
Since the add was in qemuDomainAttachHostPCIDevice, why is the remove not in qemuDomainDetachHostPCIDevice? Doing it here would mean it's call for any host device removal - whether or not it was ever added.
Because qemuDomainDetachHostPCIDevice() is where we ask for the device to be removed from the guest, but we have to wait for it to actually be removed (and for the guest definition to be updated) before changing the memory locking limit.
I guess I could move this code to qemuDomainDetachThisHostDevice(), but keeping it close to where the guest definition is updated looks cleaner to me.
Yeah - this all got confusing as I was paging back and forth... "DetachDevice" and "RemoveDevice"... Then I reviewed another change for shmem and got even more confused.
I know what you mean: the code could sure use some moving around to make it a bit more symmetric. And what about the names? qemuDomainDetachDiskDevice() qemuDomainDetachDeviceDiskLive() Ehm... qemuDomainDetachHostPCIDevice() qemuDomainDetachHostDevice() qemuDomainDetachThisHostDevice() Oh boy :)
I also fail to see how a device that was never added could be removed, could you please elaborate?
Since qemuDomainRemoveHostDevice could be called for any host device removal and not specifically the HostPCIDevice, I was pointing out by calling qemuDomainAdjustMaxMemLock there you could be calling it even when a HostPCIDevice wasn't being removed.
Oh, I get it now. Calling qemuDomainAdjustMaxMemLock() more than necessary shouldn't really cause any harm, but adding a check for hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI around the call is even better. Would that address your concerns?
It was clear to me when I wrote it ;-)
Again, I know exactly what you mean ;) Cheers. -- Andrea Bolognani Software Engineer - Virtualization Team

On 12/10/2015 12:36 PM, Andrea Bolognani wrote:
On Thu, 2015-12-10 at 12:11 -0500, John Ferlan wrote:
Since the add was in qemuDomainAttachHostPCIDevice, why is the remove not in qemuDomainDetachHostPCIDevice? Doing it here would mean it's call for any host device removal - whether or not it was ever added.
Because qemuDomainDetachHostPCIDevice() is where we ask for the device to be removed from the guest, but we have to wait for it to actually be removed (and for the guest definition to be updated) before changing the memory locking limit.
I guess I could move this code to qemuDomainDetachThisHostDevice(), but keeping it close to where the guest definition is updated looks cleaner to me.
Yeah - this all got confusing as I was paging back and forth... "DetachDevice" and "RemoveDevice"... Then I reviewed another change for shmem and got even more confused.
I know what you mean: the code could sure use some moving around to make it a bit more symmetric. And what about the names?
qemuDomainDetachDiskDevice() qemuDomainDetachDeviceDiskLive()
Ehm...
qemuDomainDetachHostPCIDevice() qemuDomainDetachHostDevice() qemuDomainDetachThisHostDevice()
Oh boy :)
I also fail to see how a device that was never added could be removed, could you please elaborate?
Since qemuDomainRemoveHostDevice could be called for any host device removal and not specifically the HostPCIDevice, I was pointing out by calling qemuDomainAdjustMaxMemLock there you could be calling it even when a HostPCIDevice wasn't being removed.
Oh, I get it now.
Calling qemuDomainAdjustMaxMemLock() more than necessary shouldn't really cause any harm, but adding a check for
hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI
around the call is even better.
Would that address your concerns?
You mean of course before the pesky "virDomainHostdevDefFree(hostdev)"? You could perhaps do it after the qemuDomainRemovePCIHostDevice in the switch or within that qemuDomainRemovePCIHostDevice code. John
It was clear to me when I wrote it ;-)
Again, I know exactly what you mean ;)

On Thu, 2015-12-10 at 12:44 -0500, John Ferlan wrote:
Calling qemuDomainAdjustMaxMemLock() more than necessary shouldn't really cause any harm, but adding a check for
hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI
around the call is even better.
Would that address your concerns?
You mean of course before the pesky "virDomainHostdevDefFree(hostdev)"?
You could perhaps do it after the qemuDomainRemovePCIHostDevice in the switch or within that qemuDomainRemovePCIHostDevice code.
I moved it into the switch. Tomorrow I'll do a round of tests and send out a v2. Cheers. -- Andrea Bolognani Software Engineer - Virtualization Team

On Thu, 2015-12-10 at 12:44 -0500, John Ferlan wrote:
Calling qemuDomainAdjustMaxMemLock() more than necessary shouldn't really cause any harm, but adding a check for
hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI
around the call is even better.
Would that address your concerns?
You mean of course before the pesky "virDomainHostdevDefFree(hostdev)"?
You could perhaps do it after the qemuDomainRemovePCIHostDevice in the switch or within that qemuDomainRemovePCIHostDevice code.
I have now posted v2: https://www.redhat.com/archives/libvir-list/2015-December/msg00454.html Cheers. -- Andrea Bolognani Software Engineer - Virtualization Team

MemLock is already used in other modules and, while still an abbreviation, is not ambiguous. --- src/qemu/qemu_command.c | 4 ++-- src/qemu/qemu_domain.c | 10 +++++----- src/qemu/qemu_domain.h | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 4d00fd9..f6c41ce 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -11095,8 +11095,8 @@ qemuBuildCommandLine(virConnectPtr conn, /* In some situations, eg. VFIO passthrough, QEMU might need to lock a * significant amount of memory, so we need to set the limit accordingly */ - if (qemuDomainRequiresMlock(def)) - virCommandSetMaxMemLock(cmd, qemuDomainGetMlockLimitBytes(def)); + if (qemuDomainRequiresMemLock(def)) + virCommandSetMaxMemLock(cmd, qemuDomainGetMemLockLimitBytes(def)); if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_MSG_TIMESTAMP) && cfg->logTimestamp) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 51a1770..1521213 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -3802,7 +3802,7 @@ qemuDomainUpdateCurrentMemorySize(virQEMUDriverPtr driver, /** - * qemuDomainGetMlockLimitBytes: + * qemuDomainGetMemLockLimitBytes: * * @def: domain definition * @@ -3812,7 +3812,7 @@ qemuDomainUpdateCurrentMemorySize(virQEMUDriverPtr driver, * value returned may depend upon the architecture or devices present. */ unsigned long long -qemuDomainGetMlockLimitBytes(virDomainDefPtr def) +qemuDomainGetMemLockLimitBytes(virDomainDefPtr def) { unsigned long long memKB; @@ -3933,7 +3933,7 @@ qemuDomainGetMlockLimitBytes(virDomainDefPtr def) * requirements. * */ bool -qemuDomainRequiresMlock(virDomainDefPtr def) +qemuDomainRequiresMemLock(virDomainDefPtr def) { size_t i; @@ -3977,7 +3977,7 @@ qemuDomainAdjustMaxMemLock(virDomainObjPtr vm) unsigned long long bytes = 0; int ret = -1; - if (qemuDomainRequiresMlock(vm->def)) { + if (qemuDomainRequiresMemLock(vm->def)) { /* If this is the first time adjusting the limit, save the current * value so that we can restore it once memory locking is no longer * required */ @@ -3985,7 +3985,7 @@ qemuDomainAdjustMaxMemLock(virDomainObjPtr vm) if (virProcessGetMaxMemLock(vm->pid, &(vm->original_memlock)) < 0) goto out; } - bytes = qemuDomainGetMlockLimitBytes(vm->def); + bytes = qemuDomainGetMemLockLimitBytes(vm->def); } else { /* Once memory locking is no longer required, we can restore the * original, usually very low, limit */ diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index cc64df3..dcb9dc6 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -484,8 +484,8 @@ bool qemuDomainMachineHasBuiltinIDE(const virDomainDef *def); int qemuDomainUpdateCurrentMemorySize(virQEMUDriverPtr driver, virDomainObjPtr vm); -unsigned long long qemuDomainGetMlockLimitBytes(virDomainDefPtr def); -bool qemuDomainRequiresMlock(virDomainDefPtr def); +unsigned long long qemuDomainGetMemLockLimitBytes(virDomainDefPtr def); +bool qemuDomainRequiresMemLock(virDomainDefPtr def); int qemuDomainAdjustMaxMemLock(virDomainObjPtr vm); int qemuDomainDefValidateMemoryHotplug(const virDomainDef *def, -- 2.5.0

On 11/24/2015 08:56 AM, Andrea Bolognani wrote:
MemLock is already used in other modules and, while still an abbreviation, is not ambiguous. --- src/qemu/qemu_command.c | 4 ++-- src/qemu/qemu_domain.c | 10 +++++----- src/qemu/qemu_domain.h | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-)
ACK - (in principal at least) John
participants (2)
-
Andrea Bolognani
-
John Ferlan