[libvirt] [PATCH v1 0/7] Hugepages wrt to NUMA

Up to now, domains are either backed by an arbitrary huge page but without any NUMA awareness. This is suboptimal and I'm trying to fix it. Michal Privoznik (7): configure: Check for statfs Introduce virFileFindHugeTLBFS qemu: Utilize virFileFindHugeTLBFS virbitmap: Introduce virBitmapDoesIntersect domain: Introduce ./hugepages/page/[@size,@unit,@nodeset] qemu: Implement ./hugepages/page/[@size,@unit,@nodeset] tests: Some testing of hugepages mapping configure.ac | 4 +- docs/formatdomain.html.in | 18 +- docs/schemas/domaincommon.rng | 19 +- src/Makefile.am | 1 + src/conf/domain_conf.c | 197 +++++++++++++++++++-- src/conf/domain_conf.h | 13 +- src/libvirt_private.syms | 2 + src/parallels/parallels_driver.c | 2 +- src/qemu/qemu_capabilities.c | 2 + src/qemu/qemu_capabilities.h | 1 + src/qemu/qemu_command.c | 111 ++++++++++-- src/qemu/qemu_conf.c | 124 +++++++++++-- src/qemu/qemu_conf.h | 9 +- src/qemu/qemu_driver.c | 39 ++-- src/qemu/qemu_process.c | 21 ++- src/util/virbitmap.c | 20 +++ src/util/virbitmap.h | 3 + src/util/virfile.c | 176 +++++++++++++++++- src/util/virfile.h | 12 ++ .../qemuxml2argv-hugepages-pages.args | 16 ++ .../qemuxml2argv-hugepages-pages.xml | 45 +++++ .../qemuxml2argv-hugepages-pages2.args | 10 ++ .../qemuxml2argv-hugepages-pages2.xml | 38 ++++ .../qemuxml2argv-hugepages-pages3.args | 9 + .../qemuxml2argv-hugepages-pages3.xml | 38 ++++ tests/qemuxml2argvdata/qemuxml2argv-hugepages.args | 2 +- tests/qemuxml2argvtest.c | 18 +- tests/qemuxml2xmltest.c | 3 + tests/virbitmaptest.c | 26 +++ 29 files changed, 884 insertions(+), 95 deletions(-) create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages.args create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages.xml create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages2.args create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages2.xml create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages3.args create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages3.xml -- 1.8.5.5

The statfs(2) gets filesystem statistics. Currently, we use it only on linux, and leave stub to implement on other platforms. But hey, other platforms (like FreeBSD) have statfs() too. If we check it in configure we can wider platforms supported. Speaking of FreeBSD, the headers to include are of course different: sys/param.h and sys/mount.h on the FreeBSD and sys/statfs.h on the Linux. The header files are checked too. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- configure.ac | 4 ++-- src/util/virfile.c | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/configure.ac b/configure.ac index 8001e24..68ebf75 100644 --- a/configure.ac +++ b/configure.ac @@ -275,7 +275,7 @@ dnl and various less common threadsafe functions AC_CHECK_FUNCS_ONCE([cfmakeraw fallocate geteuid getgid getgrnam_r \ getmntent_r getpwuid_r getuid kill mmap newlocale posix_fallocate \ posix_memalign prlimit regexec sched_getaffinity setgroups setns \ - setrlimit symlink sysctlbyname getifaddrs]) + setrlimit symlink sysctlbyname getifaddrs statfs]) dnl Availability of pthread functions. Because of $LIB_PTHREAD, we dnl cannot use AC_CHECK_FUNCS_ONCE. LIB_PTHREAD and LIBMULTITHREAD @@ -316,7 +316,7 @@ dnl Availability of various common headers (non-fatal if missing). AC_CHECK_HEADERS([pwd.h paths.h regex.h sys/un.h \ sys/poll.h syslog.h mntent.h net/ethernet.h linux/magic.h \ sys/un.h sys/syscall.h sys/sysctl.h netinet/tcp.h ifaddrs.h \ - libtasn1.h sys/ucred.h sys/mount.h]) + libtasn1.h sys/ucred.h sys/mount.h sys/param.h sys/statfs.h]) dnl Check whether endian provides handy macros. AC_CHECK_DECLS([htole64], [], [], [[#include <endian.h>]]) diff --git a/src/util/virfile.c b/src/util/virfile.c index 463064c..699b9f8 100644 --- a/src/util/virfile.c +++ b/src/util/virfile.c @@ -43,11 +43,15 @@ # include <sys/mman.h> #endif -#ifdef __linux__ -# if HAVE_LINUX_MAGIC_H -# include <linux/magic.h> -# endif +#if HAVE_LINUX_MAGIC_H +# include <linux/magic.h> +#endif + +#ifdef HAVE_SYS_STATFS_H # include <sys/statfs.h> +#elif defined HAVE_SYS_PARAM_H && defined HAVE_SYS_MOUNT_H +# include <sys/param.h> +# include <sys/mount.h> #endif #if defined(__linux__) && HAVE_DECL_LO_FLAGS_AUTOCLEAR @@ -2817,7 +2821,7 @@ int virFilePrintf(FILE *fp, const char *msg, ...) } -#ifdef __linux__ +#ifdef HAVE_STATFS # ifndef NFS_SUPER_MAGIC # define NFS_SUPER_MAGIC 0x6969 @@ -2909,14 +2913,17 @@ virFileIsSharedFSType(const char *path, return 0; } -#else + +#else /* ! HAVE_STATFS */ + int virFileIsSharedFSType(const char *path ATTRIBUTE_UNUSED, int fstypes ATTRIBUTE_UNUSED) { /* XXX implement me :-) */ return 0; } -#endif + +#endif /* HAVE_STATFS */ int virFileIsSharedFS(const char *path) { -- 1.8.5.5

On Thu, Jul 17, 2014 at 06:12:42PM +0200, Michal Privoznik wrote:
The statfs(2) gets filesystem statistics. Currently, we use it only on linux, and leave stub to implement on other platforms. But hey, other platforms (like FreeBSD) have statfs() too. If we check it in configure we can wider platforms supported. Speaking of FreeBSD, the headers to include are of course different: sys/param.h and sys/mount.h on the FreeBSD and sys/statfs.h on the Linux. The header files are checked too.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- configure.ac | 4 ++-- src/util/virfile.c | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/configure.ac b/configure.ac index 8001e24..68ebf75 100644 --- a/configure.ac +++ b/configure.ac @@ -275,7 +275,7 @@ dnl and various less common threadsafe functions AC_CHECK_FUNCS_ONCE([cfmakeraw fallocate geteuid getgid getgrnam_r \ getmntent_r getpwuid_r getuid kill mmap newlocale posix_fallocate \ posix_memalign prlimit regexec sched_getaffinity setgroups setns \ - setrlimit symlink sysctlbyname getifaddrs]) + setrlimit symlink sysctlbyname getifaddrs statfs])
dnl Availability of pthread functions. Because of $LIB_PTHREAD, we dnl cannot use AC_CHECK_FUNCS_ONCE. LIB_PTHREAD and LIBMULTITHREAD @@ -316,7 +316,7 @@ dnl Availability of various common headers (non-fatal if missing). AC_CHECK_HEADERS([pwd.h paths.h regex.h sys/un.h \ sys/poll.h syslog.h mntent.h net/ethernet.h linux/magic.h \ sys/un.h sys/syscall.h sys/sysctl.h netinet/tcp.h ifaddrs.h \ - libtasn1.h sys/ucred.h sys/mount.h]) + libtasn1.h sys/ucred.h sys/mount.h sys/param.h sys/statfs.h]) dnl Check whether endian provides handy macros. AC_CHECK_DECLS([htole64], [], [], [[#include <endian.h>]])
diff --git a/src/util/virfile.c b/src/util/virfile.c index 463064c..699b9f8 100644 --- a/src/util/virfile.c +++ b/src/util/virfile.c @@ -43,11 +43,15 @@ # include <sys/mman.h> #endif
-#ifdef __linux__ -# if HAVE_LINUX_MAGIC_H -# include <linux/magic.h> -# endif +#if HAVE_LINUX_MAGIC_H +# include <linux/magic.h> +#endif + +#ifdef HAVE_SYS_STATFS_H # include <sys/statfs.h> +#elif defined HAVE_SYS_PARAM_H && defined HAVE_SYS_MOUNT_H +# include <sys/param.h> +# include <sys/mount.h> #endif
#if defined(__linux__) && HAVE_DECL_LO_FLAGS_AUTOCLEAR @@ -2817,7 +2821,7 @@ int virFilePrintf(FILE *fp, const char *msg, ...) }
-#ifdef __linux__ +#ifdef HAVE_STATFS
# ifndef NFS_SUPER_MAGIC # define NFS_SUPER_MAGIC 0x6969
I'm fairly sure these constants are entirely Linux specific, so although you got it to compile on BSD, I don't think it'll be returning sensible results. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 07/21/2014 08:54 AM, Daniel P. Berrange wrote:
On Thu, Jul 17, 2014 at 06:12:42PM +0200, Michal Privoznik wrote:
The statfs(2) gets filesystem statistics. Currently, we use it only on linux, and leave stub to implement on other platforms. But hey, other platforms (like FreeBSD) have statfs() too. If we check it in configure we can wider platforms supported. Speaking of FreeBSD, the headers to include are of course different: sys/param.h and sys/mount.h on the FreeBSD and sys/statfs.h on the Linux. The header files are checked too.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- configure.ac | 4 ++-- src/util/virfile.c | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-)
-#ifdef __linux__ +#ifdef HAVE_STATFS
# ifndef NFS_SUPER_MAGIC # define NFS_SUPER_MAGIC 0x6969
I'm fairly sure these constants are entirely Linux specific, so although you got it to compile on BSD, I don't think it'll be returning sensible results.
Correct. FS Magic numbers are specific to Linux. Gnulib has a 'mountlist' module that coreutils and findutils share to try and portably get at file system names for non-Linux systems, but right now it is GPL, so we'd have to ask gnulib folks if it can be relaxed before libvirt could benefit from it. Sadly, mounting of file systems is still an area of widely varying implementation-specific quirks, where there are no standard practices between systems. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On Mon, Jul 21, 2014 at 10:07:48AM -0600, Eric Blake wrote:
On 07/21/2014 08:54 AM, Daniel P. Berrange wrote:
On Thu, Jul 17, 2014 at 06:12:42PM +0200, Michal Privoznik wrote:
The statfs(2) gets filesystem statistics. Currently, we use it only on linux, and leave stub to implement on other platforms. But hey, other platforms (like FreeBSD) have statfs() too. If we check it in configure we can wider platforms supported. Speaking of FreeBSD, the headers to include are of course different: sys/param.h and sys/mount.h on the FreeBSD and sys/statfs.h on the Linux. The header files are checked too.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- configure.ac | 4 ++-- src/util/virfile.c | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-)
-#ifdef __linux__ +#ifdef HAVE_STATFS
# ifndef NFS_SUPER_MAGIC # define NFS_SUPER_MAGIC 0x6969
I'm fairly sure these constants are entirely Linux specific, so although you got it to compile on BSD, I don't think it'll be returning sensible results.
Correct. FS Magic numbers are specific to Linux. Gnulib has a 'mountlist' module that coreutils and findutils share to try and portably get at file system names for non-Linux systems, but right now it is GPL, so we'd have to ask gnulib folks if it can be relaxed before libvirt could benefit from it. Sadly, mounting of file systems is still an area of widely varying implementation-specific quirks, where there are no standard practices between systems.
I think I'd just suggest dropping this patch - it shouldn't hold up the rest of the huge page series which we only really care about for Linux. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

This should iterate over mount tab and search for hugetlbfs among with looking for the default value of huge pages. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 1 + src/util/virfile.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++ src/util/virfile.h | 12 ++++ 3 files changed, 168 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 8d3671c..44403fd 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1297,6 +1297,7 @@ virFileDirectFdFlag; virFileExists; virFileFclose; virFileFdopen; +virFileFindHugeTLBFS; virFileFindMountPoint; virFileFindResource; virFileFindResourceFull; diff --git a/src/util/virfile.c b/src/util/virfile.c index 699b9f8..e1034b7 100644 --- a/src/util/virfile.c +++ b/src/util/virfile.c @@ -2841,6 +2841,9 @@ int virFilePrintf(FILE *fp, const char *msg, ...) # ifndef CIFS_SUPER_MAGIC # define CIFS_SUPER_MAGIC 0xFF534D42 # endif +# ifndef HUGETLBFS_MAGIC +# define HUGETLBFS_MAGIC 0x958458f6 +# endif int virFileIsSharedFSType(const char *path, @@ -2914,6 +2917,33 @@ virFileIsSharedFSType(const char *path, return 0; } +int +virFileGetHugepageSize(const char *path, + unsigned long long *size) +{ + int ret = -1; + struct statfs fs; + + if (statfs(path, &fs) < 0) { + virReportSystemError(errno, + _("cannot determine filesystem for '%s'"), + path); + goto cleanup; + } + + if (fs.f_type != HUGETLBFS_MAGIC) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("not a hugetlbfs mount: '%s'"), + path); + goto cleanup; + } + + *size = fs.f_bsize / 1024; /* we are storing size in KiB */ + ret = 0; + cleanup: + return ret; +} + #else /* ! HAVE_STATFS */ int virFileIsSharedFSType(const char *path ATTRIBUTE_UNUSED, @@ -2923,6 +2953,17 @@ int virFileIsSharedFSType(const char *path ATTRIBUTE_UNUSED, return 0; } +int +virFileGetHugepageSize(const char *path, + unsigned long long *size) +{ + /* XXX implement me :-) */ + VIR_WARN("Trying to get huge page size on %s is not implemented yet.", + path); + *size = 2048; /* Pure guess */ + return 0; +} + #endif /* HAVE_STATFS */ int virFileIsSharedFS(const char *path) @@ -2935,3 +2976,117 @@ int virFileIsSharedFS(const char *path) VIR_FILE_SHFS_SMB | VIR_FILE_SHFS_CIFS); } + + +#if defined __linux__ && defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R + +# define PROC_MEMINFO "/proc/meminfo" +# define HUGEPAGESIZE_STR "Hugepagesize:" + +static int +virFileGetDefaultHugepageSize(unsigned long long *size) +{ + int ret = -1; + char *meminfo, *c, *n, *unit; + + if (virFileReadAll(PROC_MEMINFO, 4096, &meminfo) < 0) + goto cleanup; + + if (!(c = strstr(meminfo, HUGEPAGESIZE_STR))) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to parse %s"), + PROC_MEMINFO); + goto cleanup; + } + c += strlen(HUGEPAGESIZE_STR); + + if ((n = strchr(c, '\n'))) { + /* Cut off the rest of the meminfo file */ + *n = '\0'; + } + + if (virStrToLong_ull(c, &unit, 10, size) < 0 || STRNEQ(unit, " kB")) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to parse %s %s"), + HUGEPAGESIZE_STR, c); + goto cleanup; + } + + ret = 0; + cleanup: + VIR_FREE(meminfo); + return ret; +} + +# define PROC_MOUNTS "/proc/mounts" + +int +virFileFindHugeTLBFS(virHugeTLBFSPtr *ret_fs, + size_t *ret_nfs) +{ + int ret = -1; + FILE *f = NULL; + struct mntent mb; + char mntbuf[1024]; + virHugeTLBFSPtr fs = NULL; + size_t nfs = 0; + unsigned long long default_hugepagesz; + + if (virFileGetDefaultHugepageSize(&default_hugepagesz) < 0) + goto cleanup; + + if (!(f = setmntent(PROC_MOUNTS, "r"))) { + virReportSystemError(errno, + _("Unable to open %s"), + PROC_MOUNTS); + goto cleanup; + } + + while (getmntent_r(f, &mb, mntbuf, sizeof(mntbuf))) { + virHugeTLBFSPtr tmp; + + if (STRNEQ(mb.mnt_type, "hugetlbfs")) + continue; + + if (VIR_REALLOC_N(fs, nfs + 1) < 0) + goto cleanup; + + tmp = &fs[nfs]; + nfs++; + + if (VIR_STRDUP(tmp->mnt_dir, mb.mnt_dir) < 0) + goto cleanup; + + if (virFileGetHugepageSize(tmp->mnt_dir, &tmp->size) < 0) + goto cleanup; + + tmp->deflt = tmp->size == default_hugepagesz; + } + + *ret_fs = fs; + *ret_nfs = nfs; + fs = NULL; + nfs = 0; + ret = 0; + + cleanup: + endmntent(f); + while (nfs) + VIR_FREE(fs[--nfs].mnt_dir); + VIR_FREE(fs); + return ret; +} + +#else + +int +virFileFindHugeTLBFS(virHugeTLBFSPtr *ret_fs, + size_t *ret_nfs) +{ + /* XXX implement me :-) */ + *ret_fs = NULL; + *ret_nfs = 0; + return 0; +} +#endif /* ! defined __linux__ && defined HAVE_MNTENT_H && + defined HAVE_GETMNTENT_R */ diff --git a/src/util/virfile.h b/src/util/virfile.h index 36d3fe7..403d0ba 100644 --- a/src/util/virfile.h +++ b/src/util/virfile.h @@ -283,4 +283,16 @@ int virBuildPathInternal(char **path, ...) ATTRIBUTE_SENTINEL; int virFilePrintf(FILE *fp, const char *msg, ...) ATTRIBUTE_FMT_PRINTF(2, 3); +typedef struct _virHugeTLBFS virHugeTLBFS; +typedef virHugeTLBFS *virHugeTLBFSPtr; +struct _virHugeTLBFS { + char *mnt_dir; /* Where the FS is mount to */ + unsigned long long size; /* page size in kibibytes */ + bool deflt; /* is this the default huge page size */ +}; + +int virFileGetHugepageSize(const char *path, + unsigned long long *size); +int virFileFindHugeTLBFS(virHugeTLBFSPtr *ret_fs, + size_t *ret_nfs); #endif /* __VIR_FILE_H */ -- 1.8.5.5

On Thu, Jul 17, 2014 at 06:12:43PM +0200, Michal Privoznik wrote:
This should iterate over mount tab and search for hugetlbfs among with looking for the default value of huge pages.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 1 + src/util/virfile.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++ src/util/virfile.h | 12 ++++ 3 files changed, 168 insertions(+)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 8d3671c..44403fd 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1297,6 +1297,7 @@ virFileDirectFdFlag; virFileExists; virFileFclose; virFileFdopen; +virFileFindHugeTLBFS; virFileFindMountPoint; virFileFindResource; virFileFindResourceFull; diff --git a/src/util/virfile.c b/src/util/virfile.c index 699b9f8..e1034b7 100644 --- a/src/util/virfile.c +++ b/src/util/virfile.c
+int +virFileGetHugepageSize(const char *path, + unsigned long long *size)
This ought to be in libvirt_private.sym too
+int +virFileGetHugepageSize(const char *path, + unsigned long long *size) +{ + /* XXX implement me :-) */ + VIR_WARN("Trying to get huge page size on %s is not implemented yet.", + path); + *size = 2048; /* Pure guess */ + return 0; +}
Hmm, seems like we should really report an fatal error here, since I don't think it makes sense to continue QEMU startup in this scenario.
+ +# define PROC_MOUNTS "/proc/mounts" + +int +virFileFindHugeTLBFS(virHugeTLBFSPtr *ret_fs, + size_t *ret_nfs) +{ + int ret = -1; + FILE *f = NULL; + struct mntent mb; + char mntbuf[1024]; + virHugeTLBFSPtr fs = NULL; + size_t nfs = 0; + unsigned long long default_hugepagesz; + + if (virFileGetDefaultHugepageSize(&default_hugepagesz) < 0) + goto cleanup; + + if (!(f = setmntent(PROC_MOUNTS, "r"))) { + virReportSystemError(errno, + _("Unable to open %s"), + PROC_MOUNTS); + goto cleanup; + } + + while (getmntent_r(f, &mb, mntbuf, sizeof(mntbuf))) { + virHugeTLBFSPtr tmp; + + if (STRNEQ(mb.mnt_type, "hugetlbfs")) + continue; + + if (VIR_REALLOC_N(fs, nfs + 1) < 0) + goto cleanup; + + tmp = &fs[nfs]; + nfs++;
Perhaps VIR_EXPAND_N would be nicer ?
+ + if (VIR_STRDUP(tmp->mnt_dir, mb.mnt_dir) < 0) + goto cleanup; + + if (virFileGetHugepageSize(tmp->mnt_dir, &tmp->size) < 0) + goto cleanup; + + tmp->deflt = tmp->size == default_hugepagesz; + } + + *ret_fs = fs; + *ret_nfs = nfs; + fs = NULL; + nfs = 0; + ret = 0; + + cleanup: + endmntent(f); + while (nfs) + VIR_FREE(fs[--nfs].mnt_dir); + VIR_FREE(fs); + return ret; +} + +#else + +int +virFileFindHugeTLBFS(virHugeTLBFSPtr *ret_fs, + size_t *ret_nfs) +{ + /* XXX implement me :-) */ + *ret_fs = NULL; + *ret_nfs = 0; + return 0; +}
Again report a fatal error here
+#endif /* ! defined __linux__ && defined HAVE_MNTENT_H && + defined HAVE_GETMNTENT_R */
Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

Use better detection of hugetlbfs mount points. Yes, there can be multiple mount points each serving different huge page size. Since we already have ability to override the mount point in the qemu.conf file, this crazy backward compatibility code is brought in. Now we allow multiple mount points, so the "hugetlbfs_mount" option must take an list of strings (mount points). But previously, it was just a string, so we must accept both types now. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/Makefile.am | 1 + src/qemu/qemu_command.c | 20 ++++---- src/qemu/qemu_conf.c | 122 +++++++++++++++++++++++++++++++++++++++++------ src/qemu/qemu_conf.h | 9 +++- src/qemu/qemu_driver.c | 39 +++++++-------- src/qemu/qemu_process.c | 21 +++++--- tests/qemuxml2argvtest.c | 10 ++-- 7 files changed, 166 insertions(+), 56 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index 982f63d..85e61c2 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1212,6 +1212,7 @@ libvirt_driver_qemu_impl_la_LIBADD = $(CAPNG_LIBS) \ $(GNUTLS_LIBS) \ $(LIBNL_LIBS) \ $(LIBXML_LIBS) \ + libvirt_util.la \ $(NULL) libvirt_driver_qemu_impl_la_SOURCES = $(QEMU_DRIVER_SOURCES) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 8062510..b14ce83 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -7333,14 +7333,12 @@ qemuBuildCommandLine(virConnectPtr conn, def->mem.max_balloon = VIR_DIV_UP(def->mem.max_balloon, 1024) * 1024; virCommandAddArgFormat(cmd, "%llu", def->mem.max_balloon / 1024); if (def->mem.hugepage_backed) { - if (!cfg->hugetlbfsMount) { + char *mem_path; + + if (!cfg->nhugetlbfs) { virReportError(VIR_ERR_INTERNAL_ERROR, - "%s", _("hugetlbfs filesystem is not mounted")); - goto error; - } - if (!cfg->hugepagePath) { - virReportError(VIR_ERR_INTERNAL_ERROR, - "%s", _("hugepages are disabled by administrator config")); + "%s", _("hugetlbfs filesystem is not mounted " + "or disabled by administrator config")); goto error; } if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_MEM_PATH)) { @@ -7349,8 +7347,14 @@ qemuBuildCommandLine(virConnectPtr conn, def->emulator); goto error; } + + if (!(mem_path = qemuGetDefaultHugepath(cfg->hugetlbfs, + cfg->nhugetlbfs))) + goto error; + virCommandAddArgList(cmd, "-mem-prealloc", "-mem-path", - cfg->hugepagePath, NULL); + mem_path, NULL); + VIR_FREE(mem_path); } if (def->mem.locked && !virQEMUCapsGet(qemuCaps, QEMU_CAPS_MLOCK)) { diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index e62bec0..cf5ce97 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -230,19 +230,13 @@ virQEMUDriverConfigPtr virQEMUDriverConfigNew(bool privileged) cfg->migrationPortMin = QEMU_MIGRATION_PORT_MIN; cfg->migrationPortMax = QEMU_MIGRATION_PORT_MAX; -#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R - /* For privileged driver, try and find hugepage mount automatically. + /* For privileged driver, try and find hugetlbfs mounts automatically. * Non-privileged driver requires admin to create a dir for the - * user, chown it, and then let user configure it manually */ + * user, chown it, and then let user configure it manually. */ if (privileged && - !(cfg->hugetlbfsMount = virFileFindMountPoint("hugetlbfs"))) { - if (errno != ENOENT) { - virReportSystemError(errno, "%s", - _("unable to find hugetlbfs mountpoint")); - goto error; - } - } -#endif + virFileFindHugeTLBFS(&cfg->hugetlbfs, &cfg->nhugetlbfs) < 0) + goto error; + if (VIR_STRDUP(cfg->bridgeHelperName, "/usr/libexec/qemu-bridge-helper") < 0) goto error; @@ -293,8 +287,11 @@ static void virQEMUDriverConfigDispose(void *obj) VIR_FREE(cfg->spicePassword); VIR_FREE(cfg->spiceSASLdir); - VIR_FREE(cfg->hugetlbfsMount); - VIR_FREE(cfg->hugepagePath); + while (cfg->nhugetlbfs) { + cfg->nhugetlbfs--; + VIR_FREE(cfg->hugetlbfs[cfg->nhugetlbfs].mnt_dir); + } + VIR_FREE(cfg->hugetlbfs); VIR_FREE(cfg->bridgeHelperName); VIR_FREE(cfg->saveImageFormat); @@ -307,6 +304,26 @@ static void virQEMUDriverConfigDispose(void *obj) } +static int +virQEMUDriverConfigHugeTLBFSInit(virHugeTLBFSPtr hugetlbfs, + const char *path, + bool deflt) +{ + int ret = -1; + + if (VIR_STRDUP(hugetlbfs->mnt_dir, path) < 0) + goto cleanup; + + if (virFileGetHugepageSize(path, &hugetlbfs->size) < 0) + goto cleanup; + + hugetlbfs->deflt = deflt; + ret = 0; + cleanup: + return ret; +} + + int virQEMUDriverConfigLoadFile(virQEMUDriverConfigPtr cfg, const char *filename) { @@ -555,7 +572,57 @@ int virQEMUDriverConfigLoadFile(virQEMUDriverConfigPtr cfg, GET_VALUE_BOOL("auto_dump_bypass_cache", cfg->autoDumpBypassCache); GET_VALUE_BOOL("auto_start_bypass_cache", cfg->autoStartBypassCache); - GET_VALUE_STR("hugetlbfs_mount", cfg->hugetlbfsMount); + /* Some crazy backcompat. Back in the old days, this was just a pure + * string. We must continue supporting it. These days however, this may be + * an array of strings. */ + p = virConfGetValue(conf, "hugetlbfs_mount"); + if (p && p->type == VIR_CONF_LIST) { + size_t len = 0; + virConfValuePtr pp = p->list; + + /* Calc length and check items */ + while (pp) { + if (pp->type != VIR_CONF_STRING) { + virReportError(VIR_ERR_CONF_SYNTAX, "%s", + _("hugetlbfs_mount must be a list of strings")); + goto cleanup; + } + len++; + pp = pp->next; + } + + if (len) { + if (VIR_REALLOC_N(cfg->hugetlbfs, len) < 0) + goto cleanup; + } else { + VIR_FREE(cfg->hugetlbfs); + } + cfg->nhugetlbfs = len; + + pp = p->list; + len = 0; + while (pp) { + if (virQEMUDriverConfigHugeTLBFSInit(&cfg->hugetlbfs[len], + pp->str, !len) < 0) + goto cleanup; + len++; + pp = pp->next; + } + } else { + CHECK_TYPE("hugetlbfs_mount", VIR_CONF_STRING); + if (p && p->str) { + if (VIR_REALLOC_N(cfg->hugetlbfs, 1) < 0) + goto cleanup; + cfg->nhugetlbfs = 1; + if (virQEMUDriverConfigHugeTLBFSInit(&cfg->hugetlbfs[0], + p->str, true) < 0) + goto cleanup; + } else { + VIR_FREE(cfg->hugetlbfs); + cfg->nhugetlbfs = 0; + } + } + GET_VALUE_STR("bridge_helper", cfg->bridgeHelperName); GET_VALUE_BOOL("mac_filter", cfg->macFilter); @@ -1402,3 +1469,30 @@ qemuTranslateSnapshotDiskSourcePool(virConnectPtr conn ATTRIBUTE_UNUSED, _("Snapshots are not yet supported with 'pool' volumes")); return -1; } + +char * +qemuGetHugepagePath(virHugeTLBFSPtr hugepage) +{ + char *ret; + + if (virAsprintf(&ret, "%s/libvirt/qemu", hugepage->mnt_dir) < 0) + return NULL; + + return ret; +} + +char * +qemuGetDefaultHugepath(virHugeTLBFSPtr hugetlbfs, + size_t nhugetlbfs) +{ + size_t i; + + for (i = 0; i < nhugetlbfs; i++) + if (hugetlbfs[i].deflt) + break; + + if (i == nhugetlbfs) + i = 0; + + return qemuGetHugepagePath(&hugetlbfs[i]); +} diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h index 78b08e5..c3c9d6c 100644 --- a/src/qemu/qemu_conf.h +++ b/src/qemu/qemu_conf.h @@ -45,6 +45,7 @@ # include "qemu_capabilities.h" # include "virclosecallbacks.h" # include "virhostdev.h" +# include "virfile.h" # ifdef CPU_SETSIZE /* Linux */ # define QEMUD_CPUMASK_LEN CPU_SETSIZE @@ -126,8 +127,9 @@ struct _virQEMUDriverConfig { int webSocketPortMin; int webSocketPortMax; - char *hugetlbfsMount; - char *hugepagePath; + virHugeTLBFSPtr hugetlbfs; + size_t nhugetlbfs; + char *bridgeHelperName; bool macFilter; @@ -311,4 +313,7 @@ int qemuTranslateDiskSourcePool(virConnectPtr conn, int qemuTranslateSnapshotDiskSourcePool(virConnectPtr conn, virDomainSnapshotDiskDefPtr def); +char * qemuGetHugepagePath(virHugeTLBFSPtr hugepage); +char * qemuGetDefaultHugepath(virHugeTLBFSPtr hugetlbfs, + size_t nhugetlbfs); #endif /* __QEMUD_CONF_H */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 33541d3..90be700 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -577,11 +577,11 @@ qemuStateInitialize(bool privileged, char *driverConf = NULL; virConnectPtr conn = NULL; char ebuf[1024]; - char *membase = NULL; - char *mempath = NULL; virQEMUDriverConfigPtr cfg; uid_t run_uid = -1; gid_t run_gid = -1; + char *hugepagePath = NULL; + size_t i; if (VIR_ALLOC(qemu_driver) < 0) return -1; @@ -753,37 +753,33 @@ qemuStateInitialize(bool privileged, /* If hugetlbfs is present, then we need to create a sub-directory within * it, since we can't assume the root mount point has permissions that - * will let our spawned QEMU instances use it. - * - * NB the check for '/', since user may config "" to disable hugepages - * even when mounted - */ - if (cfg->hugetlbfsMount && - cfg->hugetlbfsMount[0] == '/') { - if (virAsprintf(&membase, "%s/libvirt", - cfg->hugetlbfsMount) < 0 || - virAsprintf(&mempath, "%s/qemu", membase) < 0) + * will let our spawned QEMU instances use it. */ + for (i = 0; i < cfg->nhugetlbfs; i++) { + hugepagePath = qemuGetHugepagePath(&cfg->hugetlbfs[i]); + + if (!hugepagePath) goto error; - if (virFileMakePath(mempath) < 0) { + if (virFileMakePathWithMode(hugepagePath, 1777) < 0) { virReportSystemError(errno, - _("unable to create hugepage path %s"), mempath); + _("unable to create hugepage path %s"), + hugepagePath); goto error; } if (cfg->privileged) { - if (virFileUpdatePerm(membase, 0, S_IXGRP | S_IXOTH) < 0) + if (virFileUpdatePerm(cfg->hugetlbfs[i].mnt_dir, + 0, S_IXGRP | S_IXOTH) < 0) goto error; - if (chown(mempath, cfg->user, cfg->group) < 0) { + if (chown(hugepagePath, cfg->user, cfg->group) < 0) { virReportSystemError(errno, _("unable to set ownership on %s to %d:%d"), - mempath, (int) cfg->user, + hugepagePath, + (int) cfg->user, (int) cfg->group); goto error; } } - VIR_FREE(membase); - - cfg->hugepagePath = mempath; + VIR_FREE(hugepagePath); } if (!(qemu_driver->closeCallbacks = virCloseCallbacksNew())) @@ -844,8 +840,7 @@ qemuStateInitialize(bool privileged, error: virObjectUnref(conn); VIR_FREE(driverConf); - VIR_FREE(membase); - VIR_FREE(mempath); + VIR_FREE(hugepagePath); qemuStateCleanup(); return -1; } diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 8a6b384..16d03d8 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3791,12 +3791,21 @@ int qemuProcessStart(virConnectPtr conn, } virDomainAuditSecurityLabel(vm, true); - if (cfg->hugepagePath && vm->def->mem.hugepage_backed) { - if (virSecurityManagerSetHugepages(driver->securityManager, - vm->def, cfg->hugepagePath) < 0) { - virReportError(VIR_ERR_INTERNAL_ERROR, - "%s", _("Unable to set huge path in security driver")); - goto cleanup; + if (vm->def->mem.hugepage_backed) { + for (i = 0; i < cfg->nhugetlbfs; i++) { + char *hugepagePath = qemuGetHugepagePath(&cfg->hugetlbfs[i]); + + if (!hugepagePath) + goto cleanup; + + if (virSecurityManagerSetHugepages(driver->securityManager, + vm->def, hugepagePath) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("Unable to set huge path in security driver")); + VIR_FREE(hugepagePath); + goto cleanup; + } + VIR_FREE(hugepagePath); } } diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c index 28436f2..1a5a4b0 100644 --- a/tests/qemuxml2argvtest.c +++ b/tests/qemuxml2argvtest.c @@ -524,12 +524,14 @@ mymain(void) VIR_FREE(driver.config->stateDir); if (VIR_STRDUP_QUIET(driver.config->stateDir, "/nowhere") < 0) return EXIT_FAILURE; - VIR_FREE(driver.config->hugetlbfsMount); - if (VIR_STRDUP_QUIET(driver.config->hugetlbfsMount, "/dev/hugepages") < 0) + VIR_FREE(driver.config->hugetlbfs); + if (VIR_ALLOC_N(driver.config->hugetlbfs, 1) < 0) return EXIT_FAILURE; - VIR_FREE(driver.config->hugepagePath); - if (VIR_STRDUP_QUIET(driver.config->hugepagePath, "/dev/hugepages/libvirt/qemu") < 0) + driver.config->nhugetlbfs = 1; + if (VIR_STRDUP(driver.config->hugetlbfs[0].mnt_dir, "/dev/hugepages") < 0) return EXIT_FAILURE; + driver.config->hugetlbfs[0].size = 2048; + driver.config->hugetlbfs[0].deflt = true; driver.config->spiceTLS = 1; if (VIR_STRDUP_QUIET(driver.config->spicePassword, "123456") < 0) return EXIT_FAILURE; -- 1.8.5.5

On Thu, Jul 17, 2014 at 06:12:44PM +0200, Michal Privoznik wrote:
Use better detection of hugetlbfs mount points. Yes, there can be multiple mount points each serving different huge page size.
Since we already have ability to override the mount point in the qemu.conf file, this crazy backward compatibility code is brought in. Now we allow multiple mount points, so the "hugetlbfs_mount" option must take an list of strings (mount points). But previously, it was just a string, so we must accept both types now.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/Makefile.am | 1 + src/qemu/qemu_command.c | 20 ++++---- src/qemu/qemu_conf.c | 122 +++++++++++++++++++++++++++++++++++++++++------ src/qemu/qemu_conf.h | 9 +++- src/qemu/qemu_driver.c | 39 +++++++-------- src/qemu/qemu_process.c | 21 +++++---
Also need to update the augeas files & example config to deal with list or string. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

This internal API just checks if two bitmaps intersect or not. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 1 + src/util/virbitmap.c | 20 ++++++++++++++++++++ src/util/virbitmap.h | 3 +++ tests/virbitmaptest.c | 26 ++++++++++++++++++++++++++ 4 files changed, 50 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 44403fd..256edd5 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1016,6 +1016,7 @@ virBitmapClearBit; virBitmapCopy; virBitmapCountBits; virBitmapDataToString; +virBitmapDoesIntersect; virBitmapEqual; virBitmapFormat; virBitmapFree; diff --git a/src/util/virbitmap.c b/src/util/virbitmap.c index 1029635..9f82eb9 100644 --- a/src/util/virbitmap.c +++ b/src/util/virbitmap.c @@ -732,3 +732,23 @@ virBitmapDataToString(void *data, virBitmapFree(map); return ret; } + +bool +virBitmapDoesIntersect(virBitmapPtr b1, + virBitmapPtr b2) +{ + size_t i; + + if (b1->max_bit > b2->max_bit) { + virBitmapPtr tmp = b1; + b1 = b2; + b2 = tmp; + } + + for (i = 0; i < b1->map_len; i++) { + if (b1->map[i] & b2->map[i]) + return true; + } + + return false; +} diff --git a/src/util/virbitmap.h b/src/util/virbitmap.h index 142a218..063516a 100644 --- a/src/util/virbitmap.h +++ b/src/util/virbitmap.h @@ -114,5 +114,8 @@ size_t virBitmapCountBits(virBitmapPtr bitmap) char *virBitmapDataToString(void *data, int len) ATTRIBUTE_NONNULL(1); +bool virBitmapDoesIntersect(virBitmapPtr b1, + virBitmapPtr b2) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); #endif diff --git a/tests/virbitmaptest.c b/tests/virbitmaptest.c index 048946f..09b40d7 100644 --- a/tests/virbitmaptest.c +++ b/tests/virbitmaptest.c @@ -510,6 +510,30 @@ test9(const void *opaque ATTRIBUTE_UNUSED) } static int +test10(const void *opaque ATTRIBUTE_UNUSED) +{ + int ret = -1; + virBitmapPtr b1 = NULL, b2 = NULL, b3 = NULL; + + if (virBitmapParse("0-3,5-8,11-15", 0, &b1, 20) < 0 || + virBitmapParse("4,9,10,16-19", 0, &b2, 20) < 0 || + virBitmapParse("15", 0, &b3, 20) < 0) + goto cleanup; + + if (virBitmapDoesIntersect(b1, b2) || + virBitmapDoesIntersect(b2, b3) || + !virBitmapDoesIntersect(b1, b3)) + goto cleanup; + + ret = 0; + cleanup: + virBitmapFree(b1); + virBitmapFree(b2); + virBitmapFree(b3); + return ret; +} + +static int mymain(void) { int ret = 0; @@ -532,6 +556,8 @@ mymain(void) ret = -1; if (virtTestRun("test9", test9, NULL) < 0) ret = -1; + if (virtTestRun("test10", test10, NULL) < 0) + ret = -1; return ret; } -- 1.8.5.5

On Thu, Jul 17, 2014 at 06:12:45PM +0200, Michal Privoznik wrote:
This internal API just checks if two bitmaps intersect or not.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 1 + src/util/virbitmap.c | 20 ++++++++++++++++++++ src/util/virbitmap.h | 3 +++ tests/virbitmaptest.c | 26 ++++++++++++++++++++++++++ 4 files changed, 50 insertions(+)
ACK Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 07/17/2014 10:12 AM, Michal Privoznik wrote:
This internal API just checks if two bitmaps intersect or not.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 1 + src/util/virbitmap.c | 20 ++++++++++++++++++++ src/util/virbitmap.h | 3 +++ tests/virbitmaptest.c | 26 ++++++++++++++++++++++++++ 4 files changed, 50 insertions(+)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 44403fd..256edd5 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1016,6 +1016,7 @@ virBitmapClearBit; virBitmapCopy; virBitmapCountBits; virBitmapDataToString; +virBitmapDoesIntersect;
The naming sounds awkward. Maybe virBitmapIntersects or virBitmapHasIntersection would be better. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On Mon, Jul 21, 2014 at 10:09:56AM -0600, Eric Blake wrote:
On 07/17/2014 10:12 AM, Michal Privoznik wrote:
This internal API just checks if two bitmaps intersect or not.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 1 + src/util/virbitmap.c | 20 ++++++++++++++++++++ src/util/virbitmap.h | 3 +++ tests/virbitmaptest.c | 26 ++++++++++++++++++++++++++ 4 files changed, 50 insertions(+)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 44403fd..256edd5 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1016,6 +1016,7 @@ virBitmapClearBit; virBitmapCopy; virBitmapCountBits; virBitmapDataToString; +virBitmapDoesIntersect;
The naming sounds awkward. Maybe virBitmapIntersects or virBitmapHasIntersection would be better.
Perhaps even virBitmapOverlaps is a bit easier to say ? Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

<memoryBacking> <hugepages> <page size="1" unit="G" nodeset="0-3,5"/> <page size="2" unit="M" nodeset="4"/> </hugepages> </memoryBacking> Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- docs/formatdomain.html.in | 18 +- docs/schemas/domaincommon.rng | 19 +- src/conf/domain_conf.c | 197 +++++++++++++++++++-- src/conf/domain_conf.h | 13 +- src/parallels/parallels_driver.c | 2 +- src/qemu/qemu_command.c | 2 +- src/qemu/qemu_conf.c | 20 ++- src/qemu/qemu_process.c | 2 +- .../qemuxml2argv-hugepages-pages.xml | 45 +++++ tests/qemuxml2xmltest.c | 1 + 10 files changed, 288 insertions(+), 31 deletions(-) create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages.xml diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 3c85fc5..f4362e6 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -617,7 +617,9 @@ <domain> ... <memoryBacking> - <hugepages/> + <hugepages> + <page size="1" unit="G" nodeset="0-3,5"/> + <page size="2" unit="M" nodeset="4"/> <nosharepages/> <locked/> </memoryBacking> @@ -632,7 +634,19 @@ <dl> <dt><code>hugepages</code></dt> <dd>This tells the hypervisor that the guest should have its memory - allocated using hugepages instead of the normal native page size.</dd> + allocated using hugepages instead of the normal native page size. + <span class='since'>Since 1.2.5</span> it's possible to set hugepages + more specifically per numa node. The <code>page</code> element is + introduced. It has one compulsory attribute <code>size</code> which + specifies which hugepages should be used (especially useful on systems + supporting hugepages of different sizes). The default unit for the + <code>size</code> attribute is kilobytes (multiplier of 1024). If you + want to use different unit, use optional <code>unit</code> attribute. + For systems with NUMA, the optional <code>nodeset</code> attribute may + come handy as it ties given guest's NUMA nodes to certain hugepage + sizes. From the example snippet, one gigabyte hugepages are used for + every NUMA node except node number four. For the correct syntax see + <a href="#elementsNUMATuning">this</a>.</dd> <dt><code>nosharepages</code></dt> <dd>Instructs hypervisor to disable shared pages (memory merge, KSM) for this domain. <span class="since">Since 1.0.6</span></dd> diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng index 2caeef9..d9da0bc 100644 --- a/docs/schemas/domaincommon.rng +++ b/docs/schemas/domaincommon.rng @@ -567,7 +567,24 @@ <interleave> <optional> <element name="hugepages"> - <empty/> + <zeroOrMore> + <element name="page"> + <attribute name="size"> + <ref name="unsignedLong"/> + </attribute> + <optional> + <attribute name='unit'> + <ref name='unit'/> + </attribute> + </optional> + <optional> + <attribute name="nodeset"> + <ref name='cpuset'/> + </attribute> + </optional> + <empty/> + </element> + </zeroOrMore> </element> </optional> <optional> diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index a1ef374..b49bcb0 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -11258,6 +11258,57 @@ virDomainParseMemory(const char *xpath, xmlXPathContextPtr ctxt, } +static int +virDomainHugepagesParseXML(xmlNodePtr node, + xmlXPathContextPtr ctxt, + virDomainHugePagePtr hugepage) +{ + int ret = -1; + xmlNodePtr oldnode = ctxt->node; + unsigned long long bytes, max; + char *unit = NULL, *nodeset = NULL; + + ctxt->node = node; + + /* On 32-bit machines, our bound is 0xffffffff * KiB. On 64-bit + * machines, our bound is off_t (2^63). */ + if (sizeof(unsigned long) < sizeof(long long)) + max = 1024ull * ULONG_MAX; + else + max = LLONG_MAX; + + if (virXPathULongLong("string(./@size)", ctxt, &bytes) < 0) { + virReportError(VIR_ERR_XML_DETAIL, "%s", + _("unable to parse size attribute")); + goto cleanup; + } + + unit = virXPathString("string(./@unit)", ctxt); + + if (virScaleInteger(&bytes, unit, 1024, max) < 0) + goto cleanup; + + if (!(hugepage->size = VIR_DIV_UP(bytes, 1024))) { + virReportError(VIR_ERR_XML_DETAIL, "%s", + _("hugepage size can't be zero")); + goto cleanup; + } + + if ((nodeset = virXMLPropString(node, "nodeset"))) { + if (virBitmapParse(nodeset, 0, &hugepage->nodemask, + VIR_DOMAIN_CPUMASK_LEN) < 0) + goto cleanup; + } + + ret = 0; + cleanup: + VIR_FREE(unit); + VIR_FREE(nodeset); + ctxt->node = oldnode; + return ret; +} + + static virDomainResourceDefPtr virDomainResourceDefParse(xmlNodePtr node, xmlXPathContextPtr ctxt) @@ -11325,7 +11376,7 @@ virDomainDefParseXML(xmlDocPtr xml, { xmlNodePtr *nodes = NULL, node = NULL; char *tmp = NULL; - size_t i; + size_t i, j; int n; long id = -1; virDomainDefPtr def; @@ -11475,8 +11526,55 @@ virDomainDefParseXML(xmlDocPtr xml, def->mem.cur_balloon = def->mem.max_balloon; } - if ((node = virXPathNode("./memoryBacking/hugepages", ctxt))) - def->mem.hugepage_backed = true; + + if ((n = virXPathNodeSet("./memoryBacking/hugepages/page", ctxt, &nodes)) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("cannot extract hugepages nodes")); + goto error; + } + + if (n) { + if (VIR_ALLOC_N(def->mem.hugepages, n) < 0) + goto error; + + for (i = 0; i < n; i++) { + if (virDomainHugepagesParseXML(nodes[i], ctxt, + &def->mem.hugepages[i]) < 0) + goto error; + def->mem.nhugepages++; + + for (j = 0; j < i; j++) { + if (def->mem.hugepages[i].nodemask && + def->mem.hugepages[j].nodemask && + virBitmapDoesIntersect(def->mem.hugepages[i].nodemask, + def->mem.hugepages[j].nodemask)) { + virReportError(VIR_ERR_XML_DETAIL, + _("nodeset attribute of hugepages " + "of sizes %llu and %llu intersect"), + def->mem.hugepages[i].size, + def->mem.hugepages[j].size); + goto error; + } else if (!def->mem.hugepages[i].nodemask && + !def->mem.hugepages[j].nodemask) { + virReportError(VIR_ERR_XML_DETAIL, + _("two master hugepages detected: " + "%llu and %llu"), + def->mem.hugepages[i].size, + def->mem.hugepages[j].size); + goto error; + } + } + } + + VIR_FREE(nodes); + } else { + if ((node = virXPathNode("./memoryBacking/hugepages", ctxt))) { + if (VIR_ALLOC(def->mem.hugepages) < 0) + goto error; + + def->mem.nhugepages = 1; + } + } if ((node = virXPathNode("./memoryBacking/nosharepages", ctxt))) def->mem.nosharepages = true; @@ -11498,7 +11596,6 @@ virDomainDefParseXML(xmlDocPtr xml, goto error; for (i = 0; i < n; i++) { - size_t j; if (virDomainBlkioDeviceParseXML(nodes[i], &def->blkio.devices[i]) < 0) goto error; @@ -12383,7 +12480,6 @@ virDomainDefParseXML(xmlDocPtr xml, if (chr->target.port == -1) { int maxport = -1; - size_t j; for (j = 0; j < i; j++) { if (def->parallels[j]->target.port > maxport) maxport = def->parallels[j]->target.port; @@ -12411,7 +12507,6 @@ virDomainDefParseXML(xmlDocPtr xml, if (chr->target.port == -1) { int maxport = -1; - size_t j; for (j = 0; j < i; j++) { if (def->serials[j]->target.port > maxport) maxport = def->serials[j]->target.port; @@ -12469,7 +12564,6 @@ virDomainDefParseXML(xmlDocPtr xml, if (chr->info.type == VIR_DOMAIN_DEVICE_ADDRESS_TYPE_VIRTIO_SERIAL && chr->info.addr.vioserial.port == 0) { int maxport = 0; - size_t j; for (j = 0; j < i; j++) { virDomainChrDefPtr thischr = def->channels[j]; if (thischr->info.type == VIR_DOMAIN_DEVICE_ADDRESS_TYPE_VIRTIO_SERIAL && @@ -12586,7 +12680,7 @@ virDomainDefParseXML(xmlDocPtr xml, if (n && VIR_ALLOC_N(def->videos, n) < 0) goto error; for (i = 0; i < n; i++) { - size_t j = def->nvideos; + j = def->nvideos; virDomainVideoDefPtr video = virDomainVideoDefParseXML(nodes[j], def, flags); @@ -14024,13 +14118,38 @@ virDomainDefCheckABIStability(virDomainDefPtr src, dst->mem.cur_balloon, src->mem.cur_balloon); goto error; } - if (src->mem.hugepage_backed != dst->mem.hugepage_backed) { + if (src->mem.nhugepages != dst->mem.nhugepages) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, - _("Target domain huge page backing %d does not match source %d"), - dst->mem.hugepage_backed, - src->mem.hugepage_backed); + _("Target domain huge pages count %zu does not match source %zu"), + dst->mem.nhugepages, src->mem.nhugepages); goto error; } + for (i = 0; i < src->mem.nhugepages; i++) { + virDomainHugePagePtr src_huge = &src->mem.hugepages[i]; + virDomainHugePagePtr dst_huge = &dst->mem.hugepages[i]; + + if (src_huge->size != dst_huge->size) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Target domain huge page size %llu " + "does not match source %llu"), + dst_huge->size, src_huge->size); + goto error; + } + + if (src_huge->nodemask && dst_huge->nodemask) { + if (!virBitmapEqual(src_huge->nodemask, dst_huge->nodemask)) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Target huge page nodemask does not match source")); + goto error; + } + } else { + if (src_huge->nodemask || dst_huge->nodemask) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Target huge page nodemask does not match source")); + goto error; + } + } + } if (src->vcpus != dst->vcpus) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, @@ -17136,6 +17255,54 @@ virDomainResourceDefFormat(virBufferPtr buf, } +static int +virDomainHugepagesFormatBuf(virBufferPtr buf, + virDomainHugePagePtr hugepage) +{ + int ret = -1; + + virBufferAsprintf(buf, "<page size='%llu' unit='KiB'", + hugepage->size); + + if (hugepage->nodemask) { + char *nodeset = NULL; + if (!(nodeset = virBitmapFormat(hugepage->nodemask))) + goto cleanup; + virBufferAsprintf(buf, " nodeset='%s'", nodeset); + VIR_FREE(nodeset); + } + + virBufferAddLit(buf, "/>\n"); + + ret = 0; + cleanup: + return ret; +} + +static void +virDomainHugepagesFormat(virBufferPtr buf, + virDomainHugePagePtr hugepages, + size_t nhugepages) +{ + size_t i; + + if (nhugepages == 1 && + hugepages[0].size == 0) { + virBufferAddLit(buf, "<hugepages/>\n"); + return; + } + + virBufferAddLit(buf, "<hugepages>\n"); + virBufferAdjustIndent(buf, 2); + + for (i = 0; i < nhugepages; i++) + virDomainHugepagesFormatBuf(buf, &hugepages[i]); + + virBufferAdjustIndent(buf, -2); + virBufferAddLit(buf, "</hugepages>\n"); +} + + #define DUMPXML_FLAGS \ (VIR_DOMAIN_XML_SECURE | \ VIR_DOMAIN_XML_INACTIVE | \ @@ -17319,11 +17486,11 @@ virDomainDefFormatInternal(virDomainDefPtr def, virBufferAddLit(buf, "</memtune>\n"); } - if (def->mem.hugepage_backed || def->mem.nosharepages || def->mem.locked) { + if (def->mem.nhugepages || def->mem.nosharepages || def->mem.locked) { virBufferAddLit(buf, "<memoryBacking>\n"); virBufferAdjustIndent(buf, 2); - if (def->mem.hugepage_backed) - virBufferAddLit(buf, "<hugepages/>\n"); + if (def->mem.nhugepages) + virDomainHugepagesFormat(buf, def->mem.hugepages, def->mem.nhugepages); if (def->mem.nosharepages) virBufferAddLit(buf, "<nosharepages/>\n"); if (def->mem.locked) diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 4c9b7e8..61f057c 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1848,6 +1848,14 @@ struct _virDomainResourceDef { char *partition; }; +typedef struct _virDomaiHugePage virDomainHugePage; +typedef virDomainHugePage *virDomainHugePagePtr; + +struct _virDomaiHugePage { + virBitmapPtr nodemask; /* guest's NUMA node mask */ + unsigned long long size; /* hugepage size in KiB */ +}; + /* * Guest VM main configuration * @@ -1874,7 +1882,10 @@ struct _virDomainDef { struct { unsigned long long max_balloon; /* in kibibytes */ unsigned long long cur_balloon; /* in kibibytes */ - bool hugepage_backed; + + virDomainHugePagePtr hugepages; + size_t nhugepages; + bool nosharepages; bool locked; int dump_core; /* enum virDomainMemDump */ diff --git a/src/parallels/parallels_driver.c b/src/parallels/parallels_driver.c index a503dea..bb9538f 100644 --- a/src/parallels/parallels_driver.c +++ b/src/parallels/parallels_driver.c @@ -2023,7 +2023,7 @@ parallelsApplyChanges(virDomainObjPtr dom, virDomainDefPtr new) return -1; } - if (old->mem.hugepage_backed != new->mem.hugepage_backed || + if (old->mem.nhugepages != new->mem.nhugepages || old->mem.hard_limit != new->mem.hard_limit || old->mem.soft_limit != new->mem.soft_limit || old->mem.min_guarantee != new->mem.min_guarantee || diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index b14ce83..0b8cef5 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -7332,7 +7332,7 @@ qemuBuildCommandLine(virConnectPtr conn, virCommandAddArg(cmd, "-m"); def->mem.max_balloon = VIR_DIV_UP(def->mem.max_balloon, 1024) * 1024; virCommandAddArgFormat(cmd, "%llu", def->mem.max_balloon / 1024); - if (def->mem.hugepage_backed) { + if (def->mem.nhugepages) { char *mem_path; if (!cfg->nhugetlbfs) { diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index cf5ce97..03593d6 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -611,15 +611,17 @@ int virQEMUDriverConfigLoadFile(virQEMUDriverConfigPtr cfg, } else { CHECK_TYPE("hugetlbfs_mount", VIR_CONF_STRING); if (p && p->str) { - if (VIR_REALLOC_N(cfg->hugetlbfs, 1) < 0) - goto cleanup; - cfg->nhugetlbfs = 1; - if (virQEMUDriverConfigHugeTLBFSInit(&cfg->hugetlbfs[0], - p->str, true) < 0) - goto cleanup; - } else { - VIR_FREE(cfg->hugetlbfs); - cfg->nhugetlbfs = 0; + if (STREQ(p->str, "")) { + VIR_FREE(cfg->hugetlbfs); + cfg->nhugetlbfs = 0; + } else { + if (VIR_REALLOC_N(cfg->hugetlbfs, 1) < 0) + goto cleanup; + cfg->nhugetlbfs = 1; + if (virQEMUDriverConfigHugeTLBFSInit(&cfg->hugetlbfs[0], + p->str, true) < 0) + goto cleanup; + } } } diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 16d03d8..d898aad 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3791,7 +3791,7 @@ int qemuProcessStart(virConnectPtr conn, } virDomainAuditSecurityLabel(vm, true); - if (vm->def->mem.hugepage_backed) { + if (vm->def->mem.nhugepages) { for (i = 0; i < cfg->nhugetlbfs; i++) { char *hugepagePath = qemuGetHugepagePath(&cfg->hugetlbfs[i]); diff --git a/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages.xml b/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages.xml new file mode 100644 index 0000000..5ad0695 --- /dev/null +++ b/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages.xml @@ -0,0 +1,45 @@ +<domain type='qemu'> + <name>QEMUGuest1</name> + <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid> + <memory unit='KiB'>4194304</memory> + <currentMemory unit='KiB'>4194304</currentMemory> + <memoryBacking> + <hugepages> + <page size='2048' unit='KiB' nodeset='1'/> + <page size='1048576' unit='KiB' nodeset='0,2-3'/> + </hugepages> + </memoryBacking> + <vcpu placement='static'>4</vcpu> + <numatune> + <memory mode='strict' nodeset='0-3'/> + <memnode cellid='3' mode='strict' nodeset='3'/> + </numatune> + <os> + <type arch='i686' machine='pc'>hvm</type> + <boot dev='hd'/> + </os> + <cpu> + <numa> + <cell id='0' cpus='0' memory='1048576'/> + <cell id='1' cpus='1' memory='1048576'/> + <cell id='2' cpus='2' memory='1048576'/> + <cell id='3' cpus='3' memory='1048576'/> + </numa> + </cpu> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu</emulator> + <disk type='block' device='disk'> + <source dev='/dev/HostVG/QEMUGuest1'/> + <target dev='hda' bus='ide'/> + <address type='drive' controller='0' bus='0' target='0' unit='0'/> + </disk> + <controller type='usb' index='0'/> + <controller type='ide' index='0'/> + <controller type='pci' index='0' model='pci-root'/> + <memballoon model='virtio'/> + </devices> +</domain> diff --git a/tests/qemuxml2xmltest.c b/tests/qemuxml2xmltest.c index cefe05b..09cb228 100644 --- a/tests/qemuxml2xmltest.c +++ b/tests/qemuxml2xmltest.c @@ -197,6 +197,7 @@ mymain(void) DO_TEST("hyperv-off"); DO_TEST("hugepages"); + DO_TEST("hugepages-pages"); DO_TEST("nosharepages"); DO_TEST("disk-aio"); DO_TEST("disk-cdrom"); -- 1.8.5.5

On Thu, Jul 17, 2014 at 06:12:46PM +0200, Michal Privoznik wrote:
+<domain type='qemu'> + <name>QEMUGuest1</name> + <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid> + <memory unit='KiB'>4194304</memory> + <currentMemory unit='KiB'>4194304</currentMemory> + <memoryBacking> + <hugepages> + <page size='2048' unit='KiB' nodeset='1'/> + <page size='1048576' unit='KiB' nodeset='0,2-3'/> + </hugepages> + </memoryBacking> + <vcpu placement='static'>4</vcpu> + <numatune> + <memory mode='strict' nodeset='0-3'/> + <memnode cellid='3' mode='strict' nodeset='3'/> + </numatune> + <os> + <type arch='i686' machine='pc'>hvm</type> + <boot dev='hd'/> + </os> + <cpu> + <numa> + <cell id='0' cpus='0' memory='1048576'/> + <cell id='1' cpus='1' memory='1048576'/> + <cell id='2' cpus='2' memory='1048576'/> + <cell id='3' cpus='3' memory='1048576'/> + </numa> + </cpu>
There's nothing functionally wrong with what you have here, but I'm wondering if you considered just adding a page size attribute against the <cell> element under <numa> here ? Feels like that might be a bit less verbose for the XML Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 21.07.2014 17:09, Daniel P. Berrange wrote:
On Thu, Jul 17, 2014 at 06:12:46PM +0200, Michal Privoznik wrote:
+<domain type='qemu'> + <name>QEMUGuest1</name> + <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid> + <memory unit='KiB'>4194304</memory> + <currentMemory unit='KiB'>4194304</currentMemory> + <memoryBacking> + <hugepages> + <page size='2048' unit='KiB' nodeset='1'/> + <page size='1048576' unit='KiB' nodeset='0,2-3'/> + </hugepages> + </memoryBacking> + <vcpu placement='static'>4</vcpu> + <numatune> + <memory mode='strict' nodeset='0-3'/> + <memnode cellid='3' mode='strict' nodeset='3'/> + </numatune> + <os> + <type arch='i686' machine='pc'>hvm</type> + <boot dev='hd'/> + </os> + <cpu> + <numa> + <cell id='0' cpus='0' memory='1048576'/> + <cell id='1' cpus='1' memory='1048576'/> + <cell id='2' cpus='2' memory='1048576'/> + <cell id='3' cpus='3' memory='1048576'/> + </numa> + </cpu>
There's nothing functionally wrong with what you have here, but I'm wondering if you considered just adding a page size attribute against the <cell> element under <numa> here ? Feels like that might be a bit less verbose for the XML
Huh funny. That idea came up to my mind, but I thought it was more verbose so I went down this road. I'm not fundamentally against it, but I like my approach more. In most common case, users will use only one size of huge pages to back their guests, so all they need to do is: <memoryBacking> <hugepages> <page size='1' unit='G'/> </hugepages> </memoryBacking> instead of repeating @pagesize attribute in each <cell/>. But as far as I see it's just question of preference without any technical impact, right? Michal

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_capabilities.c | 2 + src/qemu/qemu_capabilities.h | 1 + src/qemu/qemu_command.c | 91 +++++++++++++++++++--- .../qemuxml2argv-hugepages-pages.args | 16 ++++ tests/qemuxml2argvdata/qemuxml2argv-hugepages.args | 2 +- tests/qemuxml2argvtest.c | 10 ++- 6 files changed, 109 insertions(+), 13 deletions(-) create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages.args diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c index 07306e5..f69c4d0 100644 --- a/src/qemu/qemu_capabilities.c +++ b/src/qemu/qemu_capabilities.c @@ -263,6 +263,7 @@ VIR_ENUM_IMPL(virQEMUCaps, QEMU_CAPS_LAST, "memory-backend-ram", /* 170 */ "numa", + "memory-backend-file", ); @@ -1481,6 +1482,7 @@ struct virQEMUCapsStringFlags virQEMUCapsObjectTypes[] = { { "pvpanic", QEMU_CAPS_DEVICE_PANIC }, { "usb-kbd", QEMU_CAPS_DEVICE_USB_KBD }, { "memory-backend-ram", QEMU_CAPS_OBJECT_MEMORY_RAM }, + { "memory-backend-file", QEMU_CAPS_OBJECT_MEMORY_FILE }, }; static struct virQEMUCapsStringFlags virQEMUCapsObjectPropsVirtioBlk[] = { diff --git a/src/qemu/qemu_capabilities.h b/src/qemu/qemu_capabilities.h index 4332633..e80a377 100644 --- a/src/qemu/qemu_capabilities.h +++ b/src/qemu/qemu_capabilities.h @@ -211,6 +211,7 @@ typedef enum { QEMU_CAPS_CHANGE_BACKING_FILE = 169, /* change name of backing file in metadata */ QEMU_CAPS_OBJECT_MEMORY_RAM = 170, /* -object memory-backend-ram */ QEMU_CAPS_NUMA = 171, /* newer -numa handling with disjoint cpu ranges */ + QEMU_CAPS_OBJECT_MEMORY_FILE = 172, /* -object memory-backend-file */ QEMU_CAPS_LAST, /* this must always be the last item */ } virQEMUCapsFlags; diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 0b8cef5..cb35727 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -6381,24 +6381,36 @@ qemuBuildSmpArgStr(const virDomainDef *def, } static int -qemuBuildNumaArgStr(const virDomainDef *def, +qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, + const virDomainDef *def, virCommandPtr cmd, virQEMUCapsPtr qemuCaps) { - size_t i; + size_t i, j; virBuffer buf = VIR_BUFFER_INITIALIZER; + virDomainHugePagePtr master_hugepage = NULL; char *cpumask = NULL, *tmpmask = NULL, *next = NULL; char *nodemask = NULL; + char *mem_path = NULL; int ret = -1; if (virDomainNumatuneHasPerNodeBinding(def->numatune) && - !virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_RAM)) { + !(virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_RAM) || + virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_FILE))) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("Per-node memory binding is not supported " "with this QEMU")); goto cleanup; } + if (def->mem.nhugepages && def->mem.hugepages[0].size && + !virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_FILE)) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("huge pages pre NUMA node are not " + "supported with this QEMU")); + goto cleanup; + } + for (i = 0; i < def->cpu->ncells; i++) { int cellmem = VIR_DIV_UP(def->cpu->cells[i].mem, 1024); def->cpu->cells[i].mem = cellmem * 1024; @@ -6417,15 +6429,74 @@ qemuBuildNumaArgStr(const virDomainDef *def, goto cleanup; } - if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_RAM)) { + if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_RAM) || + virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_FILE)) { virDomainNumatuneMemMode mode; + virDomainHugePagePtr hugepage = NULL; const char *policy = NULL; mode = virDomainNumatuneGetMode(def->numatune, i); policy = qemuNumaPolicyTypeToString(mode); - virBufferAsprintf(&buf, "memory-backend-ram,size=%dM,id=ram-node%zu", - cellmem, i); + /* Find the huge page size we want to use */ + for (j = 0; j < def->mem.nhugepages; j++) { + bool thisHugepage = false; + + hugepage = &def->mem.hugepages[j]; + + if (!hugepage->nodemask) { + master_hugepage = hugepage; + continue; + } + + if (virBitmapGetBit(hugepage->nodemask, i, &thisHugepage) < 0) { + /* Ignore this error. It's not an error after all. Well, + * the nodemask for this <page/> can contain lower NUMA + * nodes than we are querying in here. */ + continue; + } + + if (thisHugepage) { + /* Hooray, we've found the page size */ + break; + } + } + + if (j == def->mem.nhugepages) { + /* We have not found specific huge page to be used with this + * NUMA node. Use the generic setting then (<page/> without any + * @nodemask) if possible. */ + hugepage = master_hugepage; + } + + if (hugepage) { + /* Now lets see, if the huge page we want to use is even mounted + * and ready to use */ + + for (j = 0; j < cfg->nhugetlbfs; j++) { + if (cfg->hugetlbfs[j].size == hugepage->size) + break; + } + + if (j == cfg->nhugetlbfs) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to find any usable hugetlbfs mount for %llu KiB"), + hugepage->size); + goto cleanup; + } + + VIR_FREE(mem_path); + if (!(mem_path = qemuGetHugepagePath(&cfg->hugetlbfs[j]))) + goto cleanup; + + virBufferAsprintf(&buf, + "memory-backend-file,prealloc=yes,mem-path=%s", + mem_path); + } else { + virBufferAddLit(&buf, "memory-backend-ram"); + } + + virBufferAsprintf(&buf, ",size=%dM,id=ram-node%zu", cellmem, i); if (virDomainNumatuneMaybeFormatNodeset(def->numatune, NULL, &nodemask, i) < 0) @@ -6464,7 +6535,8 @@ qemuBuildNumaArgStr(const virDomainDef *def, virBufferAdd(&buf, tmpmask, -1); } - if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_RAM)) { + if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_RAM) || + virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_FILE)) { virBufferAsprintf(&buf, ",memdev=ram-node%zu", i); } else { virBufferAsprintf(&buf, ",mem=%d", cellmem); @@ -6477,6 +6549,7 @@ qemuBuildNumaArgStr(const virDomainDef *def, cleanup: VIR_FREE(cpumask); VIR_FREE(nodemask); + VIR_FREE(mem_path); virBufferFreeAndReset(&buf); return ret; } @@ -7332,7 +7405,7 @@ qemuBuildCommandLine(virConnectPtr conn, virCommandAddArg(cmd, "-m"); def->mem.max_balloon = VIR_DIV_UP(def->mem.max_balloon, 1024) * 1024; virCommandAddArgFormat(cmd, "%llu", def->mem.max_balloon / 1024); - if (def->mem.nhugepages) { + if (def->mem.nhugepages && !def->mem.hugepages[0].size) { char *mem_path; if (!cfg->nhugetlbfs) { @@ -7376,7 +7449,7 @@ qemuBuildCommandLine(virConnectPtr conn, VIR_FREE(smp); if (def->cpu && def->cpu->ncells) - if (qemuBuildNumaArgStr(def, cmd, qemuCaps) < 0) + if (qemuBuildNumaArgStr(cfg, def, cmd, qemuCaps) < 0) goto error; if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_UUID)) diff --git a/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages.args b/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages.args new file mode 100644 index 0000000..042683a --- /dev/null +++ b/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages.args @@ -0,0 +1,16 @@ +LC_ALL=C PATH=/bin HOME=/home/test USER=test LOGNAME=test QEMU_AUDIO_DRV=none \ +/usr/bin/qemu -S -M pc -m 4096 -smp 4 \ +-object memory-backend-file,prealloc=yes,mem-path=/dev/hugepages1G/libvirt/qemu,\ +size=1024M,id=ram-node0,host-nodes=0-3,policy=bind \ +-numa node,nodeid=0,cpus=0,memdev=ram-node0 \ +-object memory-backend-file,prealloc=yes,mem-path=/dev/hugepages2M/libvirt/qemu,\ +size=1024M,id=ram-node1,host-nodes=0-3,policy=bind \ +-numa node,nodeid=1,cpus=1,memdev=ram-node1 \ +-object memory-backend-file,prealloc=yes,mem-path=/dev/hugepages1G/libvirt/qemu,\ +size=1024M,id=ram-node2,host-nodes=0-3,policy=bind \ +-numa node,nodeid=2,cpus=2,memdev=ram-node2 \ +-object memory-backend-file,prealloc=yes,mem-path=/dev/hugepages1G/libvirt/qemu,\ +size=1024M,id=ram-node3,host-nodes=3,policy=bind \ +-numa node,nodeid=3,cpus=3,memdev=ram-node3 \ +-nographic -monitor unix:/tmp/test-monitor,server,nowait -no-acpi -boot c -usb \ +-hda /dev/HostVG/QEMUGuest1 -net none -serial none -parallel none diff --git a/tests/qemuxml2argvdata/qemuxml2argv-hugepages.args b/tests/qemuxml2argvdata/qemuxml2argv-hugepages.args index d42d9fc..51c5d62 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-hugepages.args +++ b/tests/qemuxml2argvdata/qemuxml2argv-hugepages.args @@ -1,5 +1,5 @@ LC_ALL=C PATH=/bin HOME=/home/test USER=test LOGNAME=test QEMU_AUDIO_DRV=none \ /usr/bin/qemu -S -M \ -pc -m 214 -mem-prealloc -mem-path /dev/hugepages/libvirt/qemu -smp 1 \ +pc -m 214 -mem-prealloc -mem-path /dev/hugepages2M/libvirt/qemu -smp 1 \ -nographic -monitor unix:/tmp/test-monitor,server,nowait -no-acpi -boot c -usb -hda \ /dev/HostVG/QEMUGuest1 -net none -serial none -parallel none diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c index 1a5a4b0..63c9c4b 100644 --- a/tests/qemuxml2argvtest.c +++ b/tests/qemuxml2argvtest.c @@ -525,13 +525,15 @@ mymain(void) if (VIR_STRDUP_QUIET(driver.config->stateDir, "/nowhere") < 0) return EXIT_FAILURE; VIR_FREE(driver.config->hugetlbfs); - if (VIR_ALLOC_N(driver.config->hugetlbfs, 1) < 0) + if (VIR_ALLOC_N(driver.config->hugetlbfs, 2) < 0) return EXIT_FAILURE; - driver.config->nhugetlbfs = 1; - if (VIR_STRDUP(driver.config->hugetlbfs[0].mnt_dir, "/dev/hugepages") < 0) + driver.config->nhugetlbfs = 2; + if (VIR_STRDUP(driver.config->hugetlbfs[0].mnt_dir, "/dev/hugepages2M") < 0 || + VIR_STRDUP(driver.config->hugetlbfs[1].mnt_dir, "/dev/hugepages1G") < 0) return EXIT_FAILURE; driver.config->hugetlbfs[0].size = 2048; driver.config->hugetlbfs[0].deflt = true; + driver.config->hugetlbfs[1].size = 1048576; driver.config->spiceTLS = 1; if (VIR_STRDUP_QUIET(driver.config->spicePassword, "123456") < 0) return EXIT_FAILURE; @@ -665,6 +667,8 @@ mymain(void) DO_TEST("hyperv-off", NONE); DO_TEST("hugepages", QEMU_CAPS_MEM_PATH); + DO_TEST("hugepages-pages", QEMU_CAPS_MEM_PATH, QEMU_CAPS_OBJECT_MEMORY_RAM, + QEMU_CAPS_OBJECT_MEMORY_FILE); DO_TEST("nosharepages", QEMU_CAPS_MACHINE_OPT, QEMU_CAPS_MEM_MERGE); DO_TEST("disk-cdrom", NONE); DO_TEST("disk-cdrom-network-http", QEMU_CAPS_KVM, QEMU_CAPS_DEVICE, -- 1.8.5.5

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- .../qemuxml2argv-hugepages-pages2.args | 10 ++++++ .../qemuxml2argv-hugepages-pages2.xml | 38 ++++++++++++++++++++++ .../qemuxml2argv-hugepages-pages3.args | 9 +++++ .../qemuxml2argv-hugepages-pages3.xml | 38 ++++++++++++++++++++++ tests/qemuxml2argvtest.c | 4 +++ tests/qemuxml2xmltest.c | 2 ++ 6 files changed, 101 insertions(+) create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages2.args create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages2.xml create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages3.args create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages3.xml diff --git a/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages2.args b/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages2.args new file mode 100644 index 0000000..9211bc6 --- /dev/null +++ b/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages2.args @@ -0,0 +1,10 @@ +LC_ALL=C PATH=/bin HOME=/home/test USER=test LOGNAME=test QEMU_AUDIO_DRV=none \ +/usr/bin/qemu -S -M pc -m 1024 -smp 2 \ +-object memory-backend-file,prealloc=yes,\ +mem-path=/dev/hugepages2M/libvirt/qemu,size=256M,id=ram-node0 \ +-numa node,nodeid=0,cpus=0,memdev=ram-node0 \ +-object memory-backend-file,prealloc=yes,\ +mem-path=/dev/hugepages2M/libvirt/qemu,size=768M,id=ram-node1 \ +-numa node,nodeid=1,cpus=1,memdev=ram-node1 \ +-nographic -monitor unix:/tmp/test-monitor,server,nowait -no-acpi -boot c \ +-usb -hda /dev/HostVG/QEMUGuest1 -net none -serial none -parallel none diff --git a/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages2.xml b/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages2.xml new file mode 100644 index 0000000..3df870b --- /dev/null +++ b/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages2.xml @@ -0,0 +1,38 @@ +<domain type='qemu'> + <name>SomeDummyHugepagesGuest</name> + <uuid>ef1bdff4-27f3-4e85-a807-5fb4d58463cc</uuid> + <memory unit='KiB'>1048576</memory> + <currentMemory unit='KiB'>1048576</currentMemory> + <memoryBacking> + <hugepages> + <page size='2048' unit='KiB'/> + </hugepages> + </memoryBacking> + <vcpu placement='static'>2</vcpu> + <os> + <type arch='i686' machine='pc'>hvm</type> + <boot dev='hd'/> + </os> + <cpu> + <numa> + <cell id='0' cpus='0' memory='262144'/> + <cell id='1' cpus='1' memory='786432'/> + </numa> + </cpu> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu</emulator> + <disk type='block' device='disk'> + <source dev='/dev/HostVG/QEMUGuest1'/> + <target dev='hda' bus='ide'/> + <address type='drive' controller='0' bus='0' target='0' unit='0'/> + </disk> + <controller type='usb' index='0'/> + <controller type='ide' index='0'/> + <controller type='pci' index='0' model='pci-root'/> + <memballoon model='virtio'/> + </devices> +</domain> diff --git a/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages3.args b/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages3.args new file mode 100644 index 0000000..27b3f8e --- /dev/null +++ b/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages3.args @@ -0,0 +1,9 @@ +LC_ALL=C PATH=/bin HOME=/home/test USER=test LOGNAME=test QEMU_AUDIO_DRV=none \ +/usr/bin/qemu -S -M pc -m 1024 -smp 2 \ +-object memory-backend-ram,size=256M,id=ram-node0 \ +-numa node,nodeid=0,cpus=0,memdev=ram-node0 \ +-object memory-backend-file,prealloc=yes,\ +mem-path=/dev/hugepages1G/libvirt/qemu,size=768M,id=ram-node1 \ +-numa node,nodeid=1,cpus=1,memdev=ram-node1 \ +-nographic -monitor unix:/tmp/test-monitor,server,nowait -no-acpi -boot c -usb \ +-hda /dev/HostVG/QEMUGuest1 -net none -serial none -parallel none diff --git a/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages3.xml b/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages3.xml new file mode 100644 index 0000000..35aa2cf --- /dev/null +++ b/tests/qemuxml2argvdata/qemuxml2argv-hugepages-pages3.xml @@ -0,0 +1,38 @@ +<domain type='qemu'> + <name>SomeDummyHugepagesGuest</name> + <uuid>ef1bdff4-27f3-4e85-a807-5fb4d58463cc</uuid> + <memory unit='KiB'>1048576</memory> + <currentMemory unit='KiB'>1048576</currentMemory> + <memoryBacking> + <hugepages> + <page size='1048576' unit='KiB' nodeset='1'/> + </hugepages> + </memoryBacking> + <vcpu placement='static'>2</vcpu> + <os> + <type arch='i686' machine='pc'>hvm</type> + <boot dev='hd'/> + </os> + <cpu> + <numa> + <cell id='0' cpus='0' memory='262144'/> + <cell id='1' cpus='1' memory='786432'/> + </numa> + </cpu> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu</emulator> + <disk type='block' device='disk'> + <source dev='/dev/HostVG/QEMUGuest1'/> + <target dev='hda' bus='ide'/> + <address type='drive' controller='0' bus='0' target='0' unit='0'/> + </disk> + <controller type='usb' index='0'/> + <controller type='ide' index='0'/> + <controller type='pci' index='0' model='pci-root'/> + <memballoon model='virtio'/> + </devices> +</domain> diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c index 63c9c4b..b4505a9 100644 --- a/tests/qemuxml2argvtest.c +++ b/tests/qemuxml2argvtest.c @@ -669,6 +669,10 @@ mymain(void) DO_TEST("hugepages", QEMU_CAPS_MEM_PATH); DO_TEST("hugepages-pages", QEMU_CAPS_MEM_PATH, QEMU_CAPS_OBJECT_MEMORY_RAM, QEMU_CAPS_OBJECT_MEMORY_FILE); + DO_TEST("hugepages-pages2", QEMU_CAPS_MEM_PATH, QEMU_CAPS_OBJECT_MEMORY_RAM, + QEMU_CAPS_OBJECT_MEMORY_FILE); + DO_TEST("hugepages-pages3", QEMU_CAPS_MEM_PATH, QEMU_CAPS_OBJECT_MEMORY_RAM, + QEMU_CAPS_OBJECT_MEMORY_FILE); DO_TEST("nosharepages", QEMU_CAPS_MACHINE_OPT, QEMU_CAPS_MEM_MERGE); DO_TEST("disk-cdrom", NONE); DO_TEST("disk-cdrom-network-http", QEMU_CAPS_KVM, QEMU_CAPS_DEVICE, diff --git a/tests/qemuxml2xmltest.c b/tests/qemuxml2xmltest.c index 09cb228..8d46b40 100644 --- a/tests/qemuxml2xmltest.c +++ b/tests/qemuxml2xmltest.c @@ -198,6 +198,8 @@ mymain(void) DO_TEST("hugepages"); DO_TEST("hugepages-pages"); + DO_TEST("hugepages-pages2"); + DO_TEST("hugepages-pages3"); DO_TEST("nosharepages"); DO_TEST("disk-aio"); DO_TEST("disk-cdrom"); -- 1.8.5.5
participants (3)
-
Daniel P. Berrange
-
Eric Blake
-
Michal Privoznik