Just noticed that the functions signatures of the new functions created
by this
patch are mostly in one line, like:
static int read_dt_phandle(...)
While Libvirt code standard is the return value in the first line, then
the function
name after a line break:
void
qemuDomainUpdateCurrentMemorySize(...)
I'll fix this naming stardard in the next spin.
On 3/3/19 10:23 AM, Daniel Henrique Barboza wrote:
The NVLink2 support in QEMU implements the detection of NVLink2
capable devices by verfying the attributes of the VFIO mem region
QEMU allocates for the NVIDIA GPUs. To properly allocate an
adequate amount of memLock, Libvirt needs this information before
a QEMU instance is even created.
An alternative is presented in this patch. Given a PCI device,
we'll traverse the device tree at /proc/device-tree to check if
the device has a NPU bridge, retrieve the node of the NVLink2 bus,
find the memory-node that is related to the bus and see if it's a
NVLink2 bus by inspecting its 'reg' value. This logic is contained
inside the 'device_is_nvlink2_capable' function, which uses other
new helper functions to navigate and fetch values from the device
tree nodes.
Signed-off-by: Daniel Henrique Barboza <danielhb413(a)gmail.com>Just
---
src/qemu/qemu_domain.c | 188 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 188 insertions(+)
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 55578f3d19..76e1e4b161 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -10331,6 +10331,194 @@ qemuDomainUpdateCurrentMemorySize(virDomainObjPtr vm)
}
+/**
+ * Reads a phandle file and returns the phandle value.
+ */
+static int read_dt_phandle(const char* file)
+{
+ unsigned int buf[1];
+ size_t read;
+ FILE *f;
+
+ f = fopen(file, "r");
+ if (!f)
+ return -1;
+
+ read = fread(buf, sizeof(unsigned int), 1, f);
+
+ if (!read) {
+ fclose(f);
+ return 0;
+ }
+
+ fclose(f);
+ return be32toh(buf[0]);
+}
+
+
+/**
+ * Reads a memory reg file and returns the first 4 int values.
+ *
+ * The caller is responsible for freeing the returned array.
+ */
+static unsigned int *read_dt_memory_reg(const char *file)
+{
+ unsigned int *buf;
+ size_t read, i;
+ FILE *f;
+
+ f = fopen(file, "r");
+ if (!f)
+ return NULL;
+
+ buf = calloc(4, sizeof(unsigned int));
+ read = fread(buf, sizeof(unsigned int), 4, f);
+
+ if (!read && read < 4)
+ /* shouldn't happen */
+ VIR_FREE(buf);
+ else for (i = 0; i < 4; i++)
+ buf[i] = be32toh(buf[i]);
+
+ fclose(f);
+ return buf;
+}
+
+
+/**
+ * This wrapper function receives arguments to be used in a
+ * 'find' call to retrieve the file names that matches
+ * the criteria inside the /proc/device-tree dir.
+ *
+ * A 'find' call with '-iname phandle' inside /proc/device-tree
+ * provides more than a thousand matches. Adding '-path' to
+ * narrow it down further is necessary to keep the file
+ * listing sane.
+ *
+ * The caller is responsible to free the buffer returned by
+ * this function.
+ */
+static char *retrieve_dt_files_pattern(const char *path_pattern,
+ const char *file_pattern)
+{
+ virCommandPtr cmd = NULL;
+ char *output = NULL;
+
+ cmd = virCommandNew("find");
+ virCommandAddArgList(cmd, "/proc/device-tree/","-path",
path_pattern,
+ "-iname", file_pattern, NULL);
+ virCommandSetOutputBuffer(cmd, &output);
+
+ if (virCommandRun(cmd, NULL) < 0)
+ VIR_FREE(output);
+
+ virCommandFree(cmd);
+ return output;
+}
+
+
+/**
+ * Helper function that receives a listing of file names and
+ * calls read_dt_phandle() on each one finding for a match
+ * with the given phandle argument. Returns the file name if a
+ * match is found, NULL otherwise.
+ */
+static char *find_dt_file_with_phandle(char *files, int phandle)
+{
+ char *line, *tmp;
+ int ret;
+
+ line = strtok_r(files, "\n", &tmp);
+ do {
+ ret = read_dt_phandle(line);
+ if (ret == phandle)
+ break;
+ } while ((line = strtok_r(NULL, "\n", &tmp)) != NULL);
+
+ return line;
+}
+
+
+/**
+ * This function receives a string that represents a PCI device,
+ * such as '0004:04:00.0', and tells if the device is NVLink2 capable.
+ *
+ * The logic goes as follows:
+ *
+ * 1 - get the phandle of a nvlink of the device, reading the 'ibm,npu'
+ * attribute;
+ * 2 - find the device tree node of the nvlink bus using the phandle
+ * found in (1)
+ * 3 - get the phandle of the memory region of the nvlink bus
+ * 4 - find the device tree node of the memory region using the
+ * phandle found in (3)
+ * 5 - read the 'reg' value of the memory region. If the value of
+ * the second 64 bit value is 0x02 0x00, the device is attached
+ * to a NVLink2 bus.
+ *
+ * If any of these steps fails, the function returns false.
+ */
+static bool device_is_nvlink2_capable(const char *device)
+{
+ char *file, *files, *tmp;
+ unsigned int *reg;
+ int phandle;
+
+ if ((virAsprintf(&file, "/sys/bus/pci/devices/%s/of_node/ibm,npu",
+ device)) < 0)
+ return false;
+
+ /* Find phandles of nvlinks: */
+ if ((phandle = read_dt_phandle(file)) == -1)
+ return false;
+
+ /* Find a DT node for the phandle found */
+ files = retrieve_dt_files_pattern("*device-tree/pci*",
"phandle");
+ if (!files)
+ return false;
+
+ if ((file = find_dt_file_with_phandle(files, phandle)) == NULL)
+ goto fail;
+
+ /* Find a phandle of the GPU memory region of the device. The
+ * file found above ends with '/phandle' - the memory region
+ * of the GPU ends with '/memory-region */
+ tmp = strrchr(file, '/');
+ *tmp = '\0';
+ file = strcat(file, "/memory-region");
+
+ if ((phandle = read_dt_phandle(file)) == -1)
+ goto fail;
+
+ file = NULL;
+ VIR_FREE(files);
+
+ /* Find the memory node for the phandle found above */
+ files = retrieve_dt_files_pattern("*device-tree/memory*",
"phandle");
+ if (!files)
+ return false;
+
+ if ((file = find_dt_file_with_phandle(files, phandle)) == NULL)
+ goto fail;
+
+ /* And see its size in the second 64bit value of 'reg'. First,
+ * the end of the file needs to be changed from '/phandle' to
+ * '/reg' */
+ tmp = strrchr(file, '/');
+ *tmp = '\0';
+ file = strcat(file, "/reg");
+
+ reg = read_dt_memory_reg(file);
+ if (reg && reg[2] == 0x20 && reg[3] == 0x00)
+ return true;
+
+fail:
+ VIR_FREE(files);
+ VIR_FREE(reg);
+ return false;
+}
+
+
/**
* qemuDomainGetMemLockLimitBytes:
* @def: domain definition