
On Wed, Mar 27, 2024 at 11:56 Philippe Mathieu-Daudé <philmd@linaro.org> wrote:
The whole RDMA subsystem was deprecated in commit e9a54265f5 ("hw/rdma: Deprecate the pvrdma device and the rdma subsystem") released in v8.2. Time to remove it.
Keep the RAM_SAVE_FLAG_HOOK definition since it might appear in old migration streams.
Remove the dependencies on libibumad and libibverbs.
Remove the generated vmw_pvrdma/ directory from linux-headers.
Remove RDMA handling from migration.
Remove RDMA handling in GlusterFS block driver.
I don't think these two were deprecated? They are unrelated to pvrdma.

Paolo
Remove rdmacm-mux tool from contrib/.
Remove PVRDMA device.
Cc: Peter Xu <peterx@redhat.com>
Cc: Li Zhijian <lizhijian@fujitsu.com>
Cc: Yuval Shaia <yuval.shaia.ml@gmail.com>
Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
 MAINTAINERS                                   |   17 -
 docs/about/deprecated.rst                     |    9 -
 docs/about/removed-features.rst               |    4 +
 docs/devel/migration/main.rst                 |    6 -
 docs/pvrdma.txt                               |  345 --
 docs/rdma.txt                                 |  420 --
 docs/system/device-url-syntax.rst.inc         |    4 +-
 docs/system/loongarch/virt.rst                |    2 +-
 docs/system/qemu-block-drivers.rst.inc        |    1 -
 meson.build                                   |   59 -
 qapi/machine.json                             |   17 -
 qapi/migration.json                           |   31 +-
 qapi/qapi-schema.json                         |    1 -
 qapi/rdma.json                                |   38 -
 contrib/rdmacm-mux/rdmacm-mux.h               |   61 -
 hw/rdma/rdma_backend.h                        |  129 -
 hw/rdma/rdma_backend_defs.h                   |   76 -
 hw/rdma/rdma_rm.h                             |   97 -
 hw/rdma/rdma_rm_defs.h                        |  146 -
 hw/rdma/rdma_utils.h                          |   63 -
 hw/rdma/trace.h                               |    1 -
 hw/rdma/vmw/pvrdma.h                          |  144 -
 hw/rdma/vmw/pvrdma_dev_ring.h                 |   46 -
 hw/rdma/vmw/pvrdma_qp_ops.h                   |   28 -
 hw/rdma/vmw/trace.h                           |    1 -
 include/hw/rdma/rdma.h                        |   37 -
 include/monitor/hmp.h                         |    1 -
 .../infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h |  685 ---
 .../infiniband/hw/vmw_pvrdma/pvrdma_verbs.h   |  348 --
 .../standard-headers/rdma/vmw_pvrdma-abi.h    |  310 --
 migration/migration-stats.h                   |    6 +-
 migration/migration.h                         |    9 -
 migration/options.h                           |    2 -
 migration/rdma.h                              |   69 -
 block/gluster.c                               |   39 -
 contrib/rdmacm-mux/main.c                     |  831 ----
 hw/core/machine-qmp-cmds.c                    |   32 -
 hw/rdma/rdma.c                                |   30 -
 hw/rdma/rdma_backend.c                        | 1401 ------
 hw/rdma/rdma_rm.c                             |  812 ----
 hw/rdma/rdma_utils.c                          |  126 -
 hw/rdma/vmw/pvrdma_cmd.c                      |  815 ----
 hw/rdma/vmw/pvrdma_dev_ring.c                 |  141 -
 hw/rdma/vmw/pvrdma_main.c                     |  735 ---
 hw/rdma/vmw/pvrdma_qp_ops.c                   |  298 --
 migration/migration-stats.c                   |    5 +-
 migration/migration.c                         |   31 -
 migration/options.c                           |   16 -
 migration/qemu-file.c                         |    1 -
 migration/ram.c                               |   86 +-
 migration/rdma.c                              | 4184 -----------------
 migration/savevm.c                            |    2 +-
 monitor/qmp-cmds.c                            |    1 -
 Kconfig.host                                  |    3 -
 contrib/rdmacm-mux/meson.build                |    7 -
 hmp-commands-info.hx                          |   13 -
 hw/Kconfig                                    |    1 -
 hw/meson.build                                |    1 -
 hw/rdma/Kconfig                               |    3 -
 hw/rdma/meson.build                           |   12 -
 hw/rdma/trace-events                          |   31 -
 hw/rdma/vmw/trace-events                      |   17 -
 meson_options.txt                             |    4 -
 migration/meson.build                         |    1 -
 migration/trace-events                        |   68 +-
 qapi/meson.build                              |    1 -
 qemu-options.hx                               |    6 -
 .../ci/org.centos/stream/8/x86_64/configure   |    1 -
 scripts/ci/setup/build-environment.yml        |    2 -
 scripts/coverity-scan/run-coverity-scan       |    2 +-
 scripts/meson-buildoptions.sh                 |    6 -
 scripts/update-linux-headers.sh               |   27 -
 tests/lcitool/projects/qemu.yml               |    2 -
 tests/migration/guestperf/engine.py           |    4 +-
 74 files changed, 20 insertions(+), 12991 deletions(-)
 delete mode 100644 docs/pvrdma.txt
 delete mode 100644 docs/rdma.txt
 delete mode 100644 qapi/rdma.json
 delete mode 100644 contrib/rdmacm-mux/rdmacm-mux.h
 delete mode 100644 hw/rdma/rdma_backend.h
 delete mode 100644 hw/rdma/rdma_backend_defs.h
 delete mode 100644 hw/rdma/rdma_rm.h
 delete mode 100644 hw/rdma/rdma_rm_defs.h
 delete mode 100644 hw/rdma/rdma_utils.h
 delete mode 100644 hw/rdma/trace.h
 delete mode 100644 hw/rdma/vmw/pvrdma.h
 delete mode 100644 hw/rdma/vmw/pvrdma_dev_ring.h
 delete mode 100644 hw/rdma/vmw/pvrdma_qp_ops.h
 delete mode 100644 hw/rdma/vmw/trace.h
 delete mode 100644 include/hw/rdma/rdma.h
 delete mode 100644 include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
 delete mode 100644 include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
 delete mode 100644 include/standard-headers/rdma/vmw_pvrdma-abi.h
 delete mode 100644 migration/rdma.h
 delete mode 100644 contrib/rdmacm-mux/main.c
 delete mode 100644 hw/rdma/rdma.c
 delete mode 100644 hw/rdma/rdma_backend.c
 delete mode 100644 hw/rdma/rdma_rm.c
 delete mode 100644 hw/rdma/rdma_utils.c
 delete mode 100644 hw/rdma/vmw/pvrdma_cmd.c
 delete mode 100644 hw/rdma/vmw/pvrdma_dev_ring.c
 delete mode 100644 hw/rdma/vmw/pvrdma_main.c
 delete mode 100644 hw/rdma/vmw/pvrdma_qp_ops.c
 delete mode 100644 migration/rdma.c
 delete mode 100644 contrib/rdmacm-mux/meson.build
 delete mode 100644 hw/rdma/Kconfig
 delete mode 100644 hw/rdma/meson.build
 delete mode 100644 hw/rdma/trace-events
 delete mode 100644 hw/rdma/vmw/trace-events
diff --git a/MAINTAINERS b/MAINTAINERS
index a07af6b9d4..05226cea0a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3426,13 +3426,6 @@ F: docs/devel/migration.rst
 F: qapi/migration.json
 F: tests/migration/
 F: util/userfaultfd.c
-X: migration/rdma*
-
-RDMA Migration
-R: Li Zhijian <lizhijian@fujitsu.com>
-R: Peter Xu <peterx@redhat.com>
-S: Odd Fixes
-F: migration/rdma*
 
 Migration dirty limit and dirty page rate
 M: Hyman Huang <yong.huang@smartx.com>
@@ -4060,16 +4053,6 @@ F: block/replication.c
 F: tests/unit/test-replication.c
 F: docs/block-replication.txt
 
-PVRDMA
-M: Yuval Shaia <yuval.shaia.ml@gmail.com>
-M: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
-S: Odd Fixes
-F: hw/rdma/*
-F: hw/rdma/vmw/*
-F: docs/pvrdma.txt
-F: contrib/rdmacm-mux/*
-F: qapi/rdma.json
-
 Semihosting
 M: Alex Bennée <alex.bennee@linaro.org>
 S: Maintained
diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 7b548519b5..29eae69e50 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -376,15 +376,6 @@ recommending to switch to their stable counterparts:
 - "Zve64f" should be replaced with "zve64f"
 - "Zve64d" should be replaced with "zve64d"
 
-``-device pvrdma`` and the rdma subsystem (since 8.2)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The pvrdma device and the whole rdma subsystem are in a bad shape and
-without active maintenance. The QEMU project intends to remove this
-device and subsystem from the code base in a future release without
-replacement unless somebody steps up and improves the situation.
-
-
 Block device options
 ''''''''''''''''''''
 
diff --git a/docs/about/removed-features.rst b/docs/about/removed-features.rst
index f9cf874f7b..4d5bdc43b4 100644
--- a/docs/about/removed-features.rst
+++ b/docs/about/removed-features.rst
@@ -909,6 +909,10 @@ contains native support for this feature and thus use of the option ROM
 approach was obsolete. The native SeaBIOS support can be activated by using
 ``-machine graphics=off``.
 
+``pvrdma`` and the RDMA subsystem (removed in 9.1)
+''''''''''''''''''''''''''''''''''''''''''''''''''
+
+The 'pvrdma' device and the whole RDMA subsystem have been removed.
+
 Related binaries
 ----------------
 
diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst
index 54385a23e5..70278ce1e3 100644
--- a/docs/devel/migration/main.rst
+++ b/docs/devel/migration/main.rst
@@ -47,12 +47,6 @@ over any transport.
 QEMU interference. Note that QEMU does not flush cached file
 data/metadata at the end of migration.
 
-In addition, support is included for migration using RDMA, which
-transports the page data using ``RDMA``, where the hardware takes care of
-transporting the pages, and the load on the CPU is much lower. While the
-internals of RDMA migration are a bit different, this isn't really visible
-outside the RAM migration code.
-
 All these migration protocols use the same infrastructure to
 save/restore state devices. This infrastructure is shared with the
 savevm/loadvm functionality.
diff --git a/docs/pvrdma.txt b/docs/pvrdma.txt
deleted file mode 100644
index 5c122fe818..0000000000
--- a/docs/pvrdma.txt
+++ /dev/null
@@ -1,345 +0,0 @@
-Paravirtualized RDMA Device (PVRDMA)
-====================================
-
-
-1. Description
-===============
-PVRDMA is the QEMU implementation of VMware's paravirtualized RDMA device.
-It works with its Linux Kernel driver AS IS, no need for any special guest
-modifications.
-
-While it complies with the VMware device, it can also communicate with bare
-metal RDMA-enabled machines as peers.
-
-It does not require an RDMA HCA in the host, it can work with Soft-RoCE (rxe).
-
-It does not require the whole guest RAM to be pinned allowing memory
-over-commit and, even if not implemented yet, migration support will be
-possible with some HW assistance.
-
-A project presentation accompany this document:
-- https://blog.linuxplumbersconf.org/2017/ocw/system/presentations/4730/origin...
-
-
-
-2. Setup
-========
-
-
-2.1 Guest setup
-===============
-Fedora 27+ kernels work out of the box, older distributions
-require updating the kernel to 4.14 to include the pvrdma driver.
-
-However the libpvrdma library needed by User Level Software is still
-not available as part of the distributions, so the rdma-core library
-needs to be compiled and optionally installed.
-
-Please follow the instructions at:
-  https://github.com/linux-rdma/rdma-core.git
-
-
-2.2 Host Setup
-==============
-The pvrdma backend is an ibdevice interface that can be exposed
-either by a Soft-RoCE(rxe) device on machines with no RDMA device,
-or an HCA SRIOV function(VF/PF).
-Note that ibdevice interfaces can't be shared between pvrdma devices,
-each one requiring a separate instance (rxe or SRIOV VF).
-
-
-2.2.1 Soft-RoCE backend(rxe)
-===========================
-A stable version of rxe is required, Fedora 27+ or a Linux
-Kernel 4.14+ is preferred.
-
-The rdma_rxe module is part of the Linux Kernel but not loaded by default.
-Install the User Level library (librxe) following the instructions from:
-https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home
-
-Associate an ETH interface with rxe by running:
-    rxe_cfg add eth0
-An rxe0 ibdevice interface will be created and can be used as pvrdma backend.
-
-
-2.2.2 RDMA device Virtual Function backend
-==========================================
-Nothing special is required, the pvrdma device can work not only with
-Ethernet Links, but also Infinibands Links.
-All is needed is an ibdevice with an active port, for Mellanox cards
-will be something like mlx5_6 which can be the backend.
-
-
-2.2.3 QEMU setup
-================
-Configure QEMU with --enable-rdma flag, installing
-the required RDMA libraries.
-
-
-
-3. Usage
-========
-
-
-3.1 VM Memory settings
-======================
-Currently the device is working only with memory backed RAM
-and it must be mark as "shared":
-   -m 1G \
-   -object memory-backend-ram,id=mb1,size=1G,share \
-   -numa node,memdev=mb1 \
-
-
-3.2 MAD Multiplexer
-===================
-MAD Multiplexer is a service that exposes MAD-like interface for VMs in
-order to overcome the limitation where only single entity can register with
-MAD layer to send and receive RDMA-CM MAD packets.
-
-To build rdmacm-mux run
-# make rdmacm-mux
-
-Before running the rdmacm-mux make sure that both ib_cm and rdma_cm kernel
-modules aren't loaded, otherwise the rdmacm-mux service will fail to start.
-
-The application accepts 3 command line arguments and exposes a UNIX socket
-to pass control and data to it.
--d rdma-device-name  Name of RDMA device to register with
--s unix-socket-path  Path to unix socket to listen (default /var/run/rdmacm-mux)
--p rdma-device-port  Port number of RDMA device to register with (default 1)
-The final UNIX socket file name is a concatenation of the 3 arguments so
-for example for device mlx5_0 on port 2 this /var/run/rdmacm-mux-mlx5_0-2
-will be created.
-
-pvrdma requires this service.
-
-Please refer to contrib/rdmacm-mux for more details.
-
-
-3.3 Service exposed by libvirt daemon
-=====================================
-The control over the RDMA device's GID table is done by updating the
-device's Ethernet function addresses.
-Usually the first GID entry is determined by the MAC address, the second by
-the first IPv6 address and the third by the IPv4 address. Other entries can
-be added by adding more IP addresses. The opposite is the same, i.e.
-whenever an address is removed, the corresponding GID entry is removed.
-The process is done by the network and RDMA stacks. Whenever an address is
-added the ib_core driver is notified and calls the device driver add_gid
-function which in turn update the device.
-To support this in pvrdma device the device hooks into the create_bind and
-destroy_bind HW commands triggered by pvrdma driver in guest.
-
-Whenever changed is made to the pvrdma port's GID table a special QMP
-messages is sent to be processed by libvirt to update the address of the
-backend Ethernet device.
-
-pvrdma requires that libvirt service will be up.
-
-
-3.4 PCI devices settings
-========================
-RoCE device exposes two functions - an Ethernet and RDMA.
-To support it, pvrdma device is composed of two PCI functions, an Ethernet
-device of type vmxnet3 on PCI slot 0 and a PVRDMA device on PCI slot 1. The
-Ethernet function can be used for other Ethernet purposes such as IP.
-
-
-3.5 Device parameters
-=====================
-- netdev: Specifies the Ethernet device function name on the host for
-  example enp175s0f0. For Soft-RoCE device (rxe) this would be the Ethernet
-  device used to create it.
-- ibdev: The IB device name on host for example rxe0, mlx5_0 etc.
-- mad-chardev: The name of the MAD multiplexer char device.
-- ibport: In case of multi-port device (such as Mellanox's HCA) this
-  specify the port to use. If not set 1 will be used.
-- dev-caps-max-mr-size: The maximum size of MR.
-- dev-caps-max-qp:      Maximum number of QPs.
-- dev-caps-max-cq:      Maximum number of CQs.
-- dev-caps-max-mr:      Maximum number of MRs.
-- dev-caps-max-pd:      Maximum number of PDs.
-- dev-caps-max-ah:      Maximum number of AHs.
-
-Notes:
-- The first 3 parameters are mandatory settings, the rest have their
-  defaults.
-- The last 8 parameters (the ones that prefixed by dev-caps) defines the top
-  limits but the final values is adjusted by the backend device limitations.
-- netdev can be extracted from ibdev's sysfs
-  (/sys/class/infiniband/<ibdev>/device/net/)
-
-
-3.6 Example
-===========
-Define bridge device with vmxnet3 network backend:
-<interface type='bridge'>
-  <mac address='56:b4:44:e9:62:dc'/>
-  <source bridge='bridge1'/>
-  <model type='vmxnet3'/>
-  <address type='pci' domain='0x0000' bus='0x00' slot='0x10' function='0x0' multifunction='on'/>
-</interface>
-
-Define pvrdma device:
-<qemu:commandline>
-  <qemu:arg value='-object'/>
-  <qemu:arg value='memory-backend-ram,id=mb1,size=1G,share'/>
-  <qemu:arg value='-numa'/>
-  <qemu:arg value='node,memdev=mb1'/>
-  <qemu:arg value='-chardev'/>
-  <qemu:arg value='socket,path=/var/run/rdmacm-mux-rxe0-1,id=mads'/>
-  <qemu:arg value='-device'/>
-  <qemu:arg value='pvrdma,addr=10.1,ibdev=rxe0,netdev=bridge0,mad-chardev=mads'/>
-</qemu:commandline>
-
-
-
-4. Implementation details
-=========================
-
-
-4.1 Overview
-============
-The device acts like a proxy between the Guest Driver and the host
-ibdevice interface.
-On configuration path:
- - For every hardware resource request (PD/QP/CQ/...) the pvrdma will request
-   a resource from the backend interface, maintaining a 1-1 mapping
-   between the guest and host.
-On data path:
- - Every post_send/receive received from the guest will be converted into
-   a post_send/receive for the backend. The buffers data will not be touched
-   or copied resulting in near bare-metal performance for large enough buffers.
- - Completions from the backend interface will result in completions for
-   the pvrdma device.
-
-
-4.2 PCI BARs
-============
-PCI Bars:
-    BAR 0 - MSI-X
-        MSI-X vectors:
-            (0) Command - used when execution of a command is completed.
-            (1) Async - not in use.
-            (2) Completion - used when a completion event is placed in
-                device's CQ ring.
-    BAR 1 - Registers
-        --------------------------------------------------------
-        | VERSION | DSR | CTL | REQ | ERR | ICR | IMR | MAC |
-        --------------------------------------------------------
-        DSR - Address of driver/device shared memory used
-              for the command channel, used for passing:
-                - General info such as driver version
-                - Address of 'command' and 'response'
-                - Address of async ring
-                - Address of device's CQ ring
-                - Device capabilities
-        CTL - Device control operations (activate, reset etc)
-        IMG - Set interrupt mask
-        REQ - Command execution register
-        ERR - Operation status
-
-    BAR 2 - UAR
-        ---------------------------------------------------------
-        | QP_NUM | SEND/RECV Flag || CQ_NUM | ARM/POLL Flag |
-        ---------------------------------------------------------
-        - Offset 0 used for QP operations (send and recv)
-        - Offset 4 used for CQ operations (arm and poll)
-
-
-4.3 Major flows
-===============
-
-4.3.1 Create CQ
-===============
- - Guest driver
-    - Allocates pages for CQ ring
-    - Creates page directory (pdir) to hold CQ ring's pages
-    - Initializes CQ ring
-    - Initializes 'Create CQ' command object (cqe, pdir etc)
-    - Copies the command to 'command' address
-    - Writes 0 into REQ register
- - Device
-    - Reads the request object from the 'command' address
-    - Allocates CQ object and initialize CQ ring based on pdir
-    - Creates the backend CQ
-    - Writes operation status to ERR register
-    - Posts command-interrupt to guest
- - Guest driver
-    - Reads the HW response code from ERR register
-
-4.3.2 Create QP
-===============
- - Guest driver
-    - Allocates pages for send and receive rings
-    - Creates page directory(pdir) to hold the ring's pages
-    - Initializes 'Create QP' command object (max_send_wr,
-      send_cq_handle, recv_cq_handle, pdir etc)
-    - Copies the object to 'command' address
-    - Write 0 into REQ register
- - Device
-    - Reads the request object from 'command' address
-    - Allocates the QP object and initialize
-      - Send and recv rings based on pdir
-      - Send and recv ring state
-    - Creates the backend QP
-    - Writes the operation status to ERR register
-    - Posts command-interrupt to guest
- - Guest driver
-    - Reads the HW response code from ERR register
-
-4.3.3 Post receive
-==================
- - Guest driver
-    - Initializes a wqe and place it on recv ring
-    - Write to qpn|qp_recv_bit (31) to QP offset in UAR
- - Device
-    - Extracts qpn from UAR
-    - Walks through the ring and does the following for each wqe
-       - Prepares the backend CQE context to be used when
-         receiving completion from backend (wr_id, op_code, emu_cq_num)
-       - For each sge prepares backend sge
-       - Calls backend's post_recv
-
-4.3.4 Process backend events
-============================
- - Done by a dedicated thread used to process backend events;
-   at initialization is attached to the device and creates
-   the communication channel.
- - Thread main loop:
-    - Polls for completions
-    - Extracts QEMU _cq_num, wr_id and op_code from context
-    - Writes CQE to CQ ring
-    - Writes CQ number to device CQ
-    - Sends completion-interrupt to guest
-    - Deallocates context
-    - Acks the event to backend
-
-
-
-5. Limitations
-==============
-- The device obviously is limited by the Guest Linux Driver features implementation
-  of the VMware device API.
-- Memory registration mechanism requires mremap for every page in the buffer in order
-  to map it to a contiguous virtual address range. Since this is not the data path
-  it should not matter much. If the default max mr size is increased, be aware that
-  memory registration can take up to 0.5 seconds for 1GB of memory.
-- The device requires target page size to be the same as the host page size,
-  otherwise it will fail to init.
-- QEMU cannot map guest RAM from a file descriptor if a pvrdma device is attached,
-  so it can't work with huge pages. The limitation will be addressed in the future,
-  however QEMU allocates Guest RAM with MADV_HUGEPAGE so if there are enough huge
-  pages available, QEMU will use them. QEMU will fail to init if the requirements
-  are not met.
-
-
-
-6. Performance
-==============
-By design the pvrdma device exits on each post-send/receive, so for small buffers
-the performance is affected; however for medium buffers it will became close to
-bare metal and from 1MB buffers and up it reaches bare metal performance.
-(tested with 2 VMs, the pvrdma devices connected to 2 VFs of the same device)
-
-All the above assumes no memory registration is done on data path.
diff --git a/docs/rdma.txt b/docs/rdma.txt
deleted file mode 100644
index bd8dd799a9..0000000000
--- a/docs/rdma.txt
+++ /dev/null
@@ -1,420 +0,0 @@
-(RDMA: Remote Direct Memory Access)
-RDMA Live Migration Specification, Version # 1
-==============================================
-Wiki: https://wiki.qemu.org/Features/RDMALiveMigration
-Github: git@github.com:hinesmr/qemu.git, 'rdma' branch
-
-Copyright (C) 2013 Michael R. Hines <mrhines@us.ibm.com>
-
-An *exhaustive* paper (2010) shows additional performance details
-linked on the QEMU wiki above.
-
-Contents:
-=========
-* Introduction
-* Before running
-* Running
-* Performance
-* RDMA Migration Protocol Description
-* Versioning and Capabilities
-* QEMUFileRDMA Interface
-* Migration of VM's ram
-* Error handling
-* TODO
-
-Introduction:
-=============
-
-RDMA helps make your migration more deterministic under heavy load because
-of the significantly lower latency and higher throughput over TCP/IP. This is
-because the RDMA I/O architecture reduces the number of interrupts and
-data copies by bypassing the host networking stack. In particular, a TCP-based
-migration, under certain types of memory-bound workloads, may take a more
-unpredictable amount of time to complete the migration if the amount of
-memory tracked during each live migration iteration round cannot keep pace
-with the rate of dirty memory produced by the workload.
-
-RDMA currently comes in two flavors: both Ethernet based (RoCE, or RDMA
-over Converged Ethernet) as well as Infiniband-based. This implementation of
-migration using RDMA is capable of using both technologies because of
-the use of the OpenFabrics OFED software stack that abstracts out the
-programming model irrespective of the underlying hardware.
-
-Refer to openfabrics.org or your respective RDMA hardware vendor for
-an understanding on how to verify that you have the OFED software stack
-installed in your environment. You should be able to successfully link
-against the "librdmacm" and "libibverbs" libraries and development headers
-for a working build of QEMU to run successfully using RDMA Migration.
-
-BEFORE RUNNING:
-===============
-
-Use of RDMA during migration requires pinning and registering memory
-with the hardware. This means that memory must be physically resident
-before the hardware can transmit that memory to another machine.
-If this is not acceptable for your application or product, then the use
-of RDMA migration may in fact be harmful to co-located VMs or other
-software on the machine if there is not sufficient memory available to
-relocate the entire footprint of the virtual machine. If so, then the
-use of RDMA is discouraged and it is recommended to use standard TCP migration.
-
-Experimental: Next, decide if you want dynamic page registration.
-For example, if you have an 8GB RAM virtual machine, but only 1GB
-is in active use, then enabling this feature will cause all 8GB to
-be pinned and resident in memory. This feature mostly affects the
-bulk-phase round of the migration and can be enabled for extremely
-high-performance RDMA hardware using the following command:
-
-QEMU Monitor Command:
-$ migrate_set_capability rdma-pin-all on # disabled by default
-
-Performing this action will cause all 8GB to be pinned, so if that's
-not what you want, then please ignore this step altogether.
-
-On the other hand, this will also significantly speed up the bulk round
-of the migration, which can greatly reduce the "total" time of your migration.
-Example performance of this using an idle VM in the previous example
-can be found in the "Performance" section.
-
-Note: for very large virtual machines (hundreds of GBs), pinning all
-*all* of the memory of your virtual machine in the kernel is very expensive
-may extend the initial bulk iteration time by many seconds,
-and thus extending the total migration time. However, this will not
-affect the determinism or predictability of your migration you will
-still gain from the benefits of advanced pinning with RDMA.
-
-RUNNING:
-========
-
-First, set the migration speed to match your hardware's capabilities:
-
-QEMU Monitor Command:
-$ migrate_set_parameter max-bandwidth 40g # or whatever is the MAX of your RDMA device
-
-Next, on the destination machine, add the following to the QEMU command line:
-
-qemu ..... -incoming rdma:host:port
-
-Finally, perform the actual migration on the source machine:
-
-QEMU Monitor Command:
-$ migrate -d rdma:host:port
-
-PERFORMANCE
-===========
-
-Here is a brief summary of total migration time and downtime using RDMA:
-Using a 40gbps infiniband link performing a worst-case stress test,
-using an 8GB RAM virtual machine:
-
-Using the following command:
-$ apt-get install stress
-$ stress --vm-bytes 7500M --vm 1 --vm-keep
-
-1. Migration throughput: 26 gigabits/second.
-2. Downtime (stop time) varies between 15 and 100 milliseconds.
-
-EFFECTS of memory registration on bulk phase round:
-
-For example, in the same 8GB RAM example with all 8GB of memory in
-active use and the VM itself is completely idle using the same 40 gbps
-infiniband link:
-
-1. rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps
-2. rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps
-
-These numbers would of course scale up to whatever size virtual machine
-you have to migrate using RDMA.
-
-Enabling this feature does *not* have any measurable affect on
-migration *downtime*. This is because, without this feature, all of the
-memory will have already been registered already in advance during
-the bulk round and does not need to be re-registered during the successive
-iteration rounds.
-
-RDMA Protocol Description:
-==========================
-
-Migration with RDMA is separated into two parts:
-
-1. The transmission of the pages using RDMA
-2. Everything else (a control channel is introduced)
-
-"Everything else" is transmitted using a formal
-protocol now, consisting of infiniband SEND messages.
-
-An infiniband SEND message is the standard ibverbs
-message used by applications of infiniband hardware.
-The only difference between a SEND message and an RDMA
-message is that SEND messages cause notifications
-to be posted to the completion queue (CQ) on the
-infiniband receiver side, whereas RDMA messages (used
-for VM's ram) do not (to behave like an actual DMA).
-
-Messages in infiniband require two things:
-
-1. registration of the memory that will be transmitted
-2. (SEND only) work requests to be posted on both
-   sides of the network before the actual transmission
-   can occur.
-
-RDMA messages are much easier to deal with. Once the memory
-on the receiver side is registered and pinned, we're
-basically done. All that is required is for the sender
-side to start dumping bytes onto the link.
-
-(Memory is not released from pinning until the migration
-completes, given that RDMA migrations are very fast.)
-
-SEND messages require more coordination because the
-receiver must have reserved space (using a receive
-work request) on the receive queue (RQ) before QEMUFileRDMA
-can start using them to carry all the bytes as
-a control transport for migration of device state.
-
-To begin the migration, the initial connection setup is
-as follows (migration-rdma.c):
-
-1. Receiver and Sender are started (command line or libvirt):
-2. Both sides post two RQ work requests
-3. Receiver does listen()
-4. Sender does connect()
-5. Receiver accept()
-6. Check versioning and capabilities (described later)
-
-At this point, we define a control channel on top of SEND messages
-which is described by a formal protocol. Each SEND message has a
-header portion and a data portion (but together are transmitted
-as a single SEND message).
-
-Header:
-    * Length  (of the data portion, uint32, network byte order)
-    * Type    (what command to perform, uint32, network byte order)
-    * Repeat  (Number of commands in data portion, same type only)
-
-The 'Repeat' field is here to support future multiple page registrations
-in a single message without any need to change the protocol itself
-so that the protocol is compatible against multiple versions of QEMU.
-Version #1 requires that all server implementations of the protocol must
-check this field and register all requests found in the array of commands located
-in the data portion and return an equal number of results in the response.
-The maximum number of repeats is hard-coded to 4096. This is a conservative
-limit based on the maximum size of a SEND message along with empirical
-observations on the maximum future benefit of simultaneous page registrations.
-
-The 'type' field has 12 different command values:
-     1. Unused
-     2. Error              (sent to the source during bad things)
-     3. Ready              (control-channel is available)
-     4. QEMU File          (for sending non-live device state)
-     5. RAM Blocks request (used right after connection setup)
-     6. RAM Blocks result  (used right after connection setup)
-     7. Compress page      (zap zero page and skip registration)
-     8. Register request   (dynamic chunk registration)
-     9. Register result    ('rkey' to be used by sender)
-    10. Register finished  (registration for current iteration finished)
-    11. Unregister request (unpin previously registered memory)
-    12. Unregister finished (confirmation that unpin completed)
-
-A single control message, as hinted above, can contain within the data
-portion an array of many commands of the same type. If there is more than
-one command, then the 'repeat' field will be greater than 1.
-
-After connection setup, message 5 & 6 are used to exchange ram block
-information and optionally pin all the memory if requested by the user.
-
-After ram block exchange is completed, we have two protocol-level
-functions, responsible for communicating control-channel commands
-using the above list of values:
-
-Logically:
-
-qemu_rdma_exchange_recv(header, expected command type)
-
-1. We transmit a READY command to let the sender know that
-   we are *ready* to receive some data bytes on the control channel.
-2. Before attempting to receive the expected command, we post another
-   RQ work request to replace the one we just used up.
-3. Block on a CQ event channel and wait for the SEND to arrive.
-4. When the send arrives, librdmacm will unblock us.
-5. Verify that the command-type and version received matches the one we expected.
-
-qemu_rdma_exchange_send(header, data, optional response header & data):
-
-1. Block on the CQ event channel waiting for a READY command
-   from the receiver to tell us that the receiver
-   is *ready* for us to transmit some new bytes.
-2. Optionally: if we are expecting a response from the command
-   (that we have not yet transmitted), let's post an RQ
-   work request to receive that data a few moments later.
-3. When the READY arrives, librdmacm will
-   unblock us and we immediately post a RQ work request
-   to replace the one we just used up.
-4. Now, we can actually post the work request to SEND
-   the requested command type of the header we were asked for.
-5. Optionally, if we are expecting a response (as before),
-   we block again and wait for that response using the additional
-   work request we previously posted. (This is used to carry
-   'Register result' commands #6 back to the sender which
-   hold the rkey need to perform RDMA. Note that the virtual address
-   corresponding to this rkey was already exchanged at the beginning
-   of the connection (described below).
-
-All of the remaining command types (not including 'ready')
-described above all use the aforementioned two functions to do the hard work:
-
-1. After connection setup, RAMBlock information is exchanged using
-   this protocol before the actual migration begins. This information includes
-   a description of each RAMBlock on the server side as well as the virtual addresses
-   and lengths of each RAMBlock. This is used by the client to determine the
-   start and stop locations of chunks and how to register them dynamically
-   before performing the RDMA operations.
-2. During runtime, once a 'chunk' becomes full of pages ready to
-   be sent with RDMA, the registration commands are used to ask the
-   other side to register the memory for this chunk and respond
-   with the result (rkey) of the registration.
-3. Also, the QEMUFile interfaces also call these functions (described below)
-   when transmitting non-live state, such as devices or to send
-   its own protocol information during the migration process.
-4. Finally, zero pages are only checked if a page has not yet been registered
-   using chunk registration (or not checked at all and unconditionally
-   written if chunk registration is disabled. This is accomplished using
-   the "Compress" command listed above. If the page *has* been registered
-   then we check the entire chunk for zero. Only if the entire chunk is
-   zero, then we send a compress command to zap the page on the other side.
-
-Versioning and Capabilities
-===========================
-Current version of the protocol is version #1.
-
-The same version applies to both for protocol traffic and capabilities
-negotiation. (i.e. There is only one version number that is referred to
-by all communication).
-
-librdmacm provides the user with a 'private data' area to be exchanged
-at connection-setup time before any infiniband traffic is generated.
-
-Header:
-    * Version (protocol version validated before send/recv occurs),
-      uint32, network byte order
-    * Flags   (bitwise OR of each capability),
-      uint32, network byte order
-
-There is no data portion of this header right now, so there is
-no length field. The maximum size of the 'private data' section
-is only 192 bytes per the Infiniband specification, so it's not
-very useful for data anyway. This structure needs to remain small.
-
-This private data area is a convenient place to check for protocol
-versioning because the user does not need to register memory to
-transmit a few bytes of version information.
-
-This is also a convenient place to negotiate capabilities
-(like dynamic page registration).
-
-If the version is invalid, we throw an error.
-
-If the version is new, we only negotiate the capabilities that the
-requested version is able to perform and ignore the rest.
-
-Currently there is only one capability in Version #1: dynamic page registration
-
-Finally: Negotiation happens with the Flags field: If the primary-VM
-sets a flag, but the destination does not support this capability, it
-will return a zero-bit for that flag and the primary-VM will understand
-that as not being an available capability and will thus disable that
-capability on the primary-VM side.
-
-QEMUFileRDMA Interface:
-=======================
-
-QEMUFileRDMA introduces a couple of new functions:
-
-1. qemu_rdma_get_buffer()  (QEMUFileOps rdma_read_ops)
-2. qemu_rdma_put_buffer()  (QEMUFileOps rdma_write_ops)
-
-These two functions are very short and simply use the protocol
-describe above to deliver bytes without changing the upper-level
-users of QEMUFile that depend on a bytestream abstraction.
-
-Finally, how do we handoff the actual bytes to get_buffer()?
-
-Again, because we're trying to "fake" a bytestream abstraction
-using an analogy not unlike individual UDP frames, we have
-to hold on to the bytes received from control-channel's SEND
-messages in memory.
-
-Each time we receive a complete "QEMU File" control-channel
-message, the bytes from SEND are copied into a small local holding area.
-
-Then, we return the number of bytes requested by get_buffer()
-and leave the remaining bytes in the holding area until get_buffer()
-comes around for another pass.
-
-If the buffer is empty, then we follow the same steps
-listed above and issue another "QEMU File" protocol command,
-asking for a new SEND message to re-fill the buffer.
-
-Migration of VM's ram:
-====================
-
-At the beginning of the migration, (migration-rdma.c),
-the sender and the receiver populate the list of RAMBlocks
-to be registered with each other into a structure.
-Then, using the aforementioned protocol, they exchange a
-description of these blocks with each other, to be used later
-during the iteration of main memory. This description includes
-a list of all the RAMBlocks, their offsets and lengths, virtual
-addresses and possibly includes pre-registered RDMA keys in case dynamic
-page registration was disabled on the server-side, otherwise not.
-
-Main memory is not migrated with the aforementioned protocol,
-but is instead migrated with normal RDMA Write operations.
-
-Pages are migrated in "chunks" (hard-coded to 1 Megabyte right now).
-Chunk size is not dynamic, but it could be in a future implementation.
-There's nothing to indicate that this is useful right now.
-
-When a chunk is full (or a flush() occurs), the memory backed by
-the chunk is registered with librdmacm is pinned in memory on
-both sides using the aforementioned protocol.
-After pinning, an RDMA Write is generated and transmitted
-for the entire chunk.
-
-Chunks are also transmitted in batches: This means that we
-do not request that the hardware signal the completion queue
-for the completion of *every* chunk. The current batch size
-is about 64 chunks (corresponding to 64 MB of memory).
-Only the last chunk in a batch must be signaled.
-This helps keep everything as asynchronous as possible
-and helps keep the hardware busy performing RDMA operations.
-
-Error-handling:
-===============
-
-Infiniband has what is called a "Reliable, Connected"
-link (one of 4 choices). This is the mode in which
-we use for RDMA migration.
-
-If a *single* message fails,
-the decision is to abort the migration entirely and
-cleanup all the RDMA descriptors and unregister all
-the memory.
-
-After cleanup, the Virtual Machine is returned to normal
-operation the same way that would happen if the TCP
-socket is broken during a non-RDMA based migration.
-
-TODO:
-=====
-1. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits
-   are not compatible with infiniband memory pinning and will result in
-   an aborted migration (but with the source VM left unaffected).
-2. Use of the recent /proc/<pid>/pagemap would likely speed up
-   the use of KSM and ballooning while using RDMA.
-3. Also, some form of balloon-device usage tracking would also
-   help alleviate some issues.
-4. Use LRU to provide more fine-grained direction of UNREGISTER
-   requests for unpinning memory in an overcommitted environment.
-5. Expose UNREGISTER support to the user by way of workload-specific
-   hints about application behavior.
diff --git a/docs/system/device-url-syntax.rst.inc b/docs/system/device-url-syntax.rst.inc
index 7dbc525fa8..43b5c2596b 100644
--- a/docs/system/device-url-syntax.rst.inc
+++ b/docs/system/device-url-syntax.rst.inc
@@ -87,8 +87,8 @@ These are specified using a special URL syntax.
 ``GlusterFS``
    GlusterFS is a user space distributed file system. QEMU supports the
-   use of GlusterFS volumes for hosting VM disk images using TCP, Unix
-   Domain Sockets and RDMA transport protocols.
+   use of GlusterFS volumes for hosting VM disk images using TCP and Unix
+   Domain Sockets transport protocols.
Syntax for specifying a VM disk image on GlusterFS volume is
diff --git a/docs/system/loongarch/virt.rst b/docs/system/loongarch/virt.rst
index c37268b404..0a8e0766e4 100644
--- a/docs/system/loongarch/virt.rst
+++ b/docs/system/loongarch/virt.rst
@@ -39,7 +39,7 @@ can be accessed by following steps.
.. code-block:: bash
-  ./configure --disable-rdma --disable-pvrdma --prefix=/usr \
+  ./configure --prefix=/usr \
   --target-list="loongarch64-softmmu" \
   --disable-libiscsi --disable-libnfs --disable-libpmem \
   --disable-glusterfs --enable-libusb --enable-usb-redir \
diff --git a/docs/system/qemu-block-drivers.rst.inc b/docs/system/qemu-block-drivers.rst.inc
index 105cb9679c..384e95ba76 100644
--- a/docs/system/qemu-block-drivers.rst.inc
+++ b/docs/system/qemu-block-drivers.rst.inc
@@ -737,7 +737,6 @@ Examples
   |qemu_system| -drive file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
   |qemu_system| -drive file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
   |qemu_system| -drive file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
-  |qemu_system| -drive file=gluster+rdma://1.2.3.4:24007/testvol/a.img
   |qemu_system| -drive file=gluster://1.2.3.4/testvol/a.img,file.debug=9,file.logfile=/var/log/qemu-gluster.log
   |qemu_system| 'json:{"driver":"qcow2",
                        "file":{"driver":"gluster",
diff --git a/meson.build b/meson.build
index c9c3217ba4..bd65abad13 100644
--- a/meson.build
+++ b/meson.build
@@ -1854,21 +1854,6 @@ if numa.found() and not cc.links('''
   endif
 endif
 
-rdma = not_found
-if not get_option('rdma').auto() or have_system
-  libumad = cc.find_library('ibumad', required: get_option('rdma'))
-  rdma_libs = [cc.find_library('rdmacm', has_headers: ['rdma/rdma_cma.h'],
-                               required: get_option('rdma')),
-               cc.find_library('ibverbs', required: get_option('rdma')),
-               libumad]
-  rdma = declare_dependency(dependencies: rdma_libs)
-  foreach lib: rdma_libs
-    if not lib.found()
-      rdma = not_found
-    endif
-  endforeach
-endif
-
 cacard = not_found
 if not get_option('smartcard').auto() or have_system
   cacard = dependency('libcacard', required: get_option('smartcard'),
@@ -2246,7 +2231,6 @@ endif
 config_host_data.set('CONFIG_OPENGL', opengl.found())
 config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
 config_host_data.set('CONFIG_RBD', rbd.found())
-config_host_data.set('CONFIG_RDMA', rdma.found())
 config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
 config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
 config_host_data.set('CONFIG_SDL', sdl.found())
@@ -2399,12 +2383,6 @@ if rbd.found()
                                        dependencies: rbd,
                                        prefix: '#include <rbd/librbd.h>'))
 endif
-if rdma.found()
-  config_host_data.set('HAVE_IBV_ADVISE_MR',
-                       cc.has_function('ibv_advise_mr',
-                                       dependencies: rdma,
-                                       prefix: '#include <infiniband/verbs.h>'))
-endif
 
 have_asan_fiber = false
 if get_option('sanitizers') and \
@@ -2829,37 +2807,6 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
     void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
   '''))
 
-have_pvrdma = get_option('pvrdma') \
-  .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
-  .require(cc.compiles(gnu_source_prefix + '''
-    #include <sys/mman.h>
-    int main(void)
-    {
-      char buf = 0;
-      void *addr = &buf;
-      addr = mremap(addr, 0, 1, MREMAP_MAYMOVE | MREMAP_FIXED);
-
-      return 0;
-    }'''), error_message: 'PVRDMA requires mremap').allowed()
-
-if have_pvrdma
-  config_host_data.set('LEGACY_RDMA_REG_MR', not cc.links('''
-    #include <infiniband/verbs.h>
-    int main(void)
-    {
-      struct ibv_mr *mr;
-      struct ibv_pd *pd = NULL;
-      size_t length = 10;
-      uint64_t iova = 0;
-      int access = 0;
-      void *addr = NULL;
-
-      mr = ibv_reg_mr_iova(pd, addr, length, iova, access);
-      ibv_dereg_mr(mr);
-      return 0;
-    }'''))
-endif
-
 if get_option('membarrier').disabled()
   have_membarrier = false
 elif host_os == 'windows'
@@ -2993,7 +2940,6 @@ host_kconfig = \
   (have_vhost_kernel ? ['CONFIG_VHOST_KERNEL=y'] : []) + \
   (have_virtfs ? ['CONFIG_VIRTFS=y'] : []) + \
   (host_os == 'linux' ? ['CONFIG_LINUX=y'] : []) + \
-  (have_pvrdma ? ['CONFIG_PVRDMA=y'] : []) + \
   (multiprocess_allowed ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : []) + \
   (vfio_user_server_allowed ? ['CONFIG_VFIO_USER_SERVER_ALLOWED=y'] : []) + \
   (hv_balloon ? ['CONFIG_HV_BALLOON_POSSIBLE=y'] : [])
@@ -3357,8 +3303,6 @@ if have_system
     'hw/pci',
     'hw/pci-host',
     'hw/ppc',
-    'hw/rdma',
-    'hw/rdma/vmw',
     'hw/rtc',
     'hw/s390x',
     'hw/scsi',
@@ -4028,7 +3972,6 @@ if have_tools
     }]
   endforeach
 
-  subdir('contrib/rdmacm-mux')
   subdir('contrib/elf2dmp')
 
   executable('qemu-edid', files('qemu-edid.c', 'hw/display/edid-generate.c'),
@@ -4433,8 +4376,6 @@ summary_info += {'Multipath support': mpathpersist}
 summary_info += {'Linux AIO support': libaio}
 summary_info += {'Linux io_uring support': linux_io_uring}
 summary_info += {'ATTR/XATTR support': libattr}
-summary_info += {'RDMA support':      rdma}
-summary_info += {'PVRDMA support':    have_pvrdma}
 summary_info += {'fdt support':       fdt_opt == 'disabled' ? false : fdt_opt}
 summary_info += {'libcap-ng support': libcap_ng}
 summary_info += {'bpf support': libbpf}
diff --git a/qapi/machine.json b/qapi/machine.json
index e8b60641f2..e9f0f0c49a 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -1737,23 +1737,6 @@
     'returns': 'HumanReadableText',
     'features': [ 'unstable' ] }
 
-##
-# @x-query-rdma:
-#
-# Query RDMA state
-#
-# Features:
-#
-# @unstable: This command is meant for debugging.
-#
-# Returns: RDMA state
-#
-# Since: 6.2
-##
-{ 'command': 'x-query-rdma',
-  'returns': 'HumanReadableText',
-  'features': [ 'unstable' ] }
-
 ##
 # @x-query-roms:
 #
diff --git a/qapi/migration.json b/qapi/migration.json
index 8c65b90328..9a56d403be 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -221,8 +221,8 @@
 #
 # @setup-time: amount of setup time in milliseconds *before* the
 #     iterations begin but *after* the QMP command is issued. This is
-#     designed to provide an accounting of any activities (such as
-#     RDMA pinning) which may be expensive, but do not actually occur
+#     designed to provide an accounting of any activities which may be
+#     expensive, but do not actually occur
 #     during the iterative migration rounds themselves. (since 1.6)
 #
 # @cpu-throttle-percentage: percentage of time guest cpus are being
@@ -430,10 +430,6 @@
 #     for certain work loads, by sending compressed difference of the
 #     pages
 #
-# @rdma-pin-all: Controls whether or not the entire VM memory
-#     footprint is mlock()'d on demand or all at once. Refer to
-#     docs/rdma.txt for usage. Disabled by default. (since 2.0)
-#
 # @zero-blocks: During storage migration encode blocks of zeroes
 #     efficiently. This essentially saves 1MB of zeroes per block on
 #     the wire. Enabling requires source and target VM to support
@@ -547,7 +543,7 @@
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
-  'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks',
+  'data': ['xbzrle', 'auto-converge', 'zero-blocks',
            { 'name': 'compress', 'features': [ 'deprecated' ] },
            'events', 'postcopy-ram',
            { 'name': 'x-colo', 'features': [ 'unstable' ] },
@@ -606,7 +602,6 @@
 # -> { "execute": "query-migrate-capabilities" }
 # <- { "return": [
 #       {"state": false, "capability": "xbzrle"},
-#       {"state": false, "capability": "rdma-pin-all"},
 #       {"state": false, "capability": "auto-converge"},
 #       {"state": false, "capability": "zero-blocks"},
 #       {"state": false, "capability": "compress"},
@@ -1654,14 +1649,12 @@
 #
 # @exec: Direct the migration stream to another process.
 #
-# @rdma: Migrate via RDMA.
-#
 # @file: Direct the migration stream to a file.
 #
 # Since: 8.2
 ##
 { 'enum': 'MigrationAddressType',
-  'data': [ 'socket', 'exec', 'rdma', 'file' ] }
+  'data': [ 'socket', 'exec', 'file' ] }
 
 ##
 # @FileMigrationArgs:
@@ -1701,7 +1694,6 @@
   'data': {
     'socket': 'SocketAddress',
     'exec': 'MigrationExecCommand',
-    'rdma': 'InetSocketAddress',
     'file': 'FileMigrationArgs' } }
 
 ##
@@ -1804,14 +1796,6 @@
 # -> { "execute": "migrate",
 #      "arguments": {
 #          "channels": [ { "channel-type": "main",
-#                          "addr": { "transport": "rdma",
-#                                    "host": "10.12.34.9",
-#                                    "port": "1050" } } ] } }
-# <- { "return": {} }
-#
-# -> { "execute": "migrate",
-#      "arguments": {
-#          "channels": [ { "channel-type": "main",
 #                          "addr": { "transport": "file",
 #                                    "filename": "/tmp/migfile",
 #                                    "offset": "0x1000" } } ] } }
@@ -1879,13 +1863,6 @@
 #                                               "/some/sock" ] } } ] } }
 # <- { "return": {} }
 #
-# -> { "execute": "migrate-incoming",
-#      "arguments": {
-#          "channels": [ { "channel-type": "main",
-#                          "addr": { "transport": "rdma",
-#                                    "host": "10.12.34.9",
-#                                    "port": "1050" } } ] } }
-# <- { "return": {} }
 ##
 { 'command': 'migrate-incoming',
   'data': {'*uri': 'str',
diff --git a/qapi/qapi-schema.json b/qapi/qapi-schema.json
index 8304d45625..5e33da7228 100644
--- a/qapi/qapi-schema.json
+++ b/qapi/qapi-schema.json
@@ -54,7 +54,6 @@
 { 'include': 'dump.json' }
 { 'include': 'net.json' }
 { 'include': 'ebpf.json' }
-{ 'include': 'rdma.json' }
 { 'include': 'rocker.json' }
 { 'include': 'tpm.json' }
 { 'include': 'ui.json' }
diff --git a/qapi/rdma.json b/qapi/rdma.json
deleted file mode 100644
index 195c001850..0000000000
--- a/qapi/rdma.json
+++ /dev/null
@@ -1,38 +0,0 @@
-# -*- Mode: Python -*-
-# vim: filetype=python
-#
-
-##
-# = RDMA device
-##
-
-##
-# @RDMA_GID_STATUS_CHANGED:
-#
-# Emitted when guest driver adds/deletes GID to/from device
-#
-# @netdev: RoCE Network Device name
-#
-# @gid-status: Add or delete indication
-#
-# @subnet-prefix: Subnet Prefix
-#
-# @interface-id: Interface ID
-#
-# Since: 4.0
-#
-# Example:
-#
-# <- {"timestamp": {"seconds": 1541579657, "microseconds": 986760},
-#     "event": "RDMA_GID_STATUS_CHANGED",
-#     "data":
-#         {"netdev": "bridge0",
-#          "interface-id": 15880512517475447892,
-#          "gid-status": true,
-#          "subnet-prefix": 33022}}
-##
-{ 'event': 'RDMA_GID_STATUS_CHANGED',
-  'data': { 'netdev'        : 'str',
-            'gid-status'    : 'bool',
-            'subnet-prefix' : 'uint64',
-            'interface-id'  : 'uint64' } }
diff --git a/contrib/rdmacm-mux/rdmacm-mux.h b/contrib/rdmacm-mux/rdmacm-mux.h
deleted file mode 100644
index 07a4722913..0000000000
--- a/contrib/rdmacm-mux/rdmacm-mux.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * QEMU paravirtual RDMA - rdmacm-mux declarations
- *
- * Copyright (C) 2018 Oracle
- * Copyright (C) 2018 Red Hat Inc
- *
- * Authors:
- *     Yuval Shaia <yuval.shaia@oracle.com>
- *     Marcel Apfelbaum <marcel@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- * - */ - -#ifndef RDMACM_MUX_H -#define RDMACM_MUX_H - -#include "linux/if.h" -#include <infiniband/verbs.h> -#include <infiniband/umad.h> -#include <rdma/rdma_user_cm.h> - -typedef enum RdmaCmMuxMsgType { - RDMACM_MUX_MSG_TYPE_REQ = 0, - RDMACM_MUX_MSG_TYPE_RESP = 1, -} RdmaCmMuxMsgType; - -typedef enum RdmaCmMuxOpCode { - RDMACM_MUX_OP_CODE_REG = 0, - RDMACM_MUX_OP_CODE_UNREG = 1, - RDMACM_MUX_OP_CODE_MAD = 2, -} RdmaCmMuxOpCode; - -typedef enum RdmaCmMuxErrCode { - RDMACM_MUX_ERR_CODE_OK = 0, - RDMACM_MUX_ERR_CODE_EINVAL = 1, - RDMACM_MUX_ERR_CODE_EEXIST = 2, - RDMACM_MUX_ERR_CODE_EACCES = 3, - RDMACM_MUX_ERR_CODE_ENOTFOUND = 4, -} RdmaCmMuxErrCode; - -typedef struct RdmaCmMuxHdr { - RdmaCmMuxMsgType msg_type; - RdmaCmMuxOpCode op_code; - union ibv_gid sgid; - RdmaCmMuxErrCode err_code; -} RdmaCmUHdr; - -typedef struct RdmaCmUMad { - struct ib_user_mad hdr; - char mad[RDMA_MAX_PRIVATE_DATA]; -} RdmaCmUMad; - -typedef struct RdmaCmMuxMsg { - RdmaCmUHdr hdr; - int umad_len; - RdmaCmUMad umad; -} RdmaCmMuxMsg; - -#endif diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h deleted file mode 100644 index 225af481e0..0000000000 --- a/hw/rdma/rdma_backend.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * RDMA device: Definitions of Backend Device functions - * - * Copyright (C) 2018 Oracle - * Copyright (C) 2018 Red Hat Inc - * - * Authors: - * Yuval Shaia <yuval.shaia@oracle.com> - * Marcel Apfelbaum <marcel@redhat.com> - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - * - */ - -#ifndef RDMA_BACKEND_H -#define RDMA_BACKEND_H - -#include "qapi/error.h" -#include "chardev/char-fe.h" - -#include "rdma_rm_defs.h" -#include "rdma_backend_defs.h" - -/* Vendor Errors */ -#define VENDOR_ERR_FAIL_BACKEND 0x201 -#define VENDOR_ERR_TOO_MANY_SGES 0x202 -#define VENDOR_ERR_NOMEM 0x203 -#define VENDOR_ERR_QP0 0x204 -#define VENDOR_ERR_INV_NUM_SGE 0x205 -#define VENDOR_ERR_MAD_SEND 0x206 -#define VENDOR_ERR_INVLKEY 0x207 -#define VENDOR_ERR_MR_SMALL 0x208 -#define VENDOR_ERR_INV_MAD_BUFF 0x209 -#define VENDOR_ERR_INV_GID_IDX 0x210 - -/* Add definition for QP0 and QP1 as there is no userspace enums for them */ -enum ibv_special_qp_type { - IBV_QPT_SMI = 0, - IBV_QPT_GSI = 1, -}; - -static inline uint32_t rdma_backend_qpn(const RdmaBackendQP *qp) -{ - return qp->ibqp ? qp->ibqp->qp_num : 1; -} - -static inline uint32_t rdma_backend_mr_lkey(const RdmaBackendMR *mr) -{ - return mr->ibmr ? mr->ibmr->lkey : 0; -} - -static inline uint32_t rdma_backend_mr_rkey(const RdmaBackendMR *mr) -{ - return mr->ibmr ? 
mr->ibmr->rkey : 0;
-}
-
-int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
-                      RdmaDeviceResources *rdma_dev_res,
-                      const char *backend_device_name, uint8_t port_num,
-                      struct ibv_device_attr *dev_attr,
-                      CharBackend *mad_chr_be);
-void rdma_backend_fini(RdmaBackendDev *backend_dev);
-int rdma_backend_add_gid(RdmaBackendDev *backend_dev, const char *ifname,
-                         union ibv_gid *gid);
-int rdma_backend_del_gid(RdmaBackendDev *backend_dev, const char *ifname,
-                         union ibv_gid *gid);
-int rdma_backend_get_gid_index(RdmaBackendDev *backend_dev,
-                               union ibv_gid *gid);
-void rdma_backend_start(RdmaBackendDev *backend_dev);
-void rdma_backend_stop(RdmaBackendDev *backend_dev);
-void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
-                                                        struct ibv_wc *wc));
-void rdma_backend_unregister_comp_handler(void);
-
-int rdma_backend_query_port(RdmaBackendDev *backend_dev,
-                            struct ibv_port_attr *port_attr);
-int rdma_backend_create_pd(RdmaBackendDev *backend_dev, RdmaBackendPD *pd);
-void rdma_backend_destroy_pd(RdmaBackendPD *pd);
-
-int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, void *addr,
-                           size_t length, uint64_t guest_start, int access);
-void rdma_backend_destroy_mr(RdmaBackendMR *mr);
-
-int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
-                           int cqe);
-void rdma_backend_destroy_cq(RdmaBackendCQ *cq);
-void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq);
-
-int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
-                           RdmaBackendPD *pd, RdmaBackendCQ *scq,
-                           RdmaBackendCQ *rcq, RdmaBackendSRQ *srq,
-                           uint32_t max_send_wr, uint32_t max_recv_wr,
-                           uint32_t max_send_sge, uint32_t max_recv_sge);
-int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
-                               uint8_t qp_type, uint32_t qkey);
-int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
-                              uint8_t qp_type, uint8_t sgid_idx,
-                              union ibv_gid *dgid, uint32_t dqpn,
-                              uint32_t rq_psn, uint32_t qkey, bool use_qkey);
-int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type,
-                              uint32_t sq_psn, uint32_t qkey, bool use_qkey);
-int rdma_backend_query_qp(RdmaBackendQP *qp, struct ibv_qp_attr *attr,
-                          int attr_mask, struct ibv_qp_init_attr *init_attr);
-void rdma_backend_destroy_qp(RdmaBackendQP *qp, RdmaDeviceResources *dev_res);
-
-void rdma_backend_post_send(RdmaBackendDev *backend_dev,
-                            RdmaBackendQP *qp, uint8_t qp_type,
-                            struct ibv_sge *sge, uint32_t num_sge,
-                            uint8_t sgid_idx, union ibv_gid *sgid,
-                            union ibv_gid *dgid, uint32_t dqpn, uint32_t dqkey,
-                            void *ctx);
-void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
-                            RdmaBackendQP *qp, uint8_t qp_type,
-                            struct ibv_sge *sge, uint32_t num_sge, void *ctx);
-
-int rdma_backend_create_srq(RdmaBackendSRQ *srq, RdmaBackendPD *pd,
-                            uint32_t max_wr, uint32_t max_sge,
-                            uint32_t srq_limit);
-int rdma_backend_query_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr);
-int rdma_backend_modify_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr,
-                            int srq_attr_mask);
-void rdma_backend_destroy_srq(RdmaBackendSRQ *srq,
-                              RdmaDeviceResources *dev_res);
-void rdma_backend_post_srq_recv(RdmaBackendDev *backend_dev,
-                                RdmaBackendSRQ *srq, struct ibv_sge *sge,
-                                uint32_t num_sge, void *ctx);
-
-#endif
diff --git a/hw/rdma/rdma_backend_defs.h b/hw/rdma/rdma_backend_defs.h
deleted file mode 100644
index 4e6c0ad695..0000000000
--- a/hw/rdma/rdma_backend_defs.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * RDMA device: Definitions of Backend Device structures
- *
- * Copyright (C) 2018 Oracle
- * Copyright (C) 2018 Red Hat Inc
- *
- * Authors:
- *     Yuval Shaia <yuval.shaia@oracle.com>
- *     Marcel Apfelbaum <marcel@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef RDMA_BACKEND_DEFS_H
-#define RDMA_BACKEND_DEFS_H
-
-#include "qemu/thread.h"
-#include "chardev/char-fe.h"
-#include <infiniband/verbs.h>
-#include "contrib/rdmacm-mux/rdmacm-mux.h"
-#include "rdma_utils.h"
-
-typedef struct RdmaDeviceResources RdmaDeviceResources;
-
-typedef struct RdmaBackendThread {
-    QemuThread thread;
-    bool run; /* Set by thread manager to let thread know it should exit */
-    bool is_running; /* Set by the thread to report its status */
-} RdmaBackendThread;
-
-typedef struct RdmaCmMux {
-    CharBackend *chr_be;
-    int can_receive;
-} RdmaCmMux;
-
-typedef struct RdmaBackendDev {
-    RdmaBackendThread comp_thread;
-    PCIDevice *dev;
-    RdmaDeviceResources *rdma_dev_res;
-    struct ibv_device *ib_dev;
-    struct ibv_context *context;
-    struct ibv_comp_channel *channel;
-    uint8_t port_num;
-    RdmaProtectedGQueue recv_mads_list;
-    RdmaCmMux rdmacm_mux;
-} RdmaBackendDev;
-
-typedef struct RdmaBackendPD {
-    struct ibv_pd *ibpd;
-} RdmaBackendPD;
-
-typedef struct RdmaBackendMR {
-    struct ibv_pd *ibpd;
-    struct ibv_mr *ibmr;
-} RdmaBackendMR;
-
-typedef struct RdmaBackendCQ {
-    RdmaBackendDev *backend_dev;
-    struct ibv_cq *ibcq;
-} RdmaBackendCQ;
-
-typedef struct RdmaBackendQP {
-    struct ibv_pd *ibpd;
-    struct ibv_qp *ibqp;
-    uint8_t sgid_idx;
-    RdmaProtectedGSList cqe_ctx_list;
-} RdmaBackendQP;
-
-typedef struct RdmaBackendSRQ {
-    struct ibv_srq *ibsrq;
-    RdmaProtectedGSList cqe_ctx_list;
-} RdmaBackendSRQ;
-
-#endif
diff --git a/hw/rdma/rdma_rm.h b/hw/rdma/rdma_rm.h
deleted file mode 100644
index d69a917795..0000000000
--- a/hw/rdma/rdma_rm.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * RDMA device: Definitions of Resource Manager functions
- *
- * Copyright (C) 2018 Oracle
- * Copyright (C) 2018 Red Hat Inc
- *
- * Authors:
- *     Yuval Shaia <yuval.shaia@oracle.com>
- *     Marcel Apfelbaum <marcel@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef RDMA_RM_H
-#define RDMA_RM_H
-
-#include "qapi/error.h"
-#include "rdma_backend_defs.h"
-#include "rdma_rm_defs.h"
-
-int rdma_rm_init(RdmaDeviceResources *dev_res,
-                 struct ibv_device_attr *dev_attr);
-void rdma_rm_fini(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
-                  const char *ifname);
-
-int rdma_rm_alloc_pd(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
-                     uint32_t *pd_handle, uint32_t ctx_handle);
-RdmaRmPD *rdma_rm_get_pd(RdmaDeviceResources *dev_res, uint32_t pd_handle);
-void rdma_rm_dealloc_pd(RdmaDeviceResources *dev_res, uint32_t pd_handle);
-
-int rdma_rm_alloc_mr(RdmaDeviceResources *dev_res, uint32_t pd_handle,
-                     uint64_t guest_start, uint64_t guest_length,
-                     void *host_virt, int access_flags, uint32_t *mr_handle,
-                     uint32_t *lkey, uint32_t *rkey);
-RdmaRmMR *rdma_rm_get_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle);
-void rdma_rm_dealloc_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle);
-
-int rdma_rm_alloc_uc(RdmaDeviceResources *dev_res, uint32_t pfn,
-                     uint32_t *uc_handle);
-RdmaRmUC *rdma_rm_get_uc(RdmaDeviceResources *dev_res, uint32_t uc_handle);
-void rdma_rm_dealloc_uc(RdmaDeviceResources *dev_res, uint32_t uc_handle);
-
-int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
-                     uint32_t cqe, uint32_t *cq_handle, void *opaque);
-RdmaRmCQ *rdma_rm_get_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle);
-void rdma_rm_req_notify_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle,
-                           bool notify);
-void rdma_rm_dealloc_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle);
-
-int rdma_rm_alloc_qp(RdmaDeviceResources *dev_res, uint32_t pd_handle,
-                     uint8_t qp_type, uint32_t max_send_wr,
-                     uint32_t max_send_sge, uint32_t send_cq_handle,
-                     uint32_t max_recv_wr, uint32_t max_recv_sge,
-                     uint32_t recv_cq_handle, void *opaque, uint32_t *qpn,
-                     uint8_t is_srq, uint32_t srq_handle);
-RdmaRmQP *rdma_rm_get_qp(RdmaDeviceResources *dev_res, uint32_t qpn);
-int rdma_rm_modify_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
-                      uint32_t qp_handle, uint32_t attr_mask, uint8_t sgid_idx,
-                      union ibv_gid *dgid, uint32_t dqpn,
-                      enum ibv_qp_state qp_state, uint32_t qkey,
-                      uint32_t rq_psn, uint32_t sq_psn);
-int rdma_rm_query_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
-                     uint32_t qp_handle, struct ibv_qp_attr *attr,
-                     int attr_mask, struct ibv_qp_init_attr *init_attr);
-void rdma_rm_dealloc_qp(RdmaDeviceResources *dev_res, uint32_t qp_handle);
-
-RdmaRmSRQ *rdma_rm_get_srq(RdmaDeviceResources *dev_res, uint32_t srq_handle);
-int rdma_rm_alloc_srq(RdmaDeviceResources *dev_res, uint32_t pd_handle,
-                      uint32_t max_wr, uint32_t max_sge, uint32_t srq_limit,
-                      uint32_t *srq_handle, void *opaque);
-int rdma_rm_query_srq(RdmaDeviceResources *dev_res, uint32_t srq_handle,
-                      struct ibv_srq_attr *srq_attr);
-int rdma_rm_modify_srq(RdmaDeviceResources *dev_res, uint32_t srq_handle,
-                       struct ibv_srq_attr *srq_attr, int srq_attr_mask);
-void rdma_rm_dealloc_srq(RdmaDeviceResources *dev_res, uint32_t srq_handle);
-
-int rdma_rm_alloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t *cqe_ctx_id,
-                          void *ctx);
-void *rdma_rm_get_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id);
-void rdma_rm_dealloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id);
-
-int rdma_rm_add_gid(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
-                    const char *ifname, union ibv_gid *gid, int gid_idx);
-int rdma_rm_del_gid(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
-                    const char *ifname, int gid_idx);
-int rdma_rm_get_backend_gid_index(RdmaDeviceResources *dev_res,
-                                  RdmaBackendDev *backend_dev, int sgid_idx);
-static inline union ibv_gid *rdma_rm_get_gid(RdmaDeviceResources *dev_res,
-                                             int sgid_idx)
-{
-    return &dev_res->port.gid_tbl[sgid_idx].gid;
-}
-void rdma_format_device_counters(RdmaDeviceResources *dev_res, GString *buf);
-
-#endif
diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h
deleted file mode 100644
index 534f2f74d3..0000000000
--- a/hw/rdma/rdma_rm_defs.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * RDMA device: Definitions of Resource Manager structures
- *
- * Copyright (C) 2018 Oracle
- * Copyright (C) 2018 Red Hat Inc
- *
- * Authors:
- *     Yuval Shaia <yuval.shaia@oracle.com>
- *     Marcel Apfelbaum <marcel@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef RDMA_RM_DEFS_H
-#define RDMA_RM_DEFS_H
-
-#include "rdma_backend_defs.h"
-
-#define MAX_PORTS 1 /* Do not change - we support only one port */
-#define MAX_PORT_GIDS 255
-#define MAX_GIDS MAX_PORT_GIDS
-#define MAX_PORT_PKEYS 1
-#define MAX_PKEYS MAX_PORT_PKEYS
-#define MAX_UCS 512
-#define MAX_MR_SIZE (1UL << 27)
-#define MAX_QP 1024
-#define MAX_SGE 4
-#define MAX_CQ 2048
-#define MAX_MR 1024
-#define MAX_PD 1024
-#define MAX_QP_RD_ATOM 16
-#define MAX_QP_INIT_RD_ATOM 16
-#define MAX_AH 64
-#define MAX_SRQ 512
-
-#define MAX_RM_TBL_NAME 16
-#define MAX_CONSEQ_EMPTY_POLL_CQ 4096 /* considered as error above this */
-
-typedef struct RdmaRmResTbl {
-    char name[MAX_RM_TBL_NAME];
-    QemuMutex lock;
-    unsigned long *bitmap;
-    size_t tbl_sz;
-    size_t res_sz;
-    void *tbl;
-    uint32_t used; /* number of used entries in the table */
-} RdmaRmResTbl;
-
-typedef struct RdmaRmPD {
-    RdmaBackendPD backend_pd;
-    uint32_t ctx_handle;
-} RdmaRmPD;
-
-typedef enum CQNotificationType {
-    CNT_CLEAR,
-    CNT_ARM,
-    CNT_SET,
-} CQNotificationType;
-
-typedef struct RdmaRmCQ {
-    RdmaBackendCQ backend_cq;
-    void *opaque;
-    CQNotificationType notify;
-} RdmaRmCQ;
-
-/* MR (DMA region) */
-typedef struct RdmaRmMR {
-    RdmaBackendMR backend_mr;
-    void *virt;
-    uint64_t start;
-    size_t length;
-    uint32_t pd_handle;
-    uint32_t lkey;
-    uint32_t rkey;
-} RdmaRmMR;
-
-typedef struct RdmaRmUC {
-    uint64_t uc_handle;
-} RdmaRmUC;
-
-typedef struct RdmaRmQP {
-    RdmaBackendQP backend_qp;
-    void *opaque;
-    uint32_t qp_type;
-    uint32_t qpn;
-    uint32_t send_cq_handle;
-    uint32_t recv_cq_handle;
-    enum ibv_qp_state qp_state;
-    uint8_t is_srq;
-} RdmaRmQP;
-
-typedef struct RdmaRmSRQ {
-    RdmaBackendSRQ backend_srq;
-    uint32_t recv_cq_handle;
-    void *opaque;
-} RdmaRmSRQ;
-
-typedef struct RdmaRmGid {
-    union ibv_gid gid;
-    int backend_gid_index;
-} RdmaRmGid;
-
-typedef struct RdmaRmPort {
-    RdmaRmGid gid_tbl[MAX_PORT_GIDS];
-    enum ibv_port_state state;
-} RdmaRmPort;
-
-typedef struct RdmaRmStats {
-    uint64_t tx;
-    uint64_t tx_len;
-    uint64_t tx_err;
-    uint64_t rx_bufs;
-    uint64_t rx_bufs_len;
-    uint64_t rx_bufs_err;
-    uint64_t rx_srq;
-    uint64_t completions;
-    uint64_t mad_tx;
-    uint64_t mad_tx_err;
-    uint64_t mad_rx;
-    uint64_t mad_rx_err;
-    uint64_t mad_rx_bufs;
-    uint64_t mad_rx_bufs_err;
-    uint64_t poll_cq_from_bk;
-    uint64_t poll_cq_from_guest;
-    uint64_t poll_cq_from_guest_empty;
-    uint64_t poll_cq_ppoll_to;
-    uint32_t missing_cqe;
-} RdmaRmStats;
-
-struct RdmaDeviceResources {
-    RdmaRmPort port;
-    RdmaRmResTbl pd_tbl;
-    RdmaRmResTbl mr_tbl;
-    RdmaRmResTbl uc_tbl;
-    RdmaRmResTbl qp_tbl;
-    RdmaRmResTbl cq_tbl;
-    RdmaRmResTbl cqe_ctx_tbl;
-    RdmaRmResTbl srq_tbl;
-    GHashTable *qp_hash; /* Keeps mapping between real and emulated */
-    QemuMutex lock;
-    RdmaRmStats stats;
-};
-
-#endif
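A side note for anyone auditing what disappears with rdma_rm_defs.h: RdmaRmResTbl above is the generic bitmap-backed handle table behind all the PD/MR/UC/QP/CQ/SRQ allocations - take the lock, find a clear bit, hand its index out as the guest-visible handle. A toy sketch of that pattern, purely illustrative (fixed-size table, names invented here, not the removed code):

/* Toy bitmap-backed handle table in the spirit of RdmaRmResTbl. */
#include <stdint.h>
#include <stddef.h>

#define TBL_SZ 64

typedef struct ResTbl {
    uint64_t bitmap;     /* one bit per slot; set means handle in use */
    void *slots[TBL_SZ];
} ResTbl;

/* Find a clear bit, mark it used, return its index as the handle. */
static int res_tbl_alloc(ResTbl *t, void *res)
{
    for (int i = 0; i < TBL_SZ; i++) {
        if (!(t->bitmap & (UINT64_C(1) << i))) {
            t->bitmap |= UINT64_C(1) << i;
            t->slots[i] = res;
            return i;
        }
    }
    return -1; /* table full */
}

static void res_tbl_dealloc(ResTbl *t, int handle)
{
    if (handle >= 0 && handle < TBL_SZ) {
        t->bitmap &= ~(UINT64_C(1) << handle);
        t->slots[handle] = NULL;
    }
}

(The real tables also kept a per-table QemuMutex and a "used" counter, as the struct in the hunk above shows.)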
diff --git a/hw/rdma/rdma_utils.h b/hw/rdma/rdma_utils.h
deleted file mode 100644
index 54e4f56edd..0000000000
--- a/hw/rdma/rdma_utils.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * RDMA device: Debug utilities
- *
- * Copyright (C) 2018 Oracle
- * Copyright (C) 2018 Red Hat Inc
- *
- *
- * Authors:
- *     Yuval Shaia <yuval.shaia@oracle.com>
- *     Marcel Apfelbaum <marcel@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef RDMA_UTILS_H
-#define RDMA_UTILS_H
-
-#include "qemu/error-report.h"
-#include "sysemu/dma.h"
-
-#define rdma_error_report(fmt, ...) \
-    error_report("%s: " fmt, "rdma", ## __VA_ARGS__)
-#define rdma_warn_report(fmt, ...) \
-    warn_report("%s: " fmt, "rdma", ## __VA_ARGS__)
-#define rdma_info_report(fmt, ...) \
-    info_report("%s: " fmt, "rdma", ## __VA_ARGS__)
-
-typedef struct RdmaProtectedGQueue {
-    QemuMutex lock;
-    GQueue *list;
-} RdmaProtectedGQueue;
-
-typedef struct RdmaProtectedGSList {
-    QemuMutex lock;
-    GSList *list;
-} RdmaProtectedGSList;
-
-void *rdma_pci_dma_map(PCIDevice *dev, dma_addr_t addr, dma_addr_t len);
-void rdma_pci_dma_unmap(PCIDevice *dev, void *buffer, dma_addr_t len);
-void rdma_protected_gqueue_init(RdmaProtectedGQueue *list);
-void rdma_protected_gqueue_destroy(RdmaProtectedGQueue *list);
-void rdma_protected_gqueue_append_int64(RdmaProtectedGQueue *list,
-                                        int64_t value);
-int64_t rdma_protected_gqueue_pop_int64(RdmaProtectedGQueue *list);
-void rdma_protected_gslist_init(RdmaProtectedGSList *list);
-void rdma_protected_gslist_destroy(RdmaProtectedGSList *list);
-void rdma_protected_gslist_append_int32(RdmaProtectedGSList *list,
-                                        int32_t value);
-void rdma_protected_gslist_remove_int32(RdmaProtectedGSList *list,
-                                        int32_t value);
-
-static inline void addrconf_addr_eui48(uint8_t *eui, const char *addr)
-{
-    memcpy(eui, addr, 3);
-    eui[3] = 0xFF;
-    eui[4] = 0xFE;
-    memcpy(eui + 5, addr + 3, 3);
-    eui[0] ^= 2;
-}
-
-#endif
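The addrconf_addr_eui48() helper going away above is the standard EUI-48 to modified EUI-64 expansion: split the MAC address, insert 0xFF 0xFE in the middle, and flip the universal/local bit in the first octet. A standalone check of what it computes, with an arbitrary example MAC (the helper body is copied from the hunk above; the test values are mine):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static void addrconf_addr_eui48(uint8_t *eui, const char *addr)
{
    memcpy(eui, addr, 3);
    eui[3] = 0xFF;
    eui[4] = 0xFE;
    memcpy(eui + 5, addr + 3, 3);
    eui[0] ^= 2;
}

int main(void)
{
    const uint8_t mac[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
    uint8_t eui[8];
    int i;

    addrconf_addr_eui48(eui, (const char *)mac);
    for (i = 0; i < 8; i++) {
        printf("%02x%c", eui[i], i < 7 ? ':' : '\n');
    }
    /* Prints: 02:11:22:ff:fe:33:44:55 */
    return 0;
}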
diff --git a/hw/rdma/trace.h b/hw/rdma/trace.h
deleted file mode 100644
index b3fa8ebc51..0000000000
--- a/hw/rdma/trace.h
+++ /dev/null
@@ -1 +0,0 @@
-#include "trace/trace-hw_rdma.h"
diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
deleted file mode 100644
index 4cbc10c980..0000000000
--- a/hw/rdma/vmw/pvrdma.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * QEMU VMWARE paravirtual RDMA device definitions
- *
- * Copyright (C) 2018 Oracle
- * Copyright (C) 2018 Red Hat Inc
- *
- * Authors:
- *     Yuval Shaia <yuval.shaia@oracle.com>
- *     Marcel Apfelbaum <marcel@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef PVRDMA_PVRDMA_H
-#define PVRDMA_PVRDMA_H
-
-#include "qemu/units.h"
-#include "qemu/notify.h"
-#include "hw/pci/msix.h"
-#include "hw/pci/pci_device.h"
-#include "chardev/char-fe.h"
-#include "hw/net/vmxnet3_defs.h"
-
-#include "../rdma_backend_defs.h"
-#include "../rdma_rm_defs.h"
-
-#include "standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h"
-#include "pvrdma_dev_ring.h"
-#include "qom/object.h"
-
-/* BARs */
-#define RDMA_MSIX_BAR_IDX 0
-#define RDMA_REG_BAR_IDX 1
-#define RDMA_UAR_BAR_IDX 2
-#define RDMA_BAR0_MSIX_SIZE (16 * KiB)
-#define RDMA_BAR1_REGS_SIZE 64
-#define RDMA_BAR2_UAR_SIZE (0x1000 * MAX_UCS) /* each uc gets page */
-
-/* MSIX */
-#define RDMA_MAX_INTRS 3
-#define RDMA_MSIX_TABLE 0x0000
-#define RDMA_MSIX_PBA 0x2000
-
-/* Interrupts Vectors */
-#define INTR_VEC_CMD_RING 0
-#define INTR_VEC_CMD_ASYNC_EVENTS 1
-#define INTR_VEC_CMD_COMPLETION_Q 2
-
-/* HW attributes */
-#define PVRDMA_HW_NAME "pvrdma"
-#define PVRDMA_HW_VERSION 17
-#define PVRDMA_FW_VERSION 14
-
-/* Some defaults */
-#define PVRDMA_PKEY 0xFFFF
-
-typedef struct DSRInfo {
-    dma_addr_t dma;
-    struct pvrdma_device_shared_region *dsr;
-
-    union pvrdma_cmd_req *req;
-    union pvrdma_cmd_resp *rsp;
-
-    PvrdmaRingState *async_ring_state;
-    PvrdmaRing async;
-
-    PvrdmaRingState *cq_ring_state;
-    PvrdmaRing cq;
-} DSRInfo;
-
-typedef struct PVRDMADevStats {
-    uint64_t commands;
-    uint64_t regs_reads;
-    uint64_t regs_writes;
-    uint64_t uar_writes;
-    uint64_t interrupts;
-} PVRDMADevStats;
-
-struct PVRDMADev {
-    PCIDevice parent_obj;
-    MemoryRegion msix;
-    MemoryRegion regs;
-    uint32_t regs_data[RDMA_BAR1_REGS_SIZE];
-    MemoryRegion uar;
-    uint32_t uar_data[RDMA_BAR2_UAR_SIZE];
-    DSRInfo dsr_info;
-    int interrupt_mask;
-    struct ibv_device_attr dev_attr;
-    uint64_t node_guid;
-    char *backend_eth_device_name;
-    char *backend_device_name;
-    uint8_t backend_port_num;
-    RdmaBackendDev backend_dev;
-    RdmaDeviceResources rdma_dev_res;
-    CharBackend mad_chr;
-    VMXNET3State *func0;
-    Notifier shutdown_notifier;
-    PVRDMADevStats stats;
-};
-typedef struct PVRDMADev PVRDMADev;
-DECLARE_INSTANCE_CHECKER(PVRDMADev, PVRDMA_DEV,
-                         PVRDMA_HW_NAME)
-
-static inline int get_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t *val)
-{
-    int idx = addr >> 2;
-
-    if (idx >= RDMA_BAR1_REGS_SIZE) {
-        return -EINVAL;
-    }
-
-    *val = dev->regs_data[idx];
-
-    return 0;
-}
-
-static inline int set_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t val)
-{
-    int idx = addr >> 2;
-
-    if (idx >= RDMA_BAR1_REGS_SIZE) {
-        return -EINVAL;
-    }
-
-    dev->regs_data[idx] = val;
-
-    return 0;
-}
-
-static inline void post_interrupt(PVRDMADev *dev, unsigned vector)
-{
-    PCIDevice *pci_dev = PCI_DEVICE(dev);
-
-    if (likely(!dev->interrupt_mask)) {
-        dev->stats.interrupts++;
-        msix_notify(pci_dev, vector);
-    }
-}
-
-int pvrdma_exec_cmd(PVRDMADev *dev);
-
-#endif
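Worth noting while it is still on screen: the BAR1 register file was nothing more than a word array indexed by byte offset, as get_reg_val()/set_reg_val() above show - a 32-bit register at byte offset addr lives at index addr >> 2, bounds-checked against RDMA_BAR1_REGS_SIZE. A minimal standalone rendering of that access pattern (REGS_SIZE and the function name are mine, not the removed code):

#include <errno.h>
#include <stdint.h>

#define REGS_SIZE 64 /* plays the role of RDMA_BAR1_REGS_SIZE */

static int reg_read(const uint32_t *regs, uint64_t addr, uint32_t *val)
{
    uint64_t idx = addr >> 2; /* byte offset -> word index */

    if (idx >= REGS_SIZE) {
        return -EINVAL;
    }
    *val = regs[idx];
    return 0;
}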
diff --git a/hw/rdma/vmw/pvrdma_dev_ring.h b/hw/rdma/vmw/pvrdma_dev_ring.h
deleted file mode 100644
index d231588ce0..0000000000
--- a/hw/rdma/vmw/pvrdma_dev_ring.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * QEMU VMWARE paravirtual RDMA ring utilities
- *
- * Copyright (C) 2018 Oracle
- * Copyright (C) 2018 Red Hat Inc
- *
- * Authors:
- *     Yuval Shaia <yuval.shaia@oracle.com>
- *     Marcel Apfelbaum <marcel@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef PVRDMA_DEV_RING_H
-#define PVRDMA_DEV_RING_H
-
-
-#define MAX_RING_NAME_SZ 32
-
-typedef struct PvrdmaRingState {
-    int prod_tail; /* producer tail */
-    int cons_head; /* consumer head */
-} PvrdmaRingState;
-
-typedef struct PvrdmaRing {
-    char name[MAX_RING_NAME_SZ];
-    PCIDevice *dev;
-    uint32_t max_elems;
-    size_t elem_sz;
-    PvrdmaRingState *ring_state; /* used only for unmap */
-    int npages;
-    void **pages;
-} PvrdmaRing;
-
-int pvrdma_ring_init(PvrdmaRing *ring, const char *name, PCIDevice *dev,
-                     PvrdmaRingState *ring_state, uint32_t max_elems,
-                     size_t elem_sz, dma_addr_t *tbl, uint32_t npages);
-void *pvrdma_ring_next_elem_read(PvrdmaRing *ring);
-void pvrdma_ring_read_inc(PvrdmaRing *ring);
-void *pvrdma_ring_next_elem_write(PvrdmaRing *ring);
-void pvrdma_ring_write_inc(PvrdmaRing *ring);
-void pvrdma_ring_free(PvrdmaRing *ring);
-
-#endif
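For context, PvrdmaRingState is the little producer/consumer index pair shared with the guest driver: the producer bumps prod_tail after writing an element, the consumer bumps cons_head after reading one, indices are reduced modulo max_elems when dereferenced, and head == tail means empty. A simplified sketch of that contract (illustrative only; the removed implementation also dealt with the DMA-mapped ring pages):

#include <stdbool.h>
#include <stdint.h>

typedef struct Ring {
    uint32_t prod_tail; /* next slot the producer will fill */
    uint32_t cons_head; /* next slot the consumer will read */
    uint32_t max_elems;
} Ring;

static bool ring_has_data(const Ring *r)
{
    return r->cons_head != r->prod_tail;
}

static uint32_t ring_read_idx(const Ring *r)
{
    return r->cons_head % r->max_elems;
}

static void ring_read_inc(Ring *r)
{
    r->cons_head++; /* mirrors pvrdma_ring_read_inc() */
}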
diff --git a/hw/rdma/vmw/pvrdma_qp_ops.h b/hw/rdma/vmw/pvrdma_qp_ops.h
deleted file mode 100644
index bf2b15c5ce..0000000000
--- a/hw/rdma/vmw/pvrdma_qp_ops.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * QEMU VMWARE paravirtual RDMA QP Operations
- *
- * Copyright (C) 2018 Oracle
- * Copyright (C) 2018 Red Hat Inc
- *
- * Authors:
- *     Yuval Shaia <yuval.shaia@oracle.com>
- *     Marcel Apfelbaum <marcel@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef PVRDMA_QP_OPS_H
-#define PVRDMA_QP_OPS_H
-
-#include "pvrdma.h"
-
-int pvrdma_qp_ops_init(void);
-void pvrdma_qp_ops_fini(void);
-void pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle);
-void pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle);
-void pvrdma_srq_recv(PVRDMADev *dev, uint32_t srq_handle);
-void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle);
-
-#endif
diff --git a/hw/rdma/vmw/trace.h b/hw/rdma/vmw/trace.h
deleted file mode 100644
index 3ebc9fb7ad..0000000000
--- a/hw/rdma/vmw/trace.h
+++ /dev/null
@@ -1 +0,0 @@
-#include "trace/trace-hw_rdma_vmw.h"
diff --git a/include/hw/rdma/rdma.h b/include/hw/rdma/rdma.h
deleted file mode 100644
index 80b2e531c4..0000000000
--- a/include/hw/rdma/rdma.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * RDMA device interface
- *
- * Copyright (C) 2019 Oracle
- * Copyright (C) 2019 Red Hat Inc
- *
- * Authors:
- *     Yuval Shaia <yuval.shaia@oracle.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef RDMA_H
-#define RDMA_H
-
-#include "qom/object.h"
-
-#define INTERFACE_RDMA_PROVIDER "rdma"
-
-typedef struct RdmaProviderClass RdmaProviderClass;
-DECLARE_CLASS_CHECKERS(RdmaProviderClass, RDMA_PROVIDER,
-                       INTERFACE_RDMA_PROVIDER)
-#define RDMA_PROVIDER(obj) \
-    INTERFACE_CHECK(RdmaProvider, (obj), \
-                    INTERFACE_RDMA_PROVIDER)
-
-typedef struct RdmaProvider RdmaProvider;
-
-struct RdmaProviderClass {
-    InterfaceClass parent;
-
-    void (*format_statistics)(RdmaProvider *obj, GString *buf);
-};
-
-#endif
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index 13f9a2dedb..f4cf8f6717 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -37,7 +37,6 @@ void hmp_info_spice(Monitor *mon, const QDict *qdict);
 void hmp_info_balloon(Monitor *mon, const QDict *qdict);
 void hmp_info_irq(Monitor *mon, const QDict *qdict);
 void hmp_info_pic(Monitor *mon, const QDict *qdict);
-void hmp_info_rdma(Monitor *mon, const QDict *qdict);
 void hmp_info_pci(Monitor *mon, const QDict *qdict);
 void hmp_info_tpm(Monitor *mon, const QDict *qdict);
 void hmp_info_iothreads(Monitor *mon, const QDict *qdict);
diff --git a/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h b/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
deleted file mode 100644
index a5a1c8234e..0000000000
--- a/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
+++ /dev/null
@@ -1,685 +0,0 @@
-/*
- * Copyright (c) 2012-2016 VMware, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of EITHER the GNU General Public License
- * version 2 as published by the Free Software Foundation or the BSD
- * 2-Clause License. This program is distributed in the hope that it
- * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
- * See the GNU General Public License version 2 for more details at
- * <http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html>