[libvirt] [PATCH v2 00/17] implement cgroups v2 devices support

In cgroups v2 there is no devices controller, BPF should be used instead. Changes in v2: - fixed build on bsd and older kernels without cgroup BPF - cgroup bpf devices code moved to separate file Documentation for eBPF: <http://man7.org/linux/man-pages/man2/bpf.2.html> <https://www.kernel.org/doc/Documentation/networking/filter.txt> <https://docs.cilium.io/en/v1.3/bpf/> Pavel Hrdina (17): util: introduce virbpf helpers vircgroup: introduce virCgroupV2DevicesAvailable vircgroup: introduce virCgroupV2DevicesAttachProg vircgroup: introduce virCgroupV2DevicesDetectProg vircgroup: introduce virCgroupV2DevicesCreateProg vircgroup: introduce virCgroupV2DevicesPrepareProg vircgroup: introduce virCgroupV2DevicesRemoveProg vircgroup: introduce virCgroupV2DeviceGetPerms vircgroup: introduce virCgroupV2DevicesGetKey vircgroup: introduce virCgroupV2AllowDevice vircgroup: introduce virCgroupV2DenyDevice vircgroup: introduce virCgroupV2AllowAllDevices vircgroup: introduce virCgroupV2DenyAllDevices vircgroup: workaround devices in hybrid mode vircgroupv2: detech BPF program before removing cgroup vircgroupv2: use dummy process to workaround kernel bug with systemd vircgroupmock: mock virCgroupV2DevicesAvailable configure.ac | 6 + include/libvirt/virterror.h | 1 + src/Makefile.am | 2 + src/libvirt_private.syms | 27 ++ src/lxc/lxc_cgroup.c | 1 + src/qemu/qemu_cgroup.c | 6 +- src/util/Makefile.inc.am | 4 + src/util/virbpf.c | 437 +++++++++++++++++++++ src/util/virbpf.h | 271 +++++++++++++ src/util/vircgroup.c | 19 +- src/util/vircgroup.h | 1 + src/util/vircgroupbackend.h | 3 +- src/util/vircgrouppriv.h | 12 + src/util/vircgroupv1.c | 9 +- src/util/vircgroupv2.c | 119 +++++- src/util/vircgroupv2devices.c | 625 ++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 57 +++ src/util/virerror.c | 1 + src/util/virsystemd.c | 2 +- src/util/virsystemd.h | 2 + tests/vircgroupdata/hybrid.parsed | 2 +- tests/vircgroupmock.c | 7 + tests/vircgrouptest.c | 4 +- 23 files changed, 1608 insertions(+), 10 deletions(-) create mode 100644 src/util/virbpf.c create mode 100644 src/util/virbpf.h create mode 100644 src/util/vircgroupv2devices.c create mode 100644 src/util/vircgroupv2devices.h -- 2.20.1

In order to implement devices controller with cgroup v2 we need to add support for BPF programs, cgroup v2 doesn't have devices controller. This introduces required helpers wrapping linux syscalls. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- configure.ac | 5 + include/libvirt/virterror.h | 1 + src/libvirt_private.syms | 16 ++ src/util/Makefile.inc.am | 2 + src/util/virbpf.c | 437 ++++++++++++++++++++++++++++++++++++ src/util/virbpf.h | 271 ++++++++++++++++++++++ src/util/virerror.c | 1 + 7 files changed, 733 insertions(+) create mode 100644 src/util/virbpf.c create mode 100644 src/util/virbpf.h diff --git a/configure.ac b/configure.ac index ac52189dff..0b7afece25 100644 --- a/configure.ac +++ b/configure.ac @@ -885,6 +885,11 @@ AC_CHECK_DECLS([clock_serv_t, host_get_clock_service, clock_get_time], #include <mach/mach.h> ]) +# Check if we have new enough kernel to support BPF devices for cgroups v2 +if test "$with_linux" = "yes"; then + AC_CHECK_DECLS([BPF_PROG_QUERY], [], [], [#include <linux/bpf.h>]) +fi + # Check if we need to look for ifconfig if test "$want_ifconfig" = "yes"; then AC_PATH_PROG([IFCONFIG_PATH], [ifconfig]) diff --git a/include/libvirt/virterror.h b/include/libvirt/virterror.h index fbbe2d5624..d47bed4390 100644 --- a/include/libvirt/virterror.h +++ b/include/libvirt/virterror.h @@ -131,6 +131,7 @@ typedef enum { VIR_FROM_PERF = 65, /* Error from perf */ VIR_FROM_LIBSSH = 66, /* Error from libssh connection transport */ VIR_FROM_RESCTRL = 67, /* Error from resource control */ + VIR_FROM_BPF = 68, /* Error from BPF code */ # ifdef VIR_ENUM_SENTINELS VIR_ERR_DOMAIN_LAST diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index c3d6306809..0cff580de2 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1477,6 +1477,22 @@ virBitmapToDataBuf; virBitmapToString; +# util/virbpf.h +virBPFAttachProg; +virBPFCreateMap; +virBPFDeleteElem; +virBPFDetachProg; +virBPFGetMap; +virBPFGetMapInfo; +virBPFGetNextElem; +virBPFGetProg; +virBPFGetProgInfo; +virBPFLoadProg; +virBPFLookupElem; +virBPFQueryProg; +virBPFUpdateElem; + + # util/virbuffer.h virBufferAdd; virBufferAddBuffer; diff --git a/src/util/Makefile.inc.am b/src/util/Makefile.inc.am index 4295babac3..1fd7ad2d43 100644 --- a/src/util/Makefile.inc.am +++ b/src/util/Makefile.inc.am @@ -17,6 +17,8 @@ UTIL_SOURCES = \ util/virauthconfig.h \ util/virbitmap.c \ util/virbitmap.h \ + util/virbpf.c \ + util/virbpf.h \ util/virbuffer.c \ util/virbuffer.h \ util/virperf.c \ diff --git a/src/util/virbpf.c b/src/util/virbpf.c new file mode 100644 index 0000000000..ea4b8c0a4a --- /dev/null +++ b/src/util/virbpf.c @@ -0,0 +1,437 @@ +/* + * virbpf.c: methods for eBPF + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ +#include <config.h> + +#include <sys/syscall.h> + +#include "internal.h" + +#include "virbpf.h" +#include "virerror.h" +#include "virfile.h" +#include "virlog.h" +#include "virstring.h" + +VIR_LOG_INIT("util.bpf"); + +#define VIR_FROM_THIS VIR_FROM_BPF + +#if HAVE_DECL_BPF_PROG_QUERY +int +virBPFCreateMap(unsigned int mapType, + unsigned int keySize, + unsigned int valSize, + unsigned int maxEntries) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_type = mapType; + attr.key_size = keySize; + attr.value_size = valSize; + attr.max_entries = maxEntries; + + return syscall(SYS_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); +} + + +# define LOG_BUF_SIZE (256 * 1024) + +int +virBPFLoadProg(struct bpf_insn *insns, + int progType, + unsigned int insnCnt) +{ + VIR_AUTOFREE(char *) logbuf = NULL; + int progfd = -1; + union bpf_attr attr; + + if (VIR_ALLOC_N(logbuf, LOG_BUF_SIZE) < 0) + return -1; + + memset(&attr, 0, sizeof(attr)); + + attr.prog_type = progType; + attr.insn_cnt = (uint32_t)insnCnt; + attr.insns = (uint64_t)insns; + attr.license = (uint64_t)"GPL"; + attr.log_buf = (uint64_t)logbuf; + attr.log_size = LOG_BUF_SIZE; + attr.log_level = 1; + + progfd = syscall(SYS_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + + if (progfd < 0) + VIR_DEBUG("%s", logbuf); + + return progfd; +} + + +int +virBPFAttachProg(int progfd, + int targetfd, + int attachType) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.target_fd = targetfd; + attr.attach_bpf_fd = progfd; + attr.attach_type = attachType; + + return syscall(SYS_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr)); +} + + +int +virBPFDetachProg(int progfd, + int targetfd, + int attachType) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.target_fd = targetfd; + attr.attach_bpf_fd = progfd; + attr.attach_type = attachType; + + return syscall(SYS_bpf, BPF_PROG_DETACH, &attr, sizeof(attr)); +} + + +int +virBPFQueryProg(int targetfd, + unsigned int maxprogids, + int attachType, + unsigned int *progcnt, + void *progids) +{ + union bpf_attr attr; + int rc; + + memset(&attr, 0, sizeof(attr)); + + attr.query.target_fd = targetfd; + attr.query.attach_type = attachType; + attr.query.prog_cnt = maxprogids; + attr.query.prog_ids = (uint64_t)progids; + + rc = syscall(SYS_bpf, BPF_PROG_QUERY, &attr, sizeof(attr)); + + if (rc >= 0) + *progcnt = attr.query.prog_cnt; + + return rc; +} + + +int +virBPFGetProg(unsigned int id) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.prog_id = id; + + return syscall(SYS_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr)); +} + + +int +virBPFGetProgInfo(int progfd, + struct bpf_prog_info *info, + unsigned int **mapIDs) +{ + union bpf_attr attr; + int rc; + + memset(&attr, 0, sizeof(attr)); + + attr.info.bpf_fd = progfd; + attr.info.info_len = sizeof(struct bpf_prog_info); + attr.info.info = (uint64_t)info; + + rc = syscall(SYS_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); + if (rc < 0) + return rc; + + if (mapIDs && info->nr_map_ids > 0) { + unsigned int maplen = info->nr_map_ids; + VIR_AUTOFREE(unsigned int *) retmapIDs = NULL; + + if (VIR_ALLOC_N(retmapIDs, maplen) < 0) + return -1; + + memset(info, 0, sizeof(struct bpf_prog_info)); + info->nr_map_ids = maplen; + info->map_ids = (uint64_t)retmapIDs; + + memset(&attr, 0, sizeof(attr)); + attr.info.bpf_fd = progfd; + attr.info.info_len = sizeof(struct bpf_prog_info); + attr.info.info = (uint64_t)info; + + rc = syscall(SYS_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); + if (rc < 0) + return rc; + + VIR_STEAL_PTR(*mapIDs, retmapIDs); + } + + return rc; +} + + +int +virBPFGetMap(unsigned int id) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_id = id; + + return syscall(SYS_bpf, BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr)); +} + + +int +virBPFGetMapInfo(int mapfd, + struct bpf_map_info *info) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.info.bpf_fd = mapfd; + attr.info.info_len = sizeof(struct bpf_map_info); + attr.info.info = (uint64_t)info; + + return syscall(SYS_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); +} + + +int +virBPFLookupElem(int mapfd, + void *key, + void *val) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = mapfd; + attr.key = (uint64_t)key; + attr.value = (uint64_t)val; + + return syscall(SYS_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); +} + + +int +virBPFGetNextElem(int mapfd, + void *key, + void *nextKey) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = mapfd; + attr.key = (uint64_t)key; + attr.next_key = (uint64_t)nextKey; + + return syscall(SYS_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); +} + + +int +virBPFUpdateElem(int mapfd, + void *key, + void *val) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = mapfd; + attr.key = (uint64_t)key; + attr.value = (uint64_t)val; + + return syscall(SYS_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); +} + + +int +virBPFDeleteElem(int mapfd, + void *key) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = mapfd; + attr.key = (uint64_t)key; + + return syscall(SYS_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); +} +#else /* HAVE_DECL_BPF_PROG_QUERY */ +int +virBPFCreateMap(unsigned int mapType ATTRIBUTE_UNUSED, + unsigned int keySize ATTRIBUTE_UNUSED, + unsigned int valSize ATTRIBUTE_UNUSED, + unsigned int maxEntries ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFLoadProg(struct bpf_insn *insns ATTRIBUTE_UNUSED, + int progType ATTRIBUTE_UNUSED, + unsigned int insnCnt ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFAttachProg(int progfd ATTRIBUTE_UNUSED, + int targetfd ATTRIBUTE_UNUSED, + int attachType ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFDetachProg(int progfd ATTRIBUTE_UNUSED, + int targetfd ATTRIBUTE_UNUSED, + int attachType ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFQueryProg(int targetfd ATTRIBUTE_UNUSED, + unsigned int maxprogids ATTRIBUTE_UNUSED, + int attachType ATTRIBUTE_UNUSED, + unsigned int *progcnt ATTRIBUTE_UNUSED, + void *progids ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetProg(unsigned int id ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetProgInfo(int progfd ATTRIBUTE_UNUSED, + struct bpf_prog_info *info ATTRIBUTE_UNUSED, + unsigned int **mapIDs ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetMap(unsigned int id ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetMapInfo(int mapfd ATTRIBUTE_UNUSED, + struct bpf_map_info *info ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFLookupElem(int mapfd ATTRIBUTE_UNUSED, + void *key ATTRIBUTE_UNUSED, + void *val ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetNextElem(int mapfd ATTRIBUTE_UNUSED, + void *key ATTRIBUTE_UNUSED, + void *nextKey ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFUpdateElem(int mapfd ATTRIBUTE_UNUSED, + void *key ATTRIBUTE_UNUSED, + void *val ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFDeleteElem(int mapfd ATTRIBUTE_UNUSED, + void *key ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} +#endif /* HAVE_DECL_BPF_PROG_QUERY */ diff --git a/src/util/virbpf.h b/src/util/virbpf.h new file mode 100644 index 0000000000..0fd6a05388 --- /dev/null +++ b/src/util/virbpf.h @@ -0,0 +1,271 @@ +/* + * virbpf.h: methods for eBPF + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBVIRT_VIRBPF_H +# define LIBVIRT_VIRBPF_H + +# if HAVE_DECL_BPF_PROG_QUERY + +# include <linux/bpf.h> + +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ + +# define VIR_BPF_ALU64_REG(op, dst, src) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(op) | BPF_X, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = 0, \ + .imm = 0, \ + }) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ + +# define VIR_BPF_ALU64_IMM(op, dst, immval) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(op) | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = 0, \ + .imm = immval, \ + }) + +/* Short form of mov, dst_reg = src_reg */ + +# define VIR_BPF_MOV64_REG(dst, src) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = 0, \ + .imm = 0, \ + }) + +/* Short form of mov, dst_reg = imm32 */ + +# define VIR_BPF_MOV64_IMM(dst, immval) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = 0, \ + .imm = immval, \ + }) + +# define VIR_BPF_MOV32_IMM(dst, immval) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = 0, \ + .imm = immval, \ + }) + +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +# define VIR_BPF_LD_IMM64(dst, imm) \ + BPF_LD_IMM64_RAW(dst, 0, imm) + +# define VIR_BPF_LD_IMM64_RAW(dst, src, immval) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = 0, \ + .imm = (uint32_t)immval, \ + }), \ + ((struct bpf_insn) { \ + .code = 0, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((uint64_t)immval) >> 32, \ + }) + +# ifndef VIR_BPF_PSEUDO_MAP_FD +# define VIR_BPF_PSEUDO_MAP_FD 1 +# endif + +/* pseudo VIR_BPF_LD_IMM64 insn used to refer to process-local map_fd */ +# define VIR_BPF_LD_MAP_FD(dst, mapfd) \ + VIR_BPF_LD_IMM64_RAW(dst, VIR_BPF_PSEUDO_MAP_FD, mapfd) + +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ + +# define VIR_BPF_LDX_MEM(size, dst, src, offval) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(size) | BPF_MEM, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = offval, \ + .imm = 0, \ + }) + +/* Memory store, *(uint *) (dst_reg + off16) = src_reg */ + +# define VIR_BPF_STX_MEM(size, dst, src, offval) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(size) | BPF_MEM, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = offval, \ + .imm = 0, \ + }) + +/* Memory store, *(uint *) (dst_reg + off16) = imm32 */ + +# define VIR_BPF_ST_MEM(size, dst, immval, offval) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(size) | BPF_MEM, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = offval, \ + .imm = immval, \ + }) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ + +# define VIR_BPF_JMP_REG(op, dst, src, offval) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(op) | BPF_X, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = offval, \ + .imm = 0, \ + }) + +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ + +# define VIR_BPF_JMP_IMM(op, dst, immval, offval) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(op) | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = offval, \ + .imm = immval, \ + }) + +/* Call eBPF function */ + +# define VIR_BPF_CALL_INSN(func) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = func, \ + }) + +/* Program exit */ + +# define VIR_BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0, \ + }) + +# else /* HAVE_DECL_BPF_PROG_QUERY */ + +struct bpf_prog_info; +struct bpf_map_info; +struct bpf_insn; + +# define VIR_BPF_ALU64_REG(op, dst, src) +# define VIR_BPF_ALU64_IMM(op, dst, immval) +# define VIR_BPF_MOV64_REG(dst, src) +# define VIR_BPF_MOV64_IMM(dst, immval) +# define VIR_BPF_MOV32_IMM(dst, immval) +# define VIR_BPF_LD_IMM64(dst, imm) +# define VIR_BPF_LD_IMM64_RAW(dst, src, immval) +# define VIR_BPF_PSEUDO_MAP_FD +# define VIR_BPF_LD_MAP_FD(dst, mapfd) +# define VIR_BPF_LDX_MEM(size, dst, src, offval) +# define VIR_BPF_STX_MEM(size, dst, src, offval) +# define VIR_BPF_ST_MEM(size, dst, immval, offval) +# define VIR_BPF_JMP_REG(op, dst, src, offval) +# define VIR_BPF_JMP_IMM(op, dst, immval, offval) +# define VIR_BPF_CALL_INSN(func) +# define VIR_BPF_EXIT_INSN() + +# endif /* HAVE_DECL_BPF_PROG_QUERY */ + +int +virBPFCreateMap(unsigned int mapType, + unsigned int keySize, + unsigned int valSize, + unsigned int maxEntries); + +int +virBPFGetMapInfo(int mapfd, + struct bpf_map_info *info); + +int +virBPFLoadProg(struct bpf_insn *insns, + int progType, + unsigned int insnCnt); + +int +virBPFAttachProg(int progfd, + int targetfd, + int attachType); + +int +virBPFDetachProg(int progfd, + int targetfd, + int attachType); + +int +virBPFQueryProg(int targetfd, + unsigned int maxprogids, + int attachType, + unsigned int *progcnt, + void *progids); + +int +virBPFGetProg(unsigned int id); + +int +virBPFGetProgInfo(int progfd, + struct bpf_prog_info *info, + unsigned int **mapIDs); + +int +virBPFGetMap(unsigned int id); + +int +virBPFLookupElem(int mapfd, + void *key, + void *val); + +int +virBPFGetNextElem(int mapfd, + void *key, + void *nextKey); + +int +virBPFUpdateElem(int mapfd, + void *key, + void *val); + +int +virBPFDeleteElem(int mapfd, + void *key); + +#endif /* LIBVIRT_VIRBPF_H */ diff --git a/src/util/virerror.c b/src/util/virerror.c index 61b47d2be0..a40076f8ec 100644 --- a/src/util/virerror.c +++ b/src/util/virerror.c @@ -138,6 +138,7 @@ VIR_ENUM_IMPL(virErrorDomain, VIR_ERR_DOMAIN_LAST, "Perf", /* 65 */ "Libssh transport layer", "Resource control", + "BPF", ) -- 2.20.1

There is no exact way how to figure out whether BPF devices support is compiled into kernel. One way is to check kernel configure options but this is not reliable as it may not be available. Let's try to do syscall to which will list BPF cgroup device programs. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- configure.ac | 3 +- src/Makefile.am | 2 + src/libvirt_private.syms | 3 ++ src/util/Makefile.inc.am | 2 + src/util/vircgroupv2.c | 7 +++- src/util/vircgroupv2devices.c | 73 +++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 27 +++++++++++++ 7 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 src/util/vircgroupv2devices.c create mode 100644 src/util/vircgroupv2devices.h diff --git a/configure.ac b/configure.ac index 0b7afece25..acc0e888ea 100644 --- a/configure.ac +++ b/configure.ac @@ -887,7 +887,8 @@ AC_CHECK_DECLS([clock_serv_t, host_get_clock_service, clock_get_time], # Check if we have new enough kernel to support BPF devices for cgroups v2 if test "$with_linux" = "yes"; then - AC_CHECK_DECLS([BPF_PROG_QUERY], [], [], [#include <linux/bpf.h>]) + AC_CHECK_DECLS([BPF_PROG_QUERY, BPF_CGROUP_DEVICE], + [], [], [#include <linux/bpf.h>]) fi # Check if we need to look for ifconfig diff --git a/src/Makefile.am b/src/Makefile.am index cd386297ed..9d9e50d527 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -672,11 +672,13 @@ libvirt_setuid_rpc_client_la_SOURCES = \ util/viratomic.c \ util/viratomic.h \ util/virbitmap.c \ + util/virbpf.c \ util/virbuffer.c \ util/vircgroup.c \ util/vircgroupbackend.c \ util/vircgroupv1.c \ util/vircgroupv2.c \ + util/vircgroupv2devices.c \ util/vircommand.c \ util/virconf.c \ util/virdbus.c \ diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 0cff580de2..6a822f7d90 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1611,6 +1611,9 @@ virCgroupV1Register; # util/vircgroupv2.h virCgroupV2Register; +# util/vircgroupv2devices.h +virCgroupV2DevicesAvailable; + # util/virclosecallbacks.h virCloseCallbacksGet; virCloseCallbacksGetConn; diff --git a/src/util/Makefile.inc.am b/src/util/Makefile.inc.am index 1fd7ad2d43..6eb9e1b889 100644 --- a/src/util/Makefile.inc.am +++ b/src/util/Makefile.inc.am @@ -31,6 +31,8 @@ UTIL_SOURCES = \ util/vircgroupv1.h \ util/vircgroupv2.c \ util/vircgroupv2.h \ + util/vircgroupv2devices.c \ + util/vircgroupv2devices.h \ util/virclosecallbacks.c \ util/virclosecallbacks.h \ util/vircommand.c \ diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index cd58491da1..a56f3443e3 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -32,6 +32,7 @@ #include "vircgroup.h" #include "vircgroupbackend.h" #include "vircgroupv2.h" +#include "vircgroupv2devices.h" #include "virerror.h" #include "virfile.h" #include "virlog.h" @@ -292,6 +293,8 @@ virCgroupV2DetectControllers(virCgroupPtr group, /* In cgroup v2 there is no cpuacct controller, the cpu.stat file always * exists with usage stats. */ group->unified.controllers |= 1 << VIR_CGROUP_CONTROLLER_CPUACCT; + if (virCgroupV2DevicesAvailable(group)) + group->unified.controllers |= 1 << VIR_CGROUP_CONTROLLER_DEVICES; for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) VIR_DEBUG("Controller '%s' present=%s", @@ -406,8 +409,10 @@ virCgroupV2MakeGroup(virCgroupPtr parent ATTRIBUTE_UNUSED, continue; /* Controllers that are implicitly enabled if available. */ - if (i == VIR_CGROUP_CONTROLLER_CPUACCT) + if (i == VIR_CGROUP_CONTROLLER_CPUACCT || + i == VIR_CGROUP_CONTROLLER_DEVICES) { continue; + } if (virCgroupV2EnableController(parent, i) < 0) return -1; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c new file mode 100644 index 0000000000..10080d4fff --- /dev/null +++ b/src/util/vircgroupv2devices.c @@ -0,0 +1,73 @@ +/* + * vircgroupv2devices.c: methods for cgroups v2 BPF devices + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ +#include <config.h> + +#if HAVE_DECL_BPF_CGROUP_DEVICE +# include <fcntl.h> +# include <linux/bpf.h> +# include <sys/stat.h> +# include <sys/syscall.h> +# include <sys/types.h> +#endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ + +#include "internal.h" + +#define LIBVIRT_VIRCGROUPPRIV_H_ALLOW +#include "vircgrouppriv.h" + +#include "virbpf.h" +#include "vircgroup.h" +#include "vircgroupv2devices.h" +#include "virfile.h" +#include "virlog.h" + +VIR_LOG_INIT("util.cgroup"); + +#define VIR_FROM_THIS VIR_FROM_CGROUP + +#if HAVE_DECL_BPF_CGROUP_DEVICE +bool +virCgroupV2DevicesAvailable(virCgroupPtr group) +{ + bool ret = false; + int cgroupfd = -1; + unsigned int progCnt = 0; + + cgroupfd = open(group->unified.mountPoint, O_RDONLY); + if (cgroupfd < 0) { + VIR_DEBUG("failed to open cgroup '%s'", group->unified.mountPoint); + goto cleanup; + } + + if (virBPFQueryProg(cgroupfd, 0, BPF_CGROUP_DEVICE, &progCnt, NULL) < 0) { + VIR_DEBUG("failed to query cgroup progs"); + goto cleanup; + } + + ret = true; + cleanup: + VIR_FORCE_CLOSE(cgroupfd); + return ret; +} +#else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ +bool +virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + return false; +} +#endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h new file mode 100644 index 0000000000..2ab35681db --- /dev/null +++ b/src/util/vircgroupv2devices.h @@ -0,0 +1,27 @@ +/* + * vircgroupv2devices.h: methods for cgroups v2 BPF devices + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBVIRT_VIRCGROUPV2DEVICES_H +# define LIBVIRT_VIRCGROUPV2DEVICES_H + +# include "vircgroup.h" + +bool +virCgroupV2DevicesAvailable(virCgroupPtr group); + +#endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel. There are two possible ways how to create BPF program: - One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils. - The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel. Since the program is not too complex we can use the second option. If there is no program, all devices are allowed, if there is some program it is executed and based on the exit status the access is denied for 0 and allowed for 1. Our program will follow these rules: - first it will try to look for the specific key using major and minor to see if there is any rule for that specific device - if there is no specific rule it will try to look for any rule that matches only major of the device - if there is no match with major it will try the same but with minor of the device - as the last attempt it will try to look for rule for all devices and if there is no match it will return 0 to deny that access Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgrouppriv.h | 10 +++ src/util/vircgroupv2devices.c | 140 ++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 5 ++ 4 files changed, 156 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 6a822f7d90..3fc91ce207 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1612,6 +1612,7 @@ virCgroupV1Register; virCgroupV2Register; # util/vircgroupv2devices.h +virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; # util/virclosecallbacks.h diff --git a/src/util/vircgrouppriv.h b/src/util/vircgrouppriv.h index a6fb3bb9f8..085fea375c 100644 --- a/src/util/vircgrouppriv.h +++ b/src/util/vircgrouppriv.h @@ -42,10 +42,20 @@ struct _virCgroupV1Controller { typedef struct _virCgroupV1Controller virCgroupV1Controller; typedef virCgroupV1Controller *virCgroupV1ControllerPtr; +struct _virCgroupV2Devices { + int mapfd; + int progfd; + ssize_t count; + ssize_t max; +}; +typedef struct _virCgroupV2Devices virCgroupV2Devices; +typedef virCgroupV2Devices *virCgroupV2DevicesPtr; + struct _virCgroupV2Controller { int controllers; char *mountPoint; char *placement; + virCgroupV2Devices devices; }; typedef struct _virCgroupV2Controller virCgroupV2Controller; typedef virCgroupV2Controller *virCgroupV2ControllerPtr; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index 10080d4fff..0b721a0aad 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -64,10 +64,150 @@ virCgroupV2DevicesAvailable(virCgroupPtr group) VIR_FORCE_CLOSE(cgroupfd); return ret; } + + +static int +virCgroupV2DevicesLoadProg(int mapfd) +{ +# define VIR_CGROUP_BPF_LOOKUP \ + /* prepare key param on stack */ \ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), \ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), \ + /* lookup key (major << 32) | minor in map */ \ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), \ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem) + +# define VIR_CGROUP_BPF_CHECK_PERM \ + /* if no key skip perm check */ \ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), \ + /* get perms from map */ \ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), \ + /* get perms from ctx */ \ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), \ + /* (map perms) & (ctx perms) */ \ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), \ + /* if (map perms) & (ctx perms) == (ctx perms) exit, otherwise continue */ \ + VIR_BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_2, 2), \ + /* set ret 1 and exit */ \ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), \ + VIR_BPF_EXIT_INSN() + + struct bpf_insn prog[] = { + /* save ctx, argument passed to BPF program */ + VIR_BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* get major from ctx and shift << 32 */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* get minor from ctx and | to shifted major */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_6, 8), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((major << 32) | minor) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* get major from ctx and shift << 32 */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* use -1 as minor and | to shifted major */ + VIR_BPF_MOV32_IMM(BPF_REG_3, -1), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((major << 32) | -1) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* use -1 as major and shift << 32 */ + VIR_BPF_MOV32_IMM(BPF_REG_2, -1), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* get minor from ctx and | to shifted major */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_6, 8), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((-1 << 32) | minor) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* use -1 as key which means major = -1 and minor = -1 */ + VIR_BPF_MOV64_IMM(BPF_REG_2, -1), + /* lookup -1 in map and check perms*/ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* no key was found, exit with 0 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 0), + VIR_BPF_EXIT_INSN(), + }; + + return virBPFLoadProg(prog, BPF_PROG_TYPE_CGROUP_DEVICE, ARRAY_CARDINALITY(prog)); +} + + +int +virCgroupV2DevicesAttachProg(virCgroupPtr group, + int mapfd, + size_t max) +{ + int ret = -1; + int progfd = -1; + int cgroupfd = -1; + VIR_AUTOFREE(char *) path = NULL; + + if (virCgroupPathOfController(group, VIR_CGROUP_CONTROLLER_DEVICES, + NULL, &path) < 0) { + goto cleanup; + } + + progfd = virCgroupV2DevicesLoadProg(mapfd); + if (progfd < 0) { + virReportSystemError(errno, "%s", _("failed to load cgroup BPF prog")); + goto cleanup; + } + + cgroupfd = open(path, O_RDONLY); + if (cgroupfd < 0) { + virReportSystemError(errno, _("unable to open '%s'"), path); + goto cleanup; + } + + if (virBPFAttachProg(progfd, cgroupfd, BPF_CGROUP_DEVICE) < 0) { + virReportSystemError(errno, "%s", _("failed to attach cgroup BPF prog")); + goto cleanup; + } + + if (group->unified.devices.progfd > 0) { + VIR_DEBUG("Closing existing program that was replaced by new one."); + VIR_FORCE_CLOSE(group->unified.devices.progfd); + } + + group->unified.devices.progfd = progfd; + group->unified.devices.mapfd = mapfd; + group->unified.devices.max = max; + progfd = -1; + mapfd = -1; + + ret = 0; + cleanup: + VIR_FORCE_CLOSE(cgroupfd); + VIR_FORCE_CLOSE(progfd); + VIR_FORCE_CLOSE(mapfd); + return ret; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) { return false; } + + +int +virCgroupV2DevicesAttachProg(virCgroupPtr group ATTRIBUTE_UNUSED, + int mapfd ATTRIBUTE_UNUSED, + size_t max ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("cgroups v2 BPF devices not supported " + "with this kernel")); + return -1; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index 2ab35681db..1ba87acb00 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -24,4 +24,9 @@ bool virCgroupV2DevicesAvailable(virCgroupPtr group); +int +virCgroupV2DevicesAttachProg(virCgroupPtr group, + int mapfd, + size_t max); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

On Mon, Jan 14, 2019 at 04:47:37PM +0100, Pavel Hrdina wrote:
This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel.
There are two possible ways how to create BPF program:
- One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils.
- The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel.
Since the program is not too complex we can use the second option.
I'm really not liking this to be honest. Even for this "simple" program, I struggle to understand what the code below is doing. It is basically assembly language, which is inevitably hard to follow even for simple things. I'd like to see us use the BPF C compiler to build the loaded program unless there's a compelling reason why it won't work
+ + +static int +virCgroupV2DevicesLoadProg(int mapfd) +{ +# define VIR_CGROUP_BPF_LOOKUP \ + /* prepare key param on stack */ \ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), \ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), \ + /* lookup key (major << 32) | minor in map */ \ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), \ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem) + +# define VIR_CGROUP_BPF_CHECK_PERM \ + /* if no key skip perm check */ \ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), \ + /* get perms from map */ \ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), \ + /* get perms from ctx */ \ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), \ + /* (map perms) & (ctx perms) */ \ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), \ + /* if (map perms) & (ctx perms) == (ctx perms) exit, otherwise continue */ \ + VIR_BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_2, 2), \ + /* set ret 1 and exit */ \ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), \ + VIR_BPF_EXIT_INSN() + + struct bpf_insn prog[] = { + /* save ctx, argument passed to BPF program */ + VIR_BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* get major from ctx and shift << 32 */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* get minor from ctx and | to shifted major */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_6, 8), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((major << 32) | minor) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* get major from ctx and shift << 32 */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* use -1 as minor and | to shifted major */ + VIR_BPF_MOV32_IMM(BPF_REG_3, -1), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((major << 32) | -1) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* use -1 as major and shift << 32 */ + VIR_BPF_MOV32_IMM(BPF_REG_2, -1), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* get minor from ctx and | to shifted major */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_6, 8), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((-1 << 32) | minor) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* use -1 as key which means major = -1 and minor = -1 */ + VIR_BPF_MOV64_IMM(BPF_REG_2, -1), + /* lookup -1 in map and check perms*/ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* no key was found, exit with 0 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 0), + VIR_BPF_EXIT_INSN(), + }; + + return virBPFLoadProg(prog, BPF_PROG_TYPE_CGROUP_DEVICE, ARRAY_CARDINALITY(prog));
Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

On Mon, Jan 14, 2019 at 04:43:34PM +0000, Daniel P. Berrangé wrote:
On Mon, Jan 14, 2019 at 04:47:37PM +0100, Pavel Hrdina wrote:
This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel.
There are two possible ways how to create BPF program:
- One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils.
- The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel.
Since the program is not too complex we can use the second option.
I'm really not liking this to be honest. Even for this "simple" program, I struggle to understand what the code below is doing. It is basically assembly language, which is inevitably hard to follow even for simple things.
I'd like to see us use the BPF C compiler to build the loaded program unless there's a compelling reason why it won't work
I personally don't like it as well, but the issue is that for the compilation we would have to use clang/llvm, AFAIK there is no support to compile BPF in GCC. Another issue is the loading part, you need to read the object file, extract correct sections which is not trivial and in order to have different size of maps we would have to modify the loaded data from the compiled object file. I can try to prepare alternative patches for that but it might me even more complex to understand than short assembly program. Pavel
+ + +static int +virCgroupV2DevicesLoadProg(int mapfd) +{ +# define VIR_CGROUP_BPF_LOOKUP \ + /* prepare key param on stack */ \ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), \ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), \ + /* lookup key (major << 32) | minor in map */ \ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), \ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem) + +# define VIR_CGROUP_BPF_CHECK_PERM \ + /* if no key skip perm check */ \ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), \ + /* get perms from map */ \ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), \ + /* get perms from ctx */ \ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), \ + /* (map perms) & (ctx perms) */ \ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), \ + /* if (map perms) & (ctx perms) == (ctx perms) exit, otherwise continue */ \ + VIR_BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_2, 2), \ + /* set ret 1 and exit */ \ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), \ + VIR_BPF_EXIT_INSN() + + struct bpf_insn prog[] = { + /* save ctx, argument passed to BPF program */ + VIR_BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* get major from ctx and shift << 32 */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* get minor from ctx and | to shifted major */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_6, 8), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((major << 32) | minor) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* get major from ctx and shift << 32 */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* use -1 as minor and | to shifted major */ + VIR_BPF_MOV32_IMM(BPF_REG_3, -1), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((major << 32) | -1) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* use -1 as major and shift << 32 */ + VIR_BPF_MOV32_IMM(BPF_REG_2, -1), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* get minor from ctx and | to shifted major */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_6, 8), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((-1 << 32) | minor) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* use -1 as key which means major = -1 and minor = -1 */ + VIR_BPF_MOV64_IMM(BPF_REG_2, -1), + /* lookup -1 in map and check perms*/ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* no key was found, exit with 0 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 0), + VIR_BPF_EXIT_INSN(), + }; + + return virBPFLoadProg(prog, BPF_PROG_TYPE_CGROUP_DEVICE, ARRAY_CARDINALITY(prog));
Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

On Tue, Jan 15, 2019 at 09:58:47AM +0100, Pavel Hrdina wrote:
On Mon, Jan 14, 2019 at 04:43:34PM +0000, Daniel P. Berrangé wrote:
On Mon, Jan 14, 2019 at 04:47:37PM +0100, Pavel Hrdina wrote:
This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel.
There are two possible ways how to create BPF program:
- One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils.
- The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel.
Since the program is not too complex we can use the second option.
I'm really not liking this to be honest. Even for this "simple" program, I struggle to understand what the code below is doing. It is basically assembly language, which is inevitably hard to follow even for simple things.
I'd like to see us use the BPF C compiler to build the loaded program unless there's a compelling reason why it won't work
I personally don't like it as well, but the issue is that for the compilation we would have to use clang/llvm, AFAIK there is no support to compile BPF in GCC. Another issue is the loading part, you need to read the object file, extract correct sections which is not trivial and in order to have different size of maps we would have to modify the loaded data from the compiled object file.
I don't see a problem with using clang for BPF. Reading the DPDK lists, I see you can do something like this: $ clang -I iproute2/include/ -O2 -emit-llvm -c tap_bpf_program.c -o - \ | llc -march=bpf -filetype=obj -o tap_bpf_program.o $ objdump -j l3_l4 -s tap_bpf_program.o > tap_bpf_program.bin where the tap_bpf_program.bin should be the raw instructions to be loaded. Supposedly they also have a script that turns the .bin file into a C byte array which they then compile into their binary, so they can just load the uint8[] directly, instead of from a separate file The matter of dynamically sized device maps is more tricky. Assuming the device map size is basically a list of (major,minor) numbers, that's 2 bytes per entry. We could easily do a fixed size 50 elements giving 100 bytes per guest if that makes it easier. I guess the hard thing is actually getting the dynamic data into the code.
I can try to prepare alternative patches for that but it might me even more complex to understand than short assembly program.
Maybe there's a compromise/hybrid that would work simpler ? eg is it possible to structure things so all the checking code is statically compiled, and then we dynamically generate the instructions to define the list of devices and append that to the main static code. ? Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

On Tue, Jan 15, 2019 at 10:14:33AM +0000, Daniel P. Berrangé wrote:
On Tue, Jan 15, 2019 at 09:58:47AM +0100, Pavel Hrdina wrote:
On Mon, Jan 14, 2019 at 04:43:34PM +0000, Daniel P. Berrangé wrote:
On Mon, Jan 14, 2019 at 04:47:37PM +0100, Pavel Hrdina wrote:
This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel.
There are two possible ways how to create BPF program:
- One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils.
- The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel.
Since the program is not too complex we can use the second option.
I'm really not liking this to be honest. Even for this "simple" program, I struggle to understand what the code below is doing. It is basically assembly language, which is inevitably hard to follow even for simple things.
I'd like to see us use the BPF C compiler to build the loaded program unless there's a compelling reason why it won't work
I personally don't like it as well, but the issue is that for the compilation we would have to use clang/llvm, AFAIK there is no support to compile BPF in GCC. Another issue is the loading part, you need to read the object file, extract correct sections which is not trivial and in order to have different size of maps we would have to modify the loaded data from the compiled object file.
I don't see a problem with using clang for BPF. Reading the DPDK lists, I see you can do something like this:
$ clang -I iproute2/include/ -O2 -emit-llvm -c tap_bpf_program.c -o - \ | llc -march=bpf -filetype=obj -o tap_bpf_program.o $ objdump -j l3_l4 -s tap_bpf_program.o > tap_bpf_program.bin
where the tap_bpf_program.bin should be the raw instructions to be loaded.
There is no technical problem with using clang for BPF, the only issue that I have is that we will require clang to compile libvirt from source code.
Supposedly they also have a script that turns the .bin file into a C byte array which they then compile into their binary, so they can just load the uint8[] directly, instead of from a separate file
This is not an issue, there is a sample code in kernel that uses elfutils to load the bpf.obj file into C byte array, just different method how the get the list of instructions.
The matter of dynamically sized device maps is more tricky. Assuming the device map size is basically a list of (major,minor) numbers, that's 2 bytes per entry. We could easily do a fixed size 50 elements giving 100 bytes per guest if that makes it easier. I guess the hard thing is actually getting the dynamic data into the code.
50 elements per guest will not be good enough. Yes, in most cases it's perfectly OK, but there are some users having hundreds of disks inside a guest and we would break their usage and since we don't have any written limitation of how many disk/devices can be inside a guest we have to be able to store unlimited number of devices into the map until we run out of memory.
I can try to prepare alternative patches for that but it might me even more complex to understand than short assembly program.
Maybe there's a compromise/hybrid that would work simpler ? eg is it possible to structure things so all the checking code is statically compiled, and then we dynamically generate the instructions to define the list of devices and append that to the main static code. ?
This is not an issue, the list of devices is stored into the BPF map from user-space (libvirt) and the BPF map is used by the BPF program in the kernel. The limits are that you cannot reallocate the BPF map (this is solved by creating new program with bigger map) and you cannot change the BPF map FD while the program is loaded into kernel. Pavel
Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

On Tue, Jan 15, 2019 at 01:07:55PM +0100, Pavel Hrdina wrote:
On Tue, Jan 15, 2019 at 10:14:33AM +0000, Daniel P. Berrangé wrote:
On Tue, Jan 15, 2019 at 09:58:47AM +0100, Pavel Hrdina wrote:
On Mon, Jan 14, 2019 at 04:43:34PM +0000, Daniel P. Berrangé wrote:
On Mon, Jan 14, 2019 at 04:47:37PM +0100, Pavel Hrdina wrote:
This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel.
There are two possible ways how to create BPF program:
- One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils.
- The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel.
Since the program is not too complex we can use the second option.
I'm really not liking this to be honest. Even for this "simple" program, I struggle to understand what the code below is doing. It is basically assembly language, which is inevitably hard to follow even for simple things.
I'd like to see us use the BPF C compiler to build the loaded program unless there's a compelling reason why it won't work
I personally don't like it as well, but the issue is that for the compilation we would have to use clang/llvm, AFAIK there is no support to compile BPF in GCC. Another issue is the loading part, you need to read the object file, extract correct sections which is not trivial and in order to have different size of maps we would have to modify the loaded data from the compiled object file.
I don't see a problem with using clang for BPF. Reading the DPDK lists, I see you can do something like this:
$ clang -I iproute2/include/ -O2 -emit-llvm -c tap_bpf_program.c -o - \ | llc -march=bpf -filetype=obj -o tap_bpf_program.o $ objdump -j l3_l4 -s tap_bpf_program.o > tap_bpf_program.bin
where the tap_bpf_program.bin should be the raw instructions to be loaded.
There is no technical problem with using clang for BPF, the only issue that I have is that we will require clang to compile libvirt from source code.
Supposedly they also have a script that turns the .bin file into a C byte array which they then compile into their binary, so they can just load the uint8[] directly, instead of from a separate file
This is not an issue, there is a sample code in kernel that uses elfutils to load the bpf.obj file into C byte array, just different method how the get the list of instructions.
The matter of dynamically sized device maps is more tricky. Assuming the device map size is basically a list of (major,minor) numbers, that's 2 bytes per entry. We could easily do a fixed size 50 elements giving 100 bytes per guest if that makes it easier. I guess the hard thing is actually getting the dynamic data into the code.
50 elements per guest will not be good enough. Yes, in most cases it's perfectly OK, but there are some users having hundreds of disks inside a guest and we would break their usage and since we don't have any written limitation of how many disk/devices can be inside a guest we have to be able to store unlimited number of devices into the map until we run out of memory.
I can try to prepare alternative patches for that but it might me even more complex to understand than short assembly program.
Maybe there's a compromise/hybrid that would work simpler ? eg is it possible to structure things so all the checking code is statically compiled, and then we dynamically generate the instructions to define the list of devices and append that to the main static code. ?
This compromise would not work. The dynamic data are stored in the BPF map and the program uses only the map FD which is hardcoded into the program itself. The map FD is placed into correct register before we call bpf_map_lookup function. The only dynamic part of the program is the map FD which is used on 4 places in the program. I was also looking into the possibility of using elfutils as kernel is using in one of the samples and that feels more complex than this simple assembler like program. We would have to open the binary file generated by clang, load map data and instruction data, update the map size in the loaded map data and do syscall in order to create the map, get the map FD returned by syscall, inject that FD into the loaded instructions and do syscall to create the program. This IMHO sounds more complex and than the "simple" assembler like program. As a compromise we could have the C code as a comment in our source files right before the actual assembler like version with instructions how to use clang and llvm-objdump to get the assembler instructions. That's what I've done when I was implementing the program. The only optimization was to add two macros and change some jump logic but we can remove that optimization and make it 1:1 to what we get from llvm-objdump. Pavel
This is not an issue, the list of devices is stored into the BPF map from user-space (libvirt) and the BPF map is used by the BPF program in the kernel. The limits are that you cannot reallocate the BPF map (this is solved by creating new program with bigger map) and you cannot change the BPF map FD while the program is loaded into kernel.
Pavel

This function will be called if libvirtd was restarted while some domains were running. It will try to detect existing programs attached to the guest cgroup. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 117 ++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 3 + 3 files changed, 121 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 3fc91ce207..7f1050ef5a 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1614,6 +1614,7 @@ virCgroupV2Register; # util/vircgroupv2devices.h virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; +virCgroupV2DevicesDetectProg; # util/virclosecallbacks.h virCloseCallbacksGet; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index 0b721a0aad..e609faa210 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -192,6 +192,113 @@ virCgroupV2DevicesAttachProg(virCgroupPtr group, VIR_FORCE_CLOSE(mapfd); return ret; } + + +static int +virCgroupV2DevicesCountMapEntries(int mapfd) +{ + int ret = 0; + int rc; + uint64_t key = 0; + uint64_t prevKey = 0; + + while ((rc = virBPFGetNextElem(mapfd, &prevKey, &key)) == 0) { + ret++; + prevKey = key; + } + + if (rc < 0) + return -1; + + return ret; +} + + +# define MAX_PROG_IDS 10 + +int +virCgroupV2DevicesDetectProg(virCgroupPtr group) +{ + int ret = -1; + int cgroupfd = -1; + unsigned int progcnt = 0; + unsigned int progids[MAX_PROG_IDS] = { 0 }; + VIR_AUTOFREE(char *) path = NULL; + + if (group->unified.devices.progfd > 0 && group->unified.devices.mapfd > 0) + return 0; + + if (virCgroupPathOfController(group, VIR_CGROUP_CONTROLLER_DEVICES, + NULL, &path) < 0) { + return -1; + } + + cgroupfd = open(path, O_RDONLY); + if (cgroupfd < 0) { + virReportSystemError(errno, _("unable to open '%s'"), path); + goto cleanup; + } + + if (virBPFQueryProg(cgroupfd, MAX_PROG_IDS, BPF_CGROUP_DEVICE, + &progcnt, progids) < 0) { + virReportSystemError(errno, "%s", _("unable to query cgroup BPF progs")); + goto cleanup; + } + + if (progcnt > 0) { + /* No need to have alternate code, this function will not be called + * if compiled with old kernel. */ + int progfd = virBPFGetProg(progids[0]); + int mapfd = -1; + int nitems = -1; + struct bpf_prog_info progInfo = { 0 }; + struct bpf_map_info mapInfo = { 0 }; + VIR_AUTOFREE(unsigned int *) mapIDs = NULL; + + if (progfd < 0) { + virReportSystemError(errno, "%s", _("failed to get cgroup BPF prog FD")); + goto cleanup; + } + + if (virBPFGetProgInfo(progfd, &progInfo, &mapIDs) < 0) { + virReportSystemError(errno, "%s", _("failed to get cgroup BPF prog info")); + goto cleanup; + } + + if (progInfo.nr_map_ids == 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("no map for cgroup BPF prog")); + goto cleanup; + } + + mapfd = virBPFGetMap(mapIDs[0]); + if (mapfd < 0) { + virReportSystemError(errno, "%s", _("failed to get cgroup BPF map FD")); + goto cleanup; + } + + if (virBPFGetMapInfo(mapfd, &mapInfo) < 0) { + virReportSystemError(errno, "%s", _("failed to get cgroup BPF map info")); + goto cleanup; + } + + nitems = virCgroupV2DevicesCountMapEntries(mapfd); + if (nitems < 0) { + virReportSystemError(errno, "%s", _("failed to count cgroup BPF map items")); + goto cleanup; + } + + group->unified.devices.progfd = progfd; + group->unified.devices.mapfd = mapfd; + group->unified.devices.max = mapInfo.max_entries; + group->unified.devices.count = nitems; + } + + ret = 0; + cleanup: + VIR_FORCE_CLOSE(cgroupfd); + return ret; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) @@ -210,4 +317,14 @@ virCgroupV2DevicesAttachProg(virCgroupPtr group ATTRIBUTE_UNUSED, "with this kernel")); return -1; } + + +int +virCgroupV2DevicesDetectProg(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("cgroups v2 BPF devices not supported " + "with this kernel")); + return -1; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index 1ba87acb00..a8e50dcca5 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -29,4 +29,7 @@ virCgroupV2DevicesAttachProg(virCgroupPtr group, int mapfd, size_t max); +int +virCgroupV2DevicesDetectProg(virCgroupPtr group); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

This function creates new BPF program with new empty BPF map with the default size and attaches it to the guest cgroup. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 53 +++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 3 ++ 3 files changed, 57 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 7f1050ef5a..8ec188e9cd 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1614,6 +1614,7 @@ virCgroupV2Register; # util/vircgroupv2devices.h virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; +virCgroupV2DevicesCreateProg; virCgroupV2DevicesDetectProg; # util/virclosecallbacks.h diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index e609faa210..cd369ef6ab 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -299,6 +299,49 @@ virCgroupV2DevicesDetectProg(virCgroupPtr group) VIR_FORCE_CLOSE(cgroupfd); return ret; } + + +# define VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE 64 + +static int +virCgroupV2DevicesCreateMap(size_t size) +{ + int mapfd = virBPFCreateMap(BPF_MAP_TYPE_HASH, sizeof(uint64_t), + sizeof(uint32_t), size); + + if (mapfd < 0) { + virReportSystemError(errno, "%s", + _("failed to initialize device BPF map")); + return -1; + } + + return mapfd; +} + + +int +virCgroupV2DevicesCreateProg(virCgroupPtr group) +{ + int mapfd; + + if (group->unified.devices.progfd > 0 && group->unified.devices.mapfd > 0) + return 0; + + mapfd = virCgroupV2DevicesCreateMap(VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE); + if (mapfd < 0) + return -1; + + if (virCgroupV2DevicesAttachProg(group, mapfd, + VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE) < 0) { + goto error; + } + + return 0; + + error: + VIR_FORCE_CLOSE(mapfd); + return -1; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) @@ -327,4 +370,14 @@ virCgroupV2DevicesDetectProg(virCgroupPtr group ATTRIBUTE_UNUSED) "with this kernel")); return -1; } + + +int +virCgroupV2DevicesCreateProg(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("cgroups v2 BPF devices not supported " + "with this kernel")); + return -1; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index a8e50dcca5..bcbd761537 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -32,4 +32,7 @@ virCgroupV2DevicesAttachProg(virCgroupPtr group, int virCgroupV2DevicesDetectProg(virCgroupPtr group); +int +virCgroupV2DevicesCreateProg(virCgroupPtr group); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

This function will be called for every virCgroup(Allow|Deny)* API in order to prepare BPF program for guest. Since libvirtd can be restarted at any point we will first try to detect existing progam, if there is none we will create a new empty BPF program and lastly if we don't have any space left in the existing BPF map we will create a new copy of the BPF map with more space and attach a new program with that map into the guest cgroup. This solution allows us to start with reasonably small BPF map consuming only small amount of memory and if needed we can easily extend the BPF map if there is a lot of host devices used in guest or if user wants to hot-plug a lot of devices once the guest is running. Since there is no way how to reallocate existing BPF map we need to create a new copy if we run out of space in current BPF map. This overcomes all the limitations in BPF: - map used in program has to be created before the program is loaded into kernel - once map is created you cannot change its size - you cannot replace map in existing program - you cannot use an array of maps because it can store FD to maps of one specific size so we would not be able to use it to overcome the second issue Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 83 +++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 3 ++ 3 files changed, 87 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 8ec188e9cd..1d1f8bf478 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1616,6 +1616,7 @@ virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; virCgroupV2DevicesCreateProg; virCgroupV2DevicesDetectProg; +virCgroupV2DevicesPrepareProg; # util/virclosecallbacks.h virCloseCallbacksGet; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index cd369ef6ab..e3a415d615 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -319,6 +319,52 @@ virCgroupV2DevicesCreateMap(size_t size) } +static int +virCgroupV2DevicesReallocMap(int mapfd, + size_t size) +{ + uint64_t key = 0; + uint64_t prevKey = 0; + int rc; + int newmapfd = virCgroupV2DevicesCreateMap(size); + + VIR_DEBUG("realloc devices map mapfd:%d, size:%lu", mapfd, size); + + if (newmapfd < 0) + return -1; + + while ((rc = virBPFGetNextElem(mapfd, &prevKey, &key)) == 0) { + uint32_t val = 0; + + if (virBPFLookupElem(mapfd, &key, &val) < 0) { + virReportSystemError(errno, "%s", + _("failed to lookup device in old map")); + goto error; + } + + if (virBPFUpdateElem(newmapfd, &key, &val) < 0) { + virReportSystemError(errno, "%s", + _("failed to add device into new map")); + goto error; + } + + prevKey = key; + } + + if (rc < 0 && errno != ENOENT) { + virReportSystemError(errno, "%s", + _("failed to copy all device rules")); + goto error; + } + + return newmapfd; + + error: + VIR_FORCE_CLOSE(newmapfd); + return -1; +} + + int virCgroupV2DevicesCreateProg(virCgroupPtr group) { @@ -342,6 +388,33 @@ virCgroupV2DevicesCreateProg(virCgroupPtr group) VIR_FORCE_CLOSE(mapfd); return -1; } + + +int +virCgroupV2DevicesPrepareProg(virCgroupPtr group) +{ + if (virCgroupV2DevicesDetectProg(group) < 0) + return -1; + + if (virCgroupV2DevicesCreateProg(group) < 0) + return -1; + + if (group->unified.devices.count >= group->unified.devices.max) { + size_t max = group->unified.devices.max * 2; + int newmapfd = virCgroupV2DevicesReallocMap(group->unified.devices.mapfd, + max); + + if (newmapfd < 0) + return -1; + + if (virCgroupV2DevicesAttachProg(group, newmapfd, max) < 0) { + VIR_FORCE_CLOSE(newmapfd); + return -1; + } + } + + return 0; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) @@ -380,4 +453,14 @@ virCgroupV2DevicesCreateProg(virCgroupPtr group ATTRIBUTE_UNUSED) "with this kernel")); return -1; } + + +int +virCgroupV2DevicesPrepareProg(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("cgroups v2 BPF devices not supported " + "with this kernel")); + return -1; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index bcbd761537..10c80c8ae4 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -35,4 +35,7 @@ virCgroupV2DevicesDetectProg(virCgroupPtr group); int virCgroupV2DevicesCreateProg(virCgroupPtr group); +int +virCgroupV2DevicesPrepareProg(virCgroupPtr group); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

We need to close our FD that we have for BPF program and map in order to let kernel remove all resources once the cgroup is removed as well. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2.c | 3 +++ src/util/vircgroupv2devices.c | 26 ++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 3 +++ 4 files changed, 33 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 1d1f8bf478..c4f5cdea46 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1617,6 +1617,7 @@ virCgroupV2DevicesAvailable; virCgroupV2DevicesCreateProg; virCgroupV2DevicesDetectProg; virCgroupV2DevicesPrepareProg; +virCgroupV2DevicesRemoveProg; # util/virclosecallbacks.h virCloseCallbacksGet; diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index a56f3443e3..b5d4776d14 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -439,6 +439,9 @@ virCgroupV2Remove(virCgroupPtr group) if (virCgroupV2PathOfController(group, controller, "", &grppath) < 0) return 0; + if (virCgroupV2DevicesRemoveProg(group) < 0) + return -1; + return virCgroupRemoveRecursively(grppath); } diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index e3a415d615..e7f8c048fc 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -415,6 +415,25 @@ virCgroupV2DevicesPrepareProg(virCgroupPtr group) return 0; } + + +int +virCgroupV2DevicesRemoveProg(virCgroupPtr group) +{ + if (virCgroupV2DevicesDetectProg(group) < 0) + return -1; + + if (group->unified.devices.progfd <= 0 && group->unified.devices.mapfd <= 0) + return 0; + + if (group->unified.devices.mapfd >= 0) + VIR_FORCE_CLOSE(group->unified.devices.mapfd); + + if (group->unified.devices.progfd >= 0) + VIR_FORCE_CLOSE(group->unified.devices.progfd); + + return 0; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) @@ -463,4 +482,11 @@ virCgroupV2DevicesPrepareProg(virCgroupPtr group ATTRIBUTE_UNUSED) "with this kernel")); return -1; } + + +int +virCgroupV2DevicesRemoveProg(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + return 0; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index 10c80c8ae4..ae02f9c334 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -38,4 +38,7 @@ virCgroupV2DevicesCreateProg(virCgroupPtr group); int virCgroupV2DevicesPrepareProg(virCgroupPtr group); +int +virCgroupV2DevicesRemoveProg(virCgroupPtr group); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 34 ++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 6 ++++++ 3 files changed, 41 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index c4f5cdea46..0d25911bc1 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1616,6 +1616,7 @@ virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; virCgroupV2DevicesCreateProg; virCgroupV2DevicesDetectProg; +virCgroupV2DevicesGetPerms; virCgroupV2DevicesPrepareProg; virCgroupV2DevicesRemoveProg; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index e7f8c048fc..9cf9edee3a 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -434,6 +434,32 @@ virCgroupV2DevicesRemoveProg(virCgroupPtr group) return 0; } + + +uint32_t +virCgroupV2DevicesGetPerms(int perms, + char type) +{ + uint32_t ret = 0; + + if (perms & VIR_CGROUP_DEVICE_MKNOD) + ret |= BPF_DEVCG_ACC_MKNOD << 16; + + if (perms & VIR_CGROUP_DEVICE_READ) + ret |= BPF_DEVCG_ACC_READ << 16; + + if (perms & VIR_CGROUP_DEVICE_WRITE) + ret |= BPF_DEVCG_ACC_WRITE << 16; + + if (type == 'b') + ret |= BPF_DEVCG_DEV_BLOCK; + else if (type == 'c') + ret |= BPF_DEVCG_DEV_CHAR; + else + ret |= BPF_DEVCG_DEV_BLOCK | BPF_DEVCG_DEV_CHAR; + + return ret; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) @@ -489,4 +515,12 @@ virCgroupV2DevicesRemoveProg(virCgroupPtr group ATTRIBUTE_UNUSED) { return 0; } + + +uint32_t +virCgroupV2DevicesGetPerms(int perms ATTRIBUTE_UNUSED, + char type ATTRIBUTE_UNUSED) +{ + return 0; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index ae02f9c334..cbfd9ae119 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -19,6 +19,8 @@ #ifndef LIBVIRT_VIRCGROUPV2DEVICES_H # define LIBVIRT_VIRCGROUPV2DEVICES_H +# include <sys/types.h> + # include "vircgroup.h" bool @@ -41,4 +43,8 @@ virCgroupV2DevicesPrepareProg(virCgroupPtr group); int virCgroupV2DevicesRemoveProg(virCgroupPtr group); +uint32_t +virCgroupV2DevicesGetPerms(int perms, + char type); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

Device rules are stored in BPF map that is a hash type, this function will create a key based on major and minor id of device. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 8 ++++++++ src/util/vircgroupv2devices.h | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 0d25911bc1..f1da5ede71 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1616,6 +1616,7 @@ virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; virCgroupV2DevicesCreateProg; virCgroupV2DevicesDetectProg; +virCgroupV2DevicesGetKey; virCgroupV2DevicesGetPerms; virCgroupV2DevicesPrepareProg; virCgroupV2DevicesRemoveProg; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index 9cf9edee3a..8bf5100724 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -524,3 +524,11 @@ virCgroupV2DevicesGetPerms(int perms ATTRIBUTE_UNUSED, return 0; } #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ + + +uint64_t +virCgroupV2DevicesGetKey(int major, + int minor) +{ + return (uint64_t)major << 32 | ((uint64_t)minor & 0x00000000ffffffff); +} diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index cbfd9ae119..d717c9755f 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -47,4 +47,8 @@ uint32_t virCgroupV2DevicesGetPerms(int perms, char type); +uint64_t +virCgroupV2DevicesGetKey(int major, + int minor); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

In order to allow device we need to create key and value which will be used to update BPF map. virBPFUpdateElem() can override existing entries in BPF map so we need to check if that entry exists in order to track number of entries in our map. This can add rule for specific device but major and minor can be both -1 which follows the same behavior as in cgroup v1. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index b5d4776d14..962d41ba0a 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -29,6 +29,7 @@ #define LIBVIRT_VIRCGROUPPRIV_H_ALLOW #include "vircgrouppriv.h" +#include "virbpf.h" #include "vircgroup.h" #include "vircgroupbackend.h" #include "vircgroupv2.h" @@ -1568,6 +1569,35 @@ virCgroupV2GetCpuacctStat(virCgroupPtr group, } +static int +virCgroupV2AllowDevice(virCgroupPtr group, + char type, + int major, + int minor, + int perms) +{ + uint64_t key = virCgroupV2DevicesGetKey(major, minor); + uint32_t val = virCgroupV2DevicesGetPerms(perms, type); + int rc; + + if (virCgroupV2DevicesPrepareProg(group) < 0) + return -1; + + rc = virBPFLookupElem(group->unified.devices.mapfd, &key, NULL); + + if (virBPFUpdateElem(group->unified.devices.mapfd, &key, &val) < 0) { + virReportSystemError(errno, "%s", + _("failed to update device in BPF cgroup map")); + return -1; + } + + if (rc < 0) + group->unified.devices.count++; + + return 0; +} + + virCgroupBackend virCgroupV2Backend = { .type = VIR_CGROUP_BACKEND_TYPE_V2, @@ -1617,6 +1647,8 @@ virCgroupBackend virCgroupV2Backend = { .getMemSwapHardLimit = virCgroupV2GetMemSwapHardLimit, .getMemSwapUsage = virCgroupV2GetMemSwapUsage, + .allowDevice = virCgroupV2AllowDevice, + .setCpuShares = virCgroupV2SetCpuShares, .getCpuShares = virCgroupV2GetCpuShares, .setCpuCfsPeriod = virCgroupV2SetCpuCfsPeriod, -- 2.20.1

In order to deny device we need to check if there is any entry in BPF map and we need to load the current value from map if there is already entry for that device. If both values are same we can remove that entry but if they are different we need to update the entry because we don't have to deny all access, but for example only write access. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index 962d41ba0a..b6c09baadc 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -1598,6 +1598,46 @@ virCgroupV2AllowDevice(virCgroupPtr group, } +static int +virCgroupV2DenyDevice(virCgroupPtr group, + char type, + int major, + int minor, + int perms) +{ + uint64_t key = virCgroupV2DevicesGetKey(major, minor); + uint32_t newval = virCgroupV2DevicesGetPerms(perms, type); + uint32_t val = 0; + + if (virCgroupV2DevicesPrepareProg(group) < 0) + return -1; + + if (group->unified.devices.count <= 0 || + virBPFLookupElem(group->unified.devices.mapfd, &key, &val) < 0) { + VIR_DEBUG("nothing to do, device is not allowed"); + return 0; + } + + if (newval == val) { + if (virBPFDeleteElem(group->unified.devices.mapfd, &key) < 0) { + virReportSystemError(errno, "%s", + _("failed to remove device from BPF cgroup map")); + return -1; + } + group->unified.devices.count--; + } else { + val ^= val & newval; + if (virBPFUpdateElem(group->unified.devices.mapfd, &key, &val) < 0) { + virReportSystemError(errno, "%s", + _("failed to update device in BPF cgroup map")); + return -1; + } + } + + return 0; +} + + virCgroupBackend virCgroupV2Backend = { .type = VIR_CGROUP_BACKEND_TYPE_V2, @@ -1648,6 +1688,7 @@ virCgroupBackend virCgroupV2Backend = { .getMemSwapUsage = virCgroupV2GetMemSwapUsage, .allowDevice = virCgroupV2AllowDevice, + .denyDevice = virCgroupV2DenyDevice, .setCpuShares = virCgroupV2SetCpuShares, .getCpuShares = virCgroupV2GetCpuShares, -- 2.20.1

If we want to allow all devices with all permissions we need to replace any existing program that has any rule configured, otherwise we just need to add new rule which will for example allow read access to all devices. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index b6c09baadc..8ad4de986f 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -1638,6 +1638,23 @@ virCgroupV2DenyDevice(virCgroupPtr group, } +static int +virCgroupV2AllowAllDevices(virCgroupPtr group, + int perms) +{ + if (virCgroupV2DevicesPrepareProg(group) < 0) + return -1; + + if (group->unified.devices.count > 0 && + perms == VIR_CGROUP_DEVICE_RWM && + virCgroupV2DevicesCreateProg(group) < 0) { + return -1; + } + + return virCgroupV2AllowDevice(group, 'a', -1, -1, perms); +} + + virCgroupBackend virCgroupV2Backend = { .type = VIR_CGROUP_BACKEND_TYPE_V2, @@ -1689,6 +1706,7 @@ virCgroupBackend virCgroupV2Backend = { .allowDevice = virCgroupV2AllowDevice, .denyDevice = virCgroupV2DenyDevice, + .allowAllDevices = virCgroupV2AllowAllDevices, .setCpuShares = virCgroupV2SetCpuShares, .getCpuShares = virCgroupV2GetCpuShares, -- 2.20.1

If we want to deny all devices we just need to replace any existing program with new program with empty map. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index 8ad4de986f..f2ea1eb7df 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -1655,6 +1655,16 @@ virCgroupV2AllowAllDevices(virCgroupPtr group, } +static int +virCgroupV2DenyAllDevices(virCgroupPtr group) +{ + if (virCgroupV2DevicesDetectProg(group) < 0) + return -1; + + return virCgroupV2DevicesCreateProg(group); +} + + virCgroupBackend virCgroupV2Backend = { .type = VIR_CGROUP_BACKEND_TYPE_V2, @@ -1707,6 +1717,7 @@ virCgroupBackend virCgroupV2Backend = { .allowDevice = virCgroupV2AllowDevice, .denyDevice = virCgroupV2DenyDevice, .allowAllDevices = virCgroupV2AllowAllDevices, + .denyAllDevices = virCgroupV2DenyAllDevices, .setCpuShares = virCgroupV2SetCpuShares, .getCpuShares = virCgroupV2GetCpuShares, -- 2.20.1

So the issue here is that you can end up with configuration where you have cgroup v1 and v2 enabled at the same time and the devices controllers is enabled for cgroup v1. In cgroup v2 there is no devices controller, the device access is controlled using BPF and since it is not a cgroup controller both of them can exists at the same time and both of them are applied while resolving access to devices. In order to avoid configuring both BPF and cgroup v1 devices we will use BPF if possible and otherwise fallback to cgroup v1 devices. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroup.c | 3 ++- src/util/vircgroupbackend.h | 3 ++- src/util/vircgroupv1.c | 9 ++++++++- src/util/vircgroupv2.c | 5 ++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/util/vircgroup.c b/src/util/vircgroup.c index 3ebb3b0a0f..9d284e9bf4 100644 --- a/src/util/vircgroup.c +++ b/src/util/vircgroup.c @@ -381,7 +381,8 @@ virCgroupDetect(virCgroupPtr group, for (i = 0; i < VIR_CGROUP_BACKEND_TYPE_LAST; i++) { if (group->backends[i]) { - int rc = group->backends[i]->detectControllers(group, controllers); + int rc = group->backends[i]->detectControllers(group, controllers, + controllersAvailable); if (rc < 0) return -1; controllersAvailable |= rc; diff --git a/src/util/vircgroupbackend.h b/src/util/vircgroupbackend.h index 24b45be9bb..9bc8e7b11d 100644 --- a/src/util/vircgroupbackend.h +++ b/src/util/vircgroupbackend.h @@ -96,7 +96,8 @@ typedef char * typedef int (*virCgroupDetectControllersCB)(virCgroupPtr group, - int controllers); + int controllers, + int detected); typedef bool (*virCgroupHasControllerCB)(virCgroupPtr cgroup, diff --git a/src/util/vircgroupv1.c b/src/util/vircgroupv1.c index f6707e4894..c94fad0591 100644 --- a/src/util/vircgroupv1.c +++ b/src/util/vircgroupv1.c @@ -417,7 +417,8 @@ virCgroupV1StealPlacement(virCgroupPtr group) static int virCgroupV1DetectControllers(virCgroupPtr group, - int controllers) + int controllers, + int detected) { size_t i; size_t j; @@ -427,6 +428,9 @@ virCgroupV1DetectControllers(virCgroupPtr group, /* First mark requested but non-existing controllers to be ignored */ for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) { if (((1 << i) & controllers)) { + int type = 1 << i; + if (type & detected) + VIR_FREE(group->legacy[i].mountPoint); /* Remove non-existent controllers */ if (!group->legacy[i].mountPoint) { VIR_DEBUG("Requested controller '%s' not mounted, ignoring", @@ -466,6 +470,9 @@ virCgroupV1DetectControllers(virCgroupPtr group, VIR_DEBUG("Auto-detecting controllers"); controllers = 0; for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) { + int type = 1 << i; + if (type & detected) + VIR_FREE(group->legacy[i].mountPoint); VIR_DEBUG("Controller '%s' present=%s", virCgroupV1ControllerTypeToString(i), group->legacy[i].mountPoint ? "yes" : "no"); diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index f2ea1eb7df..44c4c7fc90 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -284,7 +284,8 @@ virCgroupV2ParseControllersFile(virCgroupPtr group) static int virCgroupV2DetectControllers(virCgroupPtr group, - int controllers) + int controllers, + int detected) { size_t i; @@ -297,6 +298,8 @@ virCgroupV2DetectControllers(virCgroupPtr group, if (virCgroupV2DevicesAvailable(group)) group->unified.controllers |= 1 << VIR_CGROUP_CONTROLLER_DEVICES; + group->unified.controllers &= ~detected; + for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) VIR_DEBUG("Controller '%s' present=%s", virCgroupV2ControllerTypeToString(i), -- 2.20.1

This function simply removes program from guest cgroup before we remove the cgroup. This is required step because there is a bug [1] in kernel where the program might not be properly freed if you remove cgroup with attached program. [1] <https://bugzilla.redhat.com/show_bug.cgi?id=1656432> Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2devices.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index 8bf5100724..a481ba89b7 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -420,19 +420,44 @@ virCgroupV2DevicesPrepareProg(virCgroupPtr group) int virCgroupV2DevicesRemoveProg(virCgroupPtr group) { + int ret = -1; + int cgroupfd = -1; + VIR_AUTOFREE(char *) path = NULL; + if (virCgroupV2DevicesDetectProg(group) < 0) return -1; if (group->unified.devices.progfd <= 0 && group->unified.devices.mapfd <= 0) return 0; + if (virCgroupPathOfController(group, VIR_CGROUP_CONTROLLER_DEVICES, + NULL, &path) < 0) { + return -1; + } + + cgroupfd = open(path, O_RDONLY); + if (cgroupfd < 0) { + virReportSystemError(errno, _("unable to open '%s'"), path); + goto cleanup; + } + + if (virBPFDetachProg(group->unified.devices.progfd, + cgroupfd, BPF_CGROUP_DEVICE) < 0) { + virReportSystemError(errno, "%s", _("failed to detach cgroup BPF prog")); + goto cleanup; + } + if (group->unified.devices.mapfd >= 0) VIR_FORCE_CLOSE(group->unified.devices.mapfd); if (group->unified.devices.progfd >= 0) VIR_FORCE_CLOSE(group->unified.devices.progfd); - return 0; + ret = 0; + + cleanup: + VIR_FORCE_CLOSE(cgroupfd); + return ret; } -- 2.20.1

If some program is attached to cgroup and that cgroup is removed before detaching the program it might not be freed and will remain in the system until it's rebooted. This would not be that big deal to workaround if we wouldn't use machined to track our guests. Systemd tries to be the nice guy and if there is no process attached to TransientUnit it will destroy the unit and remove the cgroup from system which will hit the kernel bug. In order to prevent this behavior of systemd we can move another process into the guest cgroup and the cgroup will remain active even if we kill qemu process while destroying guest. We can then detach our BPF program from that cgroup and call machined terminate API to cleanup the cgroup. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/lxc/lxc_cgroup.c | 1 + src/qemu/qemu_cgroup.c | 6 ++- src/util/vircgroup.c | 16 ++++++++ src/util/vircgroup.h | 1 + src/util/vircgrouppriv.h | 2 + src/util/vircgroupv2.c | 2 + src/util/vircgroupv2devices.c | 70 ++++++++++++++++++++++++++++++++++- src/util/virsystemd.c | 2 +- src/util/virsystemd.h | 2 + 10 files changed, 99 insertions(+), 4 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index f1da5ede71..c22c81cebe 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -3031,6 +3031,7 @@ virSystemdCanHybridSleep; virSystemdCanSuspend; virSystemdCreateMachine; virSystemdGetMachineNameByPID; +virSystemdHasMachined; virSystemdHasMachinedResetCachedValue; virSystemdMakeScopeName; virSystemdMakeSliceName; diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index d93a19d684..a8cef74bb9 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -455,6 +455,7 @@ virCgroupPtr virLXCCgroupCreate(virDomainDefPtr def, nnicindexes, nicindexes, def->resource->partition, -1, + LXC_STATE_DIR, &cgroup) < 0) goto cleanup; diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 4931fb6575..9374a799a1 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -946,6 +946,7 @@ qemuInitCgroup(virDomainObjPtr vm, nnicindexes, nicindexes, vm->def->resource->partition, cfg->cgroupControllers, + cfg->stateDir, &priv->cgroup) < 0) { if (virCgroupNewIgnoreError()) goto done; @@ -1256,16 +1257,19 @@ int qemuRemoveCgroup(virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; + int rc = 0; if (priv->cgroup == NULL) return 0; /* Not supported, so claim success */ + rc = virCgroupRemove(priv->cgroup); + if (virCgroupTerminateMachine(priv->machineName) < 0) { if (!virCgroupNewIgnoreError()) VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name); } - return virCgroupRemove(priv->cgroup); + return rc; } diff --git a/src/util/vircgroup.c b/src/util/vircgroup.c index 9d284e9bf4..506cc49b9a 100644 --- a/src/util/vircgroup.c +++ b/src/util/vircgroup.c @@ -1105,6 +1105,7 @@ virCgroupNewMachineSystemd(const char *name, int *nicindexes, const char *partition, int controllers, + const char *stateDir, virCgroupPtr *group) { int rv; @@ -1151,6 +1152,16 @@ virCgroupNewMachineSystemd(const char *name, return -1; } + if (VIR_STRDUP((*group)->stateDir, stateDir) < 0) { + virCgroupFree(group); + return -1; + } + + if (VIR_STRDUP((*group)->vmName, name) < 0) { + virCgroupFree(group); + return -1; + } + if (virCgroupAddProcess(*group, pidleader) < 0) { virErrorPtr saved = virSaveLastError(); virCgroupRemove(*group); @@ -1233,6 +1244,7 @@ virCgroupNewMachine(const char *name, int *nicindexes, const char *partition, int controllers, + const char *stateDir, virCgroupPtr *group) { int rv; @@ -1249,6 +1261,7 @@ virCgroupNewMachine(const char *name, nicindexes, partition, controllers, + stateDir, group)) == 0) return 0; @@ -1301,6 +1314,8 @@ virCgroupFree(virCgroupPtr *group) VIR_FREE((*group)->unified.placement); VIR_FREE((*group)->path); + VIR_FREE((*group)->stateDir); + VIR_FREE((*group)->vmName); VIR_FREE(*group); } @@ -2897,6 +2912,7 @@ virCgroupNewMachine(const char *name ATTRIBUTE_UNUSED, int *nicindexes ATTRIBUTE_UNUSED, const char *partition ATTRIBUTE_UNUSED, int controllers ATTRIBUTE_UNUSED, + const char *stateDir ATTRIBUTE_UNUSED, virCgroupPtr *group ATTRIBUTE_UNUSED) { virReportSystemError(ENXIO, "%s", diff --git a/src/util/vircgroup.h b/src/util/vircgroup.h index 372009de4a..0cd90d3651 100644 --- a/src/util/vircgroup.h +++ b/src/util/vircgroup.h @@ -98,6 +98,7 @@ int virCgroupNewMachine(const char *name, int *nicindexes, const char *partition, int controllers, + const char *stateDir, virCgroupPtr *group) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_NONNULL(3); diff --git a/src/util/vircgrouppriv.h b/src/util/vircgrouppriv.h index 085fea375c..e0ccce88a0 100644 --- a/src/util/vircgrouppriv.h +++ b/src/util/vircgrouppriv.h @@ -62,6 +62,8 @@ typedef virCgroupV2Controller *virCgroupV2ControllerPtr; struct _virCgroup { char *path; + char *stateDir; + char *vmName; virCgroupBackendPtr backends[VIR_CGROUP_BACKEND_TYPE_LAST]; diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index 44c4c7fc90..03efabe198 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -37,6 +37,8 @@ #include "virerror.h" #include "virfile.h" #include "virlog.h" +#include "virpidfile.h" +#include "virprocess.h" #include "virstring.h" #include "virsystemd.h" diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index a481ba89b7..9d8fc5557e 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -31,10 +31,15 @@ #include "vircgrouppriv.h" #include "virbpf.h" +#include "vircommand.h" #include "vircgroup.h" #include "vircgroupv2devices.h" #include "virfile.h" #include "virlog.h" +#include "virpidfile.h" +#include "virprocess.h" +#include "virstring.h" +#include "virsystemd.h" VIR_LOG_INIT("util.cgroup"); @@ -365,17 +370,74 @@ virCgroupV2DevicesReallocMap(int mapfd, } +static pid_t +virCgroupV2DevicesStartDummyProc(virCgroupPtr group) +{ + pid_t ret = -1; + pid_t pid = -1; + virCommandPtr cmd = NULL; + VIR_AUTOFREE(char *) pidfile = NULL; + VIR_AUTOFREE(char *) fileName = NULL; + + if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0) + return -1; + + pidfile = virPidFileBuildPath(group->stateDir, fileName); + if (!pidfile) + return -1; + + cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null", + "-s", "3600", NULL); + if (!cmd) + return -1; + + virCommandDaemonize(cmd); + virCommandSetPidFile(cmd, pidfile); + + if (virCommandRun(cmd, NULL) < 0) + goto cleanup; + + if (virPidFileReadPath(pidfile, &pid) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to start dummy cgroup helper")); + goto cleanup; + } + + if (virCgroupAddMachineProcess(group, pid) < 0) + goto cleanup; + + ret = pid; + cleanup: + if (ret < 0) { + virCommandAbort(cmd); + if (pid > 0) + virProcessKillPainfully(pid, true); + } + if (pidfile) + unlink(pidfile); + virCommandFree(cmd); + return ret; +} + + int virCgroupV2DevicesCreateProg(virCgroupPtr group) { - int mapfd; + int mapfd = -1; + pid_t pid = -1; if (group->unified.devices.progfd > 0 && group->unified.devices.mapfd > 0) return 0; + if (virSystemdHasMachined() == 0) { + pid = virCgroupV2DevicesStartDummyProc(group); + if (pid < 0) + return -1; + } + mapfd = virCgroupV2DevicesCreateMap(VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE); if (mapfd < 0) - return -1; + goto error; if (virCgroupV2DevicesAttachProg(group, mapfd, VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE) < 0) { @@ -385,6 +447,7 @@ virCgroupV2DevicesCreateProg(virCgroupPtr group) return 0; error: + virProcessKillPainfully(pid, true); VIR_FORCE_CLOSE(mapfd); return -1; } @@ -453,6 +516,9 @@ virCgroupV2DevicesRemoveProg(virCgroupPtr group) if (group->unified.devices.progfd >= 0) VIR_FORCE_CLOSE(group->unified.devices.progfd); + if (virSystemdHasMachined() == 0) + virCgroupKillPainfully(group); + ret = 0; cleanup: diff --git a/src/util/virsystemd.c b/src/util/virsystemd.c index f492ac1859..ca35b12a5c 100644 --- a/src/util/virsystemd.c +++ b/src/util/virsystemd.c @@ -138,7 +138,7 @@ void virSystemdHasMachinedResetCachedValue(void) * -1 = error * 0 = machine1 is available */ -static int +int virSystemdHasMachined(void) { int ret; diff --git a/src/util/virsystemd.h b/src/util/virsystemd.h index 7d9c0ebd62..553d4196f1 100644 --- a/src/util/virsystemd.h +++ b/src/util/virsystemd.h @@ -51,4 +51,6 @@ int virSystemdCanHybridSleep(bool *result); char *virSystemdGetMachineNameByPID(pid_t pid); +int virSystemdHasMachined(void); + #endif /* LIBVIRT_VIRSYSTEMD_H */ -- 2.20.1

On 1/14/19 4:47 PM, Pavel Hrdina wrote:
If some program is attached to cgroup and that cgroup is removed before detaching the program it might not be freed and will remain in the system until it's rebooted.
This would not be that big deal to workaround if we wouldn't use machined to track our guests. Systemd tries to be the nice guy and if there is no process attached to TransientUnit it will destroy the unit and remove the cgroup from system which will hit the kernel bug.
In order to prevent this behavior of systemd we can move another process into the guest cgroup and the cgroup will remain active even if we kill qemu process while destroying guest. We can then detach our BPF program from that cgroup and call machined terminate API to cleanup the cgroup.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/lxc/lxc_cgroup.c | 1 + src/qemu/qemu_cgroup.c | 6 ++- src/util/vircgroup.c | 16 ++++++++ src/util/vircgroup.h | 1 + src/util/vircgrouppriv.h | 2 + src/util/vircgroupv2.c | 2 + src/util/vircgroupv2devices.c | 70 ++++++++++++++++++++++++++++++++++- src/util/virsystemd.c | 2 +- src/util/virsystemd.h | 2 + 10 files changed, 99 insertions(+), 4 deletions(-)
+static pid_t +virCgroupV2DevicesStartDummyProc(virCgroupPtr group) +{ + pid_t ret = -1; + pid_t pid = -1; + virCommandPtr cmd = NULL; + VIR_AUTOFREE(char *) pidfile = NULL; + VIR_AUTOFREE(char *) fileName = NULL; + + if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0) + return -1; + + pidfile = virPidFileBuildPath(group->stateDir, fileName); + if (!pidfile) + return -1; + + cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null", + "-s", "3600", NULL); + if (!cmd) + return -1; + + virCommandDaemonize(cmd); + virCommandSetPidFile(cmd, pidfile); + + if (virCommandRun(cmd, NULL) < 0) + goto cleanup; + + if (virPidFileReadPath(pidfile, &pid) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to start dummy cgroup helper")); + goto cleanup; + } + + if (virCgroupAddMachineProcess(group, pid) < 0) + goto cleanup; + + ret = pid; + cleanup: + if (ret < 0) { + virCommandAbort(cmd); + if (pid > 0) + virProcessKillPainfully(pid, true); + } + if (pidfile) + unlink(pidfile); + virCommandFree(cmd); + return ret; +}
I don't think we should bother with this. This is clearly a kernel bug and once fixed (which we can't detect) we would still be placing this 'tail' into the cgroup for nothing. I know we have lots of workarounds for bugs in other libraries (and even probably kernels too) in our code, but we should start getting rid of those instead of introducing new ones. Michal

On Mon, Jan 14, 2019 at 04:47:50PM +0100, Pavel Hrdina wrote:
If some program is attached to cgroup and that cgroup is removed before detaching the program it might not be freed and will remain in the system until it's rebooted.
This would not be that big deal to workaround if we wouldn't use machined to track our guests. Systemd tries to be the nice guy and if there is no process attached to TransientUnit it will destroy the unit and remove the cgroup from system which will hit the kernel bug.
In order to prevent this behavior of systemd we can move another process into the guest cgroup and the cgroup will remain active even if we kill qemu process while destroying guest. We can then detach our BPF program from that cgroup and call machined terminate API to cleanup the cgroup.
The kernel bugzilla says that "bpftool" still reports existance of the BPF program when this situation hits. So can't we just use bpftool to explicitly delete this orphaned program
+static pid_t +virCgroupV2DevicesStartDummyProc(virCgroupPtr group) +{ + pid_t ret = -1; + pid_t pid = -1; + virCommandPtr cmd = NULL; + VIR_AUTOFREE(char *) pidfile = NULL; + VIR_AUTOFREE(char *) fileName = NULL; + + if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0) + return -1; + + pidfile = virPidFileBuildPath(group->stateDir, fileName); + if (!pidfile) + return -1; + + cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null", + "-s", "3600", NULL); + if (!cmd) + return -1; + + virCommandDaemonize(cmd); + virCommandSetPidFile(cmd, pidfile); + + if (virCommandRun(cmd, NULL) < 0) + goto cleanup; + + if (virPidFileReadPath(pidfile, &pid) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to start dummy cgroup helper")); + goto cleanup; + } + + if (virCgroupAddMachineProcess(group, pid) < 0) + goto cleanup; + + ret = pid; + cleanup: + if (ret < 0) { + virCommandAbort(cmd); + if (pid > 0) + virProcessKillPainfully(pid, true); + } + if (pidfile) + unlink(pidfile); + virCommandFree(cmd); + return ret; +}
I think this is really horrible to be honest. If there's really no other option, then I'd prefer that we do not use BPF at all on known broken kernel versions until it is fixed. Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

On Thu, Jan 24, 2019 at 11:06:51AM +0000, Daniel P. Berrangé wrote:
On Mon, Jan 14, 2019 at 04:47:50PM +0100, Pavel Hrdina wrote:
If some program is attached to cgroup and that cgroup is removed before detaching the program it might not be freed and will remain in the system until it's rebooted.
This would not be that big deal to workaround if we wouldn't use machined to track our guests. Systemd tries to be the nice guy and if there is no process attached to TransientUnit it will destroy the unit and remove the cgroup from system which will hit the kernel bug.
In order to prevent this behavior of systemd we can move another process into the guest cgroup and the cgroup will remain active even if we kill qemu process while destroying guest. We can then detach our BPF program from that cgroup and call machined terminate API to cleanup the cgroup.
The kernel bugzilla says that "bpftool" still reports existance of the BPF program when this situation hits. So can't we just use bpftool to explicitly delete this orphaned program
Unfortunately no, bpftool doesn't have any way how to delete any programs.
+static pid_t +virCgroupV2DevicesStartDummyProc(virCgroupPtr group) +{ + pid_t ret = -1; + pid_t pid = -1; + virCommandPtr cmd = NULL; + VIR_AUTOFREE(char *) pidfile = NULL; + VIR_AUTOFREE(char *) fileName = NULL; + + if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0) + return -1; + + pidfile = virPidFileBuildPath(group->stateDir, fileName); + if (!pidfile) + return -1; + + cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null", + "-s", "3600", NULL); + if (!cmd) + return -1; + + virCommandDaemonize(cmd); + virCommandSetPidFile(cmd, pidfile); + + if (virCommandRun(cmd, NULL) < 0) + goto cleanup; + + if (virPidFileReadPath(pidfile, &pid) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to start dummy cgroup helper")); + goto cleanup; + } + + if (virCgroupAddMachineProcess(group, pid) < 0) + goto cleanup; + + ret = pid; + cleanup: + if (ret < 0) { + virCommandAbort(cmd); + if (pid > 0) + virProcessKillPainfully(pid, true); + } + if (pidfile) + unlink(pidfile); + virCommandFree(cmd); + return ret; +}
I think this is really horrible to be honest.
If there's really no other option, then I'd prefer that we do not use BPF at all on known broken kernel versions until it is fixed.
I agree with you and Michal, it's ugly and most likely the only solution with broken kernel and systemd. I wrote the patch to show how we could workaround the kernel bug and I was hoping that we will agree on not using it. Another possible workaround would be to not use machined and create the systemd unit by other APIs, that way systemd would not remove the cgroup and we would be able to detach programs before removing cgroups. Or just simply wait until kernel fixes the issue. Pavel

We need to mock virCgroupV2DevicesAvailable() in order to remove any dependency on kernel as BPF devices might not be available. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2devices.h | 5 ++++- tests/vircgroupdata/hybrid.parsed | 2 +- tests/vircgroupmock.c | 7 +++++++ tests/vircgrouptest.c | 4 ++-- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index d717c9755f..5fc35942ef 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -21,10 +21,13 @@ # include <sys/types.h> +# include "internal.h" + # include "vircgroup.h" bool -virCgroupV2DevicesAvailable(virCgroupPtr group); +virCgroupV2DevicesAvailable(virCgroupPtr group) + ATTRIBUTE_NOINLINE; int virCgroupV2DevicesAttachProg(virCgroupPtr group, diff --git a/tests/vircgroupdata/hybrid.parsed b/tests/vircgroupdata/hybrid.parsed index 7600de5f45..f755eed465 100644 --- a/tests/vircgroupdata/hybrid.parsed +++ b/tests/vircgroupdata/hybrid.parsed @@ -2,7 +2,7 @@ cpu <null> cpuacct <null> cpuset /not/really/sys/fs/cgroup/cpuset memory <null> -devices /not/really/sys/fs/cgroup/devices +devices <null> freezer /not/really/sys/fs/cgroup/freezer blkio <null> net_cls /not/really/sys/fs/cgroup/net_cls diff --git a/tests/vircgroupmock.c b/tests/vircgroupmock.c index 06bd0a5f29..97f98e91ee 100644 --- a/tests/vircgroupmock.c +++ b/tests/vircgroupmock.c @@ -34,6 +34,7 @@ # include "testutilslxc.h" # include "virstring.h" # include "virfile.h" +# include "vircgroupv2devices.h" static int (*real_open)(const char *path, int flags, ...); static FILE *(*real_fopen)(const char *path, const char *mode); @@ -701,6 +702,12 @@ int open(const char *path, int flags, ...) free(newpath); return ret; } + +bool +virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + return true; +} #else /* Nothing to override on non-__linux__ platforms */ #endif diff --git a/tests/vircgrouptest.c b/tests/vircgrouptest.c index 20f4c57b04..4c1f53d924 100644 --- a/tests/vircgrouptest.c +++ b/tests/vircgrouptest.c @@ -587,6 +587,7 @@ static int testCgroupNewForSelfUnified(const void *args ATTRIBUTE_UNUSED) (1 << VIR_CGROUP_CONTROLLER_CPU) | (1 << VIR_CGROUP_CONTROLLER_CPUACCT) | (1 << VIR_CGROUP_CONTROLLER_MEMORY) | + (1 << VIR_CGROUP_CONTROLLER_DEVICES) | (1 << VIR_CGROUP_CONTROLLER_BLKIO); if (virCgroupNewSelf(&cgroup) < 0) { @@ -609,14 +610,12 @@ static int testCgroupNewForSelfHybrid(const void *args ATTRIBUTE_UNUSED) const char *empty[VIR_CGROUP_CONTROLLER_LAST] = { 0 }; const char *mounts[VIR_CGROUP_CONTROLLER_LAST] = { [VIR_CGROUP_CONTROLLER_CPUSET] = "/not/really/sys/fs/cgroup/cpuset", - [VIR_CGROUP_CONTROLLER_DEVICES] = "/not/really/sys/fs/cgroup/devices", [VIR_CGROUP_CONTROLLER_FREEZER] = "/not/really/sys/fs/cgroup/freezer", [VIR_CGROUP_CONTROLLER_NET_CLS] = "/not/really/sys/fs/cgroup/net_cls", [VIR_CGROUP_CONTROLLER_PERF_EVENT] = "/not/really/sys/fs/cgroup/perf_event", }; const char *placement[VIR_CGROUP_CONTROLLER_LAST] = { [VIR_CGROUP_CONTROLLER_CPUSET] = "/", - [VIR_CGROUP_CONTROLLER_DEVICES] = "/", [VIR_CGROUP_CONTROLLER_FREEZER] = "/", [VIR_CGROUP_CONTROLLER_NET_CLS] = "/", [VIR_CGROUP_CONTROLLER_PERF_EVENT] = "/", @@ -625,6 +624,7 @@ static int testCgroupNewForSelfHybrid(const void *args ATTRIBUTE_UNUSED) (1 << VIR_CGROUP_CONTROLLER_CPU) | (1 << VIR_CGROUP_CONTROLLER_CPUACCT) | (1 << VIR_CGROUP_CONTROLLER_MEMORY) | + (1 << VIR_CGROUP_CONTROLLER_DEVICES) | (1 << VIR_CGROUP_CONTROLLER_BLKIO); if (virCgroupNewSelf(&cgroup) < 0) { -- 2.20.1
participants (3)
-
Daniel P. Berrangé
-
Michal Privoznik
-
Pavel Hrdina