[libvirt] [PATCH v3 00/15] implement cgroups v2 support

In cgroups v2 there is no devices controller, eBPF should be used instead. Changes in v3: - removed workaround for kernel bug [1] - added documentation how to get the eBPF program Changes in v2: - fixed build on bsd and older kernels without cgroup BPF - cgroup bpf devices code moved to separate file Documentation for eBPF: <http://man7.org/linux/man-pages/man2/bpf.2.html> <https://www.kernel.org/doc/Documentation/networking/filter.txt> <https://docs.cilium.io/en/v1.3/bpf/> [1] <https://bugzilla.redhat.com/show_bug.cgi?id=1656432> Pavel Hrdina (15): util: introduce virbpf helpers vircgroup: introduce virCgroupV2DevicesAvailable vircgroup: introduce virCgroupV2DevicesAttachProg vircgroup: introduce virCgroupV2DevicesDetectProg vircgroup: introduce virCgroupV2DevicesCreateProg vircgroup: introduce virCgroupV2DevicesPrepareProg vircgroup: introduce virCgroupV2DevicesRemoveProg vircgroup: introduce virCgroupV2DeviceGetPerms vircgroup: introduce virCgroupV2DevicesGetKey vircgroup: introduce virCgroupV2AllowDevice vircgroup: introduce virCgroupV2DenyDevice vircgroup: introduce virCgroupV2AllowAllDevices vircgroup: introduce virCgroupV2DenyAllDevices vircgroup: workaround devices in hybrid mode vircgroupmock: mock virCgroupV2DevicesAvailable configure.ac | 6 + include/libvirt/virterror.h | 2 + src/Makefile.am | 2 + src/libvirt_private.syms | 26 ++ src/util/Makefile.inc.am | 4 + src/util/virbpf.c | 438 +++++++++++++++++++ src/util/virbpf.h | 259 ++++++++++++ src/util/vircgroup.c | 3 +- src/util/vircgroupbackend.h | 3 +- src/util/vircgrouppriv.h | 10 + src/util/vircgroupv1.c | 9 +- src/util/vircgroupv2.c | 117 +++++- src/util/vircgroupv2devices.c | 670 ++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 57 +++ src/util/virerror.c | 2 + tests/vircgroupdata/hybrid.parsed | 2 +- tests/vircgroupmock.c | 7 + tests/vircgrouptest.c | 4 +- 18 files changed, 1613 insertions(+), 8 deletions(-) create mode 100644 src/util/virbpf.c create mode 100644 src/util/virbpf.h create mode 100644 src/util/vircgroupv2devices.c create mode 100644 src/util/vircgroupv2devices.h -- 2.20.1

In order to implement devices controller with cgroup v2 we need to add support for BPF programs, cgroup v2 doesn't have devices controller. This introduces required helpers wrapping linux syscalls. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- configure.ac | 5 + include/libvirt/virterror.h | 2 + src/libvirt_private.syms | 16 ++ src/util/Makefile.inc.am | 2 + src/util/virbpf.c | 438 ++++++++++++++++++++++++++++++++++++ src/util/virbpf.h | 259 +++++++++++++++++++++ src/util/virerror.c | 2 + 7 files changed, 724 insertions(+) create mode 100644 src/util/virbpf.c create mode 100644 src/util/virbpf.h diff --git a/configure.ac b/configure.ac index dcd78f64bf..fbbc88303a 100644 --- a/configure.ac +++ b/configure.ac @@ -876,6 +876,11 @@ AC_CHECK_DECLS([clock_serv_t, host_get_clock_service, clock_get_time], #include <mach/mach.h> ]) +# Check if we have new enough kernel to support BPF devices for cgroups v2 +if test "$with_linux" = "yes"; then + AC_CHECK_DECLS([BPF_PROG_QUERY], [], [], [#include <linux/bpf.h>]) +fi + # Check if we need to look for ifconfig if test "$want_ifconfig" = "yes"; then AC_PATH_PROG([IFCONFIG_PATH], [ifconfig]) diff --git a/include/libvirt/virterror.h b/include/libvirt/virterror.h index 6dc83a17cc..f1fb9d4721 100644 --- a/include/libvirt/virterror.h +++ b/include/libvirt/virterror.h @@ -134,6 +134,8 @@ typedef enum { VIR_FROM_FIREWALLD = 68, /* Error from firewalld */ VIR_FROM_DOMAIN_CHECKPOINT = 69, /* Error from domain checkpoint */ + VIR_FROM_BPF = 70, /* Error from BPF code */ + # ifdef VIR_ENUM_SENTINELS VIR_ERR_DOMAIN_LAST # endif diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index a03cf0b645..56db5d92cd 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1519,6 +1519,22 @@ virBitmapToDataBuf; virBitmapToString; +# util/virbpf.h +virBPFAttachProg; +virBPFCreateMap; +virBPFDeleteElem; +virBPFDetachProg; +virBPFGetMap; +virBPFGetMapInfo; +virBPFGetNextElem; +virBPFGetProg; +virBPFGetProgInfo; +virBPFLoadProg; +virBPFLookupElem; +virBPFQueryProg; +virBPFUpdateElem; + + # util/virbuffer.h virBufferAdd; virBufferAddBuffer; diff --git a/src/util/Makefile.inc.am b/src/util/Makefile.inc.am index c757f5a6ae..0c2ee03c2f 100644 --- a/src/util/Makefile.inc.am +++ b/src/util/Makefile.inc.am @@ -20,6 +20,8 @@ UTIL_SOURCES = \ util/virautoclean.h \ util/virbitmap.c \ util/virbitmap.h \ + util/virbpf.c \ + util/virbpf.h \ util/virbuffer.c \ util/virbuffer.h \ util/virperf.c \ diff --git a/src/util/virbpf.c b/src/util/virbpf.c new file mode 100644 index 0000000000..a79a97d578 --- /dev/null +++ b/src/util/virbpf.c @@ -0,0 +1,438 @@ +/* + * virbpf.c: methods for eBPF + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ +#include <config.h> + +#include <sys/syscall.h> + +#include "internal.h" + +#include "viralloc.h" +#include "virbpf.h" +#include "virerror.h" +#include "virfile.h" +#include "virlog.h" +#include "virstring.h" + +VIR_LOG_INIT("util.bpf"); + +#define VIR_FROM_THIS VIR_FROM_BPF + +#if HAVE_DECL_BPF_PROG_QUERY +int +virBPFCreateMap(unsigned int mapType, + unsigned int keySize, + unsigned int valSize, + unsigned int maxEntries) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_type = mapType; + attr.key_size = keySize; + attr.value_size = valSize; + attr.max_entries = maxEntries; + + return syscall(SYS_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); +} + + +# define LOG_BUF_SIZE (256 * 1024) + +int +virBPFLoadProg(struct bpf_insn *insns, + int progType, + unsigned int insnCnt) +{ + VIR_AUTOFREE(char *) logbuf = NULL; + int progfd = -1; + union bpf_attr attr; + + if (VIR_ALLOC_N(logbuf, LOG_BUF_SIZE) < 0) + return -1; + + memset(&attr, 0, sizeof(attr)); + + attr.prog_type = progType; + attr.insn_cnt = (uint32_t)insnCnt; + attr.insns = (uint64_t)insns; + attr.license = (uint64_t)"GPL"; + attr.log_buf = (uint64_t)logbuf; + attr.log_size = LOG_BUF_SIZE; + attr.log_level = 1; + + progfd = syscall(SYS_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + + if (progfd < 0) + VIR_DEBUG("%s", logbuf); + + return progfd; +} + + +int +virBPFAttachProg(int progfd, + int targetfd, + int attachType) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.target_fd = targetfd; + attr.attach_bpf_fd = progfd; + attr.attach_type = attachType; + + return syscall(SYS_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr)); +} + + +int +virBPFDetachProg(int progfd, + int targetfd, + int attachType) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.target_fd = targetfd; + attr.attach_bpf_fd = progfd; + attr.attach_type = attachType; + + return syscall(SYS_bpf, BPF_PROG_DETACH, &attr, sizeof(attr)); +} + + +int +virBPFQueryProg(int targetfd, + unsigned int maxprogids, + int attachType, + unsigned int *progcnt, + void *progids) +{ + union bpf_attr attr; + int rc; + + memset(&attr, 0, sizeof(attr)); + + attr.query.target_fd = targetfd; + attr.query.attach_type = attachType; + attr.query.prog_cnt = maxprogids; + attr.query.prog_ids = (uint64_t)progids; + + rc = syscall(SYS_bpf, BPF_PROG_QUERY, &attr, sizeof(attr)); + + if (rc >= 0) + *progcnt = attr.query.prog_cnt; + + return rc; +} + + +int +virBPFGetProg(unsigned int id) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.prog_id = id; + + return syscall(SYS_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr)); +} + + +int +virBPFGetProgInfo(int progfd, + struct bpf_prog_info *info, + unsigned int **mapIDs) +{ + union bpf_attr attr; + int rc; + + memset(&attr, 0, sizeof(attr)); + + attr.info.bpf_fd = progfd; + attr.info.info_len = sizeof(struct bpf_prog_info); + attr.info.info = (uint64_t)info; + + rc = syscall(SYS_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); + if (rc < 0) + return rc; + + if (mapIDs && info->nr_map_ids > 0) { + unsigned int maplen = info->nr_map_ids; + VIR_AUTOFREE(unsigned int *) retmapIDs = NULL; + + if (VIR_ALLOC_N(retmapIDs, maplen) < 0) + return -1; + + memset(info, 0, sizeof(struct bpf_prog_info)); + info->nr_map_ids = maplen; + info->map_ids = (uint64_t)retmapIDs; + + memset(&attr, 0, sizeof(attr)); + attr.info.bpf_fd = progfd; + attr.info.info_len = sizeof(struct bpf_prog_info); + attr.info.info = (uint64_t)info; + + rc = syscall(SYS_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); + if (rc < 0) + return rc; + + VIR_STEAL_PTR(*mapIDs, retmapIDs); + } + + return rc; +} + + +int +virBPFGetMap(unsigned int id) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_id = id; + + return syscall(SYS_bpf, BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr)); +} + + +int +virBPFGetMapInfo(int mapfd, + struct bpf_map_info *info) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.info.bpf_fd = mapfd; + attr.info.info_len = sizeof(struct bpf_map_info); + attr.info.info = (uint64_t)info; + + return syscall(SYS_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); +} + + +int +virBPFLookupElem(int mapfd, + void *key, + void *val) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = mapfd; + attr.key = (uint64_t)key; + attr.value = (uint64_t)val; + + return syscall(SYS_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); +} + + +int +virBPFGetNextElem(int mapfd, + void *key, + void *nextKey) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = mapfd; + attr.key = (uint64_t)key; + attr.next_key = (uint64_t)nextKey; + + return syscall(SYS_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); +} + + +int +virBPFUpdateElem(int mapfd, + void *key, + void *val) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = mapfd; + attr.key = (uint64_t)key; + attr.value = (uint64_t)val; + + return syscall(SYS_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); +} + + +int +virBPFDeleteElem(int mapfd, + void *key) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = mapfd; + attr.key = (uint64_t)key; + + return syscall(SYS_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); +} +#else /* HAVE_DECL_BPF_PROG_QUERY */ +int +virBPFCreateMap(unsigned int mapType ATTRIBUTE_UNUSED, + unsigned int keySize ATTRIBUTE_UNUSED, + unsigned int valSize ATTRIBUTE_UNUSED, + unsigned int maxEntries ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFLoadProg(struct bpf_insn *insns ATTRIBUTE_UNUSED, + int progType ATTRIBUTE_UNUSED, + unsigned int insnCnt ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFAttachProg(int progfd ATTRIBUTE_UNUSED, + int targetfd ATTRIBUTE_UNUSED, + int attachType ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFDetachProg(int progfd ATTRIBUTE_UNUSED, + int targetfd ATTRIBUTE_UNUSED, + int attachType ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFQueryProg(int targetfd ATTRIBUTE_UNUSED, + unsigned int maxprogids ATTRIBUTE_UNUSED, + int attachType ATTRIBUTE_UNUSED, + unsigned int *progcnt ATTRIBUTE_UNUSED, + void *progids ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetProg(unsigned int id ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetProgInfo(int progfd ATTRIBUTE_UNUSED, + struct bpf_prog_info *info ATTRIBUTE_UNUSED, + unsigned int **mapIDs ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetMap(unsigned int id ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetMapInfo(int mapfd ATTRIBUTE_UNUSED, + struct bpf_map_info *info ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFLookupElem(int mapfd ATTRIBUTE_UNUSED, + void *key ATTRIBUTE_UNUSED, + void *val ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetNextElem(int mapfd ATTRIBUTE_UNUSED, + void *key ATTRIBUTE_UNUSED, + void *nextKey ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFUpdateElem(int mapfd ATTRIBUTE_UNUSED, + void *key ATTRIBUTE_UNUSED, + void *val ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFDeleteElem(int mapfd ATTRIBUTE_UNUSED, + void *key ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} +#endif /* HAVE_DECL_BPF_PROG_QUERY */ diff --git a/src/util/virbpf.h b/src/util/virbpf.h new file mode 100644 index 0000000000..b5874e1e8d --- /dev/null +++ b/src/util/virbpf.h @@ -0,0 +1,259 @@ +/* + * virbpf.h: methods for eBPF + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBVIRT_VIRBPF_H +# define LIBVIRT_VIRBPF_H + +# if HAVE_DECL_BPF_PROG_QUERY + +# include <linux/bpf.h> + +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ + +# define VIR_BPF_ALU64_REG(op, dst, src) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(op) | BPF_X, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = 0, \ + .imm = 0, \ + }) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ + +# define VIR_BPF_ALU64_IMM(op, dst, immval) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(op) | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = 0, \ + .imm = immval, \ + }) + +/* mov of registers, dst_reg = src_reg */ + +# define VIR_BPF_MOV64_REG(dst, src) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = 0, \ + .imm = 0, \ + }) + +/* mov of immediates, dst_reg = imm32 */ + +# define VIR_BPF_MOV64_IMM(dst, immval) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = 0, \ + .imm = immval, \ + }) + +/* helper to encode 16 byte instruction */ + +# define _VIR_BPF_LD_IMM64_RAW(dst, src, immval) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = 0, \ + .imm = (uint32_t)immval, \ + }), \ + ((struct bpf_insn) { \ + .code = 0, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((uint64_t)immval) >> 32, \ + }) + +/* encodes single 'load 64-bit immediate' insn, dst_reg = imm ll */ + +# define VIR_BPF_LD_IMM64(dst, imm) \ + _VIR_BPF_LD_IMM64_RAW(dst, 0, imm) + +/* pseudo VIR_BPF_LD_IMM64 insn used to refer to process-local map_fd */ + +# define VIR_BPF_LD_MAP_FD(dst, mapfd) \ + _VIR_BPF_LD_IMM64_RAW(dst, 1, mapfd) + +/* memory load, dst_reg = *(size *) (src_reg + off16) */ + +# define VIR_BPF_LDX_MEM(size, dst, src, offval) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(size) | BPF_MEM, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = offval, \ + .imm = 0, \ + }) + +/* memory store of registers, *(size *) (dst_reg + off16) = src_reg */ + +# define VIR_BPF_STX_MEM(size, dst, src, offval) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(size) | BPF_MEM, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = offval, \ + .imm = 0, \ + }) + +/* memory store of immediates, *(size *) (dst_reg + off16) = imm32 */ + +# define VIR_BPF_ST_MEM(size, dst, immval, offval) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(size) | BPF_MEM, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = offval, \ + .imm = immval, \ + }) + +/* conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ + +# define VIR_BPF_JMP_REG(op, dst, src, offval) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(op) | BPF_X, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = offval, \ + .imm = 0, \ + }) + +/* conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ + +# define VIR_BPF_JMP_IMM(op, dst, immval, offval) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(op) | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = offval, \ + .imm = immval, \ + }) + +/* call eBPF function, call imm32 */ + +# define VIR_BPF_CALL_INSN(func) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = func, \ + }) + +/* program exit */ + +# define VIR_BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0, \ + }) + +# else /* HAVE_DECL_BPF_PROG_QUERY */ + +struct bpf_prog_info; +struct bpf_map_info; +struct bpf_insn; + +# define VIR_BPF_ALU64_REG(op, dst, src) +# define VIR_BPF_ALU64_IMM(op, dst, immval) +# define VIR_BPF_MOV64_REG(dst, src) +# define VIR_BPF_MOV64_IMM(dst, immval) +# define VIR_BPF_LD_IMM64(dst, imm) +# define VIR_BPF_LD_MAP_FD(dst, mapfd) +# define VIR_BPF_LDX_MEM(size, dst, src, offval) +# define VIR_BPF_STX_MEM(size, dst, src, offval) +# define VIR_BPF_ST_MEM(size, dst, immval, offval) +# define VIR_BPF_JMP_REG(op, dst, src, offval) +# define VIR_BPF_JMP_IMM(op, dst, immval, offval) +# define VIR_BPF_CALL_INSN(func) +# define VIR_BPF_EXIT_INSN() + +# endif /* HAVE_DECL_BPF_PROG_QUERY */ + +int +virBPFCreateMap(unsigned int mapType, + unsigned int keySize, + unsigned int valSize, + unsigned int maxEntries); + +int +virBPFGetMapInfo(int mapfd, + struct bpf_map_info *info); + +int +virBPFLoadProg(struct bpf_insn *insns, + int progType, + unsigned int insnCnt); + +int +virBPFAttachProg(int progfd, + int targetfd, + int attachType); + +int +virBPFDetachProg(int progfd, + int targetfd, + int attachType); + +int +virBPFQueryProg(int targetfd, + unsigned int maxprogids, + int attachType, + unsigned int *progcnt, + void *progids); + +int +virBPFGetProg(unsigned int id); + +int +virBPFGetProgInfo(int progfd, + struct bpf_prog_info *info, + unsigned int **mapIDs); + +int +virBPFGetMap(unsigned int id); + +int +virBPFLookupElem(int mapfd, + void *key, + void *val); + +int +virBPFGetNextElem(int mapfd, + void *key, + void *nextKey); + +int +virBPFUpdateElem(int mapfd, + void *key, + void *val); + +int +virBPFDeleteElem(int mapfd, + void *key); + +#endif /* LIBVIRT_VIRBPF_H */ diff --git a/src/util/virerror.c b/src/util/virerror.c index 37b5b2f3f9..f8944698d7 100644 --- a/src/util/virerror.c +++ b/src/util/virerror.c @@ -142,6 +142,8 @@ VIR_ENUM_IMPL(virErrorDomain, "Resource control", "FirewallD", "Domain Checkpoint", + + "BPF", /* 70 */ ); -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:18AM +0200, Pavel Hrdina wrote:
In order to implement devices controller with cgroup v2 we need to add support for BPF programs, cgroup v2 doesn't have devices controller.
This introduces required helpers wrapping linux syscalls.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- configure.ac | 5 + include/libvirt/virterror.h | 2 + src/libvirt_private.syms | 16 ++ src/util/Makefile.inc.am | 2 + src/util/virbpf.c | 438 ++++++++++++++++++++++++++++++++++++ src/util/virbpf.h | 259 +++++++++++++++++++++ src/util/virerror.c | 2 + 7 files changed, 724 insertions(+) create mode 100644 src/util/virbpf.c create mode 100644 src/util/virbpf.h
diff --git a/src/util/virbpf.h b/src/util/virbpf.h new file mode 100644 index 0000000000..b5874e1e8d --- /dev/null +++ b/src/util/virbpf.h @@ -0,0 +1,259 @@ +/* + * virbpf.h: methods for eBPF + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ +
+#ifndef LIBVIRT_VIRBPF_H +# define LIBVIRT_VIRBPF_H
#pragma once is the new avocado toast.
+ +# if HAVE_DECL_BPF_PROG_QUERY + +# include <linux/bpf.h> + +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ + +# define VIR_BPF_ALU64_REG(op, dst, src) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(op) | BPF_X, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = 0, \ + .imm = 0, \ + }) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ + +# define VIR_BPF_ALU64_IMM(op, dst, immval) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(op) | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = 0, \ + .imm = immval, \ + }) + +/* mov of registers, dst_reg = src_reg */ + +# define VIR_BPF_MOV64_REG(dst, src) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = 0, \ + .imm = 0, \ + }) + +/* mov of immediates, dst_reg = imm32 */ + +# define VIR_BPF_MOV64_IMM(dst, immval) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = 0, \ + .imm = immval, \ + }) + +/* helper to encode 16 byte instruction */ + +# define _VIR_BPF_LD_IMM64_RAW(dst, src, immval) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = 0, \ + .imm = (uint32_t)immval, \ + }), \ + ((struct bpf_insn) { \ + .code = 0, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((uint64_t)immval) >> 32, \ + }) +
Trust, yada, yada.. Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

There is no exact way how to figure out whether BPF devices support is compiled into kernel. One way is to check kernel configure options but this is not reliable as it may not be available. Let's try to do syscall to which will list BPF cgroup device programs. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- configure.ac | 3 +- src/Makefile.am | 2 + src/libvirt_private.syms | 3 ++ src/util/Makefile.inc.am | 2 + src/util/vircgroupv2.c | 7 +++- src/util/vircgroupv2devices.c | 73 +++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 27 +++++++++++++ 7 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 src/util/vircgroupv2devices.c create mode 100644 src/util/vircgroupv2devices.h diff --git a/configure.ac b/configure.ac index fbbc88303a..67c87c1042 100644 --- a/configure.ac +++ b/configure.ac @@ -878,7 +878,8 @@ AC_CHECK_DECLS([clock_serv_t, host_get_clock_service, clock_get_time], # Check if we have new enough kernel to support BPF devices for cgroups v2 if test "$with_linux" = "yes"; then - AC_CHECK_DECLS([BPF_PROG_QUERY], [], [], [#include <linux/bpf.h>]) + AC_CHECK_DECLS([BPF_PROG_QUERY, BPF_CGROUP_DEVICE], + [], [], [#include <linux/bpf.h>]) fi # Check if we need to look for ifconfig diff --git a/src/Makefile.am b/src/Makefile.am index 7d452a9490..a65125bac6 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -663,11 +663,13 @@ libvirt_setuid_rpc_client_la_SOURCES = \ util/viratomic.h \ util/virautoclean.h \ util/virbitmap.c \ + util/virbpf.c \ util/virbuffer.c \ util/vircgroup.c \ util/vircgroupbackend.c \ util/vircgroupv1.c \ util/vircgroupv2.c \ + util/vircgroupv2devices.c \ util/vircommand.c \ util/virconf.c \ util/virdbus.c \ diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 56db5d92cd..9eac05009c 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1652,6 +1652,9 @@ virCgroupV1Register; # util/vircgroupv2.h virCgroupV2Register; +# util/vircgroupv2devices.h +virCgroupV2DevicesAvailable; + # util/virclosecallbacks.h virCloseCallbacksGet; virCloseCallbacksGetConn; diff --git a/src/util/Makefile.inc.am b/src/util/Makefile.inc.am index 0c2ee03c2f..5485fe621e 100644 --- a/src/util/Makefile.inc.am +++ b/src/util/Makefile.inc.am @@ -34,6 +34,8 @@ UTIL_SOURCES = \ util/vircgroupv1.h \ util/vircgroupv2.c \ util/vircgroupv2.h \ + util/vircgroupv2devices.c \ + util/vircgroupv2devices.h \ util/virclosecallbacks.c \ util/virclosecallbacks.h \ util/vircommand.c \ diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index 0cfbc96264..dc7573e05c 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -33,6 +33,7 @@ #include "vircgroup.h" #include "vircgroupbackend.h" #include "vircgroupv2.h" +#include "vircgroupv2devices.h" #include "virerror.h" #include "virfile.h" #include "virlog.h" @@ -295,6 +296,8 @@ virCgroupV2DetectControllers(virCgroupPtr group, /* In cgroup v2 there is no cpuacct controller, the cpu.stat file always * exists with usage stats. */ group->unified.controllers |= 1 << VIR_CGROUP_CONTROLLER_CPUACCT; + if (virCgroupV2DevicesAvailable(group)) + group->unified.controllers |= 1 << VIR_CGROUP_CONTROLLER_DEVICES; for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) VIR_DEBUG("Controller '%s' present=%s", @@ -415,8 +418,10 @@ virCgroupV2MakeGroup(virCgroupPtr parent ATTRIBUTE_UNUSED, continue; /* Controllers that are implicitly enabled if available. */ - if (i == VIR_CGROUP_CONTROLLER_CPUACCT) + if (i == VIR_CGROUP_CONTROLLER_CPUACCT || + i == VIR_CGROUP_CONTROLLER_DEVICES) { continue; + } if (virCgroupV2EnableController(parent, i) < 0) return -1; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c new file mode 100644 index 0000000000..10080d4fff --- /dev/null +++ b/src/util/vircgroupv2devices.c @@ -0,0 +1,73 @@ +/* + * vircgroupv2devices.c: methods for cgroups v2 BPF devices + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ +#include <config.h> + +#if HAVE_DECL_BPF_CGROUP_DEVICE +# include <fcntl.h> +# include <linux/bpf.h> +# include <sys/stat.h> +# include <sys/syscall.h> +# include <sys/types.h> +#endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ + +#include "internal.h" + +#define LIBVIRT_VIRCGROUPPRIV_H_ALLOW +#include "vircgrouppriv.h" + +#include "virbpf.h" +#include "vircgroup.h" +#include "vircgroupv2devices.h" +#include "virfile.h" +#include "virlog.h" + +VIR_LOG_INIT("util.cgroup"); + +#define VIR_FROM_THIS VIR_FROM_CGROUP + +#if HAVE_DECL_BPF_CGROUP_DEVICE +bool +virCgroupV2DevicesAvailable(virCgroupPtr group) +{ + bool ret = false; + int cgroupfd = -1; + unsigned int progCnt = 0; + + cgroupfd = open(group->unified.mountPoint, O_RDONLY); + if (cgroupfd < 0) { + VIR_DEBUG("failed to open cgroup '%s'", group->unified.mountPoint); + goto cleanup; + } + + if (virBPFQueryProg(cgroupfd, 0, BPF_CGROUP_DEVICE, &progCnt, NULL) < 0) { + VIR_DEBUG("failed to query cgroup progs"); + goto cleanup; + } + + ret = true; + cleanup: + VIR_FORCE_CLOSE(cgroupfd); + return ret; +} +#else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ +bool +virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + return false; +} +#endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h new file mode 100644 index 0000000000..2ab35681db --- /dev/null +++ b/src/util/vircgroupv2devices.h @@ -0,0 +1,27 @@ +/* + * vircgroupv2devices.h: methods for cgroups v2 BPF devices + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBVIRT_VIRCGROUPV2DEVICES_H +# define LIBVIRT_VIRCGROUPV2DEVICES_H + +# include "vircgroup.h" + +bool +virCgroupV2DevicesAvailable(virCgroupPtr group); + +#endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:19AM +0200, Pavel Hrdina wrote:
There is no exact way how to figure out whether BPF devices support is compiled into kernel. One way is to check kernel configure options but this is not reliable as it may not be available. Let's try to do syscall to which will list BPF cgroup device programs.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- configure.ac | 3 +- src/Makefile.am | 2 + src/libvirt_private.syms | 3 ++ src/util/Makefile.inc.am | 2 + src/util/vircgroupv2.c | 7 +++- src/util/vircgroupv2devices.c | 73 +++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 27 +++++++++++++ 7 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 src/util/vircgroupv2devices.c create mode 100644 src/util/vircgroupv2devices.h
new file mode 100644 index 0000000000..10080d4fff --- /dev/null +++ b/src/util/vircgroupv2devices.c @@ -0,0 +1,73 @@ +/* + * vircgroupv2devices.c: methods for cgroups v2 BPF devices + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ +#include <config.h> + +#if HAVE_DECL_BPF_CGROUP_DEVICE +# include <fcntl.h> +# include <linux/bpf.h> +# include <sys/stat.h> +# include <sys/syscall.h> +# include <sys/types.h> +#endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ + +#include "internal.h" + +#define LIBVIRT_VIRCGROUPPRIV_H_ALLOW +#include "vircgrouppriv.h" + +#include "virbpf.h" +#include "vircgroup.h" +#include "vircgroupv2devices.h" +#include "virfile.h" +#include "virlog.h" + +VIR_LOG_INIT("util.cgroup"); + +#define VIR_FROM_THIS VIR_FROM_CGROUP + +#if HAVE_DECL_BPF_CGROUP_DEVICE +bool +virCgroupV2DevicesAvailable(virCgroupPtr group) +{ + bool ret = false; + int cgroupfd = -1;
VIR_AUTOCLOSE
+ unsigned int progCnt = 0; + + cgroupfd = open(group->unified.mountPoint, O_RDONLY); + if (cgroupfd < 0) { + VIR_DEBUG("failed to open cgroup '%s'", group->unified.mountPoint); + goto cleanup; + } + + if (virBPFQueryProg(cgroupfd, 0, BPF_CGROUP_DEVICE, &progCnt, NULL) < 0) { + VIR_DEBUG("failed to query cgroup progs"); + goto cleanup; + } + + ret = true; + cleanup: + VIR_FORCE_CLOSE(cgroupfd); + return ret; +} +#else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ +bool +virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + return false; +} +#endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h new file mode 100644 index 0000000000..2ab35681db --- /dev/null +++ b/src/util/vircgroupv2devices.h @@ -0,0 +1,27 @@ +/* + * vircgroupv2devices.h: methods for cgroups v2 BPF devices + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ +
#pragma once Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel. There are two possible ways how to create BPF program: - One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils. - The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel. Since the program is not too complex we can use the second option. If there is no program, all devices are allowed, if there is some program it is executed and based on the exit status the access is denied for 0 and allowed for 1. Our program will follow these rules: - first it will try to look for the specific key using major and minor to see if there is any rule for that specific device - if there is no specific rule it will try to look for any rule that matches only major of the device - if there is no match with major it will try the same but with minor of the device - as the last attempt it will try to look for rule for all devices and if there is no match it will return 0 to deny that access Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgrouppriv.h | 10 ++ src/util/vircgroupv2devices.c | 276 ++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 5 + 4 files changed, 292 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 9eac05009c..24a783840f 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1653,6 +1653,7 @@ virCgroupV1Register; virCgroupV2Register; # util/vircgroupv2devices.h +virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; # util/virclosecallbacks.h diff --git a/src/util/vircgrouppriv.h b/src/util/vircgrouppriv.h index 9110c77297..7eba4ade23 100644 --- a/src/util/vircgrouppriv.h +++ b/src/util/vircgrouppriv.h @@ -41,10 +41,20 @@ struct _virCgroupV1Controller { typedef struct _virCgroupV1Controller virCgroupV1Controller; typedef virCgroupV1Controller *virCgroupV1ControllerPtr; +struct _virCgroupV2Devices { + int mapfd; + int progfd; + ssize_t count; + ssize_t max; +}; +typedef struct _virCgroupV2Devices virCgroupV2Devices; +typedef virCgroupV2Devices *virCgroupV2DevicesPtr; + struct _virCgroupV2Controller { int controllers; char *mountPoint; char *placement; + virCgroupV2Devices devices; }; typedef struct _virCgroupV2Controller virCgroupV2Controller; typedef virCgroupV2Controller *virCgroupV2ControllerPtr; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index 10080d4fff..c8686e8768 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -30,6 +30,7 @@ #define LIBVIRT_VIRCGROUPPRIV_H_ALLOW #include "vircgrouppriv.h" +#include "viralloc.h" #include "virbpf.h" #include "vircgroup.h" #include "vircgroupv2devices.h" @@ -64,10 +65,285 @@ virCgroupV2DevicesAvailable(virCgroupPtr group) VIR_FORCE_CLOSE(cgroupfd); return ret; } + + +/* Steps to get assembly version of devices BPF program: + * + * Save the following program into bpfprog.c, compile it using clang: + * + * clang -O2 -Wall -target bpf -c bpfprog.c -o bpfprog.o + * + * Now you can use llvm-objdump to get the list if instructions: + * + * llvm-objdump -S -no-show-raw-insn bpfprog.o + * + * which can be converted into program using VIR_BPF_* macros. + * + * ---------------------------------------------------------------------------- + * #include <linux/bpf.h> + * #include <linux/version.h> + * + * #define SEC(NAME) __attribute__((section(NAME), used)) + * + * struct bpf_map_def { + * unsigned int type; + * unsigned int key_size; + * unsigned int value_size; + * unsigned int max_entries; + * unsigned int map_flags; + * unsigned int inner_map_idx; + * unsigned int numa_node; + * }; + * + * static void *(*bpf_map_lookup_elem)(void *map, void *key) = + * (void *) BPF_FUNC_map_lookup_elem; + * + * struct bpf_map_def SEC("maps") devices = { + * .type = BPF_MAP_TYPE_HASH, + * .key_size = sizeof(__u64), + * .value_size = sizeof(__u32), + * .max_entries = 65, + * }; + * + * SEC("cgroup/dev") int + * bpf_libvirt_cgroup_device(struct bpf_cgroup_dev_ctx *ctx) + * { + * __u64 key = ((__u64)ctx->major << 32) | ctx->minor; + * __u32 *val = 0; + * + * val = bpf_map_lookup_elem(&devices, &key); + * if (val && (ctx->access_type & *val) == ctx->access_type) + * return 1; + * + * key = ((__u64)ctx->major << 32) | 0xffffffff; + * val = bpf_map_lookup_elem(&devices, &key); + * if (val && (ctx->access_type & *val) == ctx->access_type) + * return 1; + * + * key = 0xffffffff00000000 | ctx->minor; + * val = bpf_map_lookup_elem(&devices, &key); + * if (val && (ctx->access_type & *val) == ctx->access_type) + * return 1; + * + * key = 0xffffffffffffffff; + * val = bpf_map_lookup_elem(&devices, &key); + * if (val && (ctx->access_type & *val) == ctx->access_type) + * return 1; + * + * return 0; + * } + * + * char _license[] SEC("license") = "GPL"; + * __u32 _version SEC("version") = LINUX_VERSION_CODE; + * ---------------------------------------------------------------------------- + * */ +static int +virCgroupV2DevicesLoadProg(int mapfd) +{ + struct bpf_insn prog[] = { + /* 0: r6 = r1 */ + VIR_BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + /* 1: r1 = *(u32 *)(r6 + 8) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 8), + /* 2: r2 = *(u32 *)(r6 + 4) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + /* 3: r2 <<= 32 */ + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* 4: r2 |= r1 */ + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + /* 5: *(u64 *)(r10 - 8) = r2 */ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), + /* 6: r2 = r10 */ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + /* 7: r2 += -8 */ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + /* 8: r1 = 0 ll */ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), + /* 10: call 1 */ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem), + /* 11: r1 = r0 */ + VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + /* 12: if r1 == 0 goto +5 <LBB0_2> */ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), + /* 13: r0 = 1 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), + /* 14: r2 = *(u32 *)(r6 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), + /* 15: r1 = *(u32 *)(r1 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), + /* 16: r1 &= r2 */ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + /* 17: if r1 == r2 goto +50 <LBB0_9> */ + VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 50), + /* LBB0_2: */ + /* 18: r1 = *(u32 *)(r6 + 4) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 4), + /* 19: r1 <<= 32 */ + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), + /* 20: r2 = 4294967295 ll */ + VIR_BPF_LD_IMM64(BPF_REG_2, 0xffffffff), + /* 22: r1 |= r2 */ + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2), + /* 23: *(u64 *)(r10 - 8) = r1 */ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + /* 24: r2 = r10 */ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + /* 25: r2 += -8 */ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + /* 26: r1 = 0 ll */ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), + /* 28: call 1 */ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem), + /* 29: r1 = r0 */ + VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + /* 30: if r1 == 0 goto +5 <LBB0_4> */ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), + /* 31: r0 = 1 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), + /* 32: r2 = *(u32 *)(r6 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), + /* 33: r1 = *(u32 *)(r1 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), + /* 34: r1 &= r2 */ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + /* 35: if r1 == r2 goto +32 <LBB0_9> */ + VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 32), + /* LBB0_4: */ + /* 36: r1 = *(u32 *)(r6 + 8) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 8), + /* 37: r2 = -4294967296 ll */ + VIR_BPF_LD_IMM64(BPF_REG_2, 0xffffffff00000000), + /* 39: r1 |= r2 */ + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2), + /* 40: *(u64 *)(r10 - 8) = r1 */ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + /* 41: r2 = r10 */ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + /* 42: r2 += -8 */ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + /* 43: r1 = 0 ll */ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), + /* 45: call 1 */ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem), + /* 46: r1 = r0 */ + VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + /* 47: if r1 == 0 goto +5 <LBB0_6> */ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), + /* 48: r0 = 1 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), + /* 49: r2 = *(u32 *)(r6 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), + /* 50: r1 = *(u32 *)(r1 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), + /* 51: r1 &= r2 */ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + /* 52: if r1 == r2 goto +15 <LBB0_9> */ + VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 15), + /* LBB0_6: */ + /* 53: r1 = -1 */ + VIR_BPF_MOV64_IMM(BPF_REG_1, -1), + /* 54: *(u64 *)(r10 - 8) = r1 */ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + /* 55: r2 = r10 */ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + /* 56: r2 += -8 */ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + /* 57: r1 = 0 ll */ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), + /* 59: call 1 */ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem), + /* 60: r1 = r0 */ + VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + /* 61: if r1 == 0 goto +5 <LBB0_8> */ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), + /* 62: r0 = 1 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), + /* 63: r2 = *(u32 *)(r6 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), + /* 64: r1 = *(u32 *)(r1 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), + /* 65: r1 &= r2 */ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + /* 66: if r1 == r2 goto +1 <LBB0_9> */ + VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), + /* LBB0_8: */ + /* 67: r0 = 0 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 0), + /* LBB0_9: */ + /* 68: exit */ + VIR_BPF_EXIT_INSN(), + }; + + return virBPFLoadProg(prog, BPF_PROG_TYPE_CGROUP_DEVICE, ARRAY_CARDINALITY(prog)); +} + + +int +virCgroupV2DevicesAttachProg(virCgroupPtr group, + int mapfd, + size_t max) +{ + int ret = -1; + int progfd = -1; + int cgroupfd = -1; + VIR_AUTOFREE(char *) path = NULL; + + if (virCgroupPathOfController(group, VIR_CGROUP_CONTROLLER_DEVICES, + NULL, &path) < 0) { + goto cleanup; + } + + progfd = virCgroupV2DevicesLoadProg(mapfd); + if (progfd < 0) { + virReportSystemError(errno, "%s", _("failed to load cgroup BPF prog")); + goto cleanup; + } + + cgroupfd = open(path, O_RDONLY); + if (cgroupfd < 0) { + virReportSystemError(errno, _("unable to open '%s'"), path); + goto cleanup; + } + + if (virBPFAttachProg(progfd, cgroupfd, BPF_CGROUP_DEVICE) < 0) { + virReportSystemError(errno, "%s", _("failed to attach cgroup BPF prog")); + goto cleanup; + } + + if (group->unified.devices.progfd > 0) { + VIR_DEBUG("Closing existing program that was replaced by new one."); + VIR_FORCE_CLOSE(group->unified.devices.progfd); + } + + group->unified.devices.progfd = progfd; + group->unified.devices.mapfd = mapfd; + group->unified.devices.max = max; + progfd = -1; + mapfd = -1; + + ret = 0; + cleanup: + VIR_FORCE_CLOSE(cgroupfd); + VIR_FORCE_CLOSE(progfd); + VIR_FORCE_CLOSE(mapfd); + return ret; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) { return false; } + + +int +virCgroupV2DevicesAttachProg(virCgroupPtr group ATTRIBUTE_UNUSED, + int mapfd ATTRIBUTE_UNUSED, + size_t max ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("cgroups v2 BPF devices not supported " + "with this kernel")); + return -1; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index 2ab35681db..1ba87acb00 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -24,4 +24,9 @@ bool virCgroupV2DevicesAvailable(virCgroupPtr group); +int +virCgroupV2DevicesAttachProg(virCgroupPtr group, + int mapfd, + size_t max); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:20AM +0200, Pavel Hrdina wrote:
This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel.
There are two possible ways how to create BPF program:
- One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils.
- The second way is to define macros which looks like assembler
s/looks/look/
instructions and can be used directly to create BPF program that can be directly loaded into kernel.
Since the program is not too complex we can use the second option.
I can live with that, since it saves us the dependency on clang, and it probably won't require many changes, but please include all the steps necessary to regenerate it (see below).
If there is no program, all devices are allowed, if there is some program it is executed and based on the exit status the access is denied for 0 and allowed for 1.
Our program will follow these rules:
- first it will try to look for the specific key using major and minor to see if there is any rule for that specific device
- if there is no specific rule it will try to look for any rule that matches only major of the device
- if there is no match with major it will try the same but with minor of the device
- as the last attempt it will try to look for rule for all devices and if there is no match it will return 0 to deny that access
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgrouppriv.h | 10 ++ src/util/vircgroupv2devices.c | 276 ++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 5 + 4 files changed, 292 insertions(+)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 9eac05009c..24a783840f 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1653,6 +1653,7 @@ virCgroupV1Register; virCgroupV2Register;
# util/vircgroupv2devices.h +virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable;
# util/virclosecallbacks.h diff --git a/src/util/vircgrouppriv.h b/src/util/vircgrouppriv.h index 9110c77297..7eba4ade23 100644 --- a/src/util/vircgrouppriv.h +++ b/src/util/vircgrouppriv.h @@ -41,10 +41,20 @@ struct _virCgroupV1Controller { typedef struct _virCgroupV1Controller virCgroupV1Controller; typedef virCgroupV1Controller *virCgroupV1ControllerPtr;
+struct _virCgroupV2Devices { + int mapfd; + int progfd; + ssize_t count; + ssize_t max; +}; +typedef struct _virCgroupV2Devices virCgroupV2Devices; +typedef virCgroupV2Devices *virCgroupV2DevicesPtr; + struct _virCgroupV2Controller { int controllers; char *mountPoint; char *placement; + virCgroupV2Devices devices; }; typedef struct _virCgroupV2Controller virCgroupV2Controller; typedef virCgroupV2Controller *virCgroupV2ControllerPtr; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index 10080d4fff..c8686e8768 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -30,6 +30,7 @@ #define LIBVIRT_VIRCGROUPPRIV_H_ALLOW #include "vircgrouppriv.h"
+#include "viralloc.h" #include "virbpf.h" #include "vircgroup.h" #include "vircgroupv2devices.h" @@ -64,10 +65,285 @@ virCgroupV2DevicesAvailable(virCgroupPtr group) VIR_FORCE_CLOSE(cgroupfd); return ret; } + + +/* Steps to get assembly version of devices BPF program: + * + * Save the following program into bpfprog.c, compile it using clang:
It would be nicer to have the file separate, with this comment there, to save the developer the trouble of stripping the asterisks.
+ * + * clang -O2 -Wall -target bpf -c bpfprog.c -o bpfprog.o + * + * Now you can use llvm-objdump to get the list if instructions: + * + * llvm-objdump -S -no-show-raw-insn bpfprog.o + * + * which can be converted into program using VIR_BPF_* macros.
Did you convert them manually? Can you share the script here?
+ * + * ---------------------------------------------------------------------------- + * #include <linux/bpf.h> + * #include <linux/version.h> + * + * #define SEC(NAME) __attribute__((section(NAME), used)) + * + * struct bpf_map_def { + * unsigned int type; + * unsigned int key_size; + * unsigned int value_size; + * unsigned int max_entries; + * unsigned int map_flags; + * unsigned int inner_map_idx; + * unsigned int numa_node; + * }; + * + * static void *(*bpf_map_lookup_elem)(void *map, void *key) = + * (void *) BPF_FUNC_map_lookup_elem; + * + * struct bpf_map_def SEC("maps") devices = { + * .type = BPF_MAP_TYPE_HASH, + * .key_size = sizeof(__u64), + * .value_size = sizeof(__u32), + * .max_entries = 65, + * }; + * + * SEC("cgroup/dev") int + * bpf_libvirt_cgroup_device(struct bpf_cgroup_dev_ctx *ctx) + * { + * __u64 key = ((__u64)ctx->major << 32) | ctx->minor; + * __u32 *val = 0; + * + * val = bpf_map_lookup_elem(&devices, &key); + * if (val && (ctx->access_type & *val) == ctx->access_type) + * return 1; + * + * key = ((__u64)ctx->major << 32) | 0xffffffff; + * val = bpf_map_lookup_elem(&devices, &key); + * if (val && (ctx->access_type & *val) == ctx->access_type) + * return 1; + * + * key = 0xffffffff00000000 | ctx->minor; + * val = bpf_map_lookup_elem(&devices, &key); + * if (val && (ctx->access_type & *val) == ctx->access_type) + * return 1; + * + * key = 0xffffffffffffffff; + * val = bpf_map_lookup_elem(&devices, &key); + * if (val && (ctx->access_type & *val) == ctx->access_type) + * return 1; + * + * return 0; + * } + * + * char _license[] SEC("license") = "GPL"; + * __u32 _version SEC("version") = LINUX_VERSION_CODE; + * ---------------------------------------------------------------------------- + * */ +static int +virCgroupV2DevicesLoadProg(int mapfd) +{ + struct bpf_insn prog[] = { + /* 0: r6 = r1 */ + VIR_BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + /* 1: r1 = *(u32 *)(r6 + 8) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 8), + /* 2: r2 = *(u32 *)(r6 + 4) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + /* 3: r2 <<= 32 */ + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* 4: r2 |= r1 */ + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + /* 5: *(u64 *)(r10 - 8) = r2 */ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), + /* 6: r2 = r10 */ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + /* 7: r2 += -8 */ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + /* 8: r1 = 0 ll */ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), + /* 10: call 1 */ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem), + /* 11: r1 = r0 */ + VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + /* 12: if r1 == 0 goto +5 <LBB0_2> */ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), + /* 13: r0 = 1 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), + /* 14: r2 = *(u32 *)(r6 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), + /* 15: r1 = *(u32 *)(r1 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), + /* 16: r1 &= r2 */ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + /* 17: if r1 == r2 goto +50 <LBB0_9> */ + VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 50), + /* LBB0_2: */ + /* 18: r1 = *(u32 *)(r6 + 4) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 4), + /* 19: r1 <<= 32 */ + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), + /* 20: r2 = 4294967295 ll */ + VIR_BPF_LD_IMM64(BPF_REG_2, 0xffffffff), + /* 22: r1 |= r2 */ + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2), + /* 23: *(u64 *)(r10 - 8) = r1 */ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + /* 24: r2 = r10 */ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + /* 25: r2 += -8 */ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + /* 26: r1 = 0 ll */ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), + /* 28: call 1 */ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem), + /* 29: r1 = r0 */ + VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + /* 30: if r1 == 0 goto +5 <LBB0_4> */ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), + /* 31: r0 = 1 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), + /* 32: r2 = *(u32 *)(r6 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), + /* 33: r1 = *(u32 *)(r1 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), + /* 34: r1 &= r2 */ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + /* 35: if r1 == r2 goto +32 <LBB0_9> */ + VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 32), + /* LBB0_4: */ + /* 36: r1 = *(u32 *)(r6 + 8) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 8), + /* 37: r2 = -4294967296 ll */ + VIR_BPF_LD_IMM64(BPF_REG_2, 0xffffffff00000000), + /* 39: r1 |= r2 */ + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2), + /* 40: *(u64 *)(r10 - 8) = r1 */ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + /* 41: r2 = r10 */ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + /* 42: r2 += -8 */ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + /* 43: r1 = 0 ll */ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), + /* 45: call 1 */ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem), + /* 46: r1 = r0 */ + VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + /* 47: if r1 == 0 goto +5 <LBB0_6> */ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), + /* 48: r0 = 1 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), + /* 49: r2 = *(u32 *)(r6 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), + /* 50: r1 = *(u32 *)(r1 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), + /* 51: r1 &= r2 */ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + /* 52: if r1 == r2 goto +15 <LBB0_9> */ + VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 15), + /* LBB0_6: */ + /* 53: r1 = -1 */ + VIR_BPF_MOV64_IMM(BPF_REG_1, -1), + /* 54: *(u64 *)(r10 - 8) = r1 */ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + /* 55: r2 = r10 */ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + /* 56: r2 += -8 */ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + /* 57: r1 = 0 ll */ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), + /* 59: call 1 */ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem), + /* 60: r1 = r0 */ + VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + /* 61: if r1 == 0 goto +5 <LBB0_8> */ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), + /* 62: r0 = 1 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), + /* 63: r2 = *(u32 *)(r6 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), + /* 64: r1 = *(u32 *)(r1 + 0) */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), + /* 65: r1 &= r2 */ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + /* 66: if r1 == r2 goto +1 <LBB0_9> */ + VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), + /* LBB0_8: */ + /* 67: r0 = 0 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 0), + /* LBB0_9: */ + /* 68: exit */ + VIR_BPF_EXIT_INSN(), + }; + + return virBPFLoadProg(prog, BPF_PROG_TYPE_CGROUP_DEVICE, ARRAY_CARDINALITY(prog)); +} + + +int +virCgroupV2DevicesAttachProg(virCgroupPtr group, + int mapfd, + size_t max) +{ + int ret = -1; + int progfd = -1; + int cgroupfd = -1;
VIR_AUTOCLOSE
+ VIR_AUTOFREE(char *) path = NULL; + + if (virCgroupPathOfController(group, VIR_CGROUP_CONTROLLER_DEVICES, + NULL, &path) < 0) { + goto cleanup; + } + + progfd = virCgroupV2DevicesLoadProg(mapfd); + if (progfd < 0) { + virReportSystemError(errno, "%s", _("failed to load cgroup BPF prog")); + goto cleanup; + } + + cgroupfd = open(path, O_RDONLY); + if (cgroupfd < 0) { + virReportSystemError(errno, _("unable to open '%s'"), path); + goto cleanup; + } + + if (virBPFAttachProg(progfd, cgroupfd, BPF_CGROUP_DEVICE) < 0) { + virReportSystemError(errno, "%s", _("failed to attach cgroup BPF prog")); + goto cleanup; + } + + if (group->unified.devices.progfd > 0) { + VIR_DEBUG("Closing existing program that was replaced by new one."); + VIR_FORCE_CLOSE(group->unified.devices.progfd); + } + + group->unified.devices.progfd = progfd; + group->unified.devices.mapfd = mapfd; + group->unified.devices.max = max; + progfd = -1; + mapfd = -1; + + ret = 0; + cleanup: + VIR_FORCE_CLOSE(cgroupfd); + VIR_FORCE_CLOSE(progfd); + VIR_FORCE_CLOSE(mapfd); + return ret; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) { return false;
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

On Thu, Jun 13, 2019 at 01:16:21PM +0200, Ján Tomko wrote:
On Thu, Apr 25, 2019 at 09:44:20AM +0200, Pavel Hrdina wrote:
This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel.
There are two possible ways how to create BPF program:
- One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils.
- The second way is to define macros which looks like assembler
s/looks/look/
instructions and can be used directly to create BPF program that can be directly loaded into kernel.
Since the program is not too complex we can use the second option.
I can live with that, since it saves us the dependency on clang, and it probably won't require many changes, but please include all the steps necessary to regenerate it (see below).
If there is no program, all devices are allowed, if there is some program it is executed and based on the exit status the access is denied for 0 and allowed for 1.
Our program will follow these rules:
- first it will try to look for the specific key using major and minor to see if there is any rule for that specific device
- if there is no specific rule it will try to look for any rule that matches only major of the device
- if there is no match with major it will try the same but with minor of the device
- as the last attempt it will try to look for rule for all devices and if there is no match it will return 0 to deny that access
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgrouppriv.h | 10 ++ src/util/vircgroupv2devices.c | 276 ++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 5 + 4 files changed, 292 insertions(+)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 9eac05009c..24a783840f 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1653,6 +1653,7 @@ virCgroupV1Register; virCgroupV2Register;
# util/vircgroupv2devices.h +virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable;
# util/virclosecallbacks.h diff --git a/src/util/vircgrouppriv.h b/src/util/vircgrouppriv.h index 9110c77297..7eba4ade23 100644 --- a/src/util/vircgrouppriv.h +++ b/src/util/vircgrouppriv.h @@ -41,10 +41,20 @@ struct _virCgroupV1Controller { typedef struct _virCgroupV1Controller virCgroupV1Controller; typedef virCgroupV1Controller *virCgroupV1ControllerPtr;
+struct _virCgroupV2Devices { + int mapfd; + int progfd; + ssize_t count; + ssize_t max; +}; +typedef struct _virCgroupV2Devices virCgroupV2Devices; +typedef virCgroupV2Devices *virCgroupV2DevicesPtr; + struct _virCgroupV2Controller { int controllers; char *mountPoint; char *placement; + virCgroupV2Devices devices; }; typedef struct _virCgroupV2Controller virCgroupV2Controller; typedef virCgroupV2Controller *virCgroupV2ControllerPtr; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index 10080d4fff..c8686e8768 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -30,6 +30,7 @@ #define LIBVIRT_VIRCGROUPPRIV_H_ALLOW #include "vircgrouppriv.h"
+#include "viralloc.h" #include "virbpf.h" #include "vircgroup.h" #include "vircgroupv2devices.h" @@ -64,10 +65,285 @@ virCgroupV2DevicesAvailable(virCgroupPtr group) VIR_FORCE_CLOSE(cgroupfd); return ret; } + + +/* Steps to get assembly version of devices BPF program: + * + * Save the following program into bpfprog.c, compile it using clang:
It would be nicer to have the file separate, with this comment there, to save the developer the trouble of stripping the asterisks.
Sure, I can move that into a separate file, I just wanted to have it closer to the assembly-like code.
+ * + * clang -O2 -Wall -target bpf -c bpfprog.c -o bpfprog.o + * + * Now you can use llvm-objdump to get the list if instructions: + * + * llvm-objdump -S -no-show-raw-insn bpfprog.o + * + * which can be converted into program using VIR_BPF_* macros.
Did you convert them manually? Can you share the script here?
Yes, it was done manually as I did not create any script to do it. Pavel

This function will be called if libvirtd was restarted while some domains were running. It will try to detect existing programs attached to the guest cgroup. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 117 ++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 3 + 3 files changed, 121 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 24a783840f..4753507c0a 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1655,6 +1655,7 @@ virCgroupV2Register; # util/vircgroupv2devices.h virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; +virCgroupV2DevicesDetectProg; # util/virclosecallbacks.h virCloseCallbacksGet; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index c8686e8768..e936f0aa0e 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -328,6 +328,113 @@ virCgroupV2DevicesAttachProg(virCgroupPtr group, VIR_FORCE_CLOSE(mapfd); return ret; } + + +static int +virCgroupV2DevicesCountMapEntries(int mapfd) +{ + int ret = 0; + int rc; + uint64_t key = 0; + uint64_t prevKey = 0; + + while ((rc = virBPFGetNextElem(mapfd, &prevKey, &key)) == 0) { + ret++; + prevKey = key; + } + + if (rc < 0) + return -1; + + return ret; +} + + +# define MAX_PROG_IDS 10 + +int +virCgroupV2DevicesDetectProg(virCgroupPtr group) +{ + int ret = -1; + int cgroupfd = -1; + unsigned int progcnt = 0; + unsigned int progids[MAX_PROG_IDS] = { 0 }; + VIR_AUTOFREE(char *) path = NULL; + + if (group->unified.devices.progfd > 0 && group->unified.devices.mapfd > 0) + return 0; + + if (virCgroupPathOfController(group, VIR_CGROUP_CONTROLLER_DEVICES, + NULL, &path) < 0) { + return -1; + } + + cgroupfd = open(path, O_RDONLY); + if (cgroupfd < 0) { + virReportSystemError(errno, _("unable to open '%s'"), path); + goto cleanup; + } + + if (virBPFQueryProg(cgroupfd, MAX_PROG_IDS, BPF_CGROUP_DEVICE, + &progcnt, progids) < 0) { + virReportSystemError(errno, "%s", _("unable to query cgroup BPF progs")); + goto cleanup; + } + + if (progcnt > 0) { + /* No need to have alternate code, this function will not be called + * if compiled with old kernel. */ + int progfd = virBPFGetProg(progids[0]); + int mapfd = -1; + int nitems = -1; + struct bpf_prog_info progInfo = { 0 }; + struct bpf_map_info mapInfo = { 0 }; + VIR_AUTOFREE(unsigned int *) mapIDs = NULL; + + if (progfd < 0) { + virReportSystemError(errno, "%s", _("failed to get cgroup BPF prog FD")); + goto cleanup; + } + + if (virBPFGetProgInfo(progfd, &progInfo, &mapIDs) < 0) { + virReportSystemError(errno, "%s", _("failed to get cgroup BPF prog info")); + goto cleanup; + } + + if (progInfo.nr_map_ids == 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("no map for cgroup BPF prog")); + goto cleanup; + } + + mapfd = virBPFGetMap(mapIDs[0]); + if (mapfd < 0) { + virReportSystemError(errno, "%s", _("failed to get cgroup BPF map FD")); + goto cleanup; + } + + if (virBPFGetMapInfo(mapfd, &mapInfo) < 0) { + virReportSystemError(errno, "%s", _("failed to get cgroup BPF map info")); + goto cleanup; + } + + nitems = virCgroupV2DevicesCountMapEntries(mapfd); + if (nitems < 0) { + virReportSystemError(errno, "%s", _("failed to count cgroup BPF map items")); + goto cleanup; + } + + group->unified.devices.progfd = progfd; + group->unified.devices.mapfd = mapfd; + group->unified.devices.max = mapInfo.max_entries; + group->unified.devices.count = nitems; + } + + ret = 0; + cleanup: + VIR_FORCE_CLOSE(cgroupfd); + return ret; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) @@ -346,4 +453,14 @@ virCgroupV2DevicesAttachProg(virCgroupPtr group ATTRIBUTE_UNUSED, "with this kernel")); return -1; } + + +int +virCgroupV2DevicesDetectProg(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("cgroups v2 BPF devices not supported " + "with this kernel")); + return -1; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index 1ba87acb00..a8e50dcca5 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -29,4 +29,7 @@ virCgroupV2DevicesAttachProg(virCgroupPtr group, int mapfd, size_t max); +int +virCgroupV2DevicesDetectProg(virCgroupPtr group); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:21AM +0200, Pavel Hrdina wrote:
This function will be called if libvirtd was restarted while some domains were running. It will try to detect existing programs attached to the guest cgroup.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 117 ++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 3 + 3 files changed, 121 insertions(+)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 24a783840f..4753507c0a 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1655,6 +1655,7 @@ virCgroupV2Register; # util/vircgroupv2devices.h virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; +virCgroupV2DevicesDetectProg;
# util/virclosecallbacks.h virCloseCallbacksGet; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index c8686e8768..e936f0aa0e 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -328,6 +328,113 @@ virCgroupV2DevicesAttachProg(virCgroupPtr group, VIR_FORCE_CLOSE(mapfd); return ret; } + + +static int +virCgroupV2DevicesCountMapEntries(int mapfd) +{ + int ret = 0; + int rc; + uint64_t key = 0; + uint64_t prevKey = 0; + + while ((rc = virBPFGetNextElem(mapfd, &prevKey, &key)) == 0) { + ret++; + prevKey = key; + } + + if (rc < 0) + return -1; + + return ret; +} + + +# define MAX_PROG_IDS 10 + +int +virCgroupV2DevicesDetectProg(virCgroupPtr group) +{ + int ret = -1; + int cgroupfd = -1;
With VIR_AUTOCLOSE
+ unsigned int progcnt = 0; + unsigned int progids[MAX_PROG_IDS] = { 0 }; + VIR_AUTOFREE(char *) path = NULL; + + if (group->unified.devices.progfd > 0 && group->unified.devices.mapfd > 0) + return 0; + + if (virCgroupPathOfController(group, VIR_CGROUP_CONTROLLER_DEVICES, + NULL, &path) < 0) { + return -1; + } + + cgroupfd = open(path, O_RDONLY); + if (cgroupfd < 0) { + virReportSystemError(errno, _("unable to open '%s'"), path); + goto cleanup; + } + + if (virBPFQueryProg(cgroupfd, MAX_PROG_IDS, BPF_CGROUP_DEVICE, + &progcnt, progids) < 0) { + virReportSystemError(errno, "%s", _("unable to query cgroup BPF progs")); + goto cleanup; + } +
and: if (progcnt == 0) return 0; you can reduce the indentation level below
+ if (progcnt > 0) { + /* No need to have alternate code, this function will not be called + * if compiled with old kernel. */ + int progfd = virBPFGetProg(progids[0]); + int mapfd = -1; + int nitems = -1; + struct bpf_prog_info progInfo = { 0 }; + struct bpf_map_info mapInfo = { 0 }; + VIR_AUTOFREE(unsigned int *) mapIDs = NULL; + + if (progfd < 0) { + virReportSystemError(errno, "%s", _("failed to get cgroup BPF prog FD")); + goto cleanup; + } + + if (virBPFGetProgInfo(progfd, &progInfo, &mapIDs) < 0) { + virReportSystemError(errno, "%s", _("failed to get cgroup BPF prog info")); + goto cleanup; + } + + if (progInfo.nr_map_ids == 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("no map for cgroup BPF prog")); + goto cleanup; + } + + mapfd = virBPFGetMap(mapIDs[0]); + if (mapfd < 0) { + virReportSystemError(errno, "%s", _("failed to get cgroup BPF map FD")); + goto cleanup; + } + + if (virBPFGetMapInfo(mapfd, &mapInfo) < 0) { + virReportSystemError(errno, "%s", _("failed to get cgroup BPF map info")); + goto cleanup; + } + + nitems = virCgroupV2DevicesCountMapEntries(mapfd); + if (nitems < 0) { + virReportSystemError(errno, "%s", _("failed to count cgroup BPF map items")); + goto cleanup; + } + + group->unified.devices.progfd = progfd; + group->unified.devices.mapfd = mapfd; + group->unified.devices.max = mapInfo.max_entries; + group->unified.devices.count = nitems; + } + + ret = 0; + cleanup: + VIR_FORCE_CLOSE(cgroupfd); + return ret; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) @@ -346,4 +453,14 @@ virCgroupV2DevicesAttachProg(virCgroupPtr group ATTRIBUTE_UNUSED, "with this kernel")); return -1; } + + +int +virCgroupV2DevicesDetectProg(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("cgroups v2 BPF devices not supported " + "with this kernel")); + return -1; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

This function creates new BPF program with new empty BPF map with the default size and attaches it to the guest cgroup. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 53 +++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 3 ++ 3 files changed, 57 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 4753507c0a..f49f401d2b 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1655,6 +1655,7 @@ virCgroupV2Register; # util/vircgroupv2devices.h virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; +virCgroupV2DevicesCreateProg; virCgroupV2DevicesDetectProg; # util/virclosecallbacks.h diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index e936f0aa0e..d8934e8add 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -435,6 +435,49 @@ virCgroupV2DevicesDetectProg(virCgroupPtr group) VIR_FORCE_CLOSE(cgroupfd); return ret; } + + +# define VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE 64 + +static int +virCgroupV2DevicesCreateMap(size_t size) +{ + int mapfd = virBPFCreateMap(BPF_MAP_TYPE_HASH, sizeof(uint64_t), + sizeof(uint32_t), size); + + if (mapfd < 0) { + virReportSystemError(errno, "%s", + _("failed to initialize device BPF map")); + return -1; + } + + return mapfd; +} + + +int +virCgroupV2DevicesCreateProg(virCgroupPtr group) +{ + int mapfd; + + if (group->unified.devices.progfd > 0 && group->unified.devices.mapfd > 0) + return 0; + + mapfd = virCgroupV2DevicesCreateMap(VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE); + if (mapfd < 0) + return -1; + + if (virCgroupV2DevicesAttachProg(group, mapfd, + VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE) < 0) { + goto error; + } + + return 0; + + error: + VIR_FORCE_CLOSE(mapfd); + return -1; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) @@ -463,4 +506,14 @@ virCgroupV2DevicesDetectProg(virCgroupPtr group ATTRIBUTE_UNUSED) "with this kernel")); return -1; } + + +int +virCgroupV2DevicesCreateProg(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("cgroups v2 BPF devices not supported " + "with this kernel")); + return -1; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index a8e50dcca5..bcbd761537 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -32,4 +32,7 @@ virCgroupV2DevicesAttachProg(virCgroupPtr group, int virCgroupV2DevicesDetectProg(virCgroupPtr group); +int +virCgroupV2DevicesCreateProg(virCgroupPtr group); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:22AM +0200, Pavel Hrdina wrote:
This function creates new BPF program with new empty BPF map with the default size and attaches it to the guest cgroup.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 53 +++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 3 ++ 3 files changed, 57 insertions(+)
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

This function will be called for every virCgroup(Allow|Deny)* API in order to prepare BPF program for guest. Since libvirtd can be restarted at any point we will first try to detect existing progam, if there is none we will create a new empty BPF program and lastly if we don't have any space left in the existing BPF map we will create a new copy of the BPF map with more space and attach a new program with that map into the guest cgroup. This solution allows us to start with reasonably small BPF map consuming only small amount of memory and if needed we can easily extend the BPF map if there is a lot of host devices used in guest or if user wants to hot-plug a lot of devices once the guest is running. Since there is no way how to reallocate existing BPF map we need to create a new copy if we run out of space in current BPF map. This overcomes all the limitations in BPF: - map used in program has to be created before the program is loaded into kernel - once map is created you cannot change its size - you cannot replace map in existing program - you cannot use an array of maps because it can store FD to maps of one specific size so we would not be able to use it to overcome the second issue Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 83 +++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 3 ++ 3 files changed, 87 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index f49f401d2b..9ab07de06d 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1657,6 +1657,7 @@ virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; virCgroupV2DevicesCreateProg; virCgroupV2DevicesDetectProg; +virCgroupV2DevicesPrepareProg; # util/virclosecallbacks.h virCloseCallbacksGet; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index d8934e8add..e8c6f74091 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -455,6 +455,52 @@ virCgroupV2DevicesCreateMap(size_t size) } +static int +virCgroupV2DevicesReallocMap(int mapfd, + size_t size) +{ + uint64_t key = 0; + uint64_t prevKey = 0; + int rc; + int newmapfd = virCgroupV2DevicesCreateMap(size); + + VIR_DEBUG("realloc devices map mapfd:%d, size:%lu", mapfd, size); + + if (newmapfd < 0) + return -1; + + while ((rc = virBPFGetNextElem(mapfd, &prevKey, &key)) == 0) { + uint32_t val = 0; + + if (virBPFLookupElem(mapfd, &key, &val) < 0) { + virReportSystemError(errno, "%s", + _("failed to lookup device in old map")); + goto error; + } + + if (virBPFUpdateElem(newmapfd, &key, &val) < 0) { + virReportSystemError(errno, "%s", + _("failed to add device into new map")); + goto error; + } + + prevKey = key; + } + + if (rc < 0 && errno != ENOENT) { + virReportSystemError(errno, "%s", + _("failed to copy all device rules")); + goto error; + } + + return newmapfd; + + error: + VIR_FORCE_CLOSE(newmapfd); + return -1; +} + + int virCgroupV2DevicesCreateProg(virCgroupPtr group) { @@ -478,6 +524,33 @@ virCgroupV2DevicesCreateProg(virCgroupPtr group) VIR_FORCE_CLOSE(mapfd); return -1; } + + +int +virCgroupV2DevicesPrepareProg(virCgroupPtr group) +{ + if (virCgroupV2DevicesDetectProg(group) < 0) + return -1; + + if (virCgroupV2DevicesCreateProg(group) < 0) + return -1; + + if (group->unified.devices.count >= group->unified.devices.max) { + size_t max = group->unified.devices.max * 2; + int newmapfd = virCgroupV2DevicesReallocMap(group->unified.devices.mapfd, + max); + + if (newmapfd < 0) + return -1; + + if (virCgroupV2DevicesAttachProg(group, newmapfd, max) < 0) { + VIR_FORCE_CLOSE(newmapfd); + return -1; + } + } + + return 0; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) @@ -516,4 +589,14 @@ virCgroupV2DevicesCreateProg(virCgroupPtr group ATTRIBUTE_UNUSED) "with this kernel")); return -1; } + + +int +virCgroupV2DevicesPrepareProg(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("cgroups v2 BPF devices not supported " + "with this kernel")); + return -1; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index bcbd761537..10c80c8ae4 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -35,4 +35,7 @@ virCgroupV2DevicesDetectProg(virCgroupPtr group); int virCgroupV2DevicesCreateProg(virCgroupPtr group); +int +virCgroupV2DevicesPrepareProg(virCgroupPtr group); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:23AM +0200, Pavel Hrdina wrote:
This function will be called for every virCgroup(Allow|Deny)* API in order to prepare BPF program for guest. Since libvirtd can be restarted at any point we will first try to detect existing progam, if there is none we will create a new empty BPF program and lastly if we don't have any space left in the existing BPF map we will create a new copy of the BPF map with more space and attach a new program with that map into the guest cgroup.
This solution allows us to start with reasonably small BPF map consuming only small amount of memory and if needed we can easily extend the BPF map if there is a lot of host devices used in guest or if user wants to hot-plug a lot of devices once the guest is running.
Since there is no way how to reallocate existing BPF map we need to create a new copy if we run out of space in current BPF map.
This overcomes all the limitations in BPF:
- map used in program has to be created before the program is loaded into kernel
- once map is created you cannot change its size
- you cannot replace map in existing program
- you cannot use an array of maps because it can store FD to maps of one specific size so we would not be able to use it to overcome the second issue
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 83 +++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 3 ++ 3 files changed, 87 insertions(+)
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

We need to close our FD that we have for BPF program and map in order to let kernel remove all resources once the cgroup is removed as well. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2.c | 3 +++ src/util/vircgroupv2devices.c | 26 ++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 3 +++ 4 files changed, 33 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 9ab07de06d..f42bdad9ef 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1658,6 +1658,7 @@ virCgroupV2DevicesAvailable; virCgroupV2DevicesCreateProg; virCgroupV2DevicesDetectProg; virCgroupV2DevicesPrepareProg; +virCgroupV2DevicesRemoveProg; # util/virclosecallbacks.h virCloseCallbacksGet; diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index dc7573e05c..ce19169fe7 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -448,6 +448,9 @@ virCgroupV2Remove(virCgroupPtr group) if (virCgroupV2PathOfController(group, controller, "", &grppath) < 0) return 0; + if (virCgroupV2DevicesRemoveProg(group) < 0) + return -1; + return virCgroupRemoveRecursively(grppath); } diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index e8c6f74091..70e29b8470 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -551,6 +551,25 @@ virCgroupV2DevicesPrepareProg(virCgroupPtr group) return 0; } + + +int +virCgroupV2DevicesRemoveProg(virCgroupPtr group) +{ + if (virCgroupV2DevicesDetectProg(group) < 0) + return -1; + + if (group->unified.devices.progfd <= 0 && group->unified.devices.mapfd <= 0) + return 0; + + if (group->unified.devices.mapfd >= 0) + VIR_FORCE_CLOSE(group->unified.devices.mapfd); + + if (group->unified.devices.progfd >= 0) + VIR_FORCE_CLOSE(group->unified.devices.progfd); + + return 0; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) @@ -599,4 +618,11 @@ virCgroupV2DevicesPrepareProg(virCgroupPtr group ATTRIBUTE_UNUSED) "with this kernel")); return -1; } + + +int +virCgroupV2DevicesRemoveProg(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + return 0; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index 10c80c8ae4..ae02f9c334 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -38,4 +38,7 @@ virCgroupV2DevicesCreateProg(virCgroupPtr group); int virCgroupV2DevicesPrepareProg(virCgroupPtr group); +int +virCgroupV2DevicesRemoveProg(virCgroupPtr group); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:24AM +0200, Pavel Hrdina wrote:
We need to close our FD that we have for BPF program and map in order to let kernel remove all resources once the cgroup is removed as well.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2.c | 3 +++ src/util/vircgroupv2devices.c | 26 ++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 3 +++ 4 files changed, 33 insertions(+)
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 34 ++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 6 ++++++ 3 files changed, 41 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index f42bdad9ef..469a1cdbe0 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1657,6 +1657,7 @@ virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; virCgroupV2DevicesCreateProg; virCgroupV2DevicesDetectProg; +virCgroupV2DevicesGetPerms; virCgroupV2DevicesPrepareProg; virCgroupV2DevicesRemoveProg; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index 70e29b8470..0455ff6b24 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -570,6 +570,32 @@ virCgroupV2DevicesRemoveProg(virCgroupPtr group) return 0; } + + +uint32_t +virCgroupV2DevicesGetPerms(int perms, + char type) +{ + uint32_t ret = 0; + + if (perms & VIR_CGROUP_DEVICE_MKNOD) + ret |= BPF_DEVCG_ACC_MKNOD << 16; + + if (perms & VIR_CGROUP_DEVICE_READ) + ret |= BPF_DEVCG_ACC_READ << 16; + + if (perms & VIR_CGROUP_DEVICE_WRITE) + ret |= BPF_DEVCG_ACC_WRITE << 16; + + if (type == 'b') + ret |= BPF_DEVCG_DEV_BLOCK; + else if (type == 'c') + ret |= BPF_DEVCG_DEV_CHAR; + else + ret |= BPF_DEVCG_DEV_BLOCK | BPF_DEVCG_DEV_CHAR; + + return ret; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) @@ -625,4 +651,12 @@ virCgroupV2DevicesRemoveProg(virCgroupPtr group ATTRIBUTE_UNUSED) { return 0; } + + +uint32_t +virCgroupV2DevicesGetPerms(int perms ATTRIBUTE_UNUSED, + char type ATTRIBUTE_UNUSED) +{ + return 0; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index ae02f9c334..cbfd9ae119 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -19,6 +19,8 @@ #ifndef LIBVIRT_VIRCGROUPV2DEVICES_H # define LIBVIRT_VIRCGROUPV2DEVICES_H +# include <sys/types.h> + # include "vircgroup.h" bool @@ -41,4 +43,8 @@ virCgroupV2DevicesPrepareProg(virCgroupPtr group); int virCgroupV2DevicesRemoveProg(virCgroupPtr group); +uint32_t +virCgroupV2DevicesGetPerms(int perms, + char type); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:25AM +0200, Pavel Hrdina wrote:
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 34 ++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 6 ++++++ 3 files changed, 41 insertions(+)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index f42bdad9ef..469a1cdbe0 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1657,6 +1657,7 @@ virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; virCgroupV2DevicesCreateProg; virCgroupV2DevicesDetectProg; +virCgroupV2DevicesGetPerms; virCgroupV2DevicesPrepareProg; virCgroupV2DevicesRemoveProg;
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

Device rules are stored in BPF map that is a hash type, this function will create a key based on major and minor id of device. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 8 ++++++++ src/util/vircgroupv2devices.h | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 469a1cdbe0..18cb3715a2 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1657,6 +1657,7 @@ virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; virCgroupV2DevicesCreateProg; virCgroupV2DevicesDetectProg; +virCgroupV2DevicesGetKey; virCgroupV2DevicesGetPerms; virCgroupV2DevicesPrepareProg; virCgroupV2DevicesRemoveProg; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index 0455ff6b24..9b154ccb5d 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -660,3 +660,11 @@ virCgroupV2DevicesGetPerms(int perms ATTRIBUTE_UNUSED, return 0; } #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ + + +uint64_t +virCgroupV2DevicesGetKey(int major, + int minor) +{ + return (uint64_t)major << 32 | ((uint64_t)minor & 0x00000000ffffffff); +} diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index cbfd9ae119..d717c9755f 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -47,4 +47,8 @@ uint32_t virCgroupV2DevicesGetPerms(int perms, char type); +uint64_t +virCgroupV2DevicesGetKey(int major, + int minor); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:26AM +0200, Pavel Hrdina wrote:
Device rules are stored in BPF map that is a hash type, this function will create a key based on major and minor id of device.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroupv2devices.c | 8 ++++++++ src/util/vircgroupv2devices.h | 4 ++++ 3 files changed, 13 insertions(+)
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

In order to allow device we need to create key and value which will be used to update BPF map. virBPFUpdateElem() can override existing entries in BPF map so we need to check if that entry exists in order to track number of entries in our map. This can add rule for specific device but major and minor can be both -1 which follows the same behavior as in cgroup v1. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index ce19169fe7..9f9802bb2f 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -30,6 +30,7 @@ #include "vircgrouppriv.h" #include "viralloc.h" +#include "virbpf.h" #include "vircgroup.h" #include "vircgroupbackend.h" #include "vircgroupv2.h" @@ -1638,6 +1639,35 @@ virCgroupV2GetCpusetCpus(virCgroupPtr group, } +static int +virCgroupV2AllowDevice(virCgroupPtr group, + char type, + int major, + int minor, + int perms) +{ + uint64_t key = virCgroupV2DevicesGetKey(major, minor); + uint32_t val = virCgroupV2DevicesGetPerms(perms, type); + int rc; + + if (virCgroupV2DevicesPrepareProg(group) < 0) + return -1; + + rc = virBPFLookupElem(group->unified.devices.mapfd, &key, NULL); + + if (virBPFUpdateElem(group->unified.devices.mapfd, &key, &val) < 0) { + virReportSystemError(errno, "%s", + _("failed to update device in BPF cgroup map")); + return -1; + } + + if (rc < 0) + group->unified.devices.count++; + + return 0; +} + + virCgroupBackend virCgroupV2Backend = { .type = VIR_CGROUP_BACKEND_TYPE_V2, @@ -1687,6 +1717,8 @@ virCgroupBackend virCgroupV2Backend = { .getMemSwapHardLimit = virCgroupV2GetMemSwapHardLimit, .getMemSwapUsage = virCgroupV2GetMemSwapUsage, + .allowDevice = virCgroupV2AllowDevice, + .setCpuShares = virCgroupV2SetCpuShares, .getCpuShares = virCgroupV2GetCpuShares, .setCpuCfsPeriod = virCgroupV2SetCpuCfsPeriod, -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:27AM +0200, Pavel Hrdina wrote:
In order to allow device we need to create key and value which will be used to update BPF map. virBPFUpdateElem() can override existing entries in BPF map so we need to check if that entry exists in order to track number of entries in our map.
This can add rule for specific device but major and minor can be both -1 which follows the same behavior as in cgroup v1.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+)
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

In order to deny device we need to check if there is any entry in BPF map and we need to load the current value from map if there is already entry for that device. If both values are same we can remove that entry but if they are different we need to update the entry because we don't have to deny all access, but for example only write access. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index 9f9802bb2f..bf78c33519 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -1668,6 +1668,46 @@ virCgroupV2AllowDevice(virCgroupPtr group, } +static int +virCgroupV2DenyDevice(virCgroupPtr group, + char type, + int major, + int minor, + int perms) +{ + uint64_t key = virCgroupV2DevicesGetKey(major, minor); + uint32_t newval = virCgroupV2DevicesGetPerms(perms, type); + uint32_t val = 0; + + if (virCgroupV2DevicesPrepareProg(group) < 0) + return -1; + + if (group->unified.devices.count <= 0 || + virBPFLookupElem(group->unified.devices.mapfd, &key, &val) < 0) { + VIR_DEBUG("nothing to do, device is not allowed"); + return 0; + } + + if (newval == val) { + if (virBPFDeleteElem(group->unified.devices.mapfd, &key) < 0) { + virReportSystemError(errno, "%s", + _("failed to remove device from BPF cgroup map")); + return -1; + } + group->unified.devices.count--; + } else { + val ^= val & newval; + if (virBPFUpdateElem(group->unified.devices.mapfd, &key, &val) < 0) { + virReportSystemError(errno, "%s", + _("failed to update device in BPF cgroup map")); + return -1; + } + } + + return 0; +} + + virCgroupBackend virCgroupV2Backend = { .type = VIR_CGROUP_BACKEND_TYPE_V2, @@ -1718,6 +1758,7 @@ virCgroupBackend virCgroupV2Backend = { .getMemSwapUsage = virCgroupV2GetMemSwapUsage, .allowDevice = virCgroupV2AllowDevice, + .denyDevice = virCgroupV2DenyDevice, .setCpuShares = virCgroupV2SetCpuShares, .getCpuShares = virCgroupV2GetCpuShares, -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:28AM +0200, Pavel Hrdina wrote:
In order to deny device we need to check if there is any entry in BPF map and we need to load the current value from map if there is already entry for that device. If both values are same we can remove that entry but if they are different we need to update the entry because we don't have to deny all access, but for example only write access.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+)
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

If we want to allow all devices with all permissions we need to replace any existing program that has any rule configured, otherwise we just need to add new rule which will for example allow read access to all devices. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index bf78c33519..a8ba9b9e9e 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -1708,6 +1708,23 @@ virCgroupV2DenyDevice(virCgroupPtr group, } +static int +virCgroupV2AllowAllDevices(virCgroupPtr group, + int perms) +{ + if (virCgroupV2DevicesPrepareProg(group) < 0) + return -1; + + if (group->unified.devices.count > 0 && + perms == VIR_CGROUP_DEVICE_RWM && + virCgroupV2DevicesCreateProg(group) < 0) { + return -1; + } + + return virCgroupV2AllowDevice(group, 'a', -1, -1, perms); +} + + virCgroupBackend virCgroupV2Backend = { .type = VIR_CGROUP_BACKEND_TYPE_V2, @@ -1759,6 +1776,7 @@ virCgroupBackend virCgroupV2Backend = { .allowDevice = virCgroupV2AllowDevice, .denyDevice = virCgroupV2DenyDevice, + .allowAllDevices = virCgroupV2AllowAllDevices, .setCpuShares = virCgroupV2SetCpuShares, .getCpuShares = virCgroupV2GetCpuShares, -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:29AM +0200, Pavel Hrdina wrote:
If we want to allow all devices with all permissions we need to replace any existing program that has any rule configured, otherwise we just need to add new rule which will for example allow read access to all devices.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+)
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

If we want to deny all devices we just need to replace any existing program with new program with empty map. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index a8ba9b9e9e..6c851f9637 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -1725,6 +1725,16 @@ virCgroupV2AllowAllDevices(virCgroupPtr group, } +static int +virCgroupV2DenyAllDevices(virCgroupPtr group) +{ + if (virCgroupV2DevicesDetectProg(group) < 0) + return -1; + + return virCgroupV2DevicesCreateProg(group); +} + + virCgroupBackend virCgroupV2Backend = { .type = VIR_CGROUP_BACKEND_TYPE_V2, @@ -1777,6 +1787,7 @@ virCgroupBackend virCgroupV2Backend = { .allowDevice = virCgroupV2AllowDevice, .denyDevice = virCgroupV2DenyDevice, .allowAllDevices = virCgroupV2AllowAllDevices, + .denyAllDevices = virCgroupV2DenyAllDevices, .setCpuShares = virCgroupV2SetCpuShares, .getCpuShares = virCgroupV2GetCpuShares, -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:30AM +0200, Pavel Hrdina wrote:
If we want to deny all devices we just need to replace any existing program with new program with empty map.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2.c | 11 +++++++++++ 1 file changed, 11 insertions(+)
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

So the issue here is that you can end up with configuration where you have cgroup v1 and v2 enabled at the same time and the devices controllers is enabled for cgroup v1. In cgroup v2 there is no devices controller, the device access is controlled using BPF and since it is not a cgroup controller both of them can exists at the same time and both of them are applied while resolving access to devices. In order to avoid configuring both BPF and cgroup v1 devices we will use BPF if possible and otherwise fallback to cgroup v1 devices. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroup.c | 3 ++- src/util/vircgroupbackend.h | 3 ++- src/util/vircgroupv1.c | 9 ++++++++- src/util/vircgroupv2.c | 5 ++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/util/vircgroup.c b/src/util/vircgroup.c index 4238d7014b..2ec9d28fc9 100644 --- a/src/util/vircgroup.c +++ b/src/util/vircgroup.c @@ -383,7 +383,8 @@ virCgroupDetect(virCgroupPtr group, for (i = 0; i < VIR_CGROUP_BACKEND_TYPE_LAST; i++) { if (group->backends[i]) { - int rc = group->backends[i]->detectControllers(group, controllers); + int rc = group->backends[i]->detectControllers(group, controllers, + controllersAvailable); if (rc < 0) return -1; controllersAvailable |= rc; diff --git a/src/util/vircgroupbackend.h b/src/util/vircgroupbackend.h index 24b45be9bb..9bc8e7b11d 100644 --- a/src/util/vircgroupbackend.h +++ b/src/util/vircgroupbackend.h @@ -96,7 +96,8 @@ typedef char * typedef int (*virCgroupDetectControllersCB)(virCgroupPtr group, - int controllers); + int controllers, + int detected); typedef bool (*virCgroupHasControllerCB)(virCgroupPtr cgroup, diff --git a/src/util/vircgroupv1.c b/src/util/vircgroupv1.c index 8ce10d3608..f879c76595 100644 --- a/src/util/vircgroupv1.c +++ b/src/util/vircgroupv1.c @@ -420,7 +420,8 @@ virCgroupV1StealPlacement(virCgroupPtr group) static int virCgroupV1DetectControllers(virCgroupPtr group, - int controllers) + int controllers, + int detected) { size_t i; size_t j; @@ -430,6 +431,9 @@ virCgroupV1DetectControllers(virCgroupPtr group, /* First mark requested but non-existing controllers to be ignored */ for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) { if (((1 << i) & controllers)) { + int type = 1 << i; + if (type & detected) + VIR_FREE(group->legacy[i].mountPoint); /* Remove non-existent controllers */ if (!group->legacy[i].mountPoint) { VIR_DEBUG("Requested controller '%s' not mounted, ignoring", @@ -469,6 +473,9 @@ virCgroupV1DetectControllers(virCgroupPtr group, VIR_DEBUG("Auto-detecting controllers"); controllers = 0; for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) { + int type = 1 << i; + if (type & detected) + VIR_FREE(group->legacy[i].mountPoint); VIR_DEBUG("Controller '%s' present=%s", virCgroupV1ControllerTypeToString(i), group->legacy[i].mountPoint ? "yes" : "no"); diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index 6c851f9637..c14bdd1f6d 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -287,7 +287,8 @@ virCgroupV2ParseControllersFile(virCgroupPtr group) static int virCgroupV2DetectControllers(virCgroupPtr group, - int controllers) + int controllers, + int detected) { size_t i; @@ -300,6 +301,8 @@ virCgroupV2DetectControllers(virCgroupPtr group, if (virCgroupV2DevicesAvailable(group)) group->unified.controllers |= 1 << VIR_CGROUP_CONTROLLER_DEVICES; + group->unified.controllers &= ~detected; + for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) VIR_DEBUG("Controller '%s' present=%s", virCgroupV2ControllerTypeToString(i), -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:31AM +0200, Pavel Hrdina wrote:
So the issue here is that you can end up with configuration where you have cgroup v1 and v2 enabled at the same time and the devices controllers is enabled for cgroup v1.
In cgroup v2 there is no devices controller, the device access is controlled using BPF and since it is not a cgroup controller both of them can exists at the same time and both of them are applied while resolving access to devices.
In order to avoid configuring both BPF and cgroup v1 devices we will use BPF if possible and otherwise fallback to cgroup v1 devices.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroup.c | 3 ++- src/util/vircgroupbackend.h | 3 ++- src/util/vircgroupv1.c | 9 ++++++++- src/util/vircgroupv2.c | 5 ++++- 4 files changed, 16 insertions(+), 4 deletions(-)
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano

We need to mock virCgroupV2DevicesAvailable() in order to remove any dependency on kernel as BPF devices might not be available. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2devices.h | 5 ++++- tests/vircgroupdata/hybrid.parsed | 2 +- tests/vircgroupmock.c | 7 +++++++ tests/vircgrouptest.c | 4 ++-- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index d717c9755f..5fc35942ef 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -21,10 +21,13 @@ # include <sys/types.h> +# include "internal.h" + # include "vircgroup.h" bool -virCgroupV2DevicesAvailable(virCgroupPtr group); +virCgroupV2DevicesAvailable(virCgroupPtr group) + ATTRIBUTE_NOINLINE; int virCgroupV2DevicesAttachProg(virCgroupPtr group, diff --git a/tests/vircgroupdata/hybrid.parsed b/tests/vircgroupdata/hybrid.parsed index 7600de5f45..f755eed465 100644 --- a/tests/vircgroupdata/hybrid.parsed +++ b/tests/vircgroupdata/hybrid.parsed @@ -2,7 +2,7 @@ cpu <null> cpuacct <null> cpuset /not/really/sys/fs/cgroup/cpuset memory <null> -devices /not/really/sys/fs/cgroup/devices +devices <null> freezer /not/really/sys/fs/cgroup/freezer blkio <null> net_cls /not/really/sys/fs/cgroup/net_cls diff --git a/tests/vircgroupmock.c b/tests/vircgroupmock.c index 7bbaa6dd0f..5aeeb80bb3 100644 --- a/tests/vircgroupmock.c +++ b/tests/vircgroupmock.c @@ -35,6 +35,7 @@ # include "virstring.h" # include "virfile.h" # include "viralloc.h" +# include "vircgroupv2devices.h" static int (*real_open)(const char *path, int flags, ...); static FILE *(*real_fopen)(const char *path, const char *mode); @@ -600,6 +601,12 @@ int open(const char *path, int flags, ...) free(newpath); return ret; } + +bool +virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + return true; +} #else /* Nothing to override on non-__linux__ platforms */ #endif diff --git a/tests/vircgrouptest.c b/tests/vircgrouptest.c index 20f4c57b04..4c1f53d924 100644 --- a/tests/vircgrouptest.c +++ b/tests/vircgrouptest.c @@ -587,6 +587,7 @@ static int testCgroupNewForSelfUnified(const void *args ATTRIBUTE_UNUSED) (1 << VIR_CGROUP_CONTROLLER_CPU) | (1 << VIR_CGROUP_CONTROLLER_CPUACCT) | (1 << VIR_CGROUP_CONTROLLER_MEMORY) | + (1 << VIR_CGROUP_CONTROLLER_DEVICES) | (1 << VIR_CGROUP_CONTROLLER_BLKIO); if (virCgroupNewSelf(&cgroup) < 0) { @@ -609,14 +610,12 @@ static int testCgroupNewForSelfHybrid(const void *args ATTRIBUTE_UNUSED) const char *empty[VIR_CGROUP_CONTROLLER_LAST] = { 0 }; const char *mounts[VIR_CGROUP_CONTROLLER_LAST] = { [VIR_CGROUP_CONTROLLER_CPUSET] = "/not/really/sys/fs/cgroup/cpuset", - [VIR_CGROUP_CONTROLLER_DEVICES] = "/not/really/sys/fs/cgroup/devices", [VIR_CGROUP_CONTROLLER_FREEZER] = "/not/really/sys/fs/cgroup/freezer", [VIR_CGROUP_CONTROLLER_NET_CLS] = "/not/really/sys/fs/cgroup/net_cls", [VIR_CGROUP_CONTROLLER_PERF_EVENT] = "/not/really/sys/fs/cgroup/perf_event", }; const char *placement[VIR_CGROUP_CONTROLLER_LAST] = { [VIR_CGROUP_CONTROLLER_CPUSET] = "/", - [VIR_CGROUP_CONTROLLER_DEVICES] = "/", [VIR_CGROUP_CONTROLLER_FREEZER] = "/", [VIR_CGROUP_CONTROLLER_NET_CLS] = "/", [VIR_CGROUP_CONTROLLER_PERF_EVENT] = "/", @@ -625,6 +624,7 @@ static int testCgroupNewForSelfHybrid(const void *args ATTRIBUTE_UNUSED) (1 << VIR_CGROUP_CONTROLLER_CPU) | (1 << VIR_CGROUP_CONTROLLER_CPUACCT) | (1 << VIR_CGROUP_CONTROLLER_MEMORY) | + (1 << VIR_CGROUP_CONTROLLER_DEVICES) | (1 << VIR_CGROUP_CONTROLLER_BLKIO); if (virCgroupNewSelf(&cgroup) < 0) { -- 2.20.1

On Thu, Apr 25, 2019 at 09:44:32AM +0200, Pavel Hrdina wrote:
We need to mock virCgroupV2DevicesAvailable() in order to remove any dependency on kernel as BPF devices might not be available.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2devices.h | 5 ++++- tests/vircgroupdata/hybrid.parsed | 2 +- tests/vircgroupmock.c | 7 +++++++ tests/vircgrouptest.c | 4 ++-- 4 files changed, 14 insertions(+), 4 deletions(-)
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano
participants (2)
-
Ján Tomko
-
Pavel Hrdina