[libvirt] [PATCH v2 00/17] implement cgroups v2 devices support

Pavel Hrdina

14 Jan 2019 14 Jan '19

6:47 a.m.

In cgroups v2 there is no devices controller, BPF should be used instead. Changes in v2: - fixed build on bsd and older kernels without cgroup BPF - cgroup bpf devices code moved to separate file Documentation for eBPF: <http://man7.org/linux/man-pages/man2/bpf.2.html> <https://www.kernel.org/doc/Documentation/networking/filter.txt> <https://docs.cilium.io/en/v1.3/bpf/> Pavel Hrdina (17): util: introduce virbpf helpers vircgroup: introduce virCgroupV2DevicesAvailable vircgroup: introduce virCgroupV2DevicesAttachProg vircgroup: introduce virCgroupV2DevicesDetectProg vircgroup: introduce virCgroupV2DevicesCreateProg vircgroup: introduce virCgroupV2DevicesPrepareProg vircgroup: introduce virCgroupV2DevicesRemoveProg vircgroup: introduce virCgroupV2DeviceGetPerms vircgroup: introduce virCgroupV2DevicesGetKey vircgroup: introduce virCgroupV2AllowDevice vircgroup: introduce virCgroupV2DenyDevice vircgroup: introduce virCgroupV2AllowAllDevices vircgroup: introduce virCgroupV2DenyAllDevices vircgroup: workaround devices in hybrid mode vircgroupv2: detech BPF program before removing cgroup vircgroupv2: use dummy process to workaround kernel bug with systemd vircgroupmock: mock virCgroupV2DevicesAvailable configure.ac | 6 + include/libvirt/virterror.h | 1 + src/Makefile.am | 2 + src/libvirt_private.syms | 27 ++ src/lxc/lxc_cgroup.c | 1 + src/qemu/qemu_cgroup.c | 6 +- src/util/Makefile.inc.am | 4 + src/util/virbpf.c | 437 +++++++++++++++++++++ src/util/virbpf.h | 271 +++++++++++++ src/util/vircgroup.c | 19 +- src/util/vircgroup.h | 1 + src/util/vircgroupbackend.h | 3 +- src/util/vircgrouppriv.h | 12 + src/util/vircgroupv1.c | 9 +- src/util/vircgroupv2.c | 119 +++++- src/util/vircgroupv2devices.c | 625 ++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 57 +++ src/util/virerror.c | 1 + src/util/virsystemd.c | 2 +- src/util/virsystemd.h | 2 + tests/vircgroupdata/hybrid.parsed | 2 +- tests/vircgroupmock.c | 7 + tests/vircgrouptest.c | 4 +- 23 files changed, 1608 insertions(+), 10 deletions(-) create mode 100644 src/util/virbpf.c create mode 100644 src/util/virbpf.h create mode 100644 src/util/vircgroupv2devices.c create mode 100644 src/util/vircgroupv2devices.h -- 2.20.1

Show replies by date

Pavel Hrdina

14 Jan 14 Jan

6:47 a.m.

New subject: [libvirt] [PATCH v2 01/17] util: introduce virbpf helpers

In order to implement devices controller with cgroup v2 we need to add support for BPF programs, cgroup v2 doesn't have devices controller. This introduces required helpers wrapping linux syscalls. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- configure.ac | 5 + include/libvirt/virterror.h | 1 + src/libvirt_private.syms | 16 ++ src/util/Makefile.inc.am | 2 + src/util/virbpf.c | 437 ++++++++++++++++++++++++++++++++++++ src/util/virbpf.h | 271 ++++++++++++++++++++++ src/util/virerror.c | 1 + 7 files changed, 733 insertions(+) create mode 100644 src/util/virbpf.c create mode 100644 src/util/virbpf.h diff --git a/configure.ac b/configure.ac index ac52189dff..0b7afece25 100644 --- a/configure.ac +++ b/configure.ac @@ -885,6 +885,11 @@ AC_CHECK_DECLS([clock_serv_t, host_get_clock_service, clock_get_time], #include <mach/mach.h> ]) +# Check if we have new enough kernel to support BPF devices for cgroups v2 +if test "$with_linux" = "yes"; then + AC_CHECK_DECLS([BPF_PROG_QUERY], [], [], [#include <linux/bpf.h>]) +fi + # Check if we need to look for ifconfig if test "$want_ifconfig" = "yes"; then AC_PATH_PROG([IFCONFIG_PATH], [ifconfig]) diff --git a/include/libvirt/virterror.h b/include/libvirt/virterror.h index fbbe2d5624..d47bed4390 100644 --- a/include/libvirt/virterror.h +++ b/include/libvirt/virterror.h @@ -131,6 +131,7 @@ typedef enum { VIR_FROM_PERF = 65, /* Error from perf */ VIR_FROM_LIBSSH = 66, /* Error from libssh connection transport */ VIR_FROM_RESCTRL = 67, /* Error from resource control */ + VIR_FROM_BPF = 68, /* Error from BPF code */ # ifdef VIR_ENUM_SENTINELS VIR_ERR_DOMAIN_LAST diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index c3d6306809..0cff580de2 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1477,6 +1477,22 @@ virBitmapToDataBuf; virBitmapToString; +# util/virbpf.h +virBPFAttachProg; +virBPFCreateMap; +virBPFDeleteElem; +virBPFDetachProg; +virBPFGetMap; +virBPFGetMapInfo; +virBPFGetNextElem; +virBPFGetProg; +virBPFGetProgInfo; +virBPFLoadProg; +virBPFLookupElem; +virBPFQueryProg; +virBPFUpdateElem; + + # util/virbuffer.h virBufferAdd; virBufferAddBuffer; diff --git a/src/util/Makefile.inc.am b/src/util/Makefile.inc.am index 4295babac3..1fd7ad2d43 100644 --- a/src/util/Makefile.inc.am +++ b/src/util/Makefile.inc.am @@ -17,6 +17,8 @@ UTIL_SOURCES = \ util/virauthconfig.h \ util/virbitmap.c \ util/virbitmap.h \ + util/virbpf.c \ + util/virbpf.h \ util/virbuffer.c \ util/virbuffer.h \ util/virperf.c \ diff --git a/src/util/virbpf.c b/src/util/virbpf.c new file mode 100644 index 0000000000..ea4b8c0a4a --- /dev/null +++ b/src/util/virbpf.c @@ -0,0 +1,437 @@ +/* + * virbpf.c: methods for eBPF + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ +#include <config.h> + +#include <sys/syscall.h> + +#include "internal.h" + +#include "virbpf.h" +#include "virerror.h" +#include "virfile.h" +#include "virlog.h" +#include "virstring.h" + +VIR_LOG_INIT("util.bpf"); + +#define VIR_FROM_THIS VIR_FROM_BPF + +#if HAVE_DECL_BPF_PROG_QUERY +int +virBPFCreateMap(unsigned int mapType, + unsigned int keySize, + unsigned int valSize, + unsigned int maxEntries) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_type = mapType; + attr.key_size = keySize; + attr.value_size = valSize; + attr.max_entries = maxEntries; + + return syscall(SYS_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); +} + + +# define LOG_BUF_SIZE (256 * 1024) + +int +virBPFLoadProg(struct bpf_insn *insns, + int progType, + unsigned int insnCnt) +{ + VIR_AUTOFREE(char *) logbuf = NULL; + int progfd = -1; + union bpf_attr attr; + + if (VIR_ALLOC_N(logbuf, LOG_BUF_SIZE) < 0) + return -1; + + memset(&attr, 0, sizeof(attr)); + + attr.prog_type = progType; + attr.insn_cnt = (uint32_t)insnCnt; + attr.insns = (uint64_t)insns; + attr.license = (uint64_t)"GPL"; + attr.log_buf = (uint64_t)logbuf; + attr.log_size = LOG_BUF_SIZE; + attr.log_level = 1; + + progfd = syscall(SYS_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + + if (progfd < 0) + VIR_DEBUG("%s", logbuf); + + return progfd; +} + + +int +virBPFAttachProg(int progfd, + int targetfd, + int attachType) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.target_fd = targetfd; + attr.attach_bpf_fd = progfd; + attr.attach_type = attachType; + + return syscall(SYS_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr)); +} + + +int +virBPFDetachProg(int progfd, + int targetfd, + int attachType) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.target_fd = targetfd; + attr.attach_bpf_fd = progfd; + attr.attach_type = attachType; + + return syscall(SYS_bpf, BPF_PROG_DETACH, &attr, sizeof(attr)); +} + + +int +virBPFQueryProg(int targetfd, + unsigned int maxprogids, + int attachType, + unsigned int *progcnt, + void *progids) +{ + union bpf_attr attr; + int rc; + + memset(&attr, 0, sizeof(attr)); + + attr.query.target_fd = targetfd; + attr.query.attach_type = attachType; + attr.query.prog_cnt = maxprogids; + attr.query.prog_ids = (uint64_t)progids; + + rc = syscall(SYS_bpf, BPF_PROG_QUERY, &attr, sizeof(attr)); + + if (rc >= 0) + *progcnt = attr.query.prog_cnt; + + return rc; +} + + +int +virBPFGetProg(unsigned int id) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.prog_id = id; + + return syscall(SYS_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr)); +} + + +int +virBPFGetProgInfo(int progfd, + struct bpf_prog_info *info, + unsigned int **mapIDs) +{ + union bpf_attr attr; + int rc; + + memset(&attr, 0, sizeof(attr)); + + attr.info.bpf_fd = progfd; + attr.info.info_len = sizeof(struct bpf_prog_info); + attr.info.info = (uint64_t)info; + + rc = syscall(SYS_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); + if (rc < 0) + return rc; + + if (mapIDs && info->nr_map_ids > 0) { + unsigned int maplen = info->nr_map_ids; + VIR_AUTOFREE(unsigned int *) retmapIDs = NULL; + + if (VIR_ALLOC_N(retmapIDs, maplen) < 0) + return -1; + + memset(info, 0, sizeof(struct bpf_prog_info)); + info->nr_map_ids = maplen; + info->map_ids = (uint64_t)retmapIDs; + + memset(&attr, 0, sizeof(attr)); + attr.info.bpf_fd = progfd; + attr.info.info_len = sizeof(struct bpf_prog_info); + attr.info.info = (uint64_t)info; + + rc = syscall(SYS_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); + if (rc < 0) + return rc; + + VIR_STEAL_PTR(*mapIDs, retmapIDs); + } + + return rc; +} + + +int +virBPFGetMap(unsigned int id) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_id = id; + + return syscall(SYS_bpf, BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr)); +} + + +int +virBPFGetMapInfo(int mapfd, + struct bpf_map_info *info) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.info.bpf_fd = mapfd; + attr.info.info_len = sizeof(struct bpf_map_info); + attr.info.info = (uint64_t)info; + + return syscall(SYS_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); +} + + +int +virBPFLookupElem(int mapfd, + void *key, + void *val) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = mapfd; + attr.key = (uint64_t)key; + attr.value = (uint64_t)val; + + return syscall(SYS_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); +} + + +int +virBPFGetNextElem(int mapfd, + void *key, + void *nextKey) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = mapfd; + attr.key = (uint64_t)key; + attr.next_key = (uint64_t)nextKey; + + return syscall(SYS_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); +} + + +int +virBPFUpdateElem(int mapfd, + void *key, + void *val) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = mapfd; + attr.key = (uint64_t)key; + attr.value = (uint64_t)val; + + return syscall(SYS_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); +} + + +int +virBPFDeleteElem(int mapfd, + void *key) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = mapfd; + attr.key = (uint64_t)key; + + return syscall(SYS_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); +} +#else /* HAVE_DECL_BPF_PROG_QUERY */ +int +virBPFCreateMap(unsigned int mapType ATTRIBUTE_UNUSED, + unsigned int keySize ATTRIBUTE_UNUSED, + unsigned int valSize ATTRIBUTE_UNUSED, + unsigned int maxEntries ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFLoadProg(struct bpf_insn *insns ATTRIBUTE_UNUSED, + int progType ATTRIBUTE_UNUSED, + unsigned int insnCnt ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFAttachProg(int progfd ATTRIBUTE_UNUSED, + int targetfd ATTRIBUTE_UNUSED, + int attachType ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFDetachProg(int progfd ATTRIBUTE_UNUSED, + int targetfd ATTRIBUTE_UNUSED, + int attachType ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFQueryProg(int targetfd ATTRIBUTE_UNUSED, + unsigned int maxprogids ATTRIBUTE_UNUSED, + int attachType ATTRIBUTE_UNUSED, + unsigned int *progcnt ATTRIBUTE_UNUSED, + void *progids ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetProg(unsigned int id ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetProgInfo(int progfd ATTRIBUTE_UNUSED, + struct bpf_prog_info *info ATTRIBUTE_UNUSED, + unsigned int **mapIDs ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetMap(unsigned int id ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetMapInfo(int mapfd ATTRIBUTE_UNUSED, + struct bpf_map_info *info ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFLookupElem(int mapfd ATTRIBUTE_UNUSED, + void *key ATTRIBUTE_UNUSED, + void *val ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFGetNextElem(int mapfd ATTRIBUTE_UNUSED, + void *key ATTRIBUTE_UNUSED, + void *nextKey ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFUpdateElem(int mapfd ATTRIBUTE_UNUSED, + void *key ATTRIBUTE_UNUSED, + void *val ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} + + +int +virBPFDeleteElem(int mapfd ATTRIBUTE_UNUSED, + void *key ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("BPF not supported with this kernel")); + return -1; +} +#endif /* HAVE_DECL_BPF_PROG_QUERY */ diff --git a/src/util/virbpf.h b/src/util/virbpf.h new file mode 100644 index 0000000000..0fd6a05388 --- /dev/null +++ b/src/util/virbpf.h @@ -0,0 +1,271 @@ +/* + * virbpf.h: methods for eBPF + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBVIRT_VIRBPF_H +# define LIBVIRT_VIRBPF_H + +# if HAVE_DECL_BPF_PROG_QUERY + +# include <linux/bpf.h> + +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ + +# define VIR_BPF_ALU64_REG(op, dst, src) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(op) | BPF_X, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = 0, \ + .imm = 0, \ + }) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ + +# define VIR_BPF_ALU64_IMM(op, dst, immval) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(op) | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = 0, \ + .imm = immval, \ + }) + +/* Short form of mov, dst_reg = src_reg */ + +# define VIR_BPF_MOV64_REG(dst, src) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = 0, \ + .imm = 0, \ + }) + +/* Short form of mov, dst_reg = imm32 */ + +# define VIR_BPF_MOV64_IMM(dst, immval) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = 0, \ + .imm = immval, \ + }) + +# define VIR_BPF_MOV32_IMM(dst, immval) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = 0, \ + .imm = immval, \ + }) + +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +# define VIR_BPF_LD_IMM64(dst, imm) \ + BPF_LD_IMM64_RAW(dst, 0, imm) + +# define VIR_BPF_LD_IMM64_RAW(dst, src, immval) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = 0, \ + .imm = (uint32_t)immval, \ + }), \ + ((struct bpf_insn) { \ + .code = 0, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((uint64_t)immval) >> 32, \ + }) + +# ifndef VIR_BPF_PSEUDO_MAP_FD +# define VIR_BPF_PSEUDO_MAP_FD 1 +# endif + +/* pseudo VIR_BPF_LD_IMM64 insn used to refer to process-local map_fd */ +# define VIR_BPF_LD_MAP_FD(dst, mapfd) \ + VIR_BPF_LD_IMM64_RAW(dst, VIR_BPF_PSEUDO_MAP_FD, mapfd) + +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ + +# define VIR_BPF_LDX_MEM(size, dst, src, offval) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(size) | BPF_MEM, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = offval, \ + .imm = 0, \ + }) + +/* Memory store, *(uint *) (dst_reg + off16) = src_reg */ + +# define VIR_BPF_STX_MEM(size, dst, src, offval) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(size) | BPF_MEM, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = offval, \ + .imm = 0, \ + }) + +/* Memory store, *(uint *) (dst_reg + off16) = imm32 */ + +# define VIR_BPF_ST_MEM(size, dst, immval, offval) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(size) | BPF_MEM, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = offval, \ + .imm = immval, \ + }) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ + +# define VIR_BPF_JMP_REG(op, dst, src, offval) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(op) | BPF_X, \ + .dst_reg = dst, \ + .src_reg = src, \ + .off = offval, \ + .imm = 0, \ + }) + +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ + +# define VIR_BPF_JMP_IMM(op, dst, immval, offval) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(op) | BPF_K, \ + .dst_reg = dst, \ + .src_reg = 0, \ + .off = offval, \ + .imm = immval, \ + }) + +/* Call eBPF function */ + +# define VIR_BPF_CALL_INSN(func) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = func, \ + }) + +/* Program exit */ + +# define VIR_BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0, \ + }) + +# else /* HAVE_DECL_BPF_PROG_QUERY */ + +struct bpf_prog_info; +struct bpf_map_info; +struct bpf_insn; + +# define VIR_BPF_ALU64_REG(op, dst, src) +# define VIR_BPF_ALU64_IMM(op, dst, immval) +# define VIR_BPF_MOV64_REG(dst, src) +# define VIR_BPF_MOV64_IMM(dst, immval) +# define VIR_BPF_MOV32_IMM(dst, immval) +# define VIR_BPF_LD_IMM64(dst, imm) +# define VIR_BPF_LD_IMM64_RAW(dst, src, immval) +# define VIR_BPF_PSEUDO_MAP_FD +# define VIR_BPF_LD_MAP_FD(dst, mapfd) +# define VIR_BPF_LDX_MEM(size, dst, src, offval) +# define VIR_BPF_STX_MEM(size, dst, src, offval) +# define VIR_BPF_ST_MEM(size, dst, immval, offval) +# define VIR_BPF_JMP_REG(op, dst, src, offval) +# define VIR_BPF_JMP_IMM(op, dst, immval, offval) +# define VIR_BPF_CALL_INSN(func) +# define VIR_BPF_EXIT_INSN() + +# endif /* HAVE_DECL_BPF_PROG_QUERY */ + +int +virBPFCreateMap(unsigned int mapType, + unsigned int keySize, + unsigned int valSize, + unsigned int maxEntries); + +int +virBPFGetMapInfo(int mapfd, + struct bpf_map_info *info); + +int +virBPFLoadProg(struct bpf_insn *insns, + int progType, + unsigned int insnCnt); + +int +virBPFAttachProg(int progfd, + int targetfd, + int attachType); + +int +virBPFDetachProg(int progfd, + int targetfd, + int attachType); + +int +virBPFQueryProg(int targetfd, + unsigned int maxprogids, + int attachType, + unsigned int *progcnt, + void *progids); + +int +virBPFGetProg(unsigned int id); + +int +virBPFGetProgInfo(int progfd, + struct bpf_prog_info *info, + unsigned int **mapIDs); + +int +virBPFGetMap(unsigned int id); + +int +virBPFLookupElem(int mapfd, + void *key, + void *val); + +int +virBPFGetNextElem(int mapfd, + void *key, + void *nextKey); + +int +virBPFUpdateElem(int mapfd, + void *key, + void *val); + +int +virBPFDeleteElem(int mapfd, + void *key); + +#endif /* LIBVIRT_VIRBPF_H */ diff --git a/src/util/virerror.c b/src/util/virerror.c index 61b47d2be0..a40076f8ec 100644 --- a/src/util/virerror.c +++ b/src/util/virerror.c @@ -138,6 +138,7 @@ VIR_ENUM_IMPL(virErrorDomain, VIR_ERR_DOMAIN_LAST, "Perf", /* 65 */ "Libssh transport layer", "Resource control", + "BPF", ) -- 2.20.1

Pavel Hrdina

6:47 a.m.

New subject: [libvirt] [PATCH v2 02/17] vircgroup: introduce virCgroupV2DevicesAvailable

There is no exact way how to figure out whether BPF devices support is compiled into kernel. One way is to check kernel configure options but this is not reliable as it may not be available. Let's try to do syscall to which will list BPF cgroup device programs. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- configure.ac | 3 +- src/Makefile.am | 2 + src/libvirt_private.syms | 3 ++ src/util/Makefile.inc.am | 2 + src/util/vircgroupv2.c | 7 +++- src/util/vircgroupv2devices.c | 73 +++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 27 +++++++++++++ 7 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 src/util/vircgroupv2devices.c create mode 100644 src/util/vircgroupv2devices.h diff --git a/configure.ac b/configure.ac index 0b7afece25..acc0e888ea 100644 --- a/configure.ac +++ b/configure.ac @@ -887,7 +887,8 @@ AC_CHECK_DECLS([clock_serv_t, host_get_clock_service, clock_get_time], # Check if we have new enough kernel to support BPF devices for cgroups v2 if test "$with_linux" = "yes"; then - AC_CHECK_DECLS([BPF_PROG_QUERY], [], [], [#include <linux/bpf.h>]) + AC_CHECK_DECLS([BPF_PROG_QUERY, BPF_CGROUP_DEVICE], + [], [], [#include <linux/bpf.h>]) fi # Check if we need to look for ifconfig diff --git a/src/Makefile.am b/src/Makefile.am index cd386297ed..9d9e50d527 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -672,11 +672,13 @@ libvirt_setuid_rpc_client_la_SOURCES = \ util/viratomic.c \ util/viratomic.h \ util/virbitmap.c \ + util/virbpf.c \ util/virbuffer.c \ util/vircgroup.c \ util/vircgroupbackend.c \ util/vircgroupv1.c \ util/vircgroupv2.c \ + util/vircgroupv2devices.c \ util/vircommand.c \ util/virconf.c \ util/virdbus.c \ diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 0cff580de2..6a822f7d90 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1611,6 +1611,9 @@ virCgroupV1Register; # util/vircgroupv2.h virCgroupV2Register; +# util/vircgroupv2devices.h +virCgroupV2DevicesAvailable; + # util/virclosecallbacks.h virCloseCallbacksGet; virCloseCallbacksGetConn; diff --git a/src/util/Makefile.inc.am b/src/util/Makefile.inc.am index 1fd7ad2d43..6eb9e1b889 100644 --- a/src/util/Makefile.inc.am +++ b/src/util/Makefile.inc.am @@ -31,6 +31,8 @@ UTIL_SOURCES = \ util/vircgroupv1.h \ util/vircgroupv2.c \ util/vircgroupv2.h \ + util/vircgroupv2devices.c \ + util/vircgroupv2devices.h \ util/virclosecallbacks.c \ util/virclosecallbacks.h \ util/vircommand.c \ diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index cd58491da1..a56f3443e3 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -32,6 +32,7 @@ #include "vircgroup.h" #include "vircgroupbackend.h" #include "vircgroupv2.h" +#include "vircgroupv2devices.h" #include "virerror.h" #include "virfile.h" #include "virlog.h" @@ -292,6 +293,8 @@ virCgroupV2DetectControllers(virCgroupPtr group, /* In cgroup v2 there is no cpuacct controller, the cpu.stat file always * exists with usage stats. */ group->unified.controllers |= 1 << VIR_CGROUP_CONTROLLER_CPUACCT; + if (virCgroupV2DevicesAvailable(group)) + group->unified.controllers |= 1 << VIR_CGROUP_CONTROLLER_DEVICES; for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) VIR_DEBUG("Controller '%s' present=%s", @@ -406,8 +409,10 @@ virCgroupV2MakeGroup(virCgroupPtr parent ATTRIBUTE_UNUSED, continue; /* Controllers that are implicitly enabled if available. */ - if (i == VIR_CGROUP_CONTROLLER_CPUACCT) + if (i == VIR_CGROUP_CONTROLLER_CPUACCT || + i == VIR_CGROUP_CONTROLLER_DEVICES) { continue; + } if (virCgroupV2EnableController(parent, i) < 0) return -1; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c new file mode 100644 index 0000000000..10080d4fff --- /dev/null +++ b/src/util/vircgroupv2devices.c @@ -0,0 +1,73 @@ +/* + * vircgroupv2devices.c: methods for cgroups v2 BPF devices + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ +#include <config.h> + +#if HAVE_DECL_BPF_CGROUP_DEVICE +# include <fcntl.h> +# include <linux/bpf.h> +# include <sys/stat.h> +# include <sys/syscall.h> +# include <sys/types.h> +#endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ + +#include "internal.h" + +#define LIBVIRT_VIRCGROUPPRIV_H_ALLOW +#include "vircgrouppriv.h" + +#include "virbpf.h" +#include "vircgroup.h" +#include "vircgroupv2devices.h" +#include "virfile.h" +#include "virlog.h" + +VIR_LOG_INIT("util.cgroup"); + +#define VIR_FROM_THIS VIR_FROM_CGROUP + +#if HAVE_DECL_BPF_CGROUP_DEVICE +bool +virCgroupV2DevicesAvailable(virCgroupPtr group) +{ + bool ret = false; + int cgroupfd = -1; + unsigned int progCnt = 0; + + cgroupfd = open(group->unified.mountPoint, O_RDONLY); + if (cgroupfd < 0) { + VIR_DEBUG("failed to open cgroup '%s'", group->unified.mountPoint); + goto cleanup; + } + + if (virBPFQueryProg(cgroupfd, 0, BPF_CGROUP_DEVICE, &progCnt, NULL) < 0) { + VIR_DEBUG("failed to query cgroup progs"); + goto cleanup; + } + + ret = true; + cleanup: + VIR_FORCE_CLOSE(cgroupfd); + return ret; +} +#else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ +bool +virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + return false; +} +#endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h new file mode 100644 index 0000000000..2ab35681db --- /dev/null +++ b/src/util/vircgroupv2devices.h @@ -0,0 +1,27 @@ +/* + * vircgroupv2devices.h: methods for cgroups v2 BPF devices + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBVIRT_VIRCGROUPV2DEVICES_H +# define LIBVIRT_VIRCGROUPV2DEVICES_H + +# include "vircgroup.h" + +bool +virCgroupV2DevicesAvailable(virCgroupPtr group); + +#endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

Pavel Hrdina

6:47 a.m.

New subject: [libvirt] [PATCH v2 03/17] vircgroup: introduce virCgroupV2DevicesAttachProg

This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel. There are two possible ways how to create BPF program: - One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils. - The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel. Since the program is not too complex we can use the second option. If there is no program, all devices are allowed, if there is some program it is executed and based on the exit status the access is denied for 0 and allowed for 1. Our program will follow these rules: - first it will try to look for the specific key using major and minor to see if there is any rule for that specific device - if there is no specific rule it will try to look for any rule that matches only major of the device - if there is no match with major it will try the same but with minor of the device - as the last attempt it will try to look for rule for all devices and if there is no match it will return 0 to deny that access Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgrouppriv.h | 10 +++ src/util/vircgroupv2devices.c | 140 ++++++++++++++++++++++++++++++++++ src/util/vircgroupv2devices.h | 5 ++ 4 files changed, 156 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 6a822f7d90..3fc91ce207 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1612,6 +1612,7 @@ virCgroupV1Register; virCgroupV2Register; # util/vircgroupv2devices.h +virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; # util/virclosecallbacks.h diff --git a/src/util/vircgrouppriv.h b/src/util/vircgrouppriv.h index a6fb3bb9f8..085fea375c 100644 --- a/src/util/vircgrouppriv.h +++ b/src/util/vircgrouppriv.h @@ -42,10 +42,20 @@ struct _virCgroupV1Controller { typedef struct _virCgroupV1Controller virCgroupV1Controller; typedef virCgroupV1Controller *virCgroupV1ControllerPtr; +struct _virCgroupV2Devices { + int mapfd; + int progfd; + ssize_t count; + ssize_t max; +}; +typedef struct _virCgroupV2Devices virCgroupV2Devices; +typedef virCgroupV2Devices *virCgroupV2DevicesPtr; + struct _virCgroupV2Controller { int controllers; char *mountPoint; char *placement; + virCgroupV2Devices devices; }; typedef struct _virCgroupV2Controller virCgroupV2Controller; typedef virCgroupV2Controller *virCgroupV2ControllerPtr; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index 10080d4fff..0b721a0aad 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -64,10 +64,150 @@ virCgroupV2DevicesAvailable(virCgroupPtr group) VIR_FORCE_CLOSE(cgroupfd); return ret; } + + +static int +virCgroupV2DevicesLoadProg(int mapfd) +{ +# define VIR_CGROUP_BPF_LOOKUP \ + /* prepare key param on stack */ \ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), \ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), \ + /* lookup key (major << 32) | minor in map */ \ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), \ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem) + +# define VIR_CGROUP_BPF_CHECK_PERM \ + /* if no key skip perm check */ \ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), \ + /* get perms from map */ \ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), \ + /* get perms from ctx */ \ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), \ + /* (map perms) & (ctx perms) */ \ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), \ + /* if (map perms) & (ctx perms) == (ctx perms) exit, otherwise continue */ \ + VIR_BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_2, 2), \ + /* set ret 1 and exit */ \ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), \ + VIR_BPF_EXIT_INSN() + + struct bpf_insn prog[] = { + /* save ctx, argument passed to BPF program */ + VIR_BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* get major from ctx and shift << 32 */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* get minor from ctx and | to shifted major */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_6, 8), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((major << 32) | minor) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* get major from ctx and shift << 32 */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* use -1 as minor and | to shifted major */ + VIR_BPF_MOV32_IMM(BPF_REG_3, -1), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((major << 32) | -1) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* use -1 as major and shift << 32 */ + VIR_BPF_MOV32_IMM(BPF_REG_2, -1), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* get minor from ctx and | to shifted major */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_6, 8), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((-1 << 32) | minor) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* use -1 as key which means major = -1 and minor = -1 */ + VIR_BPF_MOV64_IMM(BPF_REG_2, -1), + /* lookup -1 in map and check perms*/ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* no key was found, exit with 0 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 0), + VIR_BPF_EXIT_INSN(), + }; + + return virBPFLoadProg(prog, BPF_PROG_TYPE_CGROUP_DEVICE, ARRAY_CARDINALITY(prog)); +} + + +int +virCgroupV2DevicesAttachProg(virCgroupPtr group, + int mapfd, + size_t max) +{ + int ret = -1; + int progfd = -1; + int cgroupfd = -1; + VIR_AUTOFREE(char *) path = NULL; + + if (virCgroupPathOfController(group, VIR_CGROUP_CONTROLLER_DEVICES, + NULL, &path) < 0) { + goto cleanup; + } + + progfd = virCgroupV2DevicesLoadProg(mapfd); + if (progfd < 0) { + virReportSystemError(errno, "%s", _("failed to load cgroup BPF prog")); + goto cleanup; + } + + cgroupfd = open(path, O_RDONLY); + if (cgroupfd < 0) { + virReportSystemError(errno, _("unable to open '%s'"), path); + goto cleanup; + } + + if (virBPFAttachProg(progfd, cgroupfd, BPF_CGROUP_DEVICE) < 0) { + virReportSystemError(errno, "%s", _("failed to attach cgroup BPF prog")); + goto cleanup; + } + + if (group->unified.devices.progfd > 0) { + VIR_DEBUG("Closing existing program that was replaced by new one."); + VIR_FORCE_CLOSE(group->unified.devices.progfd); + } + + group->unified.devices.progfd = progfd; + group->unified.devices.mapfd = mapfd; + group->unified.devices.max = max; + progfd = -1; + mapfd = -1; + + ret = 0; + cleanup: + VIR_FORCE_CLOSE(cgroupfd); + VIR_FORCE_CLOSE(progfd); + VIR_FORCE_CLOSE(mapfd); + return ret; +} #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) { return false; } + + +int +virCgroupV2DevicesAttachProg(virCgroupPtr group ATTRIBUTE_UNUSED, + int mapfd ATTRIBUTE_UNUSED, + size_t max ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("cgroups v2 BPF devices not supported " + "with this kernel")); + return -1; +} #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */ diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index 2ab35681db..1ba87acb00 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -24,4 +24,9 @@ bool virCgroupV2DevicesAvailable(virCgroupPtr group); +int +virCgroupV2DevicesAttachProg(virCgroupPtr group, + int mapfd, + size_t max); + #endif /* LIBVIRT_VIRCGROUPV2DEVICES_H */ -- 2.20.1

Daniel P. Berrangé

7:43 a.m.

New subject: [libvirt] [PATCH v2 03/17] vircgroup: introduce virCgroupV2DevicesAttachProg

On Mon, Jan 14, 2019 at 04:47:37PM +0100, Pavel Hrdina wrote:

...

This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel.

There are two possible ways how to create BPF program:

- One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils.

- The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel.

Since the program is not too complex we can use the second option.

I'm really not liking this to be honest. Even for this "simple" program, I struggle to understand what the code below is doing. It is basically assembly language, which is inevitably hard to follow even for simple things. I'd like to see us use the BPF C compiler to build the loaded program unless there's a compelling reason why it won't work

...

+ + +static int +virCgroupV2DevicesLoadProg(int mapfd) +{ +# define VIR_CGROUP_BPF_LOOKUP \ + /* prepare key param on stack */ \ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), \ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), \ + /* lookup key (major << 32) | minor in map */ \ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), \ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem) + +# define VIR_CGROUP_BPF_CHECK_PERM \ + /* if no key skip perm check */ \ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), \ + /* get perms from map */ \ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), \ + /* get perms from ctx */ \ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), \ + /* (map perms) & (ctx perms) */ \ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), \ + /* if (map perms) & (ctx perms) == (ctx perms) exit, otherwise continue */ \ + VIR_BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_2, 2), \ + /* set ret 1 and exit */ \ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), \ + VIR_BPF_EXIT_INSN() + + struct bpf_insn prog[] = { + /* save ctx, argument passed to BPF program */ + VIR_BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* get major from ctx and shift << 32 */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* get minor from ctx and | to shifted major */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_6, 8), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((major << 32) | minor) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* get major from ctx and shift << 32 */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* use -1 as minor and | to shifted major */ + VIR_BPF_MOV32_IMM(BPF_REG_3, -1), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((major << 32) | -1) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* use -1 as major and shift << 32 */ + VIR_BPF_MOV32_IMM(BPF_REG_2, -1), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* get minor from ctx and | to shifted major */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_6, 8), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((-1 << 32) | minor) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* use -1 as key which means major = -1 and minor = -1 */ + VIR_BPF_MOV64_IMM(BPF_REG_2, -1), + /* lookup -1 in map and check perms*/ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* no key was found, exit with 0 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 0), + VIR_BPF_EXIT_INSN(), + }; + + return virBPFLoadProg(prog, BPF_PROG_TYPE_CGROUP_DEVICE, ARRAY_CARDINALITY(prog));

Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

Pavel Hrdina

11:58 p.m.

New subject: [libvirt] [PATCH v2 03/17] vircgroup: introduce virCgroupV2DevicesAttachProg

On Mon, Jan 14, 2019 at 04:43:34PM +0000, Daniel P. Berrangé wrote:

...

On Mon, Jan 14, 2019 at 04:47:37PM +0100, Pavel Hrdina wrote:

...
This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel.

There are two possible ways how to create BPF program:

- One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils.

- The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel.

Since the program is not too complex we can use the second option.

I'm really not liking this to be honest. Even for this "simple" program, I struggle to understand what the code below is doing. It is basically assembly language, which is inevitably hard to follow even for simple things.

I'd like to see us use the BPF C compiler to build the loaded program unless there's a compelling reason why it won't work

I personally don't like it as well, but the issue is that for the compilation we would have to use clang/llvm, AFAIK there is no support to compile BPF in GCC. Another issue is the loading part, you need to read the object file, extract correct sections which is not trivial and in order to have different size of maps we would have to modify the loaded data from the compiled object file. I can try to prepare alternative patches for that but it might me even more complex to understand than short assembly program. Pavel

...

...
+ + +static int +virCgroupV2DevicesLoadProg(int mapfd) +{ +# define VIR_CGROUP_BPF_LOOKUP \ + /* prepare key param on stack */ \ + VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), \ + VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \ + VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), \ + /* lookup key (major << 32) | minor in map */ \ + VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), \ + VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem) + +# define VIR_CGROUP_BPF_CHECK_PERM \ + /* if no key skip perm check */ \ + VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), \ + /* get perms from map */ \ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), \ + /* get perms from ctx */ \ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), \ + /* (map perms) & (ctx perms) */ \ + VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), \ + /* if (map perms) & (ctx perms) == (ctx perms) exit, otherwise continue */ \ + VIR_BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_2, 2), \ + /* set ret 1 and exit */ \ + VIR_BPF_MOV64_IMM(BPF_REG_0, 1), \ + VIR_BPF_EXIT_INSN() + + struct bpf_insn prog[] = { + /* save ctx, argument passed to BPF program */ + VIR_BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* get major from ctx and shift << 32 */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* get minor from ctx and | to shifted major */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_6, 8), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((major << 32) | minor) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* get major from ctx and shift << 32 */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* use -1 as minor and | to shifted major */ + VIR_BPF_MOV32_IMM(BPF_REG_3, -1), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((major << 32) | -1) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* use -1 as major and shift << 32 */ + VIR_BPF_MOV32_IMM(BPF_REG_2, -1), + VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + /* get minor from ctx and | to shifted major */ + VIR_BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_6, 8), + VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + /* lookup ((-1 << 32) | minor) in map and check perms */ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* use -1 as key which means major = -1 and minor = -1 */ + VIR_BPF_MOV64_IMM(BPF_REG_2, -1), + /* lookup -1 in map and check perms*/ + VIR_CGROUP_BPF_LOOKUP, + VIR_CGROUP_BPF_CHECK_PERM, + + /* no key was found, exit with 0 */ + VIR_BPF_MOV64_IMM(BPF_REG_0, 0), + VIR_BPF_EXIT_INSN(), + }; + + return virBPFLoadProg(prog, BPF_PROG_TYPE_CGROUP_DEVICE, ARRAY_CARDINALITY(prog));

Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

Daniel P. Berrangé

15 Jan 15 Jan

1:14 a.m.

New subject: [libvirt] [PATCH v2 03/17] vircgroup: introduce virCgroupV2DevicesAttachProg

On Tue, Jan 15, 2019 at 09:58:47AM +0100, Pavel Hrdina wrote:

...

On Mon, Jan 14, 2019 at 04:43:34PM +0000, Daniel P. Berrangé wrote:

...
On Mon, Jan 14, 2019 at 04:47:37PM +0100, Pavel Hrdina wrote:

...
This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel.

There are two possible ways how to create BPF program:

- One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils.

- The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel.

Since the program is not too complex we can use the second option.

I'm really not liking this to be honest. Even for this "simple" program, I struggle to understand what the code below is doing. It is basically assembly language, which is inevitably hard to follow even for simple things.

I'd like to see us use the BPF C compiler to build the loaded program unless there's a compelling reason why it won't work

I personally don't like it as well, but the issue is that for the compilation we would have to use clang/llvm, AFAIK there is no support to compile BPF in GCC. Another issue is the loading part, you need to read the object file, extract correct sections which is not trivial and in order to have different size of maps we would have to modify the loaded data from the compiled object file.

I don't see a problem with using clang for BPF. Reading the DPDK lists, I see you can do something like this: $ clang -I iproute2/include/ -O2 -emit-llvm -c tap_bpf_program.c -o - \ | llc -march=bpf -filetype=obj -o tap_bpf_program.o $ objdump -j l3_l4 -s tap_bpf_program.o > tap_bpf_program.bin where the tap_bpf_program.bin should be the raw instructions to be loaded. Supposedly they also have a script that turns the .bin file into a C byte array which they then compile into their binary, so they can just load the uint8[] directly, instead of from a separate file The matter of dynamically sized device maps is more tricky. Assuming the device map size is basically a list of (major,minor) numbers, that's 2 bytes per entry. We could easily do a fixed size 50 elements giving 100 bytes per guest if that makes it easier. I guess the hard thing is actually getting the dynamic data into the code.

...

I can try to prepare alternative patches for that but it might me even more complex to understand than short assembly program.

Maybe there's a compromise/hybrid that would work simpler ? eg is it possible to structure things so all the checking code is statically compiled, and then we dynamically generate the instructions to define the list of devices and append that to the main static code. ? Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

Pavel Hrdina

3:07 a.m.

New subject: [libvirt] [PATCH v2 03/17] vircgroup: introduce virCgroupV2DevicesAttachProg

On Tue, Jan 15, 2019 at 10:14:33AM +0000, Daniel P. Berrangé wrote:

...

On Tue, Jan 15, 2019 at 09:58:47AM +0100, Pavel Hrdina wrote:

...
On Mon, Jan 14, 2019 at 04:43:34PM +0000, Daniel P. Berrangé wrote:

...
On Mon, Jan 14, 2019 at 04:47:37PM +0100, Pavel Hrdina wrote:

...
This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel.

There are two possible ways how to create BPF program:

- One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils.

- The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel.

Since the program is not too complex we can use the second option.

I'm really not liking this to be honest. Even for this "simple" program, I struggle to understand what the code below is doing. It is basically assembly language, which is inevitably hard to follow even for simple things.

I'd like to see us use the BPF C compiler to build the loaded program unless there's a compelling reason why it won't work

I personally don't like it as well, but the issue is that for the compilation we would have to use clang/llvm, AFAIK there is no support to compile BPF in GCC. Another issue is the loading part, you need to read the object file, extract correct sections which is not trivial and in order to have different size of maps we would have to modify the loaded data from the compiled object file.

I don't see a problem with using clang for BPF. Reading the DPDK lists, I see you can do something like this:

$ clang -I iproute2/include/ -O2 -emit-llvm -c tap_bpf_program.c -o - \ | llc -march=bpf -filetype=obj -o tap_bpf_program.o $ objdump -j l3_l4 -s tap_bpf_program.o > tap_bpf_program.bin

where the tap_bpf_program.bin should be the raw instructions to be loaded.

There is no technical problem with using clang for BPF, the only issue that I have is that we will require clang to compile libvirt from source code.

...

Supposedly they also have a script that turns the .bin file into a C byte array which they then compile into their binary, so they can just load the uint8[] directly, instead of from a separate file

This is not an issue, there is a sample code in kernel that uses elfutils to load the bpf.obj file into C byte array, just different method how the get the list of instructions.

...

The matter of dynamically sized device maps is more tricky. Assuming the device map size is basically a list of (major,minor) numbers, that's 2 bytes per entry. We could easily do a fixed size 50 elements giving 100 bytes per guest if that makes it easier. I guess the hard thing is actually getting the dynamic data into the code.

50 elements per guest will not be good enough. Yes, in most cases it's perfectly OK, but there are some users having hundreds of disks inside a guest and we would break their usage and since we don't have any written limitation of how many disk/devices can be inside a guest we have to be able to store unlimited number of devices into the map until we run out of memory.

...

...
I can try to prepare alternative patches for that but it might me even more complex to understand than short assembly program.

Maybe there's a compromise/hybrid that would work simpler ? eg is it possible to structure things so all the checking code is statically compiled, and then we dynamically generate the instructions to define the list of devices and append that to the main static code. ?

This is not an issue, the list of devices is stored into the BPF map from user-space (libvirt) and the BPF map is used by the BPF program in the kernel. The limits are that you cannot reallocate the BPF map (this is solved by creating new program with bigger map) and you cannot change the BPF map FD while the program is loaded into kernel. Pavel

...

Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

Pavel Hrdina

24 Jan 24 Jan

7:21 a.m.

New subject: [libvirt] [PATCH v2 03/17] vircgroup: introduce virCgroupV2DevicesAttachProg

On Tue, Jan 15, 2019 at 01:07:55PM +0100, Pavel Hrdina wrote:

...

On Tue, Jan 15, 2019 at 10:14:33AM +0000, Daniel P. Berrangé wrote:

...
On Tue, Jan 15, 2019 at 09:58:47AM +0100, Pavel Hrdina wrote:

...
On Mon, Jan 14, 2019 at 04:43:34PM +0000, Daniel P. Berrangé wrote:

...
On Mon, Jan 14, 2019 at 04:47:37PM +0100, Pavel Hrdina wrote:

...
This function loads the BPF prog with prepared map into kernel and attaches it into guest cgroup. It can be also used to replace existing program in the cgroup if we need to resize BPF map to store more rules for devices. The old program will be closed and removed from kernel.

There are two possible ways how to create BPF program:

- One way is to write simple C-like code which can by compiled into BPF object file which can be loaded into kernel using elfutils.

- The second way is to define macros which looks like assembler instructions and can be used directly to create BPF program that can be directly loaded into kernel.

Since the program is not too complex we can use the second option.

I'm really not liking this to be honest. Even for this "simple" program, I struggle to understand what the code below is doing. It is basically assembly language, which is inevitably hard to follow even for simple things.

I'd like to see us use the BPF C compiler to build the loaded program unless there's a compelling reason why it won't work

I personally don't like it as well, but the issue is that for the compilation we would have to use clang/llvm, AFAIK there is no support to compile BPF in GCC. Another issue is the loading part, you need to read the object file, extract correct sections which is not trivial and in order to have different size of maps we would have to modify the loaded data from the compiled object file.

I don't see a problem with using clang for BPF. Reading the DPDK lists, I see you can do something like this:

$ clang -I iproute2/include/ -O2 -emit-llvm -c tap_bpf_program.c -o - \ | llc -march=bpf -filetype=obj -o tap_bpf_program.o $ objdump -j l3_l4 -s tap_bpf_program.o > tap_bpf_program.bin

where the tap_bpf_program.bin should be the raw instructions to be loaded.

There is no technical problem with using clang for BPF, the only issue that I have is that we will require clang to compile libvirt from source code.

...
Supposedly they also have a script that turns the .bin file into a C byte array which they then compile into their binary, so they can just load the uint8[] directly, instead of from a separate file

This is not an issue, there is a sample code in kernel that uses elfutils to load the bpf.obj file into C byte array, just different method how the get the list of instructions.

...
The matter of dynamically sized device maps is more tricky. Assuming the device map size is basically a list of (major,minor) numbers, that's 2 bytes per entry. We could easily do a fixed size 50 elements giving 100 bytes per guest if that makes it easier. I guess the hard thing is actually getting the dynamic data into the code.

50 elements per guest will not be good enough. Yes, in most cases it's perfectly OK, but there are some users having hundreds of disks inside a guest and we would break their usage and since we don't have any written limitation of how many disk/devices can be inside a guest we have to be able to store unlimited number of devices into the map until we run out of memory.

...
...
I can try to prepare alternative patches for that but it might me even more complex to understand than short assembly program.

Maybe there's a compromise/hybrid that would work simpler ? eg is it possible to structure things so all the checking code is statically compiled, and then we dynamically generate the instructions to define the list of devices and append that to the main static code. ?

This compromise would not work. The dynamic data are stored in the BPF map and the program uses only the map FD which is hardcoded into the program itself. The map FD is placed into correct register before we call bpf_map_lookup function. The only dynamic part of the program is the map FD which is used on 4 places in the program. I was also looking into the possibility of using elfutils as kernel is using in one of the samples and that feels more complex than this simple assembler like program. We would have to open the binary file generated by clang, load map data and instruction data, update the map size in the loaded map data and do syscall in order to create the map, get the map FD returned by syscall, inject that FD into the loaded instructions and do syscall to create the program. This IMHO sounds more complex and than the "simple" assembler like program. As a compromise we could have the C code as a comment in our source files right before the actual assembler like version with instructions how to use clang and llvm-objdump to get the assembler instructions. That's what I've done when I was implementing the program. The only optimization was to add two macros and change some jump logic but we can remove that optimization and make it 1:1 to what we get from llvm-objdump. Pavel

...

This is not an issue, the list of devices is stored into the BPF map from user-space (libvirt) and the BPF map is used by the BPF program in the kernel. The limits are that you cannot reallocate the BPF map (this is solved by creating new program with bigger map) and you cannot change the BPF map FD while the program is loaded into kernel.

Pavel

Pavel Hrdina

14 Jan 14 Jan

6:47 a.m.

New subject: [libvirt] [PATCH v2 16/17] vircgroupv2: use dummy process to workaround kernel bug with systemd

If some program is attached to cgroup and that cgroup is removed before detaching the program it might not be freed and will remain in the system until it's rebooted. This would not be that big deal to workaround if we wouldn't use machined to track our guests. Systemd tries to be the nice guy and if there is no process attached to TransientUnit it will destroy the unit and remove the cgroup from system which will hit the kernel bug. In order to prevent this behavior of systemd we can move another process into the guest cgroup and the cgroup will remain active even if we kill qemu process while destroying guest. We can then detach our BPF program from that cgroup and call machined terminate API to cleanup the cgroup. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/lxc/lxc_cgroup.c | 1 + src/qemu/qemu_cgroup.c | 6 ++- src/util/vircgroup.c | 16 ++++++++ src/util/vircgroup.h | 1 + src/util/vircgrouppriv.h | 2 + src/util/vircgroupv2.c | 2 + src/util/vircgroupv2devices.c | 70 ++++++++++++++++++++++++++++++++++- src/util/virsystemd.c | 2 +- src/util/virsystemd.h | 2 + 10 files changed, 99 insertions(+), 4 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index f1da5ede71..c22c81cebe 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -3031,6 +3031,7 @@ virSystemdCanHybridSleep; virSystemdCanSuspend; virSystemdCreateMachine; virSystemdGetMachineNameByPID; +virSystemdHasMachined; virSystemdHasMachinedResetCachedValue; virSystemdMakeScopeName; virSystemdMakeSliceName; diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index d93a19d684..a8cef74bb9 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -455,6 +455,7 @@ virCgroupPtr virLXCCgroupCreate(virDomainDefPtr def, nnicindexes, nicindexes, def->resource->partition, -1, + LXC_STATE_DIR, &cgroup) < 0) goto cleanup; diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 4931fb6575..9374a799a1 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -946,6 +946,7 @@ qemuInitCgroup(virDomainObjPtr vm, nnicindexes, nicindexes, vm->def->resource->partition, cfg->cgroupControllers, + cfg->stateDir, &priv->cgroup) < 0) { if (virCgroupNewIgnoreError()) goto done; @@ -1256,16 +1257,19 @@ int qemuRemoveCgroup(virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; + int rc = 0; if (priv->cgroup == NULL) return 0; /* Not supported, so claim success */ + rc = virCgroupRemove(priv->cgroup); + if (virCgroupTerminateMachine(priv->machineName) < 0) { if (!virCgroupNewIgnoreError()) VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name); } - return virCgroupRemove(priv->cgroup); + return rc; } diff --git a/src/util/vircgroup.c b/src/util/vircgroup.c index 9d284e9bf4..506cc49b9a 100644 --- a/src/util/vircgroup.c +++ b/src/util/vircgroup.c @@ -1105,6 +1105,7 @@ virCgroupNewMachineSystemd(const char *name, int *nicindexes, const char *partition, int controllers, + const char *stateDir, virCgroupPtr *group) { int rv; @@ -1151,6 +1152,16 @@ virCgroupNewMachineSystemd(const char *name, return -1; } + if (VIR_STRDUP((*group)->stateDir, stateDir) < 0) { + virCgroupFree(group); + return -1; + } + + if (VIR_STRDUP((*group)->vmName, name) < 0) { + virCgroupFree(group); + return -1; + } + if (virCgroupAddProcess(*group, pidleader) < 0) { virErrorPtr saved = virSaveLastError(); virCgroupRemove(*group); @@ -1233,6 +1244,7 @@ virCgroupNewMachine(const char *name, int *nicindexes, const char *partition, int controllers, + const char *stateDir, virCgroupPtr *group) { int rv; @@ -1249,6 +1261,7 @@ virCgroupNewMachine(const char *name, nicindexes, partition, controllers, + stateDir, group)) == 0) return 0; @@ -1301,6 +1314,8 @@ virCgroupFree(virCgroupPtr *group) VIR_FREE((*group)->unified.placement); VIR_FREE((*group)->path); + VIR_FREE((*group)->stateDir); + VIR_FREE((*group)->vmName); VIR_FREE(*group); } @@ -2897,6 +2912,7 @@ virCgroupNewMachine(const char *name ATTRIBUTE_UNUSED, int *nicindexes ATTRIBUTE_UNUSED, const char *partition ATTRIBUTE_UNUSED, int controllers ATTRIBUTE_UNUSED, + const char *stateDir ATTRIBUTE_UNUSED, virCgroupPtr *group ATTRIBUTE_UNUSED) { virReportSystemError(ENXIO, "%s", diff --git a/src/util/vircgroup.h b/src/util/vircgroup.h index 372009de4a..0cd90d3651 100644 --- a/src/util/vircgroup.h +++ b/src/util/vircgroup.h @@ -98,6 +98,7 @@ int virCgroupNewMachine(const char *name, int *nicindexes, const char *partition, int controllers, + const char *stateDir, virCgroupPtr *group) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_NONNULL(3); diff --git a/src/util/vircgrouppriv.h b/src/util/vircgrouppriv.h index 085fea375c..e0ccce88a0 100644 --- a/src/util/vircgrouppriv.h +++ b/src/util/vircgrouppriv.h @@ -62,6 +62,8 @@ typedef virCgroupV2Controller *virCgroupV2ControllerPtr; struct _virCgroup { char *path; + char *stateDir; + char *vmName; virCgroupBackendPtr backends[VIR_CGROUP_BACKEND_TYPE_LAST]; diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index 44c4c7fc90..03efabe198 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -37,6 +37,8 @@ #include "virerror.h" #include "virfile.h" #include "virlog.h" +#include "virpidfile.h" +#include "virprocess.h" #include "virstring.h" #include "virsystemd.h" diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index a481ba89b7..9d8fc5557e 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -31,10 +31,15 @@ #include "vircgrouppriv.h" #include "virbpf.h" +#include "vircommand.h" #include "vircgroup.h" #include "vircgroupv2devices.h" #include "virfile.h" #include "virlog.h" +#include "virpidfile.h" +#include "virprocess.h" +#include "virstring.h" +#include "virsystemd.h" VIR_LOG_INIT("util.cgroup"); @@ -365,17 +370,74 @@ virCgroupV2DevicesReallocMap(int mapfd, } +static pid_t +virCgroupV2DevicesStartDummyProc(virCgroupPtr group) +{ + pid_t ret = -1; + pid_t pid = -1; + virCommandPtr cmd = NULL; + VIR_AUTOFREE(char *) pidfile = NULL; + VIR_AUTOFREE(char *) fileName = NULL; + + if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0) + return -1; + + pidfile = virPidFileBuildPath(group->stateDir, fileName); + if (!pidfile) + return -1; + + cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null", + "-s", "3600", NULL); + if (!cmd) + return -1; + + virCommandDaemonize(cmd); + virCommandSetPidFile(cmd, pidfile); + + if (virCommandRun(cmd, NULL) < 0) + goto cleanup; + + if (virPidFileReadPath(pidfile, &pid) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to start dummy cgroup helper")); + goto cleanup; + } + + if (virCgroupAddMachineProcess(group, pid) < 0) + goto cleanup; + + ret = pid; + cleanup: + if (ret < 0) { + virCommandAbort(cmd); + if (pid > 0) + virProcessKillPainfully(pid, true); + } + if (pidfile) + unlink(pidfile); + virCommandFree(cmd); + return ret; +} + + int virCgroupV2DevicesCreateProg(virCgroupPtr group) { - int mapfd; + int mapfd = -1; + pid_t pid = -1; if (group->unified.devices.progfd > 0 && group->unified.devices.mapfd > 0) return 0; + if (virSystemdHasMachined() == 0) { + pid = virCgroupV2DevicesStartDummyProc(group); + if (pid < 0) + return -1; + } + mapfd = virCgroupV2DevicesCreateMap(VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE); if (mapfd < 0) - return -1; + goto error; if (virCgroupV2DevicesAttachProg(group, mapfd, VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE) < 0) { @@ -385,6 +447,7 @@ virCgroupV2DevicesCreateProg(virCgroupPtr group) return 0; error: + virProcessKillPainfully(pid, true); VIR_FORCE_CLOSE(mapfd); return -1; } @@ -453,6 +516,9 @@ virCgroupV2DevicesRemoveProg(virCgroupPtr group) if (group->unified.devices.progfd >= 0) VIR_FORCE_CLOSE(group->unified.devices.progfd); + if (virSystemdHasMachined() == 0) + virCgroupKillPainfully(group); + ret = 0; cleanup: diff --git a/src/util/virsystemd.c b/src/util/virsystemd.c index f492ac1859..ca35b12a5c 100644 --- a/src/util/virsystemd.c +++ b/src/util/virsystemd.c @@ -138,7 +138,7 @@ void virSystemdHasMachinedResetCachedValue(void) * -1 = error * 0 = machine1 is available */ -static int +int virSystemdHasMachined(void) { int ret; diff --git a/src/util/virsystemd.h b/src/util/virsystemd.h index 7d9c0ebd62..553d4196f1 100644 --- a/src/util/virsystemd.h +++ b/src/util/virsystemd.h @@ -51,4 +51,6 @@ int virSystemdCanHybridSleep(bool *result); char *virSystemdGetMachineNameByPID(pid_t pid); +int virSystemdHasMachined(void); + #endif /* LIBVIRT_VIRSYSTEMD_H */ -- 2.20.1

Michal Privoznik

24 Jan 24 Jan

1:44 a.m.

New subject: [libvirt] [PATCH v2 16/17] vircgroupv2: use dummy process to workaround kernel bug with systemd

On 1/14/19 4:47 PM, Pavel Hrdina wrote:

...

If some program is attached to cgroup and that cgroup is removed before detaching the program it might not be freed and will remain in the system until it's rebooted.

This would not be that big deal to workaround if we wouldn't use machined to track our guests. Systemd tries to be the nice guy and if there is no process attached to TransientUnit it will destroy the unit and remove the cgroup from system which will hit the kernel bug.

In order to prevent this behavior of systemd we can move another process into the guest cgroup and the cgroup will remain active even if we kill qemu process while destroying guest. We can then detach our BPF program from that cgroup and call machined terminate API to cleanup the cgroup.

Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/libvirt_private.syms | 1 + src/lxc/lxc_cgroup.c | 1 + src/qemu/qemu_cgroup.c | 6 ++- src/util/vircgroup.c | 16 ++++++++ src/util/vircgroup.h | 1 + src/util/vircgrouppriv.h | 2 + src/util/vircgroupv2.c | 2 + src/util/vircgroupv2devices.c | 70 ++++++++++++++++++++++++++++++++++- src/util/virsystemd.c | 2 +- src/util/virsystemd.h | 2 + 10 files changed, 99 insertions(+), 4 deletions(-)

...

+static pid_t +virCgroupV2DevicesStartDummyProc(virCgroupPtr group) +{ + pid_t ret = -1; + pid_t pid = -1; + virCommandPtr cmd = NULL; + VIR_AUTOFREE(char *) pidfile = NULL; + VIR_AUTOFREE(char *) fileName = NULL; + + if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0) + return -1; + + pidfile = virPidFileBuildPath(group->stateDir, fileName); + if (!pidfile) + return -1; + + cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null", + "-s", "3600", NULL); + if (!cmd) + return -1; + + virCommandDaemonize(cmd); + virCommandSetPidFile(cmd, pidfile); + + if (virCommandRun(cmd, NULL) < 0) + goto cleanup; + + if (virPidFileReadPath(pidfile, &pid) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to start dummy cgroup helper")); + goto cleanup; + } + + if (virCgroupAddMachineProcess(group, pid) < 0) + goto cleanup; + + ret = pid; + cleanup: + if (ret < 0) { + virCommandAbort(cmd); + if (pid > 0) + virProcessKillPainfully(pid, true); + } + if (pidfile) + unlink(pidfile); + virCommandFree(cmd); + return ret; +}

I don't think we should bother with this. This is clearly a kernel bug and once fixed (which we can't detect) we would still be placing this 'tail' into the cgroup for nothing. I know we have lots of workarounds for bugs in other libraries (and even probably kernels too) in our code, but we should start getting rid of those instead of introducing new ones. Michal

Daniel P. Berrangé

2:06 a.m.

New subject: [libvirt] [PATCH v2 16/17] vircgroupv2: use dummy process to workaround kernel bug with systemd

On Mon, Jan 14, 2019 at 04:47:50PM +0100, Pavel Hrdina wrote:

...

If some program is attached to cgroup and that cgroup is removed before detaching the program it might not be freed and will remain in the system until it's rebooted.

This would not be that big deal to workaround if we wouldn't use machined to track our guests. Systemd tries to be the nice guy and if there is no process attached to TransientUnit it will destroy the unit and remove the cgroup from system which will hit the kernel bug.

In order to prevent this behavior of systemd we can move another process into the guest cgroup and the cgroup will remain active even if we kill qemu process while destroying guest. We can then detach our BPF program from that cgroup and call machined terminate API to cleanup the cgroup.

The kernel bugzilla says that "bpftool" still reports existance of the BPF program when this situation hits. So can't we just use bpftool to explicitly delete this orphaned program

...

+static pid_t +virCgroupV2DevicesStartDummyProc(virCgroupPtr group) +{ + pid_t ret = -1; + pid_t pid = -1; + virCommandPtr cmd = NULL; + VIR_AUTOFREE(char *) pidfile = NULL; + VIR_AUTOFREE(char *) fileName = NULL; + + if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0) + return -1; + + pidfile = virPidFileBuildPath(group->stateDir, fileName); + if (!pidfile) + return -1; + + cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null", + "-s", "3600", NULL); + if (!cmd) + return -1; + + virCommandDaemonize(cmd); + virCommandSetPidFile(cmd, pidfile); + + if (virCommandRun(cmd, NULL) < 0) + goto cleanup; + + if (virPidFileReadPath(pidfile, &pid) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to start dummy cgroup helper")); + goto cleanup; + } + + if (virCgroupAddMachineProcess(group, pid) < 0) + goto cleanup; + + ret = pid; + cleanup: + if (ret < 0) { + virCommandAbort(cmd); + if (pid > 0) + virProcessKillPainfully(pid, true); + } + if (pidfile) + unlink(pidfile); + virCommandFree(cmd); + return ret; +}

I think this is really horrible to be honest. If there's really no other option, then I'd prefer that we do not use BPF at all on known broken kernel versions until it is fixed. Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

Pavel Hrdina

3:24 a.m.

New subject: [libvirt] [PATCH v2 16/17] vircgroupv2: use dummy process to workaround kernel bug with systemd

On Thu, Jan 24, 2019 at 11:06:51AM +0000, Daniel P. Berrangé wrote:

...

On Mon, Jan 14, 2019 at 04:47:50PM +0100, Pavel Hrdina wrote:

...
If some program is attached to cgroup and that cgroup is removed before detaching the program it might not be freed and will remain in the system until it's rebooted.

This would not be that big deal to workaround if we wouldn't use machined to track our guests. Systemd tries to be the nice guy and if there is no process attached to TransientUnit it will destroy the unit and remove the cgroup from system which will hit the kernel bug.

In order to prevent this behavior of systemd we can move another process into the guest cgroup and the cgroup will remain active even if we kill qemu process while destroying guest. We can then detach our BPF program from that cgroup and call machined terminate API to cleanup the cgroup.

The kernel bugzilla says that "bpftool" still reports existance of the BPF program when this situation hits. So can't we just use bpftool to explicitly delete this orphaned program

Unfortunately no, bpftool doesn't have any way how to delete any programs.

...

...
+static pid_t +virCgroupV2DevicesStartDummyProc(virCgroupPtr group) +{ + pid_t ret = -1; + pid_t pid = -1; + virCommandPtr cmd = NULL; + VIR_AUTOFREE(char *) pidfile = NULL; + VIR_AUTOFREE(char *) fileName = NULL; + + if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0) + return -1; + + pidfile = virPidFileBuildPath(group->stateDir, fileName); + if (!pidfile) + return -1; + + cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null", + "-s", "3600", NULL); + if (!cmd) + return -1; + + virCommandDaemonize(cmd); + virCommandSetPidFile(cmd, pidfile); + + if (virCommandRun(cmd, NULL) < 0) + goto cleanup; + + if (virPidFileReadPath(pidfile, &pid) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to start dummy cgroup helper")); + goto cleanup; + } + + if (virCgroupAddMachineProcess(group, pid) < 0) + goto cleanup; + + ret = pid; + cleanup: + if (ret < 0) { + virCommandAbort(cmd); + if (pid > 0) + virProcessKillPainfully(pid, true); + } + if (pidfile) + unlink(pidfile); + virCommandFree(cmd); + return ret; +}

I think this is really horrible to be honest.

If there's really no other option, then I'd prefer that we do not use BPF at all on known broken kernel versions until it is fixed.

I agree with you and Michal, it's ugly and most likely the only solution with broken kernel and systemd. I wrote the patch to show how we could workaround the kernel bug and I was hoping that we will agree on not using it. Another possible workaround would be to not use machined and create the systemd unit by other APIs, that way systemd would not remove the cgroup and we would be able to detach programs before removing cgroups. Or just simply wait until kernel fixes the issue. Pavel

Pavel Hrdina

14 Jan 14 Jan

6:47 a.m.

New subject: [libvirt] [PATCH v2 17/17] vircgroupmock: mock virCgroupV2DevicesAvailable

We need to mock virCgroupV2DevicesAvailable() in order to remove any dependency on kernel as BPF devices might not be available. Signed-off-by: Pavel Hrdina <phrdina@redhat.com> --- src/util/vircgroupv2devices.h | 5 ++++- tests/vircgroupdata/hybrid.parsed | 2 +- tests/vircgroupmock.c | 7 +++++++ tests/vircgrouptest.c | 4 ++-- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h index d717c9755f..5fc35942ef 100644 --- a/src/util/vircgroupv2devices.h +++ b/src/util/vircgroupv2devices.h @@ -21,10 +21,13 @@ # include <sys/types.h> +# include "internal.h" + # include "vircgroup.h" bool -virCgroupV2DevicesAvailable(virCgroupPtr group); +virCgroupV2DevicesAvailable(virCgroupPtr group) + ATTRIBUTE_NOINLINE; int virCgroupV2DevicesAttachProg(virCgroupPtr group, diff --git a/tests/vircgroupdata/hybrid.parsed b/tests/vircgroupdata/hybrid.parsed index 7600de5f45..f755eed465 100644 --- a/tests/vircgroupdata/hybrid.parsed +++ b/tests/vircgroupdata/hybrid.parsed @@ -2,7 +2,7 @@ cpu <null> cpuacct <null> cpuset /not/really/sys/fs/cgroup/cpuset memory <null> -devices /not/really/sys/fs/cgroup/devices +devices <null> freezer /not/really/sys/fs/cgroup/freezer blkio <null> net_cls /not/really/sys/fs/cgroup/net_cls diff --git a/tests/vircgroupmock.c b/tests/vircgroupmock.c index 06bd0a5f29..97f98e91ee 100644 --- a/tests/vircgroupmock.c +++ b/tests/vircgroupmock.c @@ -34,6 +34,7 @@ # include "testutilslxc.h" # include "virstring.h" # include "virfile.h" +# include "vircgroupv2devices.h" static int (*real_open)(const char *path, int flags, ...); static FILE *(*real_fopen)(const char *path, const char *mode); @@ -701,6 +702,12 @@ int open(const char *path, int flags, ...) free(newpath); return ret; } + +bool +virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED) +{ + return true; +} #else /* Nothing to override on non-__linux__ platforms */ #endif diff --git a/tests/vircgrouptest.c b/tests/vircgrouptest.c index 20f4c57b04..4c1f53d924 100644 --- a/tests/vircgrouptest.c +++ b/tests/vircgrouptest.c @@ -587,6 +587,7 @@ static int testCgroupNewForSelfUnified(const void *args ATTRIBUTE_UNUSED) (1 << VIR_CGROUP_CONTROLLER_CPU) | (1 << VIR_CGROUP_CONTROLLER_CPUACCT) | (1 << VIR_CGROUP_CONTROLLER_MEMORY) | + (1 << VIR_CGROUP_CONTROLLER_DEVICES) | (1 << VIR_CGROUP_CONTROLLER_BLKIO); if (virCgroupNewSelf(&cgroup) < 0) { @@ -609,14 +610,12 @@ static int testCgroupNewForSelfHybrid(const void *args ATTRIBUTE_UNUSED) const char *empty[VIR_CGROUP_CONTROLLER_LAST] = { 0 }; const char *mounts[VIR_CGROUP_CONTROLLER_LAST] = { [VIR_CGROUP_CONTROLLER_CPUSET] = "/not/really/sys/fs/cgroup/cpuset", - [VIR_CGROUP_CONTROLLER_DEVICES] = "/not/really/sys/fs/cgroup/devices", [VIR_CGROUP_CONTROLLER_FREEZER] = "/not/really/sys/fs/cgroup/freezer", [VIR_CGROUP_CONTROLLER_NET_CLS] = "/not/really/sys/fs/cgroup/net_cls", [VIR_CGROUP_CONTROLLER_PERF_EVENT] = "/not/really/sys/fs/cgroup/perf_event", }; const char *placement[VIR_CGROUP_CONTROLLER_LAST] = { [VIR_CGROUP_CONTROLLER_CPUSET] = "/", - [VIR_CGROUP_CONTROLLER_DEVICES] = "/", [VIR_CGROUP_CONTROLLER_FREEZER] = "/", [VIR_CGROUP_CONTROLLER_NET_CLS] = "/", [VIR_CGROUP_CONTROLLER_PERF_EVENT] = "/", @@ -625,6 +624,7 @@ static int testCgroupNewForSelfHybrid(const void *args ATTRIBUTE_UNUSED) (1 << VIR_CGROUP_CONTROLLER_CPU) | (1 << VIR_CGROUP_CONTROLLER_CPUACCT) | (1 << VIR_CGROUP_CONTROLLER_MEMORY) | + (1 << VIR_CGROUP_CONTROLLER_DEVICES) | (1 << VIR_CGROUP_CONTROLLER_BLKIO); if (virCgroupNewSelf(&cgroup) < 0) { -- 2.20.1

2595

Age (days ago)

2605

Last active (days ago)

List overview

Download

25 comments

3 participants

participants (3)

Daniel P. Berrangé
Michal Privoznik
Pavel Hrdina

[libvirt] [PATCH v2 00/17] implement cgroups v2 devices support

tags

participants (3)