On Thu, Apr 25, 2019 at 09:44:20AM +0200, Pavel Hrdina wrote:
This function loads the BPF prog with the prepared map into the kernel and
attaches it to the guest cgroup. It can also be used to replace an existing
program in the cgroup if we need to resize the BPF map to store more rules
for devices. The old program will be closed and removed from the kernel.
There are two possible ways to create a BPF program:
- One way is to write simple C-like code which can be compiled into a
BPF object file, which can then be loaded into the kernel using elfutils.
- The second way is to define macros which looks like assembler
s/looks/look/
instructions and can be used directly to create a BPF program that
can be directly loaded into the kernel.
Since the program is not too complex, we can use the second option.
I can live with that, since it saves us the dependency on clang,
and it probably won't require many changes, but please include all
the steps necessary to regenerate it (see below).
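For orientation, a minimal sketch of the macro style under discussion, not
part of the patch itself: an "allow everything" program built from the same
VIR_BPF_* macros and loaded the same way as the real program further down.
The includes and the helper name loadAllowAllProg are illustrative
assumptions; the macros, virBPFLoadProg and ARRAY_CARDINALITY are the ones
used later in this patch.

    #include <linux/bpf.h>   /* struct bpf_insn, BPF_REG_0, BPF_PROG_TYPE_CGROUP_DEVICE */
    #include "internal.h"    /* ARRAY_CARDINALITY (assumed location) */
    #include "virbpf.h"      /* VIR_BPF_* macros, virBPFLoadProg */

    /* Hypothetical helper, for illustration only: returning 1 from a
     * cgroup device program means the access is allowed, so this
     * two-instruction program allows every device. */
    static int
    loadAllowAllProg(void)
    {
        struct bpf_insn prog[] = {
            /* r0 = 1 */
            VIR_BPF_MOV64_IMM(BPF_REG_0, 1),
            /* exit */
            VIR_BPF_EXIT_INSN(),
        };

        return virBPFLoadProg(prog, BPF_PROG_TYPE_CGROUP_DEVICE,
                              ARRAY_CARDINALITY(prog));
    }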
If there is no program, all devices are allowed; if there is a program,
it is executed and, based on its exit status, the access is denied for 0
and allowed for 1.
Our program will follow these rules (a key-encoding sketch follows the list):
- first it will look for a specific key built from both major and
minor to see if there is any rule for that specific device
- if there is no specific rule it will look for any rule that
matches only the major of the device
- if there is no match for the major it will try the same but with
the minor of the device
- as the last attempt it will look for a rule covering all devices,
and if there is no match it will return 0 to deny the access
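To make the lookup order concrete, this is how the 64-bit keys end up being
encoded in the C program quoted in the patch below: the major number sits in
the high 32 bits, the minor number in the low 32 bits, and 0xffffffff acts
as the wildcard. A sketch (the variable names are illustrative; major and
minor stand for ctx->major and ctx->minor):

    __u64 keyExact = ((__u64)major << 32) | minor;       /* specific device */
    __u64 keyMajor = ((__u64)major << 32) | 0xffffffff;  /* any minor */
    __u64 keyMinor = 0xffffffff00000000ULL | minor;      /* any major */
    __u64 keyAll   = 0xffffffffffffffffULL;              /* all devices */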
Signed-off-by: Pavel Hrdina <phrdina@redhat.com>
---
src/libvirt_private.syms | 1 +
src/util/vircgrouppriv.h | 10 ++
src/util/vircgroupv2devices.c | 276 ++++++++++++++++++++++++++++++++++
src/util/vircgroupv2devices.h | 5 +
4 files changed, 292 insertions(+)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 9eac05009c..24a783840f 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -1653,6 +1653,7 @@ virCgroupV1Register;
virCgroupV2Register;
# util/vircgroupv2devices.h
+virCgroupV2DevicesAttachProg;
virCgroupV2DevicesAvailable;
# util/virclosecallbacks.h
diff --git a/src/util/vircgrouppriv.h b/src/util/vircgrouppriv.h
index 9110c77297..7eba4ade23 100644
--- a/src/util/vircgrouppriv.h
+++ b/src/util/vircgrouppriv.h
@@ -41,10 +41,20 @@ struct _virCgroupV1Controller {
typedef struct _virCgroupV1Controller virCgroupV1Controller;
typedef virCgroupV1Controller *virCgroupV1ControllerPtr;
+struct _virCgroupV2Devices {
+ int mapfd;
+ int progfd;
+ ssize_t count;
+ ssize_t max;
+};
+typedef struct _virCgroupV2Devices virCgroupV2Devices;
+typedef virCgroupV2Devices *virCgroupV2DevicesPtr;
+
struct _virCgroupV2Controller {
int controllers;
char *mountPoint;
char *placement;
+ virCgroupV2Devices devices;
};
typedef struct _virCgroupV2Controller virCgroupV2Controller;
typedef virCgroupV2Controller *virCgroupV2ControllerPtr;
diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c
index 10080d4fff..c8686e8768 100644
--- a/src/util/vircgroupv2devices.c
+++ b/src/util/vircgroupv2devices.c
@@ -30,6 +30,7 @@
#define LIBVIRT_VIRCGROUPPRIV_H_ALLOW
#include "vircgrouppriv.h"
+#include "viralloc.h"
#include "virbpf.h"
#include "vircgroup.h"
#include "vircgroupv2devices.h"
@@ -64,10 +65,285 @@ virCgroupV2DevicesAvailable(virCgroupPtr group)
VIR_FORCE_CLOSE(cgroupfd);
return ret;
}
+
+
+/* Steps to get the assembly version of the devices BPF program:
+ *
+ * Save the following program into bpfprog.c, compile it using clang:
It would be nicer to have the file separate, with this comment there,
to save the developer the trouble of stripping the asterisks.
+ *
+ * clang -O2 -Wall -target bpf -c bpfprog.c -o bpfprog.o
+ *
+ * Now you can use llvm-objdump to get the list of instructions:
+ *
+ * llvm-objdump -S -no-show-raw-insn bpfprog.o
+ *
+ * which can be converted into a program using the VIR_BPF_* macros.
Did you convert them manually? Can you share the script here?
+ *
+ * ----------------------------------------------------------------------------
+ * #include <linux/bpf.h>
+ * #include <linux/version.h>
+ *
+ * #define SEC(NAME) __attribute__((section(NAME), used))
+ *
+ * struct bpf_map_def {
+ * unsigned int type;
+ * unsigned int key_size;
+ * unsigned int value_size;
+ * unsigned int max_entries;
+ * unsigned int map_flags;
+ * unsigned int inner_map_idx;
+ * unsigned int numa_node;
+ * };
+ *
+ * static void *(*bpf_map_lookup_elem)(void *map, void *key) =
+ * (void *) BPF_FUNC_map_lookup_elem;
+ *
+ * struct bpf_map_def SEC("maps") devices = {
+ * .type = BPF_MAP_TYPE_HASH,
+ * .key_size = sizeof(__u64),
+ * .value_size = sizeof(__u32),
+ * .max_entries = 65,
+ * };
+ *
+ * SEC("cgroup/dev") int
+ * bpf_libvirt_cgroup_device(struct bpf_cgroup_dev_ctx *ctx)
+ * {
+ * __u64 key = ((__u64)ctx->major << 32) | ctx->minor;
+ * __u32 *val = 0;
+ *
+ * val = bpf_map_lookup_elem(&devices, &key);
+ * if (val && (ctx->access_type & *val) == ctx->access_type)
+ * return 1;
+ *
+ * key = ((__u64)ctx->major << 32) | 0xffffffff;
+ * val = bpf_map_lookup_elem(&devices, &key);
+ * if (val && (ctx->access_type & *val) == ctx->access_type)
+ * return 1;
+ *
+ * key = 0xffffffff00000000 | ctx->minor;
+ * val = bpf_map_lookup_elem(&devices, &key);
+ * if (val && (ctx->access_type & *val) == ctx->access_type)
+ * return 1;
+ *
+ * key = 0xffffffffffffffff;
+ * val = bpf_map_lookup_elem(&devices, &key);
+ * if (val && (ctx->access_type & *val) == ctx->access_type)
+ * return 1;
+ *
+ * return 0;
+ * }
+ *
+ * char _license[] SEC("license") = "GPL";
+ * __u32 _version SEC("version") = LINUX_VERSION_CODE;
+ * ----------------------------------------------------------------------------
+ * */
+static int
+virCgroupV2DevicesLoadProg(int mapfd)
+{
+ struct bpf_insn prog[] = {
+ /* 0: r6 = r1 */
+ VIR_BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ /* 1: r1 = *(u32 *)(r6 + 8) */
+ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 8),
+ /* 2: r2 = *(u32 *)(r6 + 4) */
+ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4),
+ /* 3: r2 <<= 32 */
+ VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32),
+ /* 4: r2 |= r1 */
+ VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ /* 5: *(u64 *)(r10 - 8) = r2 */
+ VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8),
+ /* 6: r2 = r10 */
+ VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ /* 7: r2 += -8 */
+ VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ /* 8: r1 = 0 ll */
+ VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd),
+ /* 10: call 1 */
+ VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem),
+ /* 11: r1 = r0 */
+ VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ /* 12: if r1 == 0 goto +5 <LBB0_2> */
+ VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
+ /* 13: r0 = 1 */
+ VIR_BPF_MOV64_IMM(BPF_REG_0, 1),
+ /* 14: r2 = *(u32 *)(r6 + 0) */
+ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0),
+ /* 15: r1 = *(u32 *)(r1 + 0) */
+ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+ /* 16: r1 &= r2 */
+ VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ /* 17: if r1 == r2 goto +50 <LBB0_9> */
+ VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 50),
+ /* LBB0_2: */
+ /* 18: r1 = *(u32 *)(r6 + 4) */
+ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 4),
+ /* 19: r1 <<= 32 */
+ VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),
+ /* 20: r2 = 4294967295 ll */
+ VIR_BPF_LD_IMM64(BPF_REG_2, 0xffffffff),
+ /* 22: r1 |= r2 */
+ VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2),
+ /* 23: *(u64 *)(r10 - 8) = r1 */
+ VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+ /* 24: r2 = r10 */
+ VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ /* 25: r2 += -8 */
+ VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ /* 26: r1 = 0 ll */
+ VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd),
+ /* 28: call 1 */
+ VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem),
+ /* 29: r1 = r0 */
+ VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ /* 30: if r1 == 0 goto +5 <LBB0_4> */
+ VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
+ /* 31: r0 = 1 */
+ VIR_BPF_MOV64_IMM(BPF_REG_0, 1),
+ /* 32: r2 = *(u32 *)(r6 + 0) */
+ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0),
+ /* 33: r1 = *(u32 *)(r1 + 0) */
+ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+ /* 34: r1 &= r2 */
+ VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ /* 35: if r1 == r2 goto +32 <LBB0_9> */
+ VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 32),
+ /* LBB0_4: */
+ /* 36: r1 = *(u32 *)(r6 + 8) */
+ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 8),
+ /* 37: r2 = -4294967296 ll */
+ VIR_BPF_LD_IMM64(BPF_REG_2, 0xffffffff00000000),
+ /* 39: r1 |= r2 */
+ VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2),
+ /* 40: *(u64 *)(r10 - 8) = r1 */
+ VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+ /* 41: r2 = r10 */
+ VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ /* 42: r2 += -8 */
+ VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ /* 43: r1 = 0 ll */
+ VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd),
+ /* 45: call 1 */
+ VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem),
+ /* 46: r1 = r0 */
+ VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ /* 47: if r1 == 0 goto +5 <LBB0_6> */
+ VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
+ /* 48: r0 = 1 */
+ VIR_BPF_MOV64_IMM(BPF_REG_0, 1),
+ /* 49: r2 = *(u32 *)(r6 + 0) */
+ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0),
+ /* 50: r1 = *(u32 *)(r1 + 0) */
+ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+ /* 51: r1 &= r2 */
+ VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ /* 52: if r1 == r2 goto +15 <LBB0_9> */
+ VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 15),
+ /* LBB0_6: */
+ /* 53: r1 = -1 */
+ VIR_BPF_MOV64_IMM(BPF_REG_1, -1),
+ /* 54: *(u64 *)(r10 - 8) = r1 */
+ VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+ /* 55: r2 = r10 */
+ VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ /* 56: r2 += -8 */
+ VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ /* 57: r1 = 0 ll */
+ VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd),
+ /* 59: call 1 */
+ VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem),
+ /* 60: r1 = r0 */
+ VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ /* 61: if r1 == 0 goto +5 <LBB0_8> */
+ VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
+ /* 62: r0 = 1 */
+ VIR_BPF_MOV64_IMM(BPF_REG_0, 1),
+ /* 63: r2 = *(u32 *)(r6 + 0) */
+ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0),
+ /* 64: r1 = *(u32 *)(r1 + 0) */
+ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+ /* 65: r1 &= r2 */
+ VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ /* 66: if r1 == r2 goto +1 <LBB0_9> */
+ VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+ /* LBB0_8: */
+ /* 67: r0 = 0 */
+ VIR_BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* LBB0_9: */
+ /* 68: exit */
+ VIR_BPF_EXIT_INSN(),
+ };
+
+ return virBPFLoadProg(prog, BPF_PROG_TYPE_CGROUP_DEVICE, ARRAY_CARDINALITY(prog));
+}
+
+
+int
+virCgroupV2DevicesAttachProg(virCgroupPtr group,
+ int mapfd,
+ size_t max)
+{
+ int ret = -1;
+ int progfd = -1;
+ int cgroupfd = -1;
VIR_AUTOCLOSE could be used for these.
+ VIR_AUTOFREE(char *) path = NULL;
+
+ if (virCgroupPathOfController(group, VIR_CGROUP_CONTROLLER_DEVICES,
+ NULL, &path) < 0) {
+ goto cleanup;
+ }
+
+ progfd = virCgroupV2DevicesLoadProg(mapfd);
+ if (progfd < 0) {
+ virReportSystemError(errno, "%s", _("failed to load cgroup BPF prog"));
+ goto cleanup;
+ }
+
+ cgroupfd = open(path, O_RDONLY);
+ if (cgroupfd < 0) {
+ virReportSystemError(errno, _("unable to open '%s'"), path);
+ goto cleanup;
+ }
+
+ if (virBPFAttachProg(progfd, cgroupfd, BPF_CGROUP_DEVICE) < 0) {
+ virReportSystemError(errno, "%s", _("failed to attach cgroup BPF prog"));
+ goto cleanup;
+ }
+
+ if (group->unified.devices.progfd > 0) {
+ VIR_DEBUG("Closing existing program that was replaced by new one.");
+ VIR_FORCE_CLOSE(group->unified.devices.progfd);
+ }
+
+ group->unified.devices.progfd = progfd;
+ group->unified.devices.mapfd = mapfd;
+ group->unified.devices.max = max;
+ progfd = -1;
+ mapfd = -1;
+
+ ret = 0;
+ cleanup:
+ VIR_FORCE_CLOSE(cgroupfd);
+ VIR_FORCE_CLOSE(progfd);
+ VIR_FORCE_CLOSE(mapfd);
+ return ret;
+}
#else /* !HAVE_DECL_BPF_CGROUP_DEVICE */
bool
virCgroupV2DevicesAvailable(virCgroupPtr group ATTRIBUTE_UNUSED)
{
return false;
Reviewed-by: Ján Tomko <jtomko@redhat.com>
Jano