[PATCH net-next 1/2] bpf: allow extended BPF programs access skb fields

Discussion:

(too old to reply)

Alexei Starovoitov

2015-03-13 02:30:01 UTC

introduce user accessible mirror of in-kernel 'struct sk_buff':
struct __sk_buff {
__u32 len;
__u32 pkt_type;
__u32 mark;
__u32 ifindex;
__u32 queue_mapping;
};

bpf programs can do:
struct __sk_buff *ptr;
var = ptr->pkt_type;

which will be compiled to bpf assembler as:
dst_reg = *(u32 *)(src_reg + 4) // 4 == offsetof(struct __sk_buff, pkt_type)

bpf verifier will check validity of access and will convert it to:
dst_reg = *(u8 *)(src_reg + offsetof(struct sk_buff, __pkt_type_offset))
dst_reg &= 7

since 'pkt_type' is a bitfield.

When pkt_type field is moved around, goes into different structure, removed or
its size changes, the function sk_filter_convert_ctx_access() would need to be
updated. Just like the function convert_bpf_extensions() in case of classic bpf.

Signed-off-by: Alexei Starovoitov <***@plumgrid.com>
---
include/linux/bpf.h | 5 +-
include/uapi/linux/bpf.h | 8 +++
kernel/bpf/syscall.c | 2 +-
kernel/bpf/verifier.c | 152 +++++++++++++++++++++++++++++++++++++++++-----
net/core/filter.c | 58 +++++++++++++++++-
5 files changed, 205 insertions(+), 20 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 80f2e0fc3d02..2c17ebdfb5ae 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -103,6 +103,9 @@ struct bpf_verifier_ops {
* with 'type' (read or write) is allowed
*/
bool (*is_valid_access)(int off, int size, enum bpf_access_type type);
+
+ u32 (*convert_ctx_access)(int dst_reg, int src_reg, int ctx_off,
+ struct bpf_insn *insn);
};

struct bpf_prog_type_list {
@@ -133,7 +136,7 @@ struct bpf_map *bpf_map_get(struct fd f);
void bpf_map_put(struct bpf_map *map);

/* verify correctness of eBPF program */
-int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
+int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
#else
static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl)
{
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3fa1af8a58d7..66a82d6cd75b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -168,4 +168,12 @@ enum bpf_func_id {
__BPF_FUNC_MAX_ID,
};

+struct __sk_buff {
+ __u32 len;
+ __u32 pkt_type;
+ __u32 mark;
+ __u32 ifindex;
+ __u32 queue_mapping;
+};
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 669719ccc9ee..ea75c654af1b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -519,7 +519,7 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_prog;

/* run eBPF verifier */
- err = bpf_check(prog, attr);
+ err = bpf_check(&prog, attr);
if (err < 0)
goto free_used_maps;

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e6b522496250..c22ebd36fa4b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1620,11 +1620,10 @@ static int do_check(struct verifier_env *env)
return err;

} else if (class == BPF_LDX) {
- if (BPF_MODE(insn->code) != BPF_MEM ||
- insn->imm != 0) {
- verbose("BPF_LDX uses reserved fields\n");
- return -EINVAL;
- }
+ enum bpf_reg_type src_reg_type;
+
+ /* check for reserved fields is already done */
+
/* check src operand */
err = check_reg_arg(regs, insn->src_reg, SRC_OP);
if (err)
@@ -1643,6 +1642,29 @@ static int do_check(struct verifier_env *env)
if (err)
return err;

+ src_reg_type = regs[insn->src_reg].type;
+
+ if (insn->imm == 0 && BPF_SIZE(insn->code) == BPF_W) {
+ /* saw a valid insn
+ * dst_reg = *(u32 *)(src_reg + off)
+ * use reserved 'imm' field to mark this insn
+ */
+ insn->imm = src_reg_type;
+
+ } else if (src_reg_type != insn->imm &&
+ (src_reg_type == PTR_TO_CTX ||
+ insn->imm == PTR_TO_CTX)) {
+ /* ABuser program is trying to use the same insn
+ * dst_reg = *(u32*) (src_reg + off)
+ * with different pointer types:
+ * src_reg == ctx in one branch and
+ * src_reg == stack|map in some other branch.
+ * Reject it.
+ */
+ verbose("same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+
} else if (class == BPF_STX) {
if (BPF_MODE(insn->code) == BPF_XADD) {
err = check_xadd(env, insn);
@@ -1790,6 +1812,13 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
int i, j;

for (i = 0; i < insn_cnt; i++, insn++) {
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0)) {
+ verbose("BPF_LDX uses reserved fields\n");
+ return -EINVAL;
+ }
+
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_map *map;
struct fd f;
@@ -1881,6 +1910,92 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
insn->src_reg = 0;
}

+static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ int insn_cnt = prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (BPF_CLASS(insn->code) != BPF_JMP ||
+ BPF_OP(insn->code) == BPF_CALL ||
+ BPF_OP(insn->code) == BPF_EXIT)
+ continue;
+
+ /* adjust offset of jmps if necessary */
+ if (i < pos && i + insn->off + 1 > pos)
+ insn->off += delta;
+ else if (i > pos && i + insn->off + 1 < pos)
+ insn->off -= delta;
+ }
+}
+
+/* convert load instructions that access fields of 'struct __sk_buff'
+ * into sequence of instructions that access fields of 'struct sk_buff'
+ */
+static int convert_ctx_accesses(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ struct bpf_insn insn_buf[16];
+ struct bpf_prog *new_prog;
+ u32 cnt;
+ int i;
+
+ if (!env->prog->aux->ops->convert_ctx_access)
+ return 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code != (BPF_LDX | BPF_MEM | BPF_W))
+ continue;
+
+ if (insn->imm != PTR_TO_CTX) {
+ /* clear internal mark */
+ insn->imm = 0;
+ continue;
+ }
+
+ cnt = env->prog->aux->ops->
+ convert_ctx_access(insn->dst_reg, insn->src_reg,
+ insn->off, insn_buf);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose("bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ if (cnt == 1) {
+ memcpy(insn, insn_buf, sizeof(*insn));
+ continue;
+ }
+
+ /* several new insns need to be inserted. Make room for them */
+ insn_cnt += cnt - 1;
+ new_prog = bpf_prog_realloc(env->prog,
+ bpf_prog_size(insn_cnt),
+ GFP_USER);
+ if (!new_prog)
+ return -ENOMEM;
+
+ new_prog->len = insn_cnt;
+
+ memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
+ sizeof(*insn) * (insn_cnt - i - cnt));
+
+ /* copy substitute insns in place of load instruction */
+ memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
+
+ /* adjust branches in the whole program */
+ adjust_branches(new_prog, i, cnt - 1);
+
+ /* keep walking new program and skip insns we just inserted */
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + cnt - 1;
+ i += cnt - 1;
+ }
+
+ return 0;
+}
+
static void free_states(struct verifier_env *env)
{
struct verifier_state_list *sl, *sln;
@@ -1903,13 +2018,13 @@ static void free_states(struct verifier_env *env)
kfree(env->explored_states);
}

-int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
char __user *log_ubuf = NULL;
struct verifier_env *env;
int ret = -EINVAL;

- if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
+ if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
return -E2BIG;

/* 'struct verifier_env' can be global, but since it's not small,
@@ -1919,7 +2034,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (!env)
return -ENOMEM;

- env->prog = prog;
+ env->prog = *prog;

/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);
@@ -1951,7 +2066,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (ret < 0)
goto skip_full_check;

- env->explored_states = kcalloc(prog->len,
+ env->explored_states = kcalloc(env->prog->len,
sizeof(struct verifier_state_list *),
GFP_USER);
ret = -ENOMEM;
@@ -1968,6 +2083,10 @@ skip_full_check:
while (pop_stack(env, NULL) >= 0);
free_states(env);

+ if (ret == 0)
+ /* program is valid, convert *(u32*)(ctx + off) accesses */
+ ret = convert_ctx_accesses(env);
+
if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);
/* verifier log exceeded user supplied buffer */
@@ -1983,18 +2102,18 @@ skip_full_check:

if (ret == 0 && env->used_map_cnt) {
/* if program passed verifier, update used_maps in bpf_prog_info */
- prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
- sizeof(env->used_maps[0]),
- GFP_KERNEL);
+ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
+ sizeof(env->used_maps[0]),
+ GFP_KERNEL);

- if (!prog->aux->used_maps) {
+ if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
goto free_log_buf;
}

- memcpy(prog->aux->used_maps, env->used_maps,
+ memcpy(env->prog->aux->used_maps, env->used_maps,
sizeof(env->used_maps[0]) * env->used_map_cnt);
- prog->aux->used_map_cnt = env->used_map_cnt;
+ env->prog->aux->used_map_cnt = env->used_map_cnt;

/* program is valid. Convert pseudo bpf_ld_imm64 into generic
* bpf_ld_imm64 instructions
@@ -2006,11 +2125,12 @@ free_log_buf:
if (log_level)
vfree(log_buf);
free_env:
- if (!prog->aux->used_maps)
+ if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_bpf_prog_info() will release them.
*/
release_maps(env);
+ *prog = env->prog;
kfree(env);
mutex_unlock(&bpf_verifier_lock);
return ret;
diff --git a/net/core/filter.c b/net/core/filter.c
index 7a4eb7030dba..b5fcc7e2b608 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1147,13 +1147,67 @@ sk_filter_func_proto(enum bpf_func_id func_id)
static bool sk_filter_is_valid_access(int off, int size,
enum bpf_access_type type)
{
- /* skb fields cannot be accessed yet */
- return false;
+ /* only read is allowed */
+ if (type != BPF_READ)
+ return false;
+
+ /* check bounds */
+ if (off < 0 || off >= sizeof(struct __sk_buff))
+ return false;
+
+ /* disallow misaligned access */
+ if (off % size != 0)
+ return false;
+
+ /* all __sk_buff fields are __u32 */
+ if (size != 4)
+ return false;
+
+ return true;
+}
+
+static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct __sk_buff, len):
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, len));
+ break;
+
+ case offsetof(struct __sk_buff, mark):
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, mark));
+ break;
+
+ case offsetof(struct __sk_buff, ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, skb_iif));
+ break;
+
+ case offsetof(struct __sk_buff, pkt_type):
+ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, queue_mapping):
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, queue_mapping));
+ break;
+ }
+
+ return insn - insn_buf;
}

static const struct bpf_verifier_ops sk_filter_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
};

static struct bpf_prog_type_list sk_filter_type __read_mostly = {

--
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Alexei Starovoitov

2015-03-13 02:30:02 UTC

Permalink

- modify sockex1 example to count number of bytes in outgoing packets
- modify sockex2 example to count number of bytes and packets per flow
- add 4 stress tests that exercise 'skb->field' code path of verifier

Signed-off-by: Alexei Starovoitov <***@plumgrid.com>
---
samples/bpf/sockex1_kern.c | 8 +++--
samples/bpf/sockex1_user.c | 2 +-
samples/bpf/sockex2_kern.c | 26 +++++++++------
samples/bpf/sockex2_user.c | 11 +++++--
samples/bpf/test_verifier.c | 73 +++++++++++++++++++++++++++++++++++++++++++
5 files changed, 104 insertions(+), 16 deletions(-)

diff --git a/samples/bpf/sockex1_kern.c b/samples/bpf/sockex1_kern.c
index 066892662915..ed18e9a4909c 100644
--- a/samples/bpf/sockex1_kern.c
+++ b/samples/bpf/sockex1_kern.c
@@ -1,5 +1,6 @@
#include <uapi/linux/bpf.h>
#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
#include <uapi/linux/ip.h>
#include "bpf_helpers.h"

@@ -11,14 +12,17 @@ struct bpf_map_def SEC("maps") my_map = {
};

SEC("socket1")
-int bpf_prog1(struct sk_buff *skb)
+int bpf_prog1(struct __sk_buff *skb)
{
int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
long *value;

+ if (skb->pkt_type != PACKET_OUTGOING)
+ return 0;
+
value = bpf_map_lookup_elem(&my_map, &index);
if (value)
- __sync_fetch_and_add(value, 1);
+ __sync_fetch_and_add(value, skb->len);

return 0;
}
diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c
index 34a443ff3831..678ce4693551 100644
--- a/samples/bpf/sockex1_user.c
+++ b/samples/bpf/sockex1_user.c
@@ -40,7 +40,7 @@ int main(int ac, char **argv)
key = IPPROTO_ICMP;
assert(bpf_lookup_elem(map_fd[0], &key, &icmp_cnt) == 0);

- printf("TCP %lld UDP %lld ICMP %lld packets\n",
+ printf("TCP %lld UDP %lld ICMP %lld bytes\n",
tcp_cnt, udp_cnt, icmp_cnt);
sleep(1);
}
diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c
index 6f0135f0f217..ba0e177ff561 100644
--- a/samples/bpf/sockex2_kern.c
+++ b/samples/bpf/sockex2_kern.c
@@ -42,13 +42,13 @@ static inline int proto_ports_offset(__u64 proto)
}
}

-static inline int ip_is_fragment(struct sk_buff *ctx, __u64 nhoff)
+static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff)
{
return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off))
& (IP_MF | IP_OFFSET);
}

-static inline __u32 ipv6_addr_hash(struct sk_buff *ctx, __u64 off)
+static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
{
__u64 w0 = load_word(ctx, off);
__u64 w1 = load_word(ctx, off + 4);
@@ -58,7 +58,7 @@ static inline __u32 ipv6_addr_hash(struct sk_buff *ctx, __u64 off)
return (__u32)(w0 ^ w1 ^ w2 ^ w3);
}

-static inline __u64 parse_ip(struct sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
+static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
struct flow_keys *flow)
{
__u64 verlen;
@@ -82,7 +82,7 @@ static inline __u64 parse_ip(struct sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
return nhoff;
}

-static inline __u64 parse_ipv6(struct sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
+static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
struct flow_keys *flow)
{
*ip_proto = load_byte(skb,
@@ -96,7 +96,7 @@ static inline __u64 parse_ipv6(struct sk_buff *skb, __u64 nhoff, __u64 *ip_proto
return nhoff;
}

-static inline bool flow_dissector(struct sk_buff *skb, struct flow_keys *flow)
+static inline bool flow_dissector(struct __sk_buff *skb, struct flow_keys *flow)
{
__u64 nhoff = ETH_HLEN;
__u64 ip_proto;
@@ -183,18 +183,23 @@ static inline bool flow_dissector(struct sk_buff *skb, struct flow_keys *flow)
return true;
}

+struct pair {
+ long packets;
+ long bytes;
+};
+
struct bpf_map_def SEC("maps") hash_map = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(__be32),
- .value_size = sizeof(long),
+ .value_size = sizeof(struct pair),
.max_entries = 1024,
};

SEC("socket2")
-int bpf_prog2(struct sk_buff *skb)
+int bpf_prog2(struct __sk_buff *skb)
{
struct flow_keys flow;
- long *value;
+ struct pair *value;
u32 key;

if (!flow_dissector(skb, &flow))
@@ -203,9 +208,10 @@ int bpf_prog2(struct sk_buff *skb)
key = flow.dst;
value = bpf_map_lookup_elem(&hash_map, &key);
if (value) {
- __sync_fetch_and_add(value, 1);
+ __sync_fetch_and_add(&value->packets, 1);
+ __sync_fetch_and_add(&value->bytes, skb->len);
} else {
- long val = 1;
+ struct pair val = {1, skb->len};

bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY);
}
diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c
index d2d5f5a790d3..29a276d766fc 100644
--- a/samples/bpf/sockex2_user.c
+++ b/samples/bpf/sockex2_user.c
@@ -6,6 +6,11 @@
#include <unistd.h>
#include <arpa/inet.h>

+struct pair {
+ __u64 packets;
+ __u64 bytes;
+};
+
int main(int ac, char **argv)
{
char filename[256];
@@ -29,13 +34,13 @@ int main(int ac, char **argv)

for (i = 0; i < 5; i++) {
int key = 0, next_key;
- long long value;
+ struct pair value;

while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
bpf_lookup_elem(map_fd[0], &next_key, &value);
- printf("ip %s count %lld\n",
+ printf("ip %s bytes %lld packets %lld\n",
inet_ntoa((struct in_addr){htonl(next_key)}),
- value);
+ value.bytes, value.packets);
key = next_key;
}
sleep(1);
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index 7b56b59fad8e..33beae615c5e 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -14,6 +14,7 @@
#include <linux/unistd.h>
#include <string.h>
#include <linux/filter.h>
+#include <stddef.h>
#include "libbpf.h"

#define MAX_INSNS 512
@@ -642,6 +643,78 @@ static struct bpf_test tests[] = {
},
.result = ACCEPT,
},
+ {
+ "access skb fields ok",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, len)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, ifindex)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, pkt_type)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, queue_mapping)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ },
+ {
+ "access skb fields bad1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -4),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ },
+ {
+ "access skb fields bad2",
+ .insns = {
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 9),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, pkt_type)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup = {4},
+ .errstr = "different pointers",
+ .result = REJECT,
+ },
+ {
+ "access skb fields bad3",
+ .insns = {
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, pkt_type)),
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -12),
+ },
+ .fixup = {6},
+ .errstr = "different pointers",
+ .result = REJECT,
+ },
};

static int probe_filter_length(struct bpf_insn *fp)

Daniel Borkmann

2015-03-13 10:00:03 UTC

Permalink

Post by Alexei Starovoitov
struct __sk_buff {
__u32 len;
__u32 pkt_type;
__u32 mark;
__u32 ifindex;
__u32 queue_mapping;
};
struct __sk_buff *ptr;
var = ptr->pkt_type;
dst_reg = *(u32 *)(src_reg + 4) // 4 == offsetof(struct __sk_buff, pkt_type)
dst_reg = *(u8 *)(src_reg + offsetof(struct sk_buff, __pkt_type_offset))
dst_reg &= 7
since 'pkt_type' is a bitfield.
When pkt_type field is moved around, goes into different structure, removed or
its size changes, the function sk_filter_convert_ctx_access() would need to be
updated. Just like the function convert_bpf_extensions() in case of classic bpf.

For each member, I'd also add BUILD_BUG_ON()s similarly as we have in
convert_bpf_extensions(). That way, people won't forget to adjust the
code.

General idea for this offset map looks good, imho. Well defined members
that are already exported to uapi e.g. through classic socket filters or
other socket api places could be used here.
...

Post by Alexei Starovoitov
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3fa1af8a58d7..66a82d6cd75b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -168,4 +168,12 @@ enum bpf_func_id {
__BPF_FUNC_MAX_ID,
};
+struct __sk_buff {
+ __u32 len;
+ __u32 pkt_type;
+ __u32 mark;
+ __u32 ifindex;
+ __u32 queue_mapping;
+};

I'd add a comment saying that fields may _only_ be safely added at
the end of the structure. Rearranging or removing members here,
naturally would break user space.

The remaining fields we export in classic BPF would be skb->hash,
skb->protocol, skb->vlan_tci, are we adding them as well to match
up functionality with classic BPF? For example, I can see hash being
useful as a key to be used with eBPF maps, etc.

...

Post by Alexei Starovoitov
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e6b522496250..c22ebd36fa4b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c

...

Post by Alexei Starovoitov
+/* convert load instructions that access fields of 'struct __sk_buff'
+ * into sequence of instructions that access fields of 'struct sk_buff'
+ */
+static int convert_ctx_accesses(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ struct bpf_insn insn_buf[16];
+ struct bpf_prog *new_prog;
+ u32 cnt;
+ int i;
+
+ if (!env->prog->aux->ops->convert_ctx_access)
+ return 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code != (BPF_LDX | BPF_MEM | BPF_W))
+ continue;
+
+ if (insn->imm != PTR_TO_CTX) {
+ /* clear internal mark */
+ insn->imm = 0;
+ continue;
+ }
+
+ cnt = env->prog->aux->ops->
+ convert_ctx_access(insn->dst_reg, insn->src_reg,
+ insn->off, insn_buf);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose("bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ if (cnt == 1) {
+ memcpy(insn, insn_buf, sizeof(*insn));
+ continue;
+ }
+
+ /* several new insns need to be inserted. Make room for them */
+ insn_cnt += cnt - 1;
+ new_prog = bpf_prog_realloc(env->prog,
+ bpf_prog_size(insn_cnt),
+ GFP_USER);
+ if (!new_prog)
+ return -ENOMEM;

Seems a bit expensive, do you think we could speculatively allocate a
bit more space in bpf_prog_load() when we detect that we have access
to ctx that we need to convert?

Post by Alexei Starovoitov
+ new_prog->len = insn_cnt;
+
+ memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
+ sizeof(*insn) * (insn_cnt - i - cnt));
+
+ /* copy substitute insns in place of load instruction */
+ memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
+
+ /* adjust branches in the whole program */
+ adjust_branches(new_prog, i, cnt - 1);
+
+ /* keep walking new program and skip insns we just inserted */
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + cnt - 1;
+ i += cnt - 1;
+ }
+
+ return 0;
+}
+
static void free_states(struct verifier_env *env)
{
struct verifier_state_list *sl, *sln;

...

Post by Alexei Starovoitov
diff --git a/net/core/filter.c b/net/core/filter.c
index 7a4eb7030dba..b5fcc7e2b608 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c

...

Post by Alexei Starovoitov
+
+static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, len));
+ break;
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, mark));
+ break;
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, skb_iif));
+ break;

This would only work for incoming skbs, but not outgoing ones
f.e. in case of {cls,act}_bpf.

Post by Alexei Starovoitov
+ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
+#endif
+ break;

...
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Alexei Starovoitov

2015-03-13 16:30:03 UTC

Permalink

Post by Daniel Borkmann
For each member, I'd also add BUILD_BUG_ON()s similarly as we have in
convert_bpf_extensions(). That way, people won't forget to adjust the
code.

I thought about it, but didn't add it, since we already have them
in the same file several lines above this spot.
I think one build error per .c file should be enough to attract
attention.
Though I'll add a comment to convert_bpf_extensions() that build_bug_on
errors should be addressed in two places.
btw I've tried to do a common converter, but since offsets are different
and the way instructions are stored are also different it was messy.

Post by Daniel Borkmann
General idea for this offset map looks good, imho. Well defined members
that are already exported to uapi e.g. through classic socket filters or
other socket api places could be used here.

yes. exactly.

Post by Daniel Borkmann

Post by Alexei Starovoitov
+struct __sk_buff {
+ __u32 len;
+ __u32 pkt_type;
+ __u32 mark;
+ __u32 ifindex;
+ __u32 queue_mapping;
+};

I'd add a comment saying that fields may _only_ be safely added at
the end of the structure. Rearranging or removing members here,
naturally would break user space.

ok. will add a comment.

Post by Daniel Borkmann
The remaining fields we export in classic BPF would be skb->hash,
skb->protocol, skb->vlan_tci, are we adding them as well to match
up functionality with classic BPF? For example, I can see hash being
useful as a key to be used with eBPF maps, etc.

I want to do skb->protocol and skb->vlan_tci differently and hopefully
more generically than classic did them, so they will come in the
follow on patches. skb->hash also should be done differently.
So yes. All of them are subjects of future patches/discussions.

Post by Daniel Borkmann

Post by Alexei Starovoitov
+ /* several new insns need to be inserted. Make room for them */
+ insn_cnt += cnt - 1;
+ new_prog = bpf_prog_realloc(env->prog,
+ bpf_prog_size(insn_cnt),
+ GFP_USER);
+ if (!new_prog)
+ return -ENOMEM;

Seems a bit expensive, do you think we could speculatively allocate a
bit more space in bpf_prog_load() when we detect that we have access
to ctx that we need to convert?

we already have extra space thanks to your commit 60a3b2253c413 :)))
The size is rounded up to page size and whole thing is made read-only
after it passes verifier.
So 99% of the time we don't reallocate anything.

Post by Daniel Borkmann

Post by Alexei Starovoitov
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, skb_iif));
+ break;

This would only work for incoming skbs, but not outgoing ones
f.e. in case of {cls,act}_bpf.

ahh, ok, will drop this field for now.

Thanks

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Daniel Borkmann

2015-03-13 16:50:02 UTC

Permalink

Post by Alexei Starovoitov

Post by Daniel Borkmann
For each member, I'd also add BUILD_BUG_ON()s similarly as we have in
convert_bpf_extensions(). That way, people won't forget to adjust the
code.

My concern would be that in case the code gets changed, this spot
could easily be overlooked by someone possibly unfamiliar with the
code, since no build error triggers there.

So I guess it wouldn't hurt or cost us much to also adding the
BUILD_BUG_ON()s there instead of a comment.

...

Post by Alexei Starovoitov

Ok, sounds good.

Thanks,
Daniel
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Alexei Starovoitov

2015-03-13 17:10:03 UTC

Permalink

Post by Daniel Borkmann

Post by Alexei Starovoitov

Post by Daniel Borkmann
For each member, I'd also add BUILD_BUG_ON()s similarly as we have in
convert_bpf_extensions(). That way, people won't forget to adjust the
code.

My concern would be that in case the code gets changed, this spot
could easily be overlooked by someone possibly unfamiliar with the
code, since no build error triggers there.
So I guess it wouldn't hurt or cost us much to also adding the
BUILD_BUG_ON()s there instead of a comment.

I think it's overkill, but fine, it's minor. Will add another set
of build_bug_ons and see how it looks.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Daniel Borkmann

2015-03-13 10:50:02 UTC

Permalink

On 03/13/2015 03:21 AM, Alexei Starovoitov wrote:
...

Daniel,
patch 1 includes a bit of code that does prog_realloc and branch adjustment
to make room for new instructions. I think you'd need the same for your
'constant blinding' work. If indeed that would be the case, we'll make it
into a helper function.

Yes, thanks will take care of that.

Cheers,
Daniel
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/