Seccomp BPF Filter

Introduction

In my previous post, I introduced what seccomp is, how to set seccomp rules via seccomp_rule_add, and how to bypass them via ptrace. In this post, I am going to introduce the seccomp filter, another way to set seccomp rules in a program. This post also involves some kernel debugging to illustrate the internals of the BPF filter.

First Glance

First, let’s use the following code to explain what a seccomp filter is. Part of the code is borrowed from [1].

//gcc code0.c -o code0 -lseccomp -no-pie
#include <stdio.h>
#include <string.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/user.h>
#include <sys/reg.h>
#include <unistd.h>
#include <stddef.h>
#include <sys/syscall.h>
#include <sys/prctl.h>     /* prctl */
#include <linux/seccomp.h> /* seccomp's constants */
#include <linux/filter.h>
#include <linux/audit.h>
#include <seccomp.h>
#include <stdlib.h>
#include <unistd.h>

#define longsize 8

char buffer[100];

/* Byte offset of the arch member inside struct seccomp_data; used as the address of a BPF_ABS load. */
#define ArchField offsetof(struct seccomp_data, arch)

/*
 * Allow(syscall) expands to two filter instructions (two array elements):
 *   1. compare the accumulator with SYS_<syscall>; on a match fall through
 *      (jt = 0), otherwise skip one instruction (jf = 1);
 *   2. return SECCOMP_RET_ALLOW.
 * The accumulator must already hold the syscall number when this runs.
 */
#define Allow(syscall) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_##syscall, 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)

/*
 * Allow-list filter: any syscall that is not explicitly listed below is
 * answered with SECCOMP_RET_KILL. Jump offsets (jt/jf) are relative to the
 * instruction following the jump, so the order of the entries matters.
 */
struct sock_filter filter[] = {
	/* validate arch */
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ArchField),
	/* on x86-64 skip the KILL below (jt = 1); any other arch falls into it */
	BPF_JUMP( BPF_JMP+BPF_JEQ+BPF_K, AUDIT_ARCH_X86_64, 1, 0),
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),

	/* load syscall number into the accumulator for the Allow() comparisons */
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)),

	/* list of allowed syscalls (each Allow() contributes two instructions) */
	Allow(exit_group),  /* exits a process */
	Allow(brk),     /* for malloc(), inside libc */
	Allow(mmap),        /* also for malloc() */
	Allow(munmap),      /* for free(), inside libc */
	Allow(write),       /* called by printf */
	Allow(fstat),       /* called by printf */

	/*
	 * and if we don't match above, die
	 * NOTE(review): nothing that fork()/execve()/waitpid() rely on is listed,
	 * so main() is presumably killed at its fork() call - confirm.
	 */
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
};

/* Descriptor passed to prctl(PR_SET_SECCOMP, ...): pointer to the filter array and its instruction count. */
struct sock_fprog filterprog = {
	.filter = filter,
	.len = sizeof filter / sizeof *filter
};

/*
 * Installs the seccomp filter described by filterprog on the current process,
 * then tries to fork a traced child that execs "/bin/cat flag" while the
 * parent waits for it.
 *
 * Returns 0 from the parent; exits with status 1 when the filter cannot be
 * installed, when fork() fails, or (in the child) when execve() fails.
 */
int main()
{
	char *argv[] = {"/bin/cat", "flag", NULL};
	char *env[] = {NULL};
	char cmd[20] = "/bin/cat";
	pid_t pid;
	int rv;

	/*
	 * prctl(2) requires the unused arguments of PR_SET_NO_NEW_PRIVS to be 0;
	 * leaving them out passes indeterminate values through the varargs call.
	 * A failure here is only reported: PR_SET_SECCOMP below is the call that
	 * decides whether the program can continue.
	 */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1)
		perror("prctl(PR_SET_NO_NEW_PRIVS)");

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &filterprog) == -1) {
		perror("Could not start seccomp:");
		exit(1);
	}

	pid = fork();
	if (pid == -1) {
		perror("fork");
		exit(1);
	}

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		syscall(SYS_execve, cmd, argv, env);
		/* only reached when execve() failed: do not fall back into the parent's code path */
		perror("execve");
		_exit(1);
	}

	waitpid(pid, &rv, 0);
	printf("The child process exits\n");
	return 0;
}

With the help of seccomp-tools, we can clearly view the seccomp rules set in the sample code

$ sudo seccomp-tools dump ./code0
 line  CODE  JT   JF      K
=================================
 0000: 0x20 0x00 0x00 0x00000004  A = arch
 0001: 0x15 0x01 0x00 0xc000003e  if (A == ARCH_X86_64) goto 0003
 0002: 0x06 0x00 0x00 0x00000000  return KILL
 0003: 0x20 0x00 0x00 0x00000000  A = sys_number
 0004: 0x15 0x00 0x01 0x000000e7  if (A != exit_group) goto 0006
 0005: 0x06 0x00 0x00 0x7fff0000  return ALLOW
 0006: 0x15 0x00 0x01 0x0000000c  if (A != brk) goto 0008
 0007: 0x06 0x00 0x00 0x7fff0000  return ALLOW
 0008: 0x15 0x00 0x01 0x00000009  if (A != mmap) goto 0010
 0009: 0x06 0x00 0x00 0x7fff0000  return ALLOW
 0010: 0x15 0x00 0x01 0x0000000b  if (A != munmap) goto 0012
 0011: 0x06 0x00 0x00 0x7fff0000  return ALLOW
 0012: 0x15 0x00 0x01 0x00000001  if (A != write) goto 0014
 0013: 0x06 0x00 0x00 0x7fff0000  return ALLOW
 0014: 0x15 0x00 0x01 0x00000005  if (A != fstat) goto 0016
 0015: 0x06 0x00 0x00 0x7fff0000  return ALLOW
 0016: 0x06 0x00 0x00 0x00000000  return KILL

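And the final result is as expected: the first syscall issued after the filter is installed (the one behind fork()) is not on the allow list, so the process is killed.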

$ sudo ./code0
Bad system call

Internal of BPF

In this section, I am going to introduce how the seccomp rules are set in the code above. During the test, I will also introduce some code in Linux Kernel for illustration. The kernel code I use is 4.4-31.

struct sock_fprog

Structure sock_fprog is the basic data structure to carry the seccomp rules from user space to kernel space.
struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
	unsigned short		len;	/* Number of filter blocks */
	struct sock_filter __user *filter;	/* user-space pointer to the array of filter blocks */
};

In code of linux kernel:

/*
 * Abridged excerpt of the prctl system call entry point. Only the
 * PR_SET_SECCOMP case is shown; all other options are elided.
 */
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
		unsigned long, arg4, unsigned long, arg5)
{
	struct task_struct *me = current;
	unsigned char comm[sizeof(me->comm)];	/* not referenced in this abridged excerpt */
	long error;

	/* any result of security_task_prctl() other than -ENOSYS is returned to the caller as-is */
	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
	if (error != -ENOSYS)
		return error;

	error = 0;
	switch (option) {
        //other options
	case PR_SET_SECCOMP:
		/* arg2 carries the requested seccomp mode, arg3 the user-space pointer to the filter program */
		error = prctl_set_seccomp(arg2, (char __user *)arg3);
		break;
        //other options
        }
        return error;
}

Then we take a step forward to have a look at prctl_set_seccomp:

// /kernel/seccomp.c
/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * Translates the prctl-style mode into a do_seccomp() operation and forwards
 * the call with flags set to 0.
 *
 * Returns the result of do_seccomp(), or -EINVAL when @seccomp_mode is
 * neither SECCOMP_MODE_STRICT nor SECCOMP_MODE_FILTER.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
	unsigned int op;
	char __user *uargs;

	switch (seccomp_mode) {
	case SECCOMP_MODE_STRICT:
		op = SECCOMP_SET_MODE_STRICT;
		/*
		 * Setting strict mode through prctl always ignored filter,
		 * so make sure it is always NULL here to pass the internal
		 * check in do_seccomp().
		 */
		uargs = NULL;
		break;
	case SECCOMP_MODE_FILTER:
		op = SECCOMP_SET_MODE_FILTER;
		/* the user pointer is passed through untouched */
		uargs = filter;
		break;
	default:
		return -EINVAL;
	}

	/* prctl interface doesn't have flags, so they are always zero. */
	return do_seccomp(op, 0, uargs);
}

/*
 * Common entry point for both prctl and syscall.
 *
 * Dispatches on @op: strict mode accepts neither flags nor arguments and is
 * handed to seccomp_set_mode_strict(); filter mode forwards @flags and @uargs
 * to seccomp_set_mode_filter(). Any other operation yields -EINVAL.
 */
static long do_seccomp(unsigned int op, unsigned int flags,
		       const char __user *uargs)
{
	switch (op) {
	case SECCOMP_SET_MODE_STRICT:
		/* strict mode takes no parameters at all */
		if (flags != 0 || uargs != NULL)
			return -EINVAL;
		return seccomp_set_mode_strict();
	case SECCOMP_SET_MODE_FILTER:
		return seccomp_set_mode_filter(flags, uargs);
	default:
		return -EINVAL;
	}
}

Function seccomp_set_mode_filter later attaches the filter to the current process according to the content of uargs. More details will be discussed in a future post. Next, we will discuss how the seccomp rules are encoded.

BPF filter

sock_filter is the basic data structure used to encode a BPF filter rule. As denoted in the comments below, code is the opcode of the instruction, jt and jf are the true-branch and false-branch offsets of a jump instruction, and k is a multi-use field. The Linux kernel provides two macros for users to create filter instructions:

// source/include/uapi/linux/filter.h
/*
 *	Try and keep these values and structures similar to BSD, especially
 *	the BPF code definitions which need to match so you can share filters
 */
 
struct sock_filter {	/* Filter block: one BPF instruction */
	__u16	code;   /* Actual filter code (opcode built from the BPF_* constants) */
	__u8	jt;	/* Jump true: number of instructions to skip when the test succeeds */
	__u8	jf;	/* Jump false: number of instructions to skip when the test fails */
	__u32	k;      /* Generic multiuse field (constant, load offset or return value) */
};

/*
 * Macros for filter block array initializers.
 */
/*
 * BPF_STMT builds a non-branching instruction (jt and jf are 0).
 * BPF_JUMP builds a branching instruction. Note the argument order
 * (code, k, jt, jf) differs from the member order of struct sock_filter
 * (code, jt, jf, k).
 */
#ifndef BPF_STMT
#define BPF_STMT(code, k) { (unsigned short)(code), 0, 0, k }
#endif
#ifndef BPF_JUMP
#define BPF_JUMP(code, k, jt, jf) { (unsigned short)(code), jt, jf, k }
#endif

BPF Constant

The BPF instructions operate on the BPF virtual machine, which has four main elements: The accumulator register A, the index register X, the packet memory, and the scratch memory M[] [1].

Here we list the Linux kernel source that defines the necessary constants. More details on the meaning of each code can be found in [1].

/*
 * An opcode is the sum of one value from each applicable field below.
 * Examples taken from the sample filters:
 *   BPF_LD  + BPF_W   + BPF_ABS = 0x20
 *   BPF_JMP + BPF_JEQ + BPF_K   = 0x15
 *   BPF_RET + BPF_K             = 0x06
 */

/* instruction classes (low three bits of the opcode) */
#define BPF_CLASS(code) ((code) & 0x07)
#define		BPF_LD		0x00
#define		BPF_LDX		0x01
#define		BPF_ST		0x02
#define		BPF_STX		0x03
#define		BPF_ALU		0x04
#define		BPF_JMP		0x05
#define		BPF_RET		0x06
#define		BPF_MISC        0x07

/* ld/ldx fields */
/* operand size */
#define BPF_SIZE(code)  ((code) & 0x18)
#define		BPF_W		0x00
#define		BPF_H		0x08
#define		BPF_B		0x10
/* addressing mode */
#define BPF_MODE(code)  ((code) & 0xe0)
#define		BPF_IMM		0x00
#define		BPF_ABS		0x20
#define		BPF_IND		0x40
#define		BPF_MEM		0x60
#define		BPF_LEN		0x80
#define		BPF_MSH		0xa0

/* alu/jmp fields */
/* operation selector; the first group is used with BPF_ALU */
#define BPF_OP(code)    ((code) & 0xf0)
#define		BPF_ADD		0x00
#define		BPF_SUB		0x10
#define		BPF_MUL		0x20
#define		BPF_DIV		0x30
#define		BPF_OR		0x40
#define		BPF_AND		0x50
#define		BPF_LSH		0x60
#define		BPF_RSH		0x70
#define		BPF_NEG		0x80
#define		BPF_MOD		0x90
#define		BPF_XOR		0xa0

/* jump conditions, used with BPF_JMP (they share the BPF_OP bit field) */
#define		BPF_JA		0x00
#define		BPF_JEQ		0x10
#define		BPF_JGT		0x20
#define		BPF_JGE		0x30
#define		BPF_JSET        0x40
/* operand source: the constant k (BPF_K) or the index register X (BPF_X) */
#define BPF_SRC(code)   ((code) & 0x08)
#define		BPF_K		0x00
#define		BPF_X		0x08

One more important data structure in bpf is seccomp_data as below, which contains the necessary info for a syscall.

/*
 * The "packet" a seccomp filter reads with BPF_ABS loads. In the
 * seccomp-tools dumps, nr is at offset 0, arch at offset 4 and args[2] at
 * offset 0x20.
 */
struct seccomp_data {
    int nr;                     /* syscall number */
    __u32 arch;                 /* architecture value, compared against AUDIT_ARCH_* */
    __u64 instruction_pointer;  /* presumably the instruction pointer at syscall time - confirm */
    __u64 args[6];              /* syscall arguments; each is 64 bits wide */
};

Now, let me explain what happens in the sample code.
Check Architecture

	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ArchField),
	BPF_JUMP( BPF_JMP+BPF_JEQ+BPF_K, AUDIT_ARCH_X86_64, 1, 0),
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),

stmt1: Load the arch field from the packet (struct seccomp_data) into the accumulator A.
jump: Skip the next instruction if the value in A is equal to AUDIT_ARCH_X86_64; otherwise fall through to it.
stmt2: Return from the BPF execution and kill the process.

Get Syscall Number

BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)),

stmt: Load the syscall number from the packet into the accumulator A.

Set Legal Syscall Number

#define Allow(syscall) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_##syscall, 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)

jump: Fall through to the next instruction if the value in A is equal to the corresponding syscall number; otherwise skip it.
stmt: Return from the filter execution and allow the syscall.

Advanced BPF filter

Now, let me use a BPF filter to implement a seccomp rule that checks one argument, as I did in my previous post.

//gcc code1.c -o code1 -lseccomp -no-pie
#include <stdio.h>
#include <string.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/user.h>
#include <sys/reg.h>
#include <unistd.h>
#include <stddef.h>
#include <sys/syscall.h>
#include <sys/prctl.h>     /* prctl */
#include <linux/seccomp.h> /* seccomp's constants */
#include <linux/filter.h>
#include <linux/audit.h>
#include <seccomp.h>
#include <stdlib.h>
#include <unistd.h>

#define longsize 8

char buffer[100];

/* Byte offset of the arch member inside struct seccomp_data; used as the address of a BPF_ABS load. */
#define ArchField offsetof(struct seccomp_data, arch)
/* Byte offset of syscall argument number `index` (0-based) inside struct seccomp_data. */
#define ARG(index) offsetof(struct seccomp_data, args[index])

/*
 * Allow(syscall): two instructions - if the accumulator equals SYS_<syscall>
 * fall through to "return SECCOMP_RET_ALLOW", otherwise skip that return.
 * Not referenced by the filter array below.
 */
#define Allow(syscall) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_##syscall, 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)

/*
 * Deny(syscall): same shape as Allow(), but the guarded return is
 * SECCOMP_RET_KILL. Not referenced by the filter array below.
 */
#define Deny(syscall) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_##syscall, 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)

/*
 * Deny-list filter: everything is allowed except write() whose third
 * argument equals 33. Jump offsets are relative to the instruction after the
 * jump, so the order of the entries matters.
 */
struct sock_filter filter[] = {
	/* validate arch: on x86-64 skip the KILL (jt = 1), otherwise fall into it */
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ArchField),
	BPF_JUMP( BPF_JMP+BPF_JEQ+BPF_K, AUDIT_ARCH_X86_64, 1, 0),
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),

	/* load syscall */
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)),
	
	/*
	 * if syscall is write, go for further check, otherwise go to ALLOW
	 * (jf = 3 skips the load, the compare and the KILL below)
	 */
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_write, 0, 3),

	/*
	 * get args: BPF_W loads a single 32-bit word at the offset of args[2].
	 * NOTE(review): the other 32 bits of the 64-bit argument are never
	 * examined - confirm this is acceptable.
	 */
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ARG(2)),
	/* value == 33: fall through to KILL; anything else skips to ALLOW */
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 33, 0, 1),
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),

	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
};

/* Descriptor passed to prctl(PR_SET_SECCOMP, ...): pointer to the filter array and its instruction count. */
struct sock_fprog filterprog = {
	.filter = filter,
	.len = sizeof filter / sizeof *filter
};

/*
 * Installs the seccomp filter, forks a traced child that execs
 * "/bin/cat flag", and logs every write() syscall of the child (arguments at
 * syscall entry, return value at syscall exit) until the child terminates.
 *
 * Returns 0 from the parent; exits with status 1 when the filter cannot be
 * installed, when fork() fails, or (in the child) when execve() fails.
 */
int main()
{
	char *argv[] = {"/bin/cat", "flag", NULL};
	char *env[] = {NULL};
	char cmd[20] = "/bin/cat";
	struct user_regs_struct regs;
	int insyscall = 0;	/* toggles between syscall-entry (0) and syscall-exit (1) stops */
	long orig_rax;
	pid_t pid;
	int rv;

	/*
	 * prctl(2) requires the unused arguments of PR_SET_NO_NEW_PRIVS to be 0.
	 * A failure is only reported: PR_SET_SECCOMP below is the decisive call.
	 */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1)
		perror("prctl(PR_SET_NO_NEW_PRIVS)");

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &filterprog) == -1) {
		perror("Could not start seccomp:");
		exit(1);
	}

	pid = fork();
	if (pid == -1) {
		perror("fork");
		exit(1);
	}

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		syscall(SYS_execve, cmd, argv, env);
		/* only reached when execve() failed: never run the tracer loop in the child */
		perror("execve");
		_exit(1);
	}

	while (1) {
		/* stop when there is nothing left to wait for ... */
		if (wait(&rv) == -1)
			break;
		/* ... or when the child is gone, whether it exited or was killed (e.g. by the filter) */
		if (WIFEXITED(rv) || WIFSIGNALED(rv))
			break;

		/* ptrace() is variadic and expects a pointer-sized address argument */
		orig_rax = ptrace(PTRACE_PEEKUSER, pid, (void *)(sizeof(long) * ORIG_RAX), NULL);
		if (orig_rax == SYS_write) {
			if (insyscall == 0) {
				printf("Syscall number: %ld\n", orig_rax);
				ptrace(PTRACE_GETREGS, pid, NULL, &regs);
				printf("Write called with 0x%llx, 0x%llx, 0x%llx\n",
				       (unsigned long long)regs.rdi,
				       (unsigned long long)regs.rsi,
				       (unsigned long long)regs.rdx);
				insyscall = 1;
			} else {
				long rax = ptrace(PTRACE_PEEKUSER, pid, (void *)(sizeof(long) * RAX), NULL);
				printf("\nWrite returned with %ld\n", rax);
				insyscall = 0;
			}
		}
		ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
	}
	return 0;
}

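flag is a text file containing 32 bytes of data; cat writes it with a length of 33 (0x21 in the trace below), which is exactly the length the filter kills. The result is given below; the content of flag is never printed: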

$ sudo ./code1
Syscall number: 1
Write called with 0x1, 0x7f7d2b3a0000, 0x21

With the help of seccomp-tools, we can view the seccomp rules set in this program.

$ sudo seccomp-tools dump ./code1
 line  CODE  JT   JF      K
=================================
 0000: 0x20 0x00 0x00 0x00000004  A = arch
 0001: 0x15 0x01 0x00 0xc000003e  if (A == ARCH_X86_64) goto 0003
 0002: 0x06 0x00 0x00 0x00000000  return KILL
 0003: 0x20 0x00 0x00 0x00000000  A = sys_number
 0004: 0x15 0x00 0x03 0x00000001  if (A != write) goto 0008
 0005: 0x20 0x00 0x00 0x00000020  A = args[2]
 0006: 0x15 0x00 0x01 0x00000021  if (A != 0x21) goto 0008
 0007: 0x06 0x00 0x00 0x00000000  return KILL
 0008: 0x06 0x00 0x00 0x7fff0000  return ALLOW

In the given sample code, we define a new macro ARG(index) that yields the offset of a syscall argument inside seccomp_data, so that a BPF statement can load it.

BPF filter and ptrace

Here I am going to give the code on how to bypass the seccomp filter via ptrace. The file flag still contains 32 bytes data.

#include <stdio.h>
#include <string.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/user.h>
#include <sys/reg.h>
#include <unistd.h>
#include <stddef.h>
#include <sys/syscall.h>
#include <sys/prctl.h>     /* prctl */
#include <linux/seccomp.h> /* seccomp's constants */
#include <linux/filter.h>
#include <linux/audit.h>
#include <seccomp.h>
#include <stdlib.h>
#include <unistd.h>

#define longsize 8

char buffer[100];

/* Byte offset of the arch member inside struct seccomp_data; used as the address of a BPF_ABS load. */
#define ArchField offsetof(struct seccomp_data, arch)
/* Byte offset of syscall argument number `index` (0-based) inside struct seccomp_data. */
#define ARG(index) offsetof(struct seccomp_data, args[index])

/*
 * Allow(syscall): two instructions - if the accumulator equals SYS_<syscall>
 * fall through to "return SECCOMP_RET_ALLOW", otherwise skip that return.
 * Not referenced by the filter array below.
 */
#define Allow(syscall) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_##syscall, 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)

/*
 * Deny(syscall): same shape as Allow(), but the guarded return is
 * SECCOMP_RET_KILL. Not referenced by the filter array below.
 */
#define Deny(syscall) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_##syscall, 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)

/*
 * Deny-list filter: everything is allowed except write() whose third
 * argument equals 33. Jump offsets are relative to the instruction after the
 * jump, so the order of the entries matters.
 */
struct sock_filter filter[] = {
	/* validate arch: on x86-64 skip the KILL (jt = 1), otherwise fall into it */
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ArchField),
	BPF_JUMP( BPF_JMP+BPF_JEQ+BPF_K, AUDIT_ARCH_X86_64, 1, 0),
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),

	/* load syscall */
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)),
	
	/*
	 * if syscall is write, go for further check, otherwise go to ALLOW
	 * (jf = 3 skips the load, the compare and the KILL below)
	 */
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_write, 0, 3),

	/*
	 * get args: BPF_W loads a single 32-bit word at the offset of args[2].
	 * NOTE(review): the other 32 bits of the 64-bit argument are never
	 * examined - confirm this is acceptable.
	 */
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ARG(2)),
	/* value == 33: fall through to KILL; anything else skips to ALLOW */
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 33, 0, 1),
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),

	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
};

/* Descriptor passed to prctl(PR_SET_SECCOMP, ...): pointer to the filter array and its instruction count. */
struct sock_fprog filterprog = {
	.filter = filter,
	.len = sizeof filter / sizeof *filter
};



/*
 * Installs the seccomp filter, forks a traced child that execs
 * "/bin/cat flag", and rewrites the length argument of every write() of
 * exactly 33 bytes to 34 at the syscall-entry stop, so that the value the
 * filter compares against no longer matches.
 *
 * NOTE(review): this only defeats the filter if the kernel evaluates the
 * filter after the tracer's syscall-entry stop, as the output shown in the
 * article suggests - confirm for the kernel version in use.
 *
 * Returns 0 from the parent; exits with status 1 when the filter cannot be
 * installed, when fork() fails, or (in the child) when execve() fails.
 */
int main()
{
	char *argv[] = {"/bin/cat", "flag", NULL};
	char *env[] = {NULL};
	char cmd[20] = "/bin/cat";
	struct user_regs_struct regs;
	int insyscall = 0;	/* toggles between syscall-entry (0) and syscall-exit (1) stops */
	long orig_rax;
	pid_t pid;
	int rv;

	/*
	 * prctl(2) requires the unused arguments of PR_SET_NO_NEW_PRIVS to be 0.
	 * A failure is only reported: PR_SET_SECCOMP below is the decisive call.
	 */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1)
		perror("prctl(PR_SET_NO_NEW_PRIVS)");

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &filterprog) == -1) {
		perror("Could not start seccomp:");
		exit(1);
	}

	pid = fork();
	if (pid == -1) {
		perror("fork");
		exit(1);
	}

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		syscall(SYS_execve, cmd, argv, env);
		/* only reached when execve() failed: never run the tracer loop in the child */
		perror("execve");
		_exit(1);
	}

	while (1) {
		/* stop when there is nothing left to wait for ... */
		if (wait(&rv) == -1)
			break;
		/* ... or when the child is gone, whether it exited or was killed (e.g. by the filter) */
		if (WIFEXITED(rv) || WIFSIGNALED(rv))
			break;

		/* ptrace() is variadic and expects a pointer-sized address argument */
		orig_rax = ptrace(PTRACE_PEEKUSER, pid, (void *)(sizeof(long) * ORIG_RAX), NULL);
		if (orig_rax == SYS_write) {
			if (insyscall == 0) {
				printf("Syscall number: %ld\n", orig_rax);
				ptrace(PTRACE_GETREGS, pid, NULL, &regs);
				printf("Write called with 0x%llx, 0x%llx, 0x%llx\n",
				       (unsigned long long)regs.rdi,
				       (unsigned long long)regs.rsi,
				       (unsigned long long)regs.rdx);
				if (regs.rdx == 33) {
					/* change the length the filter would match on */
					regs.rdx = 34;
					/* keep the ptrace result out of rv, which holds the wait status */
					if (ptrace(PTRACE_SETREGS, pid, NULL, &regs) == -1)
						perror("ptrace(PTRACE_SETREGS)");
				}
				insyscall = 1;
			} else {
				long rax = ptrace(PTRACE_PEEKUSER, pid, (void *)(sizeof(long) * RAX), NULL);
				printf("\nWrite returned with %ld\n", rax);
				insyscall = 0;
			}
		}
		ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
	}
	return 0;
}

And the output is given below:

$ sudo ./code2
Syscall number: 1
Write called with 0x1, 0x7f8971ba6000, 0x21
DANGOKYO{THIS_IS_SECCOMP_FILTER}

Write returned with 34

This time we use ptrace to change the third argument register from 33 to 34. We can see that the ptrace trick used in my previous post still works against a seccomp filter.

Conclusion

In this post, I gave an introduction to the BPF filter. Compared to seccomp_rule_add, a BPF filter provides a more flexible and customizable way to set seccomp rules.

Reference

[1] https://eigenstate.org/notes/seccomp
[2] http://www.gsp.com/cgi-bin/man.cgi?topic=bpf

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

This site uses Akismet to reduce spam. Learn how your comment data is processed.