Introduction
In my previous post, I introduced what seccomp is, how to set seccomp rules via seccomp_rule_add, and how to bypass them via ptrace. In this post, I am going to further introduce the seccomp filter, another way to set seccomp rules in a program. This post will also involve some kernel debugging to illustrate the internals of the bpf filter.
First Glance
First, let’s use the following code to explain what a seccomp filter is. Part of the code is borrowed from [1].
//gcc code0.c -o code0 -lseccomp -no-pie #include <stdio.h> #include <string.h> #include <sys/ptrace.h> #include <sys/types.h> #include <sys/wait.h> #include <sys/user.h> #include <sys/reg.h> #include <unistd.h> #include <stddef.h> #include <sys/syscall.h> #include <sys/prctl.h> /* prctl */ #include <linux/seccomp.h> /* seccomp's constants */ #include <linux/filter.h> #include <linux/audit.h> #include <seccomp.h> #include <stdlib.h> #include <unistd.h> #define longsize 8 char buffer[100]; #define ArchField offsetof(struct seccomp_data, arch) #define Allow(syscall) \ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_##syscall, 0, 1), \ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) struct sock_filter filter[] = { /* validate arch */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ArchField), BPF_JUMP( BPF_JMP+BPF_JEQ+BPF_K, AUDIT_ARCH_X86_64, 1, 0), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), /* load syscall */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), /* list of allowed syscalls */ Allow(exit_group), /* exits a processs */ Allow(brk), /* for malloc(), inside libc */ Allow(mmap), /* also for malloc() */ Allow(munmap), /* for free(), inside libc */ Allow(write), /* called by printf */ Allow(fstat), /* called by printf */ /* and if we don't match above, die */ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), }; struct sock_fprog filterprog = { .len = sizeof(filter)/sizeof(filter[0]), .filter = filter }; int main() { pid_t pid; int rv; long orig_rax; char *argv[]={"/bin/cat", "flag", NULL}; char *env[]={NULL}; char cmd[20] = "/bin/cat"; long length; long addr; int insyscall = 0; struct user_regs_struct regs; prctl(PR_SET_NO_NEW_PRIVS, 1); if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &filterprog) == -1) { perror("Could not start seccomp:"); exit(1); } pid = fork(); if(pid == 0) { ptrace(PTRACE_TRACEME, 0, NULL, NULL); syscall(59, cmd ,argv, env); } else { waitpid(pid, &rv, 0); printf("The child process exits\n"); } return 0; }
With the help of seccomp-tools, we can clearly view the seccomp rules set in the sample code
$ sudo seccomp-tools dump ./code0 line CODE JT JF K ================================= 0000: 0x20 0x00 0x00 0x00000004 A = arch 0001: 0x15 0x01 0x00 0xc000003e if (A == ARCH_X86_64) goto 0003 0002: 0x06 0x00 0x00 0x00000000 return KILL 0003: 0x20 0x00 0x00 0x00000000 A = sys_number 0004: 0x15 0x00 0x01 0x000000e7 if (A != exit_group) goto 0006 0005: 0x06 0x00 0x00 0x7fff0000 return ALLOW 0006: 0x15 0x00 0x01 0x0000000c if (A != brk) goto 0008 0007: 0x06 0x00 0x00 0x7fff0000 return ALLOW 0008: 0x15 0x00 0x01 0x00000009 if (A != mmap) goto 0010 0009: 0x06 0x00 0x00 0x7fff0000 return ALLOW 0010: 0x15 0x00 0x01 0x0000000b if (A != munmap) goto 0012 0011: 0x06 0x00 0x00 0x7fff0000 return ALLOW 0012: 0x15 0x00 0x01 0x00000001 if (A != write) goto 0014 0013: 0x06 0x00 0x00 0x7fff0000 return ALLOW 0014: 0x15 0x00 0x01 0x00000005 if (A != fstat) goto 0016 0015: 0x06 0x00 0x00 0x7fff0000 return ALLOW 0016: 0x06 0x00 0x00 0x00000000 return KILL
And the final result is as expected.
$ sudo ./code0 Bad system call
Internal of BPF
In this section, I am going to introduce how the seccomp rules are set in the code above. During the test, I will also introduce some code in Linux Kernel for illustration. The kernel code I use is 4.4-31.
struct sock_fprog
Structure sock_fprog is the basic data structure to carry the seccomp rules from user space to kernel space.
/* Carries a filter program (instruction count plus instruction array). */
struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
	unsigned short		len;	/* Number of filter blocks */
	struct sock_filter __user *filter;	/* user-space array of len filter blocks */
};
In code of linux kernel:
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { struct task_struct *me = current; unsigned char comm[sizeof(me->comm)]; long error; error = security_task_prctl(option, arg2, arg3, arg4, arg5); if (error != -ENOSYS) return error; error = 0; switch (option) { //other options case PR_SET_SECCOMP: error = prctl_set_seccomp(arg2, (char __user *)arg3); break; //other options } return error; }
Then we take a step forward to have a look at prctl_set_seccomp:
// /kernel/seccomp.c /** * prctl_set_seccomp: configures current->seccomp.mode * @seccomp_mode: requested mode to use * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER * * Returns 0 on success or -EINVAL on failure. */ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) { unsigned int op; char __user *uargs; switch (seccomp_mode) { case SECCOMP_MODE_STRICT: op = SECCOMP_SET_MODE_STRICT; /* * Setting strict mode through prctl always ignored filter, * so make sure it is always NULL here to pass the internal * check in do_seccomp(). */ uargs = NULL; break; case SECCOMP_MODE_FILTER: op = SECCOMP_SET_MODE_FILTER; uargs = filter; break; default: return -EINVAL; } /* prctl interface doesn't have flags, so they are always zero. */ return do_seccomp(op, 0, uargs); } /* Common entry point for both prctl and syscall. */ static long do_seccomp(unsigned int op, unsigned int flags, const char __user *uargs) { switch (op) { case SECCOMP_SET_MODE_STRICT: if (flags != 0 || uargs != NULL) return -EINVAL; return seccomp_set_mode_strict(); case SECCOMP_SET_MODE_FILTER: return seccomp_set_mode_filter(flags, uargs); default: return -EINVAL; } }
Function seccomp_set_mode_filter will later assign the filter to the current process according to the content of uargs. More details will be discussed in a future post. Next we will discuss how the seccomp rules are generated.
BPF filter
sock_filter is the basic data structure used to encode a bpf filter rule. As denoted in the comments, code is the opcode of the instruction, jt and jf are the jump offsets taken on the true branch and the false branch of a jump instruction, and k is a multiuse field. In the linux kernel, there are two macros for users to create filter instructions:
// source/include/uapi/linux/filter.h /* * Try and keep these values and structures similar to BSD, especially * the BPF code definitions which need to match so you can share filters */ struct sock_filter { /* Filter block */ __u16 code; /* Actual filter code */ __u8 jt; /* Jump true */ __u8 jf; /* Jump false */ __u32 k; /* Generic multiuse field */ }; /* * Macros for filter block array initializers. */ #ifndef BPF_STMT #define BPF_STMT(code, k) { (unsigned short)(code), 0, 0, k } #endif #ifndef BPF_JUMP #define BPF_JUMP(code, k, jt, jf) { (unsigned short)(code), jt, jf, k } #endif
BPF Constant
The BPF instructions operate on the BPF virtual machine, which has four main elements: The accumulator register A, the index register X, the packet memory, and the scratch memory M[] [1].
Here we list the source code in linux kernel for defining necessary constant. More details of the meaning of the code could be found in [1].
#define BPF_CLASS(code) ((code) & 0x07) #define BPF_LD 0x00 #define BPF_LDX 0x01 #define BPF_ST 0x02 #define BPF_STX 0x03 #define BPF_ALU 0x04 #define BPF_JMP 0x05 #define BPF_RET 0x06 #define BPF_MISC 0x07 /* ld/ldx fields */ #define BPF_SIZE(code) ((code) & 0x18) #define BPF_W 0x00 #define BPF_H 0x08 #define BPF_B 0x10 #define BPF_MODE(code) ((code) & 0xe0) #define BPF_IMM 0x00 #define BPF_ABS 0x20 #define BPF_IND 0x40 #define BPF_MEM 0x60 #define BPF_LEN 0x80 #define BPF_MSH 0xa0 /* alu/jmp fields */ #define BPF_OP(code) ((code) & 0xf0) #define BPF_ADD 0x00 #define BPF_SUB 0x10 #define BPF_MUL 0x20 #define BPF_DIV 0x30 #define BPF_OR 0x40 #define BPF_AND 0x50 #define BPF_LSH 0x60 #define BPF_RSH 0x70 #define BPF_NEG 0x80 #define BPF_MOD 0x90 #define BPF_XOR 0xa0 #define BPF_JA 0x00 #define BPF_JEQ 0x10 #define BPF_JGT 0x20 #define BPF_JGE 0x30 #define BPF_JSET 0x40 #define BPF_SRC(code) ((code) & 0x08) #define BPF_K 0x00 #define BPF_X 0x08
One more important data structure in bpf is seccomp_data as below, which contains the necessary info for a syscall.
struct seccomp_data { int nr; __u32 arch; __u64 instruction_pointer; __u64 args[6]; };
Now, let me explain what happens in the sample code.
Check Architecture
BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ArchField), BPF_JUMP( BPF_JMP+BPF_JEQ+BPF_K, AUDIT_ARCH_X86_64, 1, 0), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
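stmt1: Load the arch field of struct seccomp_data (which plays the role of the "packet" here) into the accumulator A.
jump: Skip the next instruction (jt = 1) if the value in A is equal to AUDIT_ARCH_X86_64; otherwise fall through (jf = 0) to stmt2.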
stmt2: Return from the BPF execution and kill the process.
Get Syscall Number
BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)),
stmt: Load syscall number from packet to variable A.
Set Legal Syscall Number
#define Allow(syscall) \ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_##syscall, 0, 1), \ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
jump: Fall through to the next instruction (jt = 0) if the value in A is equal to the corresponding syscall number; otherwise skip it (jf = 1) and continue with the following check.
stmt: Return from the emulation, and allow the syscall.
Advanced BPF filter
Now, let me use the bpf filter to implement a seccomp rule that checks one syscall argument, as I did in my previous post.
//gcc code1.c -o code1 -lseccomp -no-pie #include <stdio.h> #include <string.h> #include <sys/ptrace.h> #include <sys/types.h> #include <sys/wait.h> #include <sys/user.h> #include <sys/reg.h> #include <unistd.h> #include <stddef.h> #include <sys/syscall.h> #include <sys/prctl.h> /* prctl */ #include <linux/seccomp.h> /* seccomp's constants */ #include <linux/filter.h> #include <linux/audit.h> #include <seccomp.h> #include <stdlib.h> #include <unistd.h> #define longsize 8 char buffer[100]; #define ArchField offsetof(struct seccomp_data, arch) #define ARG(index) offsetof(struct seccomp_data, args[index]) #define Allow(syscall) \ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_##syscall, 0, 1), \ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) #define Deny(syscall) \ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_##syscall, 0, 1), \ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) struct sock_filter filter[] = { /* validate arch */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ArchField), BPF_JUMP( BPF_JMP+BPF_JEQ+BPF_K, AUDIT_ARCH_X86_64, 1, 0), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), /* load syscall */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), /* if syscall is write, go for further check, otherwise go to ALLOW*/ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_write, 0, 3), /* get args */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ARG(2)), BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 33, 0, 1), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog filterprog = { .len = sizeof(filter)/sizeof(filter[0]), .filter = filter }; int main() { pid_t pid; int rv; long orig_rax; char *argv[]={"/bin/cat", "flag", NULL}; char *env[]={NULL}; char cmd[20] = "/bin/cat"; long length; long addr; int insyscall = 0; struct user_regs_struct regs; prctl(PR_SET_NO_NEW_PRIVS, 1); if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &filterprog) == -1) { perror("Could not start seccomp:"); exit(1); } pid = fork(); if(pid == 0) { ptrace(PTRACE_TRACEME, 0, NULL, NULL); syscall(59, cmd ,argv, env); } else 
{ while(1) { wait(&rv); if(WIFEXITED(rv)){ break; } orig_rax = ptrace(PTRACE_PEEKUSER, pid, 8 * ORIG_RAX, NULL); if(orig_rax == 1) { if(insyscall == 0) { printf("Syscall number: %d\n", orig_rax); ptrace(PTRACE_GETREGS, pid, NULL, ®s); printf("Write called with 0x%lx, 0x%lx, 0x%lx\n", regs.rdi, regs.rsi, regs.rdx); insyscall = 1; } else { int rax = ptrace(PTRACE_PEEKUSER, pid, 8 * RAX, NULL); printf("\nWrite returned with %d\n", rax); insyscall = 0; } } ptrace(PTRACE_SYSCALL, pid, NULL, NULL); } } return 0; }
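flag is a text file containing 32 bytes of data (the write below is issued with a length of 0x21 = 33, presumably because of a trailing newline). The result is given below; the content of flag is never printed because the filter kills that write: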
$ sudo ./code1 Syscall number: 1 Write called with 0x1, 0x7f7d2b3a0000, 0x21
With the help of seccomp-tools, we can view the seccomp rules set in this.
$ sudo seccomp-tools dump ./code1 line CODE JT JF K ================================= 0000: 0x20 0x00 0x00 0x00000004 A = arch 0001: 0x15 0x01 0x00 0xc000003e if (A == ARCH_X86_64) goto 0003 0002: 0x06 0x00 0x00 0x00000000 return KILL 0003: 0x20 0x00 0x00 0x00000000 A = sys_number 0004: 0x15 0x00 0x03 0x00000001 if (A != write) goto 0008 0005: 0x20 0x00 0x00 0x00000020 A = args[2] 0006: 0x15 0x00 0x01 0x00000021 if (A != 0x21) goto 0008 0007: 0x06 0x00 0x00 0x00000000 return KILL 0008: 0x06 0x00 0x00 0x7fff0000 return ALLOW
In the given sample code, we define a new macro ARG(index), which yields the offset of the index-th syscall argument inside struct seccomp_data so that a bpf load statement can fetch it.
BPF filter and ptrace
Here I am going to give the code showing how to bypass the seccomp filter via ptrace. The file flag still contains 32 bytes of data.
#include <stdio.h> #include <string.h> #include <sys/ptrace.h> #include <sys/types.h> #include <sys/wait.h> #include <sys/user.h> #include <sys/reg.h> #include <unistd.h> #include <stddef.h> #include <sys/syscall.h> #include <sys/prctl.h> /* prctl */ #include <linux/seccomp.h> /* seccomp's constants */ #include <linux/filter.h> #include <linux/audit.h> #include <seccomp.h> #include <stdlib.h> #include <unistd.h> #define longsize 8 char buffer[100]; #define ArchField offsetof(struct seccomp_data, arch) #define ARG(index) offsetof(struct seccomp_data, args[index]) #define Allow(syscall) \ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_##syscall, 0, 1), \ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) #define Deny(syscall) \ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_##syscall, 0, 1), \ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) struct sock_filter filter[] = { /* validate arch */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ArchField), BPF_JUMP( BPF_JMP+BPF_JEQ+BPF_K, AUDIT_ARCH_X86_64, 1, 0), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), /* load syscall */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), /* if syscall is write, go for further check, otherwise go to ALLOW*/ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SYS_write, 0, 3), /* get args */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ARG(2)), BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 33, 0, 1), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog filterprog = { .len = sizeof(filter)/sizeof(filter[0]), .filter = filter }; int main() { pid_t pid; int rv; long orig_rax; char *argv[]={"/bin/cat", "flag", NULL}; char *env[]={NULL}; char cmd[20] = "/bin/cat"; long length; long addr; int insyscall = 0; struct user_regs_struct regs; prctl(PR_SET_NO_NEW_PRIVS, 1); if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &filterprog) == -1) { perror("Could not start seccomp:"); exit(1); } pid = fork(); if(pid == 0) { ptrace(PTRACE_TRACEME, 0, NULL, NULL); syscall(59, cmd ,argv, env); } else { while(1) { wait(&rv); 
if(WIFEXITED(rv)){ break; } orig_rax = ptrace(PTRACE_PEEKUSER, pid, 8 * ORIG_RAX, NULL); if(orig_rax == 1) { if(insyscall == 0) { printf("Syscall number: %d\n", orig_rax); ptrace(PTRACE_GETREGS, pid, NULL, ®s); printf("Write called with 0x%lx, 0x%lx, 0x%lx\n", regs.rdi, regs.rsi, regs.rdx); addr = regs.rsi; length = regs.rdx; if(regs.rdx == 33) { regs.rdx = 34; } rv = ptrace(PTRACE_SETREGS, pid, NULL, ®s); insyscall = 1; } else { int rax = ptrace(PTRACE_PEEKUSER, pid, 8 * RAX, NULL); printf("\nWrite returned with %d\n", rax); insyscall = 0; } } ptrace(PTRACE_SYSCALL, pid, NULL, NULL); } } return 0; }
And the output is given below:
$ sudo ./code2 Syscall number: 1 Write called with 0x1, 0x7f8971ba6000, 0x21 DANGOKYO{THIS_IS_SECCOMP_FILTER} Write returned with 34
This time we use ptrace to modify the third argument register to 34 rather than 33. We can see that the ptrace trick used in my previous post still works for the seccomp filter.
Conclusion
In this post, I gave an introduction to the bpf filter. Compared to seccomp_rule_add, the bpf filter provides a more flexible and customizable way to set seccomp rules.
Reference
[1] https://eigenstate.org/notes/seccomp
[2] http://www.gsp.com/cgi-bin/man.cgi?topic=bpf