[lvc-project] [PATCH 6.1.y 1/3] mm/hugetlb: unshare page tables during VMA split, not before

Fedor Pchelkin pchelkin at ispras.ru
Mon Jul 7 13:39:56 MSK 2025


Hello Jann,

On Fri, 20 Jun 2025 23:33:31 +0200, Jann Horn wrote:
> Currently, __split_vma() triggers hugetlb page table unsharing through
> vm_ops->may_split().  This happens before the VMA lock and rmap locks are
> taken - which is too early, it allows racing VMA-locked page faults in our
> process and racing rmap walks from other processes to cause page tables to
> be shared again before we actually perform the split.
> 
> Fix it by explicitly calling into the hugetlb unshare logic from
> __split_vma() in the same place where THP splitting also happens.  At that
> point, both the VMA and the rmap(s) are write-locked.
> 
> An annoying detail is that we can now call into the helper
> hugetlb_unshare_pmds() from two different locking contexts:
> 
> 1. from hugetlb_split(), holding:
>     - mmap lock (exclusively)
>     - VMA lock
>     - file rmap lock (exclusively)
> 2. hugetlb_unshare_all_pmds(), which I think is designed to be able to
>    call us with only the mmap lock held (in shared mode), but currently
>    only runs while holding mmap lock (exclusively) and VMA lock
> 
> Backporting note:
> This commit fixes a racy protection that was introduced in commit
> b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that
> commit claimed to fix an issue introduced in 5.13, but it should actually
> also go all the way back.
> 
> [jannh at google.com: v2]
>   Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com
> Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com
> Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com
> Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
> Signed-off-by: Jann Horn <jannh at google.com>
> Cc: Liam Howlett <liam.howlett at oracle.com>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes at oracle.com>
> Reviewed-by: Oscar Salvador <osalvador at suse.de>
> Cc: Lorenzo Stoakes <lorenzo.stoakes at oracle.com>
> Cc: Vlastimil Babka <vbabka at suse.cz>
> Cc: <stable at vger.kernel.org>	[b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs]
> Cc: <stable at vger.kernel.org>
> Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
> [stable backport: code got moved around, VMA splitting is in
> __vma_adjust]
> Signed-off-by: Jann Horn <jannh at google.com>
> ---
>  include/linux/hugetlb.h |  3 +++
>  mm/hugetlb.c            | 60 ++++++++++++++++++++++++++++++-----------
>  mm/mmap.c               |  8 ++++++
>  3 files changed, 55 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index cc555072940f..26f2947c399d 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -239,6 +239,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
>  
>  bool is_hugetlb_entry_migration(pte_t pte);
>  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
> +void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
>  
>  #else /* !CONFIG_HUGETLB_PAGE */
>  
> @@ -472,6 +473,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
>  
>  static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
>  
> +static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
> +
>  #endif /* !CONFIG_HUGETLB_PAGE */
>  /*
>   * hugepages at page global directory. If arch support
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 14b9494c58ed..fc5d3d665266 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -95,7 +95,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
>  static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
>  static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
>  static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
> -		unsigned long start, unsigned long end);
> +		unsigned long start, unsigned long end, bool take_locks);
>  static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
>  
>  static inline bool subpool_is_free(struct hugepage_subpool *spool)
> @@ -4900,26 +4900,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
>  {
>  	if (addr & ~(huge_page_mask(hstate_vma(vma))))
>  		return -EINVAL;
> +	return 0;
> +}
>  
> +void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
> +{
>  	/*
>  	 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
>  	 * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
>  	 * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
> +	 * This function is called in the middle of a VMA split operation, with
> +	 * MM, VMA and rmap all write-locked to prevent concurrent page table
> +	 * walks (except hardware and gup_fast()).
>  	 */
> +	mmap_assert_write_locked(vma->vm_mm);
> +	i_mmap_assert_write_locked(vma->vm_file->f_mapping);


The above i_mmap lock assertion fires on stable kernels from 5.10 through 6.1
inclusive.

------------[ cut here ]------------
WARNING: CPU: 0 PID: 11489 at include/linux/fs.h:503 i_mmap_assert_write_locked include/linux/fs.h:503 [inline]
WARNING: CPU: 0 PID: 11489 at include/linux/fs.h:503 hugetlb_split+0x267/0x300 mm/hugetlb.c:4917
Modules linked in:
CPU: 0 PID: 11489 Comm: syz-executor.4 Not tainted 6.1.142-syzkaller-00296-gfd0df5221577 #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
RIP: 0010:i_mmap_assert_write_locked include/linux/fs.h:503 [inline]
RIP: 0010:hugetlb_split+0x267/0x300 mm/hugetlb.c:4917
Call Trace:
 <TASK>
 __vma_adjust+0xd73/0x1c10 mm/mmap.c:736
 vma_adjust include/linux/mm.h:2745 [inline]
 __split_vma+0x459/0x540 mm/mmap.c:2385
 do_mas_align_munmap+0x5f2/0xf10 mm/mmap.c:2497
 do_mas_munmap+0x26c/0x2c0 mm/mmap.c:2646
 __mmap_region mm/mmap.c:2694 [inline]
 mmap_region+0x19f/0x1770 mm/mmap.c:2912
 do_mmap+0x84b/0xf20 mm/mmap.c:1432
 vm_mmap_pgoff+0x1af/0x280 mm/util.c:520
 ksys_mmap_pgoff+0x41f/0x5a0 mm/mmap.c:1478
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x35/0x80 arch/x86/entry/common.c:81
 entry_SYSCALL_64_after_hwframe+0x6e/0xd8
RIP: 0033:0x46a269
 </TASK>

Found by Linux Verification Center (linuxtesting.org) with Syzkaller.


The root cause is that those branches lack the following commit:

  commit ccf1d78d8b86e28502fa1b575a459a402177def4
  Author: Suren Baghdasaryan <surenb at google.com>
  Date:   Mon Feb 27 09:36:13 2023 -0800
  
      mm/mmap: move vma_prepare before vma_adjust_trans_huge
      
      vma_prepare() acquires all locks required before VMA modifications.  Move
      vma_prepare() before vma_adjust_trans_huge() so that VMA is locked before
      any modification.
      
      Link: https://lkml.kernel.org/r/20230227173632.3292573-15-surenb@google.com
      Signed-off-by: Suren Baghdasaryan <surenb at google.com>
      Signed-off-by: Andrew Morton <akpm at linux-foundation.org>

As a result, the required i_mmap lock is only acquired after
vma_adjust_trans_huge() and the newly added hugetlb_split() calls have
already run.
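
For reference, with the backport applied the relevant ordering in the 6.1.y
__vma_adjust() looks roughly like this (a simplified sketch of my reading of
the code, not an exact excerpt):

  vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
  if (is_vm_hugetlb_page(orig_vma)) {
          hugetlb_split(orig_vma, start);  /* asserts i_mmap rwsem is held */
          hugetlb_split(orig_vma, end);
  }
  if (file) {
          mapping = file->f_mapping;
          ...
          i_mmap_lock_write(mapping);      /* the lock is only taken here */
          ...
  }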

Please have a look at the straightforward fix below. It does something
similar to ccf1d78d8b86 ("mm/mmap: move vma_prepare before
vma_adjust_trans_huge"), but in the context of an old stable branch.

If it looks okay, I'll be glad to prepare it as a formal patch and send it
out for 5.10-5.15 as well.

against 6.1.y
-------------
diff --git a/mm/mmap.c b/mm/mmap.c
index 0f303dc8425a..941880ed62d7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -543,8 +543,6 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
        if (mas_preallocate(mas, vma, GFP_KERNEL))
                goto nomem;
 
-       vma_adjust_trans_huge(vma, start, end, 0);
-
        if (file) {
                mapping = file->f_mapping;
                root = &mapping->i_mmap;
@@ -562,6 +560,8 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
                vma_interval_tree_remove(vma, root);
        }
 
+       vma_adjust_trans_huge(vma, start, end, 0);
+
        vma->vm_start = start;
        vma->vm_end = end;
        vma->vm_pgoff = pgoff;
@@ -727,15 +727,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
                return -ENOMEM;
        }
 
-       /*
-        * Get rid of huge pages and shared page tables straddling the split
-        * boundary.
-        */
-       vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
-       if (is_vm_hugetlb_page(orig_vma)) {
-               hugetlb_split(orig_vma, start);
-               hugetlb_split(orig_vma, end);
-       }
        if (file) {
                mapping = file->f_mapping;
                root = &mapping->i_mmap;
@@ -775,6 +766,16 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
                        vma_interval_tree_remove(next, root);
        }
 
+       /*
+        * Get rid of huge pages and shared page tables straddling the split
+        * boundary.
+        */
+       vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
+       if (is_vm_hugetlb_page(orig_vma)) {
+               hugetlb_split(orig_vma, start);
+               hugetlb_split(orig_vma, end);
+       }
+
        if (start != vma->vm_start) {
                if ((vma->vm_start < start) &&
                    (!insert || (insert->vm_end != start))) {


--
Fedor

> +
>  	if (addr & ~PUD_MASK) {
> -		/*
> -		 * hugetlb_vm_op_split is called right before we attempt to
> -		 * split the VMA. We will need to unshare PMDs in the old and
> -		 * new VMAs, so let's unshare before we split.
> -		 */
>  		unsigned long floor = addr & PUD_MASK;
>  		unsigned long ceil = floor + PUD_SIZE;
>  
> -		if (floor >= vma->vm_start && ceil <= vma->vm_end)
> -			hugetlb_unshare_pmds(vma, floor, ceil);
> +		if (floor >= vma->vm_start && ceil <= vma->vm_end) {
> +			/*
> +			 * Locking:
> +			 * Use take_locks=false here.
> +			 * The file rmap lock is already held.
> +			 * The hugetlb VMA lock can't be taken when we already
> +			 * hold the file rmap lock, and we don't need it because
> +			 * its purpose is to synchronize against concurrent page
> +			 * table walks, which are not possible thanks to the
> +			 * locks held by our caller.
> +			 */
> +			hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
> +		}
>  	}
> -
> -	return 0;
>  }
>  
>  static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
> @@ -7495,9 +7509,16 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
>  	}
>  }
>  
> +/*
> + * If @take_locks is false, the caller must ensure that no concurrent page table
> + * access can happen (except for gup_fast() and hardware page walks).
> + * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
> + * concurrent page fault handling) and the file rmap lock.
> + */
>  static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
>  				   unsigned long start,
> -				   unsigned long end)
> +				   unsigned long end,
> +				   bool take_locks)
>  {
>  	struct hstate *h = hstate_vma(vma);
>  	unsigned long sz = huge_page_size(h);
> @@ -7521,8 +7542,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
>  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
>  				start, end);
>  	mmu_notifier_invalidate_range_start(&range);
> -	hugetlb_vma_lock_write(vma);
> -	i_mmap_lock_write(vma->vm_file->f_mapping);
> +	if (take_locks) {
> +		hugetlb_vma_lock_write(vma);
> +		i_mmap_lock_write(vma->vm_file->f_mapping);
> +	} else {
> +		i_mmap_assert_write_locked(vma->vm_file->f_mapping);
> +	}
>  	for (address = start; address < end; address += PUD_SIZE) {
>  		ptep = huge_pte_offset(mm, address, sz);
>  		if (!ptep)
> @@ -7532,8 +7557,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
>  		spin_unlock(ptl);
>  	}
>  	flush_hugetlb_tlb_range(vma, start, end);
> -	i_mmap_unlock_write(vma->vm_file->f_mapping);
> -	hugetlb_vma_unlock_write(vma);
> +	if (take_locks) {
> +		i_mmap_unlock_write(vma->vm_file->f_mapping);
> +		hugetlb_vma_unlock_write(vma);
> +	}
>  	/*
>  	 * No need to call mmu_notifier_invalidate_range(), see
>  	 * Documentation/mm/mmu_notifier.rst.
> @@ -7548,7 +7575,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
>  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
>  {
>  	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
> -			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
> +			ALIGN_DOWN(vma->vm_end, PUD_SIZE),
> +			/* take_locks = */ true);
>  }
>  
>  #ifdef CONFIG_CMA
> diff --git a/mm/mmap.c b/mm/mmap.c
> index ebc3583fa612..0f303dc8425a 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -727,7 +727,15 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  		return -ENOMEM;
>  	}
>  
> +	/*
> +	 * Get rid of huge pages and shared page tables straddling the split
> +	 * boundary.
> +	 */
>  	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
> +	if (is_vm_hugetlb_page(orig_vma)) {
> +		hugetlb_split(orig_vma, start);
> +		hugetlb_split(orig_vma, end);
> +	}
>  	if (file) {
>  		mapping = file->f_mapping;
>  		root = &mapping->i_mmap;
> -- 
> 2.50.0.rc2.701.gf1e915cc24-goog


