[lvc-project] [PATCH 5.10 1/2] hugetlbfs: make free_huge_page irq safe

Mikhail Ukhin mish.uxin2012 at yandex.ru
Wed Jun 26 23:44:00 MSK 2024


From: Mike Kravetz <mike.kravetz at oracle.com>

Commit db71ef79b59bb2e78dc4df83d0e4bf6beaa5c82d upstream.

Commit c77c0a8ac4c5 ("mm/hugetlb: defer freeing of huge pages if in
non-task context") was added to address the issue of free_huge_page being
called from irq context.  That commit hands off free_huge_page processing
to a workqueue if !in_task.  However, this doesn't cover all the cases,
as pointed out by the 0day bot lockdep report [1].

:  Possible interrupt unsafe locking scenario:
:
:        CPU0                    CPU1
:        ----                    ----
:   lock(hugetlb_lock);
:                                local_irq_disable();
:                                lock(slock-AF_INET);
:                                lock(hugetlb_lock);
:   <Interrupt>
:     lock(slock-AF_INET);

Shakeel later explained that this is very likely the TCP TX zerocopy from
hugetlb pages scenario, where the networking code drops the last reference
to a hugetlb page while IRQs are disabled.  The hugetlb freeing path does
not disable IRQs while holding hugetlb_lock, so a lock dependency chain
can lead to a deadlock.
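
The hazard boils down to one lock being taken both with and without IRQs
disabled.  A minimal sketch of that pattern, in kernel-module style; the
lock and function names below are placeholders, not the real hugetlb or
networking call chains:

    #include <linux/spinlock.h>
    #include <linux/irqflags.h>

    /* Stand-in for hugetlb_lock. */
    static DEFINE_SPINLOCK(demo_lock);

    /* Freeing path before the fix: IRQs stay enabled around the lock. */
    static void demo_free_path(void)
    {
            spin_lock(&demo_lock);
            /* ... adjust free/used page counters ... */
            spin_unlock(&demo_lock);
    }

    /* Zerocopy completion: the last page reference may be dropped with
     * IRQs already disabled (e.g. under a socket lock), so the same lock
     * is also reached from IRQ-disabled context.  Combined with the
     * slock-AF_INET dependency in the report above, this allows the
     * deadlock lockdep warns about. */
    static void demo_irq_disabled_free(void)
    {
            unsigned long flags;

            local_irq_save(flags);
            spin_lock(&demo_lock);
            spin_unlock(&demo_lock);
            local_irq_restore(flags);
    }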

This commit addresses the issue by doing the following (a sketch of the
resulting locking patterns follows the list):
- Make hugetlb_lock irq safe.  This is mostly a simple process of
  changing spin_*lock calls to spin_*lock_irq* calls.
- Make subpool lock irq safe in a similar manner.
- Revert the !in_task check and workqueue handoff.
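
Concretely, the conversion uses the standard spinlock IRQ variants.  A
schematic sketch of the two patterns being applied (again with a
placeholder lock and function names, not the actual hugetlb code):

    #include <linux/spinlock.h>

    /* Stand-in for hugetlb_lock / spool->lock. */
    static DEFINE_SPINLOCK(demo_lock);

    /* Sites known to run in process context with IRQs enabled:
     * spin_lock()/spin_unlock() become the _irq variants. */
    static void demo_process_context_site(void)
    {
            spin_lock_irq(&demo_lock);
            /* ... update pool counters ... */
            spin_unlock_irq(&demo_lock);
    }

    /* Paths that can be reached with IRQs already disabled must save
     * and restore the IRQ state instead. */
    static void demo_any_context_site(void)
    {
            unsigned long flags;

            spin_lock_irqsave(&demo_lock, flags);
            /* ... */
            spin_unlock_irqrestore(&demo_lock, flags);
    }

This mirrors the hunks below: most call sites take spin_lock_irq(), while
free_huge_page() and the subpool put paths use the irqsave/irqrestore pair.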

[1] https://lore.kernel.org/linux-mm/000000000000f1c03b05bc43aadc@google.com/

Link: https://lkml.kernel.org/r/20210409205254.242291-8-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz at oracle.com>

[After commit db71ef79b59b ("hugetlb: make free_huge_page irq safe"), the
subpool lock must be taken with spin_lock_irq(), and all call sites were
modified accordingly, except for the ones in hugetlbfs_statfs()]
Signed-off-by: Mikhail Ivanov <iwanov-23 at bk.ru>
Signed-off-by: Mikhail Ukhin <mish.uxin2012 at yandex.ru>
---
 mm/hugetlb.c        | 155 ++++++++++++++++----------------------------
 mm/hugetlb_cgroup.c |   8 +--
 2 files changed, 60 insertions(+), 103 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 02b7c8f9b0e8..daba72765b06 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -97,11 +97,12 @@ static inline void ClearPageHugeFreed(struct page *head)
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
 
-static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
+                                                unsigned long irq_flags)
 {
         bool free = (spool->count == 0) && (spool->used_hpages == 0);
 
-        spin_unlock(&spool->lock);
+        spin_unlock_irqrestore(&spool->lock, irq_flags);
 
         /* If no pages are used, and no other handles to the subpool
          * remain, give up any reservations based on minimum size and
@@ -140,10 +141,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
 
 void hugepage_put_subpool(struct hugepage_subpool *spool)
 {
-        spin_lock(&spool->lock);
+        unsigned long flags;
+
+        spin_lock_irqsave(&spool->lock, flags);
         BUG_ON(!spool->count);
         spool->count--;
-        unlock_or_release_subpool(spool);
+        unlock_or_release_subpool(spool, flags);
 }
 
 /*
@@ -162,7 +165,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
         if (!spool)
                 return ret;
 
-        spin_lock(&spool->lock);
+        spin_lock_irq(&spool->lock);
 
         if (spool->max_hpages != -1) { /* maximum size accounting */
                 if ((spool->used_hpages + delta) <= spool->max_hpages)
@@ -189,7 +192,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
         }
 
 unlock_ret:
-        spin_unlock(&spool->lock);
+        spin_unlock_irq(&spool->lock);
         return ret;
 }
 
@@ -203,11 +206,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
                                        long delta)
 {
         long ret = delta;
+        unsigned long flags;
 
         if (!spool)
                 return delta;
 
-        spin_lock(&spool->lock);
+        spin_lock_irqsave(&spool->lock, flags);
 
         if (spool->max_hpages != -1) /* maximum size accounting */
                 spool->used_hpages -= delta;
@@ -228,7 +232,7 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
          * If hugetlbfs_put_super couldn't free spool due to an outstanding
          * quota reference, free it now.
          */
-        unlock_or_release_subpool(spool);
+        unlock_or_release_subpool(spool, flags);
 
         return ret;
 }
@@ -1360,10 +1364,10 @@ static void update_and_free_page(struct hstate *h, struct page *page)
                  * Temporarily drop the hugetlb_lock, because
                  * we might block in free_gigantic_page().
                  */
-                spin_unlock(&hugetlb_lock);
+                spin_unlock_irq(&hugetlb_lock);
                 destroy_compound_gigantic_page(page, huge_page_order(h));
                 free_gigantic_page(page, huge_page_order(h));
-                spin_lock(&hugetlb_lock);
+                spin_lock_irq(&hugetlb_lock);
         } else {
                 __free_pages(page, huge_page_order(h));
         }
@@ -1426,7 +1430,7 @@ static inline void ClearPageHugeTemporary(struct page *page)
         page[2].mapping = NULL;
 }
 
-static void __free_huge_page(struct page *page)
+static void free_huge_page(struct page *page)
 {
         /*
          * Can't pass hstate in here because it is called from the
@@ -1437,6 +1441,7 @@ static void __free_huge_page(struct page *page)
         struct hugepage_subpool *spool =
                 (struct hugepage_subpool *)page_private(page);
         bool restore_reserve;
+        unsigned long flags;
 
         VM_BUG_ON_PAGE(page_count(page), page);
         VM_BUG_ON_PAGE(page_mapcount(page), page);
@@ -1465,7 +1470,7 @@ static void __free_huge_page(struct page *page)
                         restore_reserve = true;
         }
 
-        spin_lock(&hugetlb_lock);
+        spin_lock_irqsave(&hugetlb_lock, flags);
         clear_page_huge_active(page);
         hugetlb_cgroup_uncharge_page(hstate_index(h),
                                      pages_per_huge_page(h), page);
@@ -1488,55 +1493,7 @@ static void __free_huge_page(struct page *page)
                 arch_clear_hugepage_flags(page);
                 enqueue_huge_page(h, page);
         }
-        spin_unlock(&hugetlb_lock);
-}
-
-/*
- * As free_huge_page() can be called from a non-task context, we have
- * to defer the actual freeing in a workqueue to prevent potential
- * hugetlb_lock deadlock.
- *
- * free_hpage_workfn() locklessly retrieves the linked list of pages to
- * be freed and frees them one-by-one. As the page->mapping pointer is
- * going to be cleared in __free_huge_page() anyway, it is reused as the
- * llist_node structure of a lockless linked list of huge pages to be freed.
- */
-static LLIST_HEAD(hpage_freelist);
-
-static void free_hpage_workfn(struct work_struct *work)
-{
-        struct llist_node *node;
-        struct page *page;
-
-        node = llist_del_all(&hpage_freelist);
-
-        while (node) {
-                page = container_of((struct address_space **)node,
-                                    struct page, mapping);
-                node = node->next;
-                __free_huge_page(page);
-        }
-}
-static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
-
-void free_huge_page(struct page *page)
-{
-        /*
-         * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
-         */
-        if (!in_task()) {
-                /*
-                 * Only call schedule_work() if hpage_freelist is previously
-                 * empty. Otherwise, schedule_work() had been called but the
-                 * workfn hasn't retrieved the list yet.
-                 */
-                if (llist_add((struct llist_node *)&page->mapping,
-                              &hpage_freelist))
-                        schedule_work(&free_hpage_work);
-                return;
-        }
-
-        __free_huge_page(page);
+        spin_unlock_irqrestore(&hugetlb_lock, flags);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -1545,11 +1502,11 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
         set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
         set_hugetlb_cgroup(page, NULL);
         set_hugetlb_cgroup_rsvd(page, NULL);
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
         h->nr_huge_pages++;
         h->nr_huge_pages_node[nid]++;
         ClearPageHugeFreed(page);
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
 }
 
 static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1799,7 +1756,7 @@ int dissolve_free_huge_page(struct page *page)
         if (!PageHuge(page))
                 return 0;
 
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
         if (!PageHuge(page)) {
                 rc = 0;
                 goto out;
@@ -1817,7 +1774,7 @@ int dissolve_free_huge_page(struct page *page)
                  * when it is dissolved.
                  */
                 if (unlikely(!PageHugeFreed(head))) {
-                        spin_unlock(&hugetlb_lock);
+                        spin_unlock_irq(&hugetlb_lock);
                         cond_resched();
 
                         /*
@@ -1847,7 +1804,7 @@ int dissolve_free_huge_page(struct page *page)
                 rc = 0;
         }
 out:
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
         return rc;
 }
 
@@ -1889,16 +1846,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
         if (hstate_is_gigantic(h))
                 return NULL;
 
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
         if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
                 goto out_unlock;
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
 
         page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
         if (!page)
                 return NULL;
 
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
         /*
          * We could have raced with the pool size change.
          * Double check that and simply deallocate the new page
@@ -1908,7 +1865,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
          */
         if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                 SetPageHugeTemporary(page);
-                spin_unlock(&hugetlb_lock);
+                spin_unlock_irq(&hugetlb_lock);
                 put_page(page);
                 return NULL;
         } else {
@@ -1917,7 +1874,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
         }
 
 out_unlock:
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
 
         return page;
 }
@@ -1967,17 +1924,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                 nodemask_t *nmask, gfp_t gfp_mask)
 {
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
         if (h->free_huge_pages - h->resv_huge_pages > 0) {
                 struct page *page;
 
                 page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
                 if (page) {
-                        spin_unlock(&hugetlb_lock);
+                        spin_unlock_irq(&hugetlb_lock);
                         return page;
                 }
         }
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
 
         return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
 }
@@ -2024,7 +1981,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 
         ret = -ENOMEM;
 retry:
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
         for (i = 0; i < needed; i++) {
                 page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
                                 NUMA_NO_NODE, NULL);
@@ -2041,7 +1998,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
          * After retaking hugetlb_lock, we need to recalculate 'needed'
          * because either resv_huge_pages or free_huge_pages may have changed.
          */
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
         needed = (h->resv_huge_pages + delta) -
                         (h->free_huge_pages + allocated);
         if (needed > 0) {
@@ -2079,12 +2036,12 @@ static int gather_surplus_pages(struct hstate *h, int delta)
                 enqueue_huge_page(h, page);
         }
 free:
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
 
         /* Free unnecessary surplus pages to the buddy allocator */
         list_for_each_entry_safe(page, tmp, &surplus_list, lru)
                 put_page(page);
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
 
         return ret;
 }
@@ -2375,7 +2332,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
         if (ret)
                 goto out_uncharge_cgroup_reservation;
 
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
         /*
          * glb_chg is passed to indicate whether or not a page must be taken
          * from the global free pool (global change). gbl_chg == 0 indicates
@@ -2383,11 +2340,11 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
          */
         page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
         if (!page) {
-                spin_unlock(&hugetlb_lock);
+                spin_unlock_irq(&hugetlb_lock);
                 page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
                 if (!page)
                         goto out_uncharge_cgroup;
-                spin_lock(&hugetlb_lock);
+                spin_lock_irq(&hugetlb_lock);
                 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
                         SetPagePrivate(page);
                         h->resv_huge_pages--;
@@ -2404,7 +2361,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                                                   h_cg, page);
         }
 
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
 
         set_page_private(page, (unsigned long)spool);
 
@@ -2662,7 +2619,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
         else
                 return -ENOMEM;
 
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
 
         /*
          * Check for a node specific request.
@@ -2693,7 +2650,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
          */
         if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
                 if (count > persistent_huge_pages(h)) {
-                        spin_unlock(&hugetlb_lock);
+                        spin_unlock_irq(&hugetlb_lock);
                         NODEMASK_FREE(node_alloc_noretry);
                         return -EINVAL;
                 }
@@ -2722,14 +2679,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
                  * page, free_huge_page will handle it by freeing the page
                  * and reducing the surplus.
                  */
-                spin_unlock(&hugetlb_lock);
+                spin_unlock_irq(&hugetlb_lock);
 
                 /* yield cpu to avoid soft lockup */
                 cond_resched();
 
                 ret = alloc_pool_huge_page(h, nodes_allowed,
                                                 node_alloc_noretry);
-                spin_lock(&hugetlb_lock);
+                spin_lock_irq(&hugetlb_lock);
                 if (!ret)
                         goto out;
 
@@ -2767,7 +2724,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
         }
 out:
         h->max_huge_pages = persistent_huge_pages(h);
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
 
         NODEMASK_FREE(node_alloc_noretry);
 
@@ -2921,9 +2878,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
         if (err)
                 return err;
 
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
         h->nr_overcommit_huge_pages = input;
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
 
         return count;
 }
@@ -3509,9 +3466,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
                 goto out;
 
         if (write) {
-                spin_lock(&hugetlb_lock);
+                spin_lock_irq(&hugetlb_lock);
                 h->nr_overcommit_huge_pages = tmp;
-                spin_unlock(&hugetlb_lock);
+                spin_unlock_irq(&hugetlb_lock);
         }
 out:
         return ret;
@@ -3604,7 +3561,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
 {
         int ret = -ENOMEM;
 
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
         /*
          * When cpuset is configured, it breaks the strict hugetlb page
          * reservation as the accounting is done on a global variable. Such
@@ -3643,7 +3600,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
                 return_unused_surplus_pages(h, (unsigned long) -delta);
 
 out:
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
         return ret;
 }
 
@@ -5650,7 +5607,7 @@ int isolate_hugetlb(struct page *page, struct list_head *list)
 {
         int ret = 0;
 
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
         if (!PageHeadHuge(page) || !page_huge_active(page) ||
             !get_page_unless_zero(page)) {
                 ret = -EBUSY;
@@ -5659,17 +5616,17 @@ int isolate_hugetlb(struct page *page, struct list_head *list)
         clear_page_huge_active(page);
         list_move_tail(&page->lru, list);
 unlock:
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
         return ret;
 }
 
 void putback_active_hugepage(struct page *page)
 {
         VM_BUG_ON_PAGE(!PageHead(page), page);
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
         set_page_huge_active(page);
         list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
         put_page(page);
 }
 
@@ -5697,12 +5654,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
                 SetPageHugeTemporary(oldpage);
                 ClearPageHugeTemporary(newpage);
 
-                spin_lock(&hugetlb_lock);
+                spin_lock_irq(&hugetlb_lock);
                 if (h->surplus_huge_pages_node[old_nid]) {
                         h->surplus_huge_pages_node[old_nid]--;
                         h->surplus_huge_pages_node[new_nid]++;
                 }
-                spin_unlock(&hugetlb_lock);
+                spin_unlock_irq(&hugetlb_lock);
         }
 }
 
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 1348819f546c..8b6d01874658 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -204,11 +204,11 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
         do {
                 idx = 0;
                 for_each_hstate(h) {
-                        spin_lock(&hugetlb_lock);
+                        spin_lock_irq(&hugetlb_lock);
                         list_for_each_entry(page, &h->hugepage_activelist, lru)
                                 hugetlb_cgroup_move_parent(idx, h_cg, page);
 
-                        spin_unlock(&hugetlb_lock);
+                        spin_unlock_irq(&hugetlb_lock);
                         idx++;
                 }
                 cond_resched();
@@ -785,7 +785,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
                 return;
 
         VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
-        spin_lock(&hugetlb_lock);
+        spin_lock_irq(&hugetlb_lock);
         h_cg = hugetlb_cgroup_from_page(oldhpage);
         h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
         set_hugetlb_cgroup(oldhpage, NULL);
@@ -795,7 +795,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
         set_hugetlb_cgroup(newhpage, h_cg);
         set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
         list_move(&newhpage->lru, &h->hugepage_activelist);
-        spin_unlock(&hugetlb_lock);
+        spin_unlock_irq(&hugetlb_lock);
         return;
 }
 
-- 
2.25.1


