From: Hugh Dickins <hugh@veritas.com>

try_to_unuse drops mmlist_lock across unuse_process (with a pretty
dance of atomic_incs and mmputs of the various mmlist markers, and a
polite new cond_resched there), so that unuse_process can
pte_chain_alloc(GFP_KERNEL) and pass it down and down and down and down
to unuse_pte: which cannot succeed more than once on a given mm (made
explicit by returning back up once it has succeeded).  Preliminary
checks are moved up from unuse_pte to unuse_pmd, and done more
efficiently (avoiding the extra pte_file test added recently): swapoff
spends far too long in here.  Also updated the locking comments and the
stale references to try_to_swap_out.
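
The interesting part is the new walk over mmlist in try_to_unuse: each
mm is pinned with atomic_inc(&mm->mm_users) before mmlist_lock is
dropped, the previous pin is released with mmput only after the next
one is taken, and the lock is retaken before advancing.  A minimal
userspace sketch of that pattern (not kernel code: mm_stub, mm_stub_get,
mm_stub_put and walk_list are made-up stand-ins for mm_struct, mm_users,
mmput and the mmlist walk; single-threaded here just to show the shape):

	#include <pthread.h>
	#include <stdio.h>

	struct mm_stub {
		int users;		/* stands in for mm_users */
		struct mm_stub *next;	/* stands in for the mmlist link */
		int id;
	};

	/* stands in for mmlist_lock */
	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

	static void mm_stub_get(struct mm_stub *mm)
	{
		/* caller holds list_lock, so a plain increment will do */
		mm->users++;
	}

	static void mm_stub_put(struct mm_stub *mm)
	{
		pthread_mutex_lock(&list_lock);
		mm->users--;
		pthread_mutex_unlock(&list_lock);
		/* the real mmput would free the mm once the count hits 0 */
	}

	/* visit each element, doing the sleepy work outside the lock */
	static void walk_list(struct mm_stub *head)
	{
		struct mm_stub *prev = NULL, *mm;

		pthread_mutex_lock(&list_lock);
		for (mm = head; mm; mm = mm->next) {
			mm_stub_get(mm);	/* pin before dropping lock */
			pthread_mutex_unlock(&list_lock);
			if (prev)
				mm_stub_put(prev); /* old pin kept till now */
			prev = mm;

			/* the kernel code calls unuse_process here, which
			   may sleep in pte_chain_alloc(GFP_KERNEL), and
			   cond_resched */
			printf("working on mm %d outside the lock\n", mm->id);

			pthread_mutex_lock(&list_lock);	/* before advancing */
		}
		pthread_mutex_unlock(&list_lock);
		if (prev)
			mm_stub_put(prev);
	}

	int main(void)
	{
		struct mm_stub c = { 1, NULL, 3 };
		struct mm_stub b = { 1, &c, 2 };
		struct mm_stub a = { 1, &b, 1 };

		walk_list(&a);
		return 0;
	}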

 25-akpm/mm/swapfile.c |  115 ++++++++++++++++++++++++++++++--------------------
 1 files changed, 70 insertions(+), 45 deletions(-)

diff -puN mm/swapfile.c~hugh-11-fix-unuse_pmd-fixme mm/swapfile.c
--- 25/mm/swapfile.c~hugh-11-fix-unuse_pmd-fixme	Tue Mar 25 18:35:00 2003
+++ 25-akpm/mm/swapfile.c	Tue Mar 25 18:35:00 2003
@@ -377,42 +377,34 @@ void free_swap_and_cache(swp_entry_t ent
  * share this swap entry, so be cautious and let do_wp_page work out
  * what to do if a write is requested later.
  */
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
+/* vma->vm_mm->page_table_lock is held */
 static void
 unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
 	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
 {
-	pte_t pte = *dir;
-
-	if (pte_file(pte))
-		return;
-	if (likely(pte_to_swp_entry(pte).val != entry.val))
-		return;
-	if (unlikely(pte_none(pte) || pte_present(pte)))
-		return;
+	vma->vm_mm->rss++;
 	get_page(page);
 	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
 	SetPageAnon(page);
 	*pte_chainp = page_add_rmap(page, dir, *pte_chainp);
 	swap_free(entry);
-	++vma->vm_mm->rss;
 }
 
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
 	unsigned long address, unsigned long size, unsigned long offset,
-	swp_entry_t entry, struct page* page)
+	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
 {
 	pte_t * pte;
 	unsigned long end;
-	struct pte_chain *pte_chain = NULL;
+	pte_t swp_pte = swp_entry_to_pte(entry);
 
 	if (pmd_none(*dir))
-		return;
+		return 0;
 	if (pmd_bad(*dir)) {
 		pmd_ERROR(*dir);
 		pmd_clear(dir);
-		return;
+		return 0;
 	}
 	pte = pte_offset_map(dir, address);
 	offset += address & PMD_MASK;
@@ -422,33 +414,36 @@ static void unuse_pmd(struct vm_area_str
 		end = PMD_SIZE;
 	do {
 		/*
-		 * FIXME: handle pte_chain_alloc() failures
+		 * swapoff spends a _lot_ of time in this loop!
+		 * Test inline before going to call unuse_pte.
 		 */
-		if (pte_chain == NULL)
-			pte_chain = pte_chain_alloc(GFP_ATOMIC);
-		unuse_pte(vma, offset+address-vma->vm_start,
-				pte, entry, page, &pte_chain);
+		if (unlikely(pte_same(*pte, swp_pte))) {
+			unuse_pte(vma, offset + address, pte,
+					entry, page, pte_chainp);
+			pte_unmap(pte);
+			return 1;
+		}
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
 	pte_unmap(pte - 1);
-	pte_chain_free(pte_chain);
+	return 0;
 }
 
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
 	unsigned long address, unsigned long size,
-	swp_entry_t entry, struct page* page)
+	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
 {
 	pmd_t * pmd;
 	unsigned long offset, end;
 
 	if (pgd_none(*dir))
-		return;
+		return 0;
 	if (pgd_bad(*dir)) {
 		pgd_ERROR(*dir);
 		pgd_clear(dir);
-		return;
+		return 0;
 	}
 	pmd = pmd_offset(dir, address);
 	offset = address & PGDIR_MASK;
@@ -459,32 +454,42 @@ static void unuse_pgd(struct vm_area_str
 	if (address >= end)
 		BUG();
 	do {
-		unuse_pmd(vma, pmd, address, end - address, offset, entry,
-			  page);
+		if (unuse_pmd(vma, pmd, address, end - address,
+				offset, entry, page, pte_chainp))
+			return 1;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
+	return 0;
 }
 
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
-			swp_entry_t entry, struct page* page)
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
+	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
 {
 	unsigned long start = vma->vm_start, end = vma->vm_end;
 
 	if (start >= end)
 		BUG();
 	do {
-		unuse_pgd(vma, pgdir, start, end - start, entry, page);
+		if (unuse_pgd(vma, pgdir, start, end - start,
+				entry, page, pte_chainp))
+			return 1;
 		start = (start + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	} while (start && (start < end));
+	return 0;
 }
 
-static void unuse_process(struct mm_struct * mm,
+static int unuse_process(struct mm_struct * mm,
 			swp_entry_t entry, struct page* page)
 {
 	struct vm_area_struct* vma;
+	struct pte_chain *pte_chain;
+
+	pte_chain = pte_chain_alloc(GFP_KERNEL);
+	if (!pte_chain)
+		return -ENOMEM;
 
 	/*
 	 * Go through process' page directory.
@@ -492,10 +497,12 @@ static void unuse_process(struct mm_stru
 	spin_lock(&mm->page_table_lock);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-		unuse_vma(vma, pgd, entry, page);
+		if (unuse_vma(vma, pgd, entry, page, &pte_chain))
+			break;
 	}
 	spin_unlock(&mm->page_table_lock);
-	return;
+	pte_chain_free(pte_chain);
+	return 0;
 }
 
 /*
@@ -639,36 +646,54 @@ static int try_to_unuse(unsigned int typ
 			if (start_mm == &init_mm)
 				shmem = shmem_unuse(entry, page);
 			else
-				unuse_process(start_mm, entry, page);
+				retval = unuse_process(start_mm, entry, page);
 		}
 		if (*swap_map > 1) {
 			int set_start_mm = (*swap_map >= swcount);
 			struct list_head *p = &start_mm->mmlist;
 			struct mm_struct *new_start_mm = start_mm;
+			struct mm_struct *prev_mm = start_mm;
 			struct mm_struct *mm;
 
+			atomic_inc(&new_start_mm->mm_users);
+			atomic_inc(&prev_mm->mm_users);
 			spin_lock(&mmlist_lock);
-			while (*swap_map > 1 &&
+			while (*swap_map > 1 && !retval &&
 					(p = p->next) != &start_mm->mmlist) {
 				mm = list_entry(p, struct mm_struct, mmlist);
+				atomic_inc(&mm->mm_users);
+				spin_unlock(&mmlist_lock);
+				mmput(prev_mm);
+				prev_mm = mm;
+
+				cond_resched();
+
 				swcount = *swap_map;
-				if (mm == &init_mm) {
+				if (swcount <= 1)
+					;
+				else if (mm == &init_mm) {
 					set_start_mm = 1;
-					spin_unlock(&mmlist_lock);
 					shmem = shmem_unuse(entry, page);
-					spin_lock(&mmlist_lock);
 				} else
-					unuse_process(mm, entry, page);
+					retval = unuse_process(mm, entry, page);
 				if (set_start_mm && *swap_map < swcount) {
+					mmput(new_start_mm);
+					atomic_inc(&mm->mm_users);
 					new_start_mm = mm;
 					set_start_mm = 0;
 				}
+				spin_lock(&mmlist_lock);
 			}
-			atomic_inc(&new_start_mm->mm_users);
 			spin_unlock(&mmlist_lock);
+			mmput(prev_mm);
 			mmput(start_mm);
 			start_mm = new_start_mm;
 		}
+		if (retval) {
+			unlock_page(page);
+			page_cache_release(page);
+			break;
+		}
 
 		/*
 		 * How could swap count reach 0x7fff when the maximum
@@ -692,7 +717,7 @@ static int try_to_unuse(unsigned int typ
 
 		/*
 		 * If a reference remains (rare), we would like to leave
-		 * the page in the swap cache; but try_to_swap_out could
+		 * the page in the swap cache; but try_to_unmap could
 		 * then re-duplicate the entry once we drop page lock,
 		 * so we might loop indefinitely; also, that page could
 		 * not be swapped out to other storage meanwhile.  So:
@@ -728,7 +753,7 @@ static int try_to_unuse(unsigned int typ
 		/*
 		 * So we could skip searching mms once swap count went
 		 * to 1, we did not mark any present ptes as dirty: must
-		 * mark page dirty so try_to_swap_out will preserve it.
+		 * mark page dirty so shrink_list will preserve it.
 		 */
 		SetPageDirty(page);
 		unlock_page(page);

_