Linux patch 3.12.30 (tag: 3.12-32)

author: Mike Pagano <mpagano@gentoo.org> 2014-10-10 15:56:35 -0400
committer: Mike Pagano <mpagano@gentoo.org> 2014-10-10 15:56:35 -0400
commit: 45ca8c94954b7b8d9658410f759a5258d7cdca9a (patch)
tree: 4773f3981492d82912a85d050a20c121d7e7298a
parent: Linux patch 3.12.29 (diff)
download: linux-patches-3.12-32.tar.gz
linux-patches-3.12-32.tar.bz2
linux-patches-3.12-32.zip
2 files changed, 7728 insertions, 0 deletions
diff --git a/0000_README b/0000_README
index ae0f6aa7..d8b89ecb 100644
--- a/0000_README
+++ b/0000_README
@@ -158,6 +158,10 @@ Patch:  1028_linux-3.12.29.patch
 From:   http://www.kernel.org
 Desc:   Linux 3.12.29
 
+Patch:  1029_linux-3.12.30.patch
+From:   http://www.kernel.org
+Desc:   Linux 3.12.30
+
 Patch:  1500_XATTR_USER_PREFIX.patch
 From:   https://bugs.gentoo.org/show_bug.cgi?id=470644
 Desc:   Support for namespace user.pax.* on tmpfs.
diff --git a/1029_linux-3.12.30.patch b/1029_linux-3.12.30.patch
new file mode 100644
index 00000000..90682678
--- /dev/null
+++ b/1029_linux-3.12.30.patch
@@ -0,0 +1,7724 @@
+diff --git a/Makefile b/Makefile
+index 67cec33d00c7..1ad1566225ca 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,6 +1,6 @@
+ VERSION = 3
+ PATCHLEVEL = 12
+-SUBLEVEL = 29
++SUBLEVEL = 30
+ EXTRAVERSION =
+ NAME = One Giant Leap for Frogkind
+ 
+diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
+index 004ba568d93f..33294fdc402e 100644
+--- a/arch/tile/mm/homecache.c
++++ b/arch/tile/mm/homecache.c
+@@ -417,7 +417,7 @@ void __homecache_free_pages(struct page *page, unsigned int order)
+ 	if (put_page_testzero(page)) {
+ 		homecache_change_page_home(page, order, PAGE_HOME_HASH);
+ 		if (order == 0) {
+-			free_hot_cold_page(page, 0);
++			free_hot_cold_page(page, false);
+ 		} else {
+ 			init_page_count(page);
+ 			__free_pages(page, order);
+diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
+index fb5e4c658f7a..ef470a7a3d0f 100644
+--- a/arch/unicore32/include/asm/mmu_context.h
++++ b/arch/unicore32/include/asm/mmu_context.h
+@@ -14,6 +14,8 @@
+ 
+ #include <linux/compiler.h>
+ #include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/io.h>
+ 
+ #include <asm/cacheflush.h>
+@@ -73,7 +75,7 @@ do { \
+ 		else \
+ 			mm->mmap = NULL; \
+ 		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
+-		mm->mmap_cache = NULL; \
++		vmacache_invalidate(mm); \
+ 		mm->map_count--; \
+ 		remove_vma(high_vma); \
+ 	} \
+diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
+index e6d90babc245..04905bfc508b 100644
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void)
+ 
+ static inline void __flush_tlb_one(unsigned long addr)
+ {
+-	count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
++	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+ 	__flush_tlb_single(addr);
+ }
+ 
+@@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr)
+  */
+ static inline void __flush_tlb_up(void)
+ {
+-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
++	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ 	__flush_tlb();
+ }
+ 
+ static inline void flush_tlb_all(void)
+ {
+-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
++	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ 	__flush_tlb_all();
+ }
+ 
+diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
+index ce2d0a2c3e4f..0e25a1bc5ab5 100644
+--- a/arch/x86/kernel/cpu/mtrr/generic.c
++++ b/arch/x86/kernel/cpu/mtrr/generic.c
+@@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
+ 	}
+ 
+ 	/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
++	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ 	__flush_tlb();
+ 
+ 	/* Save MTRR state */
+@@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
+ static void post_set(void) __releases(set_atomicity_lock)
+ {
+ 	/* Flush TLBs (no need to flush caches - they are disabled) */
+-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
++	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ 	__flush_tlb();
+ 
+ 	/* Intel (P6) standard MTRRs */
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index dfa537a03be1..5da29d04de2f 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -386,13 +386,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ int ptep_clear_flush_young(struct vm_area_struct *vma,
+ 			   unsigned long address, pte_t *ptep)
+ {
+-	int young;
+-
+-	young = ptep_test_and_clear_young(vma, address, ptep);
+-	if (young)
+-		flush_tlb_page(vma, address);
+-
+-	return young;
++	/*
++	 * On x86 CPUs, clearing the accessed bit without a TLB flush
++	 * doesn't cause data corruption. [ It could cause incorrect
++	 * page aging and the (mistaken) reclaim of hot pages, but the
++	 * chance of that should be relatively low. ]
++	 *
++	 * So as a performance optimization don't flush the TLB when
++	 * clearing the accessed bit, it will eventually be flushed by
++	 * a context switch or a VM operation anyway. [ In the rare
++	 * event of it not getting flushed for a long time the delay
++	 * shouldn't really matter because there's no real memory
++	 * pressure for swapout to react to. ]
++	 */
++	return ptep_test_and_clear_young(vma, address, ptep);
+ }
+ 
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
+index ae699b3bbac8..dd8dda167a24 100644
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
+ 	if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+ 		return;
+ 
+-	count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
++	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ 	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+ 		if (f->flush_end == TLB_FLUSH_ALL)
+ 			local_flush_tlb();
+@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
+ 	info.flush_start = start;
+ 	info.flush_end = end;
+ 
+-	count_vm_event(NR_TLB_REMOTE_FLUSH);
++	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ 	if (is_uv_system()) {
+ 		unsigned int cpu;
+ 
+@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
+ 
+ 	preempt_disable();
+ 
+-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
++	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ 	local_flush_tlb();
+ 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ 		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
+ 	preempt_enable();
+ }
+ 
+-/*
+- * It can find out the THP large page, or
+- * HUGETLB page in tlb_flush when THP disabled
+- */
+-static inline unsigned long has_large_page(struct mm_struct *mm,
+-				 unsigned long start, unsigned long end)
+-{
+-	pgd_t *pgd;
+-	pud_t *pud;
+-	pmd_t *pmd;
+-	unsigned long addr = ALIGN(start, HPAGE_SIZE);
+-	for (; addr < end; addr += HPAGE_SIZE) {
+-		pgd = pgd_offset(mm, addr);
+-		if (likely(!pgd_none(*pgd))) {
+-			pud = pud_offset(pgd, addr);
+-			if (likely(!pud_none(*pud))) {
+-				pmd = pmd_offset(pud, addr);
+-				if (likely(!pmd_none(*pmd)))
+-					if (pmd_large(*pmd))
+-						return addr;
+-			}
+-		}
+-	}
+-	return 0;
+-}
+-
+ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+ 				unsigned long end, unsigned long vmflag)
+ {
+ 	unsigned long addr;
+ 	unsigned act_entries, tlb_entries = 0;
++	unsigned long nr_base_pages;
+ 
+ 	preempt_disable();
+ 	if (current->active_mm != mm)
+@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+ 		tlb_entries = tlb_lli_4k[ENTRIES];
+ 	else
+ 		tlb_entries = tlb_lld_4k[ENTRIES];
++
+ 	/* Assume all of TLB entries was occupied by this task */
+-	act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
++	act_entries = tlb_entries >> tlb_flushall_shift;
++	act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
++	nr_base_pages = (end - start) >> PAGE_SHIFT;
+ 
+ 	/* tlb_flushall_shift is on balance point, details in commit log */
+-	if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
+-		count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
++	if (nr_base_pages > act_entries) {
++		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ 		local_flush_tlb();
+ 	} else {
+-		if (has_large_page(mm, start, end)) {
+-			local_flush_tlb();
+-			goto flush_all;
+-		}
+ 		/* flush range by one by one 'invlpg' */
+ 		for (addr = start; addr < end;	addr += PAGE_SIZE) {
+-			count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
++			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+ 			__flush_tlb_single(addr);
+ 		}
+ 
+@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
+ 
+ static void do_flush_tlb_all(void *info)
+ {
+-	count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
++	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ 	__flush_tlb_all();
+ 	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+ 		leave_mm(smp_processor_id());
+@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
+ 
+ void flush_tlb_all(void)
+ {
+-	count_vm_event(NR_TLB_REMOTE_FLUSH);
++	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ 	on_each_cpu(do_flush_tlb_all, NULL, 1);
+ }
+ 
+diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
+index 6e9ff8fac75a..6357298932bf 100644
+--- a/fs/btrfs/compression.c
++++ b/fs/btrfs/compression.c
+@@ -474,7 +474,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
+ 		rcu_read_lock();
+ 		page = radix_tree_lookup(&mapping->page_tree, pg_index);
+ 		rcu_read_unlock();
+-		if (page) {
++		if (page && !radix_tree_exceptional_entry(page)) {
+ 			misses++;
+ 			if (misses > 4)
+ 				break;
+diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
+index 594bbfd4996e..7015d9079bd1 100644
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -4446,7 +4446,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
+ 	spin_unlock(&eb->refs_lock);
+ }
+ 
+-static void mark_extent_buffer_accessed(struct extent_buffer *eb)
++static void mark_extent_buffer_accessed(struct extent_buffer *eb,
++		struct page *accessed)
+ {
+ 	unsigned long num_pages, i;
+ 
+@@ -4455,7 +4456,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+ 	num_pages = num_extent_pages(eb->start, eb->len);
+ 	for (i = 0; i < num_pages; i++) {
+ 		struct page *p = extent_buffer_page(eb, i);
+-		mark_page_accessed(p);
++		if (p != accessed)
++			mark_page_accessed(p);
+ 	}
+ }
+ 
+@@ -4476,7 +4478,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+ 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+ 	if (eb && atomic_inc_not_zero(&eb->refs)) {
+ 		rcu_read_unlock();
+-		mark_extent_buffer_accessed(eb);
++		mark_extent_buffer_accessed(eb, NULL);
+ 		return eb;
+ 	}
+ 	rcu_read_unlock();
+@@ -4504,7 +4506,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+ 				spin_unlock(&mapping->private_lock);
+ 				unlock_page(p);
+ 				page_cache_release(p);
+-				mark_extent_buffer_accessed(exists);
++				mark_extent_buffer_accessed(exists, p);
+ 				goto free_eb;
+ 			}
+ 
+@@ -4519,7 +4521,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+ 		attach_extent_buffer_page(eb, p);
+ 		spin_unlock(&mapping->private_lock);
+ 		WARN_ON(PageDirty(p));
+-		mark_page_accessed(p);
+ 		eb->pages[i] = p;
+ 		if (!PageUptodate(p))
+ 			uptodate = 0;
+@@ -4549,7 +4550,7 @@ again:
+ 		}
+ 		spin_unlock(&tree->buffer_lock);
+ 		radix_tree_preload_end();
+-		mark_extent_buffer_accessed(exists);
++		mark_extent_buffer_accessed(exists, NULL);
+ 		goto free_eb;
+ 	}
+ 	/* add one reference for the tree */
+@@ -4595,7 +4596,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+ 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+ 	if (eb && atomic_inc_not_zero(&eb->refs)) {
+ 		rcu_read_unlock();
+-		mark_extent_buffer_accessed(eb);
++		mark_extent_buffer_accessed(eb, NULL);
+ 		return eb;
+ 	}
+ 	rcu_read_unlock();
+diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
+index 72da4df53c9a..ad80dfa6cf91 100644
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -426,13 +426,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
+ 		struct page *page = prepared_pages[pg];
+ 		/*
+ 		 * Copy data from userspace to the current page
+-		 *
+-		 * Disable pagefault to avoid recursive lock since
+-		 * the pages are already locked
+ 		 */
+-		pagefault_disable();
+ 		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+-		pagefault_enable();
+ 
+ 		/* Flush processor's dcache for this page */
+ 		flush_dcache_page(page);
+@@ -476,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+ 	for (i = 0; i < num_pages; i++) {
+ 		/* page checked is some magic around finding pages that
+ 		 * have been modified without going through btrfs_set_page_dirty
+-		 * clear it here
++		 * clear it here. There should be no need to mark the pages
++		 * accessed as prepare_pages should have marked them accessed
++		 * in prepare_pages via find_or_create_page()
+ 		 */
+ 		ClearPageChecked(pages[i]);
+ 		unlock_page(pages[i]);
+-		mark_page_accessed(pages[i]);
+ 		page_cache_release(pages[i]);
+ 	}
+ }
+diff --git a/fs/buffer.c b/fs/buffer.c
+index aeeea6529bcd..b7888527f7c3 100644
+--- a/fs/buffer.c
++++ b/fs/buffer.c
+@@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
+ 	int all_mapped = 1;
+ 
+ 	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
+-	page = find_get_page(bd_mapping, index);
++	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
+ 	if (!page)
+ 		goto out;
+ 
+@@ -1366,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+ 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
+ 
+ 	if (bh == NULL) {
++		/* __find_get_block_slow will mark the page accessed */
+ 		bh = __find_get_block_slow(bdev, block);
+ 		if (bh)
+ 			bh_lru_install(bh);
+-	}
+-	if (bh)
++	} else
+ 		touch_buffer(bh);
++
+ 	return bh;
+ }
+ EXPORT_SYMBOL(__find_get_block);
+@@ -1483,16 +1484,27 @@ EXPORT_SYMBOL(set_bh_page);
+ /*
+  * Called when truncating a buffer on a page completely.
+  */
++
++/* Bits that are cleared during an invalidate */
++#define BUFFER_FLAGS_DISCARD \
++	(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
++	 1 << BH_Delay | 1 << BH_Unwritten)
++
+ static void discard_buffer(struct buffer_head * bh)
+ {
++	unsigned long b_state, b_state_old;
++
+ 	lock_buffer(bh);
+ 	clear_buffer_dirty(bh);
+ 	bh->b_bdev = NULL;
+-	clear_buffer_mapped(bh);
+-	clear_buffer_req(bh);
+-	clear_buffer_new(bh);
+-	clear_buffer_delay(bh);
+-	clear_buffer_unwritten(bh);
++	b_state = bh->b_state;
++	for (;;) {
++		b_state_old = cmpxchg(&bh->b_state, b_state,
++				      (b_state & ~BUFFER_FLAGS_DISCARD));
++		if (b_state_old == b_state)
++			break;
++		b_state = b_state_old;
++	}
+ 	unlock_buffer(bh);
+ }
+ 
+diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
+index e501ac3a49ff..2f6cfcaa55fd 100644
+--- a/fs/cramfs/inode.c
++++ b/fs/cramfs/inode.c
+@@ -179,8 +179,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
+ 		struct page *page = NULL;
+ 
+ 		if (blocknr + i < devsize) {
+-			page = read_mapping_page_async(mapping, blocknr + i,
+-									NULL);
++			page = read_mapping_page(mapping, blocknr + i, NULL);
+ 			/* synchronous error? */
+ 			if (IS_ERR(page))
+ 				page = NULL;
+diff --git a/fs/exec.c b/fs/exec.c
+index 95eef54de2b6..26bb91bf203b 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -26,6 +26,7 @@
+ #include <linux/file.h>
+ #include <linux/fdtable.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/stat.h>
+ #include <linux/fcntl.h>
+ #include <linux/swap.h>
+@@ -818,7 +819,7 @@ EXPORT_SYMBOL(read_code);
+ static int exec_mmap(struct mm_struct *mm)
+ {
+ 	struct task_struct *tsk;
+-	struct mm_struct * old_mm, *active_mm;
++	struct mm_struct *old_mm, *active_mm;
+ 
+ 	/* Notify parent that we're no longer interested in the old VM */
+ 	tsk = current;
+@@ -844,6 +845,8 @@ static int exec_mmap(struct mm_struct *mm)
+ 	tsk->mm = mm;
+ 	tsk->active_mm = mm;
+ 	activate_mm(active_mm, mm);
++	tsk->mm->vmacache_seqnum = 0;
++	vmacache_flush(tsk);
+ 	task_unlock(tsk);
+ 	arch_pick_mmap_layout(mm);
+ 	if (old_mm) {
+diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
+index 242226a87be7..7620133f78bf 100644
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -1044,6 +1044,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+ 	 * allocating. If we are looking at the buddy cache we would
+ 	 * have taken a reference using ext4_mb_load_buddy and that
+ 	 * would have pinned buddy page to page cache.
++	 * The call to ext4_mb_get_buddy_page_lock will mark the
++	 * page accessed.
+ 	 */
+ 	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+ 	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
+@@ -1062,7 +1064,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+ 		ret = -EIO;
+ 		goto err;
+ 	}
+-	mark_page_accessed(page);
+ 
+ 	if (e4b.bd_buddy_page == NULL) {
+ 		/*
+@@ -1082,7 +1083,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+ 		ret = -EIO;
+ 		goto err;
+ 	}
+-	mark_page_accessed(page);
+ err:
+ 	ext4_mb_put_buddy_page_lock(&e4b);
+ 	return ret;
+@@ -1141,7 +1141,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+ 
+ 	/* we could use find_or_create_page(), but it locks page
+ 	 * what we'd like to avoid in fast path ... */
+-	page = find_get_page(inode->i_mapping, pnum);
++	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
+ 	if (page == NULL || !PageUptodate(page)) {
+ 		if (page)
+ 			/*
+@@ -1172,15 +1172,16 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+ 		ret = -EIO;
+ 		goto err;
+ 	}
++
++	/* Pages marked accessed already */
+ 	e4b->bd_bitmap_page = page;
+ 	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+-	mark_page_accessed(page);
+ 
+ 	block++;
+ 	pnum = block / blocks_per_page;
+ 	poff = block % blocks_per_page;
+ 
+-	page = find_get_page(inode->i_mapping, pnum);
++	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
+ 	if (page == NULL || !PageUptodate(page)) {
+ 		if (page)
+ 			page_cache_release(page);
+@@ -1201,9 +1202,10 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+ 		ret = -EIO;
+ 		goto err;
+ 	}
++
++	/* Pages marked accessed already */
+ 	e4b->bd_buddy_page = page;
+ 	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
+-	mark_page_accessed(page);
+ 
+ 	BUG_ON(e4b->bd_bitmap_page == NULL);
+ 	BUG_ON(e4b->bd_buddy_page == NULL);
+diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
+index bb312201ca95..15a29af63e20 100644
+--- a/fs/f2fs/checkpoint.c
++++ b/fs/f2fs/checkpoint.c
+@@ -70,7 +70,6 @@ repeat:
+ 		goto repeat;
+ 	}
+ out:
+-	mark_page_accessed(page);
+ 	return page;
+ }
+ 
+diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
+index 51ef27894433..d0335bdb65b4 100644
+--- a/fs/f2fs/node.c
++++ b/fs/f2fs/node.c
+@@ -970,7 +970,6 @@ repeat:
+ 	}
+ got_it:
+ 	BUG_ON(nid != nid_of_node(page));
+-	mark_page_accessed(page);
+ 	return page;
+ }
+ 
+@@ -1026,7 +1025,6 @@ page_hit:
+ 		f2fs_put_page(page, 1);
+ 		return ERR_PTR(-EIO);
+ 	}
+-	mark_page_accessed(page);
+ 	return page;
+ }
+ 
+diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
+index fa8cb4b7b8fe..fc8e4991736a 100644
+--- a/fs/fuse/dev.c
++++ b/fs/fuse/dev.c
+@@ -1613,7 +1613,7 @@ out_finish:
+ 
+ static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
+ {
+-	release_pages(req->pages, req->num_pages, 0);
++	release_pages(req->pages, req->num_pages, false);
+ }
+ 
+ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+diff --git a/fs/fuse/file.c b/fs/fuse/file.c
+index 4598345ab87d..d08c108065e1 100644
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -985,13 +985,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
+ 		if (mapping_writably_mapped(mapping))
+ 			flush_dcache_page(page);
+ 
+-		pagefault_disable();
+ 		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
+-		pagefault_enable();
+ 		flush_dcache_page(page);
+ 
+-		mark_page_accessed(page);
+-
+ 		if (!tmp) {
+ 			unlock_page(page);
+ 			page_cache_release(page);
+diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
+index 1253c2006029..f3aee0bbe886 100644
+--- a/fs/gfs2/aops.c
++++ b/fs/gfs2/aops.c
+@@ -517,7 +517,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
+ 		p = kmap_atomic(page);
+ 		memcpy(buf + copied, p + offset, amt);
+ 		kunmap_atomic(p);
+-		mark_page_accessed(page);
+ 		page_cache_release(page);
+ 		copied += amt;
+ 		index++;
+diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
+index 52f177be3bf8..89afe3a8f626 100644
+--- a/fs/gfs2/meta_io.c
++++ b/fs/gfs2/meta_io.c
+@@ -128,7 +128,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
+ 			yield();
+ 		}
+ 	} else {
+-		page = find_lock_page(mapping, index);
++		page = find_get_page_flags(mapping, index,
++						FGP_LOCK|FGP_ACCESSED);
+ 		if (!page)
+ 			return NULL;
+ 	}
+@@ -145,7 +146,6 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
+ 		map_bh(bh, sdp->sd_vfs, blkno);
+ 
+ 	unlock_page(page);
+-	mark_page_accessed(page);
+ 	page_cache_release(page);
+ 
+ 	return bh;
+diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
+index d19b30ababf1..a4a8ed56e438 100644
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -1017,6 +1017,11 @@ static int __init init_hugetlbfs_fs(void)
+ 	int error;
+ 	int i;
+ 
++	if (!hugepages_supported()) {
++		pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n");
++		return -ENOTSUPP;
++	}
++
+ 	error = bdi_init(&hugetlbfs_backing_dev_info);
+ 	if (error)
+ 		return error;
+diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
+index fe3c0527545f..91bf52d1a88c 100644
+--- a/fs/jffs2/fs.c
++++ b/fs/jffs2/fs.c
+@@ -682,7 +682,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
+ 	struct inode *inode = OFNI_EDONI_2SFFJ(f);
+ 	struct page *pg;
+ 
+-	pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
++	pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
+ 			     (void *)jffs2_do_readpage_unlock, inode);
+ 	if (IS_ERR(pg))
+ 		return (void *)pg;
+diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
+index e242bbf72972..fdb74cbb9e0c 100644
+--- a/fs/nfs/blocklayout/blocklayout.c
++++ b/fs/nfs/blocklayout/blocklayout.c
+@@ -1220,7 +1220,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
+ 	end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+ 	if (end != NFS_I(inode)->npages) {
+ 		rcu_read_lock();
+-		end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
++		end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
+ 		rcu_read_unlock();
+ 	}
+ 
+diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
+index a27e3fecefaf..250ed5b20c8f 100644
+--- a/fs/ntfs/attrib.c
++++ b/fs/ntfs/attrib.c
+@@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
+ 	if (page) {
+ 		set_page_dirty(page);
+ 		unlock_page(page);
+-		mark_page_accessed(page);
+ 		page_cache_release(page);
+ 	}
+ 	ntfs_debug("Done.");
+diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
+index ea4ba9daeb47..a0b2f345da2b 100644
+--- a/fs/ntfs/file.c
++++ b/fs/ntfs/file.c
+@@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
+ 		}
+ 		do {
+ 			unlock_page(pages[--do_pages]);
+-			mark_page_accessed(pages[do_pages]);
+ 			page_cache_release(pages[do_pages]);
+ 		} while (do_pages);
+ 		if (unlikely(status))
+diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
+index ad4df869c907..7724fbdf443f 100644
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1,4 +1,5 @@
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/hugetlb.h>
+ #include <linux/huge_mm.h>
+ #include <linux/mount.h>
+@@ -159,7 +160,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
+ 
+ 	/*
+ 	 * We remember last_addr rather than next_addr to hit with
+-	 * mmap_cache most of the time. We have zero last_addr at
++	 * vmacache most of the time. We have zero last_addr at
+ 	 * the beginning and also after lseek. We will have -1 last_addr
+ 	 * after the end of the vmas.
+ 	 */
+diff --git a/fs/super.c b/fs/super.c
+index d127de207376..fb68a4c90c98 100644
+--- a/fs/super.c
++++ b/fs/super.c
+@@ -112,9 +112,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
+ 
+ 	sb = container_of(shrink, struct super_block, s_shrink);
+ 
+-	if (!grab_super_passive(sb))
+-		return 0;
+-
++	/*
++	 * Don't call grab_super_passive as it is a potential
++	 * scalability bottleneck. The counts could get updated
++	 * between super_cache_count and super_cache_scan anyway.
++	 * Call to super_cache_count with shrinker_rwsem held
++	 * ensures the safety of call to list_lru_count_node() and
++	 * s_op->nr_cached_objects().
++	 */
+ 	if (sb->s_op && sb->s_op->nr_cached_objects)
+ 		total_objects = sb->s_op->nr_cached_objects(sb,
+ 						 sc->nid);
+@@ -125,7 +130,6 @@ static unsigned long super_cache_count(struct shrinker *shrink,
+ 						 sc->nid);
+ 
+ 	total_objects = vfs_pressure_ratio(total_objects);
+-	drop_super(sb);
+ 	return total_objects;
+ }
+ 
+@@ -321,10 +325,8 @@ void deactivate_locked_super(struct super_block *s)
+ 	struct file_system_type *fs = s->s_type;
+ 	if (atomic_dec_and_test(&s->s_active)) {
+ 		cleancache_invalidate_fs(s);
+-		fs->kill_sb(s);
+-
+-		/* caches are now gone, we can safely kill the shrinker now */
+ 		unregister_shrinker(&s->s_shrink);
++		fs->kill_sb(s);
+ 
+ 		put_filesystem(fs);
+ 		put_super(s);
+diff --git a/include/linux/compaction.h b/include/linux/compaction.h
+index 091d72e70d8a..01e3132820da 100644
+--- a/include/linux/compaction.h
++++ b/include/linux/compaction.h
+@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
+ extern int fragmentation_index(struct zone *zone, unsigned int order);
+ extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
+ 			int order, gfp_t gfp_mask, nodemask_t *mask,
+-			bool sync, bool *contended);
++			enum migrate_mode mode, bool *contended);
+ extern void compact_pgdat(pg_data_t *pgdat, int order);
+ extern void reset_isolation_suitable(pg_data_t *pgdat);
+ extern unsigned long compaction_suitable(struct zone *zone, int order);
+@@ -62,6 +62,22 @@ static inline bool compaction_deferred(struct zone *zone, int order)
+ 	return zone->compact_considered < defer_limit;
+ }
+ 
++/*
++ * Update defer tracking counters after successful compaction of given order,
++ * which means an allocation either succeeded (alloc_success == true) or is
++ * expected to succeed.
++ */
++static inline void compaction_defer_reset(struct zone *zone, int order,
++		bool alloc_success)
++{
++	if (alloc_success) {
++		zone->compact_considered = 0;
++		zone->compact_defer_shift = 0;
++	}
++	if (order >= zone->compact_order_failed)
++		zone->compact_order_failed = order + 1;
++}
++
+ /* Returns true if restarting compaction after many failures */
+ static inline bool compaction_restarting(struct zone *zone, int order)
+ {
+@@ -75,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
+ #else
+ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
+ 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
+-			bool sync, bool *contended)
++			enum migrate_mode mode, bool *contended)
+ {
+ 	return COMPACT_CONTINUE;
+ }
+diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
+index cc1b01cf2035..a7ebb89ae9fb 100644
+--- a/include/linux/cpuset.h
++++ b/include/linux/cpuset.h
+@@ -12,10 +12,31 @@
+ #include <linux/cpumask.h>
+ #include <linux/nodemask.h>
+ #include <linux/mm.h>
++#include <linux/jump_label.h>
+ 
+ #ifdef CONFIG_CPUSETS
+ 
+-extern int number_of_cpusets;	/* How many cpusets are defined in system? */
++extern struct static_key cpusets_enabled_key;
++static inline bool cpusets_enabled(void)
++{
++	return static_key_false(&cpusets_enabled_key);
++}
++
++static inline int nr_cpusets(void)
++{
++	/* jump label reference count + the top-level cpuset */
++	return static_key_count(&cpusets_enabled_key) + 1;
++}
++
++static inline void cpuset_inc(void)
++{
++	static_key_slow_inc(&cpusets_enabled_key);
++}
++
++static inline void cpuset_dec(void)
++{
++	static_key_slow_dec(&cpusets_enabled_key);
++}
+ 
+ extern int cpuset_init(void);
+ extern void cpuset_init_smp(void);
+@@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask);
+ 
+ static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+ {
+-	return number_of_cpusets <= 1 ||
++	return nr_cpusets() <= 1 ||
+ 		__cpuset_node_allowed_softwall(node, gfp_mask);
+ }
+ 
+ static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
+ {
+-	return number_of_cpusets <= 1 ||
++	return nr_cpusets() <= 1 ||
+ 		__cpuset_node_allowed_hardwall(node, gfp_mask);
+ }
+ 
+@@ -87,25 +108,26 @@ extern void rebuild_sched_domains(void);
+ extern void cpuset_print_task_mems_allowed(struct task_struct *p);
+ 
+ /*
+- * get_mems_allowed is required when making decisions involving mems_allowed
+- * such as during page allocation. mems_allowed can be updated in parallel
+- * and depending on the new value an operation can fail potentially causing
+- * process failure. A retry loop with get_mems_allowed and put_mems_allowed
+- * prevents these artificial failures.
++ * read_mems_allowed_begin is required when making decisions involving
++ * mems_allowed such as during page allocation. mems_allowed can be updated in
++ * parallel and depending on the new value an operation can fail potentially
++ * causing process failure. A retry loop with read_mems_allowed_begin and
++ * read_mems_allowed_retry prevents these artificial failures.
+  */
+-static inline unsigned int get_mems_allowed(void)
++static inline unsigned int read_mems_allowed_begin(void)
+ {
+ 	return read_seqcount_begin(&current->mems_allowed_seq);
+ }
+ 
+ /*
+- * If this returns false, the operation that took place after get_mems_allowed
+- * may have failed. It is up to the caller to retry the operation if
++ * If this returns true, the operation that took place after
++ * read_mems_allowed_begin may have failed artificially due to a concurrent
++ * update of mems_allowed. It is up to the caller to retry the operation if
+  * appropriate.
+  */
+-static inline bool put_mems_allowed(unsigned int seq)
++static inline bool read_mems_allowed_retry(unsigned int seq)
+ {
+-	return !read_seqcount_retry(&current->mems_allowed_seq, seq);
++	return read_seqcount_retry(&current->mems_allowed_seq, seq);
+ }
+ 
+ static inline void set_mems_allowed(nodemask_t nodemask)
+@@ -119,6 +141,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
+ 
+ #else /* !CONFIG_CPUSETS */
+ 
++static inline bool cpusets_enabled(void) { return false; }
++
+ static inline int cpuset_init(void) { return 0; }
+ static inline void cpuset_init_smp(void) {}
+ 
+@@ -221,14 +245,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
+ {
+ }
+ 
+-static inline unsigned int get_mems_allowed(void)
++static inline unsigned int read_mems_allowed_begin(void)
+ {
+ 	return 0;
+ }
+ 
+-static inline bool put_mems_allowed(unsigned int seq)
++static inline bool read_mems_allowed_retry(unsigned int seq)
+ {
+-	return true;
++	return false;
+ }
+ 
+ #endif /* !CONFIG_CPUSETS */
+diff --git a/include/linux/gfp.h b/include/linux/gfp.h
+index 9b4dd491f7e8..fa7ac989ff56 100644
+--- a/include/linux/gfp.h
++++ b/include/linux/gfp.h
+@@ -364,8 +364,8 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
+ 
+ extern void __free_pages(struct page *page, unsigned int order);
+ extern void free_pages(unsigned long addr, unsigned int order);
+-extern void free_hot_cold_page(struct page *page, int cold);
+-extern void free_hot_cold_page_list(struct list_head *list, int cold);
++extern void free_hot_cold_page(struct page *page, bool cold);
++extern void free_hot_cold_page_list(struct list_head *list, bool cold);
+ 
+ extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
+ extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
+diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
+index a291552ab767..aac671be9581 100644
+--- a/include/linux/huge_mm.h
++++ b/include/linux/huge_mm.h
+@@ -92,10 +92,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
+ #endif /* CONFIG_DEBUG_VM */
+ 
+ extern unsigned long transparent_hugepage_flags;
+-extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+-			  pmd_t *dst_pmd, pmd_t *src_pmd,
+-			  struct vm_area_struct *vma,
+-			  unsigned long addr, unsigned long end);
+ extern int split_huge_page_to_list(struct page *page, struct list_head *list);
+ static inline int split_huge_page(struct page *page)
+ {
+diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
+index 5214ff63c351..511b1a0d6cc2 100644
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -396,6 +396,16 @@ static inline int hugepage_migration_support(struct hstate *h)
+ #endif
+ }
+ 
++static inline bool hugepages_supported(void)
++{
++	/*
++	 * Some platforms decide whether they support huge pages at boot
++	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
++	 * there is no such support.
++	 */
++	return HPAGE_SHIFT != 0;
++}
++
+ #else	/* CONFIG_HUGETLB_PAGE */
+ struct hstate {};
+ #define alloc_huge_page_node(h, nid) NULL
+diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
+index a5079072da66..9216e465289a 100644
+--- a/include/linux/jump_label.h
++++ b/include/linux/jump_label.h
+@@ -62,6 +62,10 @@ struct static_key {
+ 
+ # include <asm/jump_label.h>
+ # define HAVE_JUMP_LABEL
++#else
++struct static_key {
++	atomic_t enabled;
++};
+ #endif	/* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */
+ 
+ enum jump_label_type {
+@@ -72,6 +76,12 @@ enum jump_label_type {
+ struct module;
+ 
+ #include <linux/atomic.h>
++
++static inline int static_key_count(struct static_key *key)
++{
++	return atomic_read(&key->enabled);
++}
++
+ #ifdef HAVE_JUMP_LABEL
+ 
+ #define JUMP_LABEL_TRUE_BRANCH 1UL
+@@ -122,24 +132,20 @@ extern void jump_label_apply_nops(struct module *mod);
+ 
+ #else  /* !HAVE_JUMP_LABEL */
+ 
+-struct static_key {
+-	atomic_t enabled;
+-};
+-
+ static __always_inline void jump_label_init(void)
+ {
+ }
+ 
+ static __always_inline bool static_key_false(struct static_key *key)
+ {
+-	if (unlikely(atomic_read(&key->enabled)) > 0)
++	if (unlikely(static_key_count(key) > 0))
+ 		return true;
+ 	return false;
+ }
+ 
+ static __always_inline bool static_key_true(struct static_key *key)
+ {
+-	if (likely(atomic_read(&key->enabled)) > 0)
++	if (likely(static_key_count(key) > 0))
+ 		return true;
+ 	return false;
+ }
+@@ -179,7 +185,7 @@ static inline int jump_label_apply_nops(struct module *mod)
+ 
+ static inline bool static_key_enabled(struct static_key *key)
+ {
+-	return (atomic_read(&key->enabled) > 0);
++	return static_key_count(key) > 0;
+ }
+ 
+ #endif	/* _LINUX_JUMP_LABEL_H */
+diff --git a/include/linux/migrate.h b/include/linux/migrate.h
+index ee8b14ae4f3f..449905ebcab3 100644
+--- a/include/linux/migrate.h
++++ b/include/linux/migrate.h
+@@ -5,7 +5,9 @@
+ #include <linux/mempolicy.h>
+ #include <linux/migrate_mode.h>
+ 
+-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
++typedef struct page *new_page_t(struct page *page, unsigned long private,
++				int **reason);
++typedef void free_page_t(struct page *page, unsigned long private);
+ 
+ /*
+  * Return values from addresss_space_operations.migratepage():
+@@ -39,7 +41,7 @@ extern void putback_lru_pages(struct list_head *l);
+ extern void putback_movable_pages(struct list_head *l);
+ extern int migrate_page(struct address_space *,
+ 			struct page *, struct page *, enum migrate_mode);
+-extern int migrate_pages(struct list_head *l, new_page_t x,
++extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
+ 		unsigned long private, enum migrate_mode mode, int reason);
+ 
+ extern int fail_migrate_page(struct address_space *,
+@@ -61,8 +63,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
+ 
+ static inline void putback_lru_pages(struct list_head *l) {}
+ static inline void putback_movable_pages(struct list_head *l) {}
+-static inline int migrate_pages(struct list_head *l, new_page_t x,
+-		unsigned long private, enum migrate_mode mode, int reason)
++static inline int migrate_pages(struct list_head *l, new_page_t new,
++		free_page_t free, unsigned long private, enum migrate_mode mode,
++		int reason)
+ 	{ return -ENOSYS; }
+ 
+ static inline int migrate_prep(void) { return -ENOSYS; }
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index 073734339583..2b3a5330dcf2 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -919,6 +919,14 @@ extern void show_free_areas(unsigned int flags);
+ extern bool skip_free_areas_node(unsigned int flags, int nid);
+ 
+ int shmem_zero_setup(struct vm_area_struct *);
++#ifdef CONFIG_SHMEM
++bool shmem_mapping(struct address_space *mapping);
++#else
++static inline bool shmem_mapping(struct address_space *mapping)
++{
++	return false;
++}
++#endif
+ 
+ extern int can_do_mlock(void);
+ extern int user_shm_lock(size_t, struct user_struct *);
+@@ -1623,9 +1631,6 @@ void page_cache_async_readahead(struct address_space *mapping,
+ 				unsigned long size);
+ 
+ unsigned long max_sane_readahead(unsigned long nr);
+-unsigned long ra_submit(struct file_ra_state *ra,
+-			struct address_space *mapping,
+-			struct file *filp);
+ 
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+ extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
+diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
+index 8e082f18fb6a..b8131e7d6eda 100644
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -324,9 +324,9 @@ struct mm_rss_stat {
+ 
+ struct kioctx_table;
+ struct mm_struct {
+-	struct vm_area_struct * mmap;		/* list of VMAs */
++	struct vm_area_struct *mmap;		/* list of VMAs */
+ 	struct rb_root mm_rb;
+-	struct vm_area_struct * mmap_cache;	/* last find_vma result */
++	u32 vmacache_seqnum;                   /* per-thread vmacache */
+ #ifdef CONFIG_MMU
+ 	unsigned long (*get_unmapped_area) (struct file *filp,
+ 				unsigned long addr, unsigned long len,
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index 56482904a676..450f19c5c865 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -78,10 +78,15 @@ extern int page_group_by_mobility_disabled;
+ #define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
+ #define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)
+ 
+-static inline int get_pageblock_migratetype(struct page *page)
++#define get_pageblock_migratetype(page)					\
++	get_pfnblock_flags_mask(page, page_to_pfn(page),		\
++			PB_migrate_end, MIGRATETYPE_MASK)
++
++static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+ {
+ 	BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
+-	return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK);
++	return get_pfnblock_flags_mask(page, pfn, PB_migrate_end,
++					MIGRATETYPE_MASK);
+ }
+ 
+ struct free_area {
+@@ -138,6 +143,7 @@ enum zone_stat_item {
+ 	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
+ 	NR_DIRTIED,		/* page dirtyings since bootup */
+ 	NR_WRITTEN,		/* page writings since bootup */
++	NR_PAGES_SCANNED,	/* pages scanned since last reclaim */
+ #ifdef CONFIG_NUMA
+ 	NUMA_HIT,		/* allocated in intended node */
+ 	NUMA_MISS,		/* allocated in non intended node */
+@@ -316,19 +322,12 @@ enum zone_type {
+ #ifndef __GENERATING_BOUNDS_H
+ 
+ struct zone {
+-	/* Fields commonly accessed by the page allocator */
++	/* Read-mostly fields */
+ 
+ 	/* zone watermarks, access with *_wmark_pages(zone) macros */
+ 	unsigned long watermark[NR_WMARK];
+ 
+ 	/*
+-	 * When free pages are below this point, additional steps are taken
+-	 * when reading the number of free pages to avoid per-cpu counter
+-	 * drift allowing watermarks to be breached
+-	 */
+-	unsigned long percpu_drift_mark;
+-
+-	/*
+ 	 * We don't know if the memory that we're going to allocate will be freeable
+ 	 * or/and it will be released eventually, so to avoid totally wasting several
+ 	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
+@@ -336,40 +335,26 @@ struct zone {
+ 	 * on the higher zones). This array is recalculated at runtime if the
+ 	 * sysctl_lowmem_reserve_ratio sysctl changes.
+ 	 */
+-	unsigned long		lowmem_reserve[MAX_NR_ZONES];
+-
+-	/*
+-	 * This is a per-zone reserve of pages that should not be
+-	 * considered dirtyable memory.
+-	 */
+-	unsigned long		dirty_balance_reserve;
++	long lowmem_reserve[MAX_NR_ZONES];
+ 
+ #ifdef CONFIG_NUMA
+ 	int node;
++#endif
++
+ 	/*
+-	 * zone reclaim becomes active if more unmapped pages exist.
++	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
++	 * this zone's LRU.  Maintained by the pageout code.
+ 	 */
+-	unsigned long		min_unmapped_pages;
+-	unsigned long		min_slab_pages;
+-#endif
++	unsigned int inactive_ratio;
++
++	struct pglist_data	*zone_pgdat;
+ 	struct per_cpu_pageset __percpu *pageset;
++
+ 	/*
+-	 * free areas of different sizes
++	 * This is a per-zone reserve of pages that should not be
++	 * considered dirtyable memory.
+ 	 */
+-	spinlock_t		lock;
+-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+-	/* Set to true when the PG_migrate_skip bits should be cleared */
+-	bool			compact_blockskip_flush;
+-
+-	/* pfns where compaction scanners should start */
+-	unsigned long		compact_cached_free_pfn;
+-	unsigned long		compact_cached_migrate_pfn;
+-#endif
+-#ifdef CONFIG_MEMORY_HOTPLUG
+-	/* see spanned/present_pages for more description */
+-	seqlock_t		span_seqlock;
+-#endif
+-	struct free_area	free_area[MAX_ORDER];
++	unsigned long		dirty_balance_reserve;
+ 
+ #ifndef CONFIG_SPARSEMEM
+ 	/*
+@@ -379,71 +364,14 @@ struct zone {
+ 	unsigned long		*pageblock_flags;
+ #endif /* CONFIG_SPARSEMEM */
+ 
+-#ifdef CONFIG_COMPACTION
+-	/*
+-	 * On compaction failure, 1<<compact_defer_shift compactions
+-	 * are skipped before trying again. The number attempted since
+-	 * last failure is tracked with compact_considered.
+-	 */
+-	unsigned int		compact_considered;
+-	unsigned int		compact_defer_shift;
+-	int			compact_order_failed;
+-#endif
+-
+-	ZONE_PADDING(_pad1_)
+-
+-	/* Fields commonly accessed by the page reclaim scanner */
+-	spinlock_t		lru_lock;
+-	struct lruvec		lruvec;
+-
+-	unsigned long		pages_scanned;	   /* since last reclaim */
+-	unsigned long		flags;		   /* zone flags, see below */
+-
+-	/* Zone statistics */
+-	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
+-
+-	/*
+-	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+-	 * this zone's LRU.  Maintained by the pageout code.
+-	 */
+-	unsigned int inactive_ratio;
+-
+-
+-	ZONE_PADDING(_pad2_)
+-	/* Rarely used or read-mostly fields */
+-
++#ifdef CONFIG_NUMA
+ 	/*
+-	 * wait_table		-- the array holding the hash table
+-	 * wait_table_hash_nr_entries	-- the size of the hash table array
+-	 * wait_table_bits	-- wait_table_size == (1 << wait_table_bits)
+-	 *
+-	 * The purpose of all these is to keep track of the people
+-	 * waiting for a page to become available and make them
+-	 * runnable again when possible. The trouble is that this
+-	 * consumes a lot of space, especially when so few things
+-	 * wait on pages at a given time. So instead of using
+-	 * per-page waitqueues, we use a waitqueue hash table.
+-	 *
+-	 * The bucket discipline is to sleep on the same queue when
+-	 * colliding and wake all in that wait queue when removing.
+-	 * When something wakes, it must check to be sure its page is
+-	 * truly available, a la thundering herd. The cost of a
+-	 * collision is great, but given the expected load of the
+-	 * table, they should be so rare as to be outweighed by the
+-	 * benefits from the saved space.
+-	 *
+-	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
+-	 * primary users of these fields, and in mm/page_alloc.c
+-	 * free_area_init_core() performs the initialization of them.
++	 * zone reclaim becomes active if more unmapped pages exist.
+ 	 */
+-	wait_queue_head_t	* wait_table;
+-	unsigned long		wait_table_hash_nr_entries;
+-	unsigned long		wait_table_bits;
++	unsigned long		min_unmapped_pages;
++	unsigned long		min_slab_pages;
++#endif /* CONFIG_NUMA */
+ 
+-	/*
+-	 * Discontig memory support fields.
+-	 */
+-	struct pglist_data	*zone_pgdat;
+ 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
+ 	unsigned long		zone_start_pfn;
+ 
+@@ -489,14 +417,103 @@ struct zone {
+ 	 * adjust_managed_page_count() should be used instead of directly
+ 	 * touching zone->managed_pages and totalram_pages.
+ 	 */
++	unsigned long		managed_pages;
+ 	unsigned long		spanned_pages;
+ 	unsigned long		present_pages;
+-	unsigned long		managed_pages;
++
++	const char		*name;
+ 
+ 	/*
+-	 * rarely used fields:
++	 * Number of MIGRATE_RESERVE page blocks. Maintained just as an
++	 * optimization. Protected by zone->lock.
+ 	 */
+-	const char		*name;
++	int			nr_migrate_reserve_block;
++
++#ifdef CONFIG_MEMORY_HOTPLUG
++	/* see spanned/present_pages for more description */
++	seqlock_t		span_seqlock;
++#endif
++
++	/*
++	 * wait_table		-- the array holding the hash table
++	 * wait_table_hash_nr_entries	-- the size of the hash table array
++	 * wait_table_bits	-- wait_table_size == (1 << wait_table_bits)
++	 *
++	 * The purpose of all these is to keep track of the people
++	 * waiting for a page to become available and make them
++	 * runnable again when possible. The trouble is that this
++	 * consumes a lot of space, especially when so few things
++	 * wait on pages at a given time. So instead of using
++	 * per-page waitqueues, we use a waitqueue hash table.
++	 *
++	 * The bucket discipline is to sleep on the same queue when
++	 * colliding and wake all in that wait queue when removing.
++	 * When something wakes, it must check to be sure its page is
++	 * truly available, a la thundering herd. The cost of a
++	 * collision is great, but given the expected load of the
++	 * table, they should be so rare as to be outweighed by the
++	 * benefits from the saved space.
++	 *
++	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
++	 * primary users of these fields, and in mm/page_alloc.c
++	 * free_area_init_core() performs the initialization of them.
++	 */
++	wait_queue_head_t	*wait_table;
++	unsigned long		wait_table_hash_nr_entries;
++	unsigned long		wait_table_bits;
++
++	ZONE_PADDING(_pad1_)
++
++	/* Write-intensive fields used from the page allocator */
++	spinlock_t		lock;
++
++	/* free areas of different sizes */
++	struct free_area	free_area[MAX_ORDER];
++
++	/* zone flags, see below */
++	unsigned long		flags;
++
++	ZONE_PADDING(_pad2_)
++
++	/* Write-intensive fields used by page reclaim */
++
++	/* Fields commonly accessed by the page reclaim scanner */
++	spinlock_t		lru_lock;
++	struct lruvec		lruvec;
++
++	/*
++	 * When free pages are below this point, additional steps are taken
++	 * when reading the number of free pages to avoid per-cpu counter
++	 * drift allowing watermarks to be breached
++	 */
++	unsigned long percpu_drift_mark;
++
++#if defined CONFIG_COMPACTION || defined CONFIG_CMA
++	/* pfn where compaction free scanner should start */
++	unsigned long		compact_cached_free_pfn;
++	/* pfn where async and sync compaction migration scanner should start */
++	unsigned long		compact_cached_migrate_pfn[2];
++#endif
++
++#ifdef CONFIG_COMPACTION
++	/*
++	 * On compaction failure, 1<<compact_defer_shift compactions
++	 * are skipped before trying again. The number attempted since
++	 * last failure is tracked with compact_considered.
++	 */
++	unsigned int		compact_considered;
++	unsigned int		compact_defer_shift;
++	int			compact_order_failed;
++#endif
++
++#if defined CONFIG_COMPACTION || defined CONFIG_CMA
++	/* Set to true when the PG_migrate_skip bits should be cleared */
++	bool			compact_blockskip_flush;
++#endif
++
++	ZONE_PADDING(_pad3_)
++	/* Zone statistics */
++	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
+ } ____cacheline_internodealigned_in_smp;
+ 
+ typedef enum {
+@@ -512,6 +529,7 @@ typedef enum {
+ 	ZONE_WRITEBACK,			/* reclaim scanning has recently found
+ 					 * many pages under writeback
+ 					 */
++	ZONE_FAIR_DEPLETED,		/* fair zone policy batch depleted */
+ } zone_flags_t;
+ 
+ static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
+@@ -549,6 +567,11 @@ static inline int zone_is_reclaim_locked(const struct zone *zone)
+ 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+ }
+ 
++static inline int zone_is_fair_depleted(const struct zone *zone)
++{
++	return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
++}
++
+ static inline int zone_is_oom_locked(const struct zone *zone)
+ {
+ 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
+@@ -803,10 +826,10 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
+ extern struct mutex zonelists_mutex;
+ void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
+ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
+-bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+-		int classzone_idx, int alloc_flags);
+-bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+-		int classzone_idx, int alloc_flags);
++bool zone_watermark_ok(struct zone *z, unsigned int order,
++		unsigned long mark, int classzone_idx, int alloc_flags);
++bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
++		unsigned long mark, int classzone_idx, int alloc_flags);
+ enum memmap_context {
+ 	MEMMAP_EARLY,
+ 	MEMMAP_HOTPLUG,
+diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
+index dd7d45b5c496..2284ea62c6cc 100644
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -198,6 +198,7 @@ struct page;	/* forward declaration */
+ TESTPAGEFLAG(Locked, locked)
+ PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
+ PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
++	__SETPAGEFLAG(Referenced, referenced)
+ PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
+ PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
+ PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
+@@ -208,6 +209,7 @@ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)	/* Xen */
+ PAGEFLAG(SavePinned, savepinned);			/* Xen */
+ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
+ PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
++	__SETPAGEFLAG(SwapBacked, swapbacked)
+ 
+ __PAGEFLAG(SlobFree, slob_free)
+ 
+@@ -228,9 +230,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
+ TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
+ PAGEFLAG(MappedToDisk, mappedtodisk)
+ 
+-/* PG_readahead is only used for file reads; PG_reclaim is only for writes */
++/* PG_readahead is only used for reads; PG_reclaim is only for writes */
+ PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
+-PAGEFLAG(Readahead, reclaim)		/* Reminder to do async read-ahead */
++PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
+ 
+ #ifdef CONFIG_HIGHMEM
+ /*
+diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
+index c08730c10c7a..2baeee12f48e 100644
+--- a/include/linux/pageblock-flags.h
++++ b/include/linux/pageblock-flags.h
+@@ -65,33 +65,26 @@ extern int pageblock_order;
+ /* Forward declaration */
+ struct page;
+ 
+-unsigned long get_pageblock_flags_mask(struct page *page,
++unsigned long get_pfnblock_flags_mask(struct page *page,
++				unsigned long pfn,
+ 				unsigned long end_bitidx,
+ 				unsigned long mask);
+-void set_pageblock_flags_mask(struct page *page,
++
++void set_pfnblock_flags_mask(struct page *page,
+ 				unsigned long flags,
++				unsigned long pfn,
+ 				unsigned long end_bitidx,
+ 				unsigned long mask);
+ 
+ /* Declarations for getting and setting flags. See mm/page_alloc.c */
+-static inline unsigned long get_pageblock_flags_group(struct page *page,
+-					int start_bitidx, int end_bitidx)
+-{
+-	unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
+-	unsigned long mask = (1 << nr_flag_bits) - 1;
+-
+-	return get_pageblock_flags_mask(page, end_bitidx, mask);
+-}
+-
+-static inline void set_pageblock_flags_group(struct page *page,
+-					unsigned long flags,
+-					int start_bitidx, int end_bitidx)
+-{
+-	unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
+-	unsigned long mask = (1 << nr_flag_bits) - 1;
+-
+-	set_pageblock_flags_mask(page, flags, end_bitidx, mask);
+-}
++#define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \
++	get_pfnblock_flags_mask(page, page_to_pfn(page),		\
++			end_bitidx,					\
++			(1 << (end_bitidx - start_bitidx + 1)) - 1)
++#define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \
++	set_pfnblock_flags_mask(page, flags, page_to_pfn(page),		\
++			end_bitidx,					\
++			(1 << (end_bitidx - start_bitidx + 1)) - 1)
+ 
+ #ifdef CONFIG_COMPACTION
+ #define get_pageblock_skip(page) \
+diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
+index e3dea75a078b..d57a02a9747b 100644
+--- a/include/linux/pagemap.h
++++ b/include/linux/pagemap.h
+@@ -99,7 +99,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
+ 
+ #define page_cache_get(page)		get_page(page)
+ #define page_cache_release(page)	put_page(page)
+-void release_pages(struct page **pages, int nr, int cold);
++void release_pages(struct page **pages, int nr, bool cold);
+ 
+ /*
+  * speculatively take a reference to a page.
+@@ -243,12 +243,117 @@ static inline struct page *page_cache_alloc_readahead(struct address_space *x)
+ 
+ typedef int filler_t(void *, struct page *);
+ 
+-extern struct page * find_get_page(struct address_space *mapping,
+-				pgoff_t index);
+-extern struct page * find_lock_page(struct address_space *mapping,
+-				pgoff_t index);
+-extern struct page * find_or_create_page(struct address_space *mapping,
+-				pgoff_t index, gfp_t gfp_mask);
++pgoff_t page_cache_next_hole(struct address_space *mapping,
++			     pgoff_t index, unsigned long max_scan);
++pgoff_t page_cache_prev_hole(struct address_space *mapping,
++			     pgoff_t index, unsigned long max_scan);
++
++#define FGP_ACCESSED		0x00000001
++#define FGP_LOCK		0x00000002
++#define FGP_CREAT		0x00000004
++#define FGP_WRITE		0x00000008
++#define FGP_NOFS		0x00000010
++#define FGP_NOWAIT		0x00000020
++
++struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
++		int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask);
++
++/**
++ * find_get_page - find and get a page reference
++ * @mapping: the address_space to search
++ * @offset: the page index
++ *
++ * Looks up the page cache slot at @mapping & @offset.  If there is a
++ * page cache page, it is returned with an increased refcount.
++ *
++ * Otherwise, %NULL is returned.
++ */
++static inline struct page *find_get_page(struct address_space *mapping,
++					pgoff_t offset)
++{
++	return pagecache_get_page(mapping, offset, 0, 0, 0);
++}
++
++static inline struct page *find_get_page_flags(struct address_space *mapping,
++					pgoff_t offset, int fgp_flags)
++{
++	return pagecache_get_page(mapping, offset, fgp_flags, 0, 0);
++}
++
++/**
++ * find_lock_page - locate, pin and lock a pagecache page
++ *                  (wrapper around pagecache_get_page() with FGP_LOCK)
++ * @mapping: the address_space to search
++ * @offset: the page index
++ *
++ * Looks up the page cache slot at @mapping & @offset.  If there is a
++ * page cache page, it is returned locked and with an increased
++ * refcount.
++ *
++ * Otherwise, %NULL is returned.
++ *
++ * find_lock_page() may sleep.
++ */
++static inline struct page *find_lock_page(struct address_space *mapping,
++					pgoff_t offset)
++{
++	return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0);
++}
++
++/**
++ * find_or_create_page - locate or add a pagecache page
++ * @mapping: the page's address_space
++ * @offset: the page's index into the mapping
++ * @gfp_mask: page allocation mode
++ *
++ * Looks up the page cache slot at @mapping & @offset.  If there is a
++ * page cache page, it is returned locked and with an increased
++ * refcount.
++ *
++ * If the page is not present, a new page is allocated using @gfp_mask
++ * and added to the page cache and the VM's LRU list.  The page is
++ * returned locked and with an increased refcount.
++ *
++ * On memory exhaustion, %NULL is returned.
++ *
++ * find_or_create_page() may sleep, even if @gfp_mask specifies an
++ * atomic allocation!
++ */
++static inline struct page *find_or_create_page(struct address_space *mapping,
++					pgoff_t offset, gfp_t gfp_mask)
++{
++	return pagecache_get_page(mapping, offset,
++					FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
++					gfp_mask, gfp_mask & GFP_RECLAIM_MASK);
++}
++
++/**
++ * grab_cache_page_nowait - returns locked page at given index in given cache
++ * @mapping: target address_space
++ * @index: the page index
++ *
++ * Same as grab_cache_page(), but do not wait if the page is unavailable.
++ * This is intended for speculative data generators, where the data can
++ * be regenerated if the page couldn't be grabbed.  This routine should
++ * be safe to call while holding the lock for another page.
++ *
++ * Clear __GFP_FS when allocating the page to avoid recursion into the fs
++ * and deadlock against the caller's locked page.
++ */
++static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
++				pgoff_t index)
++{
++	return pagecache_get_page(mapping, index,
++			FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
++			mapping_gfp_mask(mapping),
++			GFP_NOFS);
++}
++
++struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
++struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
++unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
++			  unsigned int nr_entries, struct page **entries,
++			  pgoff_t *indices);
+ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
+ 			unsigned int nr_pages, struct page **pages);
+ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
+@@ -268,10 +373,6 @@ static inline struct page *grab_cache_page(struct address_space *mapping,
+ 	return find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
+ }
+ 
+-extern struct page * grab_cache_page_nowait(struct address_space *mapping,
+-				pgoff_t index);
+-extern struct page * read_cache_page_async(struct address_space *mapping,
+-				pgoff_t index, filler_t *filler, void *data);
+ extern struct page * read_cache_page(struct address_space *mapping,
+ 				pgoff_t index, filler_t *filler, void *data);
+ extern struct page * read_cache_page_gfp(struct address_space *mapping,
+@@ -279,14 +380,6 @@ extern struct page * read_cache_page_gfp(struct address_space *mapping,
+ extern int read_cache_pages(struct address_space *mapping,
+ 		struct list_head *pages, filler_t *filler, void *data);
+ 
+-static inline struct page *read_mapping_page_async(
+-				struct address_space *mapping,
+-				pgoff_t index, void *data)
+-{
+-	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+-	return read_cache_page_async(mapping, index, filler, data);
+-}
+-
+ static inline struct page *read_mapping_page(struct address_space *mapping,
+ 				pgoff_t index, void *data)
+ {
+diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
+index e4dbfab37729..b45d391b4540 100644
+--- a/include/linux/pagevec.h
++++ b/include/linux/pagevec.h
+@@ -22,6 +22,11 @@ struct pagevec {
+ 
+ void __pagevec_release(struct pagevec *pvec);
+ void __pagevec_lru_add(struct pagevec *pvec);
++unsigned pagevec_lookup_entries(struct pagevec *pvec,
++				struct address_space *mapping,
++				pgoff_t start, unsigned nr_entries,
++				pgoff_t *indices);
++void pagevec_remove_exceptionals(struct pagevec *pvec);
+ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+ 		pgoff_t start, unsigned nr_pages);
+ unsigned pagevec_lookup_tag(struct pagevec *pvec,
+diff --git a/include/linux/plist.h b/include/linux/plist.h
+index aa0fb390bd29..8b6c970cff6c 100644
+--- a/include/linux/plist.h
++++ b/include/linux/plist.h
+@@ -98,6 +98,13 @@ struct plist_node {
+ }
+ 
+ /**
++ * PLIST_HEAD - declare and init plist_head
++ * @head:	name for struct plist_head variable
++ */
++#define PLIST_HEAD(head) \
++	struct plist_head head = PLIST_HEAD_INIT(head)
++
++/**
+  * PLIST_NODE_INIT - static struct plist_node initializer
+  * @node:	struct plist_node variable name
+  * @__prio:	initial node priority
+@@ -134,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio)
+ extern void plist_add(struct plist_node *node, struct plist_head *head);
+ extern void plist_del(struct plist_node *node, struct plist_head *head);
+ 
++extern void plist_requeue(struct plist_node *node, struct plist_head *head);
++
+ /**
+  * plist_for_each - iterate over the plist
+  * @pos:	the type * to use as a loop counter
+@@ -143,6 +152,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
+ 	 list_for_each_entry(pos, &(head)->node_list, node_list)
+ 
+ /**
++ * plist_for_each_continue - continue iteration over the plist
++ * @pos:	the type * to use as a loop cursor
++ * @head:	the head for your list
++ *
++ * Continue to iterate over plist, continuing after the current position.
++ */
++#define plist_for_each_continue(pos, head)	\
++	 list_for_each_entry_continue(pos, &(head)->node_list, node_list)
++
++/**
+  * plist_for_each_safe - iterate safely over a plist of given type
+  * @pos:	the type * to use as a loop counter
+  * @n:	another type * to use as temporary storage
+@@ -163,6 +182,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
+ 	 list_for_each_entry(pos, &(head)->node_list, mem.node_list)
+ 
+ /**
++ * plist_for_each_entry_continue - continue iteration over list of given type
++ * @pos:	the type * to use as a loop cursor
++ * @head:	the head for your list
++ * @m:		the name of the list_struct within the struct
++ *
++ * Continue to iterate over list of given type, continuing after
++ * the current position.
++ */
++#define plist_for_each_entry_continue(pos, head, m)	\
++	list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)
++
++/**
+  * plist_for_each_entry_safe - iterate safely over list of given type
+  * @pos:	the type * to use as a loop counter
+  * @n:		another type * to use as temporary storage
+@@ -229,6 +260,20 @@ static inline int plist_node_empty(const struct plist_node *node)
+ #endif
+ 
+ /**
++ * plist_next - get the next entry in list
++ * @pos:	the type * to cursor
++ */
++#define plist_next(pos) \
++	list_next_entry(pos, node_list)
++
++/**
++ * plist_prev - get the prev entry in list
++ * @pos:	the type * to cursor
++ */
++#define plist_prev(pos) \
++	list_prev_entry(pos, node_list)
++
++/**
+  * plist_first - return the first node (and thus, highest priority)
+  * @head:	the &struct plist_head pointer
+  *
+diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
+index 403940787be1..e8be53ecfc45 100644
+--- a/include/linux/radix-tree.h
++++ b/include/linux/radix-tree.h
+@@ -219,6 +219,7 @@ static inline void radix_tree_replace_slot(void **pslot, void *item)
+ int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+ void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
+ void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
++void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
+ void *radix_tree_delete(struct radix_tree_root *, unsigned long);
+ unsigned int
+ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+@@ -226,10 +227,6 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
+ 			void ***results, unsigned long *indices,
+ 			unsigned long first_index, unsigned int max_items);
+-unsigned long radix_tree_next_hole(struct radix_tree_root *root,
+-				unsigned long index, unsigned long max_scan);
+-unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
+-				unsigned long index, unsigned long max_scan);
+ int radix_tree_preload(gfp_t gfp_mask);
+ int radix_tree_maybe_preload(gfp_t gfp_mask);
+ void radix_tree_init(void);
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 0827bec7d82f..cb67b4e2dba2 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -63,6 +63,10 @@ struct fs_struct;
+ struct perf_event_context;
+ struct blk_plug;
+ 
++#define VMACACHE_BITS 2
++#define VMACACHE_SIZE (1U << VMACACHE_BITS)
++#define VMACACHE_MASK (VMACACHE_SIZE - 1)
++
+ /*
+  * List of flags we want to share for kernel threads,
+  * if only because they are not used by them anyway.
+@@ -1093,6 +1097,9 @@ struct task_struct {
+ #ifdef CONFIG_COMPAT_BRK
+ 	unsigned brk_randomized:1;
+ #endif
++	/* per-thread vma caching */
++	u32 vmacache_seqnum;
++	struct vm_area_struct *vmacache[VMACACHE_SIZE];
+ #if defined(SPLIT_RSS_COUNTING)
+ 	struct task_rss_stat	rss_stat;
+ #endif
+diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
+index 30aa0dc60d75..deb49609cd36 100644
+--- a/include/linux/shmem_fs.h
++++ b/include/linux/shmem_fs.h
+@@ -49,6 +49,7 @@ extern struct file *shmem_file_setup(const char *name,
+ 					loff_t size, unsigned long flags);
+ extern int shmem_zero_setup(struct vm_area_struct *);
+ extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
++extern bool shmem_mapping(struct address_space *mapping);
+ extern void shmem_unlock_mapping(struct address_space *mapping);
+ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+ 					pgoff_t index, gfp_t gfp_mask);
+diff --git a/include/linux/swap.h b/include/linux/swap.h
+index 46ba0c6c219f..241bf0922770 100644
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -214,8 +214,9 @@ struct percpu_cluster {
+ struct swap_info_struct {
+ 	unsigned long	flags;		/* SWP_USED etc: see above */
+ 	signed short	prio;		/* swap priority of this type */
++	struct plist_node list;		/* entry in swap_active_head */
++	struct plist_node avail_list;	/* entry in swap_avail_head */
+ 	signed char	type;		/* strange name for an index */
+-	signed char	next;		/* next type on the swap list */
+ 	unsigned int	max;		/* extent of the swap_map */
+ 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
+ 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
+@@ -255,11 +256,6 @@ struct swap_info_struct {
+ 	struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
+ };
+ 
+-struct swap_list_t {
+-	int head;	/* head of priority-ordered swapfile list */
+-	int next;	/* swapfile to be used next */
+-};
+-
+ /* linux/mm/page_alloc.c */
+ extern unsigned long totalram_pages;
+ extern unsigned long totalreserve_pages;
+@@ -272,12 +268,14 @@ extern unsigned long nr_free_pagecache_pages(void);
+ 
+ 
+ /* linux/mm/swap.c */
+-extern void __lru_cache_add(struct page *);
+ extern void lru_cache_add(struct page *);
++extern void lru_cache_add_anon(struct page *page);
++extern void lru_cache_add_file(struct page *page);
+ extern void lru_add_page_tail(struct page *page, struct page *page_tail,
+ 			 struct lruvec *lruvec, struct list_head *head);
+ extern void activate_page(struct page *);
+ extern void mark_page_accessed(struct page *);
++extern void init_page_accessed(struct page *page);
+ extern void lru_add_drain(void);
+ extern void lru_add_drain_cpu(int cpu);
+ extern void lru_add_drain_all(void);
+@@ -287,22 +285,6 @@ extern void swap_setup(void);
+ 
+ extern void add_page_to_unevictable_list(struct page *page);
+ 
+-/**
+- * lru_cache_add: add a page to the page lists
+- * @page: the page to add
+- */
+-static inline void lru_cache_add_anon(struct page *page)
+-{
+-	ClearPageActive(page);
+-	__lru_cache_add(page);
+-}
+-
+-static inline void lru_cache_add_file(struct page *page)
+-{
+-	ClearPageActive(page);
+-	__lru_cache_add(page);
+-}
+-
+ /* linux/mm/vmscan.c */
+ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
+ 					gfp_t gfp_mask, nodemask_t *mask);
+@@ -460,7 +442,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
+ #define free_page_and_swap_cache(page) \
+ 	page_cache_release(page)
+ #define free_pages_and_swap_cache(pages, nr) \
+-	release_pages((pages), (nr), 0);
++	release_pages((pages), (nr), false)
+ 
+ static inline void show_swap_cache_info(void)
+ {
+diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
+index e282624e8c10..388293a91e8c 100644
+--- a/include/linux/swapfile.h
++++ b/include/linux/swapfile.h
+@@ -6,7 +6,7 @@
+  * want to expose them to the dozens of source files that include swap.h
+  */
+ extern spinlock_t swap_lock;
+-extern struct swap_list_t swap_list;
++extern struct plist_head swap_active_head;
+ extern struct swap_info_struct *swap_info[];
+ extern int try_to_unuse(unsigned int, bool, unsigned long);
+ 
+diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
+index c557c6d096de..3a712e2e7d76 100644
+--- a/include/linux/vm_event_item.h
++++ b/include/linux/vm_event_item.h
+@@ -71,12 +71,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
+ 		THP_ZERO_PAGE_ALLOC,
+ 		THP_ZERO_PAGE_ALLOC_FAILED,
+ #endif
++#ifdef CONFIG_DEBUG_TLBFLUSH
+ #ifdef CONFIG_SMP
+ 		NR_TLB_REMOTE_FLUSH,	/* cpu tried to flush others' tlbs */
+ 		NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */
+-#endif
++#endif /* CONFIG_SMP */
+ 		NR_TLB_LOCAL_FLUSH_ALL,
+ 		NR_TLB_LOCAL_FLUSH_ONE,
++#endif /* CONFIG_DEBUG_TLBFLUSH */
+ 		NR_VM_EVENT_ITEMS
+ };
+ 
+diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
+new file mode 100644
+index 000000000000..c3fa0fd43949
+--- /dev/null
++++ b/include/linux/vmacache.h
+@@ -0,0 +1,38 @@
++#ifndef __LINUX_VMACACHE_H
++#define __LINUX_VMACACHE_H
++
++#include <linux/sched.h>
++#include <linux/mm.h>
++
++/*
++ * Hash based on the page number. Provides a good hit rate for
++ * workloads with good locality and those with random accesses as well.
++ */
++#define VMACACHE_HASH(addr) (((addr) >> PAGE_SHIFT) & VMACACHE_MASK)
++
++static inline void vmacache_flush(struct task_struct *tsk)
++{
++	memset(tsk->vmacache, 0, sizeof(tsk->vmacache));
++}
++
++extern void vmacache_flush_all(struct mm_struct *mm);
++extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
++extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
++						    unsigned long addr);
++
++#ifndef CONFIG_MMU
++extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
++						  unsigned long start,
++						  unsigned long end);
++#endif
++
++static inline void vmacache_invalidate(struct mm_struct *mm)
++{
++	mm->vmacache_seqnum++;
++
++	/* deal with overflows */
++	if (unlikely(mm->vmacache_seqnum == 0))
++		vmacache_flush_all(mm);
++}
++
++#endif /* __LINUX_VMACACHE_H */
+diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
+index a67b38415768..67ce70c8279b 100644
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -83,6 +83,14 @@ static inline void vm_events_fold_cpu(int cpu)
+ #define count_vm_numa_events(x, y) do { (void)(y); } while (0)
+ #endif /* CONFIG_NUMA_BALANCING */
+ 
++#ifdef CONFIG_DEBUG_TLBFLUSH
++#define count_vm_tlb_event(x)	   count_vm_event(x)
++#define count_vm_tlb_events(x, y)  count_vm_events(x, y)
++#else
++#define count_vm_tlb_event(x)     do {} while (0)
++#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
++#endif
++
+ #define __count_zone_vm_events(item, zone, delta) \
+ 		__count_vm_events(item##_NORMAL - ZONE_NORMAL + \
+ 		zone_idx(zone), delta)
+diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
+index fde1b3e94c7d..c6814b917bdf 100644
+--- a/include/trace/events/compaction.h
++++ b/include/trace/events/compaction.h
+@@ -5,6 +5,7 @@
+ #define _TRACE_COMPACTION_H
+ 
+ #include <linux/types.h>
++#include <linux/list.h>
+ #include <linux/tracepoint.h>
+ #include <trace/events/gfpflags.h>
+ 
+@@ -47,10 +48,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
+ 
+ TRACE_EVENT(mm_compaction_migratepages,
+ 
+-	TP_PROTO(unsigned long nr_migrated,
+-		unsigned long nr_failed),
++	TP_PROTO(unsigned long nr_all,
++		int migrate_rc,
++		struct list_head *migratepages),
+ 
+-	TP_ARGS(nr_migrated, nr_failed),
++	TP_ARGS(nr_all, migrate_rc, migratepages),
+ 
+ 	TP_STRUCT__entry(
+ 		__field(unsigned long, nr_migrated)
+@@ -58,7 +60,22 @@ TRACE_EVENT(mm_compaction_migratepages,
+ 	),
+ 
+ 	TP_fast_assign(
+-		__entry->nr_migrated = nr_migrated;
++		unsigned long nr_failed = 0;
++		struct list_head *page_lru;
++
++		/*
++		 * migrate_pages() returns either a non-negative number
++		 * with the number of pages that failed migration, or an
++		 * error code, in which case we need to count the remaining
++		 * pages manually
++		 */
++		if (migrate_rc >= 0)
++			nr_failed = migrate_rc;
++		else
++			list_for_each(page_lru, migratepages)
++				nr_failed++;
++
++		__entry->nr_migrated = nr_all - nr_failed;
+ 		__entry->nr_failed = nr_failed;
+ 	),
+ 
+@@ -67,6 +84,48 @@ TRACE_EVENT(mm_compaction_migratepages,
+ 		__entry->nr_failed)
+ );
+ 
++TRACE_EVENT(mm_compaction_begin,
++	TP_PROTO(unsigned long zone_start, unsigned long migrate_start,
++		unsigned long free_start, unsigned long zone_end),
++
++	TP_ARGS(zone_start, migrate_start, free_start, zone_end),
++
++	TP_STRUCT__entry(
++		__field(unsigned long, zone_start)
++		__field(unsigned long, migrate_start)
++		__field(unsigned long, free_start)
++		__field(unsigned long, zone_end)
++	),
++
++	TP_fast_assign(
++		__entry->zone_start = zone_start;
++		__entry->migrate_start = migrate_start;
++		__entry->free_start = free_start;
++		__entry->zone_end = zone_end;
++	),
++
++	TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu",
++		__entry->zone_start,
++		__entry->migrate_start,
++		__entry->free_start,
++		__entry->zone_end)
++);
++
++TRACE_EVENT(mm_compaction_end,
++	TP_PROTO(int status),
++
++	TP_ARGS(status),
++
++	TP_STRUCT__entry(
++		__field(int, status)
++	),
++
++	TP_fast_assign(
++		__entry->status = status;
++	),
++
++	TP_printk("status=%d", __entry->status)
++);
+ 
+ #endif /* _TRACE_COMPACTION_H */
+ 
+diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
+index d0c613476620..aece1346ceb7 100644
+--- a/include/trace/events/kmem.h
++++ b/include/trace/events/kmem.h
+@@ -267,14 +267,12 @@ DEFINE_EVENT_PRINT(mm_page, mm_page_pcpu_drain,
+ TRACE_EVENT(mm_page_alloc_extfrag,
+ 
+ 	TP_PROTO(struct page *page,
+-			int alloc_order, int fallback_order,
+-			int alloc_migratetype, int fallback_migratetype,
+-			int change_ownership),
++		int alloc_order, int fallback_order,
++		int alloc_migratetype, int fallback_migratetype, int new_migratetype),
+ 
+ 	TP_ARGS(page,
+ 		alloc_order, fallback_order,
+-		alloc_migratetype, fallback_migratetype,
+-		change_ownership),
++		alloc_migratetype, fallback_migratetype, new_migratetype),
+ 
+ 	TP_STRUCT__entry(
+ 		__field(	struct page *,	page			)
+@@ -291,7 +289,7 @@ TRACE_EVENT(mm_page_alloc_extfrag,
+ 		__entry->fallback_order		= fallback_order;
+ 		__entry->alloc_migratetype	= alloc_migratetype;
+ 		__entry->fallback_migratetype	= fallback_migratetype;
+-		__entry->change_ownership	= change_ownership;
++		__entry->change_ownership	= (new_migratetype == alloc_migratetype);
+ 	),
+ 
+ 	TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
+diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h
+index 1c9fabde69e4..ce0803b8d05f 100644
+--- a/include/trace/events/pagemap.h
++++ b/include/trace/events/pagemap.h
+@@ -28,12 +28,10 @@ TRACE_EVENT(mm_lru_insertion,
+ 
+ 	TP_PROTO(
+ 		struct page *page,
+-		unsigned long pfn,
+-		int lru,
+-		unsigned long flags
++		int lru
+ 	),
+ 
+-	TP_ARGS(page, pfn, lru, flags),
++	TP_ARGS(page, lru),
+ 
+ 	TP_STRUCT__entry(
+ 		__field(struct page *,	page	)
+@@ -44,9 +42,9 @@ TRACE_EVENT(mm_lru_insertion,
+ 
+ 	TP_fast_assign(
+ 		__entry->page	= page;
+-		__entry->pfn	= pfn;
++		__entry->pfn	= page_to_pfn(page);
+ 		__entry->lru	= lru;
+-		__entry->flags	= flags;
++		__entry->flags	= trace_pagemap_flags(page);
+ 	),
+ 
+ 	/* Flag format is based on page-types.c formatting for pagemap */
+@@ -64,9 +62,9 @@ TRACE_EVENT(mm_lru_insertion,
+ 
+ TRACE_EVENT(mm_lru_activate,
+ 
+-	TP_PROTO(struct page *page, unsigned long pfn),
++	TP_PROTO(struct page *page),
+ 
+-	TP_ARGS(page, pfn),
++	TP_ARGS(page),
+ 
+ 	TP_STRUCT__entry(
+ 		__field(struct page *,	page	)
+@@ -75,7 +73,7 @@ TRACE_EVENT(mm_lru_activate,
+ 
+ 	TP_fast_assign(
+ 		__entry->page	= page;
+-		__entry->pfn	= pfn;
++		__entry->pfn	= page_to_pfn(page);
+ 	),
+ 
+ 	/* Flag format is based on page-types.c formatting for pagemap */
+diff --git a/kernel/cpuset.c b/kernel/cpuset.c
+index 0b29c52479a6..c8289138cad4 100644
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -61,12 +61,7 @@
+ #include <linux/cgroup.h>
+ #include <linux/wait.h>
+ 
+-/*
+- * Tracks how many cpusets are currently defined in system.
+- * When there is only one cpuset (the root cpuset) we can
+- * short circuit some hooks.
+- */
+-int number_of_cpusets __read_mostly;
++struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
+ 
+ /* See "Frequency meter" comments, below. */
+ 
+@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
+ 		goto done;
+ 	}
+ 
+-	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
++	csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
+ 	if (!csa)
+ 		goto done;
+ 	csn = 0;
+@@ -1022,7 +1017,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
+ 	task_lock(tsk);
+ 	/*
+ 	 * Determine if a loop is necessary if another thread is doing
+-	 * get_mems_allowed().  If at least one node remains unchanged and
++	 * read_mems_allowed_begin().  If at least one node remains unchanged and
+ 	 * tsk does not have a mempolicy, then an empty nodemask will not be
+ 	 * possible when mems_allowed is larger than a word.
+ 	 */
+@@ -1986,7 +1981,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
+ 	if (is_spread_slab(parent))
+ 		set_bit(CS_SPREAD_SLAB, &cs->flags);
+ 
+-	number_of_cpusets++;
++	cpuset_inc();
+ 
+ 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ 		goto out_unlock;
+@@ -2037,7 +2032,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
+ 	if (is_sched_load_balance(cs))
+ 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ 
+-	number_of_cpusets--;
++	cpuset_dec();
+ 	clear_bit(CS_ONLINE, &cs->flags);
+ 
+ 	mutex_unlock(&cpuset_mutex);
+@@ -2092,7 +2087,6 @@ int __init cpuset_init(void)
+ 	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
+ 		BUG();
+ 
+-	number_of_cpusets = 1;
+ 	return 0;
+ }
+ 
+diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
+index 0506d447aed2..e911ec662d03 100644
+--- a/kernel/debug/debug_core.c
++++ b/kernel/debug/debug_core.c
+@@ -49,6 +49,7 @@
+ #include <linux/pid.h>
+ #include <linux/smp.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/rcupdate.h>
+ 
+ #include <asm/cacheflush.h>
+@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
+ 	if (!CACHE_FLUSH_IS_SAFE)
+ 		return;
+ 
+-	if (current->mm && current->mm->mmap_cache) {
+-		flush_cache_range(current->mm->mmap_cache,
+-				  addr, addr + BREAK_INSTR_SIZE);
++	if (current->mm) {
++		int i;
++
++		for (i = 0; i < VMACACHE_SIZE; i++) {
++			if (!current->vmacache[i])
++				continue;
++			flush_cache_range(current->vmacache[i],
++					  addr, addr + BREAK_INSTR_SIZE);
++		}
+ 	}
++
+ 	/* Force flush instruction cache if it was outside the mm */
+ 	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
+ }
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 143962949bed..29a1b0283d3b 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -28,6 +28,8 @@
+ #include <linux/mman.h>
+ #include <linux/mmu_notifier.h>
+ #include <linux/fs.h>
++#include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/nsproxy.h>
+ #include <linux/capability.h>
+ #include <linux/cpu.h>
+@@ -363,7 +365,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+ 
+ 	mm->locked_vm = 0;
+ 	mm->mmap = NULL;
+-	mm->mmap_cache = NULL;
++	mm->vmacache_seqnum = 0;
+ 	mm->map_count = 0;
+ 	cpumask_clear(mm_cpumask(mm));
+ 	mm->mm_rb = RB_ROOT;
+@@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
+ 	if (!oldmm)
+ 		return 0;
+ 
++	/* initialize the new vmacache entries */
++	vmacache_flush(tsk);
++
+ 	if (clone_flags & CLONE_VM) {
+ 		atomic_inc(&oldmm->mm_users);
+ 		mm = oldmm;
+diff --git a/lib/plist.c b/lib/plist.c
+index 1ebc95f7a46f..0f2084d30798 100644
+--- a/lib/plist.c
++++ b/lib/plist.c
+@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head)
+ 	plist_check_head(head);
+ }
+ 
++/**
++ * plist_requeue - Requeue @node at end of same-prio entries.
++ *
++ * This is essentially an optimized plist_del() followed by
++ * plist_add().  It moves an entry already in the plist to
++ * after any other same-priority entries.
++ *
++ * @node:	&struct plist_node pointer - entry to be moved
++ * @head:	&struct plist_head pointer - list head
++ */
++void plist_requeue(struct plist_node *node, struct plist_head *head)
++{
++	struct plist_node *iter;
++	struct list_head *node_next = &head->node_list;
++
++	plist_check_head(head);
++	BUG_ON(plist_head_empty(head));
++	BUG_ON(plist_node_empty(node));
++
++	if (node == plist_last(head))
++		return;
++
++	iter = plist_next(node);
++
++	if (node->prio != iter->prio)
++		return;
++
++	plist_del(node, head);
++
++	plist_for_each_continue(iter, head) {
++		if (node->prio != iter->prio) {
++			node_next = &iter->node_list;
++			break;
++		}
++	}
++	list_add_tail(&node->node_list, node_next);
++
++	plist_check_head(head);
++}
++
+ #ifdef CONFIG_DEBUG_PI_LIST
+ #include <linux/sched.h>
+ #include <linux/module.h>
+@@ -170,6 +210,14 @@ static void __init plist_test_check(int nr_expect)
+ 	BUG_ON(prio_pos->prio_list.next != &first->prio_list);
+ }
+ 
++static void __init plist_test_requeue(struct plist_node *node)
++{
++	plist_requeue(node, &test_head);
++
++	if (node != plist_last(&test_head))
++		BUG_ON(node->prio == plist_next(node)->prio);
++}
++
+ static int  __init plist_test(void)
+ {
+ 	int nr_expect = 0, i, loop;
+@@ -193,6 +241,10 @@ static int  __init plist_test(void)
+ 			nr_expect--;
+ 		}
+ 		plist_test_check(nr_expect);
++		if (!plist_node_empty(test_node + i)) {
++			plist_test_requeue(test_node + i);
++			plist_test_check(nr_expect);
++		}
+ 	}
+ 
+ 	for (i = 0; i < ARRAY_SIZE(test_node); i++) {
+diff --git a/lib/radix-tree.c b/lib/radix-tree.c
+index 7811ed3b4e70..e8adb5d8a184 100644
+--- a/lib/radix-tree.c
++++ b/lib/radix-tree.c
+@@ -946,81 +946,6 @@ next:
+ }
+ EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
+ 
+-
+-/**
+- *	radix_tree_next_hole    -    find the next hole (not-present entry)
+- *	@root:		tree root
+- *	@index:		index key
+- *	@max_scan:	maximum range to search
+- *
+- *	Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
+- *	indexed hole.
+- *
+- *	Returns: the index of the hole if found, otherwise returns an index
+- *	outside of the set specified (in which case 'return - index >= max_scan'
+- *	will be true). In rare cases of index wrap-around, 0 will be returned.
+- *
+- *	radix_tree_next_hole may be called under rcu_read_lock. However, like
+- *	radix_tree_gang_lookup, this will not atomically search a snapshot of
+- *	the tree at a single point in time. For example, if a hole is created
+- *	at index 5, then subsequently a hole is created at index 10,
+- *	radix_tree_next_hole covering both indexes may return 10 if called
+- *	under rcu_read_lock.
+- */
+-unsigned long radix_tree_next_hole(struct radix_tree_root *root,
+-				unsigned long index, unsigned long max_scan)
+-{
+-	unsigned long i;
+-
+-	for (i = 0; i < max_scan; i++) {
+-		if (!radix_tree_lookup(root, index))
+-			break;
+-		index++;
+-		if (index == 0)
+-			break;
+-	}
+-
+-	return index;
+-}
+-EXPORT_SYMBOL(radix_tree_next_hole);
+-
+-/**
+- *	radix_tree_prev_hole    -    find the prev hole (not-present entry)
+- *	@root:		tree root
+- *	@index:		index key
+- *	@max_scan:	maximum range to search
+- *
+- *	Search backwards in the range [max(index-max_scan+1, 0), index]
+- *	for the first hole.
+- *
+- *	Returns: the index of the hole if found, otherwise returns an index
+- *	outside of the set specified (in which case 'index - return >= max_scan'
+- *	will be true). In rare cases of wrap-around, ULONG_MAX will be returned.
+- *
+- *	radix_tree_next_hole may be called under rcu_read_lock. However, like
+- *	radix_tree_gang_lookup, this will not atomically search a snapshot of
+- *	the tree at a single point in time. For example, if a hole is created
+- *	at index 10, then subsequently a hole is created at index 5,
+- *	radix_tree_prev_hole covering both indexes may return 5 if called under
+- *	rcu_read_lock.
+- */
+-unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
+-				   unsigned long index, unsigned long max_scan)
+-{
+-	unsigned long i;
+-
+-	for (i = 0; i < max_scan; i++) {
+-		if (!radix_tree_lookup(root, index))
+-			break;
+-		index--;
+-		if (index == ULONG_MAX)
+-			break;
+-	}
+-
+-	return index;
+-}
+-EXPORT_SYMBOL(radix_tree_prev_hole);
+-
+ /**
+  *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
+  *	@root:		radix tree root
+@@ -1335,15 +1260,18 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
+ }
+ 
+ /**
+- *	radix_tree_delete    -    delete an item from a radix tree
++ *	radix_tree_delete_item    -    delete an item from a radix tree
+  *	@root:		radix tree root
+  *	@index:		index key
++ *	@item:		expected item
+  *
+- *	Remove the item at @index from the radix tree rooted at @root.
++ *	Remove @item at @index from the radix tree rooted at @root.
+  *
+- *	Returns the address of the deleted item, or NULL if it was not present.
++ *	Returns the address of the deleted item, or NULL if it was not present
++ *	or the entry at the given @index was not @item.
+  */
+-void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
++void *radix_tree_delete_item(struct radix_tree_root *root,
++			     unsigned long index, void *item)
+ {
+ 	struct radix_tree_node *node = NULL;
+ 	struct radix_tree_node *slot = NULL;
+@@ -1378,6 +1306,11 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+ 	if (slot == NULL)
+ 		goto out;
+ 
++	if (item && slot != item) {
++		slot = NULL;
++		goto out;
++	}
++
+ 	/*
+ 	 * Clear all tags associated with the item to be deleted.
+ 	 * This way of doing it would be inefficient, but seldom is any set.
+@@ -1422,6 +1355,21 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+ out:
+ 	return slot;
+ }
++EXPORT_SYMBOL(radix_tree_delete_item);
++
++/**
++ *	radix_tree_delete    -    delete an item from a radix tree
++ *	@root:		radix tree root
++ *	@index:		index key
++ *
++ *	Remove the item at @index from the radix tree rooted at @root.
++ *
++ *	Returns the address of the deleted item, or NULL if it was not present.
++ */
++void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
++{
++	return radix_tree_delete_item(root, index, NULL);
++}
+ EXPORT_SYMBOL(radix_tree_delete);
+ 
+ /**
+diff --git a/mm/Makefile b/mm/Makefile
+index 305d10acd081..fb51bc61d80a 100644
+--- a/mm/Makefile
++++ b/mm/Makefile
+@@ -16,7 +16,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
+ 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
+ 			   util.o mmzone.o vmstat.o backing-dev.o \
+ 			   mm_init.o mmu_context.o percpu.o slab_common.o \
+-			   compaction.o balloon_compaction.o \
++			   compaction.o balloon_compaction.o vmacache.o \
+ 			   interval_tree.o list_lru.o $(mmu-y)
+ 
+ obj-y += init-mm.o
+diff --git a/mm/compaction.c b/mm/compaction.c
+index 6441083e76d3..adb6d0560e96 100644
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
+ 	unsigned long end_pfn = zone_end_pfn(zone);
+ 	unsigned long pfn;
+ 
+-	zone->compact_cached_migrate_pfn = start_pfn;
++	zone->compact_cached_migrate_pfn[0] = start_pfn;
++	zone->compact_cached_migrate_pfn[1] = start_pfn;
+ 	zone->compact_cached_free_pfn = end_pfn;
+ 	zone->compact_blockskip_flush = false;
+ 
+@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
+  */
+ static void update_pageblock_skip(struct compact_control *cc,
+ 			struct page *page, unsigned long nr_isolated,
+-			bool migrate_scanner)
++			bool set_unsuitable, bool migrate_scanner)
+ {
+ 	struct zone *zone = cc->zone;
++	unsigned long pfn;
+ 
+ 	if (cc->ignore_skip_hint)
+ 		return;
+@@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc,
+ 	if (!page)
+ 		return;
+ 
+-	if (!nr_isolated) {
+-		unsigned long pfn = page_to_pfn(page);
++	if (nr_isolated)
++		return;
++
++	/*
++	 * Only skip pageblocks when all forms of compaction will be known to
++	 * fail in the near future.
++	 */
++	if (set_unsuitable)
+ 		set_pageblock_skip(page);
+ 
+-		/* Update where compaction should restart */
+-		if (migrate_scanner) {
+-			if (!cc->finished_update_migrate &&
+-			    pfn > zone->compact_cached_migrate_pfn)
+-				zone->compact_cached_migrate_pfn = pfn;
+-		} else {
+-			if (!cc->finished_update_free &&
+-			    pfn < zone->compact_cached_free_pfn)
+-				zone->compact_cached_free_pfn = pfn;
+-		}
++	pfn = page_to_pfn(page);
++
++	/* Update where async and sync compaction should restart */
++	if (migrate_scanner) {
++		if (cc->finished_update_migrate)
++			return;
++		if (pfn > zone->compact_cached_migrate_pfn[0])
++			zone->compact_cached_migrate_pfn[0] = pfn;
++		if (cc->mode != MIGRATE_ASYNC &&
++		    pfn > zone->compact_cached_migrate_pfn[1])
++			zone->compact_cached_migrate_pfn[1] = pfn;
++	} else {
++		if (cc->finished_update_free)
++			return;
++		if (pfn < zone->compact_cached_free_pfn)
++			zone->compact_cached_free_pfn = pfn;
+ 	}
+ }
+ #else
+@@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
+ 
+ static void update_pageblock_skip(struct compact_control *cc,
+ 			struct page *page, unsigned long nr_isolated,
+-			bool migrate_scanner)
++			bool set_unsuitable, bool migrate_scanner)
+ {
+ }
+ #endif /* CONFIG_COMPACTION */
+@@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
+ 		}
+ 
+ 		/* async aborts if taking too long or contended */
+-		if (!cc->sync) {
++		if (cc->mode == MIGRATE_ASYNC) {
+ 			cc->contended = true;
+ 			return false;
+ 		}
+@@ -208,30 +222,39 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
+ 	return true;
+ }
+ 
+-static inline bool compact_trylock_irqsave(spinlock_t *lock,
+-			unsigned long *flags, struct compact_control *cc)
++/*
++ * Aside from avoiding lock contention, compaction also periodically checks
++ * need_resched() and either schedules in sync compaction or aborts async
++ * compaction. This is similar to what compact_checklock_irqsave() does, but
++ * is used where no lock is concerned.
++ *
++ * Returns false when no scheduling was needed, or sync compaction scheduled.
++ * Returns true when async compaction should abort.
++ */
++static inline bool compact_should_abort(struct compact_control *cc)
+ {
+-	return compact_checklock_irqsave(lock, flags, false, cc);
++	/* async compaction aborts if contended */
++	if (need_resched()) {
++		if (cc->mode == MIGRATE_ASYNC) {
++			cc->contended = true;
++			return true;
++		}
++
++		cond_resched();
++	}
++
++	return false;
+ }
+ 
+ /* Returns true if the page is within a block suitable for migration to */
+ static bool suitable_migration_target(struct page *page)
+ {
+-	int migratetype = get_pageblock_migratetype(page);
+-
+-	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+-	if (migratetype == MIGRATE_RESERVE)
+-		return false;
+-
+-	if (is_migrate_isolate(migratetype))
+-		return false;
+-
+-	/* If the page is a large free page, then allow migration */
++	/* If the page is a large free page, then disallow migration */
+ 	if (PageBuddy(page) && page_order(page) >= pageblock_order)
+-		return true;
++		return false;
+ 
+ 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+-	if (migrate_async_suitable(migratetype))
++	if (migrate_async_suitable(get_pageblock_migratetype(page)))
+ 		return true;
+ 
+ 	/* Otherwise skip the block */
+@@ -254,6 +277,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
+ 	struct page *cursor, *valid_page = NULL;
+ 	unsigned long flags;
+ 	bool locked = false;
++	bool checked_pageblock = false;
+ 
+ 	cursor = pfn_to_page(blockpfn);
+ 
+@@ -285,8 +309,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
+ 			break;
+ 
+ 		/* Recheck this is a suitable migration target under lock */
+-		if (!strict && !suitable_migration_target(page))
+-			break;
++		if (!strict && !checked_pageblock) {
++			/*
++			 * The suitability of a pageblock needs to be checked
++			 * only once: isolate_freepages_block() is called with
++			 * a pageblock range, so a single check is sufficient.
++			 */
++			checked_pageblock = true;
++			if (!suitable_migration_target(page))
++				break;
++		}
+ 
+ 		/* Recheck this is a buddy page under lock */
+ 		if (!PageBuddy(page))
+@@ -330,7 +362,8 @@ isolate_fail:
+ 
+ 	/* Update the pageblock-skip if the whole pageblock was scanned */
+ 	if (blockpfn == end_pfn)
+-		update_pageblock_skip(cc, valid_page, total_isolated, false);
++		update_pageblock_skip(cc, valid_page, total_isolated, true,
++				      false);
+ 
+ 	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
+ 	if (total_isolated)
+@@ -461,11 +494,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 	unsigned long last_pageblock_nr = 0, pageblock_nr;
+ 	unsigned long nr_scanned = 0, nr_isolated = 0;
+ 	struct list_head *migratelist = &cc->migratepages;
+-	isolate_mode_t mode = 0;
+ 	struct lruvec *lruvec;
+ 	unsigned long flags;
+ 	bool locked = false;
+ 	struct page *page = NULL, *valid_page = NULL;
++	bool set_unsuitable = true;
++	const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
++					ISOLATE_ASYNC_MIGRATE : 0) |
++				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
+ 
+ 	/*
+ 	 * Ensure that there are not too many pages isolated from the LRU
+@@ -474,7 +510,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 	 */
+ 	while (unlikely(too_many_isolated(zone))) {
+ 		/* async migration should just abort */
+-		if (!cc->sync)
++		if (cc->mode == MIGRATE_ASYNC)
+ 			return 0;
+ 
+ 		congestion_wait(BLK_RW_ASYNC, HZ/10);
+@@ -483,11 +519,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 			return 0;
+ 	}
+ 
++	if (compact_should_abort(cc))
++		return 0;
++
+ 	/* Time to isolate some pages for migration */
+-	cond_resched();
+ 	for (; low_pfn < end_pfn; low_pfn++) {
+ 		/* give a chance to irqs before checking need_resched() */
+-		if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
++		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
+ 			if (should_release_lock(&zone->lru_lock)) {
+ 				spin_unlock_irqrestore(&zone->lru_lock, flags);
+ 				locked = false;
+@@ -526,25 +564,31 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 
+ 		/* If isolation recently failed, do not retry */
+ 		pageblock_nr = low_pfn >> pageblock_order;
+-		if (!isolation_suitable(cc, page))
+-			goto next_pageblock;
++		if (last_pageblock_nr != pageblock_nr) {
++			int mt;
++
++			last_pageblock_nr = pageblock_nr;
++			if (!isolation_suitable(cc, page))
++				goto next_pageblock;
++
++			/*
++			 * For async migration, also only scan in MOVABLE
++			 * blocks. Async migration is optimistic to see if
++			 * the minimum amount of work satisfies the allocation
++			 */
++			mt = get_pageblock_migratetype(page);
++			if (cc->mode == MIGRATE_ASYNC &&
++			    !migrate_async_suitable(mt)) {
++				set_unsuitable = false;
++				goto next_pageblock;
++			}
++		}
+ 
+ 		/* Skip if free */
+ 		if (PageBuddy(page))
+ 			continue;
+ 
+ 		/*
+-		 * For async migration, also only scan in MOVABLE blocks. Async
+-		 * migration is optimistic to see if the minimum amount of work
+-		 * satisfies the allocation
+-		 */
+-		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+-		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
+-			cc->finished_update_migrate = true;
+-			goto next_pageblock;
+-		}
+-
+-		/*
+ 		 * Check may be lockless but that's ok as we recheck later.
+ 		 * It's possible to migrate LRU pages and balloon pages
+ 		 * Skip any other type of page
+@@ -553,11 +597,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 			if (unlikely(balloon_page_movable(page))) {
+ 				if (locked && balloon_page_isolate(page)) {
+ 					/* Successfully isolated */
+-					cc->finished_update_migrate = true;
+-					list_add(&page->lru, migratelist);
+-					cc->nr_migratepages++;
+-					nr_isolated++;
+-					goto check_compact_cluster;
++					goto isolate_success;
+ 				}
+ 			}
+ 			continue;
+@@ -580,6 +620,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 			continue;
+ 		}
+ 
++		/*
++		 * Migration will fail if an anonymous page is pinned in memory,
++		 * so avoid taking lru_lock and isolating it unnecessarily in an
++		 * admittedly racy check.
++		 */
++		if (!page_mapping(page) &&
++		    page_count(page) > page_mapcount(page))
++			continue;
++
+ 		/* Check if it is ok to still hold the lock */
+ 		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+ 								locked, cc);
+@@ -594,12 +643,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 			continue;
+ 		}
+ 
+-		if (!cc->sync)
+-			mode |= ISOLATE_ASYNC_MIGRATE;
+-
+-		if (unevictable)
+-			mode |= ISOLATE_UNEVICTABLE;
+-
+ 		lruvec = mem_cgroup_page_lruvec(page, zone);
+ 
+ 		/* Try isolate the page */
+@@ -609,13 +652,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 		VM_BUG_ON(PageTransCompound(page));
+ 
+ 		/* Successfully isolated */
+-		cc->finished_update_migrate = true;
+ 		del_page_from_lru_list(page, lruvec, page_lru(page));
++
++isolate_success:
++		cc->finished_update_migrate = true;
+ 		list_add(&page->lru, migratelist);
+ 		cc->nr_migratepages++;
+ 		nr_isolated++;
+ 
+-check_compact_cluster:
+ 		/* Avoid isolating too much */
+ 		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+ 			++low_pfn;
+@@ -626,7 +670,6 @@ check_compact_cluster:
+ 
+ next_pageblock:
+ 		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
+-		last_pageblock_nr = pageblock_nr;
+ 	}
+ 
+ 	acct_isolated(zone, locked, cc);
+@@ -634,9 +677,13 @@ next_pageblock:
+ 	if (locked)
+ 		spin_unlock_irqrestore(&zone->lru_lock, flags);
+ 
+-	/* Update the pageblock-skip if the whole pageblock was scanned */
++	/*
++	 * Update the pageblock-skip information and cached scanner pfn,
++	 * if the whole pageblock was scanned without isolating any page.
++	 */
+ 	if (low_pfn == end_pfn)
+-		update_pageblock_skip(cc, valid_page, nr_isolated, true);
++		update_pageblock_skip(cc, valid_page, nr_isolated,
++				      set_unsuitable, true);
+ 
+ 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+ 
+@@ -657,7 +704,9 @@ static void isolate_freepages(struct zone *zone,
+ 				struct compact_control *cc)
+ {
+ 	struct page *page;
+-	unsigned long high_pfn, low_pfn, pfn, z_end_pfn;
++	unsigned long block_start_pfn;	/* start of current pageblock */
++	unsigned long block_end_pfn;	/* end of current pageblock */
++	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
+ 	int nr_freepages = cc->nr_freepages;
+ 	struct list_head *freelist = &cc->freepages;
+ 
+@@ -665,41 +714,38 @@ static void isolate_freepages(struct zone *zone,
+ 	 * Initialise the free scanner. The starting point is where we last
+ 	 * successfully isolated from, zone-cached value, or the end of the
+ 	 * zone when isolating for the first time. We need this aligned to
+-	 * the pageblock boundary, because we do pfn -= pageblock_nr_pages
+-	 * in the for loop.
++	 * the pageblock boundary, because we do
++	 * block_start_pfn -= pageblock_nr_pages in the for loop.
++	 * For ending point, take care when isolating in last pageblock of a
++	 * zone which ends in the middle of a pageblock.
+ 	 * The low boundary is the end of the pageblock the migration scanner
+ 	 * is using.
+ 	 */
+-	pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
++	block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
++	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
++						zone_end_pfn(zone));
+ 	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
+ 
+ 	/*
+-	 * Take care that if the migration scanner is at the end of the zone
+-	 * that the free scanner does not accidentally move to the next zone
+-	 * in the next isolation cycle.
+-	 */
+-	high_pfn = min(low_pfn, pfn);
+-
+-	z_end_pfn = zone_end_pfn(zone);
+-
+-	/*
+ 	 * Isolate free pages until enough are available to migrate the
+ 	 * pages on cc->migratepages. We stop searching if the migrate
+ 	 * and free page scanners meet or enough free pages are isolated.
+ 	 */
+-	for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
+-					pfn -= pageblock_nr_pages) {
++	for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
++				block_end_pfn = block_start_pfn,
++				block_start_pfn -= pageblock_nr_pages) {
+ 		unsigned long isolated;
+-		unsigned long end_pfn;
+ 
+ 		/*
+ 		 * This can iterate a massively long zone without finding any
+ 		 * suitable migration targets, so periodically check if we need
+-		 * to schedule.
++		 * to schedule, or even abort async compaction.
+ 		 */
+-		cond_resched();
++		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
++						&& compact_should_abort(cc))
++			break;
+ 
+-		if (!pfn_valid(pfn))
++		if (!pfn_valid(block_start_pfn))
+ 			continue;
+ 
+ 		/*
+@@ -709,7 +755,7 @@ static void isolate_freepages(struct zone *zone,
+ 		 * i.e. it's possible that all pages within a zones range of
+ 		 * pages do not belong to a single zone.
+ 		 */
+-		page = pfn_to_page(pfn);
++		page = pfn_to_page(block_start_pfn);
+ 		if (page_zone(page) != zone)
+ 			continue;
+ 
+@@ -722,26 +768,26 @@ static void isolate_freepages(struct zone *zone,
+ 			continue;
+ 
+ 		/* Found a block suitable for isolating free pages from */
+-		isolated = 0;
++		cc->free_pfn = block_start_pfn;
++		isolated = isolate_freepages_block(cc, block_start_pfn,
++					block_end_pfn, freelist, false);
++		nr_freepages += isolated;
+ 
+ 		/*
+-		 * Take care when isolating in last pageblock of a zone which
+-		 * ends in the middle of a pageblock.
++		 * Set a flag that we successfully isolated in this pageblock.
++		 * In the next loop iteration, zone->compact_cached_free_pfn
++		 * will not be updated and thus it will effectively contain the
++		 * highest pageblock we isolated pages from.
+ 		 */
+-		end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn);
+-		isolated = isolate_freepages_block(cc, pfn, end_pfn,
+-						   freelist, false);
+-		nr_freepages += isolated;
++		if (isolated)
++			cc->finished_update_free = true;
+ 
+ 		/*
+-		 * Record the highest PFN we isolated pages from. When next
+-		 * looking for free pages, the search will restart here as
+-		 * page migration may have returned some pages to the allocator
++		 * isolate_freepages_block() might have aborted due to async
++		 * compaction being contended
+ 		 */
+-		if (isolated) {
+-			cc->finished_update_free = true;
+-			high_pfn = max(high_pfn, pfn);
+-		}
++		if (cc->contended)
++			break;
+ 	}
+ 
+ 	/* split_free_page does not map the pages */
+@@ -751,10 +797,9 @@ static void isolate_freepages(struct zone *zone,
+ 	 * If we crossed the migrate scanner, we want to keep it that way
+ 	 * so that compact_finished() may detect this
+ 	 */
+-	if (pfn < low_pfn)
+-		cc->free_pfn = max(pfn, zone->zone_start_pfn);
+-	else
+-		cc->free_pfn = high_pfn;
++	if (block_start_pfn < low_pfn)
++		cc->free_pfn = cc->migrate_pfn;
++
+ 	cc->nr_freepages = nr_freepages;
+ }
+ 
+@@ -769,9 +814,13 @@ static struct page *compaction_alloc(struct page *migratepage,
+ 	struct compact_control *cc = (struct compact_control *)data;
+ 	struct page *freepage;
+ 
+-	/* Isolate free pages if necessary */
++	/*
++	 * Isolate free pages if necessary, and if we are not aborting due to
++	 * contention.
++	 */
+ 	if (list_empty(&cc->freepages)) {
+-		isolate_freepages(cc->zone, cc);
++		if (!cc->contended)
++			isolate_freepages(cc->zone, cc);
+ 
+ 		if (list_empty(&cc->freepages))
+ 			return NULL;
+@@ -785,23 +834,16 @@ static struct page *compaction_alloc(struct page *migratepage,
+ }
+ 
+ /*
+- * We cannot control nr_migratepages and nr_freepages fully when migration is
+- * running as migrate_pages() has no knowledge of compact_control. When
+- * migration is complete, we count the number of pages on the lists by hand.
++ * This is a migrate-callback that "frees" freepages back to the isolated
++ * freelist.  All pages on the freelist are from the same zone, so there is no
++ * special handling needed for NUMA.
+  */
+-static void update_nr_listpages(struct compact_control *cc)
++static void compaction_free(struct page *page, unsigned long data)
+ {
+-	int nr_migratepages = 0;
+-	int nr_freepages = 0;
+-	struct page *page;
+-
+-	list_for_each_entry(page, &cc->migratepages, lru)
+-		nr_migratepages++;
+-	list_for_each_entry(page, &cc->freepages, lru)
+-		nr_freepages++;
++	struct compact_control *cc = (struct compact_control *)data;
+ 
+-	cc->nr_migratepages = nr_migratepages;
+-	cc->nr_freepages = nr_freepages;
++	list_add(&page->lru, &cc->freepages);
++	cc->nr_freepages++;
+ }
+ 
+ /* possible outcome of isolate_migratepages */
+@@ -848,11 +890,16 @@ static int compact_finished(struct zone *zone,
+ 	unsigned int order;
+ 	unsigned long watermark;
+ 
+-	if (fatal_signal_pending(current))
++	if (cc->contended || fatal_signal_pending(current))
+ 		return COMPACT_PARTIAL;
+ 
+ 	/* Compaction run completes if the migrate and free scanner meet */
+ 	if (cc->free_pfn <= cc->migrate_pfn) {
++		/* Let the next compaction start anew. */
++		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
++		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
++		zone->compact_cached_free_pfn = zone_end_pfn(zone);
++
+ 		/*
+ 		 * Mark that the PG_migrate_skip information should be cleared
+ 		 * by kswapd when it goes to sleep. kswapd does not set the
+@@ -950,6 +997,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
+ 	int ret;
+ 	unsigned long start_pfn = zone->zone_start_pfn;
+ 	unsigned long end_pfn = zone_end_pfn(zone);
++	const bool sync = cc->mode != MIGRATE_ASYNC;
+ 
+ 	ret = compaction_suitable(zone, cc->order);
+ 	switch (ret) {
+@@ -975,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
+ 	 * information on where the scanners should start but check that it
+ 	 * is initialised by ensuring the values are within zone boundaries.
+ 	 */
+-	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
++	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
+ 	cc->free_pfn = zone->compact_cached_free_pfn;
+ 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
+ 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+@@ -983,13 +1031,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
+ 	}
+ 	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+ 		cc->migrate_pfn = start_pfn;
+-		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
++		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
++		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+ 	}
+ 
++	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
++
+ 	migrate_prep_local();
+ 
+ 	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+-		unsigned long nr_migrate, nr_remaining;
+ 		int err;
+ 
+ 		switch (isolate_migratepages(zone, cc)) {
+@@ -1004,21 +1054,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
+ 			;
+ 		}
+ 
+-		nr_migrate = cc->nr_migratepages;
++		if (!cc->nr_migratepages)
++			continue;
++
+ 		err = migrate_pages(&cc->migratepages, compaction_alloc,
+-				(unsigned long)cc,
+-				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
++				compaction_free, (unsigned long)cc, cc->mode,
+ 				MR_COMPACTION);
+-		update_nr_listpages(cc);
+-		nr_remaining = cc->nr_migratepages;
+ 
+-		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+-						nr_remaining);
++		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
++							&cc->migratepages);
+ 
+-		/* Release isolated pages not migrated */
++		/* All pages were either migrated or will be released */
++		cc->nr_migratepages = 0;
+ 		if (err) {
+ 			putback_movable_pages(&cc->migratepages);
+-			cc->nr_migratepages = 0;
+ 			/*
+ 			 * migrate_pages() may return -ENOMEM when scanners meet
+ 			 * and we want compact_finished() to detect it
+@@ -1035,12 +1084,13 @@ out:
+ 	cc->nr_freepages -= release_freepages(&cc->freepages);
+ 	VM_BUG_ON(cc->nr_freepages != 0);
+ 
++	trace_mm_compaction_end(ret);
++
+ 	return ret;
+ }
+ 
+-static unsigned long compact_zone_order(struct zone *zone,
+-				 int order, gfp_t gfp_mask,
+-				 bool sync, bool *contended)
++static unsigned long compact_zone_order(struct zone *zone, int order,
++		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
+ {
+ 	unsigned long ret;
+ 	struct compact_control cc = {
+@@ -1049,7 +1099,7 @@ static unsigned long compact_zone_order(struct zone *zone,
+ 		.order = order,
+ 		.migratetype = allocflags_to_migratetype(gfp_mask),
+ 		.zone = zone,
+-		.sync = sync,
++		.mode = mode,
+ 	};
+ 	INIT_LIST_HEAD(&cc.freepages);
+ 	INIT_LIST_HEAD(&cc.migratepages);
+@@ -1071,7 +1121,7 @@ int sysctl_extfrag_threshold = 500;
+  * @order: The order of the current allocation
+  * @gfp_mask: The GFP mask of the current allocation
+  * @nodemask: The allowed nodes to allocate from
+- * @sync: Whether migration is synchronous or not
++ * @mode: The migration mode for async, sync light, or sync migration
+  * @contended: Return value that is true if compaction was aborted due to lock contention
+  * @page: Optionally capture a free page of the requested order during compaction
+  *
+@@ -1079,7 +1129,7 @@ int sysctl_extfrag_threshold = 500;
+  */
+ unsigned long try_to_compact_pages(struct zonelist *zonelist,
+ 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
+-			bool sync, bool *contended)
++			enum migrate_mode mode, bool *contended)
+ {
+ 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ 	int may_enter_fs = gfp_mask & __GFP_FS;
+@@ -1104,7 +1154,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
+ 								nodemask) {
+ 		int status;
+ 
+-		status = compact_zone_order(zone, order, gfp_mask, sync,
++		status = compact_zone_order(zone, order, gfp_mask, mode,
+ 						contended);
+ 		rc = max(status, rc);
+ 
+@@ -1140,13 +1190,9 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
+ 			compact_zone(zone, cc);
+ 
+ 		if (cc->order > 0) {
+-			int ok = zone_watermark_ok(zone, cc->order,
+-						low_wmark_pages(zone), 0, 0);
+-			if (ok && cc->order >= zone->compact_order_failed)
+-				zone->compact_order_failed = cc->order + 1;
+-			/* Currently async compaction is never deferred. */
+-			else if (!ok && cc->sync)
+-				defer_compaction(zone, cc->order);
++			if (zone_watermark_ok(zone, cc->order,
++						low_wmark_pages(zone), 0, 0))
++				compaction_defer_reset(zone, cc->order, false);
+ 		}
+ 
+ 		VM_BUG_ON(!list_empty(&cc->freepages));
+@@ -1158,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
+ {
+ 	struct compact_control cc = {
+ 		.order = order,
+-		.sync = false,
++		.mode = MIGRATE_ASYNC,
+ 	};
+ 
+ 	if (!order)
+@@ -1171,7 +1217,8 @@ static void compact_node(int nid)
+ {
+ 	struct compact_control cc = {
+ 		.order = -1,
+-		.sync = true,
++		.mode = MIGRATE_SYNC,
++		.ignore_skip_hint = true,
+ 	};
+ 
+ 	__compact_pgdat(NODE_DATA(nid), &cc);
+diff --git a/mm/filemap.c b/mm/filemap.c
+index ae4846ff4849..b012daefc2d7 100644
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -192,9 +192,11 @@ static int filemap_check_errors(struct address_space *mapping)
+ {
+ 	int ret = 0;
+ 	/* Check for outstanding write errors */
+-	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
++	if (test_bit(AS_ENOSPC, &mapping->flags) &&
++	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+ 		ret = -ENOSPC;
+-	if (test_and_clear_bit(AS_EIO, &mapping->flags))
++	if (test_bit(AS_EIO, &mapping->flags) &&
++	    test_and_clear_bit(AS_EIO, &mapping->flags))
+ 		ret = -EIO;
+ 	return ret;
+ }
+@@ -446,6 +448,29 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+ }
+ EXPORT_SYMBOL_GPL(replace_page_cache_page);
+ 
++static int page_cache_tree_insert(struct address_space *mapping,
++				  struct page *page)
++{
++	void **slot;
++	int error;
++
++	slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
++	if (slot) {
++		void *p;
++
++		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
++		if (!radix_tree_exceptional_entry(p))
++			return -EEXIST;
++		radix_tree_replace_slot(slot, page);
++		mapping->nrpages++;
++		return 0;
++	}
++	error = radix_tree_insert(&mapping->page_tree, page->index, page);
++	if (!error)
++		mapping->nrpages++;
++	return error;
++}
++
+ /**
+  * add_to_page_cache_locked - add a locked page to the pagecache
+  * @page:	page to add
+@@ -480,11 +505,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
+ 	page->index = offset;
+ 
+ 	spin_lock_irq(&mapping->tree_lock);
+-	error = radix_tree_insert(&mapping->page_tree, offset, page);
++	error = page_cache_tree_insert(mapping, page);
+ 	radix_tree_preload_end();
+ 	if (unlikely(error))
+ 		goto err_insert;
+-	mapping->nrpages++;
+ 	__inc_zone_page_state(page, NR_FILE_PAGES);
+ 	spin_unlock_irq(&mapping->tree_lock);
+ 	trace_mm_filemap_add_to_page_cache(page);
+@@ -520,10 +544,10 @@ struct page *__page_cache_alloc(gfp_t gfp)
+ 	if (cpuset_do_page_mem_spread()) {
+ 		unsigned int cpuset_mems_cookie;
+ 		do {
+-			cpuset_mems_cookie = get_mems_allowed();
++			cpuset_mems_cookie = read_mems_allowed_begin();
+ 			n = cpuset_mem_spread_node();
+ 			page = alloc_pages_exact_node(n, gfp, 0);
+-		} while (!put_mems_allowed(cpuset_mems_cookie) && !page);
++		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
+ 
+ 		return page;
+ 	}
+@@ -620,8 +644,17 @@ EXPORT_SYMBOL(unlock_page);
+  */
+ void end_page_writeback(struct page *page)
+ {
+-	if (TestClearPageReclaim(page))
++	/*
++	 * TestClearPageReclaim could be used here but it is an atomic
++	 * operation and overkill in this particular case. Failing to
++	 * shuffle a page marked for immediate reclaim is too mild to
++	 * justify taking an atomic operation penalty at the end of
++	 * every page writeback.
++	 */
++	if (PageReclaim(page)) {
++		ClearPageReclaim(page);
+ 		rotate_reclaimable_page(page);
++	}
+ 
+ 	if (!test_clear_page_writeback(page))
+ 		BUG();
+@@ -686,14 +719,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+ }
+ 
+ /**
+- * find_get_page - find and get a page reference
++ * page_cache_next_hole - find the next hole (not-present entry)
++ * @mapping: mapping
++ * @index: index
++ * @max_scan: maximum range to search
++ *
++ * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
++ * lowest indexed hole.
++ *
++ * Returns: the index of the hole if found, otherwise returns an index
++ * outside of the set specified (in which case 'return - index >=
++ * max_scan' will be true). In rare cases of index wrap-around, 0 will
++ * be returned.
++ *
++ * page_cache_next_hole may be called under rcu_read_lock. However,
++ * like radix_tree_gang_lookup, this will not atomically search a
++ * snapshot of the tree at a single point in time. For example, if a
++ * hole is created at index 5, then subsequently a hole is created at
++ * index 10, page_cache_next_hole covering both indexes may return 10
++ * if called under rcu_read_lock.
++ */
++pgoff_t page_cache_next_hole(struct address_space *mapping,
++			     pgoff_t index, unsigned long max_scan)
++{
++	unsigned long i;
++
++	for (i = 0; i < max_scan; i++) {
++		struct page *page;
++
++		page = radix_tree_lookup(&mapping->page_tree, index);
++		if (!page || radix_tree_exceptional_entry(page))
++			break;
++		index++;
++		if (index == 0)
++			break;
++	}
++
++	return index;
++}
++EXPORT_SYMBOL(page_cache_next_hole);
++
++/**
++ * page_cache_prev_hole - find the prev hole (not-present entry)
++ * @mapping: mapping
++ * @index: index
++ * @max_scan: maximum range to search
++ *
++ * Search backwards in the range [max(index-max_scan+1, 0), index] for
++ * the first hole.
++ *
++ * Returns: the index of the hole if found, otherwise returns an index
++ * outside of the set specified (in which case 'index - return >=
++ * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
++ * will be returned.
++ *
++ * page_cache_prev_hole may be called under rcu_read_lock. However,
++ * like radix_tree_gang_lookup, this will not atomically search a
++ * snapshot of the tree at a single point in time. For example, if a
++ * hole is created at index 10, then subsequently a hole is created at
++ * index 5, page_cache_prev_hole covering both indexes may return 5 if
++ * called under rcu_read_lock.
++ */
++pgoff_t page_cache_prev_hole(struct address_space *mapping,
++			     pgoff_t index, unsigned long max_scan)
++{
++	unsigned long i;
++
++	for (i = 0; i < max_scan; i++) {
++		struct page *page;
++
++		page = radix_tree_lookup(&mapping->page_tree, index);
++		if (!page || radix_tree_exceptional_entry(page))
++			break;
++		index--;
++		if (index == ULONG_MAX)
++			break;
++	}
++
++	return index;
++}
++EXPORT_SYMBOL(page_cache_prev_hole);
++
++/**
++ * find_get_entry - find and get a page cache entry
+  * @mapping: the address_space to search
+- * @offset: the page index
++ * @offset: the page cache index
++ *
++ * Looks up the page cache slot at @mapping & @offset.  If there is a
++ * page cache page, it is returned with an increased refcount.
+  *
+- * Is there a pagecache struct page at the given (mapping, offset) tuple?
+- * If yes, increment its refcount and return it; if no, return NULL.
++ * If the slot holds a shadow entry of a previously evicted page, it
++ * is returned.
++ *
++ * Otherwise, %NULL is returned.
+  */
+-struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
++struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
+ {
+ 	void **pagep;
+ 	struct page *page;
+@@ -734,24 +854,30 @@ out:
+ 
+ 	return page;
+ }
+-EXPORT_SYMBOL(find_get_page);
++EXPORT_SYMBOL(find_get_entry);
+ 
+ /**
+- * find_lock_page - locate, pin and lock a pagecache page
++ * find_lock_entry - locate, pin and lock a page cache entry
+  * @mapping: the address_space to search
+- * @offset: the page index
++ * @offset: the page cache index
++ *
++ * Looks up the page cache slot at @mapping & @offset.  If there is a
++ * page cache page, it is returned locked and with an increased
++ * refcount.
+  *
+- * Locates the desired pagecache page, locks it, increments its reference
+- * count and returns its address.
++ * If the slot holds a shadow entry of a previously evicted page, it
++ * is returned.
+  *
+- * Returns zero if the page was not present. find_lock_page() may sleep.
++ * Otherwise, %NULL is returned.
++ *
++ * find_lock_entry() may sleep.
+  */
+-struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
++struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
+ {
+ 	struct page *page;
+ 
+ repeat:
+-	page = find_get_page(mapping, offset);
++	page = find_get_entry(mapping, offset);
+ 	if (page && !radix_tree_exception(page)) {
+ 		lock_page(page);
+ 		/* Has the page been truncated? */
+@@ -764,44 +890,87 @@ repeat:
+ 	}
+ 	return page;
+ }
+-EXPORT_SYMBOL(find_lock_page);
++EXPORT_SYMBOL(find_lock_entry);
+ 
+ /**
+- * find_or_create_page - locate or add a pagecache page
+- * @mapping: the page's address_space
+- * @index: the page's index into the mapping
+- * @gfp_mask: page allocation mode
++ * pagecache_get_page - find and get a page reference
++ * @mapping: the address_space to search
++ * @offset: the page index
++ * @fgp_flags: FGP flags
++ * @gfp_mask: gfp mask to use if a page is to be allocated
++ *
++ * Looks up the page cache slot at @mapping & @offset.
++ *
++ * FGP flags modify how the page is returned
+  *
+- * Locates a page in the pagecache.  If the page is not present, a new page
+- * is allocated using @gfp_mask and is added to the pagecache and to the VM's
+- * LRU list.  The returned page is locked and has its reference count
+- * incremented.
++ * FGP_ACCESSED: the page will be marked accessed
++ * FGP_LOCK: Page is returned locked
++ * FGP_CREAT: If page is not present then a new page is allocated using
++ *		@gfp_mask and added to the page cache and the VM's LRU
++ *		list. The page is returned locked and with an increased
++ *		refcount. Otherwise, %NULL is returned.
+  *
+- * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
+- * allocation!
++ * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
++ * if the GFP flags specified for FGP_CREAT are atomic.
+  *
+- * find_or_create_page() returns the desired page's address, or zero on
+- * memory exhaustion.
++ * If there is a page cache page, it is returned with an increased refcount.
+  */
+-struct page *find_or_create_page(struct address_space *mapping,
+-		pgoff_t index, gfp_t gfp_mask)
++struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
++	int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
+ {
+ 	struct page *page;
+-	int err;
++
+ repeat:
+-	page = find_lock_page(mapping, index);
+-	if (!page) {
+-		page = __page_cache_alloc(gfp_mask);
++	page = find_get_entry(mapping, offset);
++	if (radix_tree_exceptional_entry(page))
++		page = NULL;
++	if (!page)
++		goto no_page;
++
++	if (fgp_flags & FGP_LOCK) {
++		if (fgp_flags & FGP_NOWAIT) {
++			if (!trylock_page(page)) {
++				page_cache_release(page);
++				return NULL;
++			}
++		} else {
++			lock_page(page);
++		}
++
++		/* Has the page been truncated? */
++		if (unlikely(page->mapping != mapping)) {
++			unlock_page(page);
++			page_cache_release(page);
++			goto repeat;
++		}
++		VM_BUG_ON(page->index != offset);
++	}
++
++	if (page && (fgp_flags & FGP_ACCESSED))
++		mark_page_accessed(page);
++
++no_page:
++	if (!page && (fgp_flags & FGP_CREAT)) {
++		int err;
++		if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
++			cache_gfp_mask |= __GFP_WRITE;
++		if (fgp_flags & FGP_NOFS) {
++			cache_gfp_mask &= ~__GFP_FS;
++			radix_gfp_mask &= ~__GFP_FS;
++		}
++
++		page = __page_cache_alloc(cache_gfp_mask);
+ 		if (!page)
+ 			return NULL;
+-		/*
+-		 * We want a regular kernel memory (not highmem or DMA etc)
+-		 * allocation for the radix tree nodes, but we need to honour
+-		 * the context-specific requirements the caller has asked for.
+-		 * GFP_RECLAIM_MASK collects those requirements.
+-		 */
+-		err = add_to_page_cache_lru(page, mapping, index,
+-			(gfp_mask & GFP_RECLAIM_MASK));
++
++		if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
++			fgp_flags |= FGP_LOCK;
++
++		/* Init accessed so avoid atomic mark_page_accessed later */
++		if (fgp_flags & FGP_ACCESSED)
++			init_page_accessed(page);
++
++		err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
+ 		if (unlikely(err)) {
+ 			page_cache_release(page);
+ 			page = NULL;
+@@ -809,9 +978,80 @@ repeat:
+ 				goto repeat;
+ 		}
+ 	}
++
+ 	return page;
+ }
+-EXPORT_SYMBOL(find_or_create_page);
++EXPORT_SYMBOL(pagecache_get_page);
++
++/**
++ * find_get_entries - gang pagecache lookup
++ * @mapping:	The address_space to search
++ * @start:	The starting page cache index
++ * @nr_entries:	The maximum number of entries
++ * @entries:	Where the resulting entries are placed
++ * @indices:	The cache indices corresponding to the entries in @entries
++ *
++ * find_get_entries() will search for and return a group of up to
++ * @nr_entries entries in the mapping.  The entries are placed at
++ * @entries.  find_get_entries() takes a reference against any actual
++ * pages it returns.
++ *
++ * The search returns a group of mapping-contiguous page cache entries
++ * with ascending indexes.  There may be holes in the indices due to
++ * not-present pages.
++ *
++ * Any shadow entries of evicted pages are included in the returned
++ * array.
++ *
++ * find_get_entries() returns the number of pages and shadow entries
++ * which were found.
++ */
++unsigned find_get_entries(struct address_space *mapping,
++			  pgoff_t start, unsigned int nr_entries,
++			  struct page **entries, pgoff_t *indices)
++{
++	void **slot;
++	unsigned int ret = 0;
++	struct radix_tree_iter iter;
++
++	if (!nr_entries)
++		return 0;
++
++	rcu_read_lock();
++restart:
++	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
++		struct page *page;
++repeat:
++		page = radix_tree_deref_slot(slot);
++		if (unlikely(!page))
++			continue;
++		if (radix_tree_exception(page)) {
++			if (radix_tree_deref_retry(page))
++				goto restart;
++			/*
++			 * Otherwise, we must be storing a swap entry
++			 * here as an exceptional entry: so return it
++			 * without attempting to raise page count.
++			 */
++			goto export;
++		}
++		if (!page_cache_get_speculative(page))
++			goto repeat;
++
++		/* Has the page moved? */
++		if (unlikely(page != *slot)) {
++			page_cache_release(page);
++			goto repeat;
++		}
++export:
++		indices[ret] = iter.index;
++		entries[ret] = page;
++		if (++ret == nr_entries)
++			break;
++	}
++	rcu_read_unlock();
++	return ret;
++}
+ 
+ /**
+  * find_get_pages - gang pagecache lookup
+@@ -1031,39 +1271,6 @@ repeat:
+ }
+ EXPORT_SYMBOL(find_get_pages_tag);
+ 
+-/**
+- * grab_cache_page_nowait - returns locked page at given index in given cache
+- * @mapping: target address_space
+- * @index: the page index
+- *
+- * Same as grab_cache_page(), but do not wait if the page is unavailable.
+- * This is intended for speculative data generators, where the data can
+- * be regenerated if the page couldn't be grabbed.  This routine should
+- * be safe to call while holding the lock for another page.
+- *
+- * Clear __GFP_FS when allocating the page to avoid recursion into the fs
+- * and deadlock against the caller's locked page.
+- */
+-struct page *
+-grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
+-{
+-	struct page *page = find_get_page(mapping, index);
+-
+-	if (page) {
+-		if (trylock_page(page))
+-			return page;
+-		page_cache_release(page);
+-		return NULL;
+-	}
+-	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
+-	if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
+-		page_cache_release(page);
+-		page = NULL;
+-	}
+-	return page;
+-}
+-EXPORT_SYMBOL(grab_cache_page_nowait);
+-
+ /*
+  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
+  * a _large_ part of the i/o request. Imagine the worst scenario:
+@@ -1797,6 +2004,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
+ EXPORT_SYMBOL(generic_file_mmap);
+ EXPORT_SYMBOL(generic_file_readonly_mmap);
+ 
++static struct page *wait_on_page_read(struct page *page)
++{
++	if (!IS_ERR(page)) {
++		wait_on_page_locked(page);
++		if (!PageUptodate(page)) {
++			page_cache_release(page);
++			page = ERR_PTR(-EIO);
++		}
++	}
++	return page;
++}
++
+ static struct page *__read_cache_page(struct address_space *mapping,
+ 				pgoff_t index,
+ 				int (*filler)(void *, struct page *),
+@@ -1823,6 +2042,8 @@ repeat:
+ 		if (err < 0) {
+ 			page_cache_release(page);
+ 			page = ERR_PTR(err);
++		} else {
++			page = wait_on_page_read(page);
+ 		}
+ 	}
+ 	return page;
+@@ -1859,6 +2080,10 @@ retry:
+ 	if (err < 0) {
+ 		page_cache_release(page);
+ 		return ERR_PTR(err);
++	} else {
++		page = wait_on_page_read(page);
++		if (IS_ERR(page))
++			return page;
+ 	}
+ out:
+ 	mark_page_accessed(page);
+@@ -1866,40 +2091,25 @@ out:
+ }
+ 
+ /**
+- * read_cache_page_async - read into page cache, fill it if needed
++ * read_cache_page - read into page cache, fill it if needed
+  * @mapping:	the page's address_space
+  * @index:	the page index
+  * @filler:	function to perform the read
+  * @data:	first arg to filler(data, page) function, often left as NULL
+  *
+- * Same as read_cache_page, but don't wait for page to become unlocked
+- * after submitting it to the filler.
+- *
+  * Read into the page cache. If a page already exists, and PageUptodate() is
+- * not set, try to fill the page but don't wait for it to become unlocked.
++ * not set, try to fill the page and wait for it to become unlocked.
+  *
+  * If the page does not get brought uptodate, return -EIO.
+  */
+-struct page *read_cache_page_async(struct address_space *mapping,
++struct page *read_cache_page(struct address_space *mapping,
+ 				pgoff_t index,
+ 				int (*filler)(void *, struct page *),
+ 				void *data)
+ {
+ 	return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
+ }
+-EXPORT_SYMBOL(read_cache_page_async);
+-
+-static struct page *wait_on_page_read(struct page *page)
+-{
+-	if (!IS_ERR(page)) {
+-		wait_on_page_locked(page);
+-		if (!PageUptodate(page)) {
+-			page_cache_release(page);
+-			page = ERR_PTR(-EIO);
+-		}
+-	}
+-	return page;
+-}
++EXPORT_SYMBOL(read_cache_page);
+ 
+ /**
+  * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+@@ -1918,31 +2128,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
+ {
+ 	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+ 
+-	return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
++	return do_read_cache_page(mapping, index, filler, NULL, gfp);
+ }
+ EXPORT_SYMBOL(read_cache_page_gfp);
+ 
+-/**
+- * read_cache_page - read into page cache, fill it if needed
+- * @mapping:	the page's address_space
+- * @index:	the page index
+- * @filler:	function to perform the read
+- * @data:	first arg to filler(data, page) function, often left as NULL
+- *
+- * Read into the page cache. If a page already exists, and PageUptodate() is
+- * not set, try to fill the page then wait for it to become unlocked.
+- *
+- * If the page does not get brought uptodate, return -EIO.
+- */
+-struct page *read_cache_page(struct address_space *mapping,
+-				pgoff_t index,
+-				int (*filler)(void *, struct page *),
+-				void *data)
+-{
+-	return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
+-}
+-EXPORT_SYMBOL(read_cache_page);
+-
+ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+ 			const struct iovec *iov, size_t base, size_t bytes)
+ {
+@@ -1976,7 +2165,6 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
+ 	char *kaddr;
+ 	size_t copied;
+ 
+-	BUG_ON(!in_atomic());
+ 	kaddr = kmap_atomic(page);
+ 	if (likely(i->nr_segs == 1)) {
+ 		int left;
+@@ -2186,7 +2374,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
+ {
+ 	const struct address_space_operations *aops = mapping->a_ops;
+ 
+-	mark_page_accessed(page);
+ 	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
+ }
+ EXPORT_SYMBOL(pagecache_write_end);
+@@ -2268,34 +2455,18 @@ EXPORT_SYMBOL(generic_file_direct_write);
+ struct page *grab_cache_page_write_begin(struct address_space *mapping,
+ 					pgoff_t index, unsigned flags)
+ {
+-	int status;
+-	gfp_t gfp_mask;
+ 	struct page *page;
+-	gfp_t gfp_notmask = 0;
++	int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
+ 
+-	gfp_mask = mapping_gfp_mask(mapping);
+-	if (mapping_cap_account_dirty(mapping))
+-		gfp_mask |= __GFP_WRITE;
+ 	if (flags & AOP_FLAG_NOFS)
+-		gfp_notmask = __GFP_FS;
+-repeat:
+-	page = find_lock_page(mapping, index);
++		fgp_flags |= FGP_NOFS;
++
++	page = pagecache_get_page(mapping, index, fgp_flags,
++			mapping_gfp_mask(mapping),
++			GFP_KERNEL);
+ 	if (page)
+-		goto found;
++		wait_for_stable_page(page);
+ 
+-	page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
+-	if (!page)
+-		return NULL;
+-	status = add_to_page_cache_lru(page, mapping, index,
+-						GFP_KERNEL & ~gfp_notmask);
+-	if (unlikely(status)) {
+-		page_cache_release(page);
+-		if (status == -EEXIST)
+-			goto repeat;
+-		return NULL;
+-	}
+-found:
+-	wait_for_stable_page(page);
+ 	return page;
+ }
+ EXPORT_SYMBOL(grab_cache_page_write_begin);
+@@ -2344,18 +2515,15 @@ again:
+ 
+ 		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
+ 						&page, &fsdata);
+-		if (unlikely(status))
++		if (unlikely(status < 0))
+ 			break;
+ 
+ 		if (mapping_writably_mapped(mapping))
+ 			flush_dcache_page(page);
+ 
+-		pagefault_disable();
+ 		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+-		pagefault_enable();
+ 		flush_dcache_page(page);
+ 
+-		mark_page_accessed(page);
+ 		status = a_ops->write_end(file, mapping, pos, bytes, copied,
+ 						page, fsdata);
+ 		if (unlikely(status < 0))
+diff --git a/mm/fremap.c b/mm/fremap.c
+index bbc4d660221a..34feba60a17e 100644
+--- a/mm/fremap.c
++++ b/mm/fremap.c
+@@ -23,28 +23,44 @@
+ 
+ #include "internal.h"
+ 
++static int mm_counter(struct page *page)
++{
++	return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
++}
++
+ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+ 			unsigned long addr, pte_t *ptep)
+ {
+ 	pte_t pte = *ptep;
++	struct page *page;
++	swp_entry_t entry;
+ 
+ 	if (pte_present(pte)) {
+-		struct page *page;
+-
+ 		flush_cache_page(vma, addr, pte_pfn(pte));
+ 		pte = ptep_clear_flush(vma, addr, ptep);
+ 		page = vm_normal_page(vma, addr, pte);
+ 		if (page) {
+ 			if (pte_dirty(pte))
+ 				set_page_dirty(page);
++			update_hiwater_rss(mm);
++			dec_mm_counter(mm, mm_counter(page));
+ 			page_remove_rmap(page);
+ 			page_cache_release(page);
++		}
++	} else {	/* zap_pte() is not called when pte_none() */
++		if (!pte_file(pte)) {
+ 			update_hiwater_rss(mm);
+-			dec_mm_counter(mm, MM_FILEPAGES);
++			entry = pte_to_swp_entry(pte);
++			if (non_swap_entry(entry)) {
++				if (is_migration_entry(entry)) {
++					page = migration_entry_to_page(entry);
++					dec_mm_counter(mm, mm_counter(page));
++				}
++			} else {
++				free_swap_and_cache(entry);
++				dec_mm_counter(mm, MM_SWAPENTS);
++			}
+ 		}
+-	} else {
+-		if (!pte_file(pte))
+-			free_swap_and_cache(pte_to_swp_entry(pte));
+ 		pte_clear_not_present_full(mm, addr, ptep, 0);
+ 	}
+ }
+diff --git a/mm/frontswap.c b/mm/frontswap.c
+index 1b24bdcb3197..c30eec536f03 100644
+--- a/mm/frontswap.c
++++ b/mm/frontswap.c
+@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area);
+ 
+ static unsigned long __frontswap_curr_pages(void)
+ {
+-	int type;
+ 	unsigned long totalpages = 0;
+ 	struct swap_info_struct *si = NULL;
+ 
+ 	assert_spin_locked(&swap_lock);
+-	for (type = swap_list.head; type >= 0; type = si->next) {
+-		si = swap_info[type];
++	plist_for_each_entry(si, &swap_active_head, list)
+ 		totalpages += atomic_read(&si->frontswap_pages);
+-	}
+ 	return totalpages;
+ }
+ 
+@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
+ 	int si_frontswap_pages;
+ 	unsigned long total_pages_to_unuse = total;
+ 	unsigned long pages = 0, pages_to_unuse = 0;
+-	int type;
+ 
+ 	assert_spin_locked(&swap_lock);
+-	for (type = swap_list.head; type >= 0; type = si->next) {
+-		si = swap_info[type];
++	plist_for_each_entry(si, &swap_active_head, list) {
+ 		si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ 		if (total_pages_to_unuse < si_frontswap_pages) {
+ 			pages = pages_to_unuse = total_pages_to_unuse;
+@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
+ 		}
+ 		vm_unacct_memory(pages);
+ 		*unused = pages_to_unuse;
+-		*swapid = type;
++		*swapid = si->type;
+ 		ret = 0;
+ 		break;
+ 	}
+@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
+ 	/*
+ 	 * we don't want to hold swap_lock while doing a very
+ 	 * lengthy try_to_unuse, but swap_list may change
+-	 * so restart scan from swap_list.head each time
++	 * so restart scan from swap_active_head each time
+ 	 */
+ 	spin_lock(&swap_lock);
+ 	ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 389973fd6bb7..2ee53749eb48 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -758,14 +758,6 @@ static inline struct page *alloc_hugepage_vma(int defrag,
+ 			       HPAGE_PMD_ORDER, vma, haddr, nd);
+ }
+ 
+-#ifndef CONFIG_NUMA
+-static inline struct page *alloc_hugepage(int defrag)
+-{
+-	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+-			   HPAGE_PMD_ORDER);
+-}
+-#endif
+-
+ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+ 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+ 		struct page *zero_page)
+@@ -2197,7 +2189,58 @@ static void khugepaged_alloc_sleep(void)
+ 			msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ }
+ 
++static int khugepaged_node_load[MAX_NUMNODES];
++
++static bool khugepaged_scan_abort(int nid)
++{
++	int i;
++
++	/*
++	 * If zone_reclaim_mode is disabled, then no extra effort is made to
++	 * allocate memory locally.
++	 */
++	if (!zone_reclaim_mode)
++		return false;
++
++	/* If there is a count for this node already, it must be acceptable */
++	if (khugepaged_node_load[nid])
++		return false;
++
++	for (i = 0; i < MAX_NUMNODES; i++) {
++		if (!khugepaged_node_load[i])
++			continue;
++		if (node_distance(nid, i) > RECLAIM_DISTANCE)
++			return true;
++	}
++	return false;
++}
++
+ #ifdef CONFIG_NUMA
++static int khugepaged_find_target_node(void)
++{
++	static int last_khugepaged_target_node = NUMA_NO_NODE;
++	int nid, target_node = 0, max_value = 0;
++
++	/* find first node with max normal pages hit */
++	for (nid = 0; nid < MAX_NUMNODES; nid++)
++		if (khugepaged_node_load[nid] > max_value) {
++			max_value = khugepaged_node_load[nid];
++			target_node = nid;
++		}
++
++	/* do some balance if several nodes have the same hit record */
++	if (target_node <= last_khugepaged_target_node)
++		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
++				nid++)
++			if (max_value == khugepaged_node_load[nid]) {
++				target_node = nid;
++				break;
++			}
++
++	last_khugepaged_target_node = target_node;
++	return target_node;
++}
++
+ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+ {
+ 	if (IS_ERR(*hpage)) {
+@@ -2231,9 +2274,8 @@ static struct page
+ 	 * mmap_sem in read mode is good idea also to allow greater
+ 	 * scalability.
+ 	 */
+-	*hpage  = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+-				      node, __GFP_OTHER_NODE);
+-
++	*hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
++		khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
+ 	/*
+ 	 * After allocating the hugepage, release the mmap_sem read lock in
+ 	 * preparation for taking it in write mode.
+@@ -2249,6 +2291,17 @@ static struct page
+ 	return *hpage;
+ }
+ #else
++static int khugepaged_find_target_node(void)
++{
++	return 0;
++}
++
++static inline struct page *alloc_hugepage(int defrag)
++{
++	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
++			   HPAGE_PMD_ORDER);
++}
++
+ static struct page *khugepaged_alloc_hugepage(bool *wait)
+ {
+ 	struct page *hpage;
+@@ -2455,6 +2508,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
+ 	if (pmd_trans_huge(*pmd))
+ 		goto out;
+ 
++	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ 	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ 	     _pte++, _address += PAGE_SIZE) {
+@@ -2471,12 +2525,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
+ 		if (unlikely(!page))
+ 			goto out_unmap;
+ 		/*
+-		 * Chose the node of the first page. This could
+-		 * be more sophisticated and look at more pages,
+-		 * but isn't for now.
++		 * Record which node the original page is from and save this
++		 * information to khugepaged_node_load[].
++		 * Khupaged will allocate hugepage from the node has the max
++		 * hit record.
+ 		 */
+-		if (node == NUMA_NO_NODE)
+-			node = page_to_nid(page);
++		node = page_to_nid(page);
++		if (khugepaged_scan_abort(node))
++			goto out_unmap;
++		khugepaged_node_load[node]++;
+ 		VM_BUG_ON(PageCompound(page));
+ 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ 			goto out_unmap;
+@@ -2491,9 +2548,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
+ 		ret = 1;
+ out_unmap:
+ 	pte_unmap_unlock(pte, ptl);
+-	if (ret)
++	if (ret) {
++		node = khugepaged_find_target_node();
+ 		/* collapse_huge_page will return with the mmap_sem released */
+ 		collapse_huge_page(mm, address, hpage, vma, node);
++	}
+ out:
+ 	return ret;
+ }
+diff --git a/mm/hugetlb.c b/mm/hugetlb.c
+index f80b17106d24..c33d8a65298c 100644
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -574,7 +574,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
+ 		goto err;
+ 
+ retry_cpuset:
+-	cpuset_mems_cookie = get_mems_allowed();
++	cpuset_mems_cookie = read_mems_allowed_begin();
+ 	zonelist = huge_zonelist(vma, address,
+ 					htlb_alloc_mask(h), &mpol, &nodemask);
+ 
+@@ -596,7 +596,7 @@ retry_cpuset:
+ 	}
+ 
+ 	mpol_cond_put(mpol);
+-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ 		goto retry_cpuset;
+ 	return page;
+ 
+@@ -2114,6 +2114,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
+ 	unsigned long tmp;
+ 	int ret;
+ 
++	if (!hugepages_supported())
++		return -ENOTSUPP;
++
+ 	tmp = h->max_huge_pages;
+ 
+ 	if (write && h->order >= MAX_ORDER)
+@@ -2167,6 +2170,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
+ 	unsigned long tmp;
+ 	int ret;
+ 
++	if (!hugepages_supported())
++		return -ENOTSUPP;
++
+ 	tmp = h->nr_overcommit_huge_pages;
+ 
+ 	if (write && h->order >= MAX_ORDER)
+@@ -2192,6 +2198,8 @@ out:
+ void hugetlb_report_meminfo(struct seq_file *m)
+ {
+ 	struct hstate *h = &default_hstate;
++	if (!hugepages_supported())
++		return;
+ 	seq_printf(m,
+ 			"HugePages_Total:   %5lu\n"
+ 			"HugePages_Free:    %5lu\n"
+@@ -2208,6 +2216,8 @@ void hugetlb_report_meminfo(struct seq_file *m)
+ int hugetlb_report_node_meminfo(int nid, char *buf)
+ {
+ 	struct hstate *h = &default_hstate;
++	if (!hugepages_supported())
++		return 0;
+ 	return sprintf(buf,
+ 		"Node %d HugePages_Total: %5u\n"
+ 		"Node %d HugePages_Free:  %5u\n"
+@@ -2222,6 +2232,9 @@ void hugetlb_show_meminfo(void)
+ 	struct hstate *h;
+ 	int nid;
+ 
++	if (!hugepages_supported())
++		return;
++
+ 	for_each_node_state(nid, N_MEMORY)
+ 		for_each_hstate(h)
+ 			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+diff --git a/mm/internal.h b/mm/internal.h
+index fdddbc83ac5f..d610f7ce4e9c 100644
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -11,6 +11,7 @@
+ #ifndef __MM_INTERNAL_H
+ #define __MM_INTERNAL_H
+ 
++#include <linux/fs.h>
+ #include <linux/mm.h>
+ 
+ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+@@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v)
+ 	atomic_set(&page->_count, v);
+ }
+ 
++extern int __do_page_cache_readahead(struct address_space *mapping,
++		struct file *filp, pgoff_t offset, unsigned long nr_to_read,
++		unsigned long lookahead_size);
++
++/*
++ * Submit IO for the read-ahead request in file_ra_state.
++ */
++static inline unsigned long ra_submit(struct file_ra_state *ra,
++		struct address_space *mapping, struct file *filp)
++{
++	return __do_page_cache_readahead(mapping, filp,
++					ra->start, ra->size, ra->async_size);
++}
++
+ /*
+  * Turn a non-refcounted page (->_count == 0) into refcounted with
+  * a count of one.
+@@ -120,7 +135,7 @@ struct compact_control {
+ 	unsigned long nr_migratepages;	/* Number of pages to migrate */
+ 	unsigned long free_pfn;		/* isolate_freepages search base */
+ 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
+-	bool sync;			/* Synchronous migration */
++	enum migrate_mode mode;		/* Async or sync migration mode */
+ 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
+ 	bool finished_update_free;	/* True when the zone cached pfns are
+ 					 * no longer being updated
+@@ -130,7 +145,10 @@ struct compact_control {
+ 	int order;			/* order a direct compactor needs */
+ 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
+ 	struct zone *zone;
+-	bool contended;			/* True if a lock was contended */
++	bool contended;			/* True if a lock was contended, or
++					 * need_resched() true during async
++					 * compaction
++					 */
+ };
+ 
+ unsigned long
+diff --git a/mm/madvise.c b/mm/madvise.c
+index 539eeb96b323..a402f8fdc68e 100644
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+ 	for (; start < end; start += PAGE_SIZE) {
+ 		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ 
+-		page = find_get_page(mapping, index);
++		page = find_get_entry(mapping, index);
+ 		if (!radix_tree_exceptional_entry(page)) {
+ 			if (page)
+ 				page_cache_release(page);
+diff --git a/mm/memory-failure.c b/mm/memory-failure.c
+index 6e3f9c39bc22..4ab233d4714a 100644
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1554,7 +1554,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
+ 
+ 	/* Keep page count to indicate a given hugepage is isolated. */
+ 	list_move(&hpage->lru, &pagelist);
+-	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
++	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+ 				MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ 	if (ret) {
+ 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
+@@ -1635,7 +1635,7 @@ static int __soft_offline_page(struct page *page, int flags)
+ 		inc_zone_page_state(page, NR_ISOLATED_ANON +
+ 					page_is_file_cache(page));
+ 		list_add(&page->lru, &pagelist);
+-		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
++		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+ 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ 		if (ret) {
+ 			putback_lru_pages(&pagelist);
+diff --git a/mm/memory.c b/mm/memory.c
+index 99fe3aa1035c..b5901068495f 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -878,7 +878,7 @@ out_set_pte:
+ 	return 0;
+ }
+ 
+-int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
++static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ 		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+ 		   unsigned long addr, unsigned long end)
+ {
+@@ -3698,7 +3698,7 @@ static int handle_pte_fault(struct mm_struct *mm,
+ 	pte_t entry;
+ 	spinlock_t *ptl;
+ 
+-	entry = *pte;
++	entry = ACCESS_ONCE(*pte);
+ 	if (!pte_present(entry)) {
+ 		if (pte_none(entry)) {
+ 			if (vma->vm_ops) {
+diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
+index ed85fe3870e2..d31730564617 100644
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -1321,7 +1321,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
+ 		 * alloc_migrate_target should be improooooved!!
+ 		 * migrate_pages returns # of failed pages.
+ 		 */
+-		ret = migrate_pages(&source, alloc_migrate_target, 0,
++		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
+ 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ 		if (ret)
+ 			putback_movable_pages(&source);
+diff --git a/mm/mempolicy.c b/mm/mempolicy.c
+index 0437f3595b32..cc61c7a7d6a1 100644
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -1060,7 +1060,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
+ 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+ 
+ 	if (!list_empty(&pagelist)) {
+-		err = migrate_pages(&pagelist, new_node_page, dest,
++		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
+ 					MIGRATE_SYNC, MR_SYSCALL);
+ 		if (err)
+ 			putback_movable_pages(&pagelist);
+@@ -1306,7 +1306,7 @@ static long do_mbind(unsigned long start, unsigned long len,
+ 
+ 		if (!list_empty(&pagelist)) {
+ 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
+-			nr_failed = migrate_pages(&pagelist, new_page,
++			nr_failed = migrate_pages(&pagelist, new_page, NULL,
+ 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+ 			if (nr_failed)
+ 				putback_movable_pages(&pagelist);
+@@ -1873,7 +1873,7 @@ int node_random(const nodemask_t *maskp)
+  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
+  * @nodemask for filtering the zonelist.
+  *
+- * Must be protected by get_mems_allowed()
++ * Must be protected by read_mems_allowed_begin()
+  */
+ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
+ 				gfp_t gfp_flags, struct mempolicy **mpol,
+@@ -2037,7 +2037,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+ 
+ retry_cpuset:
+ 	pol = get_vma_policy(current, vma, addr);
+-	cpuset_mems_cookie = get_mems_allowed();
++	cpuset_mems_cookie = read_mems_allowed_begin();
+ 
+ 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+ 		unsigned nid;
+@@ -2045,7 +2045,7 @@ retry_cpuset:
+ 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+ 		mpol_cond_put(pol);
+ 		page = alloc_page_interleave(gfp, order, nid);
+-		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++		if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ 			goto retry_cpuset;
+ 
+ 		return page;
+@@ -2055,7 +2055,7 @@ retry_cpuset:
+ 				      policy_nodemask(gfp, pol));
+ 	if (unlikely(mpol_needs_cond_ref(pol)))
+ 		__mpol_put(pol);
+-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ 		goto retry_cpuset;
+ 	return page;
+ }
+@@ -2089,7 +2089,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
+ 		pol = &default_policy;
+ 
+ retry_cpuset:
+-	cpuset_mems_cookie = get_mems_allowed();
++	cpuset_mems_cookie = read_mems_allowed_begin();
+ 
+ 	/*
+ 	 * No reference counting needed for current->mempolicy
+@@ -2102,7 +2102,7 @@ retry_cpuset:
+ 				policy_zonelist(gfp, pol, numa_node_id()),
+ 				policy_nodemask(gfp, pol));
+ 
+-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ 		goto retry_cpuset;
+ 
+ 	return page;
+diff --git a/mm/migrate.c b/mm/migrate.c
+index e3cf71dd1288..96d4d814ae2f 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -867,8 +867,9 @@ out:
+  * Obtain the lock on page, remove all ptes and migrate the page
+  * to the newly allocated page in newpage.
+  */
+-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+-			struct page *page, int force, enum migrate_mode mode)
++static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
++			unsigned long private, struct page *page, int force,
++			enum migrate_mode mode)
+ {
+ 	int rc = 0;
+ 	int *result = NULL;
+@@ -912,11 +913,18 @@ out:
+ 				page_is_file_cache(page));
+ 		putback_lru_page(page);
+ 	}
++
+ 	/*
+-	 * Move the new page to the LRU. If migration was not successful
+-	 * then this will free the page.
++	 * If migration was not successful and there's a freeing callback, use
++	 * it.  Otherwise, putback_lru_page() will drop the reference grabbed
++	 * during isolation.
+ 	 */
+-	putback_lru_page(newpage);
++	if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
++		ClearPageSwapBacked(newpage);
++		put_new_page(newpage, private);
++	} else
++		putback_lru_page(newpage);
++
+ 	if (result) {
+ 		if (rc)
+ 			*result = rc;
+@@ -945,8 +953,9 @@ out:
+  * will wait in the page fault for migration to complete.
+  */
+ static int unmap_and_move_huge_page(new_page_t get_new_page,
+-				unsigned long private, struct page *hpage,
+-				int force, enum migrate_mode mode)
++				free_page_t put_new_page, unsigned long private,
++				struct page *hpage, int force,
++				enum migrate_mode mode)
+ {
+ 	int rc = 0;
+ 	int *result = NULL;
+@@ -982,20 +991,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
+ 	if (!page_mapped(hpage))
+ 		rc = move_to_new_page(new_hpage, hpage, 1, mode);
+ 
+-	if (rc)
++	if (rc != MIGRATEPAGE_SUCCESS)
+ 		remove_migration_ptes(hpage, hpage);
+ 
+ 	if (anon_vma)
+ 		put_anon_vma(anon_vma);
+ 
+-	if (!rc)
++	if (rc == MIGRATEPAGE_SUCCESS)
+ 		hugetlb_cgroup_migrate(hpage, new_hpage);
+ 
+ 	unlock_page(hpage);
+ out:
+ 	if (rc != -EAGAIN)
+ 		putback_active_hugepage(hpage);
+-	put_page(new_hpage);
++
++	/*
++	 * If migration was not successful and there's a freeing callback, use
++	 * it.  Otherwise, put_page() will drop the reference grabbed during
++	 * isolation.
++	 */
++	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
++		put_new_page(new_hpage, private);
++	else
++		put_page(new_hpage);
++
+ 	if (result) {
+ 		if (rc)
+ 			*result = rc;
+@@ -1012,6 +1031,8 @@ out:
+  * @from:		The list of pages to be migrated.
+  * @get_new_page:	The function used to allocate free pages to be used
+  *			as the target of the page migration.
++ * @put_new_page:	The function used to free target pages if migration
++ *			fails, or NULL if no special handling is necessary.
+  * @private:		Private data to be passed on to get_new_page()
+  * @mode:		The migration mode that specifies the constraints for
+  *			page migration, if any.
+@@ -1025,7 +1046,8 @@ out:
+  * Returns the number of pages that were not migrated, or an error code.
+  */
+ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+-		unsigned long private, enum migrate_mode mode, int reason)
++		free_page_t put_new_page, unsigned long private,
++		enum migrate_mode mode, int reason)
+ {
+ 	int retry = 1;
+ 	int nr_failed = 0;
+@@ -1047,10 +1069,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+ 
+ 			if (PageHuge(page))
+ 				rc = unmap_and_move_huge_page(get_new_page,
+-						private, page, pass > 2, mode);
++						put_new_page, private, page,
++						pass > 2, mode);
+ 			else
+-				rc = unmap_and_move(get_new_page, private,
+-						page, pass > 2, mode);
++				rc = unmap_and_move(get_new_page, put_new_page,
++						private, page, pass > 2, mode);
+ 
+ 			switch(rc) {
+ 			case -ENOMEM:
+@@ -1194,7 +1217,7 @@ set_status:
+ 
+ 	err = 0;
+ 	if (!list_empty(&pagelist)) {
+-		err = migrate_pages(&pagelist, new_page_node,
++		err = migrate_pages(&pagelist, new_page_node, NULL,
+ 				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
+ 		if (err)
+ 			putback_movable_pages(&pagelist);
+@@ -1643,7 +1666,8 @@ int migrate_misplaced_page(struct page *page, int node)
+ 
+ 	list_add(&page->lru, &migratepages);
+ 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+-				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
++				     NULL, node, MIGRATE_ASYNC,
++				     MR_NUMA_MISPLACED);
+ 	if (nr_remaining) {
+ 		putback_lru_pages(&migratepages);
+ 		isolated = 0;
+diff --git a/mm/mincore.c b/mm/mincore.c
+index da2be56a7b8f..06cb81005c77 100644
+--- a/mm/mincore.c
++++ b/mm/mincore.c
+@@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+ 	 * any other file mapping (ie. marked !present and faulted in with
+ 	 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
+ 	 */
+-	page = find_get_page(mapping, pgoff);
+ #ifdef CONFIG_SWAP
+-	/* shmem/tmpfs may return swap: account for swapcache page too. */
+-	if (radix_tree_exceptional_entry(page)) {
+-		swp_entry_t swap = radix_to_swp_entry(page);
+-		page = find_get_page(swap_address_space(swap), swap.val);
+-	}
++	if (shmem_mapping(mapping)) {
++		page = find_get_entry(mapping, pgoff);
++		/*
++		 * shmem/tmpfs may return swap: account for swapcache
++		 * page too.
++		 */
++		if (radix_tree_exceptional_entry(page)) {
++			swp_entry_t swp = radix_to_swp_entry(page);
++			page = find_get_page(swap_address_space(swp), swp.val);
++		}
++	} else
++		page = find_get_page(mapping, pgoff);
++#else
++	page = find_get_page(mapping, pgoff);
+ #endif
+ 	if (page) {
+ 		present = PageUptodate(page);
+diff --git a/mm/mmap.c b/mm/mmap.c
+index af99b9ed2007..c1249cb7dc15 100644
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -10,6 +10,7 @@
+ #include <linux/slab.h>
+ #include <linux/backing-dev.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/shm.h>
+ #include <linux/mman.h>
+ #include <linux/pagemap.h>
+@@ -682,8 +683,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	prev->vm_next = next = vma->vm_next;
+ 	if (next)
+ 		next->vm_prev = prev;
+-	if (mm->mmap_cache == vma)
+-		mm->mmap_cache = prev;
++
++	/* Kill the cache */
++	vmacache_invalidate(mm);
+ }
+ 
+ /*
+@@ -1980,34 +1982,33 @@ EXPORT_SYMBOL(get_unmapped_area);
+ /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
+ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+ {
+-	struct vm_area_struct *vma = NULL;
++	struct rb_node *rb_node;
++	struct vm_area_struct *vma;
+ 
+ 	/* Check the cache first. */
+-	/* (Cache hit rate is typically around 35%.) */
+-	vma = ACCESS_ONCE(mm->mmap_cache);
+-	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+-		struct rb_node *rb_node;
++	vma = vmacache_find(mm, addr);
++	if (likely(vma))
++		return vma;
+ 
+-		rb_node = mm->mm_rb.rb_node;
+-		vma = NULL;
++	rb_node = mm->mm_rb.rb_node;
++	vma = NULL;
+ 
+-		while (rb_node) {
+-			struct vm_area_struct *vma_tmp;
+-
+-			vma_tmp = rb_entry(rb_node,
+-					   struct vm_area_struct, vm_rb);
+-
+-			if (vma_tmp->vm_end > addr) {
+-				vma = vma_tmp;
+-				if (vma_tmp->vm_start <= addr)
+-					break;
+-				rb_node = rb_node->rb_left;
+-			} else
+-				rb_node = rb_node->rb_right;
+-		}
+-		if (vma)
+-			mm->mmap_cache = vma;
++	while (rb_node) {
++		struct vm_area_struct *tmp;
++
++		tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
++
++		if (tmp->vm_end > addr) {
++			vma = tmp;
++			if (tmp->vm_start <= addr)
++				break;
++			rb_node = rb_node->rb_left;
++		} else
++			rb_node = rb_node->rb_right;
+ 	}
++
++	if (vma)
++		vmacache_update(addr, vma);
+ 	return vma;
+ }
+ 
+@@ -2379,7 +2380,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	} else
+ 		mm->highest_vm_end = prev ? prev->vm_end : 0;
+ 	tail_vma->vm_next = NULL;
+-	mm->mmap_cache = NULL;		/* Kill the cache. */
++
++	/* Kill the cache */
++	vmacache_invalidate(mm);
+ }
+ 
+ /*
+diff --git a/mm/nommu.c b/mm/nommu.c
+index ecd1f158548e..1221d2b66e97 100644
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -15,6 +15,7 @@
+ 
+ #include <linux/export.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/mman.h>
+ #include <linux/swap.h>
+ #include <linux/file.h>
+@@ -767,16 +768,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+  */
+ static void delete_vma_from_mm(struct vm_area_struct *vma)
+ {
++	int i;
+ 	struct address_space *mapping;
+ 	struct mm_struct *mm = vma->vm_mm;
+ 
+ 	kenter("%p", vma);
+ 
+ 	protect_vma(vma, 0);
+ 
+ 	mm->map_count--;
+-	if (mm->mmap_cache == vma)
+-		mm->mmap_cache = NULL;
++	for (i = 0; i < VMACACHE_SIZE; i++) {
++		/* if the vma is cached, invalidate the entire cache */
++		if (current->vmacache[i] == vma) {
++			/* use the vma's own mm: current->mm may be NULL here */
++			vmacache_invalidate(mm);
++			break;
++		}
++	}
+ 
+ 	/* remove the VMA from the mapping */
+ 	if (vma->vm_file) {
+@@ -824,8 +832,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+ 	struct vm_area_struct *vma;
+ 
+ 	/* check the cache first */
+-	vma = ACCESS_ONCE(mm->mmap_cache);
+-	if (vma && vma->vm_start <= addr && vma->vm_end > addr)
++	vma = vmacache_find(mm, addr);
++	if (likely(vma))
+ 		return vma;
+ 
+ 	/* trawl the list (there may be multiple mappings in which addr
+@@ -834,7 +842,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+ 		if (vma->vm_start > addr)
+ 			return NULL;
+ 		if (vma->vm_end > addr) {
+-			mm->mmap_cache = vma;
++			vmacache_update(addr, vma);
+ 			return vma;
+ 		}
+ 	}
+@@ -873,8 +881,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+ 	unsigned long end = addr + len;
+ 
+ 	/* check the cache first */
+-	vma = mm->mmap_cache;
+-	if (vma && vma->vm_start == addr && vma->vm_end == end)
++	vma = vmacache_find_exact(mm, addr, end);
++	if (vma)
+ 		return vma;
+ 
+ 	/* trawl the list (there may be multiple mappings in which addr
+@@ -885,7 +893,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+ 		if (vma->vm_start > addr)
+ 			return NULL;
+ 		if (vma->vm_end == end) {
+-			mm->mmap_cache = vma;
++			vmacache_update(addr, vma);
+ 			return vma;
+ 		}
+ 	}
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index a280f772bc66..2f91223dbe93 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -405,7 +405,8 @@ static int destroy_compound_page(struct page *page, unsigned long order)
+ 	return bad;
+ }
+ 
+-static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
++static inline void prep_zero_page(struct page *page, unsigned int order,
++							gfp_t gfp_flags)
+ {
+ 	int i;
+ 
+@@ -449,7 +450,7 @@ static inline void set_page_guard_flag(struct page *page) { }
+ static inline void clear_page_guard_flag(struct page *page) { }
+ #endif
+ 
+-static inline void set_page_order(struct page *page, int order)
++static inline void set_page_order(struct page *page, unsigned int order)
+ {
+ 	set_page_private(page, order);
+ 	__SetPageBuddy(page);
+@@ -500,21 +501,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
+  * For recording page's order, we use page_private(page).
+  */
+ static inline int page_is_buddy(struct page *page, struct page *buddy,
+-								int order)
++							unsigned int order)
+ {
+ 	if (!pfn_valid_within(page_to_pfn(buddy)))
+ 		return 0;
+ 
+-	if (page_zone_id(page) != page_zone_id(buddy))
+-		return 0;
+-
+ 	if (page_is_guard(buddy) && page_order(buddy) == order) {
+ 		VM_BUG_ON(page_count(buddy) != 0);
++
++		if (page_zone_id(page) != page_zone_id(buddy))
++			return 0;
++
+ 		return 1;
+ 	}
+ 
+ 	if (PageBuddy(buddy) && page_order(buddy) == order) {
+ 		VM_BUG_ON(page_count(buddy) != 0);
++
++		/*
++		 * zone check is done late to avoid uselessly
++		 * calculating zone/node ids for pages that could
++		 * never merge.
++		 */
++		if (page_zone_id(page) != page_zone_id(buddy))
++			return 0;
++
+ 		return 1;
+ 	}
+ 	return 0;
+@@ -546,6 +557,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
+  */
+ 
+ static inline void __free_one_page(struct page *page,
++		unsigned long pfn,
+ 		struct zone *zone, unsigned int order,
+ 		int migratetype)
+ {
+@@ -562,7 +574,7 @@ static inline void __free_one_page(struct page *page,
+ 
+ 	VM_BUG_ON(migratetype == -1);
+ 
+-	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
++	page_idx = pfn & ((1 << MAX_ORDER) - 1);
+ 
+ 	VM_BUG_ON(page_idx & ((1 << order) - 1));
+ 	VM_BUG_ON(bad_range(zone, page));
+@@ -652,9 +664,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
+ 	int migratetype = 0;
+ 	int batch_free = 0;
+ 	int to_free = count;
++	unsigned long nr_scanned;
+ 
+ 	spin_lock(&zone->lock);
+-	zone->pages_scanned = 0;
++	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
++	if (nr_scanned)
++		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+ 
+ 	while (to_free) {
+ 		struct page *page;
+@@ -686,7 +701,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
+ 			list_del(&page->lru);
+ 			mt = get_freepage_migratetype(page);
+ 			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+-			__free_one_page(page, zone, 0, mt);
++			__free_one_page(page, page_to_pfn(page), zone, 0, mt);
+ 			trace_mm_page_pcpu_drain(page, 0, mt);
+ 			if (likely(!is_migrate_isolate_page(page))) {
+ 				__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
+@@ -698,13 +713,18 @@ static void free_pcppages_bulk(struct zone *zone, int count,
+ 	spin_unlock(&zone->lock);
+ }
+ 
+-static void free_one_page(struct zone *zone, struct page *page, int order,
++static void free_one_page(struct zone *zone,
++				struct page *page, unsigned long pfn,
++				unsigned int order,
+ 				int migratetype)
+ {
++	unsigned long nr_scanned;
+ 	spin_lock(&zone->lock);
+-	zone->pages_scanned = 0;
++	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
++	if (nr_scanned)
++		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+ 
+-	__free_one_page(page, zone, order, migratetype);
++	__free_one_page(page, pfn, zone, order, migratetype);
+ 	if (unlikely(!is_migrate_isolate(migratetype)))
+ 		__mod_zone_freepage_state(zone, 1 << order, migratetype);
+ 	spin_unlock(&zone->lock);
+@@ -741,15 +761,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)
+ {
+ 	unsigned long flags;
+ 	int migratetype;
++	unsigned long pfn = page_to_pfn(page);
+ 
+ 	if (!free_pages_prepare(page, order))
+ 		return;
+ 
++	migratetype = get_pfnblock_migratetype(page, pfn);
+ 	local_irq_save(flags);
+ 	__count_vm_events(PGFREE, 1 << order);
+-	migratetype = get_pageblock_migratetype(page);
+ 	set_freepage_migratetype(page, migratetype);
+-	free_one_page(page_zone(page), page, order, migratetype);
++	free_one_page(page_zone(page), page, pfn, order, migratetype);
+ 	local_irq_restore(flags);
+ }
+ 
+@@ -869,7 +890,7 @@ static inline int check_new_page(struct page *page)
+ 	return 0;
+ }
+ 
+-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
++static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
+ {
+ 	int i;
+ 
+@@ -918,6 +939,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
+ 		rmv_page_order(page);
+ 		area->nr_free--;
+ 		expand(zone, page, order, current_order, area, migratetype);
++		set_freepage_migratetype(page, migratetype);
+ 		return page;
+ 	}
+ 
+@@ -1042,6 +1064,12 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
+ {
+ 	int current_order = page_order(page);
+ 
++	/*
++	 * When borrowing from MIGRATE_CMA, we need to release the excess
++	 * buddy pages to CMA itself. We also ensure the freepage_migratetype
++	 * is set to CMA so it is returned to the correct freelist in case
++	 * the page ends up being not actually allocated from the pcp lists.
++	 */
+ 	if (is_migrate_cma(fallback_type))
+ 		return fallback_type;
+ 
+@@ -1073,16 +1101,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
+ 
+ /* Remove an element from the buddy allocator from the fallback list */
+ static inline struct page *
+-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
++__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
+ {
+ 	struct free_area *area;
+-	int current_order;
++	unsigned int current_order;
+ 	struct page *page;
+ 	int migratetype, new_type, i;
+ 
+ 	/* Find the largest possible block of pages in the other list */
+-	for (current_order = MAX_ORDER-1; current_order >= order;
+-						--current_order) {
++	for (current_order = MAX_ORDER-1;
++				current_order >= order && current_order <= MAX_ORDER-1;
++				--current_order) {
+ 		for (i = 0;; i++) {
+ 			migratetype = fallbacks[start_migratetype][i];
+ 
+@@ -1106,21 +1135,17 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
+ 			list_del(&page->lru);
+ 			rmv_page_order(page);
+ 
+-			/*
+-			 * Borrow the excess buddy pages as well, irrespective
+-			 * of whether we stole freepages, or took ownership of
+-			 * the pageblock or not.
+-			 *
+-			 * Exception: When borrowing from MIGRATE_CMA, release
+-			 * the excess buddy pages to CMA itself.
+-			 */
+ 			expand(zone, page, order, current_order, area,
+-			       is_migrate_cma(migratetype)
+-			     ? migratetype : start_migratetype);
++			       new_type);
++			/* The freepage_migratetype may differ from pageblock's
++			 * migratetype depending on the decisions in
++			 * try_to_steal_freepages. This is OK as long as it does
++			 * not differ for MIGRATE_CMA type.
++			 */
++			set_freepage_migratetype(page, new_type);
+ 
+-			trace_mm_page_alloc_extfrag(page, order,
+-				current_order, start_migratetype, migratetype,
+-				new_type == start_migratetype);
++			trace_mm_page_alloc_extfrag(page, order, current_order,
++				start_migratetype, migratetype, new_type);
+ 
+ 			return page;
+ 		}
+@@ -1166,9 +1191,9 @@ retry_reserve:
+  */
+ static int rmqueue_bulk(struct zone *zone, unsigned int order,
+ 			unsigned long count, struct list_head *list,
+-			int migratetype, int cold)
++			int migratetype, bool cold)
+ {
+-	int mt = migratetype, i;
++	int i;
+ 
+ 	spin_lock(&zone->lock);
+ 	for (i = 0; i < count; ++i) {
+@@ -1185,18 +1210,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
+ 		 * merge IO requests if the physical pages are ordered
+ 		 * properly.
+ 		 */
+-		if (likely(cold == 0))
++		if (likely(!cold))
+ 			list_add(&page->lru, list);
+ 		else
+ 			list_add_tail(&page->lru, list);
+-		if (IS_ENABLED(CONFIG_CMA)) {
+-			mt = get_pageblock_migratetype(page);
+-			if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
+-				mt = migratetype;
+-		}
+-		set_freepage_migratetype(page, mt);
+ 		list = &page->lru;
+-		if (is_migrate_cma(mt))
++		if (is_migrate_cma(get_freepage_migratetype(page)))
+ 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
+ 					      -(1 << order));
+ 	}
+@@ -1320,7 +1339,7 @@ void mark_free_pages(struct zone *zone)
+ {
+ 	unsigned long pfn, max_zone_pfn;
+ 	unsigned long flags;
+-	int order, t;
++	unsigned int order, t;
+ 	struct list_head *curr;
+ 
+ 	if (zone_is_empty(zone))
+@@ -1352,19 +1371,20 @@ void mark_free_pages(struct zone *zone)
+ 
+ /*
+  * Free a 0-order page
+- * cold == 1 ? free a cold page : free a hot page
++ * cold == true ? free a cold page : free a hot page
+  */
+-void free_hot_cold_page(struct page *page, int cold)
++void free_hot_cold_page(struct page *page, bool cold)
+ {
+ 	struct zone *zone = page_zone(page);
+ 	struct per_cpu_pages *pcp;
+ 	unsigned long flags;
++	unsigned long pfn = page_to_pfn(page);
+ 	int migratetype;
+ 
+ 	if (!free_pages_prepare(page, 0))
+ 		return;
+ 
+-	migratetype = get_pageblock_migratetype(page);
++	migratetype = get_pfnblock_migratetype(page, pfn);
+ 	set_freepage_migratetype(page, migratetype);
+ 	local_irq_save(flags);
+ 	__count_vm_event(PGFREE);
+@@ -1378,17 +1398,17 @@ void free_hot_cold_page(struct page *page, int cold)
+ 	 */
+ 	if (migratetype >= MIGRATE_PCPTYPES) {
+ 		if (unlikely(is_migrate_isolate(migratetype))) {
+-			free_one_page(zone, page, 0, migratetype);
++			free_one_page(zone, page, pfn, 0, migratetype);
+ 			goto out;
+ 		}
+ 		migratetype = MIGRATE_MOVABLE;
+ 	}
+ 
+ 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
+-	if (cold)
+-		list_add_tail(&page->lru, &pcp->lists[migratetype]);
+-	else
++	if (!cold)
+ 		list_add(&page->lru, &pcp->lists[migratetype]);
++	else
++		list_add_tail(&page->lru, &pcp->lists[migratetype]);
+ 	pcp->count++;
+ 	if (pcp->count >= pcp->high) {
+ 		unsigned long batch = ACCESS_ONCE(pcp->batch);
+@@ -1403,7 +1423,7 @@ out:
+ /*
+  * Free a list of 0-order pages
+  */
+-void free_hot_cold_page_list(struct list_head *list, int cold)
++void free_hot_cold_page_list(struct list_head *list, bool cold)
+ {
+ 	struct page *page, *next;
+ 
+@@ -1515,12 +1535,12 @@ int split_free_page(struct page *page)
+  */
+ static inline
+ struct page *buffered_rmqueue(struct zone *preferred_zone,
+-			struct zone *zone, int order, gfp_t gfp_flags,
+-			int migratetype)
++			struct zone *zone, unsigned int order,
++			gfp_t gfp_flags, int migratetype)
+ {
+ 	unsigned long flags;
+ 	struct page *page;
+-	int cold = !!(gfp_flags & __GFP_COLD);
++	bool cold = ((gfp_flags & __GFP_COLD) != 0);
+ 
+ again:
+ 	if (likely(order == 0)) {
+@@ -1565,10 +1585,13 @@ again:
+ 		if (!page)
+ 			goto failed;
+ 		__mod_zone_freepage_state(zone, -(1 << order),
+-					  get_pageblock_migratetype(page));
++					  get_freepage_migratetype(page));
+ 	}
+ 
+ 	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
++	if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
++	    !zone_is_fair_depleted(zone))
++		zone_set_flag(zone, ZONE_FAIR_DEPLETED);
+ 
+ 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
+ 	zone_statistics(preferred_zone, zone, gfp_flags);
+@@ -1665,12 +1688,12 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+  * Return true if free pages are above 'mark'. This takes into account the order
+  * of the allocation.
+  */
+-static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+-		      int classzone_idx, int alloc_flags, long free_pages)
++static bool __zone_watermark_ok(struct zone *z, unsigned int order,
++			unsigned long mark, int classzone_idx, int alloc_flags,
++			long free_pages)
+ {
+ 	/* free_pages my go negative - that's OK */
+ 	long min = mark;
+-	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
+ 	int o;
+ 	long free_cma = 0;
+ 
+@@ -1685,7 +1708,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ 		free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+ #endif
+ 
+-	if (free_pages - free_cma <= min + lowmem_reserve)
++	if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
+ 		return false;
+ 	for (o = 0; o < order; o++) {
+ 		/* At the next order, this order's pages become unavailable */
+@@ -1700,15 +1723,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ 	return true;
+ }
+ 
+-bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
++bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+ 		      int classzone_idx, int alloc_flags)
+ {
+ 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ 					zone_page_state(z, NR_FREE_PAGES));
+ }
+ 
+-bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+-		      int classzone_idx, int alloc_flags)
++bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
++			unsigned long mark, int classzone_idx, int alloc_flags)
+ {
+ 	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+ 
+@@ -1850,7 +1873,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
+ {
+ 	int i;
+ 
+-	for_each_online_node(i)
++	for_each_node_state(i, N_MEMORY)
+ 		if (node_distance(nid, i) <= RECLAIM_DISTANCE)
+ 			node_set(i, NODE_DATA(nid)->reclaim_nodes);
+ 		else
+@@ -1893,6 +1916,18 @@ static inline void init_zone_allows_reclaim(int nid)
+ }
+ #endif	/* CONFIG_NUMA */
+ 
++static void reset_alloc_batches(struct zone *preferred_zone)
++{
++	struct zone *zone = preferred_zone->zone_pgdat->node_zones;
++
++	do {
++		mod_zone_page_state(zone, NR_ALLOC_BATCH,
++			high_wmark_pages(zone) - low_wmark_pages(zone) -
++			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
++		zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
++	} while (zone++ != preferred_zone);
++}
++
+ /*
+  * get_page_from_freelist goes through the zonelist trying to allocate
+  * a page.
+@@ -1900,18 +1935,22 @@ static inline void init_zone_allows_reclaim(int nid)
+ static struct page *
+ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
+ 		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
+-		struct zone *preferred_zone, int migratetype)
++		struct zone *preferred_zone, int classzone_idx, int migratetype)
+ {
+ 	struct zoneref *z;
+ 	struct page *page = NULL;
+-	int classzone_idx;
+ 	struct zone *zone;
+ 	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+ 	int zlc_active = 0;		/* set if using zonelist_cache */
+ 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
++	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
++				(gfp_mask & __GFP_WRITE);
++	int nr_fair_skipped = 0;
++	bool zonelist_rescan;
+ 
+-	classzone_idx = zone_idx(preferred_zone);
+ zonelist_scan:
++	zonelist_rescan = false;
++
+ 	/*
+ 	 * Scan zonelist, looking for a zone with enough free.
+ 	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
+@@ -1923,12 +1962,10 @@ zonelist_scan:
+ 		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
+ 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
+ 				continue;
+-		if ((alloc_flags & ALLOC_CPUSET) &&
++		if (cpusets_enabled() &&
++			(alloc_flags & ALLOC_CPUSET) &&
+ 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
+ 				continue;
+-		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
+-		if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
+-			goto try_this_zone;
+ 		/*
+ 		 * Distribute pages in proportion to the individual
+ 		 * zone size to ensure fair page aging.  The zone a
+@@ -1937,9 +1974,11 @@ zonelist_scan:
+ 		 */
+ 		if (alloc_flags & ALLOC_FAIR) {
+ 			if (!zone_local(preferred_zone, zone))
++				break;
++			if (zone_is_fair_depleted(zone)) {
++				nr_fair_skipped++;
+ 				continue;
+-			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+-				continue;
++			}
+ 		}
+ 		/*
+ 		 * When allocating a page cache page for writing, we
+@@ -1967,15 +2006,19 @@ zonelist_scan:
+ 		 * will require awareness of zones in the
+ 		 * dirty-throttling and the flusher threads.
+ 		 */
+-		if ((alloc_flags & ALLOC_WMARK_LOW) &&
+-		    (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+-			goto this_zone_full;
++		if (consider_zone_dirty && !zone_dirty_ok(zone))
++			continue;
+ 
+ 		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+ 		if (!zone_watermark_ok(zone, order, mark,
+ 				       classzone_idx, alloc_flags)) {
+ 			int ret;
+ 
++			/* Checked here to keep the fast path fast */
++			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
++			if (alloc_flags & ALLOC_NO_WATERMARKS)
++				goto try_this_zone;
++
+ 			if (IS_ENABLED(CONFIG_NUMA) &&
+ 					!did_zlc_setup && nr_online_nodes > 1) {
+ 				/*
+@@ -2037,17 +2080,11 @@ try_this_zone:
+ 		if (page)
+ 			break;
+ this_zone_full:
+-		if (IS_ENABLED(CONFIG_NUMA))
++		if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
+ 			zlc_mark_zone_full(zonelist, z);
+ 	}
+ 
+-	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
+-		/* Disable zlc cache for second zonelist scan */
+-		zlc_active = 0;
+-		goto zonelist_scan;
+-	}
+-
+-	if (page)
++	if (page) {
+ 		/*
+ 		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+ 		 * necessary to allocate the page. The expectation is
+@@ -2056,8 +2093,37 @@ this_zone_full:
+ 		 * for !PFMEMALLOC purposes.
+ 		 */
+ 		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
++		return page;
++	}
+ 
+-	return page;
++	/*
++	 * The first pass makes sure allocations are spread fairly within the
++	 * local node.  However, the local node might have free pages left
++	 * after the fairness batches are exhausted, and remote zones haven't
++	 * even been considered yet.  Try once more without fairness, and
++	 * include remote zones now, before entering the slowpath and waking
++	 * kswapd: prefer spilling to a remote zone over swapping locally.
++	 */
++	if (alloc_flags & ALLOC_FAIR) {
++		alloc_flags &= ~ALLOC_FAIR;
++		if (nr_fair_skipped) {
++			zonelist_rescan = true;
++			reset_alloc_batches(preferred_zone);
++		}
++		if (nr_online_nodes > 1)
++			zonelist_rescan = true;
++	}
++
++	if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
++		/* Disable zlc cache for second zonelist scan */
++		zlc_active = 0;
++		zonelist_rescan = true;
++	}
++
++	if (zonelist_rescan)
++		goto zonelist_scan;
++
++	return NULL;
+ }
+ 
+ /*
+@@ -2173,7 +2239,7 @@ static inline struct page *
+ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+ 	struct zonelist *zonelist, enum zone_type high_zoneidx,
+ 	nodemask_t *nodemask, struct zone *preferred_zone,
+-	int migratetype)
++	int classzone_idx, int migratetype)
+ {
+ 	struct page *page;
+ 
+@@ -2191,7 +2257,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+ 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+ 		order, zonelist, high_zoneidx,
+ 		ALLOC_WMARK_HIGH|ALLOC_CPUSET,
+-		preferred_zone, migratetype);
++		preferred_zone, classzone_idx, migratetype);
+ 	if (page)
+ 		goto out;
+ 
+@@ -2226,7 +2292,7 @@ static struct page *
+ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ 	struct zonelist *zonelist, enum zone_type high_zoneidx,
+ 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+-	int migratetype, bool sync_migration,
++	int classzone_idx, int migratetype, enum migrate_mode mode,
+ 	bool *contended_compaction, bool *deferred_compaction,
+ 	unsigned long *did_some_progress)
+ {
+@@ -2240,7 +2306,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ 
+ 	current->flags |= PF_MEMALLOC;
+ 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+-						nodemask, sync_migration,
++						nodemask, mode,
+ 						contended_compaction);
+ 	current->flags &= ~PF_MEMALLOC;
+ 
+@@ -2254,13 +2320,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ 		page = get_page_from_freelist(gfp_mask, nodemask,
+ 				order, zonelist, high_zoneidx,
+ 				alloc_flags & ~ALLOC_NO_WATERMARKS,
+-				preferred_zone, migratetype);
++				preferred_zone, classzone_idx, migratetype);
+ 		if (page) {
+ 			preferred_zone->compact_blockskip_flush = false;
+-			preferred_zone->compact_considered = 0;
+-			preferred_zone->compact_defer_shift = 0;
+-			if (order >= preferred_zone->compact_order_failed)
+-				preferred_zone->compact_order_failed = order + 1;
++			compaction_defer_reset(preferred_zone, order, true);
+ 			count_vm_event(COMPACTSUCCESS);
+ 			return page;
+ 		}
+@@ -2276,7 +2339,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ 		 * As async compaction considers a subset of pageblocks, only
+ 		 * defer if the failure was a sync compaction failure.
+ 		 */
+-		if (sync_migration)
++		if (mode != MIGRATE_ASYNC)
+ 			defer_compaction(preferred_zone, order);
+ 
+ 		cond_resched();
+@@ -2289,9 +2352,9 @@ static inline struct page *
+ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ 	struct zonelist *zonelist, enum zone_type high_zoneidx,
+ 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+-	int migratetype, bool sync_migration,
+-	bool *contended_compaction, bool *deferred_compaction,
+-	unsigned long *did_some_progress)
++	int classzone_idx, int migratetype,
++	enum migrate_mode mode, bool *contended_compaction,
++	bool *deferred_compaction, unsigned long *did_some_progress)
+ {
+ 	return NULL;
+ }
+@@ -2330,7 +2393,7 @@ static inline struct page *
+ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+ 	struct zonelist *zonelist, enum zone_type high_zoneidx,
+ 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+-	int migratetype, unsigned long *did_some_progress)
++	int classzone_idx, int migratetype, unsigned long *did_some_progress)
+ {
+ 	struct page *page = NULL;
+ 	bool drained = false;
+@@ -2348,7 +2411,8 @@ retry:
+ 	page = get_page_from_freelist(gfp_mask, nodemask, order,
+ 					zonelist, high_zoneidx,
+ 					alloc_flags & ~ALLOC_NO_WATERMARKS,
+-					preferred_zone, migratetype);
++					preferred_zone, classzone_idx,
++					migratetype);
+ 
+ 	/*
+ 	 * If an allocation failed after direct reclaim, it could be because
+@@ -2371,14 +2435,14 @@ static inline struct page *
+ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+ 	struct zonelist *zonelist, enum zone_type high_zoneidx,
+ 	nodemask_t *nodemask, struct zone *preferred_zone,
+-	int migratetype)
++	int classzone_idx, int migratetype)
+ {
+ 	struct page *page;
+ 
+ 	do {
+ 		page = get_page_from_freelist(gfp_mask, nodemask, order,
+ 			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
+-			preferred_zone, migratetype);
++			preferred_zone, classzone_idx, migratetype);
+ 
+ 		if (!page && gfp_mask & __GFP_NOFAIL)
+ 			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
+@@ -2387,28 +2451,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+ 	return page;
+ }
+ 
+-static void reset_alloc_batches(struct zonelist *zonelist,
+-				enum zone_type high_zoneidx,
+-				struct zone *preferred_zone)
+-{
+-	struct zoneref *z;
+-	struct zone *zone;
+-
+-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+-		/*
+-		 * Only reset the batches of zones that were actually
+-		 * considered in the fairness pass, we don't want to
+-		 * trash fairness information for zones that are not
+-		 * actually part of this zonelist's round-robin cycle.
+-		 */
+-		if (!zone_local(preferred_zone, zone))
+-			continue;
+-		mod_zone_page_state(zone, NR_ALLOC_BATCH,
+-			high_wmark_pages(zone) - low_wmark_pages(zone) -
+-			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+-	}
+-}
+-
+ static void wake_all_kswapds(unsigned int order,
+ 			     struct zonelist *zonelist,
+ 			     enum zone_type high_zoneidx,
+@@ -2479,14 +2521,14 @@ static inline struct page *
+ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+ 	struct zonelist *zonelist, enum zone_type high_zoneidx,
+ 	nodemask_t *nodemask, struct zone *preferred_zone,
+-	int migratetype)
++	int classzone_idx, int migratetype)
+ {
+ 	const gfp_t wait = gfp_mask & __GFP_WAIT;
+ 	struct page *page = NULL;
+ 	int alloc_flags;
+ 	unsigned long pages_reclaimed = 0;
+ 	unsigned long did_some_progress;
+-	bool sync_migration = false;
++	enum migrate_mode migration_mode = MIGRATE_ASYNC;
+ 	bool deferred_compaction = false;
+ 	bool contended_compaction = false;
+ 
+@@ -2528,15 +2570,19 @@ restart:
+ 	 * Find the true preferred zone if the allocation is unconstrained by
+ 	 * cpusets.
+ 	 */
+-	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+-		first_zones_zonelist(zonelist, high_zoneidx, NULL,
+-					&preferred_zone);
++	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
++		struct zoneref *preferred_zoneref;
++		preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
++				NULL,
++				&preferred_zone);
++		classzone_idx = zonelist_zone_idx(preferred_zoneref);
++	}
+ 
+ rebalance:
+ 	/* This is the last chance, in general, before the goto nopage. */
+ 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
+ 			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
+-			preferred_zone, migratetype);
++			preferred_zone, classzone_idx, migratetype);
+ 	if (page)
+ 		goto got_pg;
+ 
+@@ -2551,7 +2597,7 @@ rebalance:
+ 
+ 		page = __alloc_pages_high_priority(gfp_mask, order,
+ 				zonelist, high_zoneidx, nodemask,
+-				preferred_zone, migratetype);
++				preferred_zone, classzone_idx, migratetype);
+ 		if (page) {
+ 			goto got_pg;
+ 		}
+@@ -2573,17 +2619,16 @@ rebalance:
+ 	 * Try direct compaction. The first pass is asynchronous. Subsequent
+ 	 * attempts after direct reclaim are synchronous
+ 	 */
+-	page = __alloc_pages_direct_compact(gfp_mask, order,
+-					zonelist, high_zoneidx,
+-					nodemask,
+-					alloc_flags, preferred_zone,
+-					migratetype, sync_migration,
+-					&contended_compaction,
++	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
++					high_zoneidx, nodemask, alloc_flags,
++					preferred_zone,
++					classzone_idx, migratetype,
++					migration_mode, &contended_compaction,
+ 					&deferred_compaction,
+ 					&did_some_progress);
+ 	if (page)
+ 		goto got_pg;
+-	sync_migration = true;
++	migration_mode = MIGRATE_SYNC_LIGHT;
+ 
+ 	/*
+ 	 * If compaction is deferred for high-order allocations, it is because
+@@ -2600,7 +2645,8 @@ rebalance:
+ 					zonelist, high_zoneidx,
+ 					nodemask,
+ 					alloc_flags, preferred_zone,
+-					migratetype, &did_some_progress);
++					classzone_idx, migratetype,
++					&did_some_progress);
+ 	if (page)
+ 		goto got_pg;
+ 
+@@ -2619,7 +2665,7 @@ rebalance:
+ 			page = __alloc_pages_may_oom(gfp_mask, order,
+ 					zonelist, high_zoneidx,
+ 					nodemask, preferred_zone,
+-					migratetype);
++					classzone_idx, migratetype);
+ 			if (page)
+ 				goto got_pg;
+ 
+@@ -2658,12 +2704,11 @@ rebalance:
+ 		 * direct reclaim and reclaim/compaction depends on compaction
+ 		 * being called after reclaim so call directly if necessary
+ 		 */
+-		page = __alloc_pages_direct_compact(gfp_mask, order,
+-					zonelist, high_zoneidx,
+-					nodemask,
+-					alloc_flags, preferred_zone,
+-					migratetype, sync_migration,
+-					&contended_compaction,
++		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
++					high_zoneidx, nodemask, alloc_flags,
++					preferred_zone,
++					classzone_idx, migratetype,
++					migration_mode, &contended_compaction,
+ 					&deferred_compaction,
+ 					&did_some_progress);
+ 		if (page)
+@@ -2689,11 +2734,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+ {
+ 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ 	struct zone *preferred_zone;
++	struct zoneref *preferred_zoneref;
+ 	struct page *page = NULL;
+ 	int migratetype = allocflags_to_migratetype(gfp_mask);
+ 	unsigned int cpuset_mems_cookie;
+ 	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
+ 	struct mem_cgroup *memcg = NULL;
++	int classzone_idx;
+ 
+ 	gfp_mask &= gfp_allowed_mask;
+ 
+@@ -2720,42 +2767,26 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+ 		return NULL;
+ 
+ retry_cpuset:
+-	cpuset_mems_cookie = get_mems_allowed();
++	cpuset_mems_cookie = read_mems_allowed_begin();
+ 
+ 	/* The preferred zone is used for statistics later */
+-	first_zones_zonelist(zonelist, high_zoneidx,
++	preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
+ 				nodemask ? : &cpuset_current_mems_allowed,
+ 				&preferred_zone);
+ 	if (!preferred_zone)
+ 		goto out;
++	classzone_idx = zonelist_zone_idx(preferred_zoneref);
+ 
+ #ifdef CONFIG_CMA
+ 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ 		alloc_flags |= ALLOC_CMA;
+ #endif
+-retry:
+ 	/* First allocation attempt */
+ 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+ 			zonelist, high_zoneidx, alloc_flags,
+-			preferred_zone, migratetype);
++			preferred_zone, classzone_idx, migratetype);
+ 	if (unlikely(!page)) {
+ 		/*
+-		 * The first pass makes sure allocations are spread
+-		 * fairly within the local node.  However, the local
+-		 * node might have free pages left after the fairness
+-		 * batches are exhausted, and remote zones haven't
+-		 * even been considered yet.  Try once more without
+-		 * fairness, and include remote zones now, before
+-		 * entering the slowpath and waking kswapd: prefer
+-		 * spilling to a remote zone over swapping locally.
+-		 */
+-		if (alloc_flags & ALLOC_FAIR) {
+-			reset_alloc_batches(zonelist, high_zoneidx,
+-					    preferred_zone);
+-			alloc_flags &= ~ALLOC_FAIR;
+-			goto retry;
+-		}
+-		/*
+ 		 * Runtime PM, block IO and its error handling path
+ 		 * can deadlock because I/O on the device might not
+ 		 * complete.
+@@ -2763,7 +2794,7 @@ retry:
+ 		gfp_mask = memalloc_noio_flags(gfp_mask);
+ 		page = __alloc_pages_slowpath(gfp_mask, order,
+ 				zonelist, high_zoneidx, nodemask,
+-				preferred_zone, migratetype);
++				preferred_zone, classzone_idx, migratetype);
+ 	}
+ 
+ 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+@@ -2775,7 +2806,7 @@ out:
+ 	 * the mask is being updated. If a page allocation is about to fail,
+ 	 * check if the cpuset changed during allocation and if so, retry.
+ 	 */
+-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ 		goto retry_cpuset;
+ 
+ 	memcg_kmem_commit_charge(page, memcg, order);
+@@ -2814,7 +2845,7 @@ void __free_pages(struct page *page, unsigned int order)
+ {
+ 	if (put_page_testzero(page)) {
+ 		if (order == 0)
+-			free_hot_cold_page(page, 0);
++			free_hot_cold_page(page, false);
+ 		else
+ 			__free_pages_ok(page, order);
+ 	}
+@@ -3043,9 +3074,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
+ 		goto out;
+ 
+ 	do {
+-		cpuset_mems_cookie = get_mems_allowed();
++		cpuset_mems_cookie = read_mems_allowed_begin();
+ 		ret = !node_isset(nid, cpuset_current_mems_allowed);
+-	} while (!put_mems_allowed(cpuset_mems_cookie));
++	} while (read_mems_allowed_retry(cpuset_mems_cookie));
+ out:
+ 	return ret;
+ }
+@@ -3198,12 +3229,12 @@ void show_free_areas(unsigned int filter)
+ 			K(zone_page_state(zone, NR_BOUNCE)),
+ 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
+ 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
+-			zone->pages_scanned,
++			K(zone_page_state(zone, NR_PAGES_SCANNED)),
+ 			(!zone_reclaimable(zone) ? "yes" : "no")
+ 			);
+ 		printk("lowmem_reserve[]:");
+ 		for (i = 0; i < MAX_NR_ZONES; i++)
+-			printk(" %lu", zone->lowmem_reserve[i]);
++			printk(" %ld", zone->lowmem_reserve[i]);
+ 		printk("\n");
+ 	}
+ 
+@@ -3943,6 +3974,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
+ 	struct page *page;
+ 	unsigned long block_migratetype;
+ 	int reserve;
++	int old_reserve;
+ 
+ 	/*
+ 	 * Get the start pfn, end pfn and the number of blocks to reserve
+@@ -3964,6 +3996,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
+ 	 * future allocation of hugepages at runtime.
+ 	 */
+ 	reserve = min(2, reserve);
++	old_reserve = zone->nr_migrate_reserve_block;
++
++	/* On memory hot-add there is almost always nothing to do */
++	if (reserve == old_reserve)
++		return;
++	zone->nr_migrate_reserve_block = reserve;
+ 
+ 	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ 		if (!pfn_valid(pfn))
+@@ -4001,6 +4039,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
+ 				reserve--;
+ 				continue;
+ 			}
++		} else if (!old_reserve) {
++			/*
++			 * At boot time we don't need to scan the whole zone
++			 * for turning off MIGRATE_RESERVE.
++			 */
++			break;
+ 		}
+ 
+ 		/*
+@@ -4080,7 +4124,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+ 
+ static void __meminit zone_init_free_lists(struct zone *zone)
+ {
+-	int order, t;
++	unsigned int order, t;
+ 	for_each_migratetype_order(order, t) {
+ 		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
+ 		zone->free_area[order].nr_free = 0;
+@@ -4903,7 +4947,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
+ 
+ 	pgdat->node_id = nid;
+ 	pgdat->node_start_pfn = node_start_pfn;
+-	init_zone_allows_reclaim(nid);
++	if (node_state(nid, N_MEMORY))
++		init_zone_allows_reclaim(nid);
+ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ 	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ #endif
+@@ -5492,7 +5537,7 @@ static void calculate_totalreserve_pages(void)
+ 	for_each_online_pgdat(pgdat) {
+ 		for (i = 0; i < MAX_NR_ZONES; i++) {
+ 			struct zone *zone = pgdat->node_zones + i;
+-			unsigned long max = 0;
++			long max = 0;
+ 
+ 			/* Find valid and maximum lowmem_reserve in the zone */
+ 			for (j = i; j < MAX_NR_ZONES; j++) {
+@@ -5734,7 +5779,12 @@ module_init(init_per_zone_wmark_min)
+ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+ 	void __user *buffer, size_t *length, loff_t *ppos)
+ {
+-	proc_dointvec(table, write, buffer, length, ppos);
++	int rc;
++
++	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
++	if (rc)
++		return rc;
++
+ 	if (write) {
+ 		user_min_free_kbytes = min_free_kbytes;
+ 		setup_per_zone_wmarks();
+@@ -5976,17 +6026,16 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
+  * @end_bitidx: The last bit of interest
+  * returns pageblock_bits flags
+  */
+-unsigned long get_pageblock_flags_mask(struct page *page,
++unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ 					unsigned long end_bitidx,
+ 					unsigned long mask)
+ {
+ 	struct zone *zone;
+ 	unsigned long *bitmap;
+-	unsigned long pfn, bitidx, word_bitidx;
++	unsigned long bitidx, word_bitidx;
+ 	unsigned long word;
+ 
+ 	zone = page_zone(page);
+-	pfn = page_to_pfn(page);
+ 	bitmap = get_pageblock_bitmap(zone, pfn);
+ 	bitidx = pfn_to_bitidx(zone, pfn);
+ 	word_bitidx = bitidx / BITS_PER_LONG;
+@@ -5998,25 +6047,25 @@ unsigned long get_pageblock_flags_mask(struct page *page,
+ }
+ 
+ /**
+- * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
++ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+  * @page: The page within the block of interest
+  * @start_bitidx: The first bit of interest
+  * @end_bitidx: The last bit of interest
+  * @flags: The flags to set
+  */
+-void set_pageblock_flags_mask(struct page *page, unsigned long flags,
++void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
++					unsigned long pfn,
+ 					unsigned long end_bitidx,
+ 					unsigned long mask)
+ {
+ 	struct zone *zone;
+ 	unsigned long *bitmap;
+-	unsigned long pfn, bitidx, word_bitidx;
++	unsigned long bitidx, word_bitidx;
+ 	unsigned long old_word, word;
+ 
+ 	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+ 
+ 	zone = page_zone(page);
+-	pfn = page_to_pfn(page);
+ 	bitmap = get_pageblock_bitmap(zone, pfn);
+ 	bitidx = pfn_to_bitidx(zone, pfn);
+ 	word_bitidx = bitidx / BITS_PER_LONG;
+@@ -6194,7 +6243,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
+ 		cc->nr_migratepages -= nr_reclaimed;
+ 
+ 		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
+-				    0, MIGRATE_SYNC, MR_CMA);
++				    NULL, 0, cc->mode, MR_CMA);
+ 	}
+ 	if (ret < 0) {
+ 		putback_movable_pages(&cc->migratepages);
+@@ -6233,7 +6282,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
+ 		.nr_migratepages = 0,
+ 		.order = -1,
+ 		.zone = page_zone(pfn_to_page(start)),
+-		.sync = true,
++		.mode = MIGRATE_SYNC,
+ 		.ignore_skip_hint = true,
+ 	};
+ 	INIT_LIST_HEAD(&cc.migratepages);
+@@ -6388,7 +6437,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+ {
+ 	struct page *page;
+ 	struct zone *zone;
+-	int order, i;
++	unsigned int order, i;
+ 	unsigned long pfn;
+ 	unsigned long flags;
+ 	/* find the first valid pfn */
+@@ -6440,7 +6489,7 @@ bool is_free_buddy_page(struct page *page)
+ 	struct zone *zone = page_zone(page);
+ 	unsigned long pfn = page_to_pfn(page);
+ 	unsigned long flags;
+-	int order;
++	unsigned int order;
+ 
+ 	spin_lock_irqsave(&zone->lock, flags);
+ 	for (order = 0; order < MAX_ORDER; order++) {
+diff --git a/mm/readahead.c b/mm/readahead.c
+index e4ed04149785..0f35e983bffb 100644
+--- a/mm/readahead.c
++++ b/mm/readahead.c
+@@ -8,9 +8,7 @@
+  */
+ 
+ #include <linux/kernel.h>
+-#include <linux/fs.h>
+ #include <linux/gfp.h>
+-#include <linux/mm.h>
+ #include <linux/export.h>
+ #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
+@@ -20,6 +18,8 @@
+ #include <linux/syscalls.h>
+ #include <linux/file.h>
+ 
++#include "internal.h"
++
+ /*
+  * Initialise a struct file's readahead state.  Assumes that the caller has
+  * memset *ra to zero.
+@@ -149,8 +149,7 @@ out:
+  *
+  * Returns the number of pages requested, or the maximum amount of I/O allowed.
+  */
+-static int
+-__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
++int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+ 			pgoff_t offset, unsigned long nr_to_read,
+ 			unsigned long lookahead_size)
+ {
+@@ -179,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+ 		rcu_read_lock();
+ 		page = radix_tree_lookup(&mapping->page_tree, page_offset);
+ 		rcu_read_unlock();
+-		if (page)
++		if (page && !radix_tree_exceptional_entry(page))
+ 			continue;
+ 
+ 		page = page_cache_alloc_readahead(mapping);
+@@ -237,28 +236,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
+ 	return ret;
+ }
+ 
++#define MAX_READAHEAD   ((512*4096)/PAGE_CACHE_SIZE)
+ /*
+  * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
+  * sensible upper limit.
+  */
+ unsigned long max_sane_readahead(unsigned long nr)
+ {
+-	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
+-		+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
+-}
+-
+-/*
+- * Submit IO for the read-ahead request in file_ra_state.
+- */
+-unsigned long ra_submit(struct file_ra_state *ra,
+-		       struct address_space *mapping, struct file *filp)
+-{
+-	int actual;
+-
+-	actual = __do_page_cache_readahead(mapping, filp,
+-					ra->start, ra->size, ra->async_size);
+-
+-	return actual;
++	return min(nr, MAX_READAHEAD);
+ }
+ 
+ /*
+@@ -351,7 +336,7 @@ static pgoff_t count_history_pages(struct address_space *mapping,
+ 	pgoff_t head;
+ 
+ 	rcu_read_lock();
+-	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
++	head = page_cache_prev_hole(mapping, offset - 1, max);
+ 	rcu_read_unlock();
+ 
+ 	return offset - 1 - head;
+@@ -401,6 +386,7 @@ ondemand_readahead(struct address_space *mapping,
+ 		   unsigned long req_size)
+ {
+ 	unsigned long max = max_sane_readahead(ra->ra_pages);
++	pgoff_t prev_offset;
+ 
+ 	/*
+ 	 * start of file
+@@ -430,7 +416,7 @@ ondemand_readahead(struct address_space *mapping,
+ 		pgoff_t start;
+ 
+ 		rcu_read_lock();
+-		start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
++		start = page_cache_next_hole(mapping, offset + 1, max);
+ 		rcu_read_unlock();
+ 
+ 		if (!start || start - offset > max)
+@@ -452,8 +438,11 @@ ondemand_readahead(struct address_space *mapping,
+ 
+ 	/*
+ 	 * sequential cache miss
++	 * trivial case: (offset - prev_offset) == 1
++	 * unaligned reads: (offset - prev_offset) == 0
+ 	 */
+-	if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
++	prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
++	if (offset - prev_offset <= 1UL)
+ 		goto initial_readahead;
+ 
+ 	/*
+diff --git a/mm/shmem.c b/mm/shmem.c
+index 0da81aaeb4cc..ab05681f41cd 100644
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -243,19 +243,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
+ 			pgoff_t index, void *expected, void *replacement)
+ {
+ 	void **pslot;
+-	void *item = NULL;
++	void *item;
+ 
+ 	VM_BUG_ON(!expected);
++	VM_BUG_ON(!replacement);
+ 	pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
+-	if (pslot)
+-		item = radix_tree_deref_slot_protected(pslot,
+-							&mapping->tree_lock);
++	if (!pslot)
++		return -ENOENT;
++	item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
+ 	if (item != expected)
+ 		return -ENOENT;
+-	if (replacement)
+-		radix_tree_replace_slot(pslot, replacement);
+-	else
+-		radix_tree_delete(&mapping->page_tree, index);
++	radix_tree_replace_slot(pslot, replacement);
+ 	return 0;
+ }
+ 
+@@ -332,84 +330,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
+ }
+ 
+ /*
+- * Like find_get_pages, but collecting swap entries as well as pages.
+- */
+-static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
+-					pgoff_t start, unsigned int nr_pages,
+-					struct page **pages, pgoff_t *indices)
+-{
+-	void **slot;
+-	unsigned int ret = 0;
+-	struct radix_tree_iter iter;
+-
+-	if (!nr_pages)
+-		return 0;
+-
+-	rcu_read_lock();
+-restart:
+-	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+-		struct page *page;
+-repeat:
+-		page = radix_tree_deref_slot(slot);
+-		if (unlikely(!page))
+-			continue;
+-		if (radix_tree_exception(page)) {
+-			if (radix_tree_deref_retry(page))
+-				goto restart;
+-			/*
+-			 * Otherwise, we must be storing a swap entry
+-			 * here as an exceptional entry: so return it
+-			 * without attempting to raise page count.
+-			 */
+-			goto export;
+-		}
+-		if (!page_cache_get_speculative(page))
+-			goto repeat;
+-
+-		/* Has the page moved? */
+-		if (unlikely(page != *slot)) {
+-			page_cache_release(page);
+-			goto repeat;
+-		}
+-export:
+-		indices[ret] = iter.index;
+-		pages[ret] = page;
+-		if (++ret == nr_pages)
+-			break;
+-	}
+-	rcu_read_unlock();
+-	return ret;
+-}
+-
+-/*
+  * Remove swap entry from radix tree, free the swap and its page cache.
+  */
+ static int shmem_free_swap(struct address_space *mapping,
+ 			   pgoff_t index, void *radswap)
+ {
+-	int error;
++	void *old;
+ 
+ 	spin_lock_irq(&mapping->tree_lock);
+-	error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
++	old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
+ 	spin_unlock_irq(&mapping->tree_lock);
+-	if (!error)
+-		free_swap_and_cache(radix_to_swp_entry(radswap));
+-	return error;
+-}
+-
+-/*
+- * Pagevec may contain swap entries, so shuffle up pages before releasing.
+- */
+-static void shmem_deswap_pagevec(struct pagevec *pvec)
+-{
+-	int i, j;
+-
+-	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+-		struct page *page = pvec->pages[i];
+-		if (!radix_tree_exceptional_entry(page))
+-			pvec->pages[j++] = page;
+-	}
+-	pvec->nr = j;
++	if (old != radswap)
++		return -ENOENT;
++	free_swap_and_cache(radix_to_swp_entry(radswap));
++	return 0;
+ }
+ 
+ /*
+@@ -430,12 +364,12 @@ void shmem_unlock_mapping(struct address_space *mapping)
+ 		 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
+ 		 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
+ 		 */
+-		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+-					PAGEVEC_SIZE, pvec.pages, indices);
++		pvec.nr = find_get_entries(mapping, index,
++					   PAGEVEC_SIZE, pvec.pages, indices);
+ 		if (!pvec.nr)
+ 			break;
+ 		index = indices[pvec.nr - 1] + 1;
+-		shmem_deswap_pagevec(&pvec);
++		pagevec_remove_exceptionals(&pvec);
+ 		check_move_unevictable_pages(pvec.pages, pvec.nr);
+ 		pagevec_release(&pvec);
+ 		cond_resched();
+@@ -467,9 +401,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+ 	pagevec_init(&pvec, 0);
+ 	index = start;
+ 	while (index < end) {
+-		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+-				min(end - index, (pgoff_t)PAGEVEC_SIZE),
+-							pvec.pages, indices);
++		pvec.nr = find_get_entries(mapping, index,
++			min(end - index, (pgoff_t)PAGEVEC_SIZE),
++			pvec.pages, indices);
+ 		if (!pvec.nr)
+ 			break;
+ 		mem_cgroup_uncharge_start();
+@@ -498,7 +432,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+ 			}
+ 			unlock_page(page);
+ 		}
+-		shmem_deswap_pagevec(&pvec);
++		pagevec_remove_exceptionals(&pvec);
+ 		pagevec_release(&pvec);
+ 		mem_cgroup_uncharge_end();
+ 		cond_resched();
+@@ -536,9 +470,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+ 	index = start;
+ 	while (index < end) {
+ 		cond_resched();
+-		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
++
++		pvec.nr = find_get_entries(mapping, index,
+ 				min(end - index, (pgoff_t)PAGEVEC_SIZE),
+-							pvec.pages, indices);
++				pvec.pages, indices);
+ 		if (!pvec.nr) {
+ 			/* If all gone or hole-punch or unfalloc, we're done */
+ 			if (index == start || end != -1)
+@@ -581,7 +516,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+ 			}
+ 			unlock_page(page);
+ 		}
+-		shmem_deswap_pagevec(&pvec);
++		pagevec_remove_exceptionals(&pvec);
+ 		pagevec_release(&pvec);
+ 		mem_cgroup_uncharge_end();
+ 		index++;
+@@ -1090,7 +1025,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
+ 		return -EFBIG;
+ repeat:
+ 	swap.val = 0;
+-	page = find_lock_page(mapping, index);
++	page = find_lock_entry(mapping, index);
+ 	if (radix_tree_exceptional_entry(page)) {
+ 		swap = radix_to_swp_entry(page);
+ 		page = NULL;
+@@ -1102,6 +1037,9 @@ repeat:
+ 		goto failed;
+ 	}
+ 
++	if (page && sgp == SGP_WRITE)
++		mark_page_accessed(page);
++
+ 	/* fallocated page? */
+ 	if (page && !PageUptodate(page)) {
+ 		if (sgp != SGP_READ)
+@@ -1183,6 +1121,9 @@ repeat:
+ 		shmem_recalc_inode(inode);
+ 		spin_unlock(&info->lock);
+ 
++		if (sgp == SGP_WRITE)
++			mark_page_accessed(page);
++
+ 		delete_from_swap_cache(page);
+ 		set_page_dirty(page);
+ 		swap_free(swap);
+@@ -1207,8 +1148,11 @@ repeat:
+ 			goto decused;
+ 		}
+ 
+-		SetPageSwapBacked(page);
++		__SetPageSwapBacked(page);
+ 		__set_page_locked(page);
++		if (sgp == SGP_WRITE)
++			init_page_accessed(page);
++
+ 		error = mem_cgroup_cache_charge(page, current->mm,
+ 						gfp & GFP_RECLAIM_MASK);
+ 		if (error)
+@@ -1485,6 +1429,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
+ 	return inode;
+ }
+ 
++bool shmem_mapping(struct address_space *mapping)
++{
++	return mapping->backing_dev_info == &shmem_backing_dev_info;
++}
++
+ #ifdef CONFIG_TMPFS
+ static const struct inode_operations shmem_symlink_inode_operations;
+ static const struct inode_operations shmem_short_symlink_operations;
+@@ -1797,7 +1746,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
+ 	pagevec_init(&pvec, 0);
+ 	pvec.nr = 1;		/* start small: we may be there already */
+ 	while (!done) {
+-		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
++		pvec.nr = find_get_entries(mapping, index,
+ 					pvec.nr, pvec.pages, indices);
+ 		if (!pvec.nr) {
+ 			if (whence == SEEK_DATA)
+@@ -1824,7 +1773,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
+ 				break;
+ 			}
+ 		}
+-		shmem_deswap_pagevec(&pvec);
++		pagevec_remove_exceptionals(&pvec);
+ 		pagevec_release(&pvec);
+ 		pvec.nr = PAGEVEC_SIZE;
+ 		cond_resched();
+diff --git a/mm/slab.c b/mm/slab.c
+index 2580db062df9..eb4078c7d183 100644
+--- a/mm/slab.c
++++ b/mm/slab.c
+@@ -930,7 +930,8 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+ {
+ 	if (unlikely(pfmemalloc_active)) {
+ 		/* Some pfmemalloc slabs exist, check if this is one */
+-		struct page *page = virt_to_head_page(objp);
++		struct slab *slabp = virt_to_slab(objp);
++		struct page *page = virt_to_head_page(slabp->s_mem);
+ 		if (PageSlabPfmemalloc(page))
+ 			set_obj_pfmemalloc(&objp);
+ 	}
+@@ -1776,7 +1777,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+ 		__SetPageSlab(page + i);
+ 
+ 		if (page->pfmemalloc)
+-			SetPageSlabPfmemalloc(page + i);
++			SetPageSlabPfmemalloc(page);
+ 	}
+ 	memcg_bind_pages(cachep, cachep->gfporder);
+ 
+@@ -1809,9 +1810,10 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
+ 	else
+ 		sub_zone_page_state(page_zone(page),
+ 				NR_SLAB_UNRECLAIMABLE, nr_freed);
++
++	__ClearPageSlabPfmemalloc(page);
+ 	while (i--) {
+ 		BUG_ON(!PageSlab(page));
+-		__ClearPageSlabPfmemalloc(page);
+ 		__ClearPageSlab(page);
+ 		page++;
+ 	}
+@@ -3220,7 +3222,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
+ 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
+ 
+ retry_cpuset:
+-	cpuset_mems_cookie = get_mems_allowed();
++	cpuset_mems_cookie = read_mems_allowed_begin();
+ 	zonelist = node_zonelist(slab_node(), flags);
+ 
+ retry:
+@@ -3276,7 +3278,7 @@ retry:
+ 		}
+ 	}
+ 
+-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
++	if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
+ 		goto retry_cpuset;
+ 	return obj;
+ }
+diff --git a/mm/slub.c b/mm/slub.c
+index 5c1343a391d0..a88d94cfee20 100644
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -1635,7 +1635,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
+ 		return NULL;
+ 
+ 	do {
+-		cpuset_mems_cookie = get_mems_allowed();
++		cpuset_mems_cookie = read_mems_allowed_begin();
+ 		zonelist = node_zonelist(slab_node(), flags);
+ 		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ 			struct kmem_cache_node *n;
+@@ -1647,19 +1647,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
+ 				object = get_partial_node(s, n, c, flags);
+ 				if (object) {
+ 					/*
+-					 * Return the object even if
+-					 * put_mems_allowed indicated that
+-					 * the cpuset mems_allowed was
+-					 * updated in parallel. It's a
+-					 * harmless race between the alloc
+-					 * and the cpuset update.
++					 * Don't check read_mems_allowed_retry()
++					 * here - if mems_allowed was updated in
++					 * parallel, that was a harmless race
++					 * between allocation and the cpuset
++					 * update
+ 					 */
+-					put_mems_allowed(cpuset_mems_cookie);
+ 					return object;
+ 				}
+ 			}
+ 		}
+-	} while (!put_mems_allowed(cpuset_mems_cookie));
++	} while (read_mems_allowed_retry(cpuset_mems_cookie));
+ #endif
+ 	return NULL;
+ }
+diff --git a/mm/swap.c b/mm/swap.c
+index aa4da5d9401d..16e70ce1912a 100644
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -68,7 +68,7 @@ static void __page_cache_release(struct page *page)
+ static void __put_single_page(struct page *page)
+ {
+ 	__page_cache_release(page);
+-	free_hot_cold_page(page, 0);
++	free_hot_cold_page(page, false);
+ }
+ 
+ static void __put_compound_page(struct page *page)
+@@ -437,7 +437,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
+ 		SetPageActive(page);
+ 		lru += LRU_ACTIVE;
+ 		add_page_to_lru_list(page, lruvec, lru);
+-		trace_mm_lru_activate(page, page_to_pfn(page));
++		trace_mm_lru_activate(page);
+ 
+ 		__count_vm_event(PGACTIVATE);
+ 		update_page_reclaim_stat(lruvec, file, 1);
+@@ -549,12 +549,17 @@ void mark_page_accessed(struct page *page)
+ EXPORT_SYMBOL(mark_page_accessed);
+ 
+ /*
+- * Queue the page for addition to the LRU via pagevec. The decision on whether
+- * to add the page to the [in]active [file|anon] list is deferred until the
+- * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
+- * have the page added to the active list using mark_page_accessed().
++ * Mark as referenced a page that is not visible yet, while it is still
++ * safe to use non-atomic ops
+  */
+-void __lru_cache_add(struct page *page)
++void init_page_accessed(struct page *page)
++{
++	if (!PageReferenced(page))
++		__SetPageReferenced(page);
++}
++EXPORT_SYMBOL(init_page_accessed);
++
++static void __lru_cache_add(struct page *page)
+ {
+ 	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+ 
+@@ -564,11 +569,34 @@ void __lru_cache_add(struct page *page)
+ 	pagevec_add(pvec, page);
+ 	put_cpu_var(lru_add_pvec);
+ }
+-EXPORT_SYMBOL(__lru_cache_add);
++
++/**
++ * lru_cache_add_anon - add a page to the page lists
++ * @page: the page to add
++ */
++void lru_cache_add_anon(struct page *page)
++{
++	if (PageActive(page))
++		ClearPageActive(page);
++	__lru_cache_add(page);
++}
++
++void lru_cache_add_file(struct page *page)
++{
++	if (PageActive(page))
++		ClearPageActive(page);
++	__lru_cache_add(page);
++}
++EXPORT_SYMBOL(lru_cache_add_file);
+ 
+ /**
+  * lru_cache_add - add a page to a page list
+  * @page: the page to be added to the LRU.
++ *
++ * Queue the page for addition to the LRU via pagevec. The decision on whether
++ * to add the page to the [in]active [file|anon] list is deferred until the
++ * pagevec is drained. This gives a chance for the caller of lru_cache_add()
++ * to have the page added to the active list using mark_page_accessed().
+  */
+ void lru_cache_add(struct page *page)
+ {
+@@ -779,7 +807,7 @@ void lru_add_drain_all(void)
+  * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
+  * will free it.
+  */
+-void release_pages(struct page **pages, int nr, int cold)
++void release_pages(struct page **pages, int nr, bool cold)
+ {
+ 	int i;
+ 	LIST_HEAD(pages_to_free);
+@@ -820,7 +848,7 @@ void release_pages(struct page **pages, int nr, int cold)
+ 		}
+ 
+ 		/* Clear Active bit in case of parallel mark_page_accessed */
+-		ClearPageActive(page);
++		__ClearPageActive(page);
+ 
+ 		list_add(&page->lru, &pages_to_free);
+ 	}
+@@ -902,7 +930,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
+ 	SetPageLRU(page);
+ 	add_page_to_lru_list(page, lruvec, lru);
+ 	update_page_reclaim_stat(lruvec, file, active);
+-	trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
++	trace_mm_lru_insertion(page, lru);
+ }
+ 
+ /*
+@@ -916,6 +944,57 @@ void __pagevec_lru_add(struct pagevec *pvec)
+ EXPORT_SYMBOL(__pagevec_lru_add);
+ 
+ /**
++ * pagevec_lookup_entries - gang pagecache lookup
++ * @pvec:	Where the resulting entries are placed
++ * @mapping:	The address_space to search
++ * @start:	The starting entry index
++ * @nr_pages:	The maximum number of entries
++ * @indices:	The cache indices corresponding to the entries in @pvec
++ *
++ * pagevec_lookup_entries() will search for and return a group of up
++ * to @nr_pages pages and shadow entries in the mapping.  All
++ * entries are placed in @pvec.  pagevec_lookup_entries() takes a
++ * reference against actual pages in @pvec.
++ *
++ * The search returns a group of mapping-contiguous entries with
++ * ascending indexes.  There may be holes in the indices due to
++ * not-present entries.
++ *
++ * pagevec_lookup_entries() returns the number of entries which were
++ * found.
++ */
++unsigned pagevec_lookup_entries(struct pagevec *pvec,
++				struct address_space *mapping,
++				pgoff_t start, unsigned nr_pages,
++				pgoff_t *indices)
++{
++	pvec->nr = find_get_entries(mapping, start, nr_pages,
++				    pvec->pages, indices);
++	return pagevec_count(pvec);
++}
++
++/**
++ * pagevec_remove_exceptionals - pagevec exceptionals pruning
++ * @pvec:	The pagevec to prune
++ *
++ * pagevec_lookup_entries() fills both pages and exceptional radix
++ * tree entries into the pagevec.  This function prunes all
++ * exceptionals from @pvec without leaving holes, so that it can be
++ * passed on to page-only pagevec operations.
++ */
++void pagevec_remove_exceptionals(struct pagevec *pvec)
++{
++	int i, j;
++
++	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
++		struct page *page = pvec->pages[i];
++		if (!radix_tree_exceptional_entry(page))
++			pvec->pages[j++] = page;
++	}
++	pvec->nr = j;
++}
++
++/**
+  * pagevec_lookup - gang pagecache lookup
+  * @pvec:	Where the resulting pages are placed
+  * @mapping:	The address_space to search
+diff --git a/mm/swap_state.c b/mm/swap_state.c
+index e6f15f8ca2af..4079edfff2cc 100644
+--- a/mm/swap_state.c
++++ b/mm/swap_state.c
+@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void)
+ 	return ret;
+ }
+ 
++static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
++
+ void show_swap_cache_info(void)
+ {
+ 	printk("%lu pages in swap cache\n", total_swapcache_pages());
+@@ -268,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
+ 
+ 		for (i = 0; i < todo; i++)
+ 			free_swap_cache(pagep[i]);
+-		release_pages(pagep, todo, 0);
++		release_pages(pagep, todo, false);
+ 		pagep += todo;
+ 		nr -= todo;
+ 	}
+@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry)
+ 
+ 	page = find_get_page(swap_address_space(entry), entry.val);
+ 
+-	if (page)
++	if (page) {
+ 		INC_CACHE_INFO(find_success);
++		if (TestClearPageReadahead(page))
++			atomic_inc(&swapin_readahead_hits);
++	}
+ 
+ 	INC_CACHE_INFO(find_total);
+ 	return page;
+@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ 	return found_page;
+ }
+ 
++static unsigned long swapin_nr_pages(unsigned long offset)
++{
++	static unsigned long prev_offset;
++	unsigned int pages, max_pages, last_ra;
++	static atomic_t last_readahead_pages;
++
++	max_pages = 1 << ACCESS_ONCE(page_cluster);
++	if (max_pages <= 1)
++		return 1;
++
++	/*
++	 * This heuristic has been found to work well on both sequential and
++	 * random loads, swapping to hard disk or to SSD: please don't ask
++	 * what the "+ 2" means, it just happens to work well, that's all.
++	 */
++	pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
++	if (pages == 2) {
++		/*
++		 * We can have no readahead hits to judge by: but must not get
++		 * stuck here forever, so check for an adjacent offset instead
++		 * (and don't even bother to check whether swap type is same).
++		 */
++		if (offset != prev_offset + 1 && offset != prev_offset - 1)
++			pages = 1;
++		prev_offset = offset;
++	} else {
++		unsigned int roundup = 4;
++		while (roundup < pages)
++			roundup <<= 1;
++		pages = roundup;
++	}
++
++	if (pages > max_pages)
++		pages = max_pages;
++
++	/* Don't shrink readahead too fast */
++	last_ra = atomic_read(&last_readahead_pages) / 2;
++	if (pages < last_ra)
++		pages = last_ra;
++	atomic_set(&last_readahead_pages, pages);
++
++	return pages;
++}
++
+ /**
+  * swapin_readahead - swap in pages in hope we need them soon
+  * @entry: swap entry of this memory
+@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
+ 			struct vm_area_struct *vma, unsigned long addr)
+ {
+ 	struct page *page;
+-	unsigned long offset = swp_offset(entry);
++	unsigned long entry_offset = swp_offset(entry);
++	unsigned long offset = entry_offset;
+ 	unsigned long start_offset, end_offset;
+-	unsigned long mask = (1UL << page_cluster) - 1;
++	unsigned long mask;
+ 	struct blk_plug plug;
+ 
++	mask = swapin_nr_pages(offset) - 1;
++	if (!mask)
++		goto skip;
++
+ 	/* Read a page_cluster sized and aligned cluster around offset. */
+ 	start_offset = offset & ~mask;
+ 	end_offset = offset | mask;
+@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
+ 						gfp_mask, vma, addr);
+ 		if (!page)
+ 			continue;
++		if (offset != entry_offset)
++			SetPageReadahead(page);
+ 		page_cache_release(page);
+ 	}
+ 	blk_finish_plug(&plug);
+ 
+ 	lru_add_drain();	/* Push any new pages onto the LRU now */
++skip:
+ 	return read_swap_cache_async(entry, gfp_mask, vma, addr);
+ }
+diff --git a/mm/swapfile.c b/mm/swapfile.c
+index 0ec2eaf3ccfd..660b9c0e2e40 100644
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages;
+ /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
+ long total_swap_pages;
+ static int least_priority;
+-static atomic_t highest_priority_index = ATOMIC_INIT(-1);
+ 
+ static const char Bad_file[] = "Bad swap file entry ";
+ static const char Unused_file[] = "Unused swap file entry ";
+ static const char Bad_offset[] = "Bad swap offset entry ";
+ static const char Unused_offset[] = "Unused swap offset entry ";
+ 
+-struct swap_list_t swap_list = {-1, -1};
++/*
++ * all active swap_info_structs
++ * protected with swap_lock, and ordered by priority.
++ */
++PLIST_HEAD(swap_active_head);
++
++/*
++ * all available (active, not full) swap_info_structs
++ * protected with swap_avail_lock, ordered by priority.
++ * This is used by get_swap_page() instead of swap_active_head
++ * because swap_active_head includes all swap_info_structs,
++ * but get_swap_page() doesn't need to look at full ones.
++ * This uses its own lock instead of swap_lock because when a
++ * swap_info_struct changes between not-full/full, it needs to
++ * add/remove itself to/from this list, but the swap_info_struct->lock
++ * is held and the locking order requires swap_lock to be taken
++ * before any swap_info_struct->lock.
++ */
++static PLIST_HEAD(swap_avail_head);
++static DEFINE_SPINLOCK(swap_avail_lock);
+ 
+ struct swap_info_struct *swap_info[MAX_SWAPFILES];
+ 
+@@ -591,6 +609,9 @@ checks:
+ 	if (si->inuse_pages == si->pages) {
+ 		si->lowest_bit = si->max;
+ 		si->highest_bit = 0;
++		spin_lock(&swap_avail_lock);
++		plist_del(&si->avail_list, &swap_avail_head);
++		spin_unlock(&swap_avail_lock);
+ 	}
+ 	si->swap_map[offset] = usage;
+ 	inc_cluster_info_page(si, si->cluster_info, offset);
+@@ -639,71 +660,65 @@ no_page:
+ 
+ swp_entry_t get_swap_page(void)
+ {
+-	struct swap_info_struct *si;
++	struct swap_info_struct *si, *next;
+ 	pgoff_t offset;
+-	int type, next;
+-	int wrapped = 0;
+-	int hp_index;
+ 
+-	spin_lock(&swap_lock);
+ 	if (atomic_long_read(&nr_swap_pages) <= 0)
+ 		goto noswap;
+ 	atomic_long_dec(&nr_swap_pages);
+ 
+-	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+-		hp_index = atomic_xchg(&highest_priority_index, -1);
+-		/*
+-		 * highest_priority_index records current highest priority swap
+-		 * type which just frees swap entries. If its priority is
+-		 * higher than that of swap_list.next swap type, we use it.  It
+-		 * isn't protected by swap_lock, so it can be an invalid value
+-		 * if the corresponding swap type is swapoff. We double check
+-		 * the flags here. It's even possible the swap type is swapoff
+-		 * and swapon again and its priority is changed. In such rare
+-		 * case, low prority swap type might be used, but eventually
+-		 * high priority swap will be used after several rounds of
+-		 * swap.
+-		 */
+-		if (hp_index != -1 && hp_index != type &&
+-		    swap_info[type]->prio < swap_info[hp_index]->prio &&
+-		    (swap_info[hp_index]->flags & SWP_WRITEOK)) {
+-			type = hp_index;
+-			swap_list.next = type;
+-		}
+-
+-		si = swap_info[type];
+-		next = si->next;
+-		if (next < 0 ||
+-		    (!wrapped && si->prio != swap_info[next]->prio)) {
+-			next = swap_list.head;
+-			wrapped++;
+-		}
++	spin_lock(&swap_avail_lock);
+ 
++start_over:
++	plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
++		/* requeue si to after same-priority siblings */
++		plist_requeue(&si->avail_list, &swap_avail_head);
++		spin_unlock(&swap_avail_lock);
+ 		spin_lock(&si->lock);
+-		if (!si->highest_bit) {
++		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
++			spin_lock(&swap_avail_lock);
++			if (plist_node_empty(&si->avail_list)) {
++				spin_unlock(&si->lock);
++				goto nextsi;
++			}
++			WARN(!si->highest_bit,
++			     "swap_info %d in list but !highest_bit\n",
++			     si->type);
++			WARN(!(si->flags & SWP_WRITEOK),
++			     "swap_info %d in list but !SWP_WRITEOK\n",
++			     si->type);
++			plist_del(&si->avail_list, &swap_avail_head);
+ 			spin_unlock(&si->lock);
+-			continue;
++			goto nextsi;
+ 		}
+-		if (!(si->flags & SWP_WRITEOK)) {
+-			spin_unlock(&si->lock);
+-			continue;
+-		}
+-
+-		swap_list.next = next;
+ 
+-		spin_unlock(&swap_lock);
+ 		/* This is called for allocating swap entry for cache */
+ 		offset = scan_swap_map(si, SWAP_HAS_CACHE);
+ 		spin_unlock(&si->lock);
+ 		if (offset)
+-			return swp_entry(type, offset);
+-		spin_lock(&swap_lock);
+-		next = swap_list.next;
++			return swp_entry(si->type, offset);
++		pr_debug("scan_swap_map of si %d failed to find offset\n",
++		       si->type);
++		spin_lock(&swap_avail_lock);
++nextsi:
++		/*
++		 * if we got here, it's likely that si was almost full before,
++		 * and since scan_swap_map() can drop the si->lock, multiple
++		 * callers probably all tried to get a page from the same si
++		 * and it filled up before we could get one; or, the si filled
++		 * up between us dropping swap_avail_lock and taking si->lock.
++		 * Since we dropped the swap_avail_lock, the swap_avail_head
++		 * list may have been modified; so if next is still in the
++		 * swap_avail_head list then try it, otherwise start over.
++		 */
++		if (plist_node_empty(&next->avail_list))
++			goto start_over;
+ 	}
+ 
++	spin_unlock(&swap_avail_lock);
++
+ 	atomic_long_inc(&nr_swap_pages);
+ noswap:
+-	spin_unlock(&swap_lock);
+ 	return (swp_entry_t) {0};
+ }
+ 
+@@ -765,27 +780,6 @@ out:
+ 	return NULL;
+ }
+ 
+-/*
+- * This swap type frees swap entry, check if it is the highest priority swap
+- * type which just frees swap entry. get_swap_page() uses
+- * highest_priority_index to search highest priority swap type. The
+- * swap_info_struct.lock can't protect us if there are multiple swap types
+- * active, so we use atomic_cmpxchg.
+- */
+-static void set_highest_priority_index(int type)
+-{
+-	int old_hp_index, new_hp_index;
+-
+-	do {
+-		old_hp_index = atomic_read(&highest_priority_index);
+-		if (old_hp_index != -1 &&
+-			swap_info[old_hp_index]->prio >= swap_info[type]->prio)
+-			break;
+-		new_hp_index = type;
+-	} while (atomic_cmpxchg(&highest_priority_index,
+-		old_hp_index, new_hp_index) != old_hp_index);
+-}
+-
+ static unsigned char swap_entry_free(struct swap_info_struct *p,
+ 				     swp_entry_t entry, unsigned char usage)
+ {
+@@ -827,9 +821,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
+ 		dec_cluster_info_page(p, p->cluster_info, offset);
+ 		if (offset < p->lowest_bit)
+ 			p->lowest_bit = offset;
+-		if (offset > p->highest_bit)
++		if (offset > p->highest_bit) {
++			bool was_full = !p->highest_bit;
+ 			p->highest_bit = offset;
+-		set_highest_priority_index(p->type);
++			if (was_full && (p->flags & SWP_WRITEOK)) {
++				spin_lock(&swap_avail_lock);
++				WARN_ON(!plist_node_empty(&p->avail_list));
++				if (plist_node_empty(&p->avail_list))
++					plist_add(&p->avail_list,
++						  &swap_avail_head);
++				spin_unlock(&swap_avail_lock);
++			}
++		}
+ 		atomic_long_inc(&nr_swap_pages);
+ 		p->inuse_pages--;
+ 		frontswap_invalidate_page(p->type, offset);
+@@ -1764,30 +1767,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
+ 				unsigned char *swap_map,
+ 				struct swap_cluster_info *cluster_info)
+ {
+-	int i, prev;
+-
+ 	if (prio >= 0)
+ 		p->prio = prio;
+ 	else
+ 		p->prio = --least_priority;
++	/*
++	 * the plist prio is negated because plist ordering is
++	 * low-to-high, while swap ordering is high-to-low
++	 */
++	p->list.prio = -p->prio;
++	p->avail_list.prio = -p->prio;
+ 	p->swap_map = swap_map;
+ 	p->cluster_info = cluster_info;
+ 	p->flags |= SWP_WRITEOK;
+ 	atomic_long_add(p->pages, &nr_swap_pages);
+ 	total_swap_pages += p->pages;
+ 
+-	/* insert swap space into swap_list: */
+-	prev = -1;
+-	for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+-		if (p->prio >= swap_info[i]->prio)
+-			break;
+-		prev = i;
+-	}
+-	p->next = i;
+-	if (prev < 0)
+-		swap_list.head = swap_list.next = p->type;
+-	else
+-		swap_info[prev]->next = p->type;
++	assert_spin_locked(&swap_lock);
++	/*
++	 * both lists are plists, and thus priority ordered.
++	 * swap_active_head needs to be priority ordered for swapoff(),
++	 * which on removal of any swap_info_struct with an auto-assigned
++	 * (i.e. negative) priority increments the auto-assigned priority
++	 * of any lower-priority swap_info_structs.
++	 * swap_avail_head needs to be priority ordered for get_swap_page(),
++	 * which allocates swap pages from the highest available priority
++	 * swap_info_struct.
++	 */
++	plist_add(&p->list, &swap_active_head);
++	spin_lock(&swap_avail_lock);
++	plist_add(&p->avail_list, &swap_avail_head);
++	spin_unlock(&swap_avail_lock);
+ }
+ 
+ static void enable_swap_info(struct swap_info_struct *p, int prio,
+@@ -1822,8 +1832,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ 	struct address_space *mapping;
+ 	struct inode *inode;
+ 	struct filename *pathname;
+-	int i, type, prev;
+-	int err;
++	int err, found = 0;
+ 	unsigned int old_block_size;
+ 
+ 	if (!capable(CAP_SYS_ADMIN))
+@@ -1841,17 +1850,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ 		goto out;
+ 
+ 	mapping = victim->f_mapping;
+-	prev = -1;
+ 	spin_lock(&swap_lock);
+-	for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
+-		p = swap_info[type];
++	plist_for_each_entry(p, &swap_active_head, list) {
+ 		if (p->flags & SWP_WRITEOK) {
+-			if (p->swap_file->f_mapping == mapping)
++			if (p->swap_file->f_mapping == mapping) {
++				found = 1;
+ 				break;
++			}
+ 		}
+-		prev = type;
+ 	}
+-	if (type < 0) {
++	if (!found) {
+ 		err = -EINVAL;
+ 		spin_unlock(&swap_lock);
+ 		goto out_dput;
+@@ -1863,20 +1871,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ 		spin_unlock(&swap_lock);
+ 		goto out_dput;
+ 	}
+-	if (prev < 0)
+-		swap_list.head = p->next;
+-	else
+-		swap_info[prev]->next = p->next;
+-	if (type == swap_list.next) {
+-		/* just pick something that's safe... */
+-		swap_list.next = swap_list.head;
+-	}
++	spin_lock(&swap_avail_lock);
++	plist_del(&p->avail_list, &swap_avail_head);
++	spin_unlock(&swap_avail_lock);
+ 	spin_lock(&p->lock);
+ 	if (p->prio < 0) {
+-		for (i = p->next; i >= 0; i = swap_info[i]->next)
+-			swap_info[i]->prio = p->prio--;
++		struct swap_info_struct *si = p;
++
++		plist_for_each_entry_continue(si, &swap_active_head, list) {
++			si->prio++;
++			si->list.prio--;
++			si->avail_list.prio--;
++		}
+ 		least_priority++;
+ 	}
++	plist_del(&p->list, &swap_active_head);
+ 	atomic_long_sub(p->pages, &nr_swap_pages);
+ 	total_swap_pages -= p->pages;
+ 	p->flags &= ~SWP_WRITEOK;
+@@ -1884,7 +1893,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ 	spin_unlock(&swap_lock);
+ 
+ 	set_current_oom_origin();
+-	err = try_to_unuse(type, false, 0); /* force all pages to be unused */
++	err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
+ 	clear_current_oom_origin();
+ 
+ 	if (err) {
+@@ -1926,7 +1935,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ 	frontswap_map_set(p, NULL);
+ 	spin_unlock(&p->lock);
+ 	spin_unlock(&swap_lock);
+-	frontswap_invalidate_area(type);
++	frontswap_invalidate_area(p->type);
+ 	mutex_unlock(&swapon_mutex);
+ 	free_percpu(p->percpu_cluster);
+ 	p->percpu_cluster = NULL;
+@@ -1934,7 +1943,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ 	vfree(cluster_info);
+ 	vfree(frontswap_map);
+ 	/* Destroy swap account informatin */
+-	swap_cgroup_swapoff(type);
++	swap_cgroup_swapoff(p->type);
+ 
+ 	inode = mapping->host;
+ 	if (S_ISBLK(inode->i_mode)) {
+@@ -2141,8 +2150,9 @@ static struct swap_info_struct *alloc_swap_info(void)
+ 		 */
+ 	}
+ 	INIT_LIST_HEAD(&p->first_swap_extent.list);
++	plist_node_init(&p->list, 0);
++	plist_node_init(&p->avail_list, 0);
+ 	p->flags = SWP_USED;
+-	p->next = -1;
+ 	spin_unlock(&swap_lock);
+ 	spin_lock_init(&p->lock);
+ 
+diff --git a/mm/truncate.c b/mm/truncate.c
+index 353b683afd6e..2e84fe59190b 100644
+--- a/mm/truncate.c
++++ b/mm/truncate.c
+@@ -22,6 +22,22 @@
+ #include <linux/cleancache.h>
+ #include "internal.h"
+ 
++static void clear_exceptional_entry(struct address_space *mapping,
++				    pgoff_t index, void *entry)
++{
++	/* Handled by shmem itself */
++	if (shmem_mapping(mapping))
++		return;
++
++	spin_lock_irq(&mapping->tree_lock);
++	/*
++	 * Regular page slots are stabilized by the page lock even
++	 * without the tree itself locked.  These unlocked entries
++	 * need verification under the tree lock.
++	 */
++	radix_tree_delete_item(&mapping->page_tree, index, entry);
++	spin_unlock_irq(&mapping->tree_lock);
++}
+ 
+ /**
+  * do_invalidatepage - invalidate part or all of a page
+@@ -208,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
+ 	unsigned int	partial_start;	/* inclusive */
+ 	unsigned int	partial_end;	/* exclusive */
+ 	struct pagevec	pvec;
++	pgoff_t		indices[PAGEVEC_SIZE];
+ 	pgoff_t		index;
+ 	int		i;
+ 
+@@ -238,17 +255,23 @@ void truncate_inode_pages_range(struct address_space *mapping,
+ 
+ 	pagevec_init(&pvec, 0);
+ 	index = start;
+-	while (index < end && pagevec_lookup(&pvec, mapping, index,
+-			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
++	while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
++			min(end - index, (pgoff_t)PAGEVEC_SIZE),
++			indices)) {
+ 		mem_cgroup_uncharge_start();
+ 		for (i = 0; i < pagevec_count(&pvec); i++) {
+ 			struct page *page = pvec.pages[i];
+ 
+ 			/* We rely upon deletion not changing page->index */
+-			index = page->index;
++			index = indices[i];
+ 			if (index >= end)
+ 				break;
+ 
++			if (radix_tree_exceptional_entry(page)) {
++				clear_exceptional_entry(mapping, index, page);
++				continue;
++			}
++
+ 			if (!trylock_page(page))
+ 				continue;
+ 			WARN_ON(page->index != index);
+@@ -259,6 +282,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
+ 			truncate_inode_page(mapping, page);
+ 			unlock_page(page);
+ 		}
++		pagevec_remove_exceptionals(&pvec);
+ 		pagevec_release(&pvec);
+ 		mem_cgroup_uncharge_end();
+ 		cond_resched();
+@@ -307,14 +331,16 @@ void truncate_inode_pages_range(struct address_space *mapping,
+ 	index = start;
+ 	for ( ; ; ) {
+ 		cond_resched();
+-		if (!pagevec_lookup(&pvec, mapping, index,
+-			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
++		if (!pagevec_lookup_entries(&pvec, mapping, index,
++			min(end - index, (pgoff_t)PAGEVEC_SIZE),
++			indices)) {
+ 			if (index == start)
+ 				break;
+ 			index = start;
+ 			continue;
+ 		}
+-		if (index == start && pvec.pages[0]->index >= end) {
++		if (index == start && indices[0] >= end) {
++			pagevec_remove_exceptionals(&pvec);
+ 			pagevec_release(&pvec);
+ 			break;
+ 		}
+@@ -323,16 +349,22 @@ void truncate_inode_pages_range(struct address_space *mapping,
+ 			struct page *page = pvec.pages[i];
+ 
+ 			/* We rely upon deletion not changing page->index */
+-			index = page->index;
++			index = indices[i];
+ 			if (index >= end)
+ 				break;
+ 
++			if (radix_tree_exceptional_entry(page)) {
++				clear_exceptional_entry(mapping, index, page);
++				continue;
++			}
++
+ 			lock_page(page);
+ 			WARN_ON(page->index != index);
+ 			wait_on_page_writeback(page);
+ 			truncate_inode_page(mapping, page);
+ 			unlock_page(page);
+ 		}
++		pagevec_remove_exceptionals(&pvec);
+ 		pagevec_release(&pvec);
+ 		mem_cgroup_uncharge_end();
+ 		index++;
+@@ -375,6 +407,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
+ unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ 		pgoff_t start, pgoff_t end)
+ {
++	pgoff_t indices[PAGEVEC_SIZE];
+ 	struct pagevec pvec;
+ 	pgoff_t index = start;
+ 	unsigned long ret;
+@@ -390,17 +423,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ 	 */
+ 
+ 	pagevec_init(&pvec, 0);
+-	while (index <= end && pagevec_lookup(&pvec, mapping, index,
+-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
++	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
++			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
++			indices)) {
+ 		mem_cgroup_uncharge_start();
+ 		for (i = 0; i < pagevec_count(&pvec); i++) {
+ 			struct page *page = pvec.pages[i];
+ 
+ 			/* We rely upon deletion not changing page->index */
+-			index = page->index;
++			index = indices[i];
+ 			if (index > end)
+ 				break;
+ 
++			if (radix_tree_exceptional_entry(page)) {
++				clear_exceptional_entry(mapping, index, page);
++				continue;
++			}
++
+ 			if (!trylock_page(page))
+ 				continue;
+ 			WARN_ON(page->index != index);
+@@ -414,6 +453,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ 				deactivate_page(page);
+ 			count += ret;
+ 		}
++		pagevec_remove_exceptionals(&pvec);
+ 		pagevec_release(&pvec);
+ 		mem_cgroup_uncharge_end();
+ 		cond_resched();
+@@ -481,6 +521,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
+ int invalidate_inode_pages2_range(struct address_space *mapping,
+ 				  pgoff_t start, pgoff_t end)
+ {
++	pgoff_t indices[PAGEVEC_SIZE];
+ 	struct pagevec pvec;
+ 	pgoff_t index;
+ 	int i;
+@@ -491,17 +532,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
+ 	cleancache_invalidate_inode(mapping);
+ 	pagevec_init(&pvec, 0);
+ 	index = start;
+-	while (index <= end && pagevec_lookup(&pvec, mapping, index,
+-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
++	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
++			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
++			indices)) {
+ 		mem_cgroup_uncharge_start();
+ 		for (i = 0; i < pagevec_count(&pvec); i++) {
+ 			struct page *page = pvec.pages[i];
+ 
+ 			/* We rely upon deletion not changing page->index */
+-			index = page->index;
++			index = indices[i];
+ 			if (index > end)
+ 				break;
+ 
++			if (radix_tree_exceptional_entry(page)) {
++				clear_exceptional_entry(mapping, index, page);
++				continue;
++			}
++
+ 			lock_page(page);
+ 			WARN_ON(page->index != index);
+ 			if (page->mapping != mapping) {
+@@ -539,6 +586,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
+ 				ret = ret2;
+ 			unlock_page(page);
+ 		}
++		pagevec_remove_exceptionals(&pvec);
+ 		pagevec_release(&pvec);
+ 		mem_cgroup_uncharge_end();
+ 		cond_resched();
+diff --git a/mm/vmacache.c b/mm/vmacache.c
+new file mode 100644
+index 000000000000..1037a3bab505
+--- /dev/null
++++ b/mm/vmacache.c
+@@ -0,0 +1,114 @@
++/*
++ * Copyright (C) 2014 Davidlohr Bueso.
++ */
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/vmacache.h>
++
++/*
++ * Flush vma caches for threads that share a given mm.
++ *
++ * The operation is safe because the caller holds the mmap_sem
++ * exclusively and other threads accessing the vma cache will
++ * have mmap_sem held at least for read, so no extra locking
++ * is required to maintain the vma cache.
++ */
++void vmacache_flush_all(struct mm_struct *mm)
++{
++	struct task_struct *g, *p;
++
++	rcu_read_lock();
++	for_each_process_thread(g, p) {
++		/*
++		 * Only flush the vmacache pointers as the
++		 * mm seqnum is already set and curr's will
++		 * be set upon invalidation when the next
++		 * lookup is done.
++		 */
++		if (mm == p->mm)
++			vmacache_flush(p);
++	}
++	rcu_read_unlock();
++}
++
++/*
++ * This task may be accessing a foreign mm via (for example)
++ * get_user_pages()->find_vma().  The vmacache is task-local and this
++ * task's vmacache pertains to a different mm (ie, its own).  There is
++ * nothing we can do here.
++ *
++ * Also handle the case where a kernel thread has adopted this mm via use_mm().
++ * That kernel thread's vmacache is not applicable to this mm.
++ */
++static bool vmacache_valid_mm(struct mm_struct *mm)
++{
++	return current->mm == mm && !(current->flags & PF_KTHREAD);
++}
++
++void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
++{
++	if (vmacache_valid_mm(newvma->vm_mm))
++		current->vmacache[VMACACHE_HASH(addr)] = newvma;
++}
++
++static bool vmacache_valid(struct mm_struct *mm)
++{
++	struct task_struct *curr;
++
++	if (!vmacache_valid_mm(mm))
++		return false;
++
++	curr = current;
++	if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
++		/*
++		 * First attempt will always be invalid, initialize
++		 * the new cache for this task here.
++		 */
++		curr->vmacache_seqnum = mm->vmacache_seqnum;
++		vmacache_flush(curr);
++		return false;
++	}
++	return true;
++}
++
++struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
++{
++	int i;
++
++	if (!vmacache_valid(mm))
++		return NULL;
++
++	for (i = 0; i < VMACACHE_SIZE; i++) {
++		struct vm_area_struct *vma = current->vmacache[i];
++
++		if (!vma)
++			continue;
++		if (WARN_ON_ONCE(vma->vm_mm != mm))
++			break;
++		if (vma->vm_start <= addr && vma->vm_end > addr)
++			return vma;
++	}
++
++	return NULL;
++}
++
++#ifndef CONFIG_MMU
++struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
++					   unsigned long start,
++					   unsigned long end)
++{
++	int i;
++
++	if (!vmacache_valid(mm))
++		return NULL;
++
++	for (i = 0; i < VMACACHE_SIZE; i++) {
++		struct vm_area_struct *vma = current->vmacache[i];
++
++		if (vma && vma->vm_start == start && vma->vm_end == end)
++			return vma;
++	}
++
++	return NULL;
++}
++#endif
+diff --git a/mm/vmalloc.c b/mm/vmalloc.c
+index e2be0f802ccf..060dc366ac44 100644
+--- a/mm/vmalloc.c
++++ b/mm/vmalloc.c
+@@ -2685,14 +2685,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
+ 
+ 	prev_end = VMALLOC_START;
+ 
+-	spin_lock(&vmap_area_lock);
++	rcu_read_lock();
+ 
+ 	if (list_empty(&vmap_area_list)) {
+ 		vmi->largest_chunk = VMALLOC_TOTAL;
+ 		goto out;
+ 	}
+ 
+-	list_for_each_entry(va, &vmap_area_list, list) {
++	list_for_each_entry_rcu(va, &vmap_area_list, list) {
+ 		unsigned long addr = va->va_start;
+ 
+ 		/*
+@@ -2719,7 +2719,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
+ 		vmi->largest_chunk = VMALLOC_END - prev_end;
+ 
+ out:
+-	spin_unlock(&vmap_area_lock);
++	rcu_read_unlock();
+ }
+ #endif
+ 
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 5ad29b2925a0..5461d02ea718 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -163,7 +163,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
+ 
+ bool zone_reclaimable(struct zone *zone)
+ {
+-	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
++	return zone_page_state(zone, NR_PAGES_SCANNED) <
++		zone_reclaimable_pages(zone) * 6;
+ }
+ 
+ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+@@ -224,15 +225,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ 	unsigned long freed = 0;
+ 	unsigned long long delta;
+ 	long total_scan;
+-	long max_pass;
++	long freeable;
+ 	long nr;
+ 	long new_nr;
+ 	int nid = shrinkctl->nid;
+ 	long batch_size = shrinker->batch ? shrinker->batch
+ 					  : SHRINK_BATCH;
+ 
+-	max_pass = shrinker->count_objects(shrinker, shrinkctl);
+-	if (max_pass == 0)
++	freeable = shrinker->count_objects(shrinker, shrinkctl);
++	if (freeable == 0)
+ 		return 0;
+ 
+ 	/*
+@@ -244,14 +245,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ 
+ 	total_scan = nr;
+ 	delta = (4 * nr_pages_scanned) / shrinker->seeks;
+-	delta *= max_pass;
++	delta *= freeable;
+ 	do_div(delta, lru_pages + 1);
+ 	total_scan += delta;
+ 	if (total_scan < 0) {
+ 		printk(KERN_ERR
+ 		"shrink_slab: %pF negative objects to delete nr=%ld\n",
+ 		       shrinker->scan_objects, total_scan);
+-		total_scan = max_pass;
++		total_scan = freeable;
+ 	}
+ 
+ 	/*
+@@ -260,38 +261,55 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ 	 * shrinkers to return -1 all the time. This results in a large
+ 	 * nr being built up so when a shrink that can do some work
+ 	 * comes along it empties the entire cache due to nr >>>
+-	 * max_pass.  This is bad for sustaining a working set in
++	 * freeable. This is bad for sustaining a working set in
+ 	 * memory.
+ 	 *
+ 	 * Hence only allow the shrinker to scan the entire cache when
+ 	 * a large delta change is calculated directly.
+ 	 */
+-	if (delta < max_pass / 4)
+-		total_scan = min(total_scan, max_pass / 2);
++	if (delta < freeable / 4)
++		total_scan = min(total_scan, freeable / 2);
+ 
+ 	/*
+ 	 * Avoid risking looping forever due to too large nr value:
+ 	 * never try to free more than twice the estimate number of
+ 	 * freeable entries.
+ 	 */
+-	if (total_scan > max_pass * 2)
+-		total_scan = max_pass * 2;
++	if (total_scan > freeable * 2)
++		total_scan = freeable * 2;
+ 
+ 	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+ 				nr_pages_scanned, lru_pages,
+-				max_pass, delta, total_scan);
++				freeable, delta, total_scan);
+ 
+-	while (total_scan >= batch_size) {
++	/*
++	 * Normally, we should not scan less than batch_size objects in one
++	 * pass to avoid too frequent shrinker calls, but if the slab has less
++	 * than batch_size objects in total and we are really tight on memory,
++	 * we will try to reclaim all available objects, otherwise we can end
++	 * up failing allocations although there are plenty of reclaimable
++	 * objects spread over several slabs with usage less than the
++	 * batch_size.
++	 *
++	 * We detect the "tight on memory" situations by looking at the total
++	 * number of objects we want to scan (total_scan). If it is greater
++	 * than the total number of objects on slab (freeable), we must be
++	 * scanning at high prio and therefore should try to reclaim as much as
++	 * possible.
++	 */
++	while (total_scan >= batch_size ||
++	       total_scan >= freeable) {
+ 		unsigned long ret;
++		unsigned long nr_to_scan = min(batch_size, total_scan);
+ 
+-		shrinkctl->nr_to_scan = batch_size;
++		shrinkctl->nr_to_scan = nr_to_scan;
+ 		ret = shrinker->scan_objects(shrinker, shrinkctl);
+ 		if (ret == SHRINK_STOP)
+ 			break;
+ 		freed += ret;
+ 
+-		count_vm_events(SLABS_SCANNED, batch_size);
+-		total_scan -= batch_size;
++		count_vm_events(SLABS_SCANNED, nr_to_scan);
++		total_scan -= nr_to_scan;
+ 
+ 		cond_resched();
+ 	}
+@@ -352,16 +370,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
+ 	}
+ 
+ 	list_for_each_entry(shrinker, &shrinker_list, list) {
+-		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+-			if (!node_online(shrinkctl->nid))
+-				continue;
+-
+-			if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+-			    (shrinkctl->nid != 0))
+-				break;
+-
++		if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
++			shrinkctl->nid = 0;
+ 			freed += shrink_slab_node(shrinkctl, shrinker,
+-				 nr_pages_scanned, lru_pages);
++					nr_pages_scanned, lru_pages);
++			continue;
++		}
++
++		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
++			if (node_online(shrinkctl->nid))
++				freed += shrink_slab_node(shrinkctl, shrinker,
++						nr_pages_scanned, lru_pages);
+ 
+ 		}
+ 	}
+@@ -1089,7 +1108,7 @@ keep:
+ 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
+ 	}
+ 
+-	free_hot_cold_page_list(&free_pages, 1);
++	free_hot_cold_page_list(&free_pages, true);
+ 
+ 	list_splice(&ret_pages, page_list);
+ 	count_vm_events(PGACTIVATE, pgactivate);
+@@ -1126,7 +1145,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+ 			TTU_UNMAP|TTU_IGNORE_ACCESS,
+ 			&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
+ 	list_splice(&clean_pages, page_list);
+-	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
++	mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+ 	return ret;
+ }
+ 
+@@ -1452,7 +1471,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
+ 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+ 
+ 	if (global_reclaim(sc)) {
+-		zone->pages_scanned += nr_scanned;
++		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+ 		if (current_is_kswapd())
+ 			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
+ 		else
+@@ -1487,7 +1506,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
+ 
+ 	spin_unlock_irq(&zone->lru_lock);
+ 
+-	free_hot_cold_page_list(&page_list, 1);
++	free_hot_cold_page_list(&page_list, true);
+ 
+ 	/*
+ 	 * If reclaim is isolating dirty pages under writeback, it implies
+@@ -1641,7 +1660,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
+ 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+ 				     &nr_scanned, sc, isolate_mode, lru);
+ 	if (global_reclaim(sc))
+-		zone->pages_scanned += nr_scanned;
++		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+ 
+ 	reclaim_stat->recent_scanned[file] += nr_taken;
+ 
+@@ -1707,7 +1726,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
+ 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
+ 	spin_unlock_irq(&zone->lru_lock);
+ 
+-	free_hot_cold_page_list(&l_hold, 1);
++	free_hot_cold_page_list(&l_hold, true);
+ }
+ 
+ #ifdef CONFIG_SWAP
+@@ -1829,7 +1848,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ 	struct zone *zone = lruvec_zone(lruvec);
+ 	unsigned long anon_prio, file_prio;
+ 	enum scan_balance scan_balance;
+-	unsigned long anon, file, free;
++	unsigned long anon, file;
+ 	bool force_scan = false;
+ 	unsigned long ap, fp;
+ 	enum lru_list lru;
+@@ -1877,11 +1896,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ 		goto out;
+ 	}
+ 
+-	anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+-		get_lru_size(lruvec, LRU_INACTIVE_ANON);
+-	file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+-		get_lru_size(lruvec, LRU_INACTIVE_FILE);
+-
+ 	/*
+ 	 * If it's foreseeable that reclaiming the file cache won't be
+ 	 * enough to get the zone back into a desirable shape, we have
+@@ -1889,8 +1903,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ 	 * thrashing - remaining file pages alone.
+ 	 */
+ 	if (global_reclaim(sc)) {
+-		free = zone_page_state(zone, NR_FREE_PAGES);
+-		if (unlikely(file + free <= high_wmark_pages(zone))) {
++		unsigned long zonefile;
++		unsigned long zonefree;
++
++		zonefree = zone_page_state(zone, NR_FREE_PAGES);
++		zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
++			   zone_page_state(zone, NR_INACTIVE_FILE);
++
++		if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
+ 			scan_balance = SCAN_ANON;
+ 			goto out;
+ 		}
+@@ -1925,6 +1945,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ 	 *
+ 	 * anon in [0], file in [1]
+ 	 */
++
++	anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
++		get_lru_size(lruvec, LRU_INACTIVE_ANON);
++	file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
++		get_lru_size(lruvec, LRU_INACTIVE_FILE);
++
+ 	spin_lock_irq(&zone->lru_lock);
+ 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
+ 		reclaim_stat->recent_scanned[0] /= 2;
+@@ -2000,13 +2026,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ 	unsigned long nr_reclaimed = 0;
+ 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ 	struct blk_plug plug;
+-	bool scan_adjusted = false;
++	bool scan_adjusted;
+ 
+ 	get_scan_count(lruvec, sc, nr);
+ 
+ 	/* Record the original scan target for proportional adjustments later */
+ 	memcpy(targets, nr, sizeof(nr));
+ 
++	/*
++	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
++	 * event that can occur when there is little memory pressure e.g.
++	 * multiple streaming readers/writers. Hence, we do not abort scanning
++	 * when the requested number of pages are reclaimed when scanning at
++	 * DEF_PRIORITY on the assumption that the fact we are direct
++	 * reclaiming implies that kswapd is not keeping up and it is best to
++	 * do a batch of work at once. For memcg reclaim one check is made to
++	 * abort proportional reclaim if either the file or anon lru has already
++	 * dropped to zero at the first pass.
++	 */
++	scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
++			 sc->priority == DEF_PRIORITY);
++
+ 	blk_start_plug(&plug);
+ 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+ 					nr[LRU_INACTIVE_FILE]) {
+@@ -2027,17 +2067,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ 			continue;
+ 
+ 		/*
+-		 * For global direct reclaim, reclaim only the number of pages
+-		 * requested. Less care is taken to scan proportionally as it
+-		 * is more important to minimise direct reclaim stall latency
+-		 * than it is to properly age the LRU lists.
+-		 */
+-		if (global_reclaim(sc) && !current_is_kswapd())
+-			break;
+-
+-		/*
+ 		 * For kswapd and memcg, reclaim at least the number of pages
+-		 * requested. Ensure that the anon and file LRUs shrink
++		 * requested. Ensure that the anon and file LRUs are scanned
+ 		 * proportionally what was requested by get_scan_count(). We
+ 		 * stop reclaiming one LRU and reduce the amount scanning
+ 		 * proportional to the original scan target.
+@@ -2045,6 +2076,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ 		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
+ 		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+ 
++		/*
++		 * It's just vindictive to attack the larger once the smaller
++		 * has gone to zero.  And given the way we stop scanning the
++		 * smaller below, this makes sure that we only make one nudge
++		 * towards proportionality once we've got nr_to_reclaim.
++		 */
++		if (!nr_file || !nr_anon)
++			break;
++
+ 		if (nr_file > nr_anon) {
+ 			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
+ 						targets[LRU_ACTIVE_ANON] + 1;
+@@ -2406,8 +2446,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
+ 			unsigned long lru_pages = 0;
+ 
+ 			nodes_clear(shrink->nodes_to_scan);
+-			for_each_zone_zonelist(zone, z, zonelist,
+-					gfp_zone(sc->gfp_mask)) {
++			for_each_zone_zonelist_nodemask(zone, z, zonelist,
++					gfp_zone(sc->gfp_mask), sc->nodemask) {
+ 				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ 					continue;
+ 
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 5a442a723d79..f7ca04482299 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+ 			continue;
+ 
+ 		threshold = (*calculate_pressure)(zone);
+-		for_each_possible_cpu(cpu)
++		for_each_online_cpu(cpu)
+ 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ 							= threshold;
+ 	}
+@@ -761,6 +761,7 @@ const char * const vmstat_text[] = {
+ 	"nr_shmem",
+ 	"nr_dirtied",
+ 	"nr_written",
++	"nr_pages_scanned",
+ 
+ #ifdef CONFIG_NUMA
+ 	"numa_hit",
+@@ -851,12 +852,14 @@ const char * const vmstat_text[] = {
+ 	"thp_zero_page_alloc",
+ 	"thp_zero_page_alloc_failed",
+ #endif
++#ifdef CONFIG_DEBUG_TLBFLUSH
+ #ifdef CONFIG_SMP
+ 	"nr_tlb_remote_flush",
+ 	"nr_tlb_remote_flush_received",
+-#endif
++#endif /* CONFIG_SMP */
+ 	"nr_tlb_local_flush_all",
+ 	"nr_tlb_local_flush_one",
++#endif /* CONFIG_DEBUG_TLBFLUSH */
+ 
+ #endif /* CONFIG_VM_EVENTS_COUNTERS */
+ };
+@@ -1053,7 +1056,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
+ 		   min_wmark_pages(zone),
+ 		   low_wmark_pages(zone),
+ 		   high_wmark_pages(zone),
+-		   zone->pages_scanned,
++		   zone_page_state(zone, NR_PAGES_SCANNED),
+ 		   zone->spanned_pages,
+ 		   zone->present_pages,
+ 		   zone->managed_pages);
+@@ -1063,10 +1066,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
+ 				zone_page_state(zone, i));
+ 
+ 	seq_printf(m,
+-		   "\n        protection: (%lu",
++		   "\n        protection: (%ld",
+ 		   zone->lowmem_reserve[0]);
+ 	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+-		seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
++		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
+ 	seq_printf(m,
+ 		   ")"
+ 		   "\n  pagesets");
author	Mike Pagano <mpagano@gentoo.org>	2014-10-10 15:56:35 -0400
committer	Mike Pagano <mpagano@gentoo.org>	2014-10-10 15:56:35 -0400
commit	45ca8c94954b7b8d9658410f759a5258d7cdca9a (patch)
tree	4773f3981492d82912a85d050a20c121d7e7298a
parent	Linux patch 3.12.29 (diff)
download	linux-patches-3.12-32.tar.gz linux-patches-3.12-32.tar.bz2 linux-patches-3.12-32.zip