-rw-r--r--  0000_README                 4
-rw-r--r--  1029_linux-3.12.30.patch    7724
2 files changed, 7728 insertions, 0 deletions
diff --git a/0000_README b/0000_README
index ae0f6aa7..d8b89ecb 100644
--- a/0000_README
+++ b/0000_README
@@ -158,6 +158,10 @@ Patch: 1028_linux-3.12.29.patch
From: http://www.kernel.org
Desc: Linux 3.12.29
+Patch: 1029_linux-3.12.30.patch
+From: http://www.kernel.org
+Desc: Linux 3.12.30
+
Patch: 1500_XATTR_USER_PREFIX.patch
From: https://bugs.gentoo.org/show_bug.cgi?id=470644
Desc: Support for namespace user.pax.* on tmpfs.
diff --git a/1029_linux-3.12.30.patch b/1029_linux-3.12.30.patch
new file mode 100644
index 00000000..90682678
--- /dev/null
+++ b/1029_linux-3.12.30.patch
@@ -0,0 +1,7724 @@
+diff --git a/Makefile b/Makefile
+index 67cec33d00c7..1ad1566225ca 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,6 +1,6 @@
+ VERSION = 3
+ PATCHLEVEL = 12
+-SUBLEVEL = 29
++SUBLEVEL = 30
+ EXTRAVERSION =
+ NAME = One Giant Leap for Frogkind
+
+diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
+index 004ba568d93f..33294fdc402e 100644
+--- a/arch/tile/mm/homecache.c
++++ b/arch/tile/mm/homecache.c
+@@ -417,7 +417,7 @@ void __homecache_free_pages(struct page *page, unsigned int order)
+ if (put_page_testzero(page)) {
+ homecache_change_page_home(page, order, PAGE_HOME_HASH);
+ if (order == 0) {
+- free_hot_cold_page(page, 0);
++ free_hot_cold_page(page, false);
+ } else {
+ init_page_count(page);
+ __free_pages(page, order);
+diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
+index fb5e4c658f7a..ef470a7a3d0f 100644
+--- a/arch/unicore32/include/asm/mmu_context.h
++++ b/arch/unicore32/include/asm/mmu_context.h
+@@ -14,6 +14,8 @@
+
+ #include <linux/compiler.h>
+ #include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/io.h>
+
+ #include <asm/cacheflush.h>
+@@ -73,7 +75,7 @@ do { \
+ else \
+ mm->mmap = NULL; \
+ rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
+- mm->mmap_cache = NULL; \
++ vmacache_invalidate(mm); \
+ mm->map_count--; \
+ remove_vma(high_vma); \
+ } \
+diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
+index e6d90babc245..04905bfc508b 100644
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void)
+
+ static inline void __flush_tlb_one(unsigned long addr)
+ {
+- count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+ __flush_tlb_single(addr);
+ }
+
+@@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr)
+ */
+ static inline void __flush_tlb_up(void)
+ {
+- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ __flush_tlb();
+ }
+
+ static inline void flush_tlb_all(void)
+ {
+- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ __flush_tlb_all();
+ }
+
+diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
+index ce2d0a2c3e4f..0e25a1bc5ab5 100644
+--- a/arch/x86/kernel/cpu/mtrr/generic.c
++++ b/arch/x86/kernel/cpu/mtrr/generic.c
+@@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
+ }
+
+ /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ __flush_tlb();
+
+ /* Save MTRR state */
+@@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
+ static void post_set(void) __releases(set_atomicity_lock)
+ {
+ /* Flush TLBs (no need to flush caches - they are disabled) */
+- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ __flush_tlb();
+
+ /* Intel (P6) standard MTRRs */
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index dfa537a03be1..5da29d04de2f 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -386,13 +386,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+ {
+- int young;
+-
+- young = ptep_test_and_clear_young(vma, address, ptep);
+- if (young)
+- flush_tlb_page(vma, address);
+-
+- return young;
++ /*
++ * On x86 CPUs, clearing the accessed bit without a TLB flush
++ * doesn't cause data corruption. [ It could cause incorrect
++ * page aging and the (mistaken) reclaim of hot pages, but the
++ * chance of that should be relatively low. ]
++ *
++ * So as a performance optimization don't flush the TLB when
++ * clearing the accessed bit, it will eventually be flushed by
++ * a context switch or a VM operation anyway. [ In the rare
++ * event of it not getting flushed for a long time the delay
++ * shouldn't really matter because there's no real memory
++ * pressure for swapout to react to. ]
++ */
++ return ptep_test_and_clear_young(vma, address, ptep);
+ }
+
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
+index ae699b3bbac8..dd8dda167a24 100644
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
+ if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+ return;
+
+- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
++ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+ if (f->flush_end == TLB_FLUSH_ALL)
+ local_flush_tlb();
+@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
+ info.flush_start = start;
+ info.flush_end = end;
+
+- count_vm_event(NR_TLB_REMOTE_FLUSH);
++ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ if (is_uv_system()) {
+ unsigned int cpu;
+
+@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
+
+ preempt_disable();
+
+- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ local_flush_tlb();
+ if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
+ preempt_enable();
+ }
+
+-/*
+- * It can find out the THP large page, or
+- * HUGETLB page in tlb_flush when THP disabled
+- */
+-static inline unsigned long has_large_page(struct mm_struct *mm,
+- unsigned long start, unsigned long end)
+-{
+- pgd_t *pgd;
+- pud_t *pud;
+- pmd_t *pmd;
+- unsigned long addr = ALIGN(start, HPAGE_SIZE);
+- for (; addr < end; addr += HPAGE_SIZE) {
+- pgd = pgd_offset(mm, addr);
+- if (likely(!pgd_none(*pgd))) {
+- pud = pud_offset(pgd, addr);
+- if (likely(!pud_none(*pud))) {
+- pmd = pmd_offset(pud, addr);
+- if (likely(!pmd_none(*pmd)))
+- if (pmd_large(*pmd))
+- return addr;
+- }
+- }
+- }
+- return 0;
+-}
+-
+ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, unsigned long vmflag)
+ {
+ unsigned long addr;
+ unsigned act_entries, tlb_entries = 0;
++ unsigned long nr_base_pages;
+
+ preempt_disable();
+ if (current->active_mm != mm)
+@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+ tlb_entries = tlb_lli_4k[ENTRIES];
+ else
+ tlb_entries = tlb_lld_4k[ENTRIES];
++
+ /* Assume all of TLB entries was occupied by this task */
+- act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
++ act_entries = tlb_entries >> tlb_flushall_shift;
++ act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
++ nr_base_pages = (end - start) >> PAGE_SHIFT;
+
+ /* tlb_flushall_shift is on balance point, details in commit log */
+- if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
+- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
++ if (nr_base_pages > act_entries) {
++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ local_flush_tlb();
+ } else {
+- if (has_large_page(mm, start, end)) {
+- local_flush_tlb();
+- goto flush_all;
+- }
+ /* flush range by one by one 'invlpg' */
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+- count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+ __flush_tlb_single(addr);
+ }
+
+@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
+
+ static void do_flush_tlb_all(void *info)
+ {
+- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
++ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ __flush_tlb_all();
+ if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+ leave_mm(smp_processor_id());
+@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
+
+ void flush_tlb_all(void)
+ {
+- count_vm_event(NR_TLB_REMOTE_FLUSH);
++ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+ }
+
+diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
+index 6e9ff8fac75a..6357298932bf 100644
+--- a/fs/btrfs/compression.c
++++ b/fs/btrfs/compression.c
+@@ -474,7 +474,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
+ rcu_read_lock();
+ page = radix_tree_lookup(&mapping->page_tree, pg_index);
+ rcu_read_unlock();
+- if (page) {
++ if (page && !radix_tree_exceptional_entry(page)) {
+ misses++;
+ if (misses > 4)
+ break;
+diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
+index 594bbfd4996e..7015d9079bd1 100644
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -4446,7 +4446,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
+ spin_unlock(&eb->refs_lock);
+ }
+
+-static void mark_extent_buffer_accessed(struct extent_buffer *eb)
++static void mark_extent_buffer_accessed(struct extent_buffer *eb,
++ struct page *accessed)
+ {
+ unsigned long num_pages, i;
+
+@@ -4455,7 +4456,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+- mark_page_accessed(p);
++ if (p != accessed)
++ mark_page_accessed(p);
+ }
+ }
+
+@@ -4476,7 +4478,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+ eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+ if (eb && atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+- mark_extent_buffer_accessed(eb);
++ mark_extent_buffer_accessed(eb, NULL);
+ return eb;
+ }
+ rcu_read_unlock();
+@@ -4504,7 +4506,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+ spin_unlock(&mapping->private_lock);
+ unlock_page(p);
+ page_cache_release(p);
+- mark_extent_buffer_accessed(exists);
++ mark_extent_buffer_accessed(exists, p);
+ goto free_eb;
+ }
+
+@@ -4519,7 +4521,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+ attach_extent_buffer_page(eb, p);
+ spin_unlock(&mapping->private_lock);
+ WARN_ON(PageDirty(p));
+- mark_page_accessed(p);
+ eb->pages[i] = p;
+ if (!PageUptodate(p))
+ uptodate = 0;
+@@ -4549,7 +4550,7 @@ again:
+ }
+ spin_unlock(&tree->buffer_lock);
+ radix_tree_preload_end();
+- mark_extent_buffer_accessed(exists);
++ mark_extent_buffer_accessed(exists, NULL);
+ goto free_eb;
+ }
+ /* add one reference for the tree */
+@@ -4595,7 +4596,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+ eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+ if (eb && atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+- mark_extent_buffer_accessed(eb);
++ mark_extent_buffer_accessed(eb, NULL);
+ return eb;
+ }
+ rcu_read_unlock();
+diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
+index 72da4df53c9a..ad80dfa6cf91 100644
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -426,13 +426,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
+ struct page *page = prepared_pages[pg];
+ /*
+ * Copy data from userspace to the current page
+- *
+- * Disable pagefault to avoid recursive lock since
+- * the pages are already locked
+ */
+- pagefault_disable();
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+- pagefault_enable();
+
+ /* Flush processor's dcache for this page */
+ flush_dcache_page(page);
+@@ -476,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+ for (i = 0; i < num_pages; i++) {
+ /* page checked is some magic around finding pages that
+ * have been modified without going through btrfs_set_page_dirty
+- * clear it here
++ * clear it here. There should be no need to mark the pages
++ * accessed as prepare_pages should have marked them accessed
++ * in prepare_pages via find_or_create_page()
+ */
+ ClearPageChecked(pages[i]);
+ unlock_page(pages[i]);
+- mark_page_accessed(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ }
+diff --git a/fs/buffer.c b/fs/buffer.c
+index aeeea6529bcd..b7888527f7c3 100644
+--- a/fs/buffer.c
++++ b/fs/buffer.c
+@@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
+ int all_mapped = 1;
+
+ index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
+- page = find_get_page(bd_mapping, index);
++ page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
+ if (!page)
+ goto out;
+
+@@ -1366,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+ struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
+
+ if (bh == NULL) {
++ /* __find_get_block_slow will mark the page accessed */
+ bh = __find_get_block_slow(bdev, block);
+ if (bh)
+ bh_lru_install(bh);
+- }
+- if (bh)
++ } else
+ touch_buffer(bh);
++
+ return bh;
+ }
+ EXPORT_SYMBOL(__find_get_block);
+@@ -1483,16 +1484,27 @@ EXPORT_SYMBOL(set_bh_page);
+ /*
+ * Called when truncating a buffer on a page completely.
+ */
++
++/* Bits that are cleared during an invalidate */
++#define BUFFER_FLAGS_DISCARD \
++ (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
++ 1 << BH_Delay | 1 << BH_Unwritten)
++
+ static void discard_buffer(struct buffer_head * bh)
+ {
++ unsigned long b_state, b_state_old;
++
+ lock_buffer(bh);
+ clear_buffer_dirty(bh);
+ bh->b_bdev = NULL;
+- clear_buffer_mapped(bh);
+- clear_buffer_req(bh);
+- clear_buffer_new(bh);
+- clear_buffer_delay(bh);
+- clear_buffer_unwritten(bh);
++ b_state = bh->b_state;
++ for (;;) {
++ b_state_old = cmpxchg(&bh->b_state, b_state,
++ (b_state & ~BUFFER_FLAGS_DISCARD));
++ if (b_state_old == b_state)
++ break;
++ b_state = b_state_old;
++ }
+ unlock_buffer(bh);
+ }
+
+diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
+index e501ac3a49ff..2f6cfcaa55fd 100644
+--- a/fs/cramfs/inode.c
++++ b/fs/cramfs/inode.c
+@@ -179,8 +179,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
+ struct page *page = NULL;
+
+ if (blocknr + i < devsize) {
+- page = read_mapping_page_async(mapping, blocknr + i,
+- NULL);
++ page = read_mapping_page(mapping, blocknr + i, NULL);
+ /* synchronous error? */
+ if (IS_ERR(page))
+ page = NULL;
+diff --git a/fs/exec.c b/fs/exec.c
+index 95eef54de2b6..26bb91bf203b 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -26,6 +26,7 @@
+ #include <linux/file.h>
+ #include <linux/fdtable.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/stat.h>
+ #include <linux/fcntl.h>
+ #include <linux/swap.h>
+@@ -818,7 +819,7 @@ EXPORT_SYMBOL(read_code);
+ static int exec_mmap(struct mm_struct *mm)
+ {
+ struct task_struct *tsk;
+- struct mm_struct * old_mm, *active_mm;
++ struct mm_struct *old_mm, *active_mm;
+
+ /* Notify parent that we're no longer interested in the old VM */
+ tsk = current;
+@@ -844,6 +845,8 @@ static int exec_mmap(struct mm_struct *mm)
+ tsk->mm = mm;
+ tsk->active_mm = mm;
+ activate_mm(active_mm, mm);
++ tsk->mm->vmacache_seqnum = 0;
++ vmacache_flush(tsk);
+ task_unlock(tsk);
+ arch_pick_mmap_layout(mm);
+ if (old_mm) {
+diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
+index 242226a87be7..7620133f78bf 100644
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -1044,6 +1044,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+ * allocating. If we are looking at the buddy cache we would
+ * have taken a reference using ext4_mb_load_buddy and that
+ * would have pinned buddy page to page cache.
++ * The call to ext4_mb_get_buddy_page_lock will mark the
++ * page accessed.
+ */
+ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+ if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
+@@ -1062,7 +1064,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+ ret = -EIO;
+ goto err;
+ }
+- mark_page_accessed(page);
+
+ if (e4b.bd_buddy_page == NULL) {
+ /*
+@@ -1082,7 +1083,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+ ret = -EIO;
+ goto err;
+ }
+- mark_page_accessed(page);
+ err:
+ ext4_mb_put_buddy_page_lock(&e4b);
+ return ret;
+@@ -1141,7 +1141,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+
+ /* we could use find_or_create_page(), but it locks page
+ * what we'd like to avoid in fast path ... */
+- page = find_get_page(inode->i_mapping, pnum);
++ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
+ if (page == NULL || !PageUptodate(page)) {
+ if (page)
+ /*
+@@ -1172,15 +1172,16 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+ ret = -EIO;
+ goto err;
+ }
++
++ /* Pages marked accessed already */
+ e4b->bd_bitmap_page = page;
+ e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+- mark_page_accessed(page);
+
+ block++;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+
+- page = find_get_page(inode->i_mapping, pnum);
++ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
+ if (page == NULL || !PageUptodate(page)) {
+ if (page)
+ page_cache_release(page);
+@@ -1201,9 +1202,10 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+ ret = -EIO;
+ goto err;
+ }
++
++ /* Pages marked accessed already */
+ e4b->bd_buddy_page = page;
+ e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
+- mark_page_accessed(page);
+
+ BUG_ON(e4b->bd_bitmap_page == NULL);
+ BUG_ON(e4b->bd_buddy_page == NULL);
+diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
+index bb312201ca95..15a29af63e20 100644
+--- a/fs/f2fs/checkpoint.c
++++ b/fs/f2fs/checkpoint.c
+@@ -70,7 +70,6 @@ repeat:
+ goto repeat;
+ }
+ out:
+- mark_page_accessed(page);
+ return page;
+ }
+
+diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
+index 51ef27894433..d0335bdb65b4 100644
+--- a/fs/f2fs/node.c
++++ b/fs/f2fs/node.c
+@@ -970,7 +970,6 @@ repeat:
+ }
+ got_it:
+ BUG_ON(nid != nid_of_node(page));
+- mark_page_accessed(page);
+ return page;
+ }
+
+@@ -1026,7 +1025,6 @@ page_hit:
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
+- mark_page_accessed(page);
+ return page;
+ }
+
+diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
+index fa8cb4b7b8fe..fc8e4991736a 100644
+--- a/fs/fuse/dev.c
++++ b/fs/fuse/dev.c
+@@ -1613,7 +1613,7 @@ out_finish:
+
+ static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
+ {
+- release_pages(req->pages, req->num_pages, 0);
++ release_pages(req->pages, req->num_pages, false);
+ }
+
+ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+diff --git a/fs/fuse/file.c b/fs/fuse/file.c
+index 4598345ab87d..d08c108065e1 100644
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -985,13 +985,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
+- pagefault_disable();
+ tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
+- pagefault_enable();
+ flush_dcache_page(page);
+
+- mark_page_accessed(page);
+-
+ if (!tmp) {
+ unlock_page(page);
+ page_cache_release(page);
+diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
+index 1253c2006029..f3aee0bbe886 100644
+--- a/fs/gfs2/aops.c
++++ b/fs/gfs2/aops.c
+@@ -517,7 +517,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
+ p = kmap_atomic(page);
+ memcpy(buf + copied, p + offset, amt);
+ kunmap_atomic(p);
+- mark_page_accessed(page);
+ page_cache_release(page);
+ copied += amt;
+ index++;
+diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
+index 52f177be3bf8..89afe3a8f626 100644
+--- a/fs/gfs2/meta_io.c
++++ b/fs/gfs2/meta_io.c
+@@ -128,7 +128,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
+ yield();
+ }
+ } else {
+- page = find_lock_page(mapping, index);
++ page = find_get_page_flags(mapping, index,
++ FGP_LOCK|FGP_ACCESSED);
+ if (!page)
+ return NULL;
+ }
+@@ -145,7 +146,6 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
+ map_bh(bh, sdp->sd_vfs, blkno);
+
+ unlock_page(page);
+- mark_page_accessed(page);
+ page_cache_release(page);
+
+ return bh;
+diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
+index d19b30ababf1..a4a8ed56e438 100644
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -1017,6 +1017,11 @@ static int __init init_hugetlbfs_fs(void)
+ int error;
+ int i;
+
++ if (!hugepages_supported()) {
++ pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n");
++ return -ENOTSUPP;
++ }
++
+ error = bdi_init(&hugetlbfs_backing_dev_info);
+ if (error)
+ return error;
+diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
+index fe3c0527545f..91bf52d1a88c 100644
+--- a/fs/jffs2/fs.c
++++ b/fs/jffs2/fs.c
+@@ -682,7 +682,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
+ struct inode *inode = OFNI_EDONI_2SFFJ(f);
+ struct page *pg;
+
+- pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
++ pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
+ (void *)jffs2_do_readpage_unlock, inode);
+ if (IS_ERR(pg))
+ return (void *)pg;
+diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
+index e242bbf72972..fdb74cbb9e0c 100644
+--- a/fs/nfs/blocklayout/blocklayout.c
++++ b/fs/nfs/blocklayout/blocklayout.c
+@@ -1220,7 +1220,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
+ end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+ if (end != NFS_I(inode)->npages) {
+ rcu_read_lock();
+- end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
++ end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
+ rcu_read_unlock();
+ }
+
+diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
+index a27e3fecefaf..250ed5b20c8f 100644
+--- a/fs/ntfs/attrib.c
++++ b/fs/ntfs/attrib.c
+@@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
+ if (page) {
+ set_page_dirty(page);
+ unlock_page(page);
+- mark_page_accessed(page);
+ page_cache_release(page);
+ }
+ ntfs_debug("Done.");
+diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
+index ea4ba9daeb47..a0b2f345da2b 100644
+--- a/fs/ntfs/file.c
++++ b/fs/ntfs/file.c
+@@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
+ }
+ do {
+ unlock_page(pages[--do_pages]);
+- mark_page_accessed(pages[do_pages]);
+ page_cache_release(pages[do_pages]);
+ } while (do_pages);
+ if (unlikely(status))
+diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
+index ad4df869c907..7724fbdf443f 100644
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1,4 +1,5 @@
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/hugetlb.h>
+ #include <linux/huge_mm.h>
+ #include <linux/mount.h>
+@@ -159,7 +160,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
+
+ /*
+ * We remember last_addr rather than next_addr to hit with
+- * mmap_cache most of the time. We have zero last_addr at
++ * vmacache most of the time. We have zero last_addr at
+ * the beginning and also after lseek. We will have -1 last_addr
+ * after the end of the vmas.
+ */
+diff --git a/fs/super.c b/fs/super.c
+index d127de207376..fb68a4c90c98 100644
+--- a/fs/super.c
++++ b/fs/super.c
+@@ -112,9 +112,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
+
+ sb = container_of(shrink, struct super_block, s_shrink);
+
+- if (!grab_super_passive(sb))
+- return 0;
+-
++ /*
++ * Don't call grab_super_passive as it is a potential
++ * scalability bottleneck. The counts could get updated
++ * between super_cache_count and super_cache_scan anyway.
++ * Call to super_cache_count with shrinker_rwsem held
++ * ensures the safety of call to list_lru_count_node() and
++ * s_op->nr_cached_objects().
++ */
+ if (sb->s_op && sb->s_op->nr_cached_objects)
+ total_objects = sb->s_op->nr_cached_objects(sb,
+ sc->nid);
+@@ -125,7 +130,6 @@ static unsigned long super_cache_count(struct shrinker *shrink,
+ sc->nid);
+
+ total_objects = vfs_pressure_ratio(total_objects);
+- drop_super(sb);
+ return total_objects;
+ }
+
+@@ -321,10 +325,8 @@ void deactivate_locked_super(struct super_block *s)
+ struct file_system_type *fs = s->s_type;
+ if (atomic_dec_and_test(&s->s_active)) {
+ cleancache_invalidate_fs(s);
+- fs->kill_sb(s);
+-
+- /* caches are now gone, we can safely kill the shrinker now */
+ unregister_shrinker(&s->s_shrink);
++ fs->kill_sb(s);
+
+ put_filesystem(fs);
+ put_super(s);
+diff --git a/include/linux/compaction.h b/include/linux/compaction.h
+index 091d72e70d8a..01e3132820da 100644
+--- a/include/linux/compaction.h
++++ b/include/linux/compaction.h
+@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
+ extern int fragmentation_index(struct zone *zone, unsigned int order);
+ extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
+ int order, gfp_t gfp_mask, nodemask_t *mask,
+- bool sync, bool *contended);
++ enum migrate_mode mode, bool *contended);
+ extern void compact_pgdat(pg_data_t *pgdat, int order);
+ extern void reset_isolation_suitable(pg_data_t *pgdat);
+ extern unsigned long compaction_suitable(struct zone *zone, int order);
+@@ -62,6 +62,22 @@ static inline bool compaction_deferred(struct zone *zone, int order)
+ return zone->compact_considered < defer_limit;
+ }
+
++/*
++ * Update defer tracking counters after successful compaction of given order,
++ * which means an allocation either succeeded (alloc_success == true) or is
++ * expected to succeed.
++ */
++static inline void compaction_defer_reset(struct zone *zone, int order,
++ bool alloc_success)
++{
++ if (alloc_success) {
++ zone->compact_considered = 0;
++ zone->compact_defer_shift = 0;
++ }
++ if (order >= zone->compact_order_failed)
++ zone->compact_order_failed = order + 1;
++}
++
+ /* Returns true if restarting compaction after many failures */
+ static inline bool compaction_restarting(struct zone *zone, int order)
+ {
+@@ -75,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
+ #else
+ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
+ int order, gfp_t gfp_mask, nodemask_t *nodemask,
+- bool sync, bool *contended)
++ enum migrate_mode mode, bool *contended)
+ {
+ return COMPACT_CONTINUE;
+ }
+diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
+index cc1b01cf2035..a7ebb89ae9fb 100644
+--- a/include/linux/cpuset.h
++++ b/include/linux/cpuset.h
+@@ -12,10 +12,31 @@
+ #include <linux/cpumask.h>
+ #include <linux/nodemask.h>
+ #include <linux/mm.h>
++#include <linux/jump_label.h>
+
+ #ifdef CONFIG_CPUSETS
+
+-extern int number_of_cpusets; /* How many cpusets are defined in system? */
++extern struct static_key cpusets_enabled_key;
++static inline bool cpusets_enabled(void)
++{
++ return static_key_false(&cpusets_enabled_key);
++}
++
++static inline int nr_cpusets(void)
++{
++ /* jump label reference count + the top-level cpuset */
++ return static_key_count(&cpusets_enabled_key) + 1;
++}
++
++static inline void cpuset_inc(void)
++{
++ static_key_slow_inc(&cpusets_enabled_key);
++}
++
++static inline void cpuset_dec(void)
++{
++ static_key_slow_dec(&cpusets_enabled_key);
++}
+
+ extern int cpuset_init(void);
+ extern void cpuset_init_smp(void);
+@@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask);
+
+ static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+ {
+- return number_of_cpusets <= 1 ||
++ return nr_cpusets() <= 1 ||
+ __cpuset_node_allowed_softwall(node, gfp_mask);
+ }
+
+ static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
+ {
+- return number_of_cpusets <= 1 ||
++ return nr_cpusets() <= 1 ||
+ __cpuset_node_allowed_hardwall(node, gfp_mask);
+ }
+
+@@ -87,25 +108,26 @@ extern void rebuild_sched_domains(void);
+ extern void cpuset_print_task_mems_allowed(struct task_struct *p);
+
+ /*
+- * get_mems_allowed is required when making decisions involving mems_allowed
+- * such as during page allocation. mems_allowed can be updated in parallel
+- * and depending on the new value an operation can fail potentially causing
+- * process failure. A retry loop with get_mems_allowed and put_mems_allowed
+- * prevents these artificial failures.
++ * read_mems_allowed_begin is required when making decisions involving
++ * mems_allowed such as during page allocation. mems_allowed can be updated in
++ * parallel and depending on the new value an operation can fail potentially
++ * causing process failure. A retry loop with read_mems_allowed_begin and
++ * read_mems_allowed_retry prevents these artificial failures.
+ */
+-static inline unsigned int get_mems_allowed(void)
++static inline unsigned int read_mems_allowed_begin(void)
+ {
+ return read_seqcount_begin(&current->mems_allowed_seq);
+ }
+
+ /*
+- * If this returns false, the operation that took place after get_mems_allowed
+- * may have failed. It is up to the caller to retry the operation if
++ * If this returns true, the operation that took place after
++ * read_mems_allowed_begin may have failed artificially due to a concurrent
++ * update of mems_allowed. It is up to the caller to retry the operation if
+ * appropriate.
+ */
+-static inline bool put_mems_allowed(unsigned int seq)
++static inline bool read_mems_allowed_retry(unsigned int seq)
+ {
+- return !read_seqcount_retry(&current->mems_allowed_seq, seq);
++ return read_seqcount_retry(&current->mems_allowed_seq, seq);
+ }
+
+ static inline void set_mems_allowed(nodemask_t nodemask)
+@@ -119,6 +141,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
+
+ #else /* !CONFIG_CPUSETS */
+
++static inline bool cpusets_enabled(void) { return false; }
++
+ static inline int cpuset_init(void) { return 0; }
+ static inline void cpuset_init_smp(void) {}
+
+@@ -221,14 +245,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
+ {
+ }
+
+-static inline unsigned int get_mems_allowed(void)
++static inline unsigned int read_mems_allowed_begin(void)
+ {
+ return 0;
+ }
+
+-static inline bool put_mems_allowed(unsigned int seq)
++static inline bool read_mems_allowed_retry(unsigned int seq)
+ {
+- return true;
++ return false;
+ }
+
+ #endif /* !CONFIG_CPUSETS */
+diff --git a/include/linux/gfp.h b/include/linux/gfp.h
+index 9b4dd491f7e8..fa7ac989ff56 100644
+--- a/include/linux/gfp.h
++++ b/include/linux/gfp.h
+@@ -364,8 +364,8 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
+
+ extern void __free_pages(struct page *page, unsigned int order);
+ extern void free_pages(unsigned long addr, unsigned int order);
+-extern void free_hot_cold_page(struct page *page, int cold);
+-extern void free_hot_cold_page_list(struct list_head *list, int cold);
++extern void free_hot_cold_page(struct page *page, bool cold);
++extern void free_hot_cold_page_list(struct list_head *list, bool cold);
+
+ extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
+ extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
+diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
+index a291552ab767..aac671be9581 100644
+--- a/include/linux/huge_mm.h
++++ b/include/linux/huge_mm.h
+@@ -92,10 +92,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
+ #endif /* CONFIG_DEBUG_VM */
+
+ extern unsigned long transparent_hugepage_flags;
+-extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+- pmd_t *dst_pmd, pmd_t *src_pmd,
+- struct vm_area_struct *vma,
+- unsigned long addr, unsigned long end);
+ extern int split_huge_page_to_list(struct page *page, struct list_head *list);
+ static inline int split_huge_page(struct page *page)
+ {
+diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
+index 5214ff63c351..511b1a0d6cc2 100644
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -396,6 +396,16 @@ static inline int hugepage_migration_support(struct hstate *h)
+ #endif
+ }
+
++static inline bool hugepages_supported(void)
++{
++ /*
++ * Some platform decide whether they support huge pages at boot
++ * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
++ * there is no such support
++ */
++ return HPAGE_SHIFT != 0;
++}
++
+ #else /* CONFIG_HUGETLB_PAGE */
+ struct hstate {};
+ #define alloc_huge_page_node(h, nid) NULL
+diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
+index a5079072da66..9216e465289a 100644
+--- a/include/linux/jump_label.h
++++ b/include/linux/jump_label.h
+@@ -62,6 +62,10 @@ struct static_key {
+
+ # include <asm/jump_label.h>
+ # define HAVE_JUMP_LABEL
++#else
++struct static_key {
++ atomic_t enabled;
++};
+ #endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */
+
+ enum jump_label_type {
+@@ -72,6 +76,12 @@ enum jump_label_type {
+ struct module;
+
+ #include <linux/atomic.h>
++
++static inline int static_key_count(struct static_key *key)
++{
++ return atomic_read(&key->enabled);
++}
++
+ #ifdef HAVE_JUMP_LABEL
+
+ #define JUMP_LABEL_TRUE_BRANCH 1UL
+@@ -122,24 +132,20 @@ extern void jump_label_apply_nops(struct module *mod);
+
+ #else /* !HAVE_JUMP_LABEL */
+
+-struct static_key {
+- atomic_t enabled;
+-};
+-
+ static __always_inline void jump_label_init(void)
+ {
+ }
+
+ static __always_inline bool static_key_false(struct static_key *key)
+ {
+- if (unlikely(atomic_read(&key->enabled)) > 0)
++ if (unlikely(static_key_count(key) > 0))
+ return true;
+ return false;
+ }
+
+ static __always_inline bool static_key_true(struct static_key *key)
+ {
+- if (likely(atomic_read(&key->enabled)) > 0)
++ if (likely(static_key_count(key) > 0))
+ return true;
+ return false;
+ }
+@@ -179,7 +185,7 @@ static inline int jump_label_apply_nops(struct module *mod)
+
+ static inline bool static_key_enabled(struct static_key *key)
+ {
+- return (atomic_read(&key->enabled) > 0);
++ return static_key_count(key) > 0;
+ }
+
+ #endif /* _LINUX_JUMP_LABEL_H */
+diff --git a/include/linux/migrate.h b/include/linux/migrate.h
+index ee8b14ae4f3f..449905ebcab3 100644
+--- a/include/linux/migrate.h
++++ b/include/linux/migrate.h
+@@ -5,7 +5,9 @@
+ #include <linux/mempolicy.h>
+ #include <linux/migrate_mode.h>
+
+-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
++typedef struct page *new_page_t(struct page *page, unsigned long private,
++ int **reason);
++typedef void free_page_t(struct page *page, unsigned long private);
+
+ /*
+ * Return values from addresss_space_operations.migratepage():
+@@ -39,7 +41,7 @@ extern void putback_lru_pages(struct list_head *l);
+ extern void putback_movable_pages(struct list_head *l);
+ extern int migrate_page(struct address_space *,
+ struct page *, struct page *, enum migrate_mode);
+-extern int migrate_pages(struct list_head *l, new_page_t x,
++extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
+ unsigned long private, enum migrate_mode mode, int reason);
+
+ extern int fail_migrate_page(struct address_space *,
+@@ -61,8 +63,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
+
+ static inline void putback_lru_pages(struct list_head *l) {}
+ static inline void putback_movable_pages(struct list_head *l) {}
+-static inline int migrate_pages(struct list_head *l, new_page_t x,
+- unsigned long private, enum migrate_mode mode, int reason)
++static inline int migrate_pages(struct list_head *l, new_page_t new,
++ free_page_t free, unsigned long private, enum migrate_mode mode,
++ int reason)
+ { return -ENOSYS; }
+
+ static inline int migrate_prep(void) { return -ENOSYS; }
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index 073734339583..2b3a5330dcf2 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -919,6 +919,14 @@ extern void show_free_areas(unsigned int flags);
+ extern bool skip_free_areas_node(unsigned int flags, int nid);
+
+ int shmem_zero_setup(struct vm_area_struct *);
++#ifdef CONFIG_SHMEM
++bool shmem_mapping(struct address_space *mapping);
++#else
++static inline bool shmem_mapping(struct address_space *mapping)
++{
++ return false;
++}
++#endif
+
+ extern int can_do_mlock(void);
+ extern int user_shm_lock(size_t, struct user_struct *);
+@@ -1623,9 +1631,6 @@ void page_cache_async_readahead(struct address_space *mapping,
+ unsigned long size);
+
+ unsigned long max_sane_readahead(unsigned long nr);
+-unsigned long ra_submit(struct file_ra_state *ra,
+- struct address_space *mapping,
+- struct file *filp);
+
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+ extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
+diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
+index 8e082f18fb6a..b8131e7d6eda 100644
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -324,9 +324,9 @@ struct mm_rss_stat {
+
+ struct kioctx_table;
+ struct mm_struct {
+- struct vm_area_struct * mmap; /* list of VMAs */
++ struct vm_area_struct *mmap; /* list of VMAs */
+ struct rb_root mm_rb;
+- struct vm_area_struct * mmap_cache; /* last find_vma result */
++ u32 vmacache_seqnum; /* per-thread vmacache */
+ #ifdef CONFIG_MMU
+ unsigned long (*get_unmapped_area) (struct file *filp,
+ unsigned long addr, unsigned long len,
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index 56482904a676..450f19c5c865 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -78,10 +78,15 @@ extern int page_group_by_mobility_disabled;
+ #define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
+ #define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)
+
+-static inline int get_pageblock_migratetype(struct page *page)
++#define get_pageblock_migratetype(page) \
++ get_pfnblock_flags_mask(page, page_to_pfn(page), \
++ PB_migrate_end, MIGRATETYPE_MASK)
++
++static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+ {
+ BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
+- return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK);
++ return get_pfnblock_flags_mask(page, pfn, PB_migrate_end,
++ MIGRATETYPE_MASK);
+ }
+
+ struct free_area {
+@@ -138,6 +143,7 @@ enum zone_stat_item {
+ NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
+ NR_DIRTIED, /* page dirtyings since bootup */
+ NR_WRITTEN, /* page writings since bootup */
++ NR_PAGES_SCANNED, /* pages scanned since last reclaim */
+ #ifdef CONFIG_NUMA
+ NUMA_HIT, /* allocated in intended node */
+ NUMA_MISS, /* allocated in non intended node */
+@@ -316,19 +322,12 @@ enum zone_type {
+ #ifndef __GENERATING_BOUNDS_H
+
+ struct zone {
+- /* Fields commonly accessed by the page allocator */
++ /* Read-mostly fields */
+
+ /* zone watermarks, access with *_wmark_pages(zone) macros */
+ unsigned long watermark[NR_WMARK];
+
+ /*
+- * When free pages are below this point, additional steps are taken
+- * when reading the number of free pages to avoid per-cpu counter
+- * drift allowing watermarks to be breached
+- */
+- unsigned long percpu_drift_mark;
+-
+- /*
+ * We don't know if the memory that we're going to allocate will be freeable
+ * or/and it will be released eventually, so to avoid totally wasting several
+ * GB of ram we must reserve some of the lower zone memory (otherwise we risk
+@@ -336,40 +335,26 @@ struct zone {
+ * on the higher zones). This array is recalculated at runtime if the
+ * sysctl_lowmem_reserve_ratio sysctl changes.
+ */
+- unsigned long lowmem_reserve[MAX_NR_ZONES];
+-
+- /*
+- * This is a per-zone reserve of pages that should not be
+- * considered dirtyable memory.
+- */
+- unsigned long dirty_balance_reserve;
++ long lowmem_reserve[MAX_NR_ZONES];
+
+ #ifdef CONFIG_NUMA
+ int node;
++#endif
++
+ /*
+- * zone reclaim becomes active if more unmapped pages exist.
++ * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
++ * this zone's LRU. Maintained by the pageout code.
+ */
+- unsigned long min_unmapped_pages;
+- unsigned long min_slab_pages;
+-#endif
++ unsigned int inactive_ratio;
++
++ struct pglist_data *zone_pgdat;
+ struct per_cpu_pageset __percpu *pageset;
++
+ /*
+- * free areas of different sizes
++ * This is a per-zone reserve of pages that should not be
++ * considered dirtyable memory.
+ */
+- spinlock_t lock;
+-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+- /* Set to true when the PG_migrate_skip bits should be cleared */
+- bool compact_blockskip_flush;
+-
+- /* pfns where compaction scanners should start */
+- unsigned long compact_cached_free_pfn;
+- unsigned long compact_cached_migrate_pfn;
+-#endif
+-#ifdef CONFIG_MEMORY_HOTPLUG
+- /* see spanned/present_pages for more description */
+- seqlock_t span_seqlock;
+-#endif
+- struct free_area free_area[MAX_ORDER];
++ unsigned long dirty_balance_reserve;
+
+ #ifndef CONFIG_SPARSEMEM
+ /*
+@@ -379,71 +364,14 @@ struct zone {
+ unsigned long *pageblock_flags;
+ #endif /* CONFIG_SPARSEMEM */
+
+-#ifdef CONFIG_COMPACTION
+- /*
+- * On compaction failure, 1<<compact_defer_shift compactions
+- * are skipped before trying again. The number attempted since
+- * last failure is tracked with compact_considered.
+- */
+- unsigned int compact_considered;
+- unsigned int compact_defer_shift;
+- int compact_order_failed;
+-#endif
+-
+- ZONE_PADDING(_pad1_)
+-
+- /* Fields commonly accessed by the page reclaim scanner */
+- spinlock_t lru_lock;
+- struct lruvec lruvec;
+-
+- unsigned long pages_scanned; /* since last reclaim */
+- unsigned long flags; /* zone flags, see below */
+-
+- /* Zone statistics */
+- atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+-
+- /*
+- * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+- * this zone's LRU. Maintained by the pageout code.
+- */
+- unsigned int inactive_ratio;
+-
+-
+- ZONE_PADDING(_pad2_)
+- /* Rarely used or read-mostly fields */
+-
++#ifdef CONFIG_NUMA
+ /*
+- * wait_table -- the array holding the hash table
+- * wait_table_hash_nr_entries -- the size of the hash table array
+- * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
+- *
+- * The purpose of all these is to keep track of the people
+- * waiting for a page to become available and make them
+- * runnable again when possible. The trouble is that this
+- * consumes a lot of space, especially when so few things
+- * wait on pages at a given time. So instead of using
+- * per-page waitqueues, we use a waitqueue hash table.
+- *
+- * The bucket discipline is to sleep on the same queue when
+- * colliding and wake all in that wait queue when removing.
+- * When something wakes, it must check to be sure its page is
+- * truly available, a la thundering herd. The cost of a
+- * collision is great, but given the expected load of the
+- * table, they should be so rare as to be outweighed by the
+- * benefits from the saved space.
+- *
+- * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
+- * primary users of these fields, and in mm/page_alloc.c
+- * free_area_init_core() performs the initialization of them.
++ * zone reclaim becomes active if more unmapped pages exist.
+ */
+- wait_queue_head_t * wait_table;
+- unsigned long wait_table_hash_nr_entries;
+- unsigned long wait_table_bits;
++ unsigned long min_unmapped_pages;
++ unsigned long min_slab_pages;
++#endif /* CONFIG_NUMA */
+
+- /*
+- * Discontig memory support fields.
+- */
+- struct pglist_data *zone_pgdat;
+ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
+ unsigned long zone_start_pfn;
+
+@@ -489,14 +417,103 @@ struct zone {
+ * adjust_managed_page_count() should be used instead of directly
+ * touching zone->managed_pages and totalram_pages.
+ */
++ unsigned long managed_pages;
+ unsigned long spanned_pages;
+ unsigned long present_pages;
+- unsigned long managed_pages;
++
++ const char *name;
+
+ /*
+- * rarely used fields:
++ * Number of MIGRATE_RESEVE page block. To maintain for just
++ * optimization. Protected by zone->lock.
+ */
+- const char *name;
++ int nr_migrate_reserve_block;
++
++#ifdef CONFIG_MEMORY_HOTPLUG
++ /* see spanned/present_pages for more description */
++ seqlock_t span_seqlock;
++#endif
++
++ /*
++ * wait_table -- the array holding the hash table
++ * wait_table_hash_nr_entries -- the size of the hash table array
++ * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
++ *
++ * The purpose of all these is to keep track of the people
++ * waiting for a page to become available and make them
++ * runnable again when possible. The trouble is that this
++ * consumes a lot of space, especially when so few things
++ * wait on pages at a given time. So instead of using
++ * per-page waitqueues, we use a waitqueue hash table.
++ *
++ * The bucket discipline is to sleep on the same queue when
++ * colliding and wake all in that wait queue when removing.
++ * When something wakes, it must check to be sure its page is
++ * truly available, a la thundering herd. The cost of a
++ * collision is great, but given the expected load of the
++ * table, they should be so rare as to be outweighed by the
++ * benefits from the saved space.
++ *
++ * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
++ * primary users of these fields, and in mm/page_alloc.c
++ * free_area_init_core() performs the initialization of them.
++ */
++ wait_queue_head_t *wait_table;
++ unsigned long wait_table_hash_nr_entries;
++ unsigned long wait_table_bits;
++
++ ZONE_PADDING(_pad1_)
++
++ /* Write-intensive fields used from the page allocator */
++ spinlock_t lock;
++
++ /* free areas of different sizes */
++ struct free_area free_area[MAX_ORDER];
++
++ /* zone flags, see below */
++ unsigned long flags;
++
++ ZONE_PADDING(_pad2_)
++
++ /* Write-intensive fields used by page reclaim */
++
++ /* Fields commonly accessed by the page reclaim scanner */
++ spinlock_t lru_lock;
++ struct lruvec lruvec;
++
++ /*
++ * When free pages are below this point, additional steps are taken
++ * when reading the number of free pages to avoid per-cpu counter
++ * drift allowing watermarks to be breached
++ */
++ unsigned long percpu_drift_mark;
++
++#if defined CONFIG_COMPACTION || defined CONFIG_CMA
++ /* pfn where compaction free scanner should start */
++ unsigned long compact_cached_free_pfn;
++ /* pfn where async and sync compaction migration scanner should start */
++ unsigned long compact_cached_migrate_pfn[2];
++#endif
++
++#ifdef CONFIG_COMPACTION
++ /*
++ * On compaction failure, 1<<compact_defer_shift compactions
++ * are skipped before trying again. The number attempted since
++ * last failure is tracked with compact_considered.
++ */
++ unsigned int compact_considered;
++ unsigned int compact_defer_shift;
++ int compact_order_failed;
++#endif
++
++#if defined CONFIG_COMPACTION || defined CONFIG_CMA
++ /* Set to true when the PG_migrate_skip bits should be cleared */
++ bool compact_blockskip_flush;
++#endif
++
++ ZONE_PADDING(_pad3_)
++ /* Zone statistics */
++ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+ } ____cacheline_internodealigned_in_smp;
+
+ typedef enum {
+@@ -512,6 +529,7 @@ typedef enum {
+ ZONE_WRITEBACK, /* reclaim scanning has recently found
+ * many pages under writeback
+ */
++ ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
+ } zone_flags_t;
+
+ static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
+@@ -549,6 +567,11 @@ static inline int zone_is_reclaim_locked(const struct zone *zone)
+ return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+ }
+
++static inline int zone_is_fair_depleted(const struct zone *zone)
++{
++ return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
++}
++
+ static inline int zone_is_oom_locked(const struct zone *zone)
+ {
+ return test_bit(ZONE_OOM_LOCKED, &zone->flags);
+@@ -803,10 +826,10 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
+ extern struct mutex zonelists_mutex;
+ void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
+ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
+-bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+- int classzone_idx, int alloc_flags);
+-bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+- int classzone_idx, int alloc_flags);
++bool zone_watermark_ok(struct zone *z, unsigned int order,
++ unsigned long mark, int classzone_idx, int alloc_flags);
++bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
++ unsigned long mark, int classzone_idx, int alloc_flags);
+ enum memmap_context {
+ MEMMAP_EARLY,
+ MEMMAP_HOTPLUG,
+diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
+index dd7d45b5c496..2284ea62c6cc 100644
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -198,6 +198,7 @@ struct page; /* forward declaration */
+ TESTPAGEFLAG(Locked, locked)
+ PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
+ PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
++ __SETPAGEFLAG(Referenced, referenced)
+ PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
+ PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
+ PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
+@@ -208,6 +209,7 @@ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
+ PAGEFLAG(SavePinned, savepinned); /* Xen */
+ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
+ PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
++ __SETPAGEFLAG(SwapBacked, swapbacked)
+
+ __PAGEFLAG(SlobFree, slob_free)
+
+@@ -228,9 +230,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
+ TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
+ PAGEFLAG(MappedToDisk, mappedtodisk)
+
+-/* PG_readahead is only used for file reads; PG_reclaim is only for writes */
++/* PG_readahead is only used for reads; PG_reclaim is only for writes */
+ PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
+-PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */
++PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
+
+ #ifdef CONFIG_HIGHMEM
+ /*
+diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
+index c08730c10c7a..2baeee12f48e 100644
+--- a/include/linux/pageblock-flags.h
++++ b/include/linux/pageblock-flags.h
+@@ -65,33 +65,26 @@ extern int pageblock_order;
+ /* Forward declaration */
+ struct page;
+
+-unsigned long get_pageblock_flags_mask(struct page *page,
++unsigned long get_pfnblock_flags_mask(struct page *page,
++ unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask);
+-void set_pageblock_flags_mask(struct page *page,
++
++void set_pfnblock_flags_mask(struct page *page,
+ unsigned long flags,
++ unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask);
+
+ /* Declarations for getting and setting flags. See mm/page_alloc.c */
+-static inline unsigned long get_pageblock_flags_group(struct page *page,
+- int start_bitidx, int end_bitidx)
+-{
+- unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
+- unsigned long mask = (1 << nr_flag_bits) - 1;
+-
+- return get_pageblock_flags_mask(page, end_bitidx, mask);
+-}
+-
+-static inline void set_pageblock_flags_group(struct page *page,
+- unsigned long flags,
+- int start_bitidx, int end_bitidx)
+-{
+- unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
+- unsigned long mask = (1 << nr_flag_bits) - 1;
+-
+- set_pageblock_flags_mask(page, flags, end_bitidx, mask);
+-}
++#define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \
++ get_pfnblock_flags_mask(page, page_to_pfn(page), \
++ end_bitidx, \
++ (1 << (end_bitidx - start_bitidx + 1)) - 1)
++#define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \
++ set_pfnblock_flags_mask(page, flags, page_to_pfn(page), \
++ end_bitidx, \
++ (1 << (end_bitidx - start_bitidx + 1)) - 1)
+
+ #ifdef CONFIG_COMPACTION
+ #define get_pageblock_skip(page) \
+diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
+index e3dea75a078b..d57a02a9747b 100644
+--- a/include/linux/pagemap.h
++++ b/include/linux/pagemap.h
+@@ -99,7 +99,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
+
+ #define page_cache_get(page) get_page(page)
+ #define page_cache_release(page) put_page(page)
+-void release_pages(struct page **pages, int nr, int cold);
++void release_pages(struct page **pages, int nr, bool cold);
+
+ /*
+ * speculatively take a reference to a page.
+@@ -243,12 +243,117 @@ static inline struct page *page_cache_alloc_readahead(struct address_space *x)
+
+ typedef int filler_t(void *, struct page *);
+
+-extern struct page * find_get_page(struct address_space *mapping,
+- pgoff_t index);
+-extern struct page * find_lock_page(struct address_space *mapping,
+- pgoff_t index);
+-extern struct page * find_or_create_page(struct address_space *mapping,
+- pgoff_t index, gfp_t gfp_mask);
++pgoff_t page_cache_next_hole(struct address_space *mapping,
++ pgoff_t index, unsigned long max_scan);
++pgoff_t page_cache_prev_hole(struct address_space *mapping,
++ pgoff_t index, unsigned long max_scan);
++
++#define FGP_ACCESSED 0x00000001
++#define FGP_LOCK 0x00000002
++#define FGP_CREAT 0x00000004
++#define FGP_WRITE 0x00000008
++#define FGP_NOFS 0x00000010
++#define FGP_NOWAIT 0x00000020
++
++struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
++ int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask);
++
++/**
++ * find_get_page - find and get a page reference
++ * @mapping: the address_space to search
++ * @offset: the page index
++ *
++ * Looks up the page cache slot at @mapping & @offset. If there is a
++ * page cache page, it is returned with an increased refcount.
++ *
++ * Otherwise, %NULL is returned.
++ */
++static inline struct page *find_get_page(struct address_space *mapping,
++ pgoff_t offset)
++{
++ return pagecache_get_page(mapping, offset, 0, 0, 0);
++}
++
++static inline struct page *find_get_page_flags(struct address_space *mapping,
++ pgoff_t offset, int fgp_flags)
++{
++ return pagecache_get_page(mapping, offset, fgp_flags, 0, 0);
++}
++
++/**
++ * find_lock_page - locate, pin and lock a pagecache page
++ * pagecache_get_page - find and get a page reference
++ * @mapping: the address_space to search
++ * @offset: the page index
++ *
++ * Looks up the page cache slot at @mapping & @offset. If there is a
++ * page cache page, it is returned locked and with an increased
++ * refcount.
++ *
++ * Otherwise, %NULL is returned.
++ *
++ * find_lock_page() may sleep.
++ */
++static inline struct page *find_lock_page(struct address_space *mapping,
++ pgoff_t offset)
++{
++ return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0);
++}
++
++/**
++ * find_or_create_page - locate or add a pagecache page
++ * @mapping: the page's address_space
++ * @index: the page's index into the mapping
++ * @gfp_mask: page allocation mode
++ *
++ * Looks up the page cache slot at @mapping & @offset. If there is a
++ * page cache page, it is returned locked and with an increased
++ * refcount.
++ *
++ * If the page is not present, a new page is allocated using @gfp_mask
++ * and added to the page cache and the VM's LRU list. The page is
++ * returned locked and with an increased refcount.
++ *
++ * On memory exhaustion, %NULL is returned.
++ *
++ * find_or_create_page() may sleep, even if @gfp_flags specifies an
++ * atomic allocation!
++ */
++static inline struct page *find_or_create_page(struct address_space *mapping,
++ pgoff_t offset, gfp_t gfp_mask)
++{
++ return pagecache_get_page(mapping, offset,
++ FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
++ gfp_mask, gfp_mask & GFP_RECLAIM_MASK);
++}
++
++/**
++ * grab_cache_page_nowait - returns locked page at given index in given cache
++ * @mapping: target address_space
++ * @index: the page index
++ *
++ * Same as grab_cache_page(), but do not wait if the page is unavailable.
++ * This is intended for speculative data generators, where the data can
++ * be regenerated if the page couldn't be grabbed. This routine should
++ * be safe to call while holding the lock for another page.
++ *
++ * Clear __GFP_FS when allocating the page to avoid recursion into the fs
++ * and deadlock against the caller's locked page.
++ */
++static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
++ pgoff_t index)
++{
++ return pagecache_get_page(mapping, index,
++ FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
++ mapping_gfp_mask(mapping),
++ GFP_NOFS);
++}
++
++struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
++struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
++unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
++ unsigned int nr_entries, struct page **entries,
++ pgoff_t *indices);
+ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
+ unsigned int nr_pages, struct page **pages);
+ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
+@@ -268,10 +373,6 @@ static inline struct page *grab_cache_page(struct address_space *mapping,
+ return find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
+ }
+
+-extern struct page * grab_cache_page_nowait(struct address_space *mapping,
+- pgoff_t index);
+-extern struct page * read_cache_page_async(struct address_space *mapping,
+- pgoff_t index, filler_t *filler, void *data);
+ extern struct page * read_cache_page(struct address_space *mapping,
+ pgoff_t index, filler_t *filler, void *data);
+ extern struct page * read_cache_page_gfp(struct address_space *mapping,
+@@ -279,14 +380,6 @@ extern struct page * read_cache_page_gfp(struct address_space *mapping,
+ extern int read_cache_pages(struct address_space *mapping,
+ struct list_head *pages, filler_t *filler, void *data);
+
+-static inline struct page *read_mapping_page_async(
+- struct address_space *mapping,
+- pgoff_t index, void *data)
+-{
+- filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+- return read_cache_page_async(mapping, index, filler, data);
+-}
+-
+ static inline struct page *read_mapping_page(struct address_space *mapping,
+ pgoff_t index, void *data)
+ {
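The hunk above turns find_get_page()/find_lock_page()/find_or_create_page() into thin wrappers around pagecache_get_page() selected by FGP_* flags. A minimal usage sketch (the caller example_touch_page() is hypothetical, not part of this patch):

#include <linux/pagemap.h>

/* Sketch: look up the page at @index, creating it if absent. */
static int example_touch_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	/* Expands to pagecache_get_page() with FGP_LOCK|FGP_ACCESSED|FGP_CREAT. */
	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
	if (!page)
		return -ENOMEM;

	/* The page comes back locked and with an elevated refcount. */
	/* ... initialise or validate page contents here ... */

	unlock_page(page);
	page_cache_release(page);
	return 0;
}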
+diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
+index e4dbfab37729..b45d391b4540 100644
+--- a/include/linux/pagevec.h
++++ b/include/linux/pagevec.h
+@@ -22,6 +22,11 @@ struct pagevec {
+
+ void __pagevec_release(struct pagevec *pvec);
+ void __pagevec_lru_add(struct pagevec *pvec);
++unsigned pagevec_lookup_entries(struct pagevec *pvec,
++ struct address_space *mapping,
++ pgoff_t start, unsigned nr_entries,
++ pgoff_t *indices);
++void pagevec_remove_exceptionals(struct pagevec *pvec);
+ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+ pgoff_t start, unsigned nr_pages);
+ unsigned pagevec_lookup_tag(struct pagevec *pvec,
+diff --git a/include/linux/plist.h b/include/linux/plist.h
+index aa0fb390bd29..8b6c970cff6c 100644
+--- a/include/linux/plist.h
++++ b/include/linux/plist.h
+@@ -98,6 +98,13 @@ struct plist_node {
+ }
+
+ /**
++ * PLIST_HEAD - declare and init plist_head
++ * @head: name for struct plist_head variable
++ */
++#define PLIST_HEAD(head) \
++ struct plist_head head = PLIST_HEAD_INIT(head)
++
++/**
+ * PLIST_NODE_INIT - static struct plist_node initializer
+ * @node: struct plist_node variable name
+ * @__prio: initial node priority
+@@ -134,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio)
+ extern void plist_add(struct plist_node *node, struct plist_head *head);
+ extern void plist_del(struct plist_node *node, struct plist_head *head);
+
++extern void plist_requeue(struct plist_node *node, struct plist_head *head);
++
+ /**
+ * plist_for_each - iterate over the plist
+ * @pos: the type * to use as a loop counter
+@@ -143,6 +152,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
+ list_for_each_entry(pos, &(head)->node_list, node_list)
+
+ /**
++ * plist_for_each_continue - continue iteration over the plist
++ * @pos: the type * to use as a loop cursor
++ * @head: the head for your list
++ *
++ * Continue to iterate over plist, continuing after the current position.
++ */
++#define plist_for_each_continue(pos, head) \
++ list_for_each_entry_continue(pos, &(head)->node_list, node_list)
++
++/**
+ * plist_for_each_safe - iterate safely over a plist of given type
+ * @pos: the type * to use as a loop counter
+ * @n: another type * to use as temporary storage
+@@ -163,6 +182,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
+ list_for_each_entry(pos, &(head)->node_list, mem.node_list)
+
+ /**
++ * plist_for_each_entry_continue - continue iteration over list of given type
++ * @pos: the type * to use as a loop cursor
++ * @head: the head for your list
++ * @m: the name of the list_struct within the struct
++ *
++ * Continue to iterate over list of given type, continuing after
++ * the current position.
++ */
++#define plist_for_each_entry_continue(pos, head, m) \
++ list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)
++
++/**
+ * plist_for_each_entry_safe - iterate safely over list of given type
+ * @pos: the type * to use as a loop counter
+ * @n: another type * to use as temporary storage
+@@ -229,6 +260,20 @@ static inline int plist_node_empty(const struct plist_node *node)
+ #endif
+
+ /**
++ * plist_next - get the next entry in list
++ * @pos: the type * to cursor
++ */
++#define plist_next(pos) \
++ list_next_entry(pos, node_list)
++
++/**
++ * plist_prev - get the prev entry in list
++ * @pos: the type * to cursor
++ */
++#define plist_prev(pos) \
++ list_prev_entry(pos, node_list)
++
++/**
+ * plist_first - return the first node (and thus, highest priority)
+ * @head: the &struct plist_head pointer
+ *
+diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
+index 403940787be1..e8be53ecfc45 100644
+--- a/include/linux/radix-tree.h
++++ b/include/linux/radix-tree.h
+@@ -219,6 +219,7 @@ static inline void radix_tree_replace_slot(void **pslot, void *item)
+ int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+ void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
+ void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
++void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
+ void *radix_tree_delete(struct radix_tree_root *, unsigned long);
+ unsigned int
+ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+@@ -226,10 +227,6 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
+ void ***results, unsigned long *indices,
+ unsigned long first_index, unsigned int max_items);
+-unsigned long radix_tree_next_hole(struct radix_tree_root *root,
+- unsigned long index, unsigned long max_scan);
+-unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
+- unsigned long index, unsigned long max_scan);
+ int radix_tree_preload(gfp_t gfp_mask);
+ int radix_tree_maybe_preload(gfp_t gfp_mask);
+ void radix_tree_init(void);
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 0827bec7d82f..cb67b4e2dba2 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -63,6 +63,10 @@ struct fs_struct;
+ struct perf_event_context;
+ struct blk_plug;
+
++#define VMACACHE_BITS 2
++#define VMACACHE_SIZE (1U << VMACACHE_BITS)
++#define VMACACHE_MASK (VMACACHE_SIZE - 1)
++
+ /*
+ * List of flags we want to share for kernel threads,
+ * if only because they are not used by them anyway.
+@@ -1093,6 +1097,9 @@ struct task_struct {
+ #ifdef CONFIG_COMPAT_BRK
+ unsigned brk_randomized:1;
+ #endif
++ /* per-thread vma caching */
++ u32 vmacache_seqnum;
++ struct vm_area_struct *vmacache[VMACACHE_SIZE];
+ #if defined(SPLIT_RSS_COUNTING)
+ struct task_rss_stat rss_stat;
+ #endif
+diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
+index 30aa0dc60d75..deb49609cd36 100644
+--- a/include/linux/shmem_fs.h
++++ b/include/linux/shmem_fs.h
+@@ -49,6 +49,7 @@ extern struct file *shmem_file_setup(const char *name,
+ loff_t size, unsigned long flags);
+ extern int shmem_zero_setup(struct vm_area_struct *);
+ extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
++extern bool shmem_mapping(struct address_space *mapping);
+ extern void shmem_unlock_mapping(struct address_space *mapping);
+ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp_mask);
+diff --git a/include/linux/swap.h b/include/linux/swap.h
+index 46ba0c6c219f..241bf0922770 100644
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -214,8 +214,9 @@ struct percpu_cluster {
+ struct swap_info_struct {
+ unsigned long flags; /* SWP_USED etc: see above */
+ signed short prio; /* swap priority of this type */
++ struct plist_node list; /* entry in swap_active_head */
++ struct plist_node avail_list; /* entry in swap_avail_head */
+ signed char type; /* strange name for an index */
+- signed char next; /* next type on the swap list */
+ unsigned int max; /* extent of the swap_map */
+ unsigned char *swap_map; /* vmalloc'ed array of usage counts */
+ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
+@@ -255,11 +256,6 @@ struct swap_info_struct {
+ struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
+ };
+
+-struct swap_list_t {
+- int head; /* head of priority-ordered swapfile list */
+- int next; /* swapfile to be used next */
+-};
+-
+ /* linux/mm/page_alloc.c */
+ extern unsigned long totalram_pages;
+ extern unsigned long totalreserve_pages;
+@@ -272,12 +268,14 @@ extern unsigned long nr_free_pagecache_pages(void);
+
+
+ /* linux/mm/swap.c */
+-extern void __lru_cache_add(struct page *);
+ extern void lru_cache_add(struct page *);
++extern void lru_cache_add_anon(struct page *page);
++extern void lru_cache_add_file(struct page *page);
+ extern void lru_add_page_tail(struct page *page, struct page *page_tail,
+ struct lruvec *lruvec, struct list_head *head);
+ extern void activate_page(struct page *);
+ extern void mark_page_accessed(struct page *);
++extern void init_page_accessed(struct page *page);
+ extern void lru_add_drain(void);
+ extern void lru_add_drain_cpu(int cpu);
+ extern void lru_add_drain_all(void);
+@@ -287,22 +285,6 @@ extern void swap_setup(void);
+
+ extern void add_page_to_unevictable_list(struct page *page);
+
+-/**
+- * lru_cache_add: add a page to the page lists
+- * @page: the page to add
+- */
+-static inline void lru_cache_add_anon(struct page *page)
+-{
+- ClearPageActive(page);
+- __lru_cache_add(page);
+-}
+-
+-static inline void lru_cache_add_file(struct page *page)
+-{
+- ClearPageActive(page);
+- __lru_cache_add(page);
+-}
+-
+ /* linux/mm/vmscan.c */
+ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
+ gfp_t gfp_mask, nodemask_t *mask);
+@@ -460,7 +442,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
+ #define free_page_and_swap_cache(page) \
+ page_cache_release(page)
+ #define free_pages_and_swap_cache(pages, nr) \
+- release_pages((pages), (nr), 0);
++ release_pages((pages), (nr), false);
+
+ static inline void show_swap_cache_info(void)
+ {
+diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
+index e282624e8c10..388293a91e8c 100644
+--- a/include/linux/swapfile.h
++++ b/include/linux/swapfile.h
+@@ -6,7 +6,7 @@
+ * want to expose them to the dozens of source files that include swap.h
+ */
+ extern spinlock_t swap_lock;
+-extern struct swap_list_t swap_list;
++extern struct plist_head swap_active_head;
+ extern struct swap_info_struct *swap_info[];
+ extern int try_to_unuse(unsigned int, bool, unsigned long);
+
+diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
+index c557c6d096de..3a712e2e7d76 100644
+--- a/include/linux/vm_event_item.h
++++ b/include/linux/vm_event_item.h
+@@ -71,12 +71,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
+ THP_ZERO_PAGE_ALLOC,
+ THP_ZERO_PAGE_ALLOC_FAILED,
+ #endif
++#ifdef CONFIG_DEBUG_TLBFLUSH
+ #ifdef CONFIG_SMP
+ NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */
+ NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */
+-#endif
++#endif /* CONFIG_SMP */
+ NR_TLB_LOCAL_FLUSH_ALL,
+ NR_TLB_LOCAL_FLUSH_ONE,
++#endif /* CONFIG_DEBUG_TLBFLUSH */
+ NR_VM_EVENT_ITEMS
+ };
+
+diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
+new file mode 100644
+index 000000000000..c3fa0fd43949
+--- /dev/null
++++ b/include/linux/vmacache.h
+@@ -0,0 +1,38 @@
++#ifndef __LINUX_VMACACHE_H
++#define __LINUX_VMACACHE_H
++
++#include <linux/sched.h>
++#include <linux/mm.h>
++
++/*
++ * Hash based on the page number. Provides a good hit rate for
++ * workloads with good locality and those with random accesses as well.
++ */
++#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK)
++
++static inline void vmacache_flush(struct task_struct *tsk)
++{
++ memset(tsk->vmacache, 0, sizeof(tsk->vmacache));
++}
++
++extern void vmacache_flush_all(struct mm_struct *mm);
++extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
++extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
++ unsigned long addr);
++
++#ifndef CONFIG_MMU
++extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
++ unsigned long start,
++ unsigned long end);
++#endif
++
++static inline void vmacache_invalidate(struct mm_struct *mm)
++{
++ mm->vmacache_seqnum++;
++
++ /* deal with overflows */
++ if (unlikely(mm->vmacache_seqnum == 0))
++ vmacache_flush_all(mm);
++}
++
++#endif /* __LINUX_VMACACHE_H */
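For orientation only (the real callers are patched in mm/mmap.c and mm/nommu.c elsewhere in this series), the intended lookup/update pattern is roughly as below; example_lookup() and its @fallback argument are hypothetical stand-ins for find_vma() and its rbtree walk:

#include <linux/mm.h>
#include <linux/vmacache.h>

static struct vm_area_struct *example_lookup(struct mm_struct *mm,
					     unsigned long addr,
					     struct vm_area_struct *fallback)
{
	/* Fast path: per-thread cache, validated against mm->vmacache_seqnum. */
	struct vm_area_struct *vma = vmacache_find(mm, addr);

	if (!vma) {
		vma = fallback;		/* result of the rbtree walk in real code */
		if (vma)
			vmacache_update(addr, vma);
	}
	return vma;
}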
+diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
+index a67b38415768..67ce70c8279b 100644
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -83,6 +83,14 @@ static inline void vm_events_fold_cpu(int cpu)
+ #define count_vm_numa_events(x, y) do { (void)(y); } while (0)
+ #endif /* CONFIG_NUMA_BALANCING */
+
++#ifdef CONFIG_DEBUG_TLBFLUSH
++#define count_vm_tlb_event(x) count_vm_event(x)
++#define count_vm_tlb_events(x, y) count_vm_events(x, y)
++#else
++#define count_vm_tlb_event(x) do {} while (0)
++#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
++#endif
++
+ #define __count_zone_vm_events(item, zone, delta) \
+ __count_vm_events(item##_NORMAL - ZONE_NORMAL + \
+ zone_idx(zone), delta)
+diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
+index fde1b3e94c7d..c6814b917bdf 100644
+--- a/include/trace/events/compaction.h
++++ b/include/trace/events/compaction.h
+@@ -5,6 +5,7 @@
+ #define _TRACE_COMPACTION_H
+
+ #include <linux/types.h>
++#include <linux/list.h>
+ #include <linux/tracepoint.h>
+ #include <trace/events/gfpflags.h>
+
+@@ -47,10 +48,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
+
+ TRACE_EVENT(mm_compaction_migratepages,
+
+- TP_PROTO(unsigned long nr_migrated,
+- unsigned long nr_failed),
++ TP_PROTO(unsigned long nr_all,
++ int migrate_rc,
++ struct list_head *migratepages),
+
+- TP_ARGS(nr_migrated, nr_failed),
++ TP_ARGS(nr_all, migrate_rc, migratepages),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, nr_migrated)
+@@ -58,7 +60,22 @@ TRACE_EVENT(mm_compaction_migratepages,
+ ),
+
+ TP_fast_assign(
+- __entry->nr_migrated = nr_migrated;
++ unsigned long nr_failed = 0;
++ struct list_head *page_lru;
++
++ /*
++ * migrate_pages() returns either a non-negative number
++ * with the number of pages that failed migration, or an
++ * error code, in which case we need to count the remaining
++ * pages manually
++ */
++ if (migrate_rc >= 0)
++ nr_failed = migrate_rc;
++ else
++ list_for_each(page_lru, migratepages)
++ nr_failed++;
++
++ __entry->nr_migrated = nr_all - nr_failed;
+ __entry->nr_failed = nr_failed;
+ ),
+
+@@ -67,6 +84,48 @@ TRACE_EVENT(mm_compaction_migratepages,
+ __entry->nr_failed)
+ );
+
++TRACE_EVENT(mm_compaction_begin,
++ TP_PROTO(unsigned long zone_start, unsigned long migrate_start,
++ unsigned long free_start, unsigned long zone_end),
++
++ TP_ARGS(zone_start, migrate_start, free_start, zone_end),
++
++ TP_STRUCT__entry(
++ __field(unsigned long, zone_start)
++ __field(unsigned long, migrate_start)
++ __field(unsigned long, free_start)
++ __field(unsigned long, zone_end)
++ ),
++
++ TP_fast_assign(
++ __entry->zone_start = zone_start;
++ __entry->migrate_start = migrate_start;
++ __entry->free_start = free_start;
++ __entry->zone_end = zone_end;
++ ),
++
++ TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu",
++ __entry->zone_start,
++ __entry->migrate_start,
++ __entry->free_start,
++ __entry->zone_end)
++);
++
++TRACE_EVENT(mm_compaction_end,
++ TP_PROTO(int status),
++
++ TP_ARGS(status),
++
++ TP_STRUCT__entry(
++ __field(int, status)
++ ),
++
++ TP_fast_assign(
++ __entry->status = status;
++ ),
++
++ TP_printk("status=%d", __entry->status)
++);
+
+ #endif /* _TRACE_COMPACTION_H */
+
+diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
+index d0c613476620..aece1346ceb7 100644
+--- a/include/trace/events/kmem.h
++++ b/include/trace/events/kmem.h
+@@ -267,14 +267,12 @@ DEFINE_EVENT_PRINT(mm_page, mm_page_pcpu_drain,
+ TRACE_EVENT(mm_page_alloc_extfrag,
+
+ TP_PROTO(struct page *page,
+- int alloc_order, int fallback_order,
+- int alloc_migratetype, int fallback_migratetype,
+- int change_ownership),
++ int alloc_order, int fallback_order,
++ int alloc_migratetype, int fallback_migratetype, int new_migratetype),
+
+ TP_ARGS(page,
+ alloc_order, fallback_order,
+- alloc_migratetype, fallback_migratetype,
+- change_ownership),
++ alloc_migratetype, fallback_migratetype, new_migratetype),
+
+ TP_STRUCT__entry(
+ __field( struct page *, page )
+@@ -291,7 +289,7 @@ TRACE_EVENT(mm_page_alloc_extfrag,
+ __entry->fallback_order = fallback_order;
+ __entry->alloc_migratetype = alloc_migratetype;
+ __entry->fallback_migratetype = fallback_migratetype;
+- __entry->change_ownership = change_ownership;
++ __entry->change_ownership = (new_migratetype == alloc_migratetype);
+ ),
+
+ TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
+diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h
+index 1c9fabde69e4..ce0803b8d05f 100644
+--- a/include/trace/events/pagemap.h
++++ b/include/trace/events/pagemap.h
+@@ -28,12 +28,10 @@ TRACE_EVENT(mm_lru_insertion,
+
+ TP_PROTO(
+ struct page *page,
+- unsigned long pfn,
+- int lru,
+- unsigned long flags
++ int lru
+ ),
+
+- TP_ARGS(page, pfn, lru, flags),
++ TP_ARGS(page, lru),
+
+ TP_STRUCT__entry(
+ __field(struct page *, page )
+@@ -44,9 +42,9 @@ TRACE_EVENT(mm_lru_insertion,
+
+ TP_fast_assign(
+ __entry->page = page;
+- __entry->pfn = pfn;
++ __entry->pfn = page_to_pfn(page);
+ __entry->lru = lru;
+- __entry->flags = flags;
++ __entry->flags = trace_pagemap_flags(page);
+ ),
+
+ /* Flag format is based on page-types.c formatting for pagemap */
+@@ -64,9 +62,9 @@ TRACE_EVENT(mm_lru_insertion,
+
+ TRACE_EVENT(mm_lru_activate,
+
+- TP_PROTO(struct page *page, unsigned long pfn),
++ TP_PROTO(struct page *page),
+
+- TP_ARGS(page, pfn),
++ TP_ARGS(page),
+
+ TP_STRUCT__entry(
+ __field(struct page *, page )
+@@ -75,7 +73,7 @@ TRACE_EVENT(mm_lru_activate,
+
+ TP_fast_assign(
+ __entry->page = page;
+- __entry->pfn = pfn;
++ __entry->pfn = page_to_pfn(page);
+ ),
+
+ /* Flag format is based on page-types.c formatting for pagemap */
+diff --git a/kernel/cpuset.c b/kernel/cpuset.c
+index 0b29c52479a6..c8289138cad4 100644
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -61,12 +61,7 @@
+ #include <linux/cgroup.h>
+ #include <linux/wait.h>
+
+-/*
+- * Tracks how many cpusets are currently defined in system.
+- * When there is only one cpuset (the root cpuset) we can
+- * short circuit some hooks.
+- */
+-int number_of_cpusets __read_mostly;
++struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
+
+ /* See "Frequency meter" comments, below. */
+
+@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
+ goto done;
+ }
+
+- csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
++ csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
+ if (!csa)
+ goto done;
+ csn = 0;
+@@ -1022,7 +1017,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
+ task_lock(tsk);
+ /*
+ * Determine if a loop is necessary if another thread is doing
+- * get_mems_allowed(). If at least one node remains unchanged and
++ * read_mems_allowed_begin(). If at least one node remains unchanged and
+ * tsk does not have a mempolicy, then an empty nodemask will not be
+ * possible when mems_allowed is larger than a word.
+ */
+@@ -1986,7 +1981,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
+- number_of_cpusets++;
++ cpuset_inc();
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ goto out_unlock;
+@@ -2037,7 +2032,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
+ if (is_sched_load_balance(cs))
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+
+- number_of_cpusets--;
++ cpuset_dec();
+ clear_bit(CS_ONLINE, &cs->flags);
+
+ mutex_unlock(&cpuset_mutex);
+@@ -2092,7 +2087,6 @@ int __init cpuset_init(void)
+ if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
+ BUG();
+
+- number_of_cpusets = 1;
+ return 0;
+ }
+
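The static_key conversion above relies on helpers in include/linux/cpuset.h that this hunk does not show; their presumed shape (an assumption inferred from the cpuset_inc(), cpuset_dec() and nr_cpusets() call sites, not copied from the patch) is along these lines:

#include <linux/jump_label.h>

extern struct static_key cpusets_enabled_key;

/* Assumed header-side helpers; only the call sites appear in this hunk. */
static inline bool cpusets_enabled(void)
{
	return static_key_false(&cpusets_enabled_key);
}

static inline void cpuset_inc(void)
{
	static_key_slow_inc(&cpusets_enabled_key);
}

static inline void cpuset_dec(void)
{
	static_key_slow_dec(&cpusets_enabled_key);
}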
+diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
+index 0506d447aed2..e911ec662d03 100644
+--- a/kernel/debug/debug_core.c
++++ b/kernel/debug/debug_core.c
+@@ -49,6 +49,7 @@
+ #include <linux/pid.h>
+ #include <linux/smp.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/rcupdate.h>
+
+ #include <asm/cacheflush.h>
+@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
+ if (!CACHE_FLUSH_IS_SAFE)
+ return;
+
+- if (current->mm && current->mm->mmap_cache) {
+- flush_cache_range(current->mm->mmap_cache,
+- addr, addr + BREAK_INSTR_SIZE);
++ if (current->mm) {
++ int i;
++
++ for (i = 0; i < VMACACHE_SIZE; i++) {
++ if (!current->vmacache[i])
++ continue;
++ flush_cache_range(current->vmacache[i],
++ addr, addr + BREAK_INSTR_SIZE);
++ }
+ }
++
+ /* Force flush instruction cache if it was outside the mm */
+ flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
+ }
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 143962949bed..29a1b0283d3b 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -28,6 +28,8 @@
+ #include <linux/mman.h>
+ #include <linux/mmu_notifier.h>
+ #include <linux/fs.h>
++#include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/nsproxy.h>
+ #include <linux/capability.h>
+ #include <linux/cpu.h>
+@@ -363,7 +365,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+
+ mm->locked_vm = 0;
+ mm->mmap = NULL;
+- mm->mmap_cache = NULL;
++ mm->vmacache_seqnum = 0;
+ mm->map_count = 0;
+ cpumask_clear(mm_cpumask(mm));
+ mm->mm_rb = RB_ROOT;
+@@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
+ if (!oldmm)
+ return 0;
+
++ /* initialize the new vmacache entries */
++ vmacache_flush(tsk);
++
+ if (clone_flags & CLONE_VM) {
+ atomic_inc(&oldmm->mm_users);
+ mm = oldmm;
+diff --git a/lib/plist.c b/lib/plist.c
+index 1ebc95f7a46f..0f2084d30798 100644
+--- a/lib/plist.c
++++ b/lib/plist.c
+@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head)
+ plist_check_head(head);
+ }
+
++/**
++ * plist_requeue - Requeue @node at end of same-prio entries.
++ * @node: &struct plist_node pointer - entry to be moved
++ * @head: &struct plist_head pointer - list head
++ *
++ * This is essentially an optimized plist_del() followed by
++ * plist_add(). It moves an entry already in the plist to
++ * after any other same-priority entries.
++ */
++void plist_requeue(struct plist_node *node, struct plist_head *head)
++{
++ struct plist_node *iter;
++ struct list_head *node_next = &head->node_list;
++
++ plist_check_head(head);
++ BUG_ON(plist_head_empty(head));
++ BUG_ON(plist_node_empty(node));
++
++ if (node == plist_last(head))
++ return;
++
++ iter = plist_next(node);
++
++ if (node->prio != iter->prio)
++ return;
++
++ plist_del(node, head);
++
++ plist_for_each_continue(iter, head) {
++ if (node->prio != iter->prio) {
++ node_next = &iter->node_list;
++ break;
++ }
++ }
++ list_add_tail(&node->node_list, node_next);
++
++ plist_check_head(head);
++}
++
+ #ifdef CONFIG_DEBUG_PI_LIST
+ #include <linux/sched.h>
+ #include <linux/module.h>
+@@ -170,6 +210,14 @@ static void __init plist_test_check(int nr_expect)
+ BUG_ON(prio_pos->prio_list.next != &first->prio_list);
+ }
+
++static void __init plist_test_requeue(struct plist_node *node)
++{
++ plist_requeue(node, &test_head);
++
++ if (node != plist_last(&test_head))
++ BUG_ON(node->prio == plist_next(node)->prio);
++}
++
+ static int __init plist_test(void)
+ {
+ int nr_expect = 0, i, loop;
+@@ -193,6 +241,10 @@ static int __init plist_test(void)
+ nr_expect--;
+ }
+ plist_test_check(nr_expect);
++ if (!plist_node_empty(test_node + i)) {
++ plist_test_requeue(test_node + i);
++ plist_test_check(nr_expect);
++ }
+ }
+
+ for (i = 0; i < ARRAY_SIZE(test_node); i++) {
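A minimal sketch of the plist_requeue() semantics (hypothetical nodes, not taken from the patch): among entries of equal priority, the requeued node is moved behind its peers, which is how the swap code round-robins between equal-priority swap devices.

#include <linux/plist.h>

static PLIST_HEAD(example_head);	/* uses the new PLIST_HEAD() macro */
static struct plist_node a, b;

static void example_requeue(void)
{
	plist_node_init(&a, 1);
	plist_node_init(&b, 1);

	plist_add(&a, &example_head);		/* list order: a, b */
	plist_add(&b, &example_head);

	plist_requeue(&a, &example_head);	/* list order is now: b, a */
}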
+diff --git a/lib/radix-tree.c b/lib/radix-tree.c
+index 7811ed3b4e70..e8adb5d8a184 100644
+--- a/lib/radix-tree.c
++++ b/lib/radix-tree.c
+@@ -946,81 +946,6 @@ next:
+ }
+ EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
+
+-
+-/**
+- * radix_tree_next_hole - find the next hole (not-present entry)
+- * @root: tree root
+- * @index: index key
+- * @max_scan: maximum range to search
+- *
+- * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
+- * indexed hole.
+- *
+- * Returns: the index of the hole if found, otherwise returns an index
+- * outside of the set specified (in which case 'return - index >= max_scan'
+- * will be true). In rare cases of index wrap-around, 0 will be returned.
+- *
+- * radix_tree_next_hole may be called under rcu_read_lock. However, like
+- * radix_tree_gang_lookup, this will not atomically search a snapshot of
+- * the tree at a single point in time. For example, if a hole is created
+- * at index 5, then subsequently a hole is created at index 10,
+- * radix_tree_next_hole covering both indexes may return 10 if called
+- * under rcu_read_lock.
+- */
+-unsigned long radix_tree_next_hole(struct radix_tree_root *root,
+- unsigned long index, unsigned long max_scan)
+-{
+- unsigned long i;
+-
+- for (i = 0; i < max_scan; i++) {
+- if (!radix_tree_lookup(root, index))
+- break;
+- index++;
+- if (index == 0)
+- break;
+- }
+-
+- return index;
+-}
+-EXPORT_SYMBOL(radix_tree_next_hole);
+-
+-/**
+- * radix_tree_prev_hole - find the prev hole (not-present entry)
+- * @root: tree root
+- * @index: index key
+- * @max_scan: maximum range to search
+- *
+- * Search backwards in the range [max(index-max_scan+1, 0), index]
+- * for the first hole.
+- *
+- * Returns: the index of the hole if found, otherwise returns an index
+- * outside of the set specified (in which case 'index - return >= max_scan'
+- * will be true). In rare cases of wrap-around, ULONG_MAX will be returned.
+- *
+- * radix_tree_next_hole may be called under rcu_read_lock. However, like
+- * radix_tree_gang_lookup, this will not atomically search a snapshot of
+- * the tree at a single point in time. For example, if a hole is created
+- * at index 10, then subsequently a hole is created at index 5,
+- * radix_tree_prev_hole covering both indexes may return 5 if called under
+- * rcu_read_lock.
+- */
+-unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
+- unsigned long index, unsigned long max_scan)
+-{
+- unsigned long i;
+-
+- for (i = 0; i < max_scan; i++) {
+- if (!radix_tree_lookup(root, index))
+- break;
+- index--;
+- if (index == ULONG_MAX)
+- break;
+- }
+-
+- return index;
+-}
+-EXPORT_SYMBOL(radix_tree_prev_hole);
+-
+ /**
+ * radix_tree_gang_lookup - perform multiple lookup on a radix tree
+ * @root: radix tree root
+@@ -1335,15 +1260,18 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
+ }
+
+ /**
+- * radix_tree_delete - delete an item from a radix tree
++ * radix_tree_delete_item - delete an item from a radix tree
+ * @root: radix tree root
+ * @index: index key
++ * @item: expected item
+ *
+- * Remove the item at @index from the radix tree rooted at @root.
++ * Remove @item at @index from the radix tree rooted at @root.
+ *
+- * Returns the address of the deleted item, or NULL if it was not present.
++ * Returns the address of the deleted item, or NULL if it was not present
++ * or the entry at the given @index was not @item.
+ */
+-void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
++void *radix_tree_delete_item(struct radix_tree_root *root,
++ unsigned long index, void *item)
+ {
+ struct radix_tree_node *node = NULL;
+ struct radix_tree_node *slot = NULL;
+@@ -1378,6 +1306,11 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+ if (slot == NULL)
+ goto out;
+
++ if (item && slot != item) {
++ slot = NULL;
++ goto out;
++ }
++
+ /*
+ * Clear all tags associated with the item to be deleted.
+ * This way of doing it would be inefficient, but seldom is any set.
+@@ -1422,6 +1355,21 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+ out:
+ return slot;
+ }
++EXPORT_SYMBOL(radix_tree_delete_item);
++
++/**
++ * radix_tree_delete - delete an item from a radix tree
++ * @root: radix tree root
++ * @index: index key
++ *
++ * Remove the item at @index from the radix tree rooted at @root.
++ *
++ * Returns the address of the deleted item, or NULL if it was not present.
++ */
++void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
++{
++ return radix_tree_delete_item(root, index, NULL);
++}
+ EXPORT_SYMBOL(radix_tree_delete);
+
+ /**
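A short sketch of the new conditional delete (hypothetical tree and item, not part of the patch): the slot is only cleared when it still holds the expected pointer, so a caller cannot accidentally remove an entry that was concurrently replaced.

#include <linux/radix-tree.h>
#include <linux/gfp.h>

static RADIX_TREE(example_tree, GFP_KERNEL);

static void example_conditional_delete(void *item)
{
	if (radix_tree_insert(&example_tree, 5, item))
		return;

	/* Removes index 5 only if it still holds @item; returns NULL otherwise. */
	radix_tree_delete_item(&example_tree, 5, item);

	/* radix_tree_delete() keeps the old unconditional behaviour. */
	radix_tree_delete(&example_tree, 5);
}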
+diff --git a/mm/Makefile b/mm/Makefile
+index 305d10acd081..fb51bc61d80a 100644
+--- a/mm/Makefile
++++ b/mm/Makefile
+@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
+ readahead.o swap.o truncate.o vmscan.o shmem.o \
+ util.o mmzone.o vmstat.o backing-dev.o \
+ mm_init.o mmu_context.o percpu.o slab_common.o \
+- compaction.o balloon_compaction.o \
++ compaction.o balloon_compaction.o vmacache.o \
+ interval_tree.o list_lru.o $(mmu-y)
+
+ obj-y += init-mm.o
+diff --git a/mm/compaction.c b/mm/compaction.c
+index 6441083e76d3..adb6d0560e96 100644
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
+ unsigned long end_pfn = zone_end_pfn(zone);
+ unsigned long pfn;
+
+- zone->compact_cached_migrate_pfn = start_pfn;
++ zone->compact_cached_migrate_pfn[0] = start_pfn;
++ zone->compact_cached_migrate_pfn[1] = start_pfn;
+ zone->compact_cached_free_pfn = end_pfn;
+ zone->compact_blockskip_flush = false;
+
+@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
+ */
+ static void update_pageblock_skip(struct compact_control *cc,
+ struct page *page, unsigned long nr_isolated,
+- bool migrate_scanner)
++ bool set_unsuitable, bool migrate_scanner)
+ {
+ struct zone *zone = cc->zone;
++ unsigned long pfn;
+
+ if (cc->ignore_skip_hint)
+ return;
+@@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc,
+ if (!page)
+ return;
+
+- if (!nr_isolated) {
+- unsigned long pfn = page_to_pfn(page);
++ if (nr_isolated)
++ return;
++
++ /*
++ * Only skip pageblocks when all forms of compaction will be known to
++ * fail in the near future.
++ */
++ if (set_unsuitable)
+ set_pageblock_skip(page);
+
+- /* Update where compaction should restart */
+- if (migrate_scanner) {
+- if (!cc->finished_update_migrate &&
+- pfn > zone->compact_cached_migrate_pfn)
+- zone->compact_cached_migrate_pfn = pfn;
+- } else {
+- if (!cc->finished_update_free &&
+- pfn < zone->compact_cached_free_pfn)
+- zone->compact_cached_free_pfn = pfn;
+- }
++ pfn = page_to_pfn(page);
++
++ /* Update where async and sync compaction should restart */
++ if (migrate_scanner) {
++ if (cc->finished_update_migrate)
++ return;
++ if (pfn > zone->compact_cached_migrate_pfn[0])
++ zone->compact_cached_migrate_pfn[0] = pfn;
++ if (cc->mode != MIGRATE_ASYNC &&
++ pfn > zone->compact_cached_migrate_pfn[1])
++ zone->compact_cached_migrate_pfn[1] = pfn;
++ } else {
++ if (cc->finished_update_free)
++ return;
++ if (pfn < zone->compact_cached_free_pfn)
++ zone->compact_cached_free_pfn = pfn;
+ }
+ }
+ #else
+@@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
+
+ static void update_pageblock_skip(struct compact_control *cc,
+ struct page *page, unsigned long nr_isolated,
+- bool migrate_scanner)
++ bool set_unsuitable, bool migrate_scanner)
+ {
+ }
+ #endif /* CONFIG_COMPACTION */
+@@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
+ }
+
+ /* async aborts if taking too long or contended */
+- if (!cc->sync) {
++ if (cc->mode == MIGRATE_ASYNC) {
+ cc->contended = true;
+ return false;
+ }
+@@ -208,30 +222,39 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
+ return true;
+ }
+
+-static inline bool compact_trylock_irqsave(spinlock_t *lock,
+- unsigned long *flags, struct compact_control *cc)
++/*
++ * Aside from avoiding lock contention, compaction also periodically checks
++ * need_resched() and either schedules in sync compaction or aborts async
++ * compaction. This is similar to what compact_checklock_irqsave() does, but
++ * is used where no lock is concerned.
++ *
++ * Returns false when no scheduling was needed, or sync compaction was scheduled.
++ * Returns true when async compaction should abort.
++ */
++static inline bool compact_should_abort(struct compact_control *cc)
+ {
+- return compact_checklock_irqsave(lock, flags, false, cc);
++ /* async compaction aborts if contended */
++ if (need_resched()) {
++ if (cc->mode == MIGRATE_ASYNC) {
++ cc->contended = true;
++ return true;
++ }
++
++ cond_resched();
++ }
++
++ return false;
+ }
+
+ /* Returns true if the page is within a block suitable for migration to */
+ static bool suitable_migration_target(struct page *page)
+ {
+- int migratetype = get_pageblock_migratetype(page);
+-
+- /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+- if (migratetype == MIGRATE_RESERVE)
+- return false;
+-
+- if (is_migrate_isolate(migratetype))
+- return false;
+-
+- /* If the page is a large free page, then allow migration */
++ /* If the page is a large free page, then disallow migration */
+ if (PageBuddy(page) && page_order(page) >= pageblock_order)
+- return true;
++ return false;
+
+ /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+- if (migrate_async_suitable(migratetype))
++ if (migrate_async_suitable(get_pageblock_migratetype(page)))
+ return true;
+
+ /* Otherwise skip the block */
+@@ -254,6 +277,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
+ struct page *cursor, *valid_page = NULL;
+ unsigned long flags;
+ bool locked = false;
++ bool checked_pageblock = false;
+
+ cursor = pfn_to_page(blockpfn);
+
+@@ -285,8 +309,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
+ break;
+
+ /* Recheck this is a suitable migration target under lock */
+- if (!strict && !suitable_migration_target(page))
+- break;
++ if (!strict && !checked_pageblock) {
++ /*
++ * We need to check suitability of pageblock only once
++ * and this isolate_freepages_block() is called with a
++ * pageblock range, so checking it once is sufficient.
++ */
++ checked_pageblock = true;
++ if (!suitable_migration_target(page))
++ break;
++ }
+
+ /* Recheck this is a buddy page under lock */
+ if (!PageBuddy(page))
+@@ -330,7 +362,8 @@ isolate_fail:
+
+ /* Update the pageblock-skip if the whole pageblock was scanned */
+ if (blockpfn == end_pfn)
+- update_pageblock_skip(cc, valid_page, total_isolated, false);
++ update_pageblock_skip(cc, valid_page, total_isolated, true,
++ false);
+
+ count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
+ if (total_isolated)
+@@ -461,11 +494,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ unsigned long last_pageblock_nr = 0, pageblock_nr;
+ unsigned long nr_scanned = 0, nr_isolated = 0;
+ struct list_head *migratelist = &cc->migratepages;
+- isolate_mode_t mode = 0;
+ struct lruvec *lruvec;
+ unsigned long flags;
+ bool locked = false;
+ struct page *page = NULL, *valid_page = NULL;
++ bool set_unsuitable = true;
++ const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
++ ISOLATE_ASYNC_MIGRATE : 0) |
++ (unevictable ? ISOLATE_UNEVICTABLE : 0);
+
+ /*
+ * Ensure that there are not too many pages isolated from the LRU
+@@ -474,7 +510,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ */
+ while (unlikely(too_many_isolated(zone))) {
+ /* async migration should just abort */
+- if (!cc->sync)
++ if (cc->mode == MIGRATE_ASYNC)
+ return 0;
+
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+@@ -483,11 +519,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ return 0;
+ }
+
++ if (compact_should_abort(cc))
++ return 0;
++
+ /* Time to isolate some pages for migration */
+- cond_resched();
+ for (; low_pfn < end_pfn; low_pfn++) {
+ /* give a chance to irqs before checking need_resched() */
+- if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
++ if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
+ if (should_release_lock(&zone->lru_lock)) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ locked = false;
+@@ -526,25 +564,31 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+
+ /* If isolation recently failed, do not retry */
+ pageblock_nr = low_pfn >> pageblock_order;
+- if (!isolation_suitable(cc, page))
+- goto next_pageblock;
++ if (last_pageblock_nr != pageblock_nr) {
++ int mt;
++
++ last_pageblock_nr = pageblock_nr;
++ if (!isolation_suitable(cc, page))
++ goto next_pageblock;
++
++ /*
++ * For async migration, also only scan in MOVABLE
++ * blocks. Async migration is optimistic to see if
++ * the minimum amount of work satisfies the allocation
++ */
++ mt = get_pageblock_migratetype(page);
++ if (cc->mode == MIGRATE_ASYNC &&
++ !migrate_async_suitable(mt)) {
++ set_unsuitable = false;
++ goto next_pageblock;
++ }
++ }
+
+ /* Skip if free */
+ if (PageBuddy(page))
+ continue;
+
+ /*
+- * For async migration, also only scan in MOVABLE blocks. Async
+- * migration is optimistic to see if the minimum amount of work
+- * satisfies the allocation
+- */
+- if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+- !migrate_async_suitable(get_pageblock_migratetype(page))) {
+- cc->finished_update_migrate = true;
+- goto next_pageblock;
+- }
+-
+- /*
+ * Check may be lockless but that's ok as we recheck later.
+ * It's possible to migrate LRU pages and balloon pages
+ * Skip any other type of page
+@@ -553,11 +597,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ if (unlikely(balloon_page_movable(page))) {
+ if (locked && balloon_page_isolate(page)) {
+ /* Successfully isolated */
+- cc->finished_update_migrate = true;
+- list_add(&page->lru, migratelist);
+- cc->nr_migratepages++;
+- nr_isolated++;
+- goto check_compact_cluster;
++ goto isolate_success;
+ }
+ }
+ continue;
+@@ -580,6 +620,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ continue;
+ }
+
++ /*
++ * Migration will fail if an anonymous page is pinned in memory,
++ * so avoid taking lru_lock and isolating it unnecessarily in an
++ * admittedly racy check.
++ */
++ if (!page_mapping(page) &&
++ page_count(page) > page_mapcount(page))
++ continue;
++
+ /* Check if it is ok to still hold the lock */
+ locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+ locked, cc);
+@@ -594,12 +643,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ continue;
+ }
+
+- if (!cc->sync)
+- mode |= ISOLATE_ASYNC_MIGRATE;
+-
+- if (unevictable)
+- mode |= ISOLATE_UNEVICTABLE;
+-
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+
+ /* Try isolate the page */
+@@ -609,13 +652,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ VM_BUG_ON(PageTransCompound(page));
+
+ /* Successfully isolated */
+- cc->finished_update_migrate = true;
+ del_page_from_lru_list(page, lruvec, page_lru(page));
++
++isolate_success:
++ cc->finished_update_migrate = true;
+ list_add(&page->lru, migratelist);
+ cc->nr_migratepages++;
+ nr_isolated++;
+
+-check_compact_cluster:
+ /* Avoid isolating too much */
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+ ++low_pfn;
+@@ -626,7 +670,6 @@ check_compact_cluster:
+
+ next_pageblock:
+ low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
+- last_pageblock_nr = pageblock_nr;
+ }
+
+ acct_isolated(zone, locked, cc);
+@@ -634,9 +677,13 @@ next_pageblock:
+ if (locked)
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+
+- /* Update the pageblock-skip if the whole pageblock was scanned */
++ /*
++ * Update the pageblock-skip information and cached scanner pfn,
++ * if the whole pageblock was scanned without isolating any page.
++ */
+ if (low_pfn == end_pfn)
+- update_pageblock_skip(cc, valid_page, nr_isolated, true);
++ update_pageblock_skip(cc, valid_page, nr_isolated,
++ set_unsuitable, true);
+
+ trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+
+@@ -657,7 +704,9 @@ static void isolate_freepages(struct zone *zone,
+ struct compact_control *cc)
+ {
+ struct page *page;
+- unsigned long high_pfn, low_pfn, pfn, z_end_pfn;
++ unsigned long block_start_pfn; /* start of current pageblock */
++ unsigned long block_end_pfn; /* end of current pageblock */
++ unsigned long low_pfn; /* lowest pfn scanner is able to scan */
+ int nr_freepages = cc->nr_freepages;
+ struct list_head *freelist = &cc->freepages;
+
+@@ -665,41 +714,38 @@ static void isolate_freepages(struct zone *zone,
+ * Initialise the free scanner. The starting point is where we last
+ * successfully isolated from, zone-cached value, or the end of the
+ * zone when isolating for the first time. We need this aligned to
+- * the pageblock boundary, because we do pfn -= pageblock_nr_pages
+- * in the for loop.
++ * the pageblock boundary, because we do
++ * block_start_pfn -= pageblock_nr_pages in the for loop.
++ * For ending point, take care when isolating in last pageblock of a
++ * zone which ends in the middle of a pageblock.
+ * The low boundary is the end of the pageblock the migration scanner
+ * is using.
+ */
+- pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
++ block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
++ block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
++ zone_end_pfn(zone));
+ low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
+
+ /*
+- * Take care that if the migration scanner is at the end of the zone
+- * that the free scanner does not accidentally move to the next zone
+- * in the next isolation cycle.
+- */
+- high_pfn = min(low_pfn, pfn);
+-
+- z_end_pfn = zone_end_pfn(zone);
+-
+- /*
+ * Isolate free pages until enough are available to migrate the
+ * pages on cc->migratepages. We stop searching if the migrate
+ * and free page scanners meet or enough free pages are isolated.
+ */
+- for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
+- pfn -= pageblock_nr_pages) {
++ for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
++ block_end_pfn = block_start_pfn,
++ block_start_pfn -= pageblock_nr_pages) {
+ unsigned long isolated;
+- unsigned long end_pfn;
+
+ /*
+ * This can iterate a massively long zone without finding any
+ * suitable migration targets, so periodically check if we need
+- * to schedule.
++ * to schedule, or even abort async compaction.
+ */
+- cond_resched();
++ if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
++ && compact_should_abort(cc))
++ break;
+
+- if (!pfn_valid(pfn))
++ if (!pfn_valid(block_start_pfn))
+ continue;
+
+ /*
+@@ -709,7 +755,7 @@ static void isolate_freepages(struct zone *zone,
+ * i.e. it's possible that all pages within a zones range of
+ * pages do not belong to a single zone.
+ */
+- page = pfn_to_page(pfn);
++ page = pfn_to_page(block_start_pfn);
+ if (page_zone(page) != zone)
+ continue;
+
+@@ -722,26 +768,26 @@ static void isolate_freepages(struct zone *zone,
+ continue;
+
+ /* Found a block suitable for isolating free pages from */
+- isolated = 0;
++ cc->free_pfn = block_start_pfn;
++ isolated = isolate_freepages_block(cc, block_start_pfn,
++ block_end_pfn, freelist, false);
++ nr_freepages += isolated;
+
+ /*
+- * Take care when isolating in last pageblock of a zone which
+- * ends in the middle of a pageblock.
++ * Set a flag that we successfully isolated in this pageblock.
++ * In the next loop iteration, zone->compact_cached_free_pfn
++ * will not be updated and thus it will effectively contain the
++ * highest pageblock we isolated pages from.
+ */
+- end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn);
+- isolated = isolate_freepages_block(cc, pfn, end_pfn,
+- freelist, false);
+- nr_freepages += isolated;
++ if (isolated)
++ cc->finished_update_free = true;
+
+ /*
+- * Record the highest PFN we isolated pages from. When next
+- * looking for free pages, the search will restart here as
+- * page migration may have returned some pages to the allocator
++ * isolate_freepages_block() might have aborted due to async
++ * compaction being contended
+ */
+- if (isolated) {
+- cc->finished_update_free = true;
+- high_pfn = max(high_pfn, pfn);
+- }
++ if (cc->contended)
++ break;
+ }
+
+ /* split_free_page does not map the pages */
+@@ -751,10 +797,9 @@ static void isolate_freepages(struct zone *zone,
+ * If we crossed the migrate scanner, we want to keep it that way
+ * so that compact_finished() may detect this
+ */
+- if (pfn < low_pfn)
+- cc->free_pfn = max(pfn, zone->zone_start_pfn);
+- else
+- cc->free_pfn = high_pfn;
++ if (block_start_pfn < low_pfn)
++ cc->free_pfn = cc->migrate_pfn;
++
+ cc->nr_freepages = nr_freepages;
+ }
+
+@@ -769,9 +814,13 @@ static struct page *compaction_alloc(struct page *migratepage,
+ struct compact_control *cc = (struct compact_control *)data;
+ struct page *freepage;
+
+- /* Isolate free pages if necessary */
++ /*
++ * Isolate free pages if necessary, and if we are not aborting due to
++ * contention.
++ */
+ if (list_empty(&cc->freepages)) {
+- isolate_freepages(cc->zone, cc);
++ if (!cc->contended)
++ isolate_freepages(cc->zone, cc);
+
+ if (list_empty(&cc->freepages))
+ return NULL;
+@@ -785,23 +834,16 @@ static struct page *compaction_alloc(struct page *migratepage,
+ }
+
+ /*
+- * We cannot control nr_migratepages and nr_freepages fully when migration is
+- * running as migrate_pages() has no knowledge of compact_control. When
+- * migration is complete, we count the number of pages on the lists by hand.
++ * This is a migrate-callback that "frees" freepages back to the isolated
++ * freelist. All pages on the freelist are from the same zone, so there is no
++ * special handling needed for NUMA.
+ */
+-static void update_nr_listpages(struct compact_control *cc)
++static void compaction_free(struct page *page, unsigned long data)
+ {
+- int nr_migratepages = 0;
+- int nr_freepages = 0;
+- struct page *page;
+-
+- list_for_each_entry(page, &cc->migratepages, lru)
+- nr_migratepages++;
+- list_for_each_entry(page, &cc->freepages, lru)
+- nr_freepages++;
++ struct compact_control *cc = (struct compact_control *)data;
+
+- cc->nr_migratepages = nr_migratepages;
+- cc->nr_freepages = nr_freepages;
++ list_add(&page->lru, &cc->freepages);
++ cc->nr_freepages++;
+ }
+
+ /* possible outcome of isolate_migratepages */
+@@ -848,11 +890,16 @@ static int compact_finished(struct zone *zone,
+ unsigned int order;
+ unsigned long watermark;
+
+- if (fatal_signal_pending(current))
++ if (cc->contended || fatal_signal_pending(current))
+ return COMPACT_PARTIAL;
+
+ /* Compaction run completes if the migrate and free scanner meet */
+ if (cc->free_pfn <= cc->migrate_pfn) {
++ /* Let the next compaction start anew. */
++ zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
++ zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
++ zone->compact_cached_free_pfn = zone_end_pfn(zone);
++
+ /*
+ * Mark that the PG_migrate_skip information should be cleared
+ * by kswapd when it goes to sleep. kswapd does not set the
+@@ -950,6 +997,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
+ int ret;
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
++ const bool sync = cc->mode != MIGRATE_ASYNC;
+
+ ret = compaction_suitable(zone, cc->order);
+ switch (ret) {
+@@ -975,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
+ * information on where the scanners should start but check that it
+ * is initialised by ensuring the values are within zone boundaries.
+ */
+- cc->migrate_pfn = zone->compact_cached_migrate_pfn;
++ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
+ cc->free_pfn = zone->compact_cached_free_pfn;
+ if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
+ cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+@@ -983,13 +1031,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
+ }
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+ cc->migrate_pfn = start_pfn;
+- zone->compact_cached_migrate_pfn = cc->migrate_pfn;
++ zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
++ zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+ }
+
++ trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
++
+ migrate_prep_local();
+
+ while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+- unsigned long nr_migrate, nr_remaining;
+ int err;
+
+ switch (isolate_migratepages(zone, cc)) {
+@@ -1004,21 +1054,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
+ ;
+ }
+
+- nr_migrate = cc->nr_migratepages;
++ if (!cc->nr_migratepages)
++ continue;
++
+ err = migrate_pages(&cc->migratepages, compaction_alloc,
+- (unsigned long)cc,
+- cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
++ compaction_free, (unsigned long)cc, cc->mode,
+ MR_COMPACTION);
+- update_nr_listpages(cc);
+- nr_remaining = cc->nr_migratepages;
+
+- trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+- nr_remaining);
++ trace_mm_compaction_migratepages(cc->nr_migratepages, err,
++ &cc->migratepages);
+
+- /* Release isolated pages not migrated */
++ /* All pages were either migrated or will be released */
++ cc->nr_migratepages = 0;
+ if (err) {
+ putback_movable_pages(&cc->migratepages);
+- cc->nr_migratepages = 0;
+ /*
+ * migrate_pages() may return -ENOMEM when scanners meet
+ * and we want compact_finished() to detect it
+@@ -1035,12 +1084,13 @@ out:
+ cc->nr_freepages -= release_freepages(&cc->freepages);
+ VM_BUG_ON(cc->nr_freepages != 0);
+
++ trace_mm_compaction_end(ret);
++
+ return ret;
+ }
+
+-static unsigned long compact_zone_order(struct zone *zone,
+- int order, gfp_t gfp_mask,
+- bool sync, bool *contended)
++static unsigned long compact_zone_order(struct zone *zone, int order,
++ gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
+ {
+ unsigned long ret;
+ struct compact_control cc = {
+@@ -1049,7 +1099,7 @@ static unsigned long compact_zone_order(struct zone *zone,
+ .order = order,
+ .migratetype = allocflags_to_migratetype(gfp_mask),
+ .zone = zone,
+- .sync = sync,
++ .mode = mode,
+ };
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
+@@ -1071,7 +1121,7 @@ int sysctl_extfrag_threshold = 500;
+ * @order: The order of the current allocation
+ * @gfp_mask: The GFP mask of the current allocation
+ * @nodemask: The allowed nodes to allocate from
+- * @sync: Whether migration is synchronous or not
++ * @mode: The migration mode for async, sync light, or sync migration
+ * @contended: Return value that is true if compaction was aborted due to lock contention
+ * @page: Optionally capture a free page of the requested order during compaction
+ *
+@@ -1079,7 +1129,7 @@ int sysctl_extfrag_threshold = 500;
+ */
+ unsigned long try_to_compact_pages(struct zonelist *zonelist,
+ int order, gfp_t gfp_mask, nodemask_t *nodemask,
+- bool sync, bool *contended)
++ enum migrate_mode mode, bool *contended)
+ {
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ int may_enter_fs = gfp_mask & __GFP_FS;
+@@ -1104,7 +1154,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
+ nodemask) {
+ int status;
+
+- status = compact_zone_order(zone, order, gfp_mask, sync,
++ status = compact_zone_order(zone, order, gfp_mask, mode,
+ contended);
+ rc = max(status, rc);
+
+@@ -1140,13 +1190,9 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
+ compact_zone(zone, cc);
+
+ if (cc->order > 0) {
+- int ok = zone_watermark_ok(zone, cc->order,
+- low_wmark_pages(zone), 0, 0);
+- if (ok && cc->order >= zone->compact_order_failed)
+- zone->compact_order_failed = cc->order + 1;
+- /* Currently async compaction is never deferred. */
+- else if (!ok && cc->sync)
+- defer_compaction(zone, cc->order);
++ if (zone_watermark_ok(zone, cc->order,
++ low_wmark_pages(zone), 0, 0))
++ compaction_defer_reset(zone, cc->order, false);
+ }
+
+ VM_BUG_ON(!list_empty(&cc->freepages));
+@@ -1158,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
+ {
+ struct compact_control cc = {
+ .order = order,
+- .sync = false,
++ .mode = MIGRATE_ASYNC,
+ };
+
+ if (!order)
+@@ -1171,7 +1217,8 @@ static void compact_node(int nid)
+ {
+ struct compact_control cc = {
+ .order = -1,
+- .sync = true,
++ .mode = MIGRATE_SYNC,
++ .ignore_skip_hint = true,
+ };
+
+ __compact_pgdat(NODE_DATA(nid), &cc);
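With the bool sync flag replaced by enum migrate_mode throughout, external callers now choose the mode explicitly; a hedged sketch of the expected call from the allocator side (which this hunk does not show):

#include <linux/compaction.h>
#include <linux/migrate_mode.h>
#include <linux/mmzone.h>

/* Sketch only: the real caller lives in mm/page_alloc.c. */
static unsigned long example_compact(struct zonelist *zonelist, int order,
				     gfp_t gfp_mask, nodemask_t *nodemask,
				     bool sync, bool *contended)
{
	enum migrate_mode mode = sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC;

	return try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
				    mode, contended);
}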
+diff --git a/mm/filemap.c b/mm/filemap.c
+index ae4846ff4849..b012daefc2d7 100644
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -192,9 +192,11 @@ static int filemap_check_errors(struct address_space *mapping)
+ {
+ int ret = 0;
+ /* Check for outstanding write errors */
+- if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
++ if (test_bit(AS_ENOSPC, &mapping->flags) &&
++ test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+ ret = -ENOSPC;
+- if (test_and_clear_bit(AS_EIO, &mapping->flags))
++ if (test_bit(AS_EIO, &mapping->flags) &&
++ test_and_clear_bit(AS_EIO, &mapping->flags))
+ ret = -EIO;
+ return ret;
+ }
+@@ -446,6 +448,29 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+ }
+ EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
++static int page_cache_tree_insert(struct address_space *mapping,
++ struct page *page)
++{
++ void **slot;
++ int error;
++
++ slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
++ if (slot) {
++ void *p;
++
++ p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
++ if (!radix_tree_exceptional_entry(p))
++ return -EEXIST;
++ radix_tree_replace_slot(slot, page);
++ mapping->nrpages++;
++ return 0;
++ }
++ error = radix_tree_insert(&mapping->page_tree, page->index, page);
++ if (!error)
++ mapping->nrpages++;
++ return error;
++}
++
+ /**
+ * add_to_page_cache_locked - add a locked page to the pagecache
+ * @page: page to add
+@@ -480,11 +505,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
+ page->index = offset;
+
+ spin_lock_irq(&mapping->tree_lock);
+- error = radix_tree_insert(&mapping->page_tree, offset, page);
++ error = page_cache_tree_insert(mapping, page);
+ radix_tree_preload_end();
+ if (unlikely(error))
+ goto err_insert;
+- mapping->nrpages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ spin_unlock_irq(&mapping->tree_lock);
+ trace_mm_filemap_add_to_page_cache(page);
+@@ -520,10 +544,10 @@ struct page *__page_cache_alloc(gfp_t gfp)
+ if (cpuset_do_page_mem_spread()) {
+ unsigned int cpuset_mems_cookie;
+ do {
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+ n = cpuset_mem_spread_node();
+ page = alloc_pages_exact_node(n, gfp, 0);
+- } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
++ } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
+
+ return page;
+ }
+@@ -620,8 +644,17 @@ EXPORT_SYMBOL(unlock_page);
+ */
+ void end_page_writeback(struct page *page)
+ {
+- if (TestClearPageReclaim(page))
++ /*
++ * TestClearPageReclaim could be used here but it is an atomic
++ * operation and overkill in this particular case. Failing to
++ * shuffle a page marked for immediate reclaim is too mild to
++ * justify taking an atomic operation penalty at the end of
++ * every page writeback.
++ */
++ if (PageReclaim(page)) {
++ ClearPageReclaim(page);
+ rotate_reclaimable_page(page);
++ }
+
+ if (!test_clear_page_writeback(page))
+ BUG();
+@@ -686,14 +719,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+ }
+
+ /**
+- * find_get_page - find and get a page reference
++ * page_cache_next_hole - find the next hole (not-present entry)
++ * @mapping: mapping
++ * @index: index
++ * @max_scan: maximum range to search
++ *
++ * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
++ * lowest indexed hole.
++ *
++ * Returns: the index of the hole if found, otherwise returns an index
++ * outside of the set specified (in which case 'return - index >=
++ * max_scan' will be true). In rare cases of index wrap-around, 0 will
++ * be returned.
++ *
++ * page_cache_next_hole may be called under rcu_read_lock. However,
++ * like radix_tree_gang_lookup, this will not atomically search a
++ * snapshot of the tree at a single point in time. For example, if a
++ * hole is created at index 5, then subsequently a hole is created at
++ * index 10, page_cache_next_hole covering both indexes may return 10
++ * if called under rcu_read_lock.
++ */
++pgoff_t page_cache_next_hole(struct address_space *mapping,
++ pgoff_t index, unsigned long max_scan)
++{
++ unsigned long i;
++
++ for (i = 0; i < max_scan; i++) {
++ struct page *page;
++
++ page = radix_tree_lookup(&mapping->page_tree, index);
++ if (!page || radix_tree_exceptional_entry(page))
++ break;
++ index++;
++ if (index == 0)
++ break;
++ }
++
++ return index;
++}
++EXPORT_SYMBOL(page_cache_next_hole);
++
++/**
++ * page_cache_prev_hole - find the prev hole (not-present entry)
++ * @mapping: mapping
++ * @index: index
++ * @max_scan: maximum range to search
++ *
++ * Search backwards in the range [max(index-max_scan+1, 0), index] for
++ * the first hole.
++ *
++ * Returns: the index of the hole if found, otherwise returns an index
++ * outside of the set specified (in which case 'index - return >=
++ * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
++ * will be returned.
++ *
++ * page_cache_prev_hole may be called under rcu_read_lock. However,
++ * like radix_tree_gang_lookup, this will not atomically search a
++ * snapshot of the tree at a single point in time. For example, if a
++ * hole is created at index 10, then subsequently a hole is created at
++ * index 5, page_cache_prev_hole covering both indexes may return 5 if
++ * called under rcu_read_lock.
++ */
++pgoff_t page_cache_prev_hole(struct address_space *mapping,
++ pgoff_t index, unsigned long max_scan)
++{
++ unsigned long i;
++
++ for (i = 0; i < max_scan; i++) {
++ struct page *page;
++
++ page = radix_tree_lookup(&mapping->page_tree, index);
++ if (!page || radix_tree_exceptional_entry(page))
++ break;
++ index--;
++ if (index == ULONG_MAX)
++ break;
++ }
++
++ return index;
++}
++EXPORT_SYMBOL(page_cache_prev_hole);
++
++/**
++ * find_get_entry - find and get a page cache entry
+ * @mapping: the address_space to search
+- * @offset: the page index
++ * @offset: the page cache index
++ *
++ * Looks up the page cache slot at @mapping & @offset. If there is a
++ * page cache page, it is returned with an increased refcount.
+ *
+- * Is there a pagecache struct page at the given (mapping, offset) tuple?
+- * If yes, increment its refcount and return it; if no, return NULL.
++ * If the slot holds a shadow entry of a previously evicted page, it
++ * is returned.
++ *
++ * Otherwise, %NULL is returned.
+ */
+-struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
++struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
+ {
+ void **pagep;
+ struct page *page;
+@@ -734,24 +854,30 @@ out:
+
+ return page;
+ }
+-EXPORT_SYMBOL(find_get_page);
++EXPORT_SYMBOL(find_get_entry);
+
+ /**
+- * find_lock_page - locate, pin and lock a pagecache page
++ * find_lock_entry - locate, pin and lock a page cache entry
+ * @mapping: the address_space to search
+- * @offset: the page index
++ * @offset: the page cache index
++ *
++ * Looks up the page cache slot at @mapping & @offset. If there is a
++ * page cache page, it is returned locked and with an increased
++ * refcount.
+ *
+- * Locates the desired pagecache page, locks it, increments its reference
+- * count and returns its address.
++ * If the slot holds a shadow entry of a previously evicted page, it
++ * is returned.
+ *
+- * Returns zero if the page was not present. find_lock_page() may sleep.
++ * Otherwise, %NULL is returned.
++ *
++ * find_lock_entry() may sleep.
+ */
+-struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
++struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
+ {
+ struct page *page;
+
+ repeat:
+- page = find_get_page(mapping, offset);
++ page = find_get_entry(mapping, offset);
+ if (page && !radix_tree_exception(page)) {
+ lock_page(page);
+ /* Has the page been truncated? */
+@@ -764,44 +890,87 @@ repeat:
+ }
+ return page;
+ }
+-EXPORT_SYMBOL(find_lock_page);
++EXPORT_SYMBOL(find_lock_entry);
+
+ /**
+- * find_or_create_page - locate or add a pagecache page
+- * @mapping: the page's address_space
+- * @index: the page's index into the mapping
+- * @gfp_mask: page allocation mode
++ * pagecache_get_page - find and get a page reference
++ * @mapping: the address_space to search
++ * @offset: the page index
++ * @fgp_flags: PCG flags
++ * @gfp_mask: gfp mask to use if a page is to be allocated
++ *
++ * Looks up the page cache slot at @mapping & @offset.
++ *
++ * PCG flags modify how the page is returned
+ *
+- * Locates a page in the pagecache. If the page is not present, a new page
+- * is allocated using @gfp_mask and is added to the pagecache and to the VM's
+- * LRU list. The returned page is locked and has its reference count
+- * incremented.
++ * FGP_ACCESSED: the page will be marked accessed
++ * FGP_LOCK: Page is returned locked
++ * FGP_CREAT: If page is not present then a new page is allocated using
++ * @gfp_mask and added to the page cache and the VM's LRU
++ * list. The page is returned locked and with an increased
++ * refcount. Otherwise, %NULL is returned.
+ *
+- * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
+- * allocation!
++ * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
++ * if the GFP flags specified for FGP_CREAT are atomic.
+ *
+- * find_or_create_page() returns the desired page's address, or zero on
+- * memory exhaustion.
++ * If there is a page cache page, it is returned with an increased refcount.
+ */
+-struct page *find_or_create_page(struct address_space *mapping,
+- pgoff_t index, gfp_t gfp_mask)
++struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
++ int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
+ {
+ struct page *page;
+- int err;
++
+ repeat:
+- page = find_lock_page(mapping, index);
+- if (!page) {
+- page = __page_cache_alloc(gfp_mask);
++ page = find_get_entry(mapping, offset);
++ if (radix_tree_exceptional_entry(page))
++ page = NULL;
++ if (!page)
++ goto no_page;
++
++ if (fgp_flags & FGP_LOCK) {
++ if (fgp_flags & FGP_NOWAIT) {
++ if (!trylock_page(page)) {
++ page_cache_release(page);
++ return NULL;
++ }
++ } else {
++ lock_page(page);
++ }
++
++ /* Has the page been truncated? */
++ if (unlikely(page->mapping != mapping)) {
++ unlock_page(page);
++ page_cache_release(page);
++ goto repeat;
++ }
++ VM_BUG_ON(page->index != offset);
++ }
++
++ if (page && (fgp_flags & FGP_ACCESSED))
++ mark_page_accessed(page);
++
++no_page:
++ if (!page && (fgp_flags & FGP_CREAT)) {
++ int err;
++ if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
++ cache_gfp_mask |= __GFP_WRITE;
++ if (fgp_flags & FGP_NOFS) {
++ cache_gfp_mask &= ~__GFP_FS;
++ radix_gfp_mask &= ~__GFP_FS;
++ }
++
++ page = __page_cache_alloc(cache_gfp_mask);
+ if (!page)
+ return NULL;
+- /*
+- * We want a regular kernel memory (not highmem or DMA etc)
+- * allocation for the radix tree nodes, but we need to honour
+- * the context-specific requirements the caller has asked for.
+- * GFP_RECLAIM_MASK collects those requirements.
+- */
+- err = add_to_page_cache_lru(page, mapping, index,
+- (gfp_mask & GFP_RECLAIM_MASK));
++
++ if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
++ fgp_flags |= FGP_LOCK;
++
++ /* Init accessed to avoid atomic mark_page_accessed later */
++ if (fgp_flags & FGP_ACCESSED)
++ init_page_accessed(page);
++
++ err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
+ if (unlikely(err)) {
+ page_cache_release(page);
+ page = NULL;
+@@ -809,9 +978,80 @@ repeat:
+ goto repeat;
+ }
+ }
++
+ return page;
+ }
+-EXPORT_SYMBOL(find_or_create_page);
++EXPORT_SYMBOL(pagecache_get_page);
++
++/**
++ * find_get_entries - gang pagecache lookup
++ * @mapping: The address_space to search
++ * @start: The starting page cache index
++ * @nr_entries: The maximum number of entries
++ * @entries: Where the resulting entries are placed
++ * @indices: The cache indices corresponding to the entries in @entries
++ *
++ * find_get_entries() will search for and return a group of up to
++ * @nr_entries entries in the mapping. The entries are placed at
++ * @entries. find_get_entries() takes a reference against any actual
++ * pages it returns.
++ *
++ * The search returns a group of mapping-contiguous page cache entries
++ * with ascending indexes. There may be holes in the indices due to
++ * not-present pages.
++ *
++ * Any shadow entries of evicted pages are included in the returned
++ * array.
++ *
++ * find_get_entries() returns the number of pages and shadow entries
++ * which were found.
++ */
++unsigned find_get_entries(struct address_space *mapping,
++ pgoff_t start, unsigned int nr_entries,
++ struct page **entries, pgoff_t *indices)
++{
++ void **slot;
++ unsigned int ret = 0;
++ struct radix_tree_iter iter;
++
++ if (!nr_entries)
++ return 0;
++
++ rcu_read_lock();
++restart:
++ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
++ struct page *page;
++repeat:
++ page = radix_tree_deref_slot(slot);
++ if (unlikely(!page))
++ continue;
++ if (radix_tree_exception(page)) {
++ if (radix_tree_deref_retry(page))
++ goto restart;
++ /*
++ * Otherwise, we must be storing a swap entry
++ * here as an exceptional entry: so return it
++ * without attempting to raise page count.
++ */
++ goto export;
++ }
++ if (!page_cache_get_speculative(page))
++ goto repeat;
++
++ /* Has the page moved? */
++ if (unlikely(page != *slot)) {
++ page_cache_release(page);
++ goto repeat;
++ }
++export:
++ indices[ret] = iter.index;
++ entries[ret] = page;
++ if (++ret == nr_entries)
++ break;
++ }
++ rcu_read_unlock();
++ return ret;
++}
+
+ /**
+ * find_get_pages - gang pagecache lookup
+@@ -1031,39 +1271,6 @@ repeat:
+ }
+ EXPORT_SYMBOL(find_get_pages_tag);
+
+-/**
+- * grab_cache_page_nowait - returns locked page at given index in given cache
+- * @mapping: target address_space
+- * @index: the page index
+- *
+- * Same as grab_cache_page(), but do not wait if the page is unavailable.
+- * This is intended for speculative data generators, where the data can
+- * be regenerated if the page couldn't be grabbed. This routine should
+- * be safe to call while holding the lock for another page.
+- *
+- * Clear __GFP_FS when allocating the page to avoid recursion into the fs
+- * and deadlock against the caller's locked page.
+- */
+-struct page *
+-grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
+-{
+- struct page *page = find_get_page(mapping, index);
+-
+- if (page) {
+- if (trylock_page(page))
+- return page;
+- page_cache_release(page);
+- return NULL;
+- }
+- page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
+- if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
+- page_cache_release(page);
+- page = NULL;
+- }
+- return page;
+-}
+-EXPORT_SYMBOL(grab_cache_page_nowait);
+-
+ /*
+ * CD/DVDs are error prone. When a medium error occurs, the driver may fail
+ * a _large_ part of the i/o request. Imagine the worst scenario:
+@@ -1797,6 +2004,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
+ EXPORT_SYMBOL(generic_file_mmap);
+ EXPORT_SYMBOL(generic_file_readonly_mmap);
+
++static struct page *wait_on_page_read(struct page *page)
++{
++ if (!IS_ERR(page)) {
++ wait_on_page_locked(page);
++ if (!PageUptodate(page)) {
++ page_cache_release(page);
++ page = ERR_PTR(-EIO);
++ }
++ }
++ return page;
++}
++
+ static struct page *__read_cache_page(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *, struct page *),
+@@ -1823,6 +2042,8 @@ repeat:
+ if (err < 0) {
+ page_cache_release(page);
+ page = ERR_PTR(err);
++ } else {
++ page = wait_on_page_read(page);
+ }
+ }
+ return page;
+@@ -1859,6 +2080,10 @@ retry:
+ if (err < 0) {
+ page_cache_release(page);
+ return ERR_PTR(err);
++ } else {
++ page = wait_on_page_read(page);
++ if (IS_ERR(page))
++ return page;
+ }
+ out:
+ mark_page_accessed(page);
+@@ -1866,40 +2091,25 @@ out:
+ }
+
+ /**
+- * read_cache_page_async - read into page cache, fill it if needed
++ * read_cache_page - read into page cache, fill it if needed
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @filler: function to perform the read
+ * @data: first arg to filler(data, page) function, often left as NULL
+ *
+- * Same as read_cache_page, but don't wait for page to become unlocked
+- * after submitting it to the filler.
+- *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+- * not set, try to fill the page but don't wait for it to become unlocked.
++ * not set, try to fill the page and wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+-struct page *read_cache_page_async(struct address_space *mapping,
++struct page *read_cache_page(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *, struct page *),
+ void *data)
+ {
+ return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
+ }
+-EXPORT_SYMBOL(read_cache_page_async);
+-
+-static struct page *wait_on_page_read(struct page *page)
+-{
+- if (!IS_ERR(page)) {
+- wait_on_page_locked(page);
+- if (!PageUptodate(page)) {
+- page_cache_release(page);
+- page = ERR_PTR(-EIO);
+- }
+- }
+- return page;
+-}
++EXPORT_SYMBOL(read_cache_page);
+
+ /**
+ * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+@@ -1918,31 +2128,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
+ {
+ filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+
+- return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
++ return do_read_cache_page(mapping, index, filler, NULL, gfp);
+ }
+ EXPORT_SYMBOL(read_cache_page_gfp);
+
+-/**
+- * read_cache_page - read into page cache, fill it if needed
+- * @mapping: the page's address_space
+- * @index: the page index
+- * @filler: function to perform the read
+- * @data: first arg to filler(data, page) function, often left as NULL
+- *
+- * Read into the page cache. If a page already exists, and PageUptodate() is
+- * not set, try to fill the page then wait for it to become unlocked.
+- *
+- * If the page does not get brought uptodate, return -EIO.
+- */
+-struct page *read_cache_page(struct address_space *mapping,
+- pgoff_t index,
+- int (*filler)(void *, struct page *),
+- void *data)
+-{
+- return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
+-}
+-EXPORT_SYMBOL(read_cache_page);
+-
+ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+ const struct iovec *iov, size_t base, size_t bytes)
+ {
+@@ -1976,7 +2165,6 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
+ char *kaddr;
+ size_t copied;
+
+- BUG_ON(!in_atomic());
+ kaddr = kmap_atomic(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+@@ -2186,7 +2374,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
+ {
+ const struct address_space_operations *aops = mapping->a_ops;
+
+- mark_page_accessed(page);
+ return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
+ }
+ EXPORT_SYMBOL(pagecache_write_end);
+@@ -2268,34 +2455,18 @@ EXPORT_SYMBOL(generic_file_direct_write);
+ struct page *grab_cache_page_write_begin(struct address_space *mapping,
+ pgoff_t index, unsigned flags)
+ {
+- int status;
+- gfp_t gfp_mask;
+ struct page *page;
+- gfp_t gfp_notmask = 0;
++ int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
+
+- gfp_mask = mapping_gfp_mask(mapping);
+- if (mapping_cap_account_dirty(mapping))
+- gfp_mask |= __GFP_WRITE;
+ if (flags & AOP_FLAG_NOFS)
+- gfp_notmask = __GFP_FS;
+-repeat:
+- page = find_lock_page(mapping, index);
++ fgp_flags |= FGP_NOFS;
++
++ page = pagecache_get_page(mapping, index, fgp_flags,
++ mapping_gfp_mask(mapping),
++ GFP_KERNEL);
+ if (page)
+- goto found;
++ wait_for_stable_page(page);
+
+- page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
+- if (!page)
+- return NULL;
+- status = add_to_page_cache_lru(page, mapping, index,
+- GFP_KERNEL & ~gfp_notmask);
+- if (unlikely(status)) {
+- page_cache_release(page);
+- if (status == -EEXIST)
+- goto repeat;
+- return NULL;
+- }
+-found:
+- wait_for_stable_page(page);
+ return page;
+ }
+ EXPORT_SYMBOL(grab_cache_page_write_begin);
+@@ -2344,18 +2515,15 @@ again:
+
+ status = a_ops->write_begin(file, mapping, pos, bytes, flags,
+ &page, &fsdata);
+- if (unlikely(status))
++ if (unlikely(status < 0))
+ break;
+
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
+- pagefault_disable();
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+- pagefault_enable();
+ flush_dcache_page(page);
+
+- mark_page_accessed(page);
+ status = a_ops->write_end(file, mapping, pos, bytes, copied,
+ page, fsdata);
+ if (unlikely(status < 0))
+diff --git a/mm/fremap.c b/mm/fremap.c
+index bbc4d660221a..34feba60a17e 100644
+--- a/mm/fremap.c
++++ b/mm/fremap.c
+@@ -23,28 +23,44 @@
+
+ #include "internal.h"
+
++static int mm_counter(struct page *page)
++{
++ return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
++}
++
+ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+ {
+ pte_t pte = *ptep;
++ struct page *page;
++ swp_entry_t entry;
+
+ if (pte_present(pte)) {
+- struct page *page;
+-
+ flush_cache_page(vma, addr, pte_pfn(pte));
+ pte = ptep_clear_flush(vma, addr, ptep);
+ page = vm_normal_page(vma, addr, pte);
+ if (page) {
+ if (pte_dirty(pte))
+ set_page_dirty(page);
++ update_hiwater_rss(mm);
++ dec_mm_counter(mm, mm_counter(page));
+ page_remove_rmap(page);
+ page_cache_release(page);
++ }
++ } else { /* zap_pte() is not called when pte_none() */
++ if (!pte_file(pte)) {
+ update_hiwater_rss(mm);
+- dec_mm_counter(mm, MM_FILEPAGES);
++ entry = pte_to_swp_entry(pte);
++ if (non_swap_entry(entry)) {
++ if (is_migration_entry(entry)) {
++ page = migration_entry_to_page(entry);
++ dec_mm_counter(mm, mm_counter(page));
++ }
++ } else {
++ free_swap_and_cache(entry);
++ dec_mm_counter(mm, MM_SWAPENTS);
++ }
+ }
+- } else {
+- if (!pte_file(pte))
+- free_swap_and_cache(pte_to_swp_entry(pte));
+ pte_clear_not_present_full(mm, addr, ptep, 0);
+ }
+ }
+diff --git a/mm/frontswap.c b/mm/frontswap.c
+index 1b24bdcb3197..c30eec536f03 100644
+--- a/mm/frontswap.c
++++ b/mm/frontswap.c
+@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area);
+
+ static unsigned long __frontswap_curr_pages(void)
+ {
+- int type;
+ unsigned long totalpages = 0;
+ struct swap_info_struct *si = NULL;
+
+ assert_spin_locked(&swap_lock);
+- for (type = swap_list.head; type >= 0; type = si->next) {
+- si = swap_info[type];
++ plist_for_each_entry(si, &swap_active_head, list)
+ totalpages += atomic_read(&si->frontswap_pages);
+- }
+ return totalpages;
+ }
+
+@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
+ int si_frontswap_pages;
+ unsigned long total_pages_to_unuse = total;
+ unsigned long pages = 0, pages_to_unuse = 0;
+- int type;
+
+ assert_spin_locked(&swap_lock);
+- for (type = swap_list.head; type >= 0; type = si->next) {
+- si = swap_info[type];
++ plist_for_each_entry(si, &swap_active_head, list) {
+ si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ if (total_pages_to_unuse < si_frontswap_pages) {
+ pages = pages_to_unuse = total_pages_to_unuse;
+@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
+ }
+ vm_unacct_memory(pages);
+ *unused = pages_to_unuse;
+- *swapid = type;
++ *swapid = si->type;
+ ret = 0;
+ break;
+ }
+@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+- * so restart scan from swap_list.head each time
++ * so restart scan from swap_active_head each time
+ */
+ spin_lock(&swap_lock);
+ ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 389973fd6bb7..2ee53749eb48 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -758,14 +758,6 @@ static inline struct page *alloc_hugepage_vma(int defrag,
+ HPAGE_PMD_ORDER, vma, haddr, nd);
+ }
+
+-#ifndef CONFIG_NUMA
+-static inline struct page *alloc_hugepage(int defrag)
+-{
+- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+- HPAGE_PMD_ORDER);
+-}
+-#endif
+-
+ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+ struct page *zero_page)
+@@ -2197,7 +2189,58 @@ static void khugepaged_alloc_sleep(void)
+ msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ }
+
++static int khugepaged_node_load[MAX_NUMNODES];
++
++static bool khugepaged_scan_abort(int nid)
++{
++ int i;
++
++ /*
++ * If zone_reclaim_mode is disabled, then no extra effort is made to
++ * allocate memory locally.
++ */
++ if (!zone_reclaim_mode)
++ return false;
++
++ /* If there is a count for this node already, it must be acceptable */
++ if (khugepaged_node_load[nid])
++ return false;
++
++ for (i = 0; i < MAX_NUMNODES; i++) {
++ if (!khugepaged_node_load[i])
++ continue;
++ if (node_distance(nid, i) > RECLAIM_DISTANCE)
++ return true;
++ }
++ return false;
++}
++
+ #ifdef CONFIG_NUMA
++static int khugepaged_find_target_node(void)
++{
++ static int last_khugepaged_target_node = NUMA_NO_NODE;
++ int nid, target_node = 0, max_value = 0;
++
++ /* find first node with max normal pages hit */
++ for (nid = 0; nid < MAX_NUMNODES; nid++)
++ if (khugepaged_node_load[nid] > max_value) {
++ max_value = khugepaged_node_load[nid];
++ target_node = nid;
++ }
++
++ /* do some balance if several nodes have the same hit record */
++ if (target_node <= last_khugepaged_target_node)
++ for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
++ nid++)
++ if (max_value == khugepaged_node_load[nid]) {
++ target_node = nid;
++ break;
++ }
++
++ last_khugepaged_target_node = target_node;
++ return target_node;
++}
++
+ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+ {
+ if (IS_ERR(*hpage)) {
+@@ -2231,9 +2274,8 @@ static struct page
+ * mmap_sem in read mode is good idea also to allow greater
+ * scalability.
+ */
+- *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+- node, __GFP_OTHER_NODE);
+-
++ *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
++ khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
+ /*
+ * After allocating the hugepage, release the mmap_sem read lock in
+ * preparation for taking it in write mode.
+@@ -2249,6 +2291,17 @@ static struct page
+ return *hpage;
+ }
+ #else
++static int khugepaged_find_target_node(void)
++{
++ return 0;
++}
++
++static inline struct page *alloc_hugepage(int defrag)
++{
++ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
++ HPAGE_PMD_ORDER);
++}
++
+ static struct page *khugepaged_alloc_hugepage(bool *wait)
+ {
+ struct page *hpage;
+@@ -2455,6 +2508,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
+ if (pmd_trans_huge(*pmd))
+ goto out;
+
++ memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, _address += PAGE_SIZE) {
+@@ -2471,12 +2525,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
+ if (unlikely(!page))
+ goto out_unmap;
+ /*
+- * Chose the node of the first page. This could
+- * be more sophisticated and look at more pages,
+- * but isn't for now.
++ * Record which node the original page is from and save this
++ * information to khugepaged_node_load[].
++ * Khugepaged will allocate the hugepage from the node that has the max
++ * hit record.
+ */
+- if (node == NUMA_NO_NODE)
+- node = page_to_nid(page);
++ node = page_to_nid(page);
++ if (khugepaged_scan_abort(node))
++ goto out_unmap;
++ khugepaged_node_load[node]++;
+ VM_BUG_ON(PageCompound(page));
+ if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ goto out_unmap;
+@@ -2491,9 +2548,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
+ ret = 1;
+ out_unmap:
+ pte_unmap_unlock(pte, ptl);
+- if (ret)
++ if (ret) {
++ node = khugepaged_find_target_node();
+ /* collapse_huge_page will return with the mmap_sem released */
+ collapse_huge_page(mm, address, hpage, vma, node);
++ }
+ out:
+ return ret;
+ }
+diff --git a/mm/hugetlb.c b/mm/hugetlb.c
+index f80b17106d24..c33d8a65298c 100644
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -574,7 +574,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
+ goto err;
+
+ retry_cpuset:
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = huge_zonelist(vma, address,
+ htlb_alloc_mask(h), &mpol, &nodemask);
+
+@@ -596,7 +596,7 @@ retry_cpuset:
+ }
+
+ mpol_cond_put(mpol);
+- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+ return page;
+
+@@ -2114,6 +2114,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
+ unsigned long tmp;
+ int ret;
+
++ if (!hugepages_supported())
++ return -ENOTSUPP;
++
+ tmp = h->max_huge_pages;
+
+ if (write && h->order >= MAX_ORDER)
+@@ -2167,6 +2170,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
+ unsigned long tmp;
+ int ret;
+
++ if (!hugepages_supported())
++ return -ENOTSUPP;
++
+ tmp = h->nr_overcommit_huge_pages;
+
+ if (write && h->order >= MAX_ORDER)
+@@ -2192,6 +2198,8 @@ out:
+ void hugetlb_report_meminfo(struct seq_file *m)
+ {
+ struct hstate *h = &default_hstate;
++ if (!hugepages_supported())
++ return;
+ seq_printf(m,
+ "HugePages_Total: %5lu\n"
+ "HugePages_Free: %5lu\n"
+@@ -2208,6 +2216,8 @@ void hugetlb_report_meminfo(struct seq_file *m)
+ int hugetlb_report_node_meminfo(int nid, char *buf)
+ {
+ struct hstate *h = &default_hstate;
++ if (!hugepages_supported())
++ return 0;
+ return sprintf(buf,
+ "Node %d HugePages_Total: %5u\n"
+ "Node %d HugePages_Free: %5u\n"
+@@ -2222,6 +2232,9 @@ void hugetlb_show_meminfo(void)
+ struct hstate *h;
+ int nid;
+
++ if (!hugepages_supported())
++ return;
++
+ for_each_node_state(nid, N_MEMORY)
+ for_each_hstate(h)
+ pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+diff --git a/mm/internal.h b/mm/internal.h
+index fdddbc83ac5f..d610f7ce4e9c 100644
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -11,6 +11,7 @@
+ #ifndef __MM_INTERNAL_H
+ #define __MM_INTERNAL_H
+
++#include <linux/fs.h>
+ #include <linux/mm.h>
+
+ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+@@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v)
+ atomic_set(&page->_count, v);
+ }
+
++extern int __do_page_cache_readahead(struct address_space *mapping,
++ struct file *filp, pgoff_t offset, unsigned long nr_to_read,
++ unsigned long lookahead_size);
++
++/*
++ * Submit IO for the read-ahead request in file_ra_state.
++ */
++static inline unsigned long ra_submit(struct file_ra_state *ra,
++ struct address_space *mapping, struct file *filp)
++{
++ return __do_page_cache_readahead(mapping, filp,
++ ra->start, ra->size, ra->async_size);
++}
++
+ /*
+ * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * a count of one.
+@@ -120,7 +135,7 @@ struct compact_control {
+ unsigned long nr_migratepages; /* Number of pages to migrate */
+ unsigned long free_pfn; /* isolate_freepages search base */
+ unsigned long migrate_pfn; /* isolate_migratepages search base */
+- bool sync; /* Synchronous migration */
++ enum migrate_mode mode; /* Async or sync migration mode */
+ bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool finished_update_free; /* True when the zone cached pfns are
+ * no longer being updated
+@@ -130,7 +145,10 @@ struct compact_control {
+ int order; /* order a direct compactor needs */
+ int migratetype; /* MOVABLE, RECLAIMABLE etc */
+ struct zone *zone;
+- bool contended; /* True if a lock was contended */
++ bool contended; /* True if a lock was contended, or
++ * need_resched() true during async
++ * compaction
++ */
+ };
+
+ unsigned long
+diff --git a/mm/madvise.c b/mm/madvise.c
+index 539eeb96b323..a402f8fdc68e 100644
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+ for (; start < end; start += PAGE_SIZE) {
+ index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+- page = find_get_page(mapping, index);
++ page = find_get_entry(mapping, index);
+ if (!radix_tree_exceptional_entry(page)) {
+ if (page)
+ page_cache_release(page);
+diff --git a/mm/memory-failure.c b/mm/memory-failure.c
+index 6e3f9c39bc22..4ab233d4714a 100644
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1554,7 +1554,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
+
+ /* Keep page count to indicate a given hugepage is isolated. */
+ list_move(&hpage->lru, &pagelist);
+- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
++ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+ MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ if (ret) {
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
+@@ -1635,7 +1635,7 @@ static int __soft_offline_page(struct page *page, int flags)
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ list_add(&page->lru, &pagelist);
+- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
++ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+ MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ if (ret) {
+ putback_lru_pages(&pagelist);
+diff --git a/mm/memory.c b/mm/memory.c
+index 99fe3aa1035c..b5901068495f 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -878,7 +878,7 @@ out_set_pte:
+ return 0;
+ }
+
+-int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
++static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+ {
+@@ -3698,7 +3698,7 @@ static int handle_pte_fault(struct mm_struct *mm,
+ pte_t entry;
+ spinlock_t *ptl;
+
+- entry = *pte;
++ entry = ACCESS_ONCE(*pte);
+ if (!pte_present(entry)) {
+ if (pte_none(entry)) {
+ if (vma->vm_ops) {
+diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
+index ed85fe3870e2..d31730564617 100644
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -1321,7 +1321,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
+ * alloc_migrate_target should be improooooved!!
+ * migrate_pages returns # of failed pages.
+ */
+- ret = migrate_pages(&source, alloc_migrate_target, 0,
++ ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
+ MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ if (ret)
+ putback_movable_pages(&source);
+diff --git a/mm/mempolicy.c b/mm/mempolicy.c
+index 0437f3595b32..cc61c7a7d6a1 100644
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -1060,7 +1060,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
+ flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+ if (!list_empty(&pagelist)) {
+- err = migrate_pages(&pagelist, new_node_page, dest,
++ err = migrate_pages(&pagelist, new_node_page, NULL, dest,
+ MIGRATE_SYNC, MR_SYSCALL);
+ if (err)
+ putback_movable_pages(&pagelist);
+@@ -1306,7 +1306,7 @@ static long do_mbind(unsigned long start, unsigned long len,
+
+ if (!list_empty(&pagelist)) {
+ WARN_ON_ONCE(flags & MPOL_MF_LAZY);
+- nr_failed = migrate_pages(&pagelist, new_page,
++ nr_failed = migrate_pages(&pagelist, new_page, NULL,
+ start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+ if (nr_failed)
+ putback_movable_pages(&pagelist);
+@@ -1873,7 +1873,7 @@ int node_random(const nodemask_t *maskp)
+ * If the effective policy is 'BIND, returns a pointer to the mempolicy's
+ * @nodemask for filtering the zonelist.
+ *
+- * Must be protected by get_mems_allowed()
++ * Must be protected by read_mems_allowed_begin()
+ */
+ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
+ gfp_t gfp_flags, struct mempolicy **mpol,
+@@ -2037,7 +2037,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+
+ retry_cpuset:
+ pol = get_vma_policy(current, vma, addr);
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+
+ if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+ unsigned nid;
+@@ -2045,7 +2045,7 @@ retry_cpuset:
+ nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+ mpol_cond_put(pol);
+ page = alloc_page_interleave(gfp, order, nid);
+- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+
+ return page;
+@@ -2055,7 +2055,7 @@ retry_cpuset:
+ policy_nodemask(gfp, pol));
+ if (unlikely(mpol_needs_cond_ref(pol)))
+ __mpol_put(pol);
+- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+ return page;
+ }
+@@ -2089,7 +2089,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
+ pol = &default_policy;
+
+ retry_cpuset:
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+
+ /*
+ * No reference counting needed for current->mempolicy
+@@ -2102,7 +2102,7 @@ retry_cpuset:
+ policy_zonelist(gfp, pol, numa_node_id()),
+ policy_nodemask(gfp, pol));
+
+- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+
+ return page;
+diff --git a/mm/migrate.c b/mm/migrate.c
+index e3cf71dd1288..96d4d814ae2f 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -867,8 +867,9 @@ out:
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+- struct page *page, int force, enum migrate_mode mode)
++static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
++ unsigned long private, struct page *page, int force,
++ enum migrate_mode mode)
+ {
+ int rc = 0;
+ int *result = NULL;
+@@ -912,11 +913,18 @@ out:
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
++
+ /*
+- * Move the new page to the LRU. If migration was not successful
+- * then this will free the page.
++ * If migration was not successful and there's a freeing callback, use
++ * it. Otherwise, putback_lru_page() will drop the reference grabbed
++ * during isolation.
+ */
+- putback_lru_page(newpage);
++ if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
++ ClearPageSwapBacked(newpage);
++ put_new_page(newpage, private);
++ } else
++ putback_lru_page(newpage);
++
+ if (result) {
+ if (rc)
+ *result = rc;
+@@ -945,8 +953,9 @@ out:
+ * will wait in the page fault for migration to complete.
+ */
+ static int unmap_and_move_huge_page(new_page_t get_new_page,
+- unsigned long private, struct page *hpage,
+- int force, enum migrate_mode mode)
++ free_page_t put_new_page, unsigned long private,
++ struct page *hpage, int force,
++ enum migrate_mode mode)
+ {
+ int rc = 0;
+ int *result = NULL;
+@@ -982,20 +991,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
+ if (!page_mapped(hpage))
+ rc = move_to_new_page(new_hpage, hpage, 1, mode);
+
+- if (rc)
++ if (rc != MIGRATEPAGE_SUCCESS)
+ remove_migration_ptes(hpage, hpage);
+
+ if (anon_vma)
+ put_anon_vma(anon_vma);
+
+- if (!rc)
++ if (rc == MIGRATEPAGE_SUCCESS)
+ hugetlb_cgroup_migrate(hpage, new_hpage);
+
+ unlock_page(hpage);
+ out:
+ if (rc != -EAGAIN)
+ putback_active_hugepage(hpage);
+- put_page(new_hpage);
++
++ /*
++ * If migration was not successful and there's a freeing callback, use
++ * it. Otherwise, put_page() will drop the reference grabbed during
++ * isolation.
++ */
++ if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
++ put_new_page(new_hpage, private);
++ else
++ put_page(new_hpage);
++
+ if (result) {
+ if (rc)
+ *result = rc;
+@@ -1012,6 +1031,8 @@ out:
+ * @from: The list of pages to be migrated.
+ * @get_new_page: The function used to allocate free pages to be used
+ * as the target of the page migration.
++ * @put_new_page: The function used to free target pages if migration
++ * fails, or NULL if no special handling is necessary.
+ * @private: Private data to be passed on to get_new_page()
+ * @mode: The migration mode that specifies the constraints for
+ * page migration, if any.
+@@ -1025,7 +1046,8 @@ out:
+ * Returns the number of pages that were not migrated, or an error code.
+ */
+ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+- unsigned long private, enum migrate_mode mode, int reason)
++ free_page_t put_new_page, unsigned long private,
++ enum migrate_mode mode, int reason)
+ {
+ int retry = 1;
+ int nr_failed = 0;
+@@ -1047,10 +1069,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+
+ if (PageHuge(page))
+ rc = unmap_and_move_huge_page(get_new_page,
+- private, page, pass > 2, mode);
++ put_new_page, private, page,
++ pass > 2, mode);
+ else
+- rc = unmap_and_move(get_new_page, private,
+- page, pass > 2, mode);
++ rc = unmap_and_move(get_new_page, put_new_page,
++ private, page, pass > 2, mode);
+
+ switch(rc) {
+ case -ENOMEM:
+@@ -1194,7 +1217,7 @@ set_status:
+
+ err = 0;
+ if (!list_empty(&pagelist)) {
+- err = migrate_pages(&pagelist, new_page_node,
++ err = migrate_pages(&pagelist, new_page_node, NULL,
+ (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
+ if (err)
+ putback_movable_pages(&pagelist);
+@@ -1643,7 +1666,8 @@ int migrate_misplaced_page(struct page *page, int node)
+
+ list_add(&page->lru, &migratepages);
+ nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+- node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
++ NULL, node, MIGRATE_ASYNC,
++ MR_NUMA_MISPLACED);
+ if (nr_remaining) {
+ putback_lru_pages(&migratepages);
+ isolated = 0;
+diff --git a/mm/mincore.c b/mm/mincore.c
+index da2be56a7b8f..06cb81005c77 100644
+--- a/mm/mincore.c
++++ b/mm/mincore.c
+@@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+ * any other file mapping (ie. marked !present and faulted in with
+ * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
+ */
+- page = find_get_page(mapping, pgoff);
+ #ifdef CONFIG_SWAP
+- /* shmem/tmpfs may return swap: account for swapcache page too. */
+- if (radix_tree_exceptional_entry(page)) {
+- swp_entry_t swap = radix_to_swp_entry(page);
+- page = find_get_page(swap_address_space(swap), swap.val);
+- }
++ if (shmem_mapping(mapping)) {
++ page = find_get_entry(mapping, pgoff);
++ /*
++ * shmem/tmpfs may return swap: account for swapcache
++ * page too.
++ */
++ if (radix_tree_exceptional_entry(page)) {
++ swp_entry_t swp = radix_to_swp_entry(page);
++ page = find_get_page(swap_address_space(swp), swp.val);
++ }
++ } else
++ page = find_get_page(mapping, pgoff);
++#else
++ page = find_get_page(mapping, pgoff);
+ #endif
+ if (page) {
+ present = PageUptodate(page);
+diff --git a/mm/mmap.c b/mm/mmap.c
+index af99b9ed2007..c1249cb7dc15 100644
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -10,6 +10,7 @@
+ #include <linux/slab.h>
+ #include <linux/backing-dev.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/shm.h>
+ #include <linux/mman.h>
+ #include <linux/pagemap.h>
+@@ -682,8 +683,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
+ prev->vm_next = next = vma->vm_next;
+ if (next)
+ next->vm_prev = prev;
+- if (mm->mmap_cache == vma)
+- mm->mmap_cache = prev;
++
++ /* Kill the cache */
++ vmacache_invalidate(mm);
+ }
+
+ /*
+@@ -1980,34 +1982,33 @@ EXPORT_SYMBOL(get_unmapped_area);
+ /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
+ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+ {
+- struct vm_area_struct *vma = NULL;
++ struct rb_node *rb_node;
++ struct vm_area_struct *vma;
+
+ /* Check the cache first. */
+- /* (Cache hit rate is typically around 35%.) */
+- vma = ACCESS_ONCE(mm->mmap_cache);
+- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+- struct rb_node *rb_node;
++ vma = vmacache_find(mm, addr);
++ if (likely(vma))
++ return vma;
+
+- rb_node = mm->mm_rb.rb_node;
+- vma = NULL;
++ rb_node = mm->mm_rb.rb_node;
++ vma = NULL;
+
+- while (rb_node) {
+- struct vm_area_struct *vma_tmp;
+-
+- vma_tmp = rb_entry(rb_node,
+- struct vm_area_struct, vm_rb);
+-
+- if (vma_tmp->vm_end > addr) {
+- vma = vma_tmp;
+- if (vma_tmp->vm_start <= addr)
+- break;
+- rb_node = rb_node->rb_left;
+- } else
+- rb_node = rb_node->rb_right;
+- }
+- if (vma)
+- mm->mmap_cache = vma;
++ while (rb_node) {
++ struct vm_area_struct *tmp;
++
++ tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
++
++ if (tmp->vm_end > addr) {
++ vma = tmp;
++ if (tmp->vm_start <= addr)
++ break;
++ rb_node = rb_node->rb_left;
++ } else
++ rb_node = rb_node->rb_right;
+ }
++
++ if (vma)
++ vmacache_update(addr, vma);
+ return vma;
+ }
+
+@@ -2379,7 +2380,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
+ } else
+ mm->highest_vm_end = prev ? prev->vm_end : 0;
+ tail_vma->vm_next = NULL;
+- mm->mmap_cache = NULL; /* Kill the cache. */
++
++ /* Kill the cache */
++ vmacache_invalidate(mm);
+ }
+
+ /*
+diff --git a/mm/nommu.c b/mm/nommu.c
+index ecd1f158548e..1221d2b66e97 100644
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -15,6 +15,7 @@
+
+ #include <linux/export.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/mman.h>
+ #include <linux/swap.h>
+ #include <linux/file.h>
+@@ -767,16 +768,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+ */
+ static void delete_vma_from_mm(struct vm_area_struct *vma)
+ {
++ int i;
+ struct address_space *mapping;
+ struct mm_struct *mm = vma->vm_mm;
++ struct task_struct *curr = current;
+
+ kenter("%p", vma);
+
+ protect_vma(vma, 0);
+
+ mm->map_count--;
+- if (mm->mmap_cache == vma)
+- mm->mmap_cache = NULL;
++ for (i = 0; i < VMACACHE_SIZE; i++) {
++ /* if the vma is cached, invalidate the entire cache */
++ if (curr->vmacache[i] == vma) {
++ vmacache_invalidate(curr->mm);
++ break;
++ }
++ }
+
+ /* remove the VMA from the mapping */
+ if (vma->vm_file) {
+@@ -824,8 +832,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+ struct vm_area_struct *vma;
+
+ /* check the cache first */
+- vma = ACCESS_ONCE(mm->mmap_cache);
+- if (vma && vma->vm_start <= addr && vma->vm_end > addr)
++ vma = vmacache_find(mm, addr);
++ if (likely(vma))
+ return vma;
+
+ /* trawl the list (there may be multiple mappings in which addr
+@@ -834,7 +842,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+ if (vma->vm_start > addr)
+ return NULL;
+ if (vma->vm_end > addr) {
+- mm->mmap_cache = vma;
++ vmacache_update(addr, vma);
+ return vma;
+ }
+ }
+@@ -873,8 +881,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+ unsigned long end = addr + len;
+
+ /* check the cache first */
+- vma = mm->mmap_cache;
+- if (vma && vma->vm_start == addr && vma->vm_end == end)
++ vma = vmacache_find_exact(mm, addr, end);
++ if (vma)
+ return vma;
+
+ /* trawl the list (there may be multiple mappings in which addr
+@@ -885,7 +893,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+ if (vma->vm_start > addr)
+ return NULL;
+ if (vma->vm_end == end) {
+- mm->mmap_cache = vma;
++ vmacache_update(addr, vma);
+ return vma;
+ }
+ }
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index a280f772bc66..2f91223dbe93 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -405,7 +405,8 @@ static int destroy_compound_page(struct page *page, unsigned long order)
+ return bad;
+ }
+
+-static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
++static inline void prep_zero_page(struct page *page, unsigned int order,
++ gfp_t gfp_flags)
+ {
+ int i;
+
+@@ -449,7 +450,7 @@ static inline void set_page_guard_flag(struct page *page) { }
+ static inline void clear_page_guard_flag(struct page *page) { }
+ #endif
+
+-static inline void set_page_order(struct page *page, int order)
++static inline void set_page_order(struct page *page, unsigned int order)
+ {
+ set_page_private(page, order);
+ __SetPageBuddy(page);
+@@ -500,21 +501,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
+ * For recording page's order, we use page_private(page).
+ */
+ static inline int page_is_buddy(struct page *page, struct page *buddy,
+- int order)
++ unsigned int order)
+ {
+ if (!pfn_valid_within(page_to_pfn(buddy)))
+ return 0;
+
+- if (page_zone_id(page) != page_zone_id(buddy))
+- return 0;
+-
+ if (page_is_guard(buddy) && page_order(buddy) == order) {
+ VM_BUG_ON(page_count(buddy) != 0);
++
++ if (page_zone_id(page) != page_zone_id(buddy))
++ return 0;
++
+ return 1;
+ }
+
+ if (PageBuddy(buddy) && page_order(buddy) == order) {
+ VM_BUG_ON(page_count(buddy) != 0);
++
++ /*
++ * zone check is done late to avoid uselessly
++ * calculating zone/node ids for pages that could
++ * never merge.
++ */
++ if (page_zone_id(page) != page_zone_id(buddy))
++ return 0;
++
+ return 1;
+ }
+ return 0;
+@@ -546,6 +557,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
+ */
+
+ static inline void __free_one_page(struct page *page,
++ unsigned long pfn,
+ struct zone *zone, unsigned int order,
+ int migratetype)
+ {
+@@ -562,7 +574,7 @@ static inline void __free_one_page(struct page *page,
+
+ VM_BUG_ON(migratetype == -1);
+
+- page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
++ page_idx = pfn & ((1 << MAX_ORDER) - 1);
+
+ VM_BUG_ON(page_idx & ((1 << order) - 1));
+ VM_BUG_ON(bad_range(zone, page));
+@@ -652,9 +664,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
+ int migratetype = 0;
+ int batch_free = 0;
+ int to_free = count;
++ unsigned long nr_scanned;
+
+ spin_lock(&zone->lock);
+- zone->pages_scanned = 0;
++ nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
++ if (nr_scanned)
++ __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+
+ while (to_free) {
+ struct page *page;
+@@ -686,7 +701,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
+ list_del(&page->lru);
+ mt = get_freepage_migratetype(page);
+ /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+- __free_one_page(page, zone, 0, mt);
++ __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+ trace_mm_page_pcpu_drain(page, 0, mt);
+ if (likely(!is_migrate_isolate_page(page))) {
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
+@@ -698,13 +713,18 @@ static void free_pcppages_bulk(struct zone *zone, int count,
+ spin_unlock(&zone->lock);
+ }
+
+-static void free_one_page(struct zone *zone, struct page *page, int order,
++static void free_one_page(struct zone *zone,
++ struct page *page, unsigned long pfn,
++ unsigned int order,
+ int migratetype)
+ {
++ unsigned long nr_scanned;
+ spin_lock(&zone->lock);
+- zone->pages_scanned = 0;
++ nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
++ if (nr_scanned)
++ __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+
+- __free_one_page(page, zone, order, migratetype);
++ __free_one_page(page, pfn, zone, order, migratetype);
+ if (unlikely(!is_migrate_isolate(migratetype)))
+ __mod_zone_freepage_state(zone, 1 << order, migratetype);
+ spin_unlock(&zone->lock);
+@@ -741,15 +761,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)
+ {
+ unsigned long flags;
+ int migratetype;
++ unsigned long pfn = page_to_pfn(page);
+
+ if (!free_pages_prepare(page, order))
+ return;
+
++ migratetype = get_pfnblock_migratetype(page, pfn);
+ local_irq_save(flags);
+ __count_vm_events(PGFREE, 1 << order);
+- migratetype = get_pageblock_migratetype(page);
+ set_freepage_migratetype(page, migratetype);
+- free_one_page(page_zone(page), page, order, migratetype);
++ free_one_page(page_zone(page), page, pfn, order, migratetype);
+ local_irq_restore(flags);
+ }
+
+@@ -869,7 +890,7 @@ static inline int check_new_page(struct page *page)
+ return 0;
+ }
+
+-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
++static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
+ {
+ int i;
+
+@@ -918,6 +939,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
+ rmv_page_order(page);
+ area->nr_free--;
+ expand(zone, page, order, current_order, area, migratetype);
++ set_freepage_migratetype(page, migratetype);
+ return page;
+ }
+
+@@ -1042,6 +1064,12 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
+ {
+ int current_order = page_order(page);
+
++ /*
++ * When borrowing from MIGRATE_CMA, we need to release the excess
++ * buddy pages to CMA itself. We also ensure the freepage_migratetype
++ * is set to CMA so it is returned to the correct freelist in case
++ * the page ends up being not actually allocated from the pcp lists.
++ */
+ if (is_migrate_cma(fallback_type))
+ return fallback_type;
+
+@@ -1073,16 +1101,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
+
+ /* Remove an element from the buddy allocator from the fallback list */
+ static inline struct page *
+-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
++__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
+ {
+ struct free_area *area;
+- int current_order;
++ unsigned int current_order;
+ struct page *page;
+ int migratetype, new_type, i;
+
+ /* Find the largest possible block of pages in the other list */
+- for (current_order = MAX_ORDER-1; current_order >= order;
+- --current_order) {
++ for (current_order = MAX_ORDER-1;
++ current_order >= order && current_order <= MAX_ORDER-1;
++ --current_order) {
+ for (i = 0;; i++) {
+ migratetype = fallbacks[start_migratetype][i];
+
+@@ -1106,21 +1135,17 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
+ list_del(&page->lru);
+ rmv_page_order(page);
+
+- /*
+- * Borrow the excess buddy pages as well, irrespective
+- * of whether we stole freepages, or took ownership of
+- * the pageblock or not.
+- *
+- * Exception: When borrowing from MIGRATE_CMA, release
+- * the excess buddy pages to CMA itself.
+- */
+ expand(zone, page, order, current_order, area,
+- is_migrate_cma(migratetype)
+- ? migratetype : start_migratetype);
++ new_type);
++ /* The freepage_migratetype may differ from pageblock's
++ * migratetype depending on the decisions in
++ * try_to_steal_freepages. This is OK as long as it does
++ * not differ for MIGRATE_CMA type.
++ */
++ set_freepage_migratetype(page, new_type);
+
+- trace_mm_page_alloc_extfrag(page, order,
+- current_order, start_migratetype, migratetype,
+- new_type == start_migratetype);
++ trace_mm_page_alloc_extfrag(page, order, current_order,
++ start_migratetype, migratetype, new_type);
+
+ return page;
+ }
+@@ -1166,9 +1191,9 @@ retry_reserve:
+ */
+ static int rmqueue_bulk(struct zone *zone, unsigned int order,
+ unsigned long count, struct list_head *list,
+- int migratetype, int cold)
++ int migratetype, bool cold)
+ {
+- int mt = migratetype, i;
++ int i;
+
+ spin_lock(&zone->lock);
+ for (i = 0; i < count; ++i) {
+@@ -1185,18 +1210,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
+ * merge IO requests if the physical pages are ordered
+ * properly.
+ */
+- if (likely(cold == 0))
++ if (likely(!cold))
+ list_add(&page->lru, list);
+ else
+ list_add_tail(&page->lru, list);
+- if (IS_ENABLED(CONFIG_CMA)) {
+- mt = get_pageblock_migratetype(page);
+- if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
+- mt = migratetype;
+- }
+- set_freepage_migratetype(page, mt);
+ list = &page->lru;
+- if (is_migrate_cma(mt))
++ if (is_migrate_cma(get_freepage_migratetype(page)))
+ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
+ -(1 << order));
+ }
+@@ -1320,7 +1339,7 @@ void mark_free_pages(struct zone *zone)
+ {
+ unsigned long pfn, max_zone_pfn;
+ unsigned long flags;
+- int order, t;
++ unsigned int order, t;
+ struct list_head *curr;
+
+ if (zone_is_empty(zone))
+@@ -1352,19 +1371,20 @@ void mark_free_pages(struct zone *zone)
+
+ /*
+ * Free a 0-order page
+- * cold == 1 ? free a cold page : free a hot page
++ * cold == true ? free a cold page : free a hot page
+ */
+-void free_hot_cold_page(struct page *page, int cold)
++void free_hot_cold_page(struct page *page, bool cold)
+ {
+ struct zone *zone = page_zone(page);
+ struct per_cpu_pages *pcp;
+ unsigned long flags;
++ unsigned long pfn = page_to_pfn(page);
+ int migratetype;
+
+ if (!free_pages_prepare(page, 0))
+ return;
+
+- migratetype = get_pageblock_migratetype(page);
++ migratetype = get_pfnblock_migratetype(page, pfn);
+ set_freepage_migratetype(page, migratetype);
+ local_irq_save(flags);
+ __count_vm_event(PGFREE);
+@@ -1378,17 +1398,17 @@ void free_hot_cold_page(struct page *page, int cold)
+ */
+ if (migratetype >= MIGRATE_PCPTYPES) {
+ if (unlikely(is_migrate_isolate(migratetype))) {
+- free_one_page(zone, page, 0, migratetype);
++ free_one_page(zone, page, pfn, 0, migratetype);
+ goto out;
+ }
+ migratetype = MIGRATE_MOVABLE;
+ }
+
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+- if (cold)
+- list_add_tail(&page->lru, &pcp->lists[migratetype]);
+- else
++ if (!cold)
+ list_add(&page->lru, &pcp->lists[migratetype]);
++ else
++ list_add_tail(&page->lru, &pcp->lists[migratetype]);
+ pcp->count++;
+ if (pcp->count >= pcp->high) {
+ unsigned long batch = ACCESS_ONCE(pcp->batch);
+@@ -1403,7 +1423,7 @@ out:
+ /*
+ * Free a list of 0-order pages
+ */
+-void free_hot_cold_page_list(struct list_head *list, int cold)
++void free_hot_cold_page_list(struct list_head *list, bool cold)
+ {
+ struct page *page, *next;
+
+@@ -1515,12 +1535,12 @@ int split_free_page(struct page *page)
+ */
+ static inline
+ struct page *buffered_rmqueue(struct zone *preferred_zone,
+- struct zone *zone, int order, gfp_t gfp_flags,
+- int migratetype)
++ struct zone *zone, unsigned int order,
++ gfp_t gfp_flags, int migratetype)
+ {
+ unsigned long flags;
+ struct page *page;
+- int cold = !!(gfp_flags & __GFP_COLD);
++ bool cold = ((gfp_flags & __GFP_COLD) != 0);
+
+ again:
+ if (likely(order == 0)) {
+@@ -1565,10 +1585,13 @@ again:
+ if (!page)
+ goto failed;
+ __mod_zone_freepage_state(zone, -(1 << order),
+- get_pageblock_migratetype(page));
++ get_freepage_migratetype(page));
+ }
+
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
++ if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
++ !zone_is_fair_depleted(zone))
++ zone_set_flag(zone, ZONE_FAIR_DEPLETED);
+
+ __count_zone_vm_events(PGALLOC, zone, 1 << order);
+ zone_statistics(preferred_zone, zone, gfp_flags);
+@@ -1665,12 +1688,12 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+ * Return true if free pages are above 'mark'. This takes into account the order
+ * of the allocation.
+ */
+-static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+- int classzone_idx, int alloc_flags, long free_pages)
++static bool __zone_watermark_ok(struct zone *z, unsigned int order,
++ unsigned long mark, int classzone_idx, int alloc_flags,
++ long free_pages)
+ {
+ /* free_pages may go negative - that's OK */
+ long min = mark;
+- long lowmem_reserve = z->lowmem_reserve[classzone_idx];
+ int o;
+ long free_cma = 0;
+
+@@ -1685,7 +1708,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+ #endif
+
+- if (free_pages - free_cma <= min + lowmem_reserve)
++ if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
+ return false;
+ for (o = 0; o < order; o++) {
+ /* At the next order, this order's pages become unavailable */
+@@ -1700,15 +1723,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ return true;
+ }
+
+-bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
++bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+ {
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ zone_page_state(z, NR_FREE_PAGES));
+ }
+
+-bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+- int classzone_idx, int alloc_flags)
++bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
++ unsigned long mark, int classzone_idx, int alloc_flags)
+ {
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+@@ -1850,7 +1873,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
+ {
+ int i;
+
+- for_each_online_node(i)
++ for_each_node_state(i, N_MEMORY)
+ if (node_distance(nid, i) <= RECLAIM_DISTANCE)
+ node_set(i, NODE_DATA(nid)->reclaim_nodes);
+ else
+@@ -1893,6 +1916,18 @@ static inline void init_zone_allows_reclaim(int nid)
+ }
+ #endif /* CONFIG_NUMA */
+
++static void reset_alloc_batches(struct zone *preferred_zone)
++{
++ struct zone *zone = preferred_zone->zone_pgdat->node_zones;
++
++ do {
++ mod_zone_page_state(zone, NR_ALLOC_BATCH,
++ high_wmark_pages(zone) - low_wmark_pages(zone) -
++ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
++ zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
++ } while (zone++ != preferred_zone);
++}
++
+ /*
+ * get_page_from_freelist goes through the zonelist trying to allocate
+ * a page.
+@@ -1900,18 +1935,22 @@ static inline void init_zone_allows_reclaim(int nid)
+ static struct page *
+ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
+ struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
+- struct zone *preferred_zone, int migratetype)
++ struct zone *preferred_zone, int classzone_idx, int migratetype)
+ {
+ struct zoneref *z;
+ struct page *page = NULL;
+- int classzone_idx;
+ struct zone *zone;
+ nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+ int zlc_active = 0; /* set if using zonelist_cache */
+ int did_zlc_setup = 0; /* just call zlc_setup() one time */
++ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
++ (gfp_mask & __GFP_WRITE);
++ int nr_fair_skipped = 0;
++ bool zonelist_rescan;
+
+- classzone_idx = zone_idx(preferred_zone);
+ zonelist_scan:
++ zonelist_rescan = false;
++
+ /*
+ * Scan zonelist, looking for a zone with enough free.
+ * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
+@@ -1923,12 +1962,10 @@ zonelist_scan:
+ if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
+- if ((alloc_flags & ALLOC_CPUSET) &&
++ if (cpusets_enabled() &&
++ (alloc_flags & ALLOC_CPUSET) &&
+ !cpuset_zone_allowed_softwall(zone, gfp_mask))
+ continue;
+- BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
+- if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
+- goto try_this_zone;
+ /*
+ * Distribute pages in proportion to the individual
+ * zone size to ensure fair page aging. The zone a
+@@ -1937,9 +1974,11 @@ zonelist_scan:
+ */
+ if (alloc_flags & ALLOC_FAIR) {
+ if (!zone_local(preferred_zone, zone))
++ break;
++ if (zone_is_fair_depleted(zone)) {
++ nr_fair_skipped++;
+ continue;
+- if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+- continue;
++ }
+ }
+ /*
+ * When allocating a page cache page for writing, we
+@@ -1967,15 +2006,19 @@ zonelist_scan:
+ * will require awareness of zones in the
+ * dirty-throttling and the flusher threads.
+ */
+- if ((alloc_flags & ALLOC_WMARK_LOW) &&
+- (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+- goto this_zone_full;
++ if (consider_zone_dirty && !zone_dirty_ok(zone))
++ continue;
+
+ mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+ if (!zone_watermark_ok(zone, order, mark,
+ classzone_idx, alloc_flags)) {
+ int ret;
+
++ /* Checked here to keep the fast path fast */
++ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
++ if (alloc_flags & ALLOC_NO_WATERMARKS)
++ goto try_this_zone;
++
+ if (IS_ENABLED(CONFIG_NUMA) &&
+ !did_zlc_setup && nr_online_nodes > 1) {
+ /*
+@@ -2037,17 +2080,11 @@ try_this_zone:
+ if (page)
+ break;
+ this_zone_full:
+- if (IS_ENABLED(CONFIG_NUMA))
++ if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
+ zlc_mark_zone_full(zonelist, z);
+ }
+
+- if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
+- /* Disable zlc cache for second zonelist scan */
+- zlc_active = 0;
+- goto zonelist_scan;
+- }
+-
+- if (page)
++ if (page) {
+ /*
+ * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+ * necessary to allocate the page. The expectation is
+@@ -2056,8 +2093,37 @@ this_zone_full:
+ * for !PFMEMALLOC purposes.
+ */
+ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
++ return page;
++ }
+
+- return page;
++ /*
++ * The first pass makes sure allocations are spread fairly within the
++ * local node. However, the local node might have free pages left
++ * after the fairness batches are exhausted, and remote zones haven't
++ * even been considered yet. Try once more without fairness, and
++ * include remote zones now, before entering the slowpath and waking
++ * kswapd: prefer spilling to a remote zone over swapping locally.
++ */
++ if (alloc_flags & ALLOC_FAIR) {
++ alloc_flags &= ~ALLOC_FAIR;
++ if (nr_fair_skipped) {
++ zonelist_rescan = true;
++ reset_alloc_batches(preferred_zone);
++ }
++ if (nr_online_nodes > 1)
++ zonelist_rescan = true;
++ }
++
++ if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
++ /* Disable zlc cache for second zonelist scan */
++ zlc_active = 0;
++ zonelist_rescan = true;
++ }
++
++ if (zonelist_rescan)
++ goto zonelist_scan;
++
++ return NULL;
+ }
+
+ /*
+@@ -2173,7 +2239,7 @@ static inline struct page *
+ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, struct zone *preferred_zone,
+- int migratetype)
++ int classzone_idx, int migratetype)
+ {
+ struct page *page;
+
+@@ -2191,7 +2257,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+ order, zonelist, high_zoneidx,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET,
+- preferred_zone, migratetype);
++ preferred_zone, classzone_idx, migratetype);
+ if (page)
+ goto out;
+
+@@ -2226,7 +2292,7 @@ static struct page *
+ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+- int migratetype, bool sync_migration,
++ int classzone_idx, int migratetype, enum migrate_mode mode,
+ bool *contended_compaction, bool *deferred_compaction,
+ unsigned long *did_some_progress)
+ {
+@@ -2240,7 +2306,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+
+ current->flags |= PF_MEMALLOC;
+ *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+- nodemask, sync_migration,
++ nodemask, mode,
+ contended_compaction);
+ current->flags &= ~PF_MEMALLOC;
+
+@@ -2254,13 +2320,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ page = get_page_from_freelist(gfp_mask, nodemask,
+ order, zonelist, high_zoneidx,
+ alloc_flags & ~ALLOC_NO_WATERMARKS,
+- preferred_zone, migratetype);
++ preferred_zone, classzone_idx, migratetype);
+ if (page) {
+ preferred_zone->compact_blockskip_flush = false;
+- preferred_zone->compact_considered = 0;
+- preferred_zone->compact_defer_shift = 0;
+- if (order >= preferred_zone->compact_order_failed)
+- preferred_zone->compact_order_failed = order + 1;
++ compaction_defer_reset(preferred_zone, order, true);
+ count_vm_event(COMPACTSUCCESS);
+ return page;
+ }
+@@ -2276,7 +2339,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ * As async compaction considers a subset of pageblocks, only
+ * defer if the failure was a sync compaction failure.
+ */
+- if (sync_migration)
++ if (mode != MIGRATE_ASYNC)
+ defer_compaction(preferred_zone, order);
+
+ cond_resched();
+@@ -2289,9 +2352,9 @@ static inline struct page *
+ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+- int migratetype, bool sync_migration,
+- bool *contended_compaction, bool *deferred_compaction,
+- unsigned long *did_some_progress)
++ int classzone_idx, int migratetype,
++ enum migrate_mode mode, bool *contended_compaction,
++ bool *deferred_compaction, unsigned long *did_some_progress)
+ {
+ return NULL;
+ }
+@@ -2330,7 +2393,7 @@ static inline struct page *
+ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+- int migratetype, unsigned long *did_some_progress)
++ int classzone_idx, int migratetype, unsigned long *did_some_progress)
+ {
+ struct page *page = NULL;
+ bool drained = false;
+@@ -2348,7 +2411,8 @@ retry:
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
+ zonelist, high_zoneidx,
+ alloc_flags & ~ALLOC_NO_WATERMARKS,
+- preferred_zone, migratetype);
++ preferred_zone, classzone_idx,
++ migratetype);
+
+ /*
+ * If an allocation failed after direct reclaim, it could be because
+@@ -2371,14 +2435,14 @@ static inline struct page *
+ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, struct zone *preferred_zone,
+- int migratetype)
++ int classzone_idx, int migratetype)
+ {
+ struct page *page;
+
+ do {
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
+ zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
+- preferred_zone, migratetype);
++ preferred_zone, classzone_idx, migratetype);
+
+ if (!page && gfp_mask & __GFP_NOFAIL)
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
+@@ -2387,28 +2451,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+ return page;
+ }
+
+-static void reset_alloc_batches(struct zonelist *zonelist,
+- enum zone_type high_zoneidx,
+- struct zone *preferred_zone)
+-{
+- struct zoneref *z;
+- struct zone *zone;
+-
+- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+- /*
+- * Only reset the batches of zones that were actually
+- * considered in the fairness pass, we don't want to
+- * trash fairness information for zones that are not
+- * actually part of this zonelist's round-robin cycle.
+- */
+- if (!zone_local(preferred_zone, zone))
+- continue;
+- mod_zone_page_state(zone, NR_ALLOC_BATCH,
+- high_wmark_pages(zone) - low_wmark_pages(zone) -
+- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+- }
+-}
+-
+ static void wake_all_kswapds(unsigned int order,
+ struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+@@ -2479,14 +2521,14 @@ static inline struct page *
+ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, struct zone *preferred_zone,
+- int migratetype)
++ int classzone_idx, int migratetype)
+ {
+ const gfp_t wait = gfp_mask & __GFP_WAIT;
+ struct page *page = NULL;
+ int alloc_flags;
+ unsigned long pages_reclaimed = 0;
+ unsigned long did_some_progress;
+- bool sync_migration = false;
++ enum migrate_mode migration_mode = MIGRATE_ASYNC;
+ bool deferred_compaction = false;
+ bool contended_compaction = false;
+
+@@ -2528,15 +2570,19 @@ restart:
+ * Find the true preferred zone if the allocation is unconstrained by
+ * cpusets.
+ */
+- if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+- first_zones_zonelist(zonelist, high_zoneidx, NULL,
+- &preferred_zone);
++ if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
++ struct zoneref *preferred_zoneref;
++ preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
++ NULL,
++ &preferred_zone);
++ classzone_idx = zonelist_zone_idx(preferred_zoneref);
++ }
+
+ rebalance:
+ /* This is the last chance, in general, before the goto nopage. */
+ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
+ high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
+- preferred_zone, migratetype);
++ preferred_zone, classzone_idx, migratetype);
+ if (page)
+ goto got_pg;
+
+@@ -2551,7 +2597,7 @@ rebalance:
+
+ page = __alloc_pages_high_priority(gfp_mask, order,
+ zonelist, high_zoneidx, nodemask,
+- preferred_zone, migratetype);
++ preferred_zone, classzone_idx, migratetype);
+ if (page) {
+ goto got_pg;
+ }
+@@ -2573,17 +2619,16 @@ rebalance:
+ * Try direct compaction. The first pass is asynchronous. Subsequent
+ * attempts after direct reclaim are synchronous
+ */
+- page = __alloc_pages_direct_compact(gfp_mask, order,
+- zonelist, high_zoneidx,
+- nodemask,
+- alloc_flags, preferred_zone,
+- migratetype, sync_migration,
+- &contended_compaction,
++ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
++ high_zoneidx, nodemask, alloc_flags,
++ preferred_zone,
++ classzone_idx, migratetype,
++ migration_mode, &contended_compaction,
+ &deferred_compaction,
+ &did_some_progress);
+ if (page)
+ goto got_pg;
+- sync_migration = true;
++ migration_mode = MIGRATE_SYNC_LIGHT;
+
+ /*
+ * If compaction is deferred for high-order allocations, it is because
+@@ -2600,7 +2645,8 @@ rebalance:
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+- migratetype, &did_some_progress);
++ classzone_idx, migratetype,
++ &did_some_progress);
+ if (page)
+ goto got_pg;
+
+@@ -2619,7 +2665,7 @@ rebalance:
+ page = __alloc_pages_may_oom(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask, preferred_zone,
+- migratetype);
++ classzone_idx, migratetype);
+ if (page)
+ goto got_pg;
+
+@@ -2658,12 +2704,11 @@ rebalance:
+ * direct reclaim and reclaim/compaction depends on compaction
+ * being called after reclaim so call directly if necessary
+ */
+- page = __alloc_pages_direct_compact(gfp_mask, order,
+- zonelist, high_zoneidx,
+- nodemask,
+- alloc_flags, preferred_zone,
+- migratetype, sync_migration,
+- &contended_compaction,
++ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
++ high_zoneidx, nodemask, alloc_flags,
++ preferred_zone,
++ classzone_idx, migratetype,
++ migration_mode, &contended_compaction,
+ &deferred_compaction,
+ &did_some_progress);
+ if (page)
+@@ -2689,11 +2734,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+ {
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ struct zone *preferred_zone;
++ struct zoneref *preferred_zoneref;
+ struct page *page = NULL;
+ int migratetype = allocflags_to_migratetype(gfp_mask);
+ unsigned int cpuset_mems_cookie;
+ int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
+ struct mem_cgroup *memcg = NULL;
++ int classzone_idx;
+
+ gfp_mask &= gfp_allowed_mask;
+
+@@ -2720,42 +2767,26 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+ return NULL;
+
+ retry_cpuset:
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+
+ /* The preferred zone is used for statistics later */
+- first_zones_zonelist(zonelist, high_zoneidx,
++ preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
+ nodemask ? : &cpuset_current_mems_allowed,
+ &preferred_zone);
+ if (!preferred_zone)
+ goto out;
++ classzone_idx = zonelist_zone_idx(preferred_zoneref);
+
+ #ifdef CONFIG_CMA
+ if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+ #endif
+-retry:
+ /* First allocation attempt */
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+ zonelist, high_zoneidx, alloc_flags,
+- preferred_zone, migratetype);
++ preferred_zone, classzone_idx, migratetype);
+ if (unlikely(!page)) {
+ /*
+- * The first pass makes sure allocations are spread
+- * fairly within the local node. However, the local
+- * node might have free pages left after the fairness
+- * batches are exhausted, and remote zones haven't
+- * even been considered yet. Try once more without
+- * fairness, and include remote zones now, before
+- * entering the slowpath and waking kswapd: prefer
+- * spilling to a remote zone over swapping locally.
+- */
+- if (alloc_flags & ALLOC_FAIR) {
+- reset_alloc_batches(zonelist, high_zoneidx,
+- preferred_zone);
+- alloc_flags &= ~ALLOC_FAIR;
+- goto retry;
+- }
+- /*
+ * Runtime PM, block IO and its error handling path
+ * can deadlock because I/O on the device might not
+ * complete.
+@@ -2763,7 +2794,7 @@ retry:
+ gfp_mask = memalloc_noio_flags(gfp_mask);
+ page = __alloc_pages_slowpath(gfp_mask, order,
+ zonelist, high_zoneidx, nodemask,
+- preferred_zone, migratetype);
++ preferred_zone, classzone_idx, migratetype);
+ }
+
+ trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+@@ -2775,7 +2806,7 @@ out:
+ * the mask is being updated. If a page allocation is about to fail,
+ * check if the cpuset changed during allocation and if so, retry.
+ */
+- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+
+ memcg_kmem_commit_charge(page, memcg, order);
+@@ -2814,7 +2845,7 @@ void __free_pages(struct page *page, unsigned int order)
+ {
+ if (put_page_testzero(page)) {
+ if (order == 0)
+- free_hot_cold_page(page, 0);
++ free_hot_cold_page(page, false);
+ else
+ __free_pages_ok(page, order);
+ }
+@@ -3043,9 +3074,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
+ goto out;
+
+ do {
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+ ret = !node_isset(nid, cpuset_current_mems_allowed);
+- } while (!put_mems_allowed(cpuset_mems_cookie));
++ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+ out:
+ return ret;
+ }
+@@ -3198,12 +3229,12 @@ void show_free_areas(unsigned int filter)
+ K(zone_page_state(zone, NR_BOUNCE)),
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
+ K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
+- zone->pages_scanned,
++ K(zone_page_state(zone, NR_PAGES_SCANNED)),
+ (!zone_reclaimable(zone) ? "yes" : "no")
+ );
+ printk("lowmem_reserve[]:");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+- printk(" %lu", zone->lowmem_reserve[i]);
++ printk(" %ld", zone->lowmem_reserve[i]);
+ printk("\n");
+ }
+
+@@ -3943,6 +3974,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
+ struct page *page;
+ unsigned long block_migratetype;
+ int reserve;
++ int old_reserve;
+
+ /*
+ * Get the start pfn, end pfn and the number of blocks to reserve
+@@ -3964,6 +3996,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
+ * future allocation of hugepages at runtime.
+ */
+ reserve = min(2, reserve);
++ old_reserve = zone->nr_migrate_reserve_block;
++
++ /* When memory hot-add, we almost always need to do nothing */
++ if (reserve == old_reserve)
++ return;
++ zone->nr_migrate_reserve_block = reserve;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ if (!pfn_valid(pfn))
+@@ -4001,6 +4039,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
+ reserve--;
+ continue;
+ }
++ } else if (!old_reserve) {
++ /*
++ * At boot time we don't need to scan the whole zone
++ * for turning off MIGRATE_RESERVE.
++ */
++ break;
+ }
+
+ /*
+@@ -4080,7 +4124,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+
+ static void __meminit zone_init_free_lists(struct zone *zone)
+ {
+- int order, t;
++ unsigned int order, t;
+ for_each_migratetype_order(order, t) {
+ INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
+ zone->free_area[order].nr_free = 0;
+@@ -4903,7 +4947,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
+
+ pgdat->node_id = nid;
+ pgdat->node_start_pfn = node_start_pfn;
+- init_zone_allows_reclaim(nid);
++ if (node_state(nid, N_MEMORY))
++ init_zone_allows_reclaim(nid);
+ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ #endif
+@@ -5492,7 +5537,7 @@ static void calculate_totalreserve_pages(void)
+ for_each_online_pgdat(pgdat) {
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+- unsigned long max = 0;
++ long max = 0;
+
+ /* Find valid and maximum lowmem_reserve in the zone */
+ for (j = i; j < MAX_NR_ZONES; j++) {
+@@ -5734,7 +5779,12 @@ module_init(init_per_zone_wmark_min)
+ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+ {
+- proc_dointvec(table, write, buffer, length, ppos);
++ int rc;
++
++ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
++ if (rc)
++ return rc;
++
+ if (write) {
+ user_min_free_kbytes = min_free_kbytes;
+ setup_per_zone_wmarks();
+@@ -5976,17 +6026,16 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
+ * @end_bitidx: The last bit of interest
+ * returns pageblock_bits flags
+ */
+-unsigned long get_pageblock_flags_mask(struct page *page,
++unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+ {
+ struct zone *zone;
+ unsigned long *bitmap;
+- unsigned long pfn, bitidx, word_bitidx;
++ unsigned long bitidx, word_bitidx;
+ unsigned long word;
+
+ zone = page_zone(page);
+- pfn = page_to_pfn(page);
+ bitmap = get_pageblock_bitmap(zone, pfn);
+ bitidx = pfn_to_bitidx(zone, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+@@ -5998,25 +6047,25 @@ unsigned long get_pageblock_flags_mask(struct page *page,
+ }
+
+ /**
+- * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
++ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @start_bitidx: The first bit of interest
+ * @end_bitidx: The last bit of interest
+ * @flags: The flags to set
+ */
+-void set_pageblock_flags_mask(struct page *page, unsigned long flags,
++void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
++ unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+ {
+ struct zone *zone;
+ unsigned long *bitmap;
+- unsigned long pfn, bitidx, word_bitidx;
++ unsigned long bitidx, word_bitidx;
+ unsigned long old_word, word;
+
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+
+ zone = page_zone(page);
+- pfn = page_to_pfn(page);
+ bitmap = get_pageblock_bitmap(zone, pfn);
+ bitidx = pfn_to_bitidx(zone, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+@@ -6194,7 +6243,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
+ cc->nr_migratepages -= nr_reclaimed;
+
+ ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
+- 0, MIGRATE_SYNC, MR_CMA);
++ NULL, 0, cc->mode, MR_CMA);
+ }
+ if (ret < 0) {
+ putback_movable_pages(&cc->migratepages);
+@@ -6233,7 +6282,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(start)),
+- .sync = true,
++ .mode = MIGRATE_SYNC,
+ .ignore_skip_hint = true,
+ };
+ INIT_LIST_HEAD(&cc.migratepages);
+@@ -6388,7 +6437,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+ {
+ struct page *page;
+ struct zone *zone;
+- int order, i;
++ unsigned int order, i;
+ unsigned long pfn;
+ unsigned long flags;
+ /* find the first valid pfn */
+@@ -6440,7 +6489,7 @@ bool is_free_buddy_page(struct page *page)
+ struct zone *zone = page_zone(page);
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long flags;
+- int order;
++ unsigned int order;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+diff --git a/mm/readahead.c b/mm/readahead.c
+index e4ed04149785..0f35e983bffb 100644
+--- a/mm/readahead.c
++++ b/mm/readahead.c
+@@ -8,9 +8,7 @@
+ */
+
+ #include <linux/kernel.h>
+-#include <linux/fs.h>
+ #include <linux/gfp.h>
+-#include <linux/mm.h>
+ #include <linux/export.h>
+ #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
+@@ -20,6 +18,8 @@
+ #include <linux/syscalls.h>
+ #include <linux/file.h>
+
++#include "internal.h"
++
+ /*
+ * Initialise a struct file's readahead state. Assumes that the caller has
+ * memset *ra to zero.
+@@ -149,8 +149,7 @@ out:
+ *
+ * Returns the number of pages requested, or the maximum amount of I/O allowed.
+ */
+-static int
+-__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
++int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+ pgoff_t offset, unsigned long nr_to_read,
+ unsigned long lookahead_size)
+ {
+@@ -179,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+ rcu_read_lock();
+ page = radix_tree_lookup(&mapping->page_tree, page_offset);
+ rcu_read_unlock();
+- if (page)
++ if (page && !radix_tree_exceptional_entry(page))
+ continue;
+
+ page = page_cache_alloc_readahead(mapping);
+@@ -237,28 +236,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
+ return ret;
+ }
+
++#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE)
+ /*
+ * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
+ * sensible upper limit.
+ */
+ unsigned long max_sane_readahead(unsigned long nr)
+ {
+- return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
+- + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
+-}
+-
+-/*
+- * Submit IO for the read-ahead request in file_ra_state.
+- */
+-unsigned long ra_submit(struct file_ra_state *ra,
+- struct address_space *mapping, struct file *filp)
+-{
+- int actual;
+-
+- actual = __do_page_cache_readahead(mapping, filp,
+- ra->start, ra->size, ra->async_size);
+-
+- return actual;
++ return min(nr, MAX_READAHEAD);
+ }
+
+ /*
+@@ -351,7 +336,7 @@ static pgoff_t count_history_pages(struct address_space *mapping,
+ pgoff_t head;
+
+ rcu_read_lock();
+- head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
++ head = page_cache_prev_hole(mapping, offset - 1, max);
+ rcu_read_unlock();
+
+ return offset - 1 - head;
+@@ -401,6 +386,7 @@ ondemand_readahead(struct address_space *mapping,
+ unsigned long req_size)
+ {
+ unsigned long max = max_sane_readahead(ra->ra_pages);
++ pgoff_t prev_offset;
+
+ /*
+ * start of file
+@@ -430,7 +416,7 @@ ondemand_readahead(struct address_space *mapping,
+ pgoff_t start;
+
+ rcu_read_lock();
+- start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
++ start = page_cache_next_hole(mapping, offset + 1, max);
+ rcu_read_unlock();
+
+ if (!start || start - offset > max)
+@@ -452,8 +438,11 @@ ondemand_readahead(struct address_space *mapping,
+
+ /*
+ * sequential cache miss
++ * trivial case: (offset - prev_offset) == 1
++ * unaligned reads: (offset - prev_offset) == 0
+ */
+- if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
++ prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
++ if (offset - prev_offset <= 1UL)
+ goto initial_readahead;
+
+ /*
+diff --git a/mm/shmem.c b/mm/shmem.c
+index 0da81aaeb4cc..ab05681f41cd 100644
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -243,19 +243,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
+ pgoff_t index, void *expected, void *replacement)
+ {
+ void **pslot;
+- void *item = NULL;
++ void *item;
+
+ VM_BUG_ON(!expected);
++ VM_BUG_ON(!replacement);
+ pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
+- if (pslot)
+- item = radix_tree_deref_slot_protected(pslot,
+- &mapping->tree_lock);
++ if (!pslot)
++ return -ENOENT;
++ item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
+ if (item != expected)
+ return -ENOENT;
+- if (replacement)
+- radix_tree_replace_slot(pslot, replacement);
+- else
+- radix_tree_delete(&mapping->page_tree, index);
++ radix_tree_replace_slot(pslot, replacement);
+ return 0;
+ }
+
+@@ -332,84 +330,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
+ }
+
+ /*
+- * Like find_get_pages, but collecting swap entries as well as pages.
+- */
+-static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
+- pgoff_t start, unsigned int nr_pages,
+- struct page **pages, pgoff_t *indices)
+-{
+- void **slot;
+- unsigned int ret = 0;
+- struct radix_tree_iter iter;
+-
+- if (!nr_pages)
+- return 0;
+-
+- rcu_read_lock();
+-restart:
+- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+- struct page *page;
+-repeat:
+- page = radix_tree_deref_slot(slot);
+- if (unlikely(!page))
+- continue;
+- if (radix_tree_exception(page)) {
+- if (radix_tree_deref_retry(page))
+- goto restart;
+- /*
+- * Otherwise, we must be storing a swap entry
+- * here as an exceptional entry: so return it
+- * without attempting to raise page count.
+- */
+- goto export;
+- }
+- if (!page_cache_get_speculative(page))
+- goto repeat;
+-
+- /* Has the page moved? */
+- if (unlikely(page != *slot)) {
+- page_cache_release(page);
+- goto repeat;
+- }
+-export:
+- indices[ret] = iter.index;
+- pages[ret] = page;
+- if (++ret == nr_pages)
+- break;
+- }
+- rcu_read_unlock();
+- return ret;
+-}
+-
+-/*
+ * Remove swap entry from radix tree, free the swap and its page cache.
+ */
+ static int shmem_free_swap(struct address_space *mapping,
+ pgoff_t index, void *radswap)
+ {
+- int error;
++ void *old;
+
+ spin_lock_irq(&mapping->tree_lock);
+- error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
++ old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
+ spin_unlock_irq(&mapping->tree_lock);
+- if (!error)
+- free_swap_and_cache(radix_to_swp_entry(radswap));
+- return error;
+-}
+-
+-/*
+- * Pagevec may contain swap entries, so shuffle up pages before releasing.
+- */
+-static void shmem_deswap_pagevec(struct pagevec *pvec)
+-{
+- int i, j;
+-
+- for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+- struct page *page = pvec->pages[i];
+- if (!radix_tree_exceptional_entry(page))
+- pvec->pages[j++] = page;
+- }
+- pvec->nr = j;
++ if (old != radswap)
++ return -ENOENT;
++ free_swap_and_cache(radix_to_swp_entry(radswap));
++ return 0;
+ }
+
+ /*
+@@ -430,12 +364,12 @@ void shmem_unlock_mapping(struct address_space *mapping)
+ * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
+ * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
+ */
+- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+- PAGEVEC_SIZE, pvec.pages, indices);
++ pvec.nr = find_get_entries(mapping, index,
++ PAGEVEC_SIZE, pvec.pages, indices);
+ if (!pvec.nr)
+ break;
+ index = indices[pvec.nr - 1] + 1;
+- shmem_deswap_pagevec(&pvec);
++ pagevec_remove_exceptionals(&pvec);
+ check_move_unevictable_pages(pvec.pages, pvec.nr);
+ pagevec_release(&pvec);
+ cond_resched();
+@@ -467,9 +401,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+ pagevec_init(&pvec, 0);
+ index = start;
+ while (index < end) {
+- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+- min(end - index, (pgoff_t)PAGEVEC_SIZE),
+- pvec.pages, indices);
++ pvec.nr = find_get_entries(mapping, index,
++ min(end - index, (pgoff_t)PAGEVEC_SIZE),
++ pvec.pages, indices);
+ if (!pvec.nr)
+ break;
+ mem_cgroup_uncharge_start();
+@@ -498,7 +432,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+ }
+ unlock_page(page);
+ }
+- shmem_deswap_pagevec(&pvec);
++ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ cond_resched();
+@@ -536,9 +470,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+ index = start;
+ while (index < end) {
+ cond_resched();
+- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
++
++ pvec.nr = find_get_entries(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+- pvec.pages, indices);
++ pvec.pages, indices);
+ if (!pvec.nr) {
+ /* If all gone or hole-punch or unfalloc, we're done */
+ if (index == start || end != -1)
+@@ -581,7 +516,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+ }
+ unlock_page(page);
+ }
+- shmem_deswap_pagevec(&pvec);
++ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ index++;
+@@ -1090,7 +1025,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
+ return -EFBIG;
+ repeat:
+ swap.val = 0;
+- page = find_lock_page(mapping, index);
++ page = find_lock_entry(mapping, index);
+ if (radix_tree_exceptional_entry(page)) {
+ swap = radix_to_swp_entry(page);
+ page = NULL;
+@@ -1102,6 +1037,9 @@ repeat:
+ goto failed;
+ }
+
++ if (page && sgp == SGP_WRITE)
++ mark_page_accessed(page);
++
+ /* fallocated page? */
+ if (page && !PageUptodate(page)) {
+ if (sgp != SGP_READ)
+@@ -1183,6 +1121,9 @@ repeat:
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
+
++ if (sgp == SGP_WRITE)
++ mark_page_accessed(page);
++
+ delete_from_swap_cache(page);
+ set_page_dirty(page);
+ swap_free(swap);
+@@ -1207,8 +1148,11 @@ repeat:
+ goto decused;
+ }
+
+- SetPageSwapBacked(page);
++ __SetPageSwapBacked(page);
+ __set_page_locked(page);
++ if (sgp == SGP_WRITE)
++ init_page_accessed(page);
++
+ error = mem_cgroup_cache_charge(page, current->mm,
+ gfp & GFP_RECLAIM_MASK);
+ if (error)
+@@ -1485,6 +1429,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
+ return inode;
+ }
+
++bool shmem_mapping(struct address_space *mapping)
++{
++ return mapping->backing_dev_info == &shmem_backing_dev_info;
++}
++
+ #ifdef CONFIG_TMPFS
+ static const struct inode_operations shmem_symlink_inode_operations;
+ static const struct inode_operations shmem_short_symlink_operations;
+@@ -1797,7 +1746,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
+ pagevec_init(&pvec, 0);
+ pvec.nr = 1; /* start small: we may be there already */
+ while (!done) {
+- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
++ pvec.nr = find_get_entries(mapping, index,
+ pvec.nr, pvec.pages, indices);
+ if (!pvec.nr) {
+ if (whence == SEEK_DATA)
+@@ -1824,7 +1773,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
+ break;
+ }
+ }
+- shmem_deswap_pagevec(&pvec);
++ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ pvec.nr = PAGEVEC_SIZE;
+ cond_resched();
+diff --git a/mm/slab.c b/mm/slab.c
+index 2580db062df9..eb4078c7d183 100644
+--- a/mm/slab.c
++++ b/mm/slab.c
+@@ -930,7 +930,8 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+ {
+ if (unlikely(pfmemalloc_active)) {
+ /* Some pfmemalloc slabs exist, check if this is one */
+- struct page *page = virt_to_head_page(objp);
++ struct slab *slabp = virt_to_slab(objp);
++ struct page *page = virt_to_head_page(slabp->s_mem);
+ if (PageSlabPfmemalloc(page))
+ set_obj_pfmemalloc(&objp);
+ }
+@@ -1776,7 +1777,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+ __SetPageSlab(page + i);
+
+ if (page->pfmemalloc)
+- SetPageSlabPfmemalloc(page + i);
++ SetPageSlabPfmemalloc(page);
+ }
+ memcg_bind_pages(cachep, cachep->gfporder);
+
+@@ -1809,9 +1810,10 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
+ else
+ sub_zone_page_state(page_zone(page),
+ NR_SLAB_UNRECLAIMABLE, nr_freed);
++
++ __ClearPageSlabPfmemalloc(page);
+ while (i--) {
+ BUG_ON(!PageSlab(page));
+- __ClearPageSlabPfmemalloc(page);
+ __ClearPageSlab(page);
+ page++;
+ }
+@@ -3220,7 +3222,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
+ local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
+
+ retry_cpuset:
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = node_zonelist(slab_node(), flags);
+
+ retry:
+@@ -3276,7 +3278,7 @@ retry:
+ }
+ }
+
+- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
++ if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+ return obj;
+ }
+diff --git a/mm/slub.c b/mm/slub.c
+index 5c1343a391d0..a88d94cfee20 100644
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -1635,7 +1635,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
+ return NULL;
+
+ do {
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = node_zonelist(slab_node(), flags);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ struct kmem_cache_node *n;
+@@ -1647,19 +1647,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
+ object = get_partial_node(s, n, c, flags);
+ if (object) {
+ /*
+- * Return the object even if
+- * put_mems_allowed indicated that
+- * the cpuset mems_allowed was
+- * updated in parallel. It's a
+- * harmless race between the alloc
+- * and the cpuset update.
++ * Don't check read_mems_allowed_retry()
++ * here - if mems_allowed was updated in
++ * parallel, that was a harmless race
++ * between allocation and the cpuset
++ * update
+ */
+- put_mems_allowed(cpuset_mems_cookie);
+ return object;
+ }
+ }
+ }
+- } while (!put_mems_allowed(cpuset_mems_cookie));
++ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+ #endif
+ return NULL;
+ }
+diff --git a/mm/swap.c b/mm/swap.c
+index aa4da5d9401d..16e70ce1912a 100644
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -68,7 +68,7 @@ static void __page_cache_release(struct page *page)
+ static void __put_single_page(struct page *page)
+ {
+ __page_cache_release(page);
+- free_hot_cold_page(page, 0);
++ free_hot_cold_page(page, false);
+ }
+
+ static void __put_compound_page(struct page *page)
+@@ -437,7 +437,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
+ SetPageActive(page);
+ lru += LRU_ACTIVE;
+ add_page_to_lru_list(page, lruvec, lru);
+- trace_mm_lru_activate(page, page_to_pfn(page));
++ trace_mm_lru_activate(page);
+
+ __count_vm_event(PGACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 1);
+@@ -549,12 +549,17 @@ void mark_page_accessed(struct page *page)
+ EXPORT_SYMBOL(mark_page_accessed);
+
+ /*
+- * Queue the page for addition to the LRU via pagevec. The decision on whether
+- * to add the page to the [in]active [file|anon] list is deferred until the
+- * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
+- * have the page added to the active list using mark_page_accessed().
++ * Used to mark_page_accessed(page) that is not visible yet and when it is
++ * still safe to use non-atomic ops
+ */
+-void __lru_cache_add(struct page *page)
++void init_page_accessed(struct page *page)
++{
++ if (!PageReferenced(page))
++ __SetPageReferenced(page);
++}
++EXPORT_SYMBOL(init_page_accessed);
++
++static void __lru_cache_add(struct page *page)
+ {
+ struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+
+@@ -564,11 +569,34 @@ void __lru_cache_add(struct page *page)
+ pagevec_add(pvec, page);
+ put_cpu_var(lru_add_pvec);
+ }
+-EXPORT_SYMBOL(__lru_cache_add);
++
++/**
++ * lru_cache_add: add a page to the page lists
++ * @page: the page to add
++ */
++void lru_cache_add_anon(struct page *page)
++{
++ if (PageActive(page))
++ ClearPageActive(page);
++ __lru_cache_add(page);
++}
++
++void lru_cache_add_file(struct page *page)
++{
++ if (PageActive(page))
++ ClearPageActive(page);
++ __lru_cache_add(page);
++}
++EXPORT_SYMBOL(lru_cache_add_file);
+
+ /**
+ * lru_cache_add - add a page to a page list
+ * @page: the page to be added to the LRU.
++ *
++ * Queue the page for addition to the LRU via pagevec. The decision on whether
++ * to add the page to the [in]active [file|anon] list is deferred until the
++ * pagevec is drained. This gives a chance for the caller of lru_cache_add()
++ * have the page added to the active list using mark_page_accessed().
+ */
+ void lru_cache_add(struct page *page)
+ {
+@@ -779,7 +807,7 @@ void lru_add_drain_all(void)
+ * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
+ * will free it.
+ */
+-void release_pages(struct page **pages, int nr, int cold)
++void release_pages(struct page **pages, int nr, bool cold)
+ {
+ int i;
+ LIST_HEAD(pages_to_free);
+@@ -820,7 +848,7 @@ void release_pages(struct page **pages, int nr, int cold)
+ }
+
+ /* Clear Active bit in case of parallel mark_page_accessed */
+- ClearPageActive(page);
++ __ClearPageActive(page);
+
+ list_add(&page->lru, &pages_to_free);
+ }
+@@ -902,7 +930,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
+ SetPageLRU(page);
+ add_page_to_lru_list(page, lruvec, lru);
+ update_page_reclaim_stat(lruvec, file, active);
+- trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
++ trace_mm_lru_insertion(page, lru);
+ }
+
+ /*
+@@ -916,6 +944,57 @@ void __pagevec_lru_add(struct pagevec *pvec)
+ EXPORT_SYMBOL(__pagevec_lru_add);
+
+ /**
++ * pagevec_lookup_entries - gang pagecache lookup
++ * @pvec: Where the resulting entries are placed
++ * @mapping: The address_space to search
++ * @start: The starting entry index
++ * @nr_entries: The maximum number of entries
++ * @indices: The cache indices corresponding to the entries in @pvec
++ *
++ * pagevec_lookup_entries() will search for and return a group of up
++ * to @nr_entries pages and shadow entries in the mapping. All
++ * entries are placed in @pvec. pagevec_lookup_entries() takes a
++ * reference against actual pages in @pvec.
++ *
++ * The search returns a group of mapping-contiguous entries with
++ * ascending indexes. There may be holes in the indices due to
++ * not-present entries.
++ *
++ * pagevec_lookup_entries() returns the number of entries which were
++ * found.
++ */
++unsigned pagevec_lookup_entries(struct pagevec *pvec,
++ struct address_space *mapping,
++ pgoff_t start, unsigned nr_pages,
++ pgoff_t *indices)
++{
++ pvec->nr = find_get_entries(mapping, start, nr_pages,
++ pvec->pages, indices);
++ return pagevec_count(pvec);
++}
++
++/**
++ * pagevec_remove_exceptionals - pagevec exceptionals pruning
++ * @pvec: The pagevec to prune
++ *
++ * pagevec_lookup_entries() fills both pages and exceptional radix
++ * tree entries into the pagevec. This function prunes all
++ * exceptionals from @pvec without leaving holes, so that it can be
++ * passed on to page-only pagevec operations.
++ */
++void pagevec_remove_exceptionals(struct pagevec *pvec)
++{
++ int i, j;
++
++ for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
++ struct page *page = pvec->pages[i];
++ if (!radix_tree_exceptional_entry(page))
++ pvec->pages[j++] = page;
++ }
++ pvec->nr = j;
++}
++
++/**
+ * pagevec_lookup - gang pagecache lookup
+ * @pvec: Where the resulting pages are placed
+ * @mapping: The address_space to search
+diff --git a/mm/swap_state.c b/mm/swap_state.c
+index e6f15f8ca2af..4079edfff2cc 100644
+--- a/mm/swap_state.c
++++ b/mm/swap_state.c
+@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void)
+ return ret;
+ }
+
++static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
++
+ void show_swap_cache_info(void)
+ {
+ printk("%lu pages in swap cache\n", total_swapcache_pages());
+@@ -268,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
+
+ for (i = 0; i < todo; i++)
+ free_swap_cache(pagep[i]);
+- release_pages(pagep, todo, 0);
++ release_pages(pagep, todo, false);
+ pagep += todo;
+ nr -= todo;
+ }
+@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry)
+
+ page = find_get_page(swap_address_space(entry), entry.val);
+
+- if (page)
++ if (page) {
+ INC_CACHE_INFO(find_success);
++ if (TestClearPageReadahead(page))
++ atomic_inc(&swapin_readahead_hits);
++ }
+
+ INC_CACHE_INFO(find_total);
+ return page;
+@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ return found_page;
+ }
+
++static unsigned long swapin_nr_pages(unsigned long offset)
++{
++ static unsigned long prev_offset;
++ unsigned int pages, max_pages, last_ra;
++ static atomic_t last_readahead_pages;
++
++ max_pages = 1 << ACCESS_ONCE(page_cluster);
++ if (max_pages <= 1)
++ return 1;
++
++ /*
++ * This heuristic has been found to work well on both sequential and
++ * random loads, swapping to hard disk or to SSD: please don't ask
++ * what the "+ 2" means, it just happens to work well, that's all.
++ */
++ pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
++ if (pages == 2) {
++ /*
++ * We can have no readahead hits to judge by: but must not get
++ * stuck here forever, so check for an adjacent offset instead
++ * (and don't even bother to check whether swap type is same).
++ */
++ if (offset != prev_offset + 1 && offset != prev_offset - 1)
++ pages = 1;
++ prev_offset = offset;
++ } else {
++ unsigned int roundup = 4;
++ while (roundup < pages)
++ roundup <<= 1;
++ pages = roundup;
++ }
++
++ if (pages > max_pages)
++ pages = max_pages;
++
++ /* Don't shrink readahead too fast */
++ last_ra = atomic_read(&last_readahead_pages) / 2;
++ if (pages < last_ra)
++ pages = last_ra;
++ atomic_set(&last_readahead_pages, pages);
++
++ return pages;
++}
++
+ /**
+ * swapin_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr)
+ {
+ struct page *page;
+- unsigned long offset = swp_offset(entry);
++ unsigned long entry_offset = swp_offset(entry);
++ unsigned long offset = entry_offset;
+ unsigned long start_offset, end_offset;
+- unsigned long mask = (1UL << page_cluster) - 1;
++ unsigned long mask;
+ struct blk_plug plug;
+
++ mask = swapin_nr_pages(offset) - 1;
++ if (!mask)
++ goto skip;
++
+ /* Read a page_cluster sized and aligned cluster around offset. */
+ start_offset = offset & ~mask;
+ end_offset = offset | mask;
+@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
+ gfp_mask, vma, addr);
+ if (!page)
+ continue;
++ if (offset != entry_offset)
++ SetPageReadahead(page);
+ page_cache_release(page);
+ }
+ blk_finish_plug(&plug);
+
+ lru_add_drain(); /* Push any new pages onto the LRU now */
++skip:
+ return read_swap_cache_async(entry, gfp_mask, vma, addr);
+ }
+diff --git a/mm/swapfile.c b/mm/swapfile.c
+index 0ec2eaf3ccfd..660b9c0e2e40 100644
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages;
+ /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
+ long total_swap_pages;
+ static int least_priority;
+-static atomic_t highest_priority_index = ATOMIC_INIT(-1);
+
+ static const char Bad_file[] = "Bad swap file entry ";
+ static const char Unused_file[] = "Unused swap file entry ";
+ static const char Bad_offset[] = "Bad swap offset entry ";
+ static const char Unused_offset[] = "Unused swap offset entry ";
+
+-struct swap_list_t swap_list = {-1, -1};
++/*
++ * all active swap_info_structs
++ * protected with swap_lock, and ordered by priority.
++ */
++PLIST_HEAD(swap_active_head);
++
++/*
++ * all available (active, not full) swap_info_structs
++ * protected with swap_avail_lock, ordered by priority.
++ * This is used by get_swap_page() instead of swap_active_head
++ * because swap_active_head includes all swap_info_structs,
++ * but get_swap_page() doesn't need to look at full ones.
++ * This uses its own lock instead of swap_lock because when a
++ * swap_info_struct changes between not-full/full, it needs to
++ * add/remove itself to/from this list, but the swap_info_struct->lock
++ * is held and the locking order requires swap_lock to be taken
++ * before any swap_info_struct->lock.
++ */
++static PLIST_HEAD(swap_avail_head);
++static DEFINE_SPINLOCK(swap_avail_lock);
+
+ struct swap_info_struct *swap_info[MAX_SWAPFILES];
+
+@@ -591,6 +609,9 @@ checks:
+ if (si->inuse_pages == si->pages) {
+ si->lowest_bit = si->max;
+ si->highest_bit = 0;
++ spin_lock(&swap_avail_lock);
++ plist_del(&si->avail_list, &swap_avail_head);
++ spin_unlock(&swap_avail_lock);
+ }
+ si->swap_map[offset] = usage;
+ inc_cluster_info_page(si, si->cluster_info, offset);
+@@ -639,71 +660,65 @@ no_page:
+
+ swp_entry_t get_swap_page(void)
+ {
+- struct swap_info_struct *si;
++ struct swap_info_struct *si, *next;
+ pgoff_t offset;
+- int type, next;
+- int wrapped = 0;
+- int hp_index;
+
+- spin_lock(&swap_lock);
+ if (atomic_long_read(&nr_swap_pages) <= 0)
+ goto noswap;
+ atomic_long_dec(&nr_swap_pages);
+
+- for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+- hp_index = atomic_xchg(&highest_priority_index, -1);
+- /*
+- * highest_priority_index records current highest priority swap
+- * type which just frees swap entries. If its priority is
+- * higher than that of swap_list.next swap type, we use it. It
+- * isn't protected by swap_lock, so it can be an invalid value
+- * if the corresponding swap type is swapoff. We double check
+- * the flags here. It's even possible the swap type is swapoff
+- * and swapon again and its priority is changed. In such rare
+- * case, low prority swap type might be used, but eventually
+- * high priority swap will be used after several rounds of
+- * swap.
+- */
+- if (hp_index != -1 && hp_index != type &&
+- swap_info[type]->prio < swap_info[hp_index]->prio &&
+- (swap_info[hp_index]->flags & SWP_WRITEOK)) {
+- type = hp_index;
+- swap_list.next = type;
+- }
+-
+- si = swap_info[type];
+- next = si->next;
+- if (next < 0 ||
+- (!wrapped && si->prio != swap_info[next]->prio)) {
+- next = swap_list.head;
+- wrapped++;
+- }
++ spin_lock(&swap_avail_lock);
+
++start_over:
++ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
++ /* requeue si to after same-priority siblings */
++ plist_requeue(&si->avail_list, &swap_avail_head);
++ spin_unlock(&swap_avail_lock);
+ spin_lock(&si->lock);
+- if (!si->highest_bit) {
++ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
++ spin_lock(&swap_avail_lock);
++ if (plist_node_empty(&si->avail_list)) {
++ spin_unlock(&si->lock);
++ goto nextsi;
++ }
++ WARN(!si->highest_bit,
++ "swap_info %d in list but !highest_bit\n",
++ si->type);
++ WARN(!(si->flags & SWP_WRITEOK),
++ "swap_info %d in list but !SWP_WRITEOK\n",
++ si->type);
++ plist_del(&si->avail_list, &swap_avail_head);
+ spin_unlock(&si->lock);
+- continue;
++ goto nextsi;
+ }
+- if (!(si->flags & SWP_WRITEOK)) {
+- spin_unlock(&si->lock);
+- continue;
+- }
+-
+- swap_list.next = next;
+
+- spin_unlock(&swap_lock);
+ /* This is called for allocating swap entry for cache */
+ offset = scan_swap_map(si, SWAP_HAS_CACHE);
+ spin_unlock(&si->lock);
+ if (offset)
+- return swp_entry(type, offset);
+- spin_lock(&swap_lock);
+- next = swap_list.next;
++ return swp_entry(si->type, offset);
++ pr_debug("scan_swap_map of si %d failed to find offset\n",
++ si->type);
++ spin_lock(&swap_avail_lock);
++nextsi:
++ /*
++ * if we got here, it's likely that si was almost full before,
++ * and since scan_swap_map() can drop the si->lock, multiple
++ * callers probably all tried to get a page from the same si
++ * and it filled up before we could get one; or, the si filled
++ * up between us dropping swap_avail_lock and taking si->lock.
++ * Since we dropped the swap_avail_lock, the swap_avail_head
++ * list may have been modified; so if next is still in the
++ * swap_avail_head list then try it, otherwise start over.
++ */
++ if (plist_node_empty(&next->avail_list))
++ goto start_over;
+ }
+
++ spin_unlock(&swap_avail_lock);
++
+ atomic_long_inc(&nr_swap_pages);
+ noswap:
+- spin_unlock(&swap_lock);
+ return (swp_entry_t) {0};
+ }
+
+@@ -765,27 +780,6 @@ out:
+ return NULL;
+ }
+
+-/*
+- * This swap type frees swap entry, check if it is the highest priority swap
+- * type which just frees swap entry. get_swap_page() uses
+- * highest_priority_index to search highest priority swap type. The
+- * swap_info_struct.lock can't protect us if there are multiple swap types
+- * active, so we use atomic_cmpxchg.
+- */
+-static void set_highest_priority_index(int type)
+-{
+- int old_hp_index, new_hp_index;
+-
+- do {
+- old_hp_index = atomic_read(&highest_priority_index);
+- if (old_hp_index != -1 &&
+- swap_info[old_hp_index]->prio >= swap_info[type]->prio)
+- break;
+- new_hp_index = type;
+- } while (atomic_cmpxchg(&highest_priority_index,
+- old_hp_index, new_hp_index) != old_hp_index);
+-}
+-
+ static unsigned char swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry, unsigned char usage)
+ {
+@@ -827,9 +821,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
+ dec_cluster_info_page(p, p->cluster_info, offset);
+ if (offset < p->lowest_bit)
+ p->lowest_bit = offset;
+- if (offset > p->highest_bit)
++ if (offset > p->highest_bit) {
++ bool was_full = !p->highest_bit;
+ p->highest_bit = offset;
+- set_highest_priority_index(p->type);
++ if (was_full && (p->flags & SWP_WRITEOK)) {
++ spin_lock(&swap_avail_lock);
++ WARN_ON(!plist_node_empty(&p->avail_list));
++ if (plist_node_empty(&p->avail_list))
++ plist_add(&p->avail_list,
++ &swap_avail_head);
++ spin_unlock(&swap_avail_lock);
++ }
++ }
+ atomic_long_inc(&nr_swap_pages);
+ p->inuse_pages--;
+ frontswap_invalidate_page(p->type, offset);
+@@ -1764,30 +1767,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
+ {
+- int i, prev;
+-
+ if (prio >= 0)
+ p->prio = prio;
+ else
+ p->prio = --least_priority;
++ /*
++ * the plist prio is negated because plist ordering is
++ * low-to-high, while swap ordering is high-to-low
++ */
++ p->list.prio = -p->prio;
++ p->avail_list.prio = -p->prio;
+ p->swap_map = swap_map;
+ p->cluster_info = cluster_info;
+ p->flags |= SWP_WRITEOK;
+ atomic_long_add(p->pages, &nr_swap_pages);
+ total_swap_pages += p->pages;
+
+- /* insert swap space into swap_list: */
+- prev = -1;
+- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+- if (p->prio >= swap_info[i]->prio)
+- break;
+- prev = i;
+- }
+- p->next = i;
+- if (prev < 0)
+- swap_list.head = swap_list.next = p->type;
+- else
+- swap_info[prev]->next = p->type;
++ assert_spin_locked(&swap_lock);
++ /*
++ * both lists are plists, and thus priority ordered.
++ * swap_active_head needs to be priority ordered for swapoff(),
++ * which on removal of any swap_info_struct with an auto-assigned
++ * (i.e. negative) priority increments the auto-assigned priority
++ * of any lower-priority swap_info_structs.
++ * swap_avail_head needs to be priority ordered for get_swap_page(),
++ * which allocates swap pages from the highest available priority
++ * swap_info_struct.
++ */
++ plist_add(&p->list, &swap_active_head);
++ spin_lock(&swap_avail_lock);
++ plist_add(&p->avail_list, &swap_avail_head);
++ spin_unlock(&swap_avail_lock);
+ }
+
+ static void enable_swap_info(struct swap_info_struct *p, int prio,
+@@ -1822,8 +1832,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ struct address_space *mapping;
+ struct inode *inode;
+ struct filename *pathname;
+- int i, type, prev;
+- int err;
++ int err, found = 0;
+ unsigned int old_block_size;
+
+ if (!capable(CAP_SYS_ADMIN))
+@@ -1841,17 +1850,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ goto out;
+
+ mapping = victim->f_mapping;
+- prev = -1;
+ spin_lock(&swap_lock);
+- for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
+- p = swap_info[type];
++ plist_for_each_entry(p, &swap_active_head, list) {
+ if (p->flags & SWP_WRITEOK) {
+- if (p->swap_file->f_mapping == mapping)
++ if (p->swap_file->f_mapping == mapping) {
++ found = 1;
+ break;
++ }
+ }
+- prev = type;
+ }
+- if (type < 0) {
++ if (!found) {
+ err = -EINVAL;
+ spin_unlock(&swap_lock);
+ goto out_dput;
+@@ -1863,20 +1871,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ spin_unlock(&swap_lock);
+ goto out_dput;
+ }
+- if (prev < 0)
+- swap_list.head = p->next;
+- else
+- swap_info[prev]->next = p->next;
+- if (type == swap_list.next) {
+- /* just pick something that's safe... */
+- swap_list.next = swap_list.head;
+- }
++ spin_lock(&swap_avail_lock);
++ plist_del(&p->avail_list, &swap_avail_head);
++ spin_unlock(&swap_avail_lock);
+ spin_lock(&p->lock);
+ if (p->prio < 0) {
+- for (i = p->next; i >= 0; i = swap_info[i]->next)
+- swap_info[i]->prio = p->prio--;
++ struct swap_info_struct *si = p;
++
++ plist_for_each_entry_continue(si, &swap_active_head, list) {
++ si->prio++;
++ si->list.prio--;
++ si->avail_list.prio--;
++ }
+ least_priority++;
+ }
++ plist_del(&p->list, &swap_active_head);
+ atomic_long_sub(p->pages, &nr_swap_pages);
+ total_swap_pages -= p->pages;
+ p->flags &= ~SWP_WRITEOK;
+@@ -1884,7 +1893,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ spin_unlock(&swap_lock);
+
+ set_current_oom_origin();
+- err = try_to_unuse(type, false, 0); /* force all pages to be unused */
++ err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
+ clear_current_oom_origin();
+
+ if (err) {
+@@ -1926,7 +1935,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ frontswap_map_set(p, NULL);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+- frontswap_invalidate_area(type);
++ frontswap_invalidate_area(p->type);
+ mutex_unlock(&swapon_mutex);
+ free_percpu(p->percpu_cluster);
+ p->percpu_cluster = NULL;
+@@ -1934,7 +1943,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ vfree(cluster_info);
+ vfree(frontswap_map);
+ /* Destroy swap account informatin */
+- swap_cgroup_swapoff(type);
++ swap_cgroup_swapoff(p->type);
+
+ inode = mapping->host;
+ if (S_ISBLK(inode->i_mode)) {
+@@ -2141,8 +2150,9 @@ static struct swap_info_struct *alloc_swap_info(void)
+ */
+ }
+ INIT_LIST_HEAD(&p->first_swap_extent.list);
++ plist_node_init(&p->list, 0);
++ plist_node_init(&p->avail_list, 0);
+ p->flags = SWP_USED;
+- p->next = -1;
+ spin_unlock(&swap_lock);
+ spin_lock_init(&p->lock);
+
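The swapoff() hunks above also renumber the remaining auto-assigned priorities: when an area with a negative priority is removed, every later (lower-priority) area is bumped up by one so the auto-assigned range stays dense. A standalone sketch of that step, with a plain array standing in for the priority-ordered list and invented values:

#include <stdio.h>

int main(void)
{
        /* prio[] is ordered as the active list would be; -1..-3 were
         * auto-assigned (negative), 5 was set explicitly. */
        int prio[] = { 5, -1, -2, -3 };
        int nareas = 4;
        int removed = 1;                /* swapoff the prio == -1 area */

        for (int i = removed + 1; i < nareas; i++)
                if (prio[i] < 0)
                        prio[i]++;      /* -2 -> -1, -3 -> -2 */

        for (int i = 0; i < nareas; i++)
                if (i != removed)
                        printf("%d ", prio[i]);  /* prints: 5 -1 -2 */
        printf("\n");
        return 0;
}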
+diff --git a/mm/truncate.c b/mm/truncate.c
+index 353b683afd6e..2e84fe59190b 100644
+--- a/mm/truncate.c
++++ b/mm/truncate.c
+@@ -22,6 +22,22 @@
+ #include <linux/cleancache.h>
+ #include "internal.h"
+
++static void clear_exceptional_entry(struct address_space *mapping,
++ pgoff_t index, void *entry)
++{
++ /* Handled by shmem itself */
++ if (shmem_mapping(mapping))
++ return;
++
++ spin_lock_irq(&mapping->tree_lock);
++ /*
++ * Regular page slots are stabilized by the page lock even
++ * without the tree itself locked. These unlocked entries
++ * need verification under the tree lock.
++ */
++ radix_tree_delete_item(&mapping->page_tree, index, entry);
++ spin_unlock_irq(&mapping->tree_lock);
++}
+
+ /**
+ * do_invalidatepage - invalidate part or all of a page
+@@ -208,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
+ unsigned int partial_start; /* inclusive */
+ unsigned int partial_end; /* exclusive */
+ struct pagevec pvec;
++ pgoff_t indices[PAGEVEC_SIZE];
+ pgoff_t index;
+ int i;
+
+@@ -238,17 +255,23 @@ void truncate_inode_pages_range(struct address_space *mapping,
+
+ pagevec_init(&pvec, 0);
+ index = start;
+- while (index < end && pagevec_lookup(&pvec, mapping, index,
+- min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
++ while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
++ min(end - index, (pgoff_t)PAGEVEC_SIZE),
++ indices)) {
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ /* We rely upon deletion not changing page->index */
+- index = page->index;
++ index = indices[i];
+ if (index >= end)
+ break;
+
++ if (radix_tree_exceptional_entry(page)) {
++ clear_exceptional_entry(mapping, index, page);
++ continue;
++ }
++
+ if (!trylock_page(page))
+ continue;
+ WARN_ON(page->index != index);
+@@ -259,6 +282,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
+ truncate_inode_page(mapping, page);
+ unlock_page(page);
+ }
++ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ cond_resched();
+@@ -307,14 +331,16 @@ void truncate_inode_pages_range(struct address_space *mapping,
+ index = start;
+ for ( ; ; ) {
+ cond_resched();
+- if (!pagevec_lookup(&pvec, mapping, index,
+- min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
++ if (!pagevec_lookup_entries(&pvec, mapping, index,
++ min(end - index, (pgoff_t)PAGEVEC_SIZE),
++ indices)) {
+ if (index == start)
+ break;
+ index = start;
+ continue;
+ }
+- if (index == start && pvec.pages[0]->index >= end) {
++ if (index == start && indices[0] >= end) {
++ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ break;
+ }
+@@ -323,16 +349,22 @@ void truncate_inode_pages_range(struct address_space *mapping,
+ struct page *page = pvec.pages[i];
+
+ /* We rely upon deletion not changing page->index */
+- index = page->index;
++ index = indices[i];
+ if (index >= end)
+ break;
+
++ if (radix_tree_exceptional_entry(page)) {
++ clear_exceptional_entry(mapping, index, page);
++ continue;
++ }
++
+ lock_page(page);
+ WARN_ON(page->index != index);
+ wait_on_page_writeback(page);
+ truncate_inode_page(mapping, page);
+ unlock_page(page);
+ }
++ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ index++;
+@@ -375,6 +407,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
+ unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+ {
++ pgoff_t indices[PAGEVEC_SIZE];
+ struct pagevec pvec;
+ pgoff_t index = start;
+ unsigned long ret;
+@@ -390,17 +423,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ */
+
+ pagevec_init(&pvec, 0);
+- while (index <= end && pagevec_lookup(&pvec, mapping, index,
+- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
++ while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
++ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
++ indices)) {
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ /* We rely upon deletion not changing page->index */
+- index = page->index;
++ index = indices[i];
+ if (index > end)
+ break;
+
++ if (radix_tree_exceptional_entry(page)) {
++ clear_exceptional_entry(mapping, index, page);
++ continue;
++ }
++
+ if (!trylock_page(page))
+ continue;
+ WARN_ON(page->index != index);
+@@ -414,6 +453,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ deactivate_page(page);
+ count += ret;
+ }
++ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ cond_resched();
+@@ -481,6 +521,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
+ int invalidate_inode_pages2_range(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+ {
++ pgoff_t indices[PAGEVEC_SIZE];
+ struct pagevec pvec;
+ pgoff_t index;
+ int i;
+@@ -491,17 +532,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
+ cleancache_invalidate_inode(mapping);
+ pagevec_init(&pvec, 0);
+ index = start;
+- while (index <= end && pagevec_lookup(&pvec, mapping, index,
+- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
++ while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
++ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
++ indices)) {
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ /* We rely upon deletion not changing page->index */
+- index = page->index;
++ index = indices[i];
+ if (index > end)
+ break;
+
++ if (radix_tree_exceptional_entry(page)) {
++ clear_exceptional_entry(mapping, index, page);
++ continue;
++ }
++
+ lock_page(page);
+ WARN_ON(page->index != index);
+ if (page->mapping != mapping) {
+@@ -539,6 +586,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
+ ret = ret2;
+ unlock_page(page);
+ }
++ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ cond_resched();
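Each of the truncate/invalidate loops above now looks up raw slot entries and skips radix-tree exceptional entries, i.e. shadow values distinguished from real struct page pointers by a tag in the low bits of the slot. A hedged userspace sketch of that pointer-tagging idea; the bit position and helper names are assumptions, not the kernel's exact encoding:

#include <stdint.h>
#include <stdio.h>

#define EXCEPTIONAL_BIT 0x2UL           /* assumed tag bit for the demo */

static int is_exceptional(const void *entry)
{
        return ((uintptr_t)entry & EXCEPTIONAL_BIT) != 0;
}

static void *make_exceptional(uintptr_t payload)
{
        return (void *)((payload << 2) | EXCEPTIONAL_BIT);
}

int main(void)
{
        int page;                       /* stand-in for a struct page */
        void *slots[] = { &page, make_exceptional(42) };

        for (int i = 0; i < 2; i++) {
                if (is_exceptional(slots[i]))
                        printf("slot %d: shadow entry, clear under tree lock\n", i);
                else
                        printf("slot %d: real page, lock and truncate it\n", i);
        }
        return 0;
}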
+diff --git a/mm/vmacache.c b/mm/vmacache.c
+new file mode 100644
+index 000000000000..1037a3bab505
+--- /dev/null
++++ b/mm/vmacache.c
+@@ -0,0 +1,114 @@
++/*
++ * Copyright (C) 2014 Davidlohr Bueso.
++ */
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/vmacache.h>
++
++/*
++ * Flush vma caches for threads that share a given mm.
++ *
++ * The operation is safe because the caller holds the mmap_sem
++ * exclusively and other threads accessing the vma cache will
++ * have mmap_sem held at least for read, so no extra locking
++ * is required to maintain the vma cache.
++ */
++void vmacache_flush_all(struct mm_struct *mm)
++{
++ struct task_struct *g, *p;
++
++ rcu_read_lock();
++ for_each_process_thread(g, p) {
++ /*
++ * Only flush the vmacache pointers as the
++ * mm seqnum is already set and curr's will
++ * be set upon invalidation when the next
++ * lookup is done.
++ */
++ if (mm == p->mm)
++ vmacache_flush(p);
++ }
++ rcu_read_unlock();
++}
++
++/*
++ * This task may be accessing a foreign mm via (for example)
++ * get_user_pages()->find_vma(). The vmacache is task-local and this
++ * task's vmacache pertains to a different mm (ie, its own). There is
++ * nothing we can do here.
++ *
++ * Also handle the case where a kernel thread has adopted this mm via use_mm().
++ * That kernel thread's vmacache is not applicable to this mm.
++ */
++static bool vmacache_valid_mm(struct mm_struct *mm)
++{
++ return current->mm == mm && !(current->flags & PF_KTHREAD);
++}
++
++void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
++{
++ if (vmacache_valid_mm(newvma->vm_mm))
++ current->vmacache[VMACACHE_HASH(addr)] = newvma;
++}
++
++static bool vmacache_valid(struct mm_struct *mm)
++{
++ struct task_struct *curr;
++
++ if (!vmacache_valid_mm(mm))
++ return false;
++
++ curr = current;
++ if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
++ /*
++ * First attempt will always be invalid, initialize
++ * the new cache for this task here.
++ */
++ curr->vmacache_seqnum = mm->vmacache_seqnum;
++ vmacache_flush(curr);
++ return false;
++ }
++ return true;
++}
++
++struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
++{
++ int i;
++
++ if (!vmacache_valid(mm))
++ return NULL;
++
++ for (i = 0; i < VMACACHE_SIZE; i++) {
++ struct vm_area_struct *vma = current->vmacache[i];
++
++ if (!vma)
++ continue;
++ if (WARN_ON_ONCE(vma->vm_mm != mm))
++ break;
++ if (vma->vm_start <= addr && vma->vm_end > addr)
++ return vma;
++ }
++
++ return NULL;
++}
++
++#ifndef CONFIG_MMU
++struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
++ unsigned long start,
++ unsigned long end)
++{
++ int i;
++
++ if (!vmacache_valid(mm))
++ return NULL;
++
++ for (i = 0; i < VMACACHE_SIZE; i++) {
++ struct vm_area_struct *vma = current->vmacache[i];
++
++ if (vma && vma->vm_start == start && vma->vm_end == end)
++ return vma;
++ }
++
++ return NULL;
++}
++#endif
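The new mm/vmacache.c keeps a small, direct-mapped per-task cache of recently used VMAs, indexed by a hash of the faulting address and validated against a per-mm sequence number; a mismatch flushes the whole cache. A self-contained userspace analogue of that lookup/update/invalidate cycle, with made-up sizes, hash, and type names:

#include <stdio.h>
#include <string.h>

#define CACHE_SIZE 4
#define CACHE_HASH(addr) (((addr) >> 12) & (CACHE_SIZE - 1))

struct range { unsigned long start, end; };     /* stand-in for a vma */
struct mm    { unsigned seq; };                 /* bumped on invalidation */
struct task  {
        unsigned seq;                           /* last mm->seq we saw */
        struct range *cache[CACHE_SIZE];
};

static struct range *cache_find(struct task *t, struct mm *mm,
                                unsigned long addr)
{
        if (t->seq != mm->seq) {                /* stale: flush and miss */
                t->seq = mm->seq;
                memset(t->cache, 0, sizeof(t->cache));
                return NULL;
        }
        struct range *r = t->cache[CACHE_HASH(addr)];

        return (r && r->start <= addr && addr < r->end) ? r : NULL;
}

static void cache_update(struct task *t, unsigned long addr, struct range *r)
{
        t->cache[CACHE_HASH(addr)] = r;
}

int main(void)
{
        struct mm mm = { .seq = 1 };
        struct task task = { 0 };
        struct range vma = { 0x400000, 0x600000 };

        printf("%p\n", (void *)cache_find(&task, &mm, 0x401000)); /* miss */
        cache_update(&task, 0x401000, &vma);
        printf("%p\n", (void *)cache_find(&task, &mm, 0x401000)); /* hit  */
        mm.seq++;                               /* like vmacache_invalidate() */
        printf("%p\n", (void *)cache_find(&task, &mm, 0x401000)); /* miss */
        return 0;
}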
+diff --git a/mm/vmalloc.c b/mm/vmalloc.c
+index e2be0f802ccf..060dc366ac44 100644
+--- a/mm/vmalloc.c
++++ b/mm/vmalloc.c
+@@ -2685,14 +2685,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
+
+ prev_end = VMALLOC_START;
+
+- spin_lock(&vmap_area_lock);
++ rcu_read_lock();
+
+ if (list_empty(&vmap_area_list)) {
+ vmi->largest_chunk = VMALLOC_TOTAL;
+ goto out;
+ }
+
+- list_for_each_entry(va, &vmap_area_list, list) {
++ list_for_each_entry_rcu(va, &vmap_area_list, list) {
+ unsigned long addr = va->va_start;
+
+ /*
+@@ -2719,7 +2719,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
+ vmi->largest_chunk = VMALLOC_END - prev_end;
+
+ out:
+- spin_unlock(&vmap_area_lock);
++ rcu_read_unlock();
+ }
+ #endif
+
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 5ad29b2925a0..5461d02ea718 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -163,7 +163,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
+
+ bool zone_reclaimable(struct zone *zone)
+ {
+- return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
++ return zone_page_state(zone, NR_PAGES_SCANNED) <
++ zone_reclaimable_pages(zone) * 6;
+ }
+
+ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+@@ -224,15 +225,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ unsigned long freed = 0;
+ unsigned long long delta;
+ long total_scan;
+- long max_pass;
++ long freeable;
+ long nr;
+ long new_nr;
+ int nid = shrinkctl->nid;
+ long batch_size = shrinker->batch ? shrinker->batch
+ : SHRINK_BATCH;
+
+- max_pass = shrinker->count_objects(shrinker, shrinkctl);
+- if (max_pass == 0)
++ freeable = shrinker->count_objects(shrinker, shrinkctl);
++ if (freeable == 0)
+ return 0;
+
+ /*
+@@ -244,14 +245,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+
+ total_scan = nr;
+ delta = (4 * nr_pages_scanned) / shrinker->seeks;
+- delta *= max_pass;
++ delta *= freeable;
+ do_div(delta, lru_pages + 1);
+ total_scan += delta;
+ if (total_scan < 0) {
+ printk(KERN_ERR
+ "shrink_slab: %pF negative objects to delete nr=%ld\n",
+ shrinker->scan_objects, total_scan);
+- total_scan = max_pass;
++ total_scan = freeable;
+ }
+
+ /*
+@@ -260,38 +261,55 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+- * max_pass. This is bad for sustaining a working set in
++ * freeable. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+- if (delta < max_pass / 4)
+- total_scan = min(total_scan, max_pass / 2);
++ if (delta < freeable / 4)
++ total_scan = min(total_scan, freeable / 2);
+
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimated number of
+ * freeable entries.
+ */
+- if (total_scan > max_pass * 2)
+- total_scan = max_pass * 2;
++ if (total_scan > freeable * 2)
++ total_scan = freeable * 2;
+
+ trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+ nr_pages_scanned, lru_pages,
+- max_pass, delta, total_scan);
++ freeable, delta, total_scan);
+
+- while (total_scan >= batch_size) {
++ /*
++ * Normally, we should not scan less than batch_size objects in one
++ * pass to avoid too frequent shrinker calls, but if the slab has less
++ * than batch_size objects in total and we are really tight on memory,
++ * we will try to reclaim all available objects, otherwise we can end
++ * up failing allocations although there are plenty of reclaimable
++ * objects spread over several slabs with usage less than the
++ * batch_size.
++ *
++ * We detect the "tight on memory" situations by looking at the total
++ * number of objects we want to scan (total_scan). If it is greater
++ * than the total number of objects on slab (freeable), we must be
++ * scanning at high prio and therefore should try to reclaim as much as
++ * possible.
++ */
++ while (total_scan >= batch_size ||
++ total_scan >= freeable) {
+ unsigned long ret;
++ unsigned long nr_to_scan = min(batch_size, total_scan);
+
+- shrinkctl->nr_to_scan = batch_size;
++ shrinkctl->nr_to_scan = nr_to_scan;
+ ret = shrinker->scan_objects(shrinker, shrinkctl);
+ if (ret == SHRINK_STOP)
+ break;
+ freed += ret;
+
+- count_vm_events(SLABS_SCANNED, batch_size);
+- total_scan -= batch_size;
++ count_vm_events(SLABS_SCANNED, nr_to_scan);
++ total_scan -= nr_to_scan;
+
+ cond_resched();
+ }
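The reworked loop above still scans in batch_size chunks, but now also allows a final smaller pass whenever total_scan is at least the number of freeable objects, so a slab smaller than one batch can still be emptied when memory is tight. A tiny demo of that control flow with invented numbers:

#include <stdio.h>

int main(void)
{
        long freeable = 40, total_scan = 300, batch_size = 128, freed = 0;

        while (total_scan >= batch_size || total_scan >= freeable) {
                long nr_to_scan = total_scan < batch_size ? total_scan
                                                          : batch_size;

                freed += nr_to_scan;    /* pretend scan_objects() freed them */
                total_scan -= nr_to_scan;
                printf("scanned %ld, %ld left\n", nr_to_scan, total_scan);
        }
        printf("freed %ld in total\n", freed);  /* 128 + 128 + 44 = 300 */
        return 0;
}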
+@@ -352,16 +370,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
+ }
+
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+- for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+- if (!node_online(shrinkctl->nid))
+- continue;
+-
+- if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+- (shrinkctl->nid != 0))
+- break;
+-
++ if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
++ shrinkctl->nid = 0;
+ freed += shrink_slab_node(shrinkctl, shrinker,
+- nr_pages_scanned, lru_pages);
++ nr_pages_scanned, lru_pages);
++ continue;
++ }
++
++ for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
++ if (node_online(shrinkctl->nid))
++ freed += shrink_slab_node(shrinkctl, shrinker,
++ nr_pages_scanned, lru_pages);
+
+ }
+ }
+@@ -1089,7 +1108,7 @@ keep:
+ VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
+ }
+
+- free_hot_cold_page_list(&free_pages, 1);
++ free_hot_cold_page_list(&free_pages, true);
+
+ list_splice(&ret_pages, page_list);
+ count_vm_events(PGACTIVATE, pgactivate);
+@@ -1126,7 +1145,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+ TTU_UNMAP|TTU_IGNORE_ACCESS,
+ &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
+ list_splice(&clean_pages, page_list);
+- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
++ mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+ return ret;
+ }
+
+@@ -1452,7 +1471,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
+ if (global_reclaim(sc)) {
+- zone->pages_scanned += nr_scanned;
++ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+ if (current_is_kswapd())
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
+ else
+@@ -1487,7 +1506,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
+
+ spin_unlock_irq(&zone->lru_lock);
+
+- free_hot_cold_page_list(&page_list, 1);
++ free_hot_cold_page_list(&page_list, true);
+
+ /*
+ * If reclaim is isolating dirty pages under writeback, it implies
+@@ -1641,7 +1660,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
+ nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+ &nr_scanned, sc, isolate_mode, lru);
+ if (global_reclaim(sc))
+- zone->pages_scanned += nr_scanned;
++ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+
+ reclaim_stat->recent_scanned[file] += nr_taken;
+
+@@ -1707,7 +1726,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
+ spin_unlock_irq(&zone->lru_lock);
+
+- free_hot_cold_page_list(&l_hold, 1);
++ free_hot_cold_page_list(&l_hold, true);
+ }
+
+ #ifdef CONFIG_SWAP
+@@ -1829,7 +1848,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ struct zone *zone = lruvec_zone(lruvec);
+ unsigned long anon_prio, file_prio;
+ enum scan_balance scan_balance;
+- unsigned long anon, file, free;
++ unsigned long anon, file;
+ bool force_scan = false;
+ unsigned long ap, fp;
+ enum lru_list lru;
+@@ -1877,11 +1896,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ goto out;
+ }
+
+- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+- get_lru_size(lruvec, LRU_INACTIVE_ANON);
+- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+- get_lru_size(lruvec, LRU_INACTIVE_FILE);
+-
+ /*
+ * If it's foreseeable that reclaiming the file cache won't be
+ * enough to get the zone back into a desirable shape, we have
+@@ -1889,8 +1903,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ * thrashing - remaining file pages alone.
+ */
+ if (global_reclaim(sc)) {
+- free = zone_page_state(zone, NR_FREE_PAGES);
+- if (unlikely(file + free <= high_wmark_pages(zone))) {
++ unsigned long zonefile;
++ unsigned long zonefree;
++
++ zonefree = zone_page_state(zone, NR_FREE_PAGES);
++ zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
++ zone_page_state(zone, NR_INACTIVE_FILE);
++
++ if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
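The zonefile/zonefree check above forces anonymous scanning when free pages plus the remaining file cache can no longer clear the zone's high watermark, i.e. when reclaiming cache alone cannot restore the zone. A trivial sketch of that decision with made-up page counts:

#include <stdio.h>

int main(void)
{
        unsigned long zonefree = 1200;          /* like NR_FREE_PAGES */
        unsigned long zonefile = 800;           /* active + inactive file */
        unsigned long high_wmark = 2500;        /* zone high watermark */

        if (zonefile + zonefree <= high_wmark)
                printf("SCAN_ANON: file cache alone cannot help\n");
        else
                printf("keep balancing file and anon scanning\n");
        return 0;
}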
+@@ -1925,6 +1945,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ *
+ * anon in [0], file in [1]
+ */
++
++ anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
++ get_lru_size(lruvec, LRU_INACTIVE_ANON);
++ file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
++ get_lru_size(lruvec, LRU_INACTIVE_FILE);
++
+ spin_lock_irq(&zone->lru_lock);
+ if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
+ reclaim_stat->recent_scanned[0] /= 2;
+@@ -2000,13 +2026,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ unsigned long nr_reclaimed = 0;
+ unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ struct blk_plug plug;
+- bool scan_adjusted = false;
++ bool scan_adjusted;
+
+ get_scan_count(lruvec, sc, nr);
+
+ /* Record the original scan target for proportional adjustments later */
+ memcpy(targets, nr, sizeof(nr));
+
++ /*
++ * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
++ * event that can occur when there is little memory pressure e.g.
++ * multiple streaming readers/writers. Hence, we do not abort scanning
++ * when the requested number of pages are reclaimed when scanning at
++ * DEF_PRIORITY on the assumption that the fact we are direct
++ * reclaiming implies that kswapd is not keeping up and it is best to
++ * do a batch of work at once. For memcg reclaim one check is made to
++ * abort proportional reclaim if either the file or anon lru has already
++ * dropped to zero at the first pass.
++ */
++ scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
++ sc->priority == DEF_PRIORITY);
++
+ blk_start_plug(&plug);
+ while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+ nr[LRU_INACTIVE_FILE]) {
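The scan_adjusted pre-set explained in the comment above means global direct reclaim at DEF_PRIORITY keeps scanning the LRUs proportionally rather than stopping as soon as nr_to_reclaim pages are freed; kswapd, memcg reclaim and higher-pressure priorities keep the early exit. A small stand-alone illustration of that condition, with simplified flags:

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY 12

int main(void)
{
        bool global_reclaim = true;     /* not memcg-limited reclaim */
        bool is_kswapd = false;         /* we are a direct reclaimer */
        int priority = DEF_PRIORITY;    /* little pressure seen so far */

        bool scan_adjusted = global_reclaim && !is_kswapd &&
                             priority == DEF_PRIORITY;

        printf("%s\n", scan_adjusted
               ? "keep scanning proportionally past nr_to_reclaim"
               : "may stop once nr_to_reclaim pages are reclaimed");
        return 0;
}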
+@@ -2027,17 +2067,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ continue;
+
+ /*
+- * For global direct reclaim, reclaim only the number of pages
+- * requested. Less care is taken to scan proportionally as it
+- * is more important to minimise direct reclaim stall latency
+- * than it is to properly age the LRU lists.
+- */
+- if (global_reclaim(sc) && !current_is_kswapd())
+- break;
+-
+- /*
+ * For kswapd and memcg, reclaim at least the number of pages
+- * requested. Ensure that the anon and file LRUs shrink
++ * requested. Ensure that the anon and file LRUs are scanned
+ * proportionally what was requested by get_scan_count(). We
+ * stop reclaiming one LRU and reduce the amount scanning
+ * proportional to the original scan target.
+@@ -2045,6 +2076,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
+ nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+
++ /*
++ * It's just vindictive to attack the larger once the smaller
++ * has gone to zero. And given the way we stop scanning the
++ * smaller below, this makes sure that we only make one nudge
++ * towards proportionality once we've got nr_to_reclaim.
++ */
++ if (!nr_file || !nr_anon)
++ break;
++
+ if (nr_file > nr_anon) {
+ unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
+ targets[LRU_ACTIVE_ANON] + 1;
+@@ -2406,8 +2446,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
+ unsigned long lru_pages = 0;
+
+ nodes_clear(shrink->nodes_to_scan);
+- for_each_zone_zonelist(zone, z, zonelist,
+- gfp_zone(sc->gfp_mask)) {
++ for_each_zone_zonelist_nodemask(zone, z, zonelist,
++ gfp_zone(sc->gfp_mask), sc->nodemask) {
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 5a442a723d79..f7ca04482299 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+ continue;
+
+ threshold = (*calculate_pressure)(zone);
+- for_each_possible_cpu(cpu)
++ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+@@ -761,6 +761,7 @@ const char * const vmstat_text[] = {
+ "nr_shmem",
+ "nr_dirtied",
+ "nr_written",
++ "nr_pages_scanned",
+
+ #ifdef CONFIG_NUMA
+ "numa_hit",
+@@ -851,12 +852,14 @@ const char * const vmstat_text[] = {
+ "thp_zero_page_alloc",
+ "thp_zero_page_alloc_failed",
+ #endif
++#ifdef CONFIG_DEBUG_TLBFLUSH
+ #ifdef CONFIG_SMP
+ "nr_tlb_remote_flush",
+ "nr_tlb_remote_flush_received",
+-#endif
++#endif /* CONFIG_SMP */
+ "nr_tlb_local_flush_all",
+ "nr_tlb_local_flush_one",
++#endif /* CONFIG_DEBUG_TLBFLUSH */
+
+ #endif /* CONFIG_VM_EVENTS_COUNTERS */
+ };
+@@ -1053,7 +1056,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
+ min_wmark_pages(zone),
+ low_wmark_pages(zone),
+ high_wmark_pages(zone),
+- zone->pages_scanned,
++ zone_page_state(zone, NR_PAGES_SCANNED),
+ zone->spanned_pages,
+ zone->present_pages,
+ zone->managed_pages);
+@@ -1063,10 +1066,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
+ zone_page_state(zone, i));
+
+ seq_printf(m,
+- "\n protection: (%lu",
++ "\n protection: (%ld",
+ zone->lowmem_reserve[0]);
+ for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+- seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
++ seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
+ seq_printf(m,
+ ")"
+ "\n pagesets");