From 849369d6c66d3054688672f97d31fceb8e8230fb Mon Sep 17 00:00:00 2001 From: root Date: Fri, 25 Dec 2015 04:40:36 +0000 Subject: initial_commit --- Documentation/vm/.gitignore | 2 + Documentation/vm/00-INDEX | 38 ++ Documentation/vm/Makefile | 8 + Documentation/vm/active_mm.txt | 83 +++ Documentation/vm/balance | 93 +++ Documentation/vm/cleancache.txt | 278 ++++++++ Documentation/vm/highmem.txt | 162 +++++ Documentation/vm/hugepage-mmap.c | 91 +++ Documentation/vm/hugepage-shm.c | 98 +++ Documentation/vm/hugetlbpage.txt | 309 +++++++++ Documentation/vm/hwpoison.txt | 182 +++++ Documentation/vm/ksm.txt | 82 +++ Documentation/vm/locking | 130 ++++ Documentation/vm/map_hugetlb.c | 77 +++ Documentation/vm/numa | 149 +++++ Documentation/vm/numa_memory_policy.txt | 453 +++++++++++++ Documentation/vm/overcommit-accounting | 73 ++ Documentation/vm/page-types.c | 1100 +++++++++++++++++++++++++++++++ Documentation/vm/page_migration | 149 +++++ Documentation/vm/pagemap.txt | 147 +++++ Documentation/vm/slub.txt | 280 ++++++++ Documentation/vm/transhuge.txt | 298 +++++++++ Documentation/vm/unevictable-lru.txt | 690 +++++++++++++++++++ 23 files changed, 4972 insertions(+) create mode 100644 Documentation/vm/.gitignore create mode 100644 Documentation/vm/00-INDEX create mode 100644 Documentation/vm/Makefile create mode 100644 Documentation/vm/active_mm.txt create mode 100644 Documentation/vm/balance create mode 100644 Documentation/vm/cleancache.txt create mode 100644 Documentation/vm/highmem.txt create mode 100644 Documentation/vm/hugepage-mmap.c create mode 100644 Documentation/vm/hugepage-shm.c create mode 100644 Documentation/vm/hugetlbpage.txt create mode 100644 Documentation/vm/hwpoison.txt create mode 100644 Documentation/vm/ksm.txt create mode 100644 Documentation/vm/locking create mode 100644 Documentation/vm/map_hugetlb.c create mode 100644 Documentation/vm/numa create mode 100644 Documentation/vm/numa_memory_policy.txt create mode 100644 Documentation/vm/overcommit-accounting create mode 100644 Documentation/vm/page-types.c create mode 100644 Documentation/vm/page_migration create mode 100644 Documentation/vm/pagemap.txt create mode 100644 Documentation/vm/slub.txt create mode 100644 Documentation/vm/transhuge.txt create mode 100644 Documentation/vm/unevictable-lru.txt (limited to 'Documentation/vm') diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore new file mode 100644 index 00000000..09b164a5 --- /dev/null +++ b/Documentation/vm/.gitignore @@ -0,0 +1,2 @@ +page-types +slabinfo diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX new file mode 100644 index 00000000..dca82d7c --- /dev/null +++ b/Documentation/vm/00-INDEX @@ -0,0 +1,38 @@ +00-INDEX + - this file. +active_mm.txt + - An explanation from Linus about tsk->active_mm vs tsk->mm. +balance + - various information on memory balancing. +hugepage-mmap.c + - Example app using huge page memory with the mmap system call. +hugepage-shm.c + - Example app using huge page memory with Sys V shared memory system calls. +hugetlbpage.txt + - a brief summary of hugetlbpage support in the Linux kernel. +hwpoison.txt + - explains what hwpoison is +ksm.txt + - how to use the Kernel Samepage Merging feature. +locking + - info on how locking and synchronization is done in the Linux vm code. +map_hugetlb.c + - an example program that uses the MAP_HUGETLB mmap flag. +numa + - information about NUMA specific code in the Linux vm. 
+numa_memory_policy.txt + - documentation of concepts and APIs of the 2.6 memory policy support. +overcommit-accounting + - description of the Linux kernels overcommit handling modes. +page-types.c + - Tool for querying page flags +page_migration + - description of page migration in NUMA systems. +pagemap.txt + - pagemap, from the userspace perspective +slabinfo.c + - source code for a tool to get reports about slabs. +slub.txt + - a short users guide for SLUB. +unevictable-lru.txt + - Unevictable LRU infrastructure diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile new file mode 100644 index 00000000..3fa4d066 --- /dev/null +++ b/Documentation/vm/Makefile @@ -0,0 +1,8 @@ +# kbuild trick to avoid linker error. Can be omitted if a module is built. +obj- := dummy.o + +# List of programs to build +hostprogs-y := page-types hugepage-mmap hugepage-shm map_hugetlb + +# Tell kbuild to always build the programs +always := $(hostprogs-y) diff --git a/Documentation/vm/active_mm.txt b/Documentation/vm/active_mm.txt new file mode 100644 index 00000000..dbf45817 --- /dev/null +++ b/Documentation/vm/active_mm.txt @@ -0,0 +1,83 @@ +List: linux-kernel +Subject: Re: active_mm +From: Linus Torvalds +Date: 1999-07-30 21:36:24 + +Cc'd to linux-kernel, because I don't write explanations all that often, +and when I do I feel better about more people reading them. + +On Fri, 30 Jul 1999, David Mosberger wrote: +> +> Is there a brief description someplace on how "mm" vs. "active_mm" in +> the task_struct are supposed to be used? (My apologies if this was +> discussed on the mailing lists---I just returned from vacation and +> wasn't able to follow linux-kernel for a while). + +Basically, the new setup is: + + - we have "real address spaces" and "anonymous address spaces". The + difference is that an anonymous address space doesn't care about the + user-level page tables at all, so when we do a context switch into an + anonymous address space we just leave the previous address space + active. + + The obvious use for a "anonymous address space" is any thread that + doesn't need any user mappings - all kernel threads basically fall into + this category, but even "real" threads can temporarily say that for + some amount of time they are not going to be interested in user space, + and that the scheduler might as well try to avoid wasting time on + switching the VM state around. Currently only the old-style bdflush + sync does that. + + - "tsk->mm" points to the "real address space". For an anonymous process, + tsk->mm will be NULL, for the logical reason that an anonymous process + really doesn't _have_ a real address space at all. + + - however, we obviously need to keep track of which address space we + "stole" for such an anonymous user. For that, we have "tsk->active_mm", + which shows what the currently active address space is. + + The rule is that for a process with a real address space (ie tsk->mm is + non-NULL) the active_mm obviously always has to be the same as the real + one. + + For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the + "borrowed" mm while the anonymous process is running. When the + anonymous process gets scheduled away, the borrowed address space is + returned and cleared. + +To support all that, the "struct mm_struct" now has two counters: a +"mm_users" counter that is how many "real address space users" there are, +and a "mm_count" counter that is the number of "lazy" users (ie anonymous +users) plus one if there are any real users. 
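+
+As a purely illustrative sketch of the borrowing described above (the helper
+name context_switch_mm() is invented for this example; this is not the
+actual scheduler code):
+
+	static void context_switch_mm(struct task_struct *prev,
+				      struct task_struct *next)
+	{
+		struct mm_struct *oldmm = prev->active_mm;
+
+		if (!next->mm) {
+			/* anonymous thread: borrow whatever was active */
+			next->active_mm = oldmm;
+			atomic_inc(&oldmm->mm_count);	/* one more lazy user */
+		} else {
+			/* real address space: make it the active one */
+			next->active_mm = next->mm;
+			switch_mm(oldmm, next->mm, next);
+		}
+
+		if (!prev->mm) {
+			/* the previous thread was anonymous: hand back the
+			 * borrowed mm and drop the lazy reference */
+			prev->active_mm = NULL;
+			mmdrop(oldmm);
+		}
+	}
+
+In the real kernel the final mmdrop() is deferred until the switch has
+completed, but the counting works out the same.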
+ +Usually there is at least one real user, but it could be that the real +user exited on another CPU while a lazy user was still active, so you do +actually get cases where you have a address space that is _only_ used by +lazy users. That is often a short-lived state, because once that thread +gets scheduled away in favour of a real thread, the "zombie" mm gets +released because "mm_users" becomes zero. + +Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any +more. "init_mm" should be considered just a "lazy context when no other +context is available", and in fact it is mainly used just at bootup when +no real VM has yet been created. So code that used to check + + if (current->mm == &init_mm) + +should generally just do + + if (!current->mm) + +instead (which makes more sense anyway - the test is basically one of "do +we have a user context", and is generally done by the page fault handler +and things like that). + +Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, +because it slightly changes the interfaces to accommodate the alpha (who +would have thought it, but the alpha actually ends up having one of the +ugliest context switch codes - unlike the other architectures where the MM +and register state is separate, the alpha PALcode joins the two, and you +need to switch both together). + +(From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/vm/balance b/Documentation/vm/balance new file mode 100644 index 00000000..c46e68cf --- /dev/null +++ b/Documentation/vm/balance @@ -0,0 +1,93 @@ +Started Jan 2000 by Kanoj Sarcar + +Memory balancing is needed for non __GFP_WAIT as well as for non +__GFP_IO allocations. + +There are two reasons to be requesting non __GFP_WAIT allocations: +the caller can not sleep (typically intr context), or does not want +to incur cost overheads of page stealing and possible swap io for +whatever reasons. + +__GFP_IO allocation requests are made to prevent file system deadlocks. + +In the absence of non sleepable allocation requests, it seems detrimental +to be doing balancing. Page reclamation can be kicked off lazily, that +is, only when needed (aka zone free memory is 0), instead of making it +a proactive process. + +That being said, the kernel should try to fulfill requests for direct +mapped pages from the direct mapped pool, instead of falling back on +the dma pool, so as to keep the dma pool filled for dma requests (atomic +or not). A similar argument applies to highmem and direct mapped pages. +OTOH, if there is a lot of free dma pages, it is preferable to satisfy +regular memory requests by allocating one from the dma pool, instead +of incurring the overhead of regular zone balancing. + +In 2.2, memory balancing/page reclamation would kick off only when the +_total_ number of free pages fell below 1/64 th of total memory. With the +right ratio of dma and regular memory, it is quite possible that balancing +would not be done even when the dma zone was completely empty. 2.2 has +been running production machines of varying memory sizes, and seems to be +doing fine even with the presence of this problem. In 2.3, due to +HIGHMEM, this problem is aggravated. + +In 2.3, zone balancing can be done in one of two ways: depending on the +zone size (and possibly of the size of lower class zones), we can decide +at init time how many free pages we should aim for while balancing any +zone. 
The good part is, while balancing, we do not need to look at sizes +of lower class zones, the bad part is, we might do too frequent balancing +due to ignoring possibly lower usage in the lower class zones. Also, +with a slight change in the allocation routine, it is possible to reduce +the memclass() macro to be a simple equality. + +Another possible solution is that we balance only when the free memory +of a zone _and_ all its lower class zones falls below 1/64th of the +total memory in the zone and its lower class zones. This fixes the 2.2 +balancing problem, and stays as close to 2.2 behavior as possible. Also, +the balancing algorithm works the same way on the various architectures, +which have different numbers and types of zones. If we wanted to get +fancy, we could assign different weights to free pages in different +zones in the future. + +Note that if the size of the regular zone is huge compared to dma zone, +it becomes less significant to consider the free dma pages while +deciding whether to balance the regular zone. The first solution +becomes more attractive then. + +The appended patch implements the second solution. It also "fixes" two +problems: first, kswapd is woken up as in 2.2 on low memory conditions +for non-sleepable allocations. Second, the HIGHMEM zone is also balanced, +so as to give a fighting chance for replace_with_highmem() to get a +HIGHMEM page, as well as to ensure that HIGHMEM allocations do not +fall back into regular zone. This also makes sure that HIGHMEM pages +are not leaked (for example, in situations where a HIGHMEM page is in +the swapcache but is not being used by anyone) + +kswapd also needs to know about the zones it should balance. kswapd is +primarily needed in a situation where balancing can not be done, +probably because all allocation requests are coming from intr context +and all process contexts are sleeping. For 2.3, kswapd does not really +need to balance the highmem zone, since intr context does not request +highmem pages. kswapd looks at the zone_wake_kswapd field in the zone +structure to decide whether a zone needs balancing. + +Page stealing from process memory and shm is done if stealing the page would +alleviate memory pressure on any zone in the page's node that has fallen below +its watermark. + +watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These +are per-zone fields, used to determine when a zone needs to be balanced. When +the number of pages falls below watermark[WMARK_MIN], the hysteric field +low_on_memory gets set. This stays set till the number of free pages becomes +watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will +try to free some pages in the zone (providing GFP_WAIT is set in the request). +Orthogonal to this, is the decision to poke kswapd to free some zone pages. +That decision is not hysteresis based, and is done when the number of free +pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. + + +(Good) Ideas that I have heard: +1. Dynamic experience should influence balancing: number of failed requests +for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net) +2. Implement a replace_with_highmem()-like replace_with_regular() to preserve +dma pages. 
(lkd@tantalophile.demon.co.uk) diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt new file mode 100644 index 00000000..36c367c7 --- /dev/null +++ b/Documentation/vm/cleancache.txt @@ -0,0 +1,278 @@ +MOTIVATION + +Cleancache is a new optional feature provided by the VFS layer that +potentially dramatically increases page cache effectiveness for +many workloads in many environments at a negligible cost. + +Cleancache can be thought of as a page-granularity victim cache for clean +pages that the kernel's pageframe replacement algorithm (PFRA) would like +to keep around, but can't since there isn't enough memory. So when the +PFRA "evicts" a page, it first attempts to use cleancache code to +put the data contained in that page into "transcendent memory", memory +that is not directly accessible or addressable by the kernel and is +of unknown and possibly time-varying size. + +Later, when a cleancache-enabled filesystem wishes to access a page +in a file on disk, it first checks cleancache to see if it already +contains it; if it does, the page of data is copied into the kernel +and a disk access is avoided. + +Transcendent memory "drivers" for cleancache are currently implemented +in Xen (using hypervisor memory) and zcache (using in-kernel compressed +memory) and other implementations are in development. + +FAQs are included below. + +IMPLEMENTATION OVERVIEW + +A cleancache "backend" that provides transcendent memory registers itself +to the kernel's cleancache "frontend" by calling cleancache_register_ops, +passing a pointer to a cleancache_ops structure with funcs set appropriately. +Note that cleancache_register_ops returns the previous settings so that +chaining can be performed if desired. The functions provided must conform to +certain semantics as follows: + +Most important, cleancache is "ephemeral". Pages which are copied into +cleancache have an indefinite lifetime which is completely unknowable +by the kernel and so may or may not still be in cleancache at any later time. +Thus, as its name implies, cleancache is not suitable for dirty pages. +Cleancache has complete discretion over what pages to preserve and what +pages to discard and when. + +Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a +pool id which, if positive, must be saved in the filesystem's superblock; +a negative return value indicates failure. A "put_page" will copy a +(presumably about-to-be-evicted) page into cleancache and associate it with +the pool id, a file key, and a page index into the file. (The combination +of a pool id, a file key, and an index is sometimes called a "handle".) +A "get_page" will copy the page, if found, from cleancache into kernel memory. +A "flush_page" will ensure the page no longer is present in cleancache; +a "flush_inode" will flush all pages associated with the specified file; +and, when a filesystem is unmounted, a "flush_fs" will flush all pages in +all files specified by the given pool id and also surrender the pool id. + +An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache +to treat the pool as shared using a 128-bit UUID as a key. On systems +that may run multiple kernels (such as hard partitioned or virtualized +systems) that may share a clustered filesystem, and where cleancache +may be shared among those kernels, calls to init_shared_fs that specify the +same UUID will receive the same pool id, thus allowing the pages to +be shared. 
Note that any security requirements must be imposed outside +of the kernel (e.g. by "tools" that control cleancache). Or a +cleancache implementation can simply disable shared_init by always +returning a negative value. + +If a get_page is successful on a non-shared pool, the page is flushed (thus +making cleancache an "exclusive" cache). On a shared pool, the page +is NOT flushed on a successful get_page so that it remains accessible to +other sharers. The kernel is responsible for ensuring coherency between +cleancache (shared or not), the page cache, and the filesystem, using +cleancache flush operations as required. + +Note that cleancache must enforce put-put-get coherency and get-get +coherency. For the former, if two puts are made to the same handle but +with different data, say AAA by the first put and BBB by the second, a +subsequent get can never return the stale data (AAA). For get-get coherency, +if a get for a given handle fails, subsequent gets for that handle will +never succeed unless preceded by a successful put with that handle. + +Last, cleancache provides no SMP serialization guarantees; if two +different Linux threads are simultaneously putting and flushing a page +with the same handle, the results are indeterminate. Callers must +lock the page to ensure serial behavior. + +CLEANCACHE PERFORMANCE METRICS + +Cleancache monitoring is done by sysfs files in the +/sys/kernel/mm/cleancache directory. The effectiveness of cleancache +can be measured (across all filesystems) with: + +succ_gets - number of gets that were successful +failed_gets - number of gets that failed +puts - number of puts attempted (all "succeed") +flushes - number of flushes attempted + +A backend implementatation may provide additional metrics. + +FAQ + +1) Where's the value? (Andrew Morton) + +Cleancache provides a significant performance benefit to many workloads +in many environments with negligible overhead by improving the +effectiveness of the pagecache. Clean pagecache pages are +saved in transcendent memory (RAM that is otherwise not directly +addressable to the kernel); fetching those pages later avoids "refaults" +and thus disk reads. + +Cleancache (and its sister code "frontswap") provide interfaces for +this transcendent memory (aka "tmem"), which conceptually lies between +fast kernel-directly-addressable RAM and slower DMA/asynchronous devices. +Disallowing direct kernel or userland reads/writes to tmem +is ideal when data is transformed to a different form and size (such +as with compression) or secretly moved (as might be useful for write- +balancing for some RAM-like devices). Evicted page-cache pages (and +swap pages) are a great use for this kind of slower-than-RAM-but-much- +faster-than-disk transcendent memory, and the cleancache (and frontswap) +"page-object-oriented" specification provides a nice way to read and +write -- and indirectly "name" -- the pages. + +In the virtual case, the whole point of virtualization is to statistically +multiplex physical resources across the varying demands of multiple +virtual machines. This is really hard to do with RAM and efforts to +do it well with no kernel change have essentially failed (except in some +well-publicized special-case workloads). Cleancache -- and frontswap -- +with a fairly small impact on the kernel, provide a huge amount +of flexibility for more dynamic, flexible RAM multiplexing. 
+Specifically, the Xen Transcendent Memory backend allows otherwise +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple +virtual machines, but the pages can be compressed and deduplicated to +optimize RAM utilization. And when guest OS's are induced to surrender +underutilized RAM (e.g. with "self-ballooning"), page cache pages +are the first to go, and cleancache allows those pages to be +saved and reclaimed if overall host system memory conditions allow. + +And the identical interface used for cleancache can be used in +physical systems as well. The zcache driver acts as a memory-hungry +device that stores pages of data in a compressed state. And +the proposed "RAMster" driver shares RAM across multiple physical +systems. + +2) Why does cleancache have its sticky fingers so deep inside the + filesystems and VFS? (Andrew Morton and Christoph Hellwig) + +The core hooks for cleancache in VFS are in most cases a single line +and the minimum set are placed precisely where needed to maintain +coherency (via cleancache_flush operations) between cleancache, +the page cache, and disk. All hooks compile into nothingness if +cleancache is config'ed off and turn into a function-pointer- +compare-to-NULL if config'ed on but no backend claims the ops +functions, or to a compare-struct-element-to-negative if a +backend claims the ops functions but a filesystem doesn't enable +cleancache. + +Some filesystems are built entirely on top of VFS and the hooks +in VFS are sufficient, so don't require an "init_fs" hook; the +initial implementation of cleancache didn't provide this hook. +But for some filesystems (such as btrfs), the VFS hooks are +incomplete and one or more hooks in fs-specific code are required. +And for some other filesystems, such as tmpfs, cleancache may +be counterproductive. So it seemed prudent to require a filesystem +to "opt in" to use cleancache, which requires adding a hook in +each filesystem. Not all filesystems are supported by cleancache +only because they haven't been tested. The existing set should +be sufficient to validate the concept, the opt-in approach means +that untested filesystems are not affected, and the hooks in the +existing filesystems should make it very easy to add more +filesystems in the future. + +The total impact of the hooks to existing fs and mm files is only +about 40 lines added (not counting comments and blank lines). + +3) Why not make cleancache asynchronous and batched so it can + more easily interface with real devices with DMA instead + of copying each individual page? (Minchan Kim) + +The one-page-at-a-time copy semantics simplifies the implementation +on both the frontend and backend and also allows the backend to +do fancy things on-the-fly like page compression and +page deduplication. And since the data is "gone" (copied into/out +of the pageframe) before the cleancache get/put call returns, +a great deal of race conditions and potential coherency issues +are avoided. While the interface seems odd for a "real device" +or for real kernel-addressable RAM, it makes perfect sense for +transcendent memory. + +4) Why is non-shared cleancache "exclusive"? And where is the + page "flushed" after a "get"? (Minchan Kim) + +The main reason is to free up space in transcendent memory and +to avoid unnecessary cleancache_flush calls. If you want inclusive, +the page can be "put" immediately following the "get". If +put-after-get for inclusive becomes common, the interface could +be easily extended to add a "get_no_flush" call. 
+ +The flush is done by the cleancache backend implementation. + +5) What's the performance impact? + +Performance analysis has been presented at OLS'09 and LCA'10. +Briefly, performance gains can be significant on most workloads, +especially when memory pressure is high (e.g. when RAM is +overcommitted in a virtual workload); and because the hooks are +invoked primarily in place of or in addition to a disk read/write, +overhead is negligible even in worst case workloads. Basically +cleancache replaces I/O with memory-copy-CPU-overhead; on older +single-core systems with slow memory-copy speeds, cleancache +has little value, but in newer multicore machines, especially +consolidated/virtualized machines, it has great value. + +6) How do I add cleancache support for filesystem X? (Boaz Harrash) + +Filesystems that are well-behaved and conform to certain +restrictions can utilize cleancache simply by making a call to +cleancache_init_fs at mount time. Unusual, misbehaving, or +poorly layered filesystems must either add additional hooks +and/or undergo extensive additional testing... or should just +not enable the optional cleancache. + +Some points for a filesystem to consider: + +- The FS should be block-device-based (e.g. a ram-based FS such + as tmpfs should not enable cleancache) +- To ensure coherency/correctness, the FS must ensure that all + file removal or truncation operations either go through VFS or + add hooks to do the equivalent cleancache "flush" operations +- To ensure coherency/correctness, either inode numbers must + be unique across the lifetime of the on-disk file OR the + FS must provide an "encode_fh" function. +- The FS must call the VFS superblock alloc and deactivate routines + or add hooks to do the equivalent cleancache calls done there. +- To maximize performance, all pages fetched from the FS should + go through the do_mpag_readpage routine or the FS should add + hooks to do the equivalent (cf. btrfs) +- Currently, the FS blocksize must be the same as PAGESIZE. This + is not an architectural restriction, but no backends currently + support anything different. +- A clustered FS should invoke the "shared_init_fs" cleancache + hook to get best performance for some backends. + +7) Why not use the KVA of the inode as the key? (Christoph Hellwig) + +If cleancache would use the inode virtual address instead of +inode/filehandle, the pool id could be eliminated. But, this +won't work because cleancache retains pagecache data pages +persistently even when the inode has been pruned from the +inode unused list, and only flushes the data page if the file +gets removed/truncated. So if cleancache used the inode kva, +there would be potential coherency issues if/when the inode +kva is reused for a different file. Alternately, if cleancache +flushed the pages when the inode kva was freed, much of the value +of cleancache would be lost because the cache of pages in cleanache +is potentially much larger than the kernel pagecache and is most +useful if the pages survive inode cache removal. + +8) Why is a global variable required? + +The cleancache_enabled flag is checked in all of the frequently-used +cleancache hooks. The alternative is a function call to check a static +variable. Since cleancache is enabled dynamically at runtime, systems +that don't enable cleancache would suffer thousands (possibly +tens-of-thousands) of unnecessary function calls per second. 
So the +global variable allows cleancache to be enabled by default at compile +time, but have insignificant performance impact when cleancache remains +disabled at runtime. + +9) Does cleanache work with KVM? + +The memory model of KVM is sufficiently different that a cleancache +backend may have less value for KVM. This remains to be tested, +especially in an overcommitted system. + +10) Does cleancache work in userspace? It sounds useful for + memory hungry caches like web browsers. (Jamie Lokier) + +No plans yet, though we agree it sounds useful, at least for +apps that bypass the page cache (e.g. O_DIRECT). + +Last updated: Dan Magenheimer, April 13 2011 diff --git a/Documentation/vm/highmem.txt b/Documentation/vm/highmem.txt new file mode 100644 index 00000000..4324d24f --- /dev/null +++ b/Documentation/vm/highmem.txt @@ -0,0 +1,162 @@ + + ==================== + HIGH MEMORY HANDLING + ==================== + +By: Peter Zijlstra + +Contents: + + (*) What is high memory? + + (*) Temporary virtual mappings. + + (*) Using kmap_atomic. + + (*) Cost of temporary mappings. + + (*) i386 PAE. + + +==================== +WHAT IS HIGH MEMORY? +==================== + +High memory (highmem) is used when the size of physical memory approaches or +exceeds the maximum size of virtual memory. At that point it becomes +impossible for the kernel to keep all of the available physical memory mapped +at all times. This means the kernel needs to start using temporary mappings of +the pieces of physical memory that it wants to access. + +The part of (physical) memory not covered by a permanent mapping is what we +refer to as 'highmem'. There are various architecture dependent constraints on +where exactly that border lies. + +In the i386 arch, for example, we choose to map the kernel into every process's +VM space so that we don't have to pay the full TLB invalidation costs for +kernel entry/exit. This means the available virtual memory space (4GiB on +i386) has to be divided between user and kernel space. + +The traditional split for architectures using this approach is 3:1, 3GiB for +userspace and the top 1GiB for kernel space: + + +--------+ 0xffffffff + | Kernel | + +--------+ 0xc0000000 + | | + | User | + | | + +--------+ 0x00000000 + +This means that the kernel can at most map 1GiB of physical memory at any one +time, but because we need virtual address space for other things - including +temporary maps to access the rest of the physical memory - the actual direct +map will typically be less (usually around ~896MiB). + +Other architectures that have mm context tagged TLBs can have separate kernel +and user maps. Some hardware (like some ARMs), however, have limited virtual +space when they use mm context tags. + + +========================== +TEMPORARY VIRTUAL MAPPINGS +========================== + +The kernel contains several ways of creating temporary mappings: + + (*) vmap(). This can be used to make a long duration mapping of multiple + physical pages into a contiguous virtual space. It needs global + synchronization to unmap. + + (*) kmap(). This permits a short duration mapping of a single page. It needs + global synchronization, but is amortized somewhat. It is also prone to + deadlocks when using in a nested fashion, and so it is not recommended for + new code. + + (*) kmap_atomic(). This permits a very short duration mapping of a single + page. 
Since the mapping is restricted to the CPU that issued it, it + performs well, but the issuing task is therefore required to stay on that + CPU until it has finished, lest some other task displace its mappings. + + kmap_atomic() may also be used by interrupt contexts, since it is does not + sleep and the caller may not sleep until after kunmap_atomic() is called. + + It may be assumed that k[un]map_atomic() won't fail. + + +================= +USING KMAP_ATOMIC +================= + +When and where to use kmap_atomic() is straightforward. It is used when code +wants to access the contents of a page that might be allocated from high memory +(see __GFP_HIGHMEM), for example a page in the pagecache. The API has two +functions, and they can be used in a manner similar to the following: + + /* Find the page of interest. */ + struct page *page = find_get_page(mapping, offset); + + /* Gain access to the contents of that page. */ + void *vaddr = kmap_atomic(page); + + /* Do something to the contents of that page. */ + memset(vaddr, 0, PAGE_SIZE); + + /* Unmap that page. */ + kunmap_atomic(vaddr); + +Note that the kunmap_atomic() call takes the result of the kmap_atomic() call +not the argument. + +If you need to map two pages because you want to copy from one page to +another you need to keep the kmap_atomic calls strictly nested, like: + + vaddr1 = kmap_atomic(page1); + vaddr2 = kmap_atomic(page2); + + memcpy(vaddr1, vaddr2, PAGE_SIZE); + + kunmap_atomic(vaddr2); + kunmap_atomic(vaddr1); + + +========================== +COST OF TEMPORARY MAPPINGS +========================== + +The cost of creating temporary mappings can be quite high. The arch has to +manipulate the kernel's page tables, the data TLB and/or the MMU's registers. + +If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping +simply with a bit of arithmetic that will convert the page struct address into +a pointer to the page contents rather than juggling mappings about. In such a +case, the unmap operation may be a null operation. + +If CONFIG_MMU is not set, then there can be no temporary mappings and no +highmem. In such a case, the arithmetic approach will also be used. + + +======== +i386 PAE +======== + +The i386 arch, under some circumstances, will permit you to stick up to 64GiB +of RAM into your 32-bit machine. This has a number of consequences: + + (*) Linux needs a page-frame structure for each page in the system and the + pageframes need to live in the permanent mapping, which means: + + (*) you can have 896M/sizeof(struct page) page-frames at most; with struct + page being 32-bytes that would end up being something in the order of 112G + worth of pages; the kernel, however, needs to store more than just + page-frames in that memory... + + (*) PAE makes your page tables larger - which slows the system down as more + data has to be accessed to traverse in TLB fills and the like. One + advantage is that PAE has more PTE bits and can provide advanced features + like NX and PAT. + +The general recommendation is that you don't use more than 8GiB on a 32-bit +machine - although more might work for you and your workload, you're pretty +much on your own - don't expect kernel developers to really care much if things +come apart. 
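+
+As a companion to the kmap_atomic() example in the USING KMAP_ATOMIC section,
+here is a minimal sketch of the sleepable kmap()/kunmap() pair (the helper
+name is invented for this example; remember the nesting/deadlock caveat given
+under TEMPORARY VIRTUAL MAPPINGS, and that kmap() may sleep and therefore
+must not be used in interrupt context):
+
+	#include <linux/highmem.h>
+
+	static void zero_page_sleepable(struct page *page)
+	{
+		/* May sleep while waiting for a free kmap slot. */
+		void *vaddr = kmap(page);
+
+		memset(vaddr, 0, PAGE_SIZE);
+
+		/* Unlike kunmap_atomic(), kunmap() takes the struct page,
+		 * not the virtual address returned by kmap(). */
+		kunmap(page);
+	}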
diff --git a/Documentation/vm/hugepage-mmap.c b/Documentation/vm/hugepage-mmap.c new file mode 100644 index 00000000..db0dd9a3 --- /dev/null +++ b/Documentation/vm/hugepage-mmap.c @@ -0,0 +1,91 @@ +/* + * hugepage-mmap: + * + * Example of using huge page memory in a user application using the mmap + * system call. Before running this application, make sure that the + * administrator has mounted the hugetlbfs filesystem (on some directory + * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this + * example, the app is requesting memory of size 256MB that is backed by + * huge pages. + * + * For the ia64 architecture, the Linux kernel reserves Region number 4 for + * huge pages. That means that if one requires a fixed address, a huge page + * aligned address starting with 0x800000... will be required. If a fixed + * address is not required, the kernel will select an address in the proper + * range. + * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. + */ + +#include +#include +#include +#include +#include + +#define FILE_NAME "/mnt/hugepagefile" +#define LENGTH (256UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define FLAGS (MAP_SHARED | MAP_FIXED) +#else +#define ADDR (void *)(0x0UL) +#define FLAGS (MAP_SHARED) +#endif + +static void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr) +{ + unsigned long i; + + for (i = 0; i < LENGTH; i++) + *(addr + i) = (char)i; +} + +static void read_bytes(char *addr) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < LENGTH; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + break; + } +} + +int main(void) +{ + void *addr; + int fd; + + fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); + if (fd < 0) { + perror("Open failed"); + exit(1); + } + + addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + unlink(FILE_NAME); + exit(1); + } + + printf("Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr); + read_bytes(addr); + + munmap(addr, LENGTH); + close(fd); + unlink(FILE_NAME); + + return 0; +} diff --git a/Documentation/vm/hugepage-shm.c b/Documentation/vm/hugepage-shm.c new file mode 100644 index 00000000..07956d85 --- /dev/null +++ b/Documentation/vm/hugepage-shm.c @@ -0,0 +1,98 @@ +/* + * hugepage-shm: + * + * Example of using huge page memory in a user application using Sys V shared + * memory system calls. In this example the app is requesting 256MB of + * memory that is backed by huge pages. The application uses the flag + * SHM_HUGETLB in the shmget system call to inform the kernel that it is + * requesting huge pages. + * + * For the ia64 architecture, the Linux kernel reserves Region number 4 for + * huge pages. That means that if one requires a fixed address, a huge page + * aligned address starting with 0x800000... will be required. If a fixed + * address is not required, the kernel will select an address in the proper + * range. + * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. + * + * Note: The default shared memory limit is quite low on many kernels, + * you may need to increase it via: + * + * echo 268435456 > /proc/sys/kernel/shmmax + * + * This will increase the maximum size per shared memory segment to 256MB. 
+ * The other limit that you will hit eventually is shmall which is the + * total amount of shared memory in pages. To set it to 16GB on a system + * with a 4kB pagesize do: + * + * echo 4194304 > /proc/sys/kernel/shmall + */ + +#include +#include +#include +#include +#include +#include + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + +#define LENGTH (256UL*1024*1024) + +#define dprintf(x) printf(x) + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define SHMAT_FLAGS (SHM_RND) +#else +#define ADDR (void *)(0x0UL) +#define SHMAT_FLAGS (0) +#endif + +int main(void) +{ + int shmid; + unsigned long i; + char *shmaddr; + + if ((shmid = shmget(2, LENGTH, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { + perror("shmget"); + exit(1); + } + printf("shmid: 0x%x\n", shmid); + + shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); + if (shmaddr == (char *)-1) { + perror("Shared memory attach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(2); + } + printf("shmaddr: %p\n", shmaddr); + + dprintf("Starting the writes:\n"); + for (i = 0; i < LENGTH; i++) { + shmaddr[i] = (char)(i); + if (!(i % (1024 * 1024))) + dprintf("."); + } + dprintf("\n"); + + dprintf("Starting the Check..."); + for (i = 0; i < LENGTH; i++) + if (shmaddr[i] != (char)i) + printf("\nIndex %lu mismatched\n", i); + dprintf("Done.\n"); + + if (shmdt((const void *)shmaddr) != 0) { + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(3); + } + + shmctl(shmid, IPC_RMID, NULL); + + return 0; +} diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt new file mode 100644 index 00000000..f8551b38 --- /dev/null +++ b/Documentation/vm/hugetlbpage.txt @@ -0,0 +1,309 @@ + +The intent of this file is to give a brief summary of hugetlbpage support in +the Linux kernel. This support is built on top of multiple page size support +that is provided by most modern architectures. For example, i386 +architecture supports 4K and 4M (2M in PAE mode) page sizes, ia64 +architecture supports multiple page sizes 4K, 8K, 64K, 256K, 1M, 4M, 16M, +256M and ppc64 supports 4K and 16M. A TLB is a cache of virtual-to-physical +translations. Typically this is a very scarce resource on processor. +Operating systems try to make best use of limited number of TLB resources. +This optimization is more critical now as bigger and bigger physical memories +(several GBs) are more readily available. + +Users can use the huge page support in Linux kernel by either using the mmap +system call or standard SYSV shared memory system calls (shmget, shmat). + +First the Linux kernel needs to be built with the CONFIG_HUGETLBFS +(present under "File systems") and CONFIG_HUGETLB_PAGE (selected +automatically when CONFIG_HUGETLBFS is selected) configuration +options. + +The /proc/meminfo file provides information about the total number of +persistent hugetlb pages in the kernel's huge page pool. It also displays +information about the number of free, reserved and surplus huge pages and the +default huge page size. The huge page size is needed for generating the +proper alignment and size of the arguments to system calls that map huge page +regions. + +The output of "cat /proc/meminfo" will include lines like: + +..... +HugePages_Total: vvv +HugePages_Free: www +HugePages_Rsvd: xxx +HugePages_Surp: yyy +Hugepagesize: zzz kB + +where: +HugePages_Total is the size of the pool of huge pages. +HugePages_Free is the number of huge pages in the pool that are not yet + allocated. 
+HugePages_Rsvd is short for "reserved," and is the number of huge pages for + which a commitment to allocate from the pool has been made, + but no allocation has yet been made. Reserved huge pages + guarantee that an application will be able to allocate a + huge page from the pool of huge pages at fault time. +HugePages_Surp is short for "surplus," and is the number of huge pages in + the pool above the value in /proc/sys/vm/nr_hugepages. The + maximum number of surplus huge pages is controlled by + /proc/sys/vm/nr_overcommit_hugepages. + +/proc/filesystems should also show a filesystem of type "hugetlbfs" configured +in the kernel. + +/proc/sys/vm/nr_hugepages indicates the current number of "persistent" huge +pages in the kernel's huge page pool. "Persistent" huge pages will be +returned to the huge page pool when freed by a task. A user with root +privileges can dynamically allocate more or free some persistent huge pages +by increasing or decreasing the value of 'nr_hugepages'. + +Pages that are used as huge pages are reserved inside the kernel and cannot +be used for other purposes. Huge pages cannot be swapped out under +memory pressure. + +Once a number of huge pages have been pre-allocated to the kernel huge page +pool, a user with appropriate privilege can use either the mmap system call +or shared memory system calls to use the huge pages. See the discussion of +Using Huge Pages, below. + +The administrator can allocate persistent huge pages on the kernel boot +command line by specifying the "hugepages=N" parameter, where 'N' = the +number of huge pages requested. This is the most reliable method of +allocating huge pages as memory has not yet become fragmented. + +Some platforms support multiple huge page sizes. To allocate huge pages +of a specific size, one must precede the huge pages boot command parameters +with a huge page size selection parameter "hugepagesz=". must +be specified in bytes with optional scale suffix [kKmMgG]. The default huge +page size may be selected with the "default_hugepagesz=" boot parameter. + +When multiple huge page sizes are supported, /proc/sys/vm/nr_hugepages +indicates the current number of pre-allocated huge pages of the default size. +Thus, one can use the following command to dynamically allocate/deallocate +default sized persistent huge pages: + + echo 20 > /proc/sys/vm/nr_hugepages + +This command will try to adjust the number of default sized huge pages in the +huge page pool to 20, allocating or freeing huge pages, as required. + +On a NUMA platform, the kernel will attempt to distribute the huge page pool +over all the set of allowed nodes specified by the NUMA memory policy of the +task that modifies nr_hugepages. The default for the allowed nodes--when the +task has default memory policy--is all on-line nodes with memory. Allowed +nodes with insufficient available, contiguous memory for a huge page will be +silently skipped when allocating persistent huge pages. See the discussion +below of the interaction of task memory policy, cpusets and per node attributes +with the allocation and freeing of persistent huge pages. + +The success or failure of huge page allocation depends on the amount of +physically contiguous memory that is present in system at the time of the +allocation attempt. If the kernel is unable to allocate huge pages from +some nodes in a NUMA system, it will attempt to make up the difference by +allocating extra pages on other nodes with sufficient available contiguous +memory, if any. 
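+
+As an illustration only (the sizes shown must be supported by the platform
+and the counts are arbitrary), a boot command line that pre-allocates both
+1GB and 2MB pools could look like:
+
+    hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512
+
+The nr_hugepages interfaces described above then apply to the default huge
+page size, while each per-size pool can be inspected and, where supported,
+adjusted through the sysfs hierarchy described later in this file.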
+ +System administrators may want to put this command in one of the local rc +init files. This will enable the kernel to allocate huge pages early in +the boot process when the possibility of getting physical contiguous pages +is still very high. Administrators can verify the number of huge pages +actually allocated by checking the sysctl or meminfo. To check the per node +distribution of huge pages in a NUMA system, use: + + cat /sys/devices/system/node/node*/meminfo | fgrep Huge + +/proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of +huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are +requested by applications. Writing any non-zero value into this file +indicates that the hugetlb subsystem is allowed to try to obtain that +number of "surplus" huge pages from the kernel's normal page pool, when the +persistent huge page pool is exhausted. As these surplus huge pages become +unused, they are freed back to the kernel's normal page pool. + +When increasing the huge page pool size via nr_hugepages, any existing surplus +pages will first be promoted to persistent huge pages. Then, additional +huge pages will be allocated, if necessary and if possible, to fulfill +the new persistent huge page pool size. + +The administrator may shrink the pool of persistent huge pages for +the default huge page size by setting the nr_hugepages sysctl to a +smaller value. The kernel will attempt to balance the freeing of huge pages +across all nodes in the memory policy of the task modifying nr_hugepages. +Any free huge pages on the selected nodes will be freed back to the kernel's +normal page pool. + +Caveat: Shrinking the persistent huge page pool via nr_hugepages such that +it becomes less than the number of huge pages in use will convert the balance +of the in-use huge pages to surplus huge pages. This will occur even if +the number of surplus pages it would exceed the overcommit value. As long as +this condition holds--that is, until nr_hugepages+nr_overcommit_hugepages is +increased sufficiently, or the surplus huge pages go out of use and are freed-- +no more surplus huge pages will be allowed to be allocated. + +With support for multiple huge page pools at run-time available, much of +the huge page userspace interface in /proc/sys/vm has been duplicated in sysfs. +The /proc interfaces discussed above have been retained for backwards +compatibility. The root huge page control directory in sysfs is: + + /sys/kernel/mm/hugepages + +For each huge page size supported by the running kernel, a subdirectory +will exist, of the form: + + hugepages-${size}kB + +Inside each of these directories, the same set of files will exist: + + nr_hugepages + nr_hugepages_mempolicy + nr_overcommit_hugepages + free_hugepages + resv_hugepages + surplus_hugepages + +which function as described above for the default huge page-sized case. + + +Interaction of Task Memory Policy with Huge Page Allocation/Freeing + +Whether huge pages are allocated and freed via the /proc interface or +the /sysfs interface using the nr_hugepages_mempolicy attribute, the NUMA +nodes from which huge pages are allocated or freed are controlled by the +NUMA memory policy of the task that modifies the nr_hugepages_mempolicy +sysctl or attribute. When the nr_hugepages attribute is used, mempolicy +is ignored. 
+ +The recommended method to allocate or free huge pages to/from the kernel +huge page pool, using the nr_hugepages example above, is: + + numactl --interleave echo 20 \ + >/proc/sys/vm/nr_hugepages_mempolicy + +or, more succinctly: + + numactl -m echo 20 >/proc/sys/vm/nr_hugepages_mempolicy + +This will allocate or free abs(20 - nr_hugepages) to or from the nodes +specified in , depending on whether number of persistent huge pages +is initially less than or greater than 20, respectively. No huge pages will be +allocated nor freed on any node not included in the specified . + +When adjusting the persistent hugepage count via nr_hugepages_mempolicy, any +memory policy mode--bind, preferred, local or interleave--may be used. The +resulting effect on persistent huge page allocation is as follows: + +1) Regardless of mempolicy mode [see Documentation/vm/numa_memory_policy.txt], + persistent huge pages will be distributed across the node or nodes + specified in the mempolicy as if "interleave" had been specified. + However, if a node in the policy does not contain sufficient contiguous + memory for a huge page, the allocation will not "fallback" to the nearest + neighbor node with sufficient contiguous memory. To do this would cause + undesirable imbalance in the distribution of the huge page pool, or + possibly, allocation of persistent huge pages on nodes not allowed by + the task's memory policy. + +2) One or more nodes may be specified with the bind or interleave policy. + If more than one node is specified with the preferred policy, only the + lowest numeric id will be used. Local policy will select the node where + the task is running at the time the nodes_allowed mask is constructed. + For local policy to be deterministic, the task must be bound to a cpu or + cpus in a single node. Otherwise, the task could be migrated to some + other node at any time after launch and the resulting node will be + indeterminate. Thus, local policy is not very useful for this purpose. + Any of the other mempolicy modes may be used to specify a single node. + +3) The nodes allowed mask will be derived from any non-default task mempolicy, + whether this policy was set explicitly by the task itself or one of its + ancestors, such as numactl. This means that if the task is invoked from a + shell with non-default policy, that policy will be used. One can specify a + node list of "all" with numactl --interleave or --membind [-m] to achieve + interleaving over all nodes in the system or cpuset. + +4) Any task mempolicy specifed--e.g., using numactl--will be constrained by + the resource limits of any cpuset in which the task runs. Thus, there will + be no way for a task with non-default policy running in a cpuset with a + subset of the system nodes to allocate huge pages outside the cpuset + without first moving to a cpuset that contains all of the desired nodes. + +5) Boot-time huge page allocation attempts to distribute the requested number + of huge pages over all on-lines nodes with memory. + +Per Node Hugepages Attributes + +A subset of the contents of the root huge page control directory in sysfs, +described above, will be replicated under each the system device of each +NUMA node with memory in: + + /sys/devices/system/node/node[0-9]*/hugepages/ + +Under this directory, the subdirectory for each supported huge page size +contains the following attribute files: + + nr_hugepages + free_hugepages + surplus_hugepages + +The free_' and surplus_' attribute files are read-only. 
They return the number +of free and surplus [overcommitted] huge pages, respectively, on the parent +node. + +The nr_hugepages attribute returns the total number of huge pages on the +specified node. When this attribute is written, the number of persistent huge +pages on the parent node will be adjusted to the specified value, if sufficient +resources exist, regardless of the task's mempolicy or cpuset constraints. + +Note that the number of overcommit and reserve pages remain global quantities, +as we don't know until fault time, when the faulting task's mempolicy is +applied, from which node the huge page allocation will be attempted. + + +Using Huge Pages + +If the user applications are going to request huge pages using mmap system +call, then it is required that system administrator mount a file system of +type hugetlbfs: + + mount -t hugetlbfs \ + -o uid=,gid=,mode=,size=,nr_inodes= \ + none /mnt/huge + +This command mounts a (pseudo) filesystem of type hugetlbfs on the directory +/mnt/huge. Any files created on /mnt/huge uses huge pages. The uid and gid +options sets the owner and group of the root of the file system. By default +the uid and gid of the current process are taken. The mode option sets the +mode of root of file system to value & 0777. This value is given in octal. +By default the value 0755 is picked. The size option sets the maximum value of +memory (huge pages) allowed for that filesystem (/mnt/huge). The size is +rounded down to HPAGE_SIZE. The option nr_inodes sets the maximum number of +inodes that /mnt/huge can use. If the size or nr_inodes option is not +provided on command line then no limits are set. For size and nr_inodes +options, you can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. For +example, size=2K has the same meaning as size=2048. + +While read system calls are supported on files that reside on hugetlb +file systems, write system calls are not. + +Regular chown, chgrp, and chmod commands (with right permissions) could be +used to change the file attributes on hugetlbfs. + +Also, it is important to note that no such mount command is required if the +applications are going to use only shmat/shmget system calls or mmap with +MAP_HUGETLB. Users who wish to use hugetlb page via shared memory segment +should be a member of a supplementary group and system admin needs to +configure that gid into /proc/sys/vm/hugetlb_shm_group. It is possible for +same or different applications to use any combination of mmaps and shm* +calls, though the mount of filesystem will be required for using mmap calls +without MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see +map_hugetlb.c. + +******************************************************************* + +/* + * hugepage-shm: see Documentation/vm/hugepage-shm.c + */ + +******************************************************************* + +/* + * hugepage-mmap: see Documentation/vm/hugepage-mmap.c + */ diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt new file mode 100644 index 00000000..55006846 --- /dev/null +++ b/Documentation/vm/hwpoison.txt @@ -0,0 +1,182 @@ +What is hwpoison? + +Upcoming Intel CPUs have support for recovering from some memory errors +(``MCA recovery''). This requires the OS to declare a page "poisoned", +kill the processes associated with it and avoid using it in the future. + +This patchkit implements the necessary infrastructure in the VM. + +To quote the overview comment: + + * High level machine check handler. 
Handles pages reported by the + * hardware as being corrupted usually due to a 2bit ECC memory or cache + * failure. + * + * This focusses on pages detected as corrupted in the background. + * When the current CPU tries to consume corruption the currently + * running process can just be killed directly instead. This implies + * that if the error cannot be handled for some reason it's safe to + * just ignore it because no corruption has been consumed yet. Instead + * when that happens another machine check will happen. + * + * Handles page cache pages in various states. The tricky part + * here is that we can access any page asynchronous to other VM + * users, because memory failures could happen anytime and anywhere, + * possibly violating some of their assumptions. This is why this code + * has to be extremely careful. Generally it tries to use normal locking + * rules, as in get the standard locks, even if that means the + * error handling takes potentially a long time. + * + * Some of the operations here are somewhat inefficient and have non + * linear algorithmic complexity, because the data structures have not + * been optimized for this case. This is in particular the case + * for the mapping from a vma to a process. Since this case is expected + * to be rare we hope we can get away with this. + +The code consists of a the high level handler in mm/memory-failure.c, +a new page poison bit and various checks in the VM to handle poisoned +pages. + +The main target right now is KVM guests, but it works for all kinds +of applications. KVM support requires a recent qemu-kvm release. + +For the KVM use there was need for a new signal type so that +KVM can inject the machine check into the guest with the proper +address. This in theory allows other applications to handle +memory failures too. The expection is that near all applications +won't do that, but some very specialized ones might. + +--- + +There are two (actually three) modi memory failure recovery can be in: + +vm.memory_failure_recovery sysctl set to zero: + All memory failures cause a panic. Do not attempt recovery. + (on x86 this can be also affected by the tolerant level of the + MCE subsystem) + +early kill + (can be controlled globally and per process) + Send SIGBUS to the application as soon as the error is detected + This allows applications who can process memory errors in a gentle + way (e.g. drop affected object) + This is the mode used by KVM qemu. + +late kill + Send SIGBUS when the application runs into the corrupted page. + This is best for memory error unaware applications and default + Note some pages are always handled as late kill. + +--- + +User control: + +vm.memory_failure_recovery + See sysctl.txt + +vm.memory_failure_early_kill + Enable early kill mode globally + +PR_MCE_KILL + Set early/late kill mode/revert to system default + arg1: PR_MCE_KILL_CLEAR: Revert to system default + arg1: PR_MCE_KILL_SET: arg2 defines thread specific mode + PR_MCE_KILL_EARLY: Early kill + PR_MCE_KILL_LATE: Late kill + PR_MCE_KILL_DEFAULT: Use system global default +PR_MCE_KILL_GET + return current mode + + +--- + +Testing: + +madvise(MADV_HWPOISON, ....) + (as root) + Poison a page in the process for testing + + +hwpoison-inject module through debugfs + +/sys/debug/hwpoison/ + +corrupt-pfn + +Inject hwpoison fault at PFN echoed into this file. This does +some early filtering to avoid corrupted unintended pages in test suites. + +unpoison-pfn + +Software-unpoison page at PFN echoed into this file. 
This +way a page can be reused again. +This only works for Linux injected failures, not for real +memory failures. + +Note these injection interfaces are not stable and might change between +kernel versions + +corrupt-filter-dev-major +corrupt-filter-dev-minor + +Only handle memory failures to pages associated with the file system defined +by block device major/minor. -1U is the wildcard value. +This should be only used for testing with artificial injection. + +corrupt-filter-memcg + +Limit injection to pages owned by memgroup. Specified by inode number +of the memcg. + +Example: + mkdir /sys/fs/cgroup/mem/hwpoison + + usemem -m 100 -s 1000 & + echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks + + memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ') + echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg + + page-types -p `pidof init` --hwpoison # shall do nothing + page-types -p `pidof usemem` --hwpoison # poison its pages + +corrupt-filter-flags-mask +corrupt-filter-flags-value + +When specified, only poison pages if ((page_flags & mask) == value). +This allows stress testing of many kinds of pages. The page_flags +are the same as in /proc/kpageflags. The flag bits are defined in +include/linux/kernel-page-flags.h and documented in +Documentation/vm/pagemap.txt + +Architecture specific MCE injector + +x86 has mce-inject, mce-test + +Some portable hwpoison test programs in mce-test, see blow. + +--- + +References: + +http://halobates.de/mce-lc09-2.pdf + Overview presentation from LinuxCon 09 + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git + Test suite (hwpoison specific portable tests in tsrc) + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git + x86 specific injector + + +--- + +Limitations: + +- Not all page types are supported and never will. Most kernel internal +objects cannot be recovered, only LRU pages for now. +- Right now hugepage support is missing. + +--- +Andi Kleen, Oct 2009 + diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt new file mode 100644 index 00000000..b392e496 --- /dev/null +++ b/Documentation/vm/ksm.txt @@ -0,0 +1,82 @@ +How to use the Kernel Samepage Merging feature +---------------------------------------------- + +KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y, +added to the Linux kernel in 2.6.32. See mm/ksm.c for its implementation, +and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/ + +The KSM daemon ksmd periodically scans those areas of user memory which +have been registered with it, looking for pages of identical content which +can be replaced by a single write-protected page (which is automatically +copied if a process later wants to update its content). + +KSM was originally developed for use with KVM (where it was known as +Kernel Shared Memory), to fit more virtual machines into physical memory, +by sharing the data common between them. But it can be useful to any +application which generates many instances of the same data. + +KSM only merges anonymous (private) pages, never pagecache (file) pages. +KSM's merged pages were originally locked into kernel memory, but can now +be swapped out just like other user pages (but sharing is broken when they +are swapped back in: ksmd must rediscover their identity and merge again). + +KSM only operates on those areas of address space which an application +has advised to be likely candidates for merging, by using the madvise(2) +system call: int madvise(addr, length, MADV_MERGEABLE). 
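+
+For example, an application might register an anonymous region as a
+merge candidate along the following lines. This is only a minimal
+sketch: it assumes the C library's <sys/mman.h> defines MADV_MERGEABLE
+(if it does not, the kernel ABI value 12 can be used), and the memset()
+is there purely to give ksmd identical pages to merge:
+
+	#include <stdio.h>
+	#include <string.h>
+	#include <sys/mman.h>
+
+	#ifndef MADV_MERGEABLE
+	#define MADV_MERGEABLE 12	/* value from the kernel ABI */
+	#endif
+
+	#define LENGTH (16UL * 1024 * 1024)
+
+	int main(void)
+	{
+		/* KSM only merges anonymous memory, so map anonymously. */
+		char *p = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
+			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (p == MAP_FAILED) {
+			perror("mmap");
+			return 1;
+		}
+
+		/* Identical page contents give ksmd something to merge. */
+		memset(p, 0x5a, LENGTH);
+
+		/* Advise the kernel that this range is a merge candidate. */
+		if (madvise(p, LENGTH, MADV_MERGEABLE) != 0)
+			perror("madvise(MADV_MERGEABLE)");
+
+		/*
+		 * ... real work would go here; merging only happens while
+		 * ksmd itself is running, see the run sysfs file below ...
+		 */
+
+		munmap(p, LENGTH);
+		return 0;
+	}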
+ +The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel +that advice and restore unshared pages: whereupon KSM unmerges whatever +it merged in that range. Note: this unmerging call may suddenly require +more memory than is available - possibly failing with EAGAIN, but more +probably arousing the Out-Of-Memory killer. + +If KSM is not configured into the running kernel, madvise MADV_MERGEABLE +and MADV_UNMERGEABLE simply fail with EINVAL. If the running kernel was +built with CONFIG_KSM=y, those calls will normally succeed: even if the +the KSM daemon is not currently running, MADV_MERGEABLE still registers +the range for whenever the KSM daemon is started; even if the range +cannot contain any pages which KSM could actually merge; even if +MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE. + +Like other madvise calls, they are intended for use on mapped areas of +the user address space: they will report ENOMEM if the specified range +includes unmapped gaps (though working on the intervening mapped areas), +and might fail with EAGAIN if not enough memory for internal structures. + +Applications should be considerate in their use of MADV_MERGEABLE, +restricting its use to areas likely to benefit. KSM's scans may use a lot +of processing power: some installations will disable KSM for that reason. + +The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/, +readable by all but writable only by root: + +pages_to_scan - how many present pages to scan before ksmd goes to sleep + e.g. "echo 100 > /sys/kernel/mm/ksm/pages_to_scan" + Default: 100 (chosen for demonstration purposes) + +sleep_millisecs - how many milliseconds ksmd should sleep before next scan + e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" + Default: 20 (chosen for demonstration purposes) + +run - set 0 to stop ksmd from running but keep merged pages, + set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", + set 2 to stop ksmd and unmerge all pages currently merged, + but leave mergeable areas registered for next run + Default: 0 (must be changed to 1 to activate KSM, + except if CONFIG_SYSFS is disabled) + +The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: + +pages_shared - how many shared pages are being used +pages_sharing - how many more sites are sharing them i.e. how much saved +pages_unshared - how many pages unique but repeatedly checked for merging +pages_volatile - how many pages changing too fast to be placed in a tree +full_scans - how many times all mergeable areas have been scanned + +A high ratio of pages_sharing to pages_shared indicates good sharing, but +a high ratio of pages_unshared to pages_sharing indicates wasted effort. +pages_volatile embraces several different kinds of activity, but a high +proportion there would also indicate poor use of madvise MADV_MERGEABLE. + +Izik Eidus, +Hugh Dickins, 17 Nov 2009 diff --git a/Documentation/vm/locking b/Documentation/vm/locking new file mode 100644 index 00000000..f61228bd --- /dev/null +++ b/Documentation/vm/locking @@ -0,0 +1,130 @@ +Started Oct 1999 by Kanoj Sarcar + +The intent of this file is to have an uptodate, running commentary +from different people about how locking and synchronization is done +in the Linux vm code. + +page_table_lock & mmap_sem +-------------------------------------- + +Page stealers pick processes out of the process pool and scan for +the best process to steal pages from. 
To guarantee the existence +of the victim mm, a mm_count inc and a mmdrop are done in swap_out(). +Page stealers hold kernel_lock to protect against a bunch of races. +The vma list of the victim mm is also scanned by the stealer, +and the page_table_lock is used to preserve list sanity against the +process adding/deleting to the list. This also guarantees existence +of the vma. Vma existence is not guaranteed once try_to_swap_out() +drops the page_table_lock. To guarantee the existence of the underlying +file structure, a get_file is done before the swapout() method is +invoked. The page passed into swapout() is guaranteed not to be reused +for a different purpose because the page reference count due to being +present in the user's pte is not released till after swapout() returns. + +Any code that modifies the vmlist, or the vm_start/vm_end/ +vm_flags:VM_LOCKED/vm_next of any vma *in the list* must prevent +kswapd from looking at the chain. + +The rules are: +1. To scan the vmlist (look but don't touch) you must hold the + mmap_sem with read bias, i.e. down_read(&mm->mmap_sem) +2. To modify the vmlist you need to hold the mmap_sem with + read&write bias, i.e. down_write(&mm->mmap_sem) *AND* + you need to take the page_table_lock. +3. The swapper takes _just_ the page_table_lock, this is done + because the mmap_sem can be an extremely long lived lock + and the swapper just cannot sleep on that. +4. The exception to this rule is expand_stack, which just + takes the read lock and the page_table_lock, this is ok + because it doesn't really modify fields anybody relies on. +5. You must be able to guarantee that while holding page_table_lock + or page_table_lock of mm A, you will not try to get either lock + for mm B. + +The caveats are: +1. find_vma() makes use of, and updates, the mmap_cache pointer hint. +The update of mmap_cache is racy (page stealer can race with other code +that invokes find_vma with mmap_sem held), but that is okay, since it +is a hint. This can be fixed, if desired, by having find_vma grab the +page_table_lock. + + +Code that add/delete elements from the vmlist chain are +1. callers of insert_vm_struct +2. callers of merge_segments +3. callers of avl_remove + +Code that changes vm_start/vm_end/vm_flags:VM_LOCKED of vma's on +the list: +1. expand_stack +2. mprotect +3. mlock +4. mremap + +It is advisable that changes to vm_start/vm_end be protected, although +in some cases it is not really needed. Eg, vm_start is modified by +expand_stack(), it is hard to come up with a destructive scenario without +having the vmlist protection in this case. + +The page_table_lock nests with the inode i_mmap_mutex and the kmem cache +c_spinlock spinlocks. This is okay, since the kmem code asks for pages after +dropping c_spinlock. The page_table_lock also nests with pagecache_lock and +pagemap_lru_lock spinlocks, and no code asks for memory with these locks +held. + +The page_table_lock is grabbed while holding the kernel_lock spinning monitor. + +The page_table_lock is a spin lock. + +Note: PTL can also be used to guarantee that no new clones using the +mm start up ... this is a loose form of stability on mm_users. For +example, it is used in copy_mm to protect against a racing tlb_gather_mmu +single address space optimization, so that the zap_page_range (from +truncate) does not lose sending ipi's to cloned threads that might +be spawned underneath it and go to user mode to drag in pte's into tlbs. 
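+
+Schematically, and only as an illustration of rules 1 and 2 above (this
+is not lifted from any particular kernel function, and examine() is just
+a stand-in for whatever read-only processing is done):
+
+	/* Rule 1: look, but don't touch. */
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		examine(vma);
+	up_read(&mm->mmap_sem);
+
+	/* Rule 2: modifying the vmlist needs the write lock AND the PTL. */
+	down_write(&mm->mmap_sem);
+	spin_lock(&mm->page_table_lock);
+	insert_vm_struct(mm, new_vma);
+	spin_unlock(&mm->page_table_lock);
+	up_write(&mm->mmap_sem);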
+ +swap_lock +-------------- +The swap devices are chained in priority order from the "swap_list" header. +The "swap_list" is used for the round-robin swaphandle allocation strategy. +The #free swaphandles is maintained in "nr_swap_pages". These two together +are protected by the swap_lock. + +The swap_lock also protects all the device reference counts on the +corresponding swaphandles, maintained in the "swap_map" array, and the +"highest_bit" and "lowest_bit" fields. + +The swap_lock is a spinlock, and is never acquired from intr level. + +To prevent races between swap space deletion or async readahead swapins +deciding whether a swap handle is being used, ie worthy of being read in +from disk, and an unmap -> swap_free making the handle unused, the swap +delete and readahead code grabs a temp reference on the swaphandle to +prevent warning messages from swap_duplicate <- read_swap_cache_async. + +Swap cache locking +------------------ +Pages are added into the swap cache with kernel_lock held, to make sure +that multiple pages are not being added (and hence lost) by associating +all of them with the same swaphandle. + +Pages are guaranteed not to be removed from the scache if the page is +"shared": ie, other processes hold reference on the page or the associated +swap handle. The only code that does not follow this rule is shrink_mmap, +which deletes pages from the swap cache if no process has a reference on +the page (multiple processes might have references on the corresponding +swap handle though). lookup_swap_cache() races with shrink_mmap, when +establishing a reference on a scache page, so, it must check whether the +page it located is still in the swapcache, or shrink_mmap deleted it. +(This race is due to the fact that shrink_mmap looks at the page ref +count with pagecache_lock, but then drops pagecache_lock before deleting +the page from the scache). + +do_wp_page and do_swap_page have MP races in them while trying to figure +out whether a page is "shared", by looking at the page_count + swap_count. +To preserve the sum of the counts, the page lock _must_ be acquired before +calling is_page_shared (else processes might switch their swap_count refs +to the page count refs, after the page count ref has been snapshotted). + +Swap device deletion code currently breaks all the scache assumptions, +since it grabs neither mmap_sem nor page_table_lock. diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c new file mode 100644 index 00000000..eda1a6d3 --- /dev/null +++ b/Documentation/vm/map_hugetlb.c @@ -0,0 +1,77 @@ +/* + * Example of using hugepage memory in a user application using the mmap + * system call with MAP_HUGETLB flag. Before running this program make + * sure the administrator has allocated enough default sized huge pages + * to cover the 256 MB allocation. + * + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. 
+ */ +#include +#include +#include +#include +#include + +#define LENGTH (256UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define ADDR (void *)(0x0UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr) +{ + unsigned long i; + + for (i = 0; i < LENGTH; i++) + *(addr + i) = (char)i; +} + +static void read_bytes(char *addr) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < LENGTH; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + break; + } +} + +int main(void) +{ + void *addr; + + addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + printf("Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr); + read_bytes(addr); + + munmap(addr, LENGTH); + + return 0; +} diff --git a/Documentation/vm/numa b/Documentation/vm/numa new file mode 100644 index 00000000..a200a386 --- /dev/null +++ b/Documentation/vm/numa @@ -0,0 +1,149 @@ +Started Nov 1999 by Kanoj Sarcar + +What is NUMA? + +This question can be answered from a couple of perspectives: the +hardware view and the Linux software view. + +From the hardware perspective, a NUMA system is a computer platform that +comprises multiple components or assemblies each of which may contain 0 +or more CPUs, local memory, and/or IO buses. For brevity and to +disambiguate the hardware view of these physical components/assemblies +from the software abstraction thereof, we'll call the components/assemblies +'cells' in this document. + +Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset +of the system--although some components necessary for a stand-alone SMP system +may not be populated on any given cell. The cells of the NUMA system are +connected together with some sort of system interconnect--e.g., a crossbar or +point-to-point link are common types of NUMA system interconnects. Both of +these types of interconnects can be aggregated to create NUMA platforms with +cells at multiple distances from other cells. + +For Linux, the NUMA platforms of interest are primarily what is known as Cache +Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible +to and accessible from any CPU attached to any cell and cache coherency +is handled in hardware by the processor caches and/or the system interconnect. + +Memory access time and effective memory bandwidth varies depending on how far +away the cell containing the CPU or IO bus making the memory access is from the +cell containing the target memory. For example, access to memory by CPUs +attached to the same cell will experience faster access times and higher +bandwidths than accesses to memory on other, remote cells. NUMA platforms +can have cells at multiple remote distances from any given cell. + +Platform vendors don't build NUMA systems just to make software developers' +lives interesting. Rather, this architecture is a means to provide scalable +memory bandwidth. 
However, to achieve scalable memory bandwidth, system and +application software must arrange for a large majority of the memory references +[cache misses] to be to "local" memory--memory on the same cell, if any--or +to the closest cell with memory. + +This leads to the Linux software view of a NUMA system: + +Linux divides the system's hardware resources into multiple software +abstractions called "nodes". Linux maps the nodes onto the physical cells +of the hardware platform, abstracting away some of the details for some +architectures. As with physical cells, software nodes may contain 0 or more +CPUs, memory and/or IO buses. And, again, memory accesses to memory on +"closer" nodes--nodes that map to closer cells--will generally experience +faster access times and higher effective bandwidth than accesses to more +remote cells. + +For some architectures, such as x86, Linux will "hide" any node representing a +physical cell that has no memory attached, and reassign any CPUs attached to +that cell to a node representing a cell that does have memory. Thus, on +these architectures, one cannot assume that all CPUs that Linux associates with +a given node will see the same local memory access times and bandwidth. + +In addition, for some architectures, again x86 is an example, Linux supports +the emulation of additional nodes. For NUMA emulation, linux will carve up +the existing nodes--or the system memory for non-NUMA platforms--into multiple +nodes. Each emulated node will manage a fraction of the underlying cells' +physical memory. NUMA emluation is useful for testing NUMA kernel and +application features on non-NUMA platforms, and as a sort of memory resource +management mechanism when used together with cpusets. +[see Documentation/cgroups/cpusets.txt] + +For each node with memory, Linux constructs an independent memory management +subsystem, complete with its own free page lists, in-use page lists, usage +statistics and locks to mediate access. In addition, Linux constructs for +each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE], +an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a +selected zone/node cannot satisfy the allocation request. This situation, +when a zone has no available memory to satisfy a request, is called +"overflow" or "fallback". + +Because some nodes contain multiple zones containing different types of +memory, Linux must decide whether to order the zonelists such that allocations +fall back to the same zone type on a different node, or to a different zone +type on the same node. This is an important consideration because some zones, +such as DMA or DMA32, represent relatively scarce resources. Linux chooses +a default zonelist order based on the sizes of the various zone types relative +to the total memory of the node and the total memory of the system. The +default zonelist order may be overridden using the numa_zonelist_order kernel +boot parameter or sysctl. [see Documentation/kernel-parameters.txt and +Documentation/sysctl/vm.txt] + +By default, Linux will attempt to satisfy memory allocation requests from the +node to which the CPU that executes the request is assigned. Specifically, +Linux will attempt to allocate from the first node in the appropriate zonelist +for the node where the request originates. This is called "local allocation." 
+If the "local" node cannot satisfy the request, the kernel will examine other +nodes' zones in the selected zonelist looking for the first zone in the list +that can satisfy the request. + +Local allocation will tend to keep subsequent access to the allocated memory +"local" to the underlying physical resources and off the system interconnect-- +as long as the task on whose behalf the kernel allocated some memory does not +later migrate away from that memory. The Linux scheduler is aware of the +NUMA topology of the platform--embodied in the "scheduling domains" data +structures [see Documentation/scheduler/sched-domains.txt]--and the scheduler +attempts to minimize task migration to distant scheduling domains. However, +the scheduler does not take a task's NUMA footprint into account directly. +Thus, under sufficient imbalance, tasks can migrate between nodes, remote +from their initial node and kernel data structures. + +System administrators and application designers can restrict a task's migration +to improve NUMA locality using various CPU affinity command line interfaces, +such as taskset(1) and numactl(1), and program interfaces such as +sched_setaffinity(2). Further, one can modify the kernel's default local +allocation behavior using Linux NUMA memory policy. +[see Documentation/vm/numa_memory_policy.] + +System administrators can restrict the CPUs and nodes' memories that a non- +privileged user can specify in the scheduling or NUMA commands and functions +using control groups and CPUsets. [see Documentation/cgroups/CPUsets.txt] + +On architectures that do not hide memoryless nodes, Linux will include only +zones [nodes] with memory in the zonelists. This means that for a memoryless +node the "local memory node"--the node of the first zone in CPU's node's +zonelist--will not be the node itself. Rather, it will be the node that the +kernel selected as the nearest node with memory when it built the zonelists. +So, default, local allocations will succeed with the kernel supplying the +closest available memory. This is a consequence of the same mechanism that +allows such allocations to fallback to other nearby nodes when a node that +does contain memory overflows. + +Some kernel allocations do not want or cannot tolerate this allocation fallback +behavior. Rather they want to be sure they get memory from the specified node +or get notified that the node has no free memory. This is usually the case when +a subsystem allocates per CPU memory resources, for example. + +A typical model for making such an allocation is to obtain the node id of the +node to which the "current CPU" is attached using one of the kernel's +numa_node_id() or CPU_to_node() functions and then request memory from only +the node id returned. When such an allocation fails, the requesting subsystem +may revert to its own fallback path. The slab kernel memory allocator is an +example of this. Or, the subsystem may choose to disable or not to enable +itself on allocation failure. The kernel profiling subsystem is an example of +this. + +If the architecture supports--does not hide--memoryless nodes, then CPUs +attached to memoryless nodes would always incur the fallback path overhead +or some subsystems would fail to initialize if they attempted to allocated +memory exclusively from a node without memory. To support such +architectures transparently, kernel subsystems can use the numa_mem_id() +or cpu_to_mem() function to locate the "local memory node" for the calling or +specified CPU. 
Again, this is the same node from which default, local page +allocations will be attempted. diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt new file mode 100644 index 00000000..4e7da654 --- /dev/null +++ b/Documentation/vm/numa_memory_policy.txt @@ -0,0 +1,453 @@ + +What is Linux Memory Policy? + +In the Linux kernel, "memory policy" determines from which node the kernel will +allocate memory in a NUMA system or in an emulated NUMA system. Linux has +supported platforms with Non-Uniform Memory Access architectures since 2.4.?. +The current memory policy support was added to Linux 2.6 around May 2004. This +document attempts to describe the concepts and APIs of the 2.6 memory policy +support. + +Memory policies should not be confused with cpusets +(Documentation/cgroups/cpusets.txt) +which is an administrative mechanism for restricting the nodes from which +memory may be allocated by a set of processes. Memory policies are a +programming interface that a NUMA-aware application can take advantage of. When +both cpusets and policies are applied to a task, the restrictions of the cpuset +takes priority. See "MEMORY POLICIES AND CPUSETS" below for more details. + +MEMORY POLICY CONCEPTS + +Scope of Memory Policies + +The Linux kernel supports _scopes_ of memory policy, described here from +most general to most specific: + + System Default Policy: this policy is "hard coded" into the kernel. It + is the policy that governs all page allocations that aren't controlled + by one of the more specific policy scopes discussed below. When the + system is "up and running", the system default policy will use "local + allocation" described below. However, during boot up, the system + default policy will be set to interleave allocations across all nodes + with "sufficient" memory, so as not to overload the initial boot node + with boot-time allocations. + + Task/Process Policy: this is an optional, per-task policy. When defined + for a specific task, this policy controls all page allocations made by or + on behalf of the task that aren't controlled by a more specific scope. + If a task does not define a task policy, then all page allocations that + would have been controlled by the task policy "fall back" to the System + Default Policy. + + The task policy applies to the entire address space of a task. Thus, + it is inheritable, and indeed is inherited, across both fork() + [clone() w/o the CLONE_VM flag] and exec*(). This allows a parent task + to establish the task policy for a child task exec()'d from an + executable image that has no awareness of memory policy. See the + MEMORY POLICY APIS section, below, for an overview of the system call + that a task may use to set/change its task/process policy. + + In a multi-threaded task, task policies apply only to the thread + [Linux kernel task] that installs the policy and any threads + subsequently created by that thread. Any sibling threads existing + at the time a new task policy is installed retain their current + policy. + + A task policy applies only to pages allocated after the policy is + installed. Any pages already faulted in by the task when the task + changes its task policy remain where they were allocated based on + the policy at the time they were allocated. + + VMA Policy: A "VMA" or "Virtual Memory Area" refers to a range of a task's + virtual address space. A task may define a specific policy for a range + of its virtual address space. 
See the MEMORY POLICIES APIS section, + below, for an overview of the mbind() system call used to set a VMA + policy. + + A VMA policy will govern the allocation of pages that back this region of + the address space. Any regions of the task's address space that don't + have an explicit VMA policy will fall back to the task policy, which may + itself fall back to the System Default Policy. + + VMA policies have a few complicating details: + + VMA policy applies ONLY to anonymous pages. These include pages + allocated for anonymous segments, such as the task stack and heap, and + any regions of the address space mmap()ed with the MAP_ANONYMOUS flag. + If a VMA policy is applied to a file mapping, it will be ignored if + the mapping used the MAP_SHARED flag. If the file mapping used the + MAP_PRIVATE flag, the VMA policy will only be applied when an + anonymous page is allocated on an attempt to write to the mapping-- + i.e., at Copy-On-Write. + + VMA policies are shared between all tasks that share a virtual address + space--a.k.a. threads--independent of when the policy is installed; and + they are inherited across fork(). However, because VMA policies refer + to a specific region of a task's address space, and because the address + space is discarded and recreated on exec*(), VMA policies are NOT + inheritable across exec(). Thus, only NUMA-aware applications may + use VMA policies. + + A task may install a new VMA policy on a sub-range of a previously + mmap()ed region. When this happens, Linux splits the existing virtual + memory area into 2 or 3 VMAs, each with it's own policy. + + By default, VMA policy applies only to pages allocated after the policy + is installed. Any pages already faulted into the VMA range remain + where they were allocated based on the policy at the time they were + allocated. However, since 2.6.16, Linux supports page migration via + the mbind() system call, so that page contents can be moved to match + a newly installed policy. + + Shared Policy: Conceptually, shared policies apply to "memory objects" + mapped shared into one or more tasks' distinct address spaces. An + application installs a shared policies the same way as VMA policies--using + the mbind() system call specifying a range of virtual addresses that map + the shared object. However, unlike VMA policies, which can be considered + to be an attribute of a range of a task's address space, shared policies + apply directly to the shared object. Thus, all tasks that attach to the + object share the policy, and all pages allocated for the shared object, + by any task, will obey the shared policy. + + As of 2.6.22, only shared memory segments, created by shmget() or + mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy. When shared + policy support was added to Linux, the associated data structures were + added to hugetlbfs shmem segments. At the time, hugetlbfs did not + support allocation at fault time--a.k.a lazy allocation--so hugetlbfs + shmem segments were never "hooked up" to the shared policy support. + Although hugetlbfs segments now support lazy allocation, their support + for shared policy has not been completed. + + As mentioned above [re: VMA policies], allocations of page cache + pages for regular files mmap()ed with MAP_SHARED ignore any VMA + policy installed on the virtual address range backed by the shared + file mapping. 
Rather, shared page cache pages, including pages backing + private mappings that have not yet been written by the task, follow + task policy, if any, else System Default Policy. + + The shared policy infrastructure supports different policies on subset + ranges of the shared object. However, Linux still splits the VMA of + the task that installs the policy for each range of distinct policy. + Thus, different tasks that attach to a shared memory segment can have + different VMA configurations mapping that one shared object. This + can be seen by examining the /proc//numa_maps of tasks sharing + a shared memory region, when one task has installed shared policy on + one or more ranges of the region. + +Components of Memory Policies + + A Linux memory policy consists of a "mode", optional mode flags, and an + optional set of nodes. The mode determines the behavior of the policy, + the optional mode flags determine the behavior of the mode, and the + optional set of nodes can be viewed as the arguments to the policy + behavior. + + Internally, memory policies are implemented by a reference counted + structure, struct mempolicy. Details of this structure will be discussed + in context, below, as required to explain the behavior. + + Linux memory policy supports the following 4 behavioral modes: + + Default Mode--MPOL_DEFAULT: This mode is only used in the memory + policy APIs. Internally, MPOL_DEFAULT is converted to the NULL + memory policy in all policy scopes. Any existing non-default policy + will simply be removed when MPOL_DEFAULT is specified. As a result, + MPOL_DEFAULT means "fall back to the next most specific policy scope." + + For example, a NULL or default task policy will fall back to the + system default policy. A NULL or default vma policy will fall + back to the task policy. + + When specified in one of the memory policy APIs, the Default mode + does not use the optional set of nodes. + + It is an error for the set of nodes specified for this policy to + be non-empty. + + MPOL_BIND: This mode specifies that memory must come from the + set of nodes specified by the policy. Memory will be allocated from + the node in the set with sufficient free memory that is closest to + the node where the allocation takes place. + + MPOL_PREFERRED: This mode specifies that the allocation should be + attempted from the single node specified in the policy. If that + allocation fails, the kernel will search other nodes, in order of + increasing distance from the preferred node based on information + provided by the platform firmware. + containing the cpu where the allocation takes place. + + Internally, the Preferred policy uses a single node--the + preferred_node member of struct mempolicy. When the internal + mode flag MPOL_F_LOCAL is set, the preferred_node is ignored and + the policy is interpreted as local allocation. "Local" allocation + policy can be viewed as a Preferred policy that starts at the node + containing the cpu where the allocation takes place. + + It is possible for the user to specify that local allocation is + always preferred by passing an empty nodemask with this mode. + If an empty nodemask is passed, the policy cannot use the + MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags described + below. + + MPOL_INTERLEAVED: This mode specifies that page allocations be + interleaved, on a page granularity, across the nodes specified in + the policy. 
This mode also behaves slightly differently, based on + the context where it is used: + + For allocation of anonymous pages and shared memory pages, + Interleave mode indexes the set of nodes specified by the policy + using the page offset of the faulting address into the segment + [VMA] containing the address modulo the number of nodes specified + by the policy. It then attempts to allocate a page, starting at + the selected node, as if the node had been specified by a Preferred + policy or had been selected by a local allocation. That is, + allocation will follow the per node zonelist. + + For allocation of page cache pages, Interleave mode indexes the set + of nodes specified by the policy using a node counter maintained + per task. This counter wraps around to the lowest specified node + after it reaches the highest specified node. This will tend to + spread the pages out over the nodes specified by the policy based + on the order in which they are allocated, rather than based on any + page offset into an address range or file. During system boot up, + the temporary interleaved system default policy works in this + mode. + + Linux memory policy supports the following optional mode flags: + + MPOL_F_STATIC_NODES: This flag specifies that the nodemask passed by + the user should not be remapped if the task or VMA's set of allowed + nodes changes after the memory policy has been defined. + + Without this flag, anytime a mempolicy is rebound because of a + change in the set of allowed nodes, the node (Preferred) or + nodemask (Bind, Interleave) is remapped to the new set of + allowed nodes. This may result in nodes being used that were + previously undesired. + + With this flag, if the user-specified nodes overlap with the + nodes allowed by the task's cpuset, then the memory policy is + applied to their intersection. If the two sets of nodes do not + overlap, the Default policy is used. + + For example, consider a task that is attached to a cpuset with + mems 1-3 that sets an Interleave policy over the same set. If + the cpuset's mems change to 3-5, the Interleave will now occur + over nodes 3, 4, and 5. With this flag, however, since only node + 3 is allowed from the user's nodemask, the "interleave" only + occurs over that node. If no nodes from the user's nodemask are + now allowed, the Default behavior is used. + + MPOL_F_STATIC_NODES cannot be combined with the + MPOL_F_RELATIVE_NODES flag. It also cannot be used for + MPOL_PREFERRED policies that were created with an empty nodemask + (local allocation). + + MPOL_F_RELATIVE_NODES: This flag specifies that the nodemask passed + by the user will be mapped relative to the set of the task or VMA's + set of allowed nodes. The kernel stores the user-passed nodemask, + and if the allowed nodes changes, then that original nodemask will + be remapped relative to the new set of allowed nodes. + + Without this flag (and without MPOL_F_STATIC_NODES), anytime a + mempolicy is rebound because of a change in the set of allowed + nodes, the node (Preferred) or nodemask (Bind, Interleave) is + remapped to the new set of allowed nodes. That remap may not + preserve the relative nature of the user's passed nodemask to its + set of allowed nodes upon successive rebinds: a nodemask of + 1,3,5 may be remapped to 7-9 and then to 1-3 if the set of + allowed nodes is restored to its original state. + + With this flag, the remap is done so that the node numbers from + the user's passed nodemask are relative to the set of allowed + nodes. 
In other words, if nodes 0, 2, and 4 are set in the user's + nodemask, the policy will be effected over the first (and in the + Bind or Interleave case, the third and fifth) nodes in the set of + allowed nodes. The nodemask passed by the user represents nodes + relative to task or VMA's set of allowed nodes. + + If the user's nodemask includes nodes that are outside the range + of the new set of allowed nodes (for example, node 5 is set in + the user's nodemask when the set of allowed nodes is only 0-3), + then the remap wraps around to the beginning of the nodemask and, + if not already set, sets the node in the mempolicy nodemask. + + For example, consider a task that is attached to a cpuset with + mems 2-5 that sets an Interleave policy over the same set with + MPOL_F_RELATIVE_NODES. If the cpuset's mems change to 3-7, the + interleave now occurs over nodes 3,5-6. If the cpuset's mems + then change to 0,2-3,5, then the interleave occurs over nodes + 0,3,5. + + Thanks to the consistent remapping, applications preparing + nodemasks to specify memory policies using this flag should + disregard their current, actual cpuset imposed memory placement + and prepare the nodemask as if they were always located on + memory nodes 0 to N-1, where N is the number of memory nodes the + policy is intended to manage. Let the kernel then remap to the + set of memory nodes allowed by the task's cpuset, as that may + change over time. + + MPOL_F_RELATIVE_NODES cannot be combined with the + MPOL_F_STATIC_NODES flag. It also cannot be used for + MPOL_PREFERRED policies that were created with an empty nodemask + (local allocation). + +MEMORY POLICY REFERENCE COUNTING + +To resolve use/free races, struct mempolicy contains an atomic reference +count field. Internal interfaces, mpol_get()/mpol_put() increment and +decrement this reference count, respectively. mpol_put() will only free +the structure back to the mempolicy kmem cache when the reference count +goes to zero. + +When a new memory policy is allocated, its reference count is initialized +to '1', representing the reference held by the task that is installing the +new policy. When a pointer to a memory policy structure is stored in another +structure, another reference is added, as the task's reference will be dropped +on completion of the policy installation. + +During run-time "usage" of the policy, we attempt to minimize atomic operations +on the reference count, as this can lead to cache lines bouncing between cpus +and NUMA nodes. "Usage" here means one of the following: + +1) querying of the policy, either by the task itself [using the get_mempolicy() + API discussed below] or by another task using the /proc//numa_maps + interface. + +2) examination of the policy to determine the policy mode and associated node + or node lists, if any, for page allocation. This is considered a "hot + path". Note that for MPOL_BIND, the "usage" extends across the entire + allocation process, which may sleep during page reclaimation, because the + BIND policy nodemask is used, by reference, to filter ineligible nodes. + +We can avoid taking an extra reference during the usages listed above as +follows: + +1) we never need to get/free the system default policy as this is never + changed nor freed, once the system is up and running. + +2) for querying the policy, we do not need to take an extra reference on the + target task's task policy nor vma policies because we always acquire the + task's mm's mmap_sem for read during the query. 
The set_mempolicy() and + mbind() APIs [see below] always acquire the mmap_sem for write when + installing or replacing task or vma policies. Thus, there is no possibility + of a task or thread freeing a policy while another task or thread is + querying it. + +3) Page allocation usage of task or vma policy occurs in the fault path where + we hold them mmap_sem for read. Again, because replacing the task or vma + policy requires that the mmap_sem be held for write, the policy can't be + freed out from under us while we're using it for page allocation. + +4) Shared policies require special consideration. One task can replace a + shared memory policy while another task, with a distinct mmap_sem, is + querying or allocating a page based on the policy. To resolve this + potential race, the shared policy infrastructure adds an extra reference + to the shared policy during lookup while holding a spin lock on the shared + policy management structure. This requires that we drop this extra + reference when we're finished "using" the policy. We must drop the + extra reference on shared policies in the same query/allocation paths + used for non-shared policies. For this reason, shared policies are marked + as such, and the extra reference is dropped "conditionally"--i.e., only + for shared policies. + + Because of this extra reference counting, and because we must lookup + shared policies in a tree structure under spinlock, shared policies are + more expensive to use in the page allocation path. This is especially + true for shared policies on shared memory regions shared by tasks running + on different NUMA nodes. This extra overhead can be avoided by always + falling back to task or system default policy for shared memory regions, + or by prefaulting the entire shared memory region into memory and locking + it down. However, this might not be appropriate for all applications. + +MEMORY POLICY APIs + +Linux supports 3 system calls for controlling memory policy. These APIS +always affect only the calling task, the calling task's address space, or +some shared object mapped into the calling task's address space. + + Note: the headers that define these APIs and the parameter data types + for user space applications reside in a package that is not part of + the Linux kernel. The kernel system call interfaces, with the 'sys_' + prefix, are defined in ; the mode and flag + definitions are defined in . + +Set [Task] Memory Policy: + + long set_mempolicy(int mode, const unsigned long *nmask, + unsigned long maxnode); + + Set's the calling task's "task/process memory policy" to mode + specified by the 'mode' argument and the set of nodes defined + by 'nmask'. 'nmask' points to a bit mask of node ids containing + at least 'maxnode' ids. Optional mode flags may be passed by + combining the 'mode' argument with the flag (for example: + MPOL_INTERLEAVE | MPOL_F_STATIC_NODES). + + See the set_mempolicy(2) man page for more details + + +Get [Task] Memory Policy or Related Information + + long get_mempolicy(int *mode, + const unsigned long *nmask, unsigned long maxnode, + void *addr, int flags); + + Queries the "task/process memory policy" of the calling task, or + the policy or location of a specified virtual address, depending + on the 'flags' argument. 
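+
+	For instance, a minimal sketch that queries the calling task's
+	policy and the node backing one of its pages--assuming the
+	user-space wrappers and definitions from the (separate) libnuma
+	development package, i.e. <numaif.h>, linked with -lnuma:
+
+		#include <stdio.h>
+		#include <numaif.h>
+
+		int main(void)
+		{
+			int mode;
+			int probe = 0;	/* any faulted-in address will do */
+
+			/* task/process policy of the calling task */
+			if (get_mempolicy(&mode, NULL, 0, NULL, 0) == 0)
+				printf("task policy mode: %d\n", mode);
+
+			/* node currently backing the page holding 'probe' */
+			if (get_mempolicy(&mode, NULL, 0, &probe,
+					  MPOL_F_NODE | MPOL_F_ADDR) == 0)
+				printf("page at %p is on node %d\n",
+				       (void *)&probe, mode);
+
+			return 0;
+		}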
+ + See the get_mempolicy(2) man page for more details + + +Install VMA/Shared Policy for a Range of Task's Address Space + + long mbind(void *start, unsigned long len, int mode, + const unsigned long *nmask, unsigned long maxnode, + unsigned flags); + + mbind() installs the policy specified by (mode, nmask, maxnodes) as + a VMA policy for the range of the calling task's address space + specified by the 'start' and 'len' arguments. Additional actions + may be requested via the 'flags' argument. + + See the mbind(2) man page for more details. + +MEMORY POLICY COMMAND LINE INTERFACE + +Although not strictly part of the Linux implementation of memory policy, +a command line tool, numactl(8), exists that allows one to: + ++ set the task policy for a specified program via set_mempolicy(2), fork(2) and + exec(2) + ++ set the shared policy for a shared memory segment via mbind(2) + +The numactl(8) tool is packaged with the run-time version of the library +containing the memory policy system call wrappers. Some distributions +package the headers and compile-time libraries in a separate development +package. + + +MEMORY POLICIES AND CPUSETS + +Memory policies work within cpusets as described above. For memory policies +that require a node or set of nodes, the nodes are restricted to the set of +nodes whose memories are allowed by the cpuset constraints. If the nodemask +specified for the policy contains nodes that are not allowed by the cpuset and +MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes +specified for the policy and the set of nodes with memory is used. If the +result is the empty set, the policy is considered invalid and cannot be +installed. If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped +onto and folded into the task's set of allowed nodes as previously described. + +The interaction of memory policies and cpusets can be problematic when tasks +in two cpusets share access to a memory region, such as shared memory segments +created by shmget() of mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and +any of the tasks install shared policy on the region, only nodes whose +memories are allowed in both cpusets may be used in the policies. Obtaining +this information requires "stepping outside" the memory policy APIs to use the +cpuset information and requires that one know in what cpusets other task might +be attaching to the shared region. Furthermore, if the cpusets' allowed +memory sets are disjoint, "local" allocation is the only valid policy. diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting new file mode 100644 index 00000000..706d7ed9 --- /dev/null +++ b/Documentation/vm/overcommit-accounting @@ -0,0 +1,73 @@ +The Linux kernel supports the following overcommit handling modes + +0 - Heuristic overcommit handling. Obvious overcommits of + address space are refused. Used for a typical system. It + ensures a seriously wild allocation fails while allowing + overcommit to reduce swap usage. root is allowed to + allocate slightly more memory in this mode. This is the + default. + +1 - Always overcommit. Appropriate for some scientific + applications. + +2 - Don't overcommit. The total address space commit + for the system is not permitted to exceed swap + a + configurable percentage (default is 50) of physical RAM. + Depending on the percentage you use, in most situations + this means a process will not be killed while accessing + pages but will receive errors on memory allocation as + appropriate. 
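+
+As a worked example of mode 2: on a machine with 8GB of RAM, 2GB of
+swap and the default overcommit percentage of 50, the total commit is
+limited to
+
+	CommitLimit = Swap + RAM * percentage / 100
+	            = 2GB  + 8GB * 50 / 100
+	            = 6GB
+
+(the numbers here are only illustrative; see CommitLimit in
+/proc/meminfo below).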
+ +The overcommit policy is set via the sysctl `vm.overcommit_memory'. + +The overcommit percentage is set via `vm.overcommit_ratio'. + +The current overcommit limit and amount committed are viewable in +/proc/meminfo as CommitLimit and Committed_AS respectively. + +Gotchas +------- + +The C language stack growth does an implicit mremap. If you want absolute +guarantees and run close to the edge you MUST mmap your stack for the +largest size you think you will need. For typical stack usage this does +not matter much but it's a corner case if you really really care + +In mode 2 the MAP_NORESERVE flag is ignored. + + +How It Works +------------ + +The overcommit is based on the following rules + +For a file backed map + SHARED or READ-only - 0 cost (the file is the map not swap) + PRIVATE WRITABLE - size of mapping per instance + +For an anonymous or /dev/zero map + SHARED - size of mapping + PRIVATE READ-only - 0 cost (but of little use) + PRIVATE WRITABLE - size of mapping per instance + +Additional accounting + Pages made writable copies by mmap + shmfs memory drawn from the same pool + +Status +------ + +o We account mmap memory mappings +o We account mprotect changes in commit +o We account mremap changes in size +o We account brk +o We account munmap +o We report the commit status in /proc +o Account and check on fork +o Review stack handling/building on exec +o SHMfs accounting +o Implement actual limit enforcement + +To Do +----- +o Account ptrace pages (this is hard) diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c new file mode 100644 index 00000000..7445caa2 --- /dev/null +++ b/Documentation/vm/page-types.c @@ -0,0 +1,1100 @@ +/* + * page-types: Tool for querying page flags + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should find a copy of v2 of the GNU General Public License somewhere on + * your Linux system; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Copyright (C) 2009 Intel corporation + * + * Authors: Wu Fengguang + */ + +#define _LARGEFILE64_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../include/linux/magic.h" + + +#ifndef MAX_PATH +# define MAX_PATH 256 +#endif + +#ifndef STR +# define _STR(x) #x +# define STR(x) _STR(x) +#endif + +/* + * pagemap kernel ABI bits + */ + +#define PM_ENTRY_BYTES sizeof(uint64_t) +#define PM_STATUS_BITS 3 +#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) +#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) +#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) +#define PM_PSHIFT_BITS 6 +#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) +#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) +#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) +#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) + +#define PM_PRESENT PM_STATUS(4LL) +#define PM_SWAP PM_STATUS(2LL) + + +/* + * kernel page flags + */ + +#define KPF_BYTES 8 +#define PROC_KPAGEFLAGS "/proc/kpageflags" + +/* copied from kpageflags_read() */ +#define KPF_LOCKED 0 +#define KPF_ERROR 1 +#define KPF_REFERENCED 2 +#define KPF_UPTODATE 3 +#define KPF_DIRTY 4 +#define KPF_LRU 5 +#define KPF_ACTIVE 6 +#define KPF_SLAB 7 +#define KPF_WRITEBACK 8 +#define KPF_RECLAIM 9 +#define KPF_BUDDY 10 + +/* [11-20] new additions in 2.6.31 */ +#define KPF_MMAP 11 +#define KPF_ANON 12 +#define KPF_SWAPCACHE 13 +#define KPF_SWAPBACKED 14 +#define KPF_COMPOUND_HEAD 15 +#define KPF_COMPOUND_TAIL 16 +#define KPF_HUGE 17 +#define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 +#define KPF_NOPAGE 20 +#define KPF_KSM 21 + +/* [32-] kernel hacking assistances */ +#define KPF_RESERVED 32 +#define KPF_MLOCKED 33 +#define KPF_MAPPEDTODISK 34 +#define KPF_PRIVATE 35 +#define KPF_PRIVATE_2 36 +#define KPF_OWNER_PRIVATE 37 +#define KPF_ARCH 38 +#define KPF_UNCACHED 39 + +/* [48-] take some arbitrary free slots for expanding overloaded flags + * not part of kernel API + */ +#define KPF_READAHEAD 48 +#define KPF_SLOB_FREE 49 +#define KPF_SLUB_FROZEN 50 +#define KPF_SLUB_DEBUG 51 + +#define KPF_ALL_BITS ((uint64_t)~0ULL) +#define KPF_HACKERS_BITS (0xffffULL << 32) +#define KPF_OVERLOADED_BITS (0xffffULL << 48) +#define BIT(name) (1ULL << KPF_##name) +#define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL)) + +static const char *page_flag_names[] = { + [KPF_LOCKED] = "L:locked", + [KPF_ERROR] = "E:error", + [KPF_REFERENCED] = "R:referenced", + [KPF_UPTODATE] = "U:uptodate", + [KPF_DIRTY] = "D:dirty", + [KPF_LRU] = "l:lru", + [KPF_ACTIVE] = "A:active", + [KPF_SLAB] = "S:slab", + [KPF_WRITEBACK] = "W:writeback", + [KPF_RECLAIM] = "I:reclaim", + [KPF_BUDDY] = "B:buddy", + + [KPF_MMAP] = "M:mmap", + [KPF_ANON] = "a:anonymous", + [KPF_SWAPCACHE] = "s:swapcache", + [KPF_SWAPBACKED] = "b:swapbacked", + [KPF_COMPOUND_HEAD] = "H:compound_head", + [KPF_COMPOUND_TAIL] = "T:compound_tail", + [KPF_HUGE] = "G:huge", + [KPF_UNEVICTABLE] = "u:unevictable", + [KPF_HWPOISON] = "X:hwpoison", + [KPF_NOPAGE] = "n:nopage", + [KPF_KSM] = "x:ksm", + + [KPF_RESERVED] = "r:reserved", + [KPF_MLOCKED] = "m:mlocked", + [KPF_MAPPEDTODISK] = "d:mappedtodisk", + [KPF_PRIVATE] = "P:private", + [KPF_PRIVATE_2] = "p:private_2", + [KPF_OWNER_PRIVATE] = "O:owner_private", + [KPF_ARCH] = "h:arch", + [KPF_UNCACHED] = "c:uncached", + + [KPF_READAHEAD] = 
"I:readahead", + [KPF_SLOB_FREE] = "P:slob_free", + [KPF_SLUB_FROZEN] = "A:slub_frozen", + [KPF_SLUB_DEBUG] = "E:slub_debug", +}; + + +static const char *debugfs_known_mountpoints[] = { + "/sys/kernel/debug", + "/debug", + 0, +}; + +/* + * data structures + */ + +static int opt_raw; /* for kernel developers */ +static int opt_list; /* list pages (in ranges) */ +static int opt_no_summary; /* don't show summary */ +static pid_t opt_pid; /* process to walk */ + +#define MAX_ADDR_RANGES 1024 +static int nr_addr_ranges; +static unsigned long opt_offset[MAX_ADDR_RANGES]; +static unsigned long opt_size[MAX_ADDR_RANGES]; + +#define MAX_VMAS 10240 +static int nr_vmas; +static unsigned long pg_start[MAX_VMAS]; +static unsigned long pg_end[MAX_VMAS]; + +#define MAX_BIT_FILTERS 64 +static int nr_bit_filters; +static uint64_t opt_mask[MAX_BIT_FILTERS]; +static uint64_t opt_bits[MAX_BIT_FILTERS]; + +static int page_size; + +static int pagemap_fd; +static int kpageflags_fd; + +static int opt_hwpoison; +static int opt_unpoison; + +static char hwpoison_debug_fs[MAX_PATH+1]; +static int hwpoison_inject_fd; +static int hwpoison_forget_fd; + +#define HASH_SHIFT 13 +#define HASH_SIZE (1 << HASH_SHIFT) +#define HASH_MASK (HASH_SIZE - 1) +#define HASH_KEY(flags) (flags & HASH_MASK) + +static unsigned long total_pages; +static unsigned long nr_pages[HASH_SIZE]; +static uint64_t page_flags[HASH_SIZE]; + + +/* + * helper functions + */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#define min_t(type, x, y) ({ \ + type __min1 = (x); \ + type __min2 = (y); \ + __min1 < __min2 ? __min1 : __min2; }) + +#define max_t(type, x, y) ({ \ + type __max1 = (x); \ + type __max2 = (y); \ + __max1 > __max2 ? __max1 : __max2; }) + +static unsigned long pages2mb(unsigned long pages) +{ + return (pages * page_size) >> 20; +} + +static void fatal(const char *x, ...) +{ + va_list ap; + + va_start(ap, x); + vfprintf(stderr, x, ap); + va_end(ap); + exit(EXIT_FAILURE); +} + +static int checked_open(const char *pathname, int flags) +{ + int fd = open(pathname, flags); + + if (fd < 0) { + perror(pathname); + exit(EXIT_FAILURE); + } + + return fd; +} + +/* + * pagemap/kpageflags routines + */ + +static unsigned long do_u64_read(int fd, char *name, + uint64_t *buf, + unsigned long index, + unsigned long count) +{ + long bytes; + + if (index > ULONG_MAX / 8) + fatal("index overflow: %lu\n", index); + + if (lseek(fd, index * 8, SEEK_SET) < 0) { + perror(name); + exit(EXIT_FAILURE); + } + + bytes = read(fd, buf, count * 8); + if (bytes < 0) { + perror(name); + exit(EXIT_FAILURE); + } + if (bytes % 8) + fatal("partial read: %lu bytes\n", bytes); + + return bytes / 8; +} + +static unsigned long kpageflags_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages); +} + +static unsigned long pagemap_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages); +} + +static unsigned long pagemap_pfn(uint64_t val) +{ + unsigned long pfn; + + if (val & PM_PRESENT) + pfn = PM_PFRAME(val); + else + pfn = 0; + + return pfn; +} + + +/* + * page flag names + */ + +static char *page_flag_name(uint64_t flags) +{ + static char buf[65]; + int present; + int i, j; + + for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { + present = (flags >> i) & 1; + if (!page_flag_names[i]) { + if (present) + fatal("unknown flag bit %d\n", i); + continue; + } + buf[j++] = present ? 
page_flag_names[i][0] : '_'; + } + + return buf; +} + +static char *page_flag_longname(uint64_t flags) +{ + static char buf[1024]; + int i, n; + + for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) { + if (!page_flag_names[i]) + continue; + if ((flags >> i) & 1) + n += snprintf(buf + n, sizeof(buf) - n, "%s,", + page_flag_names[i] + 2); + } + if (n) + n--; + buf[n] = '\0'; + + return buf; +} + + +/* + * page list and summary + */ + +static void show_page_range(unsigned long voffset, + unsigned long offset, uint64_t flags) +{ + static uint64_t flags0; + static unsigned long voff; + static unsigned long index; + static unsigned long count; + + if (flags == flags0 && offset == index + count && + (!opt_pid || voffset == voff + count)) { + count++; + return; + } + + if (count) { + if (opt_pid) + printf("%lx\t", voff); + printf("%lx\t%lx\t%s\n", + index, count, page_flag_name(flags0)); + } + + flags0 = flags; + index = offset; + voff = voffset; + count = 1; +} + +static void show_page(unsigned long voffset, + unsigned long offset, uint64_t flags) +{ + if (opt_pid) + printf("%lx\t", voffset); + printf("%lx\t%s\n", offset, page_flag_name(flags)); +} + +static void show_summary(void) +{ + int i; + + printf(" flags\tpage-count MB" + " symbolic-flags\t\t\tlong-symbolic-flags\n"); + + for (i = 0; i < ARRAY_SIZE(nr_pages); i++) { + if (nr_pages[i]) + printf("0x%016llx\t%10lu %8lu %s\t%s\n", + (unsigned long long)page_flags[i], + nr_pages[i], + pages2mb(nr_pages[i]), + page_flag_name(page_flags[i]), + page_flag_longname(page_flags[i])); + } + + printf(" total\t%10lu %8lu\n", + total_pages, pages2mb(total_pages)); +} + + +/* + * page flag filters + */ + +static int bit_mask_ok(uint64_t flags) +{ + int i; + + for (i = 0; i < nr_bit_filters; i++) { + if (opt_bits[i] == KPF_ALL_BITS) { + if ((flags & opt_mask[i]) == 0) + return 0; + } else { + if ((flags & opt_mask[i]) != opt_bits[i]) + return 0; + } + } + + return 1; +} + +static uint64_t expand_overloaded_flags(uint64_t flags) +{ + /* SLOB/SLUB overload several page flags */ + if (flags & BIT(SLAB)) { + if (flags & BIT(PRIVATE)) + flags ^= BIT(PRIVATE) | BIT(SLOB_FREE); + if (flags & BIT(ACTIVE)) + flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN); + if (flags & BIT(ERROR)) + flags ^= BIT(ERROR) | BIT(SLUB_DEBUG); + } + + /* PG_reclaim is overloaded as PG_readahead in the read path */ + if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM)) + flags ^= BIT(RECLAIM) | BIT(READAHEAD); + + return flags; +} + +static uint64_t well_known_flags(uint64_t flags) +{ + /* hide flags intended only for kernel hacker */ + flags &= ~KPF_HACKERS_BITS; + + /* hide non-hugeTLB compound pages */ + if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE))) + flags &= ~BITS_COMPOUND; + + return flags; +} + +static uint64_t kpageflags_flags(uint64_t flags) +{ + flags = expand_overloaded_flags(flags); + + if (!opt_raw) + flags = well_known_flags(flags); + + return flags; +} + +/* verify that a mountpoint is actually a debugfs instance */ +static int debugfs_valid_mountpoint(const char *debugfs) +{ + struct statfs st_fs; + + if (statfs(debugfs, &st_fs) < 0) + return -ENOENT; + else if (st_fs.f_type != (long) DEBUGFS_MAGIC) + return -ENOENT; + + return 0; +} + +/* find the path to the mounted debugfs */ +static const char *debugfs_find_mountpoint(void) +{ + const char **ptr; + char type[100]; + FILE *fp; + + ptr = debugfs_known_mountpoints; + while (*ptr) { + if (debugfs_valid_mountpoint(*ptr) == 0) { + strcpy(hwpoison_debug_fs, *ptr); + return hwpoison_debug_fs; + } + ptr++; + } + 
+ /* give up and parse /proc/mounts */ + fp = fopen("/proc/mounts", "r"); + if (fp == NULL) + perror("Can't open /proc/mounts for read"); + + while (fscanf(fp, "%*s %" + STR(MAX_PATH) + "s %99s %*s %*d %*d\n", + hwpoison_debug_fs, type) == 2) { + if (strcmp(type, "debugfs") == 0) + break; + } + fclose(fp); + + if (strcmp(type, "debugfs") != 0) + return NULL; + + return hwpoison_debug_fs; +} + +/* mount the debugfs somewhere if it's not mounted */ + +static void debugfs_mount(void) +{ + const char **ptr; + + /* see if it's already mounted */ + if (debugfs_find_mountpoint()) + return; + + ptr = debugfs_known_mountpoints; + while (*ptr) { + if (mount(NULL, *ptr, "debugfs", 0, NULL) == 0) { + /* save the mountpoint */ + strcpy(hwpoison_debug_fs, *ptr); + break; + } + ptr++; + } + + if (*ptr == NULL) { + perror("mount debugfs"); + exit(EXIT_FAILURE); + } +} + +/* + * page actions + */ + +static void prepare_hwpoison_fd(void) +{ + char buf[MAX_PATH + 1]; + + debugfs_mount(); + + if (opt_hwpoison && !hwpoison_inject_fd) { + snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn", + hwpoison_debug_fs); + hwpoison_inject_fd = checked_open(buf, O_WRONLY); + } + + if (opt_unpoison && !hwpoison_forget_fd) { + snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn", + hwpoison_debug_fs); + hwpoison_forget_fd = checked_open(buf, O_WRONLY); + } +} + +static int hwpoison_page(unsigned long offset) +{ + char buf[100]; + int len; + + len = sprintf(buf, "0x%lx\n", offset); + len = write(hwpoison_inject_fd, buf, len); + if (len < 0) { + perror("hwpoison inject"); + return len; + } + return 0; +} + +static int unpoison_page(unsigned long offset) +{ + char buf[100]; + int len; + + len = sprintf(buf, "0x%lx\n", offset); + len = write(hwpoison_forget_fd, buf, len); + if (len < 0) { + perror("hwpoison forget"); + return len; + } + return 0; +} + +/* + * page frame walker + */ + +static int hash_slot(uint64_t flags) +{ + int k = HASH_KEY(flags); + int i; + + /* Explicitly reserve slot 0 for flags 0: the following logic + * cannot distinguish an unoccupied slot from slot (flags==0). 
+ */ + if (flags == 0) + return 0; + + /* search through the remaining (HASH_SIZE-1) slots */ + for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) { + if (!k || k >= ARRAY_SIZE(page_flags)) + k = 1; + if (page_flags[k] == 0) { + page_flags[k] = flags; + return k; + } + if (page_flags[k] == flags) + return k; + } + + fatal("hash table full: bump up HASH_SHIFT?\n"); + exit(EXIT_FAILURE); +} + +static void add_page(unsigned long voffset, + unsigned long offset, uint64_t flags) +{ + flags = kpageflags_flags(flags); + + if (!bit_mask_ok(flags)) + return; + + if (opt_hwpoison) + hwpoison_page(offset); + if (opt_unpoison) + unpoison_page(offset); + + if (opt_list == 1) + show_page_range(voffset, offset, flags); + else if (opt_list == 2) + show_page(voffset, offset, flags); + + nr_pages[hash_slot(flags)]++; + total_pages++; +} + +#define KPAGEFLAGS_BATCH (64 << 10) /* 64k pages */ +static void walk_pfn(unsigned long voffset, + unsigned long index, + unsigned long count) +{ + uint64_t buf[KPAGEFLAGS_BATCH]; + unsigned long batch; + long pages; + unsigned long i; + + while (count) { + batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH); + pages = kpageflags_read(buf, index, batch); + if (pages == 0) + break; + + for (i = 0; i < pages; i++) + add_page(voffset + i, index + i, buf[i]); + + index += pages; + count -= pages; + } +} + +#define PAGEMAP_BATCH (64 << 10) +static void walk_vma(unsigned long index, unsigned long count) +{ + uint64_t buf[PAGEMAP_BATCH]; + unsigned long batch; + unsigned long pages; + unsigned long pfn; + unsigned long i; + + while (count) { + batch = min_t(unsigned long, count, PAGEMAP_BATCH); + pages = pagemap_read(buf, index, batch); + if (pages == 0) + break; + + for (i = 0; i < pages; i++) { + pfn = pagemap_pfn(buf[i]); + if (pfn) + walk_pfn(index + i, pfn, 1); + } + + index += pages; + count -= pages; + } +} + +static void walk_task(unsigned long index, unsigned long count) +{ + const unsigned long end = index + count; + unsigned long start; + int i = 0; + + while (index < end) { + + while (pg_end[i] <= index) + if (++i >= nr_vmas) + return; + if (pg_start[i] >= end) + return; + + start = max_t(unsigned long, pg_start[i], index); + index = min_t(unsigned long, pg_end[i], end); + + assert(start < index); + walk_vma(start, index - start); + } +} + +static void add_addr_range(unsigned long offset, unsigned long size) +{ + if (nr_addr_ranges >= MAX_ADDR_RANGES) + fatal("too many addr ranges\n"); + + opt_offset[nr_addr_ranges] = offset; + opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset); + nr_addr_ranges++; +} + +static void walk_addr_ranges(void) +{ + int i; + + kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY); + + if (!nr_addr_ranges) + add_addr_range(0, ULONG_MAX); + + for (i = 0; i < nr_addr_ranges; i++) + if (!opt_pid) + walk_pfn(0, opt_offset[i], opt_size[i]); + else + walk_task(opt_offset[i], opt_size[i]); + + close(kpageflags_fd); +} + + +/* + * user interface + */ + +static const char *page_flag_type(uint64_t flag) +{ + if (flag & KPF_HACKERS_BITS) + return "(r)"; + if (flag & KPF_OVERLOADED_BITS) + return "(o)"; + return " "; +} + +static void usage(void) +{ + int i, j; + + printf( +"page-types [options]\n" +" -r|--raw Raw mode, for kernel developers\n" +" -d|--describe flags Describe flags\n" +" -a|--addr addr-spec Walk a range of pages\n" +" -b|--bits bits-spec Walk pages with specified bits\n" +" -p|--pid pid Walk process address space\n" +#if 0 /* planned features */ +" -f|--file filename Walk file address space\n" +#endif +" 
-l|--list Show page details in ranges\n" +" -L|--list-each Show page details one by one\n" +" -N|--no-summary Don't show summary info\n" +" -X|--hwpoison hwpoison pages\n" +" -x|--unpoison unpoison pages\n" +" -h|--help Show this usage message\n" +"flags:\n" +" 0x10 bitfield format, e.g.\n" +" anon bit-name, e.g.\n" +" 0x10,anon comma-separated list, e.g.\n" +"addr-spec:\n" +" N one page at offset N (unit: pages)\n" +" N+M pages range from N to N+M-1\n" +" N,M pages range from N to M-1\n" +" N, pages range from N to end\n" +" ,M pages range from 0 to M-1\n" +"bits-spec:\n" +" bit1,bit2 (flags & (bit1|bit2)) != 0\n" +" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" +" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n" +" =bit1,bit2 flags == (bit1|bit2)\n" +"bit-names:\n" + ); + + for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { + if (!page_flag_names[i]) + continue; + printf("%16s%s", page_flag_names[i] + 2, + page_flag_type(1ULL << i)); + if (++j > 3) { + j = 0; + putchar('\n'); + } + } + printf("\n " + "(r) raw mode bits (o) overloaded bits\n"); +} + +static unsigned long long parse_number(const char *str) +{ + unsigned long long n; + + n = strtoll(str, NULL, 0); + + if (n == 0 && str[0] != '0') + fatal("invalid name or number: %s\n", str); + + return n; +} + +static void parse_pid(const char *str) +{ + FILE *file; + char buf[5000]; + + opt_pid = parse_number(str); + + sprintf(buf, "/proc/%d/pagemap", opt_pid); + pagemap_fd = checked_open(buf, O_RDONLY); + + sprintf(buf, "/proc/%d/maps", opt_pid); + file = fopen(buf, "r"); + if (!file) { + perror(buf); + exit(EXIT_FAILURE); + } + + while (fgets(buf, sizeof(buf), file) != NULL) { + unsigned long vm_start; + unsigned long vm_end; + unsigned long long pgoff; + int major, minor; + char r, w, x, s; + unsigned long ino; + int n; + + n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu", + &vm_start, + &vm_end, + &r, &w, &x, &s, + &pgoff, + &major, &minor, + &ino); + if (n < 10) { + fprintf(stderr, "unexpected line: %s\n", buf); + continue; + } + pg_start[nr_vmas] = vm_start / page_size; + pg_end[nr_vmas] = vm_end / page_size; + if (++nr_vmas >= MAX_VMAS) { + fprintf(stderr, "too many VMAs\n"); + break; + } + } + fclose(file); +} + +static void parse_file(const char *name) +{ +} + +static void parse_addr_range(const char *optarg) +{ + unsigned long offset; + unsigned long size; + char *p; + + p = strchr(optarg, ','); + if (!p) + p = strchr(optarg, '+'); + + if (p == optarg) { + offset = 0; + size = parse_number(p + 1); + } else if (p) { + offset = parse_number(optarg); + if (p[1] == '\0') + size = ULONG_MAX; + else { + size = parse_number(p + 1); + if (*p == ',') { + if (size < offset) + fatal("invalid range: %lu,%lu\n", + offset, size); + size -= offset; + } + } + } else { + offset = parse_number(optarg); + size = 1; + } + + add_addr_range(offset, size); +} + +static void add_bits_filter(uint64_t mask, uint64_t bits) +{ + if (nr_bit_filters >= MAX_BIT_FILTERS) + fatal("too much bit filters\n"); + + opt_mask[nr_bit_filters] = mask; + opt_bits[nr_bit_filters] = bits; + nr_bit_filters++; +} + +static uint64_t parse_flag_name(const char *str, int len) +{ + int i; + + if (!*str || !len) + return 0; + + if (len <= 8 && !strncmp(str, "compound", len)) + return BITS_COMPOUND; + + for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) { + if (!page_flag_names[i]) + continue; + if (!strncmp(str, page_flag_names[i] + 2, len)) + return 1ULL << i; + } + + return parse_number(str); +} + +static uint64_t parse_flag_names(const char *str, int all) +{ + const char 
*p = str; + uint64_t flags = 0; + + while (1) { + if (*p == ',' || *p == '=' || *p == '\0') { + if ((*str != '~') || (*str == '~' && all && *++str)) + flags |= parse_flag_name(str, p - str); + if (*p != ',') + break; + str = p + 1; + } + p++; + } + + return flags; +} + +static void parse_bits_mask(const char *optarg) +{ + uint64_t mask; + uint64_t bits; + const char *p; + + p = strchr(optarg, '='); + if (p == optarg) { + mask = KPF_ALL_BITS; + bits = parse_flag_names(p + 1, 0); + } else if (p) { + mask = parse_flag_names(optarg, 0); + bits = parse_flag_names(p + 1, 0); + } else if (strchr(optarg, '~')) { + mask = parse_flag_names(optarg, 1); + bits = parse_flag_names(optarg, 0); + } else { + mask = parse_flag_names(optarg, 0); + bits = KPF_ALL_BITS; + } + + add_bits_filter(mask, bits); +} + +static void describe_flags(const char *optarg) +{ + uint64_t flags = parse_flag_names(optarg, 0); + + printf("0x%016llx\t%s\t%s\n", + (unsigned long long)flags, + page_flag_name(flags), + page_flag_longname(flags)); +} + +static const struct option opts[] = { + { "raw" , 0, NULL, 'r' }, + { "pid" , 1, NULL, 'p' }, + { "file" , 1, NULL, 'f' }, + { "addr" , 1, NULL, 'a' }, + { "bits" , 1, NULL, 'b' }, + { "describe" , 1, NULL, 'd' }, + { "list" , 0, NULL, 'l' }, + { "list-each" , 0, NULL, 'L' }, + { "no-summary", 0, NULL, 'N' }, + { "hwpoison" , 0, NULL, 'X' }, + { "unpoison" , 0, NULL, 'x' }, + { "help" , 0, NULL, 'h' }, + { NULL , 0, NULL, 0 } +}; + +int main(int argc, char *argv[]) +{ + int c; + + page_size = getpagesize(); + + while ((c = getopt_long(argc, argv, + "rp:f:a:b:d:lLNXxh", opts, NULL)) != -1) { + switch (c) { + case 'r': + opt_raw = 1; + break; + case 'p': + parse_pid(optarg); + break; + case 'f': + parse_file(optarg); + break; + case 'a': + parse_addr_range(optarg); + break; + case 'b': + parse_bits_mask(optarg); + break; + case 'd': + describe_flags(optarg); + exit(0); + case 'l': + opt_list = 1; + break; + case 'L': + opt_list = 2; + break; + case 'N': + opt_no_summary = 1; + break; + case 'X': + opt_hwpoison = 1; + prepare_hwpoison_fd(); + break; + case 'x': + opt_unpoison = 1; + prepare_hwpoison_fd(); + break; + case 'h': + usage(); + exit(0); + default: + usage(); + exit(1); + } + } + + if (opt_list && opt_pid) + printf("voffset\t"); + if (opt_list == 1) + printf("offset\tlen\tflags\n"); + if (opt_list == 2) + printf("offset\tflags\n"); + + walk_addr_ranges(); + + if (opt_list == 1) + show_page_range(0, 0, 0); /* drain the buffer */ + + if (opt_no_summary) + return 0; + + if (opt_list) + printf("\n\n"); + + show_summary(); + + return 0; +} diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration new file mode 100644 index 00000000..6513fe2d --- /dev/null +++ b/Documentation/vm/page_migration @@ -0,0 +1,149 @@ +Page migration +-------------- + +Page migration allows the moving of the physical location of pages between +nodes in a numa system while the process is running. This means that the +virtual addresses that the process sees do not change. However, the +system rearranges the physical location of those pages. + +The main intend of page migration is to reduce the latency of memory access +by moving pages near to the processor where the process accessing that memory +is running. + +Page migration allows a process to manually relocate the node on which its +pages are located through the MF_MOVE and MF_MOVE_ALL options while setting +a new memory policy via mbind(). 
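+As a userspace illustration of that interface, the sketch below binds an
+existing mapping to one node and asks the kernel to migrate the pages that
+are already there (a hedged example, not part of the original numactl
+material; the mapping size and node number are arbitrary, and the userspace
+flag names are the MPOL_MF_* constants from numaif.h):
+
+/* build with: gcc migrate-example.c -lnuma */
+#include <numaif.h>		/* mbind(), MPOL_BIND, MPOL_MF_MOVE */
+#include <sys/mman.h>
+#include <stdio.h>
+
+int main(void)
+{
+	size_t len = 16 * 1024 * 1024;
+	unsigned long nodemask = 1UL << 1;	/* target node 1 */
+	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
+			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	if (buf == MAP_FAILED) {
+		perror("mmap");
+		return 1;
+	}
+	buf[0] = 1;	/* fault in a page, possibly on the "wrong" node */
+
+	/* install a bind policy; MPOL_MF_MOVE migrates already-present pages */
+	if (mbind(buf, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
+		  MPOL_MF_MOVE) < 0)
+		perror("mbind");
+	return 0;
+}
+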
The pages of process can also be relocated +from another process using the sys_migrate_pages() function call. The +migrate_pages function call takes two sets of nodes and moves pages of a +process that are located on the from nodes to the destination nodes. +Page migration functions are provided by the numactl package by Andi Kleen +(a version later than 0.9.3 is required. Get it from +ftp://oss.sgi.com/www/projects/libnuma/download/). numactl provides libnuma +which provides an interface similar to other numa functionality for page +migration. cat /proc//numa_maps allows an easy review of where the +pages of a process are located. See also the numa_maps documentation in the +proc(5) man page. + +Manual migration is useful if for example the scheduler has relocated +a process to a processor on a distant node. A batch scheduler or an +administrator may detect the situation and move the pages of the process +nearer to the new processor. The kernel itself does only provide +manual page migration support. Automatic page migration may be implemented +through user space processes that move pages. A special function call +"move_pages" allows the moving of individual pages within a process. +A NUMA profiler may f.e. obtain a log showing frequent off node +accesses and may use the result to move pages to more advantageous +locations. + +Larger installations usually partition the system using cpusets into +sections of nodes. Paul Jackson has equipped cpusets with the ability to +move pages when a task is moved to another cpuset (See +Documentation/cgroups/cpusets.txt). +Cpusets allows the automation of process locality. If a task is moved to +a new cpuset then also all its pages are moved with it so that the +performance of the process does not sink dramatically. Also the pages +of processes in a cpuset are moved if the allowed memory nodes of a +cpuset are changed. + +Page migration allows the preservation of the relative location of pages +within a group of nodes for all migration techniques which will preserve a +particular memory allocation pattern generated even after migrating a +process. This is necessary in order to preserve the memory latencies. +Processes will run with similar performance after migration. + +Page migration occurs in several steps. First a high level +description for those trying to use migrate_pages() from the kernel +(for userspace usage see the Andi Kleen's numactl package mentioned above) +and then a low level description of how the low level details work. + +A. In kernel use of migrate_pages() +----------------------------------- + +1. Remove pages from the LRU. + + Lists of pages to be migrated are generated by scanning over + pages and moving them into lists. This is done by + calling isolate_lru_page(). + Calling isolate_lru_page increases the references to the page + so that it cannot vanish while the page migration occurs. + It also prevents the swapper or other scans to encounter + the page. + +2. We need to have a function of type new_page_t that can be + passed to migrate_pages(). This function should figure out + how to allocate the correct new page given the old page. + +3. The migrate_pages() function is called which attempts + to do the migration. It will call the function to allocate + the new page for each page that is considered for + moving. + +B. How migrate_pages() works +---------------------------- + +migrate_pages() does several passes over its list of pages. A page is moved +if all references to a page are removable at the time. 
The page has +already been removed from the LRU via isolate_lru_page() and the refcount +is increased so that the page cannot be freed while page migration occurs. + +Steps: + +1. Lock the page to be migrated + +2. Insure that writeback is complete. + +3. Prep the new page that we want to move to. It is locked + and set to not being uptodate so that all accesses to the new + page immediately lock while the move is in progress. + +4. The new page is prepped with some settings from the old page so that + accesses to the new page will discover a page with the correct settings. + +5. All the page table references to the page are converted + to migration entries or dropped (nonlinear vmas). + This decrease the mapcount of a page. If the resulting + mapcount is not zero then we do not migrate the page. + All user space processes that attempt to access the page + will now wait on the page lock. + +6. The radix tree lock is taken. This will cause all processes trying + to access the page via the mapping to block on the radix tree spinlock. + +7. The refcount of the page is examined and we back out if references remain + otherwise we know that we are the only one referencing this page. + +8. The radix tree is checked and if it does not contain the pointer to this + page then we back out because someone else modified the radix tree. + +9. The radix tree is changed to point to the new page. + +10. The reference count of the old page is dropped because the radix tree + reference is gone. A reference to the new page is established because + the new page is referenced to by the radix tree. + +11. The radix tree lock is dropped. With that lookups in the mapping + become possible again. Processes will move from spinning on the tree_lock + to sleeping on the locked new page. + +12. The page contents are copied to the new page. + +13. The remaining page flags are copied to the new page. + +14. The old page flags are cleared to indicate that the page does + not provide any information anymore. + +15. Queued up writeback on the new page is triggered. + +16. If migration entries were page then replace them with real ptes. Doing + so will enable access for user space processes not already waiting for + the page lock. + +19. The page locks are dropped from the old and new page. + Processes waiting on the page lock will redo their page faults + and will reach the new page. + +20. The new page is moved to the LRU and can be scanned by the swapper + etc again. + +Christoph Lameter, May 8, 2006. + diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt new file mode 100644 index 00000000..df09b965 --- /dev/null +++ b/Documentation/vm/pagemap.txt @@ -0,0 +1,147 @@ +pagemap, from the userspace perspective +--------------------------------------- + +pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow +userspace programs to examine the page tables and related information by +reading files in /proc. + +There are three components to pagemap: + + * /proc/pid/pagemap. This file lets a userspace process find out which + physical frame each virtual page is mapped to. It contains one 64-bit + value for each virtual page, containing the following data (from + fs/proc/task_mmu.c, above pagemap_read): + + * Bits 0-54 page frame number (PFN) if present + * Bits 0-4 swap type if swapped + * Bits 5-54 swap offset if swapped + * Bits 55-60 page shift (page size = 1<= on-disk one) + 4. DIRTY page has been written to, hence contains new data + ie. 
for file backed page: (in-memory data revision > on-disk one) + 8. WRITEBACK page is being synced to disk + + [LRU related page flags] + 5. LRU page is in one of the LRU lists + 6. ACTIVE page is in the active LRU list +18. UNEVICTABLE page is in the unevictable (non-)LRU list + It is somehow pinned and not a candidate for LRU page reclaims, + eg. ramfs pages, shmctl(SHM_LOCK) and mlock() memory segments + 2. REFERENCED page has been referenced since last LRU list enqueue/requeue + 9. RECLAIM page will be reclaimed soon after its pageout IO completed +11. MMAP a memory mapped page +12. ANON a memory mapped page that is not part of a file +13. SWAPCACHE page is mapped to swap space, ie. has an associated swap entry +14. SWAPBACKED page is backed by swap/RAM + +The page-types tool in this directory can be used to query the above flags. + +Using pagemap to do something useful: + +The general procedure for using pagemap to find out about a process' memory +usage goes like this: + + 1. Read /proc/pid/maps to determine which parts of the memory space are + mapped to what. + 2. Select the maps you are interested in -- all of them, or a particular + library, or the stack or the heap, etc. + 3. Open /proc/pid/pagemap and seek to the pages you would like to examine. + 4. Read a u64 for each page from pagemap. + 5. Open /proc/kpagecount and/or /proc/kpageflags. For each PFN you just + read, seek to that entry in the file, and read the data you want. + +For example, to find the "unique set size" (USS), which is the amount of +memory that a process is using that is not shared with any other process, +you can go through every map in the process, find the PFNs, look those up +in kpagecount, and tally up the number of pages that are only referenced +once. + +Other notes: + +Reading from any of the files will return -EINVAL if you are not starting +the read on an 8-byte boundary (e.g., if you seeked an odd number of bytes +into the file), or if the size of the read is not a multiple of 8 bytes. diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt new file mode 100644 index 00000000..07375e73 --- /dev/null +++ b/Documentation/vm/slub.txt @@ -0,0 +1,280 @@ +Short users guide for SLUB +-------------------------- + +The basic philosophy of SLUB is very different from SLAB. SLAB +requires rebuilding the kernel to activate debug options for all +slab caches. SLUB always includes full debugging but it is off by default. +SLUB can enable debugging only for selected slabs in order to avoid +an impact on overall system performance which may make a bug more +difficult to find. + +In order to switch debugging on one can add a option "slub_debug" +to the kernel command line. That will enable full debugging for +all slabs. + +Typically one would then use the "slabinfo" command to get statistical +data and perform operation on the slabs. By default slabinfo only lists +slabs that have data in them. See "slabinfo -h" for more options when +running the command. slabinfo can be compiled with + +gcc -o slabinfo Documentation/vm/slabinfo.c + +Some of the modes of operation of slabinfo require that slub debugging +be enabled on the command line. F.e. no tracking information will be +available without debugging on and validation can only partially +be performed if debugging was not switched on. + +Some more sophisticated uses of slub_debug: +------------------------------------------- + +Parameters may be given to slub_debug. If none is specified then full +debugging is enabled. 
Format: + +slub_debug= Enable options for all slabs +slub_debug=, + Enable options only for select slabs + +Possible debug options are + F Sanity checks on (enables SLAB_DEBUG_FREE. Sorry + SLAB legacy issues) + Z Red zoning + P Poisoning (object and padding) + U User tracking (free and alloc) + T Trace (please only use on single slabs) + A Toggle failslab filter mark for the cache + O Switch debugging off for caches that would have + caused higher minimum slab orders + - Switch all debugging off (useful if the kernel is + configured with CONFIG_SLUB_DEBUG_ON) + +F.e. in order to boot just with sanity checks and red zoning one would specify: + + slub_debug=FZ + +Trying to find an issue in the dentry cache? Try + + slub_debug=,dentry + +to only enable debugging on the dentry cache. + +Red zoning and tracking may realign the slab. We can just apply sanity checks +to the dentry cache with + + slub_debug=F,dentry + +Debugging options may require the minimum possible slab order to increase as +a result of storing the metadata (for example, caches with PAGE_SIZE object +sizes). This has a higher liklihood of resulting in slab allocation errors +in low memory situations or if there's high fragmentation of memory. To +switch off debugging for such caches by default, use + + slub_debug=O + +In case you forgot to enable debugging on the kernel command line: It is +possible to enable debugging manually when the kernel is up. Look at the +contents of: + +/sys/kernel/slab// + +Look at the writable files. Writing 1 to them will enable the +corresponding debug option. All options can be set on a slab that does +not contain objects. If the slab already contains objects then sanity checks +and tracing may only be enabled. The other options may cause the realignment +of objects. + +Careful with tracing: It may spew out lots of information and never stop if +used on the wrong slab. + +Slab merging +------------ + +If no debug options are specified then SLUB may merge similar slabs together +in order to reduce overhead and increase cache hotness of objects. +slabinfo -a displays which slabs were merged together. + +Slab validation +--------------- + +SLUB can validate all object if the kernel was booted with slub_debug. In +order to do so you must have the slabinfo tool. Then you can do + +slabinfo -v + +which will test all objects. Output will be generated to the syslog. + +This also works in a more limited way if boot was without slab debug. +In that case slabinfo -v simply tests all reachable objects. Usually +these are in the cpu slabs and the partial slabs. Full slabs are not +tracked by SLUB in a non debug situation. + +Getting more performance +------------------------ + +To some degree SLUB's performance is limited by the need to take the +list_lock once in a while to deal with partial slabs. That overhead is +governed by the order of the allocation for each slab. The allocations +can be influenced by kernel parameters: + +slub_min_objects=x (default 4) +slub_min_order=x (default 0) +slub_max_order=x (default 1) + +slub_min_objects allows to specify how many objects must at least fit +into one slab in order for the allocation order to be acceptable. +In general slub will be able to perform this number of allocations +on a slab without consulting centralized resources (list_lock) where +contention may occur. + +slub_min_order specifies a minim order of slabs. A similar effect like +slub_min_objects. + +slub_max_order specified the order at which slub_min_objects should no +longer be checked. 
This is useful to avoid SLUB trying to generate +super large order pages to fit slub_min_objects of a slab cache with +large object sizes into one high order page. + +SLUB Debug output +----------------- + +Here is a sample of slub debug output: + +==================================================================== +BUG kmalloc-8: Redzone overwritten +-------------------------------------------------------------------- + +INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc +INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58 +INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58 +INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554 + +Bytes b4 0xc90f6d10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ + Object 0xc90f6d20: 31 30 31 39 2e 30 30 35 1019.005 + Redzone 0xc90f6d28: 00 cc cc cc . + Padding 0xc90f6d50: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ + + [] dump_trace+0x63/0x1eb + [] show_trace_log_lvl+0x1a/0x2f + [] show_trace+0x12/0x14 + [] dump_stack+0x16/0x18 + [] object_err+0x143/0x14b + [] check_object+0x66/0x234 + [] __slab_free+0x239/0x384 + [] kfree+0xa6/0xc6 + [] get_modalias+0xb9/0xf5 + [] dmi_dev_uevent+0x27/0x3c + [] dev_uevent+0x1ad/0x1da + [] kobject_uevent_env+0x20a/0x45b + [] kobject_uevent+0xa/0xf + [] store_uevent+0x4f/0x58 + [] dev_attr_store+0x29/0x2f + [] sysfs_write_file+0x16e/0x19c + [] vfs_write+0xd1/0x15a + [] sys_write+0x3d/0x72 + [] sysenter_past_esp+0x5f/0x99 + [] 0xb7f7b410 + ======================= + +FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc + +If SLUB encounters a corrupted object (full detection requires the kernel +to be booted with slub_debug) then the following output will be dumped +into the syslog: + +1. Description of the problem encountered + +This will be a message in the system log starting with + +=============================================== +BUG : +----------------------------------------------- + +INFO: - +INFO: Slab
+INFO: Object <address> <object size>
+INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by cpu> pid=<pid of the process>
+INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu>
+ pid=<pid of the process>
+
+(Object allocation / free information is only available if SLAB_STORE_USER is
+set for the slab. slub_debug sets that option)
+
+2. The object contents if an object was involved.
+
+Various types of lines can follow the BUG SLUB line:
+
+Bytes b4 <address> : <bytes>
+ Shows a few bytes before the object where the problem was detected.
+ Can be useful if the corruption does not stop with the start of the
+ object.
+
+Object <address> : <bytes>
+ The bytes of the object. If the object is inactive then the bytes
+ typically contain poison values. Any non-poison value shows a
+ corruption by a write after free.
+
+Redzone <address> : <bytes>
+ The Redzone following the object. The Redzone is used to detect
+ writes after the object. All bytes should always have the same
+ value. If there is any deviation then it is due to a write after
+ the object boundary.
+
+ (Redzone information is only available if SLAB_RED_ZONE is set.
+ slub_debug sets that option)
+
+Padding <address>
: + Unused data to fill up the space in order to get the next object + properly aligned. In the debug case we make sure that there are + at least 4 bytes of padding. This allows the detection of writes + before the object. + +3. A stackdump + +The stackdump describes the location where the error was detected. The cause +of the corruption is may be more likely found by looking at the function that +allocated or freed the object. + +4. Report on how the problem was dealt with in order to ensure the continued +operation of the system. + +These are messages in the system log beginning with + +FIX : + +In the above sample SLUB found that the Redzone of an active object has +been overwritten. Here a string of 8 characters was written into a slab that +has the length of 8 characters. However, a 8 character string needs a +terminating 0. That zero has overwritten the first byte of the Redzone field. +After reporting the details of the issue encountered the FIX SLUB message +tells us that SLUB has restored the Redzone to its proper value and then +system operations continue. + +Emergency operations: +--------------------- + +Minimal debugging (sanity checks alone) can be enabled by booting with + + slub_debug=F + +This will be generally be enough to enable the resiliency features of slub +which will keep the system running even if a bad kernel component will +keep corrupting objects. This may be important for production systems. +Performance will be impacted by the sanity checks and there will be a +continual stream of error messages to the syslog but no additional memory +will be used (unlike full debugging). + +No guarantees. The kernel component still needs to be fixed. Performance +may be optimized further by locating the slab that experiences corruption +and enabling debugging only for that cache + +I.e. + + slub_debug=F,dentry + +If the corruption occurs by writing after the end of the object then it +may be advisable to enable a Redzone to avoid corrupting the beginning +of other objects. + + slub_debug=FZ,dentry + +Christoph Lameter, May 30, 2007 diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt new file mode 100644 index 00000000..0924aaca --- /dev/null +++ b/Documentation/vm/transhuge.txt @@ -0,0 +1,298 @@ += Transparent Hugepage Support = + +== Objective == + +Performance critical computing applications dealing with large memory +working sets are already running on top of libhugetlbfs and in turn +hugetlbfs. Transparent Hugepage Support is an alternative means of +using huge pages for the backing of virtual memory with huge pages +that supports the automatic promotion and demotion of page sizes and +without the shortcomings of hugetlbfs. + +Currently it only works for anonymous memory mappings but in the +future it can expand over the pagecache layer starting with tmpfs. + +The reason applications are running faster is because of two +factors. The first factor is almost completely irrelevant and it's not +of significant interest because it'll also have the downside of +requiring larger clear-page copy-page in page faults which is a +potentially negative effect. The first factor consists in taking a +single page fault for each 2M virtual region touched by userland (so +reducing the enter/exit kernel frequency by a 512 times factor). This +only matters the first time the memory is accessed for the lifetime of +a memory mapping. 
The second long lasting and much more important +factor will affect all subsequent accesses to the memory for the whole +runtime of the application. The second factor consist of two +components: 1) the TLB miss will run faster (especially with +virtualization using nested pagetables but almost always also on bare +metal without virtualization) and 2) a single TLB entry will be +mapping a much larger amount of virtual memory in turn reducing the +number of TLB misses. With virtualization and nested pagetables the +TLB can be mapped of larger size only if both KVM and the Linux guest +are using hugepages but a significant speedup already happens if only +one of the two is using hugepages just because of the fact the TLB +miss is going to run faster. + +== Design == + +- "graceful fallback": mm components which don't have transparent + hugepage knowledge fall back to breaking a transparent hugepage and + working on the regular pages and their respective regular pmd/pte + mappings + +- if a hugepage allocation fails because of memory fragmentation, + regular pages should be gracefully allocated instead and mixed in + the same vma without any failure or significant delay and without + userland noticing + +- if some task quits and more hugepages become available (either + immediately in the buddy or through the VM), guest physical memory + backed by regular pages should be relocated on hugepages + automatically (with khugepaged) + +- it doesn't require memory reservation and in turn it uses hugepages + whenever possible (the only possible reservation here is kernelcore= + to avoid unmovable pages to fragment all the memory but such a tweak + is not specific to transparent hugepage support and it's a generic + feature that applies to all dynamic high order allocations in the + kernel) + +- this initial support only offers the feature in the anonymous memory + regions but it'd be ideal to move it to tmpfs and the pagecache + later + +Transparent Hugepage Support maximizes the usefulness of free memory +if compared to the reservation approach of hugetlbfs by allowing all +unused memory to be used as cache or other movable (or even unmovable +entities). It doesn't require reservation to prevent hugepage +allocation failures to be noticeable from userland. It allows paging +and all other advanced VM features to be available on the +hugepages. It requires no modifications for applications to take +advantage of it. + +Applications however can be further optimized to take advantage of +this feature, like for example they've been optimized before to avoid +a flood of mmap system calls for every malloc(4k). Optimizing userland +is by far not mandatory and khugepaged already can take care of long +lived page allocations even for hugepage unaware applications that +deals with large amounts of memory. + +In certain cases when hugepages are enabled system wide, application +may end up allocating more memory resources. An application may mmap a +large region but only touch 1 byte of it, in that case a 2M page might +be allocated instead of a 4k page for no good. This is why it's +possible to disable hugepages system-wide and to only have them inside +MADV_HUGEPAGE madvise regions. + +Embedded systems should enable hugepages only inside madvise regions +to eliminate any risk of wasting any precious byte of memory and to +only run faster. 
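+
+As a concrete sketch of that opt-in (illustrative only; the 2M hugepage size
+and the region size are assumptions, and MADV_HUGEPAGE is only honoured on
+kernels built with transparent hugepage support):
+
+#include <sys/mman.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumed hugepage size */
+
+int main(void)
+{
+	void *buf;
+	size_t len = 64 * HPAGE_SIZE;
+	int err = posix_memalign(&buf, HPAGE_SIZE, len);	/* 2M aligned */
+
+	if (err) {
+		fprintf(stderr, "posix_memalign: %d\n", err);
+		return 1;
+	}
+	/* ask the fault path/khugepaged to back this region with hugepages */
+	if (madvise(buf, len, MADV_HUGEPAGE))
+		perror("madvise(MADV_HUGEPAGE)");
+
+	/* ... use buf ... */
+	free(buf);
+	return 0;
+}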
+ +Applications that gets a lot of benefit from hugepages and that don't +risk to lose memory by using hugepages, should use +madvise(MADV_HUGEPAGE) on their critical mmapped regions. + +== sysfs == + +Transparent Hugepage Support can be entirely disabled (mostly for +debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to +avoid the risk of consuming more memory resources) or enabled system +wide. This can be achieved with one of: + +echo always >/sys/kernel/mm/transparent_hugepage/enabled +echo madvise >/sys/kernel/mm/transparent_hugepage/enabled +echo never >/sys/kernel/mm/transparent_hugepage/enabled + +It's also possible to limit defrag efforts in the VM to generate +hugepages in case they're not immediately free to madvise regions or +to never try to defrag memory and simply fallback to regular pages +unless hugepages are immediately available. Clearly if we spend CPU +time to defrag memory, we would expect to gain even more by the fact +we use hugepages later instead of regular pages. This isn't always +guaranteed, but it may be more likely in case the allocation is for a +MADV_HUGEPAGE region. + +echo always >/sys/kernel/mm/transparent_hugepage/defrag +echo madvise >/sys/kernel/mm/transparent_hugepage/defrag +echo never >/sys/kernel/mm/transparent_hugepage/defrag + +khugepaged will be automatically started when +transparent_hugepage/enabled is set to "always" or "madvise, and it'll +be automatically shutdown if it's set to "never". + +khugepaged runs usually at low frequency so while one may not want to +invoke defrag algorithms synchronously during the page faults, it +should be worth invoking defrag at least in khugepaged. However it's +also possible to disable defrag in khugepaged: + +echo yes >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag +echo no >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag + +You can also control how many pages khugepaged should scan at each +pass: + +/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan + +and how many milliseconds to wait in khugepaged between each pass (you +can set this to 0 to run khugepaged at 100% utilization of one core): + +/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs + +and how many milliseconds to wait in khugepaged if there's an hugepage +allocation failure to throttle the next allocation attempt. + +/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs + +The khugepaged progress can be seen in the number of pages collapsed: + +/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed + +for each pass: + +/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans + +== Boot parameter == + +You can change the sysfs boot time defaults of Transparent Hugepage +Support by passing the parameter "transparent_hugepage=always" or +"transparent_hugepage=madvise" or "transparent_hugepage=never" +(without "") to the kernel command line. + +== Need of application restart == + +The transparent_hugepage/enabled values only affect future +behavior. So to make them effective you need to restart any +application that could have been using hugepages. This also applies to +the regions registered in khugepaged. + +== get_user_pages and follow_page == + +get_user_pages and follow_page if run on a hugepage, will return the +head or tail pages as usual (exactly as they would do on +hugetlbfs). 
Most gup users will only care about the actual physical +address of the page and its temporary pinning to release after the I/O +is complete, so they won't ever notice the fact the page is huge. But +if any driver is going to mangle over the page structure of the tail +page (like for checking page->mapping or other bits that are relevant +for the head page and not the tail page), it should be updated to jump +to check head page instead (while serializing properly against +split_huge_page() to avoid the head and tail pages to disappear from +under it, see the futex code to see an example of that, hugetlbfs also +needed special handling in futex code for similar reasons). + +NOTE: these aren't new constraints to the GUP API, and they match the +same constrains that applies to hugetlbfs too, so any driver capable +of handling GUP on hugetlbfs will also work fine on transparent +hugepage backed mappings. + +In case you can't handle compound pages if they're returned by +follow_page, the FOLL_SPLIT bit can be specified as parameter to +follow_page, so that it will split the hugepages before returning +them. Migration for example passes FOLL_SPLIT as parameter to +follow_page because it's not hugepage aware and in fact it can't work +at all on hugetlbfs (but it instead works fine on transparent +hugepages thanks to FOLL_SPLIT). migration simply can't deal with +hugepages being returned (as it's not only checking the pfn of the +page and pinning it during the copy but it pretends to migrate the +memory in regular page sizes and with regular pte/pmd mappings). + +== Optimizing the applications == + +To be guaranteed that the kernel will map a 2M page immediately in any +memory region, the mmap region has to be hugepage naturally +aligned. posix_memalign() can provide that guarantee. + +== Hugetlbfs == + +You can use hugetlbfs on a kernel that has transparent hugepage +support enabled just fine as always. No difference can be noted in +hugetlbfs other than there will be less overall fragmentation. All +usual features belonging to hugetlbfs are preserved and +unaffected. libhugetlbfs will also work fine as usual. + +== Graceful fallback == + +Code walking pagetables but unware about huge pmds can simply call +split_huge_page_pmd(mm, pmd) where the pmd is the one returned by +pmd_offset. It's trivial to make the code transparent hugepage aware +by just grepping for "pmd_offset" and adding split_huge_page_pmd where +missing after pmd_offset returns the pmd. Thanks to the graceful +fallback design, with a one liner change, you can avoid to write +hundred if not thousand of lines of complex code to make your code +hugepage aware. + +If you're not walking pagetables but you run into a physical hugepage +but you can't handle it natively in your code, you can split it by +calling split_huge_page(page). This is what the Linux VM does before +it tries to swapout the hugepage for example. + +Example to make mremap.c transparent hugepage aware with a one liner +change: + +diff --git a/mm/mremap.c b/mm/mremap.c +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru + return NULL; + + pmd = pmd_offset(pud, addr); ++ split_huge_page_pmd(mm, pmd); + if (pmd_none_or_clear_bad(pmd)) + return NULL; + +== Locking in hugepage aware code == + +We want as much code as possible hugepage aware, as calling +split_huge_page() or split_huge_page_pmd() has a cost. 
+ +To make pagetable walks huge pmd aware, all you need to do is to call +pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the +mmap_sem in read (or write) mode to be sure an huge pmd cannot be +created from under you by khugepaged (khugepaged collapse_huge_page +takes the mmap_sem in write mode in addition to the anon_vma lock). If +pmd_trans_huge returns false, you just fallback in the old code +paths. If instead pmd_trans_huge returns true, you have to take the +mm->page_table_lock and re-run pmd_trans_huge. Taking the +page_table_lock will prevent the huge pmd to be converted into a +regular pmd from under you (split_huge_page can run in parallel to the +pagetable walk). If the second pmd_trans_huge returns false, you +should just drop the page_table_lock and fallback to the old code as +before. Otherwise you should run pmd_trans_splitting on the pmd. In +case pmd_trans_splitting returns true, it means split_huge_page is +already in the middle of splitting the page. So if pmd_trans_splitting +returns true it's enough to drop the page_table_lock and call +wait_split_huge_page and then fallback the old code paths. You are +guaranteed by the time wait_split_huge_page returns, the pmd isn't +huge anymore. If pmd_trans_splitting returns false, you can proceed to +process the huge pmd and the hugepage natively. Once finished you can +drop the page_table_lock. + +== compound_lock, get_user_pages and put_page == + +split_huge_page internally has to distribute the refcounts in the head +page to the tail pages before clearing all PG_head/tail bits from the +page structures. It can do that easily for refcounts taken by huge pmd +mappings. But the GUI API as created by hugetlbfs (that returns head +and tail pages if running get_user_pages on an address backed by any +hugepage), requires the refcount to be accounted on the tail pages and +not only in the head pages, if we want to be able to run +split_huge_page while there are gup pins established on any tail +page. Failure to be able to run split_huge_page if there's any gup pin +on any tail page, would mean having to split all hugepages upfront in +get_user_pages which is unacceptable as too many gup users are +performance critical and they must work natively on hugepages like +they work natively on hugetlbfs already (hugetlbfs is simpler because +hugetlbfs pages cannot be splitted so there wouldn't be requirement of +accounting the pins on the tail pages for hugetlbfs). If we wouldn't +account the gup refcounts on the tail pages during gup, we won't know +anymore which tail page is pinned by gup and which is not while we run +split_huge_page. But we still have to add the gup pin to the head page +too, to know when we can free the compound page in case it's never +splitted during its lifetime. That requires changing not just +get_page, but put_page as well so that when put_page runs on a tail +page (and only on a tail page) it will find its respective head page, +and then it will decrease the head page refcount in addition to the +tail page refcount. To obtain a head page reliably and to decrease its +refcount without race conditions, put_page has to serialize against +__split_huge_page_refcount using a special per-page lock called +compound_lock. 
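+
+A hedged, kernel-side sketch of the locking protocol from the "Locking in
+hugepage aware code" section above (not buildable outside the kernel tree;
+process_huge_pmd() and walk_pte_range() are hypothetical stand-ins for the
+caller's own huge and regular paths):
+
+#include <linux/mm.h>
+#include <linux/huge_mm.h>
+
+/* caller holds mmap_sem for read (or write) */
+static void walk_one_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+			 pmd_t *pmd, unsigned long addr)
+{
+	if (pmd_trans_huge(*pmd)) {
+		spin_lock(&mm->page_table_lock);
+		if (pmd_trans_huge(*pmd)) {
+			if (pmd_trans_splitting(*pmd)) {
+				/* split_huge_page() is underway: wait for it,
+				 * then fall back to the regular pte walk */
+				spin_unlock(&mm->page_table_lock);
+				wait_split_huge_page(vma->anon_vma, pmd);
+			} else {
+				process_huge_pmd(mm, pmd, addr);
+				spin_unlock(&mm->page_table_lock);
+				return;
+			}
+		} else {
+			/* raced with a split: it is a regular pmd again */
+			spin_unlock(&mm->page_table_lock);
+		}
+	}
+	walk_pte_range(mm, pmd, addr);	/* regular 4k path */
+}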
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt new file mode 100644 index 00000000..97bae3c5 --- /dev/null +++ b/Documentation/vm/unevictable-lru.txt @@ -0,0 +1,690 @@ + ============================== + UNEVICTABLE LRU INFRASTRUCTURE + ============================== + +======== +CONTENTS +======== + + (*) The Unevictable LRU + + - The unevictable page list. + - Memory control group interaction. + - Marking address spaces unevictable. + - Detecting Unevictable Pages. + - vmscan's handling of unevictable pages. + + (*) mlock()'d pages. + + - History. + - Basic management. + - mlock()/mlockall() system call handling. + - Filtering special vmas. + - munlock()/munlockall() system call handling. + - Migrating mlocked pages. + - mmap(MAP_LOCKED) system call handling. + - munmap()/exit()/exec() system call handling. + - try_to_unmap(). + - try_to_munlock() reverse map scan. + - Page reclaim in shrink_*_list(). + + +============ +INTRODUCTION +============ + +This document describes the Linux memory manager's "Unevictable LRU" +infrastructure and the use of this to manage several types of "unevictable" +pages. + +The document attempts to provide the overall rationale behind this mechanism +and the rationale for some of the design decisions that drove the +implementation. The latter design rationale is discussed in the context of an +implementation description. Admittedly, one can obtain the implementation +details - the "what does it do?" - by reading the code. One hopes that the +descriptions below add value by provide the answer to "why does it do that?". + + +=================== +THE UNEVICTABLE LRU +=================== + +The Unevictable LRU facility adds an additional LRU list to track unevictable +pages and to hide these pages from vmscan. This mechanism is based on a patch +by Larry Woodman of Red Hat to address several scalability problems with page +reclaim in Linux. The problems have been observed at customer sites on large +memory x86_64 systems. + +To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of +main memory will have over 32 million 4k pages in a single zone. When a large +fraction of these pages are not evictable for any reason [see below], vmscan +will spend a lot of time scanning the LRU lists looking for the small fraction +of pages that are evictable. This can result in a situation where all CPUs are +spending 100% of their time in vmscan for hours or days on end, with the system +completely unresponsive. + +The unevictable list addresses the following classes of unevictable pages: + + (*) Those owned by ramfs. + + (*) Those mapped into SHM_LOCK'd shared memory regions. + + (*) Those mapped into VM_LOCKED [mlock()ed] VMAs. + +The infrastructure may also be able to handle other conditions that make pages +unevictable, either by definition or by circumstance, in the future. + + +THE UNEVICTABLE PAGE LIST +------------------------- + +The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list +called the "unevictable" list and an associated page flag, PG_unevictable, to +indicate that the page is being managed on the unevictable list. + +The PG_unevictable flag is analogous to, and mutually exclusive with, the +PG_active flag in that it indicates on which LRU list a page resides when +PG_lru is set. 
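+
+For reference, the way this relationship is expressed in code (a sketch
+paraphrasing the page_lru() helper in include/linux/mm_inline.h, not the
+exact in-tree source) is roughly:
+
+#include <linux/mm_inline.h>
+
+static enum lru_list example_page_lru(struct page *page)	/* sketch */
+{
+	enum lru_list lru;
+
+	if (PageUnevictable(page))
+		return LRU_UNEVICTABLE;	/* unevictable list */
+
+	lru = page_lru_base_type(page);	/* inactive anon or file list */
+	if (PageActive(page))
+		lru += LRU_ACTIVE;
+	return lru;
+}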
+ +The Unevictable LRU infrastructure maintains unevictable pages on an additional +LRU list for a few reasons: + + (1) We get to "treat unevictable pages just like we treat other pages in the + system - which means we get to use the same code to manipulate them, the + same code to isolate them (for migrate, etc.), the same code to keep track + of the statistics, etc..." [Rik van Riel] + + (2) We want to be able to migrate unevictable pages between nodes for memory + defragmentation, workload management and memory hotplug. The linux kernel + can only migrate pages that it can successfully isolate from the LRU + lists. If we were to maintain pages elsewhere than on an LRU-like list, + where they can be found by isolate_lru_page(), we would prevent their + migration, unless we reworked migration code to find the unevictable pages + itself. + + +The unevictable list does not differentiate between file-backed and anonymous, +swap-backed pages. This differentiation is only important while the pages are, +in fact, evictable. + +The unevictable list benefits from the "arrayification" of the per-zone LRU +lists and statistics originally proposed and posted by Christoph Lameter. + +The unevictable list does not use the LRU pagevec mechanism. Rather, +unevictable pages are placed directly on the page's zone's unevictable list +under the zone lru_lock. This allows us to prevent the stranding of pages on +the unevictable list when one task has the page isolated from the LRU and other +tasks are changing the "evictability" state of the page. + + +MEMORY CONTROL GROUP INTERACTION +-------------------------------- + +The unevictable LRU facility interacts with the memory control group [aka +memory controller; see Documentation/cgroups/memory.txt] by extending the +lru_list enum. + +The memory controller data structure automatically gets a per-zone unevictable +list as a result of the "arrayification" of the per-zone LRU lists (one per +lru_list enum element). The memory controller tracks the movement of pages to +and from the unevictable list. + +When a memory control group comes under memory pressure, the controller will +not attempt to reclaim pages on the unevictable list. This has a couple of +effects: + + (1) Because the pages are "hidden" from reclaim on the unevictable list, the + reclaim process can be more efficient, dealing only with pages that have a + chance of being reclaimed. + + (2) On the other hand, if too many of the pages charged to the control group + are unevictable, the evictable portion of the working set of the tasks in + the control group may not fit into the available memory. This can cause + the control group to thrash or to OOM-kill tasks. + + +MARKING ADDRESS SPACES UNEVICTABLE +---------------------------------- + +For facilities such as ramfs none of the pages attached to the address space +may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE +address space flag is provided, and this can be manipulated by a filesystem +using a number of wrapper functions: + + (*) void mapping_set_unevictable(struct address_space *mapping); + + Mark the address space as being completely unevictable. + + (*) void mapping_clear_unevictable(struct address_space *mapping); + + Mark the address space as being evictable. + + (*) int mapping_unevictable(struct address_space *mapping); + + Query the address space, and return true if it is completely + unevictable. 
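+
+A hedged sketch of how a filesystem might use the wrappers above when it
+creates an inode (this mirrors what ramfs does; the function name
+example_get_inode() is made up for illustration):
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>	/* mapping_set_unevictable() */
+
+struct inode *example_get_inode(struct super_block *sb, int mode)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (inode) {
+		inode->i_mode = mode;
+		/* pages in this mapping can never be evicted: keep them off
+		 * the normal LRU lists so vmscan does not keep scanning them */
+		mapping_set_unevictable(inode->i_mapping);
+	}
+	return inode;
+}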
+ +These are currently used in two places in the kernel: + + (1) By ramfs to mark the address spaces of its inodes when they are created, + and this mark remains for the life of the inode. + + (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called. + + Note that SHM_LOCK is not required to page in the locked pages if they're + swapped out; the application must touch the pages manually if it wants to + ensure they're in memory. + + +DETECTING UNEVICTABLE PAGES +--------------------------- + +The function page_evictable() in vmscan.c determines whether a page is +evictable or not using the query function outlined above [see section "Marking +address spaces unevictable"] to check the AS_UNEVICTABLE flag. + +For address spaces that are so marked after being populated (as SHM regions +might be), the lock action (eg: SHM_LOCK) can be lazy, and need not populate +the page tables for the region as does, for example, mlock(), nor need it make +any special effort to push any pages in the SHM_LOCK'd area to the unevictable +list. Instead, vmscan will do this if and when it encounters the pages during +a reclamation scan. + +On an unlock action (such as SHM_UNLOCK), the unlocker (eg: shmctl()) must scan +the pages in the region and "rescue" them from the unevictable list if no other +condition is keeping them unevictable. If an unevictable region is destroyed, +the pages are also "rescued" from the unevictable list in the process of +freeing them. + +page_evictable() also checks for mlocked pages by testing an additional page +flag, PG_mlocked (as wrapped by PageMlocked()). If the page is NOT mlocked, +and a non-NULL VMA is supplied, page_evictable() will check whether the VMA is +VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and +update the appropriate statistics if the vma is VM_LOCKED. This method allows +efficient "culling" of pages in the fault path that are being faulted in to +VM_LOCKED VMAs. + + +VMSCAN'S HANDLING OF UNEVICTABLE PAGES +-------------------------------------- + +If unevictable pages are culled in the fault path, or moved to the unevictable +list at mlock() or mmap() time, vmscan will not encounter the pages until they +have become evictable again (via munlock() for example) and have been "rescued" +from the unevictable list. However, there may be situations where we decide, +for the sake of expediency, to leave a unevictable page on one of the regular +active/inactive LRU lists for vmscan to deal with. vmscan checks for such +pages in all of the shrink_{active|inactive|page}_list() functions and will +"cull" such pages that it encounters: that is, it diverts those pages to the +unevictable list for the zone being scanned. + +There may be situations where a page is mapped into a VM_LOCKED VMA, but the +page is not marked as PG_mlocked. Such pages will make it all the way to +shrink_page_list() where they will be detected when vmscan walks the reverse +map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK, +shrink_page_list() will cull the page at that point. + +To "cull" an unevictable page, vmscan simply puts the page back on the LRU list +using putback_lru_page() - the inverse operation to isolate_lru_page() - after +dropping the page lock. Because the condition which makes the page unevictable +may change once the page is unlocked, putback_lru_page() will recheck the +unevictable state of a page that it places on the unevictable list. 
If the page has become evictable again in the meantime, putback_lru_page()
+removes it from the unevictable list and retries, repeating the
+page_evictable() test.  Because such a race is a rare event and movement of
+pages onto the unevictable list should be rare, these extra evictability
+checks should not occur in the majority of calls to putback_lru_page().
+
+
+=============
+MLOCKED PAGES
+=============
+
+The unevictable page list is also useful for mlock(), in addition to ramfs and
+SYSV SHM.  Note that mlock() is only available in CONFIG_MMU=y situations; in
+NOMMU situations, all mappings are effectively mlocked.
+
+
+HISTORY
+-------
+
+The "Unevictable mlocked Pages" infrastructure is based on work originally
+posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
+Nick posted his patch as an alternative to a patch posted by Christoph Lameter
+to achieve the same objective: hiding mlocked pages from vmscan.
+
+In Nick's patch, he used one of the struct page LRU list link fields as a count
+of VM_LOCKED VMAs that map the page.  This use of the link field for a count
+prevented the management of the pages on an LRU list, and thus mlocked pages
+were not migratable as isolate_lru_page() could not find them, and the LRU list
+link field was not available to the migration subsystem.
+
+Nick resolved this by putting mlocked pages back on the LRU list before
+attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs.  When
+Nick's patch was integrated with the Unevictable LRU work, the count was
+replaced by walking the reverse map to determine whether any VM_LOCKED VMAs
+mapped the page.  More on this below.
+
+
+BASIC MANAGEMENT
+----------------
+
+mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable
+pages.  When such a page has been "noticed" by the memory management subsystem,
+the page is marked with the PG_mlocked flag.  This can be manipulated using the
+PageMlocked() functions.
+
+A PG_mlocked page will be placed on the unevictable list when it is added to
+the LRU.  Such pages can be "noticed" by memory management in several places:
+
+ (1) in the mlock()/mlockall() system call handlers;
+
+ (2) in the mmap() system call handler when mmapping a region with the
+     MAP_LOCKED flag;
+
+ (3) when mmapping a region in a task that has called mlockall() with the
+     MCL_FUTURE flag;
+
+ (4) in the fault path, if mlocked pages are "culled" there, and when a
+     VM_LOCKED stack segment is expanded; or
+
+ (5) as mentioned above, in vmscan:shrink_page_list() when attempting to
+     reclaim a page in a VM_LOCKED VMA via try_to_unmap(),
+
+all of which result in the VM_LOCKED flag being set for the VMA if it doesn't
+already have it set.
+
+mlocked pages become unlocked and rescued from the unevictable list when:
+
+ (1) they are mapped in a range unlocked via the munlock()/munlockall() system
+     calls;
+
+ (2) they are munmap()'d out of the last VM_LOCKED VMA that maps them,
+     including unmapping at task exit;
+
+ (3) the page is truncated from the last VM_LOCKED VMA of an mmapped file; or
+
+ (4) a page is about to be COW'd in a VM_LOCKED VMA.
+
+
+mlock()/mlockall() SYSTEM CALL HANDLING
+---------------------------------------
+
+Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup()
+for each VMA in the range specified by the call.  In the case of mlockall(),
+this is the entire active address space of the task.  Note that mlock_fixup()
+is used for both mlocking and munlocking a range of memory.
A call to mlock() +an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED is +treated as a no-op, and mlock_fixup() simply returns. + +If the VMA passes some filtering as described in "Filtering Special Vmas" +below, mlock_fixup() will attempt to merge the VMA with its neighbors or split +off a subset of the VMA if the range does not cover the entire VMA. Once the +VMA has been merged or split or neither, mlock_fixup() will call +__mlock_vma_pages_range() to fault in the pages via get_user_pages() and to +mark the pages as mlocked via mlock_vma_page(). + +Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, +get_user_pages() will be unable to fault in the pages. That's okay. If pages +do end up getting faulted into this VM_LOCKED VMA, we'll handle them in the +fault path or in vmscan. + +Also note that a page returned by get_user_pages() could be truncated or +migrated out from under us, while we're trying to mlock it. To detect this, +__mlock_vma_pages_range() checks page_mapping() after acquiring the page lock. +If the page is still associated with its mapping, we'll go ahead and call +mlock_vma_page(). If the mapping is gone, we just unlock the page and move on. +In the worst case, this will result in a page mapped in a VM_LOCKED VMA +remaining on a normal LRU list without being PageMlocked(). Again, vmscan will +detect and cull such pages. + +mlock_vma_page() will call TestSetPageMlocked() for each page returned by +get_user_pages(). We use TestSetPageMlocked() because the page might already +be mlocked by another task/VMA and we don't want to do extra work. We +especially do not want to count an mlocked page more than once in the +statistics. If the page was already mlocked, mlock_vma_page() need do nothing +more. + +If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the +page from the LRU, as it is likely on the appropriate active or inactive list +at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put +back the page - by calling putback_lru_page() - which will notice that the page +is now mlocked and divert the page to the zone's unevictable list. If +mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle +it later if and when it attempts to reclaim the page. + + +FILTERING SPECIAL VMAS +---------------------- + +mlock_fixup() filters several classes of "special" VMAs: + +1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind + these mappings are inherently pinned, so we don't need to mark them as + mlocked. In any case, most of the pages have no struct page in which to so + mark the page. Because of this, get_user_pages() will fail for these VMAs, + so there is no sense in attempting to visit them. + +2) VMAs mapping hugetlbfs page are already effectively pinned into memory. We + neither need nor want to mlock() these pages. However, to preserve the + prior behavior of mlock() - before the unevictable/mlock changes - + mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to + allocate the huge pages and populate the ptes. + +3) VMAs with VM_DONTEXPAND or VM_RESERVED are generally userspace mappings of + kernel pages, such as the VDSO page, relay channel pages, etc. These pages + are inherently unevictable and are not managed on the LRU lists. + mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls + make_pages_present() to populate the ptes. 
+
+Note that for all of these special VMAs, mlock_fixup() does not set the
+VM_LOCKED flag.  Therefore, we won't have to deal with them later during
+munlock(), munmap() or task exit.  Neither does mlock_fixup() account these
+VMAs against the task's "locked_vm".
+
+
+munlock()/munlockall() SYSTEM CALL HANDLING
+-------------------------------------------
+
+The munlock() and munlockall() system calls are handled by the same functions -
+do_mlock[all]() - as the mlock() and mlockall() system calls, with the unlock
+vs lock operation indicated by an argument.  So, these system calls are also
+handled by mlock_fixup().  Again, if called for an already munlocked VMA,
+mlock_fixup() simply returns.  Because of the VMA filtering discussed above,
+VM_LOCKED will not be set in any "special" VMAs.  So, these VMAs will be
+ignored for munlock.
+
+If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
+specified range.  The range is then munlocked via the function
+__mlock_vma_pages_range() - the same function used to mlock a VMA range -
+passing a flag to indicate that munlock() is being performed.
+
+Because the VMA access protections could have been changed to PROT_NONE after
+faulting in and mlocking pages, get_user_pages() was unreliable for visiting
+these pages for munlocking.  Because we don't want to leave pages mlocked,
+get_user_pages() was enhanced to accept a flag to ignore the permissions when
+fetching the pages - all of which should be resident as a result of previous
+mlocking.
+
+For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling
+munlock_vma_page().  munlock_vma_page() unconditionally clears the PG_mlocked
+flag using TestClearPageMlocked().  As with mlock_vma_page(),
+munlock_vma_page() uses the Test*PageMlocked() function to handle the case
+where the page might have already been unlocked by another task.  If the page
+was mlocked, munlock_vma_page() updates the zone statistics for the number of
+mlocked pages.  Note, however, that at this point we haven't checked whether
+the page is mapped by other VM_LOCKED VMAs.
+
+We can't call try_to_munlock(), the function that walks the reverse map to
+check for other VM_LOCKED VMAs, without first isolating the page from the LRU.
+try_to_munlock() is a variant of try_to_unmap() and thus requires that the page
+not be on an LRU list [more on these below].  However, the call to
+isolate_lru_page() could fail, in which case we couldn't call try_to_munlock().
+So, we go ahead and clear PG_mlocked up front, as this might be the only chance
+we have.  If we can successfully isolate the page, we go ahead and call
+try_to_munlock(), which will restore the PG_mlocked flag and update the zone
+page statistics if it finds another VMA holding the page mlocked.  If we fail
+to isolate the page, we'll have left a potentially mlocked page on the LRU.
+This is fine, because we'll catch it later if and when vmscan tries to reclaim
+the page.  This should be relatively rare.
+
+
+MIGRATING MLOCKED PAGES
+-----------------------
+
+A page that is being migrated has been isolated from the LRU lists and is held
+locked across unmapping of the page, updating the page's address space entry
+and copying the contents and state, until the page table entry has been
+replaced with an entry that refers to the new page.  Linux supports migration
+of mlocked pages and other unevictable pages.  This involves simply moving the
+PG_mlocked and PG_unevictable states from the old page to the new page.
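+
+A simplified sketch of that state transfer is shown below; it is illustrative
+only (the function name is made up), and it omits the zone statistics updates
+and the locking that the real migration code performs via its helpers:
+
+	#include <linux/mm.h>
+
+	/* called with both pages locked and the old page isolated from the LRU */
+	static void example_migrate_unevictable_state(struct page *newpage,
+						      struct page *page)
+	{
+		if (TestClearPageUnevictable(page))
+			SetPageUnevictable(newpage);
+		if (TestClearPageMlocked(page))
+			SetPageMlocked(newpage);
+	}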
+
+Note that page migration can race with mlocking or munlocking of the same page.
+This has been discussed from the mlock/munlock perspective in the respective
+sections above.  Both processes (migration and m[un]locking) hold the page
+locked.  This provides the first level of synchronization.  Page migration
+zeros out the page_mapping of the old page before unlocking it, so m[un]lock
+can skip these pages by testing the page mapping under page lock.
+
+To complete page migration, we place the new and old pages back onto the LRU
+after dropping the page lock.  The "unneeded" page - old page on success, new
+page on failure - will be freed when the reference count held by the migration
+process is released.  To ensure that we don't strand pages on the unevictable
+list because of a race between munlock and migration, page migration uses the
+putback_lru_page() function to add migrated pages back to the LRU.
+
+
+mmap(MAP_LOCKED) SYSTEM CALL HANDLING
+-------------------------------------
+
+In addition to the mlock()/mlockall() system calls, an application can request
+that a region of memory be mlocked by supplying the MAP_LOCKED flag to the
+mmap() call.  Furthermore, any mmap() call, or any brk() call that expands the
+heap, made by a task that has previously called mlockall() with the MCL_FUTURE
+flag will result in the newly mapped memory being mlocked.  Before the
+unevictable/mlock changes, the kernel simply called make_pages_present() to
+allocate pages and populate the page table.
+
+To mlock a range of memory under the unevictable/mlock infrastructure, the
+mmap() handler and task address space expansion functions call
+mlock_vma_pages_range(), specifying the VMA and the address range to mlock.
+mlock_vma_pages_range() filters VMAs like mlock_fixup(), as described above in
+"Filtering Special VMAs".  It will clear the VM_LOCKED flag, which will have
+already been set by the caller, in filtered VMAs.  Thus these VMAs need not be
+visited for munlock when the region is unmapped.
+
+For "normal" VMAs, mlock_vma_pages_range() calls __mlock_vma_pages_range() to
+fault/allocate the pages and mlock them.  Again, like mlock_fixup(),
+mlock_vma_pages_range() downgrades the mmap semaphore to read mode before
+attempting to fault/allocate and mlock the pages and "upgrades" the semaphore
+back to write mode before returning.
+
+The callers of mlock_vma_pages_range() will have already added the memory range
+to be mlocked to the task's "locked_vm".  To account for filtered VMAs,
+mlock_vma_pages_range() returns the number of pages NOT mlocked.  All of the
+callers then subtract a non-negative return value from the task's locked_vm.  A
+negative return value represents an error - for example, from get_user_pages()
+attempting to fault in a VMA with PROT_NONE access.  In this case, we leave the
+memory range accounted as locked_vm, as the protections could be changed later
+and pages allocated into that region.
+
+
+munmap()/exit()/exec() SYSTEM CALL HANDLING
+-------------------------------------------
+
+When unmapping an mlocked region of memory, whether by an explicit call to
+munmap() or via an internal unmap from exit() or exec() processing, we must
+munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages.
+Before the unevictable/mlock changes, mlocking did not mark the pages in any
+way, so unmapping them required no processing.
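+
+The user-visible side of this can be demonstrated with a small, self-contained
+test program (illustrative only; it assumes a large enough RLIMIT_MEMLOCK and
+reads the task's VmLck line from /proc/self/status, which reflects locked_vm):
+
+	#include <stdio.h>
+	#include <stdlib.h>
+	#include <string.h>
+	#include <sys/mman.h>
+
+	static void show_vmlck(const char *tag)
+	{
+		char line[128];
+		FILE *f = fopen("/proc/self/status", "r");
+
+		while (f && fgets(line, sizeof(line), f))
+			if (!strncmp(line, "VmLck:", 6))
+				printf("%s: %s", tag, line);
+		if (f)
+			fclose(f);
+	}
+
+	int main(void)
+	{
+		size_t len = 4UL << 20;	/* 4MB region */
+		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
+			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
+
+		if (p == MAP_FAILED) {
+			perror("mmap");
+			return EXIT_FAILURE;
+		}
+		show_vmlck("after mmap(MAP_LOCKED)");	/* VmLck includes the region */
+		munmap(p, len);		/* removes the last VM_LOCKED VMA for it */
+		show_vmlck("after munmap");		/* VmLck drops back */
+		return EXIT_SUCCESS;
+	}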
+ +To munlock a range of memory under the unevictable/mlock infrastructure, the +munmap() handler and task address space call tear down function +munlock_vma_pages_all(). The name reflects the observation that one always +specifies the entire VMA range when munlock()ing during unmap of a region. +Because of the VMA filtering when mlocking() regions, only "normal" VMAs that +actually contain mlocked pages will be passed to munlock_vma_pages_all(). + +munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like mlock_fixup() +for the munlock case, calls __munlock_vma_pages_range() to walk the page table +for the VMA's memory range and munlock_vma_page() each resident page mapped by +the VMA. This effectively munlocks the page, only if this is the last +VM_LOCKED VMA that maps the page. + + +try_to_unmap() +-------------- + +Pages can, of course, be mapped into multiple VMAs. Some of these VMAs may +have VM_LOCKED flag set. It is possible for a page mapped into one or more +VM_LOCKED VMAs not to have the PG_mlocked flag set and therefore reside on one +of the active or inactive LRU lists. This could happen if, for example, a task +in the process of munlocking the page could not isolate the page from the LRU. +As a result, vmscan/shrink_page_list() might encounter such a page as described +in section "vmscan's handling of unevictable pages". To handle this situation, +try_to_unmap() checks for VM_LOCKED VMAs while it is walking a page's reverse +map. + +try_to_unmap() is always called, by either vmscan for reclaim or for page +migration, with the argument page locked and isolated from the LRU. Separate +functions handle anonymous and mapped file pages, as these types of pages have +different reverse map mechanisms. + + (*) try_to_unmap_anon() + + To unmap anonymous pages, each VMA in the list anchored in the anon_vma + must be visited - at least until a VM_LOCKED VMA is encountered. If the + page is being unmapped for migration, VM_LOCKED VMAs do not stop the + process because mlocked pages are migratable. However, for reclaim, if + the page is mapped into a VM_LOCKED VMA, the scan stops. + + try_to_unmap_anon() attempts to acquire in read mode the mmap semphore of + the mm_struct to which the VMA belongs. If this is successful, it will + mlock the page via mlock_vma_page() - we wouldn't have gotten to + try_to_unmap_anon() if the page were already mlocked - and will return + SWAP_MLOCK, indicating that the page is unevictable. + + If the mmap semaphore cannot be acquired, we are not sure whether the page + is really unevictable or not. In this case, try_to_unmap_anon() will + return SWAP_AGAIN. + + (*) try_to_unmap_file() - linear mappings + + Unmapping of a mapped file page works the same as for anonymous mappings, + except that the scan visits all VMAs that map the page's index/page offset + in the page's mapping's reverse map priority search tree. It also visits + each VMA in the page's mapping's non-linear list, if the list is + non-empty. + + As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file + page, try_to_unmap_file() will attempt to acquire the associated + mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this + is successful, and SWAP_AGAIN, if not. + + (*) try_to_unmap_file() - non-linear mappings + + If a page's mapping contains a non-empty non-linear mapping VMA list, then + try_to_un{map|lock}() must also visit each VMA in that list to determine + whether the page is mapped in a VM_LOCKED VMA. 
Again, the scan must visit
+     all VMAs in the non-linear list to ensure that the page is not/should not
+     be mlocked.
+
+     If a VM_LOCKED VMA is found in the list, the scan could terminate.
+     However, there is no easy way to determine whether the page is actually
+     mapped in a given VMA - either for unmapping or testing whether the
+     VM_LOCKED VMA actually pins the page.
+
+     try_to_unmap_file() handles non-linear mappings by scanning a certain
+     number of pages - a "cluster" - in each non-linear VMA associated with the
+     page's mapping, for each file mapped page that vmscan tries to unmap.  If
+     this happens to unmap the page we're trying to unmap, try_to_unmap() will
+     notice this on return (page_mapcount(page) will be 0) and return
+     SWAP_SUCCESS.  Otherwise, it will return SWAP_AGAIN, causing vmscan to
+     recirculate this page.  We take advantage of the cluster scan in
+     try_to_unmap_cluster() as follows:
+
+     For each non-linear VMA, try_to_unmap_cluster() attempts to acquire the
+     mmap semaphore of the associated mm_struct for read without blocking.
+
+     If this attempt is successful and the VMA is VM_LOCKED,
+     try_to_unmap_cluster() will retain the mmap semaphore for the scan;
+     otherwise it drops it here.
+
+     Then, for each page in the cluster, if we're holding the mmap semaphore
+     for a locked VMA, try_to_unmap_cluster() calls mlock_vma_page() to
+     mlock the page.  This call is a no-op if the page is already locked,
+     but will mlock any pages in the non-linear mapping that happen to be
+     unlocked.
+
+     If one of the pages so mlocked is the page passed in to try_to_unmap(),
+     try_to_unmap_cluster() will return SWAP_MLOCK, rather than the default
+     SWAP_AGAIN.  This will allow vmscan to cull the page, rather than
+     recirculating it on the inactive list.
+
+     Again, if try_to_unmap_cluster() cannot acquire the VMA's mmap semaphore,
+     it returns SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED
+     VMA, but couldn't be mlocked.
+
+
+try_to_munlock() REVERSE MAP SCAN
+---------------------------------
+
+ [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
+     page_referenced() reverse map walker.
+
+When munlock_vma_page() [see section "munlock()/munlockall() System Call
+Handling" above] tries to munlock a page, it needs to determine whether or not
+the page is mapped by any VM_LOCKED VMA without actually attempting to unmap
+all PTEs from the page.  For this purpose, the unevictable/mlock infrastructure
+introduced a variant of try_to_unmap() called try_to_munlock().
+
+try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
+mapped file pages, with an additional argument specifying unlock versus unmap
+processing.  Again, these functions walk the respective reverse maps looking
+for VM_LOCKED VMAs.  When such a VMA is found for anonymous pages and file
+pages mapped in linear VMAs, as in the try_to_unmap() case, the functions
+attempt to acquire the associated mmap semaphore, mlock the page via
+mlock_vma_page() and return SWAP_MLOCK.  This effectively undoes the
+pre-clearing of the page's PG_mlocked flag done by munlock_vma_page().
+
+If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap
+semaphore, it will return SWAP_AGAIN.  This will allow shrink_page_list() to
+recycle the page on the inactive list and hope that it has better luck with the
+page next time.
+
+For file pages mapped into non-linear VMAs, the try_to_munlock() logic works
+slightly differently.
On encountering a VM_LOCKED non-linear VMA that might
+map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking the
+page.  munlock_vma_page() will just leave the page unlocked and let vmscan deal
+with it - the usual fallback position.
+
+Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's
+reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
+However, the scan can terminate when it encounters a VM_LOCKED VMA and can
+successfully acquire the VMA's mmap semaphore for read and mlock the page.
+Although try_to_munlock() might be called a great many times when munlocking a
+large region or tearing down a large address space that has been mlocked via
+mlockall(), overall this is a fairly rare event.
+
+
+PAGE RECLAIM IN shrink_*_list()
+-------------------------------
+
+shrink_active_list() culls any obviously unevictable pages - i.e.
+!page_evictable(page, NULL) - diverting these to the unevictable list.
+However, shrink_active_list() only sees unevictable pages that made it onto the
+active/inactive LRU lists.  Note that these pages do not have PageUnevictable
+set - otherwise they would be on the unevictable list and shrink_active_list()
+would never see them.
+
+Some examples of these unevictable pages on the LRU lists are:
+
+ (1) ramfs pages that have been placed on the LRU lists when first allocated.
+
+ (2) SHM_LOCK'd shared memory pages.  shmctl(SHM_LOCK) does not attempt to
+     allocate or fault in the pages in the shared memory region.  The pages
+     are only allocated when an application accesses them for the first time
+     after SHM_LOCK'ing the segment.
+
+ (3) mlocked pages that could not be isolated from the LRU and moved to the
+     unevictable list in mlock_vma_page().
+
+ (4) pages mapped into multiple VM_LOCKED VMAs, but try_to_munlock() couldn't
+     acquire the VMA's mmap semaphore to test the flags and set PageMlocked.
+     munlock_vma_page() was forced to let the page back onto the normal LRU
+     list for vmscan to handle.
+
+shrink_inactive_list() also diverts any unevictable pages that it finds on the
+inactive lists to the appropriate zone's unevictable list.
+
+shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd
+after shrink_active_list() had moved them to the inactive list, or pages mapped
+into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to
+recheck via try_to_munlock().  shrink_inactive_list() won't notice the latter,
+but will pass them on to shrink_page_list().
+
+shrink_page_list() again culls obviously unevictable pages that it could
+encounter, for reasons similar to shrink_inactive_list().  Pages mapped into
+VM_LOCKED VMAs but without PG_mlocked set will make it all the way to
+try_to_unmap().  shrink_page_list() will divert them to the unevictable list
+when try_to_unmap() returns SWAP_MLOCK, as discussed above.
--
cgit v1.2.3