12 files changed, 423 insertions, 28 deletions
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index b939ebb62871..80d150458c80 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -145,7 +145,7 @@ Part Ic - DMA addressing limitations
 int
 dma_supported(struct device *dev, u64 mask)
 int
-pci_dma_supported(struct device *dev, u64 mask)
+pci_dma_supported(struct pci_dev *hwdev, u64 mask)
 
 Checks to see if the device can support DMA to the memory described by
 mask.
@@ -189,7 +189,7 @@ dma_addr_t
 dma_map_single(struct device *dev, void *cpu_addr, size_t size,
 		      enum dma_data_direction direction)
 dma_addr_t
-pci_map_single(struct device *dev, void *cpu_addr, size_t size,
+pci_map_single(struct pci_dev *hwdev, void *cpu_addr, size_t size,
 		      int direction)
 
 Maps a piece of processor virtual memory so it can be accessed by the
@@ -395,6 +395,71 @@ Notes:  You must do this:
 
 See also dma_map_single().
 
+dma_addr_t
+dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size,
+		     enum dma_data_direction dir,
+		     struct dma_attrs *attrs)
+
+void
+dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr,
+		       size_t size, enum dma_data_direction dir,
+		       struct dma_attrs *attrs)
+
+int
+dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
+		 int nents, enum dma_data_direction dir,
+		 struct dma_attrs *attrs)
+
+void
+dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl,
+		   int nents, enum dma_data_direction dir,
+		   struct dma_attrs *attrs)
+
+The four functions above are just like the counterpart functions
+without the _attrs suffixes, except that they pass an optional
+struct dma_attrs*.
+
+struct dma_attrs encapsulates a set of "dma attributes". For the
+definition of struct dma_attrs see linux/dma-attrs.h.
+
+The interpretation of dma attributes is architecture-specific, and
+each attribute should be documented in Documentation/DMA-attributes.txt.
+
+If struct dma_attrs* is NULL, the semantics of each of these
+functions is identical to those of the corresponding function
+without the _attrs suffix. As a result dma_map_single_attrs()
+can generally replace dma_map_single(), etc.
+
+As an example of the use of the *_attrs functions, here's how
+you could pass an attribute DMA_ATTR_FOO when mapping memory
+for DMA:
+
+#include <linux/dma-attrs.h>
+/* DMA_ATTR_FOO should be defined in linux/dma-attrs.h and
+ * documented in Documentation/DMA-attributes.txt */
+...
+
+	DEFINE_DMA_ATTRS(attrs);
+	dma_set_attr(DMA_ATTR_FOO, &attrs);
+	....
+	n = dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, &attr);
+	....
+
+Architectures that care about DMA_ATTR_FOO would check for its
+presence in their implementations of the mapping and unmapping
+routines, e.g.:
+
+void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
+			     size_t size, enum dma_data_direction dir,
+			     struct dma_attrs *attrs)
+{
+	....
+	int foo =  dma_get_attr(DMA_ATTR_FOO, attrs);
+	....
+	if (foo)
+		/* twizzle the frobnozzle */
+	....
+
 
 Part II - Advanced dma_ usage
 -----------------------------
diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt
new file mode 100644
index 000000000000..6d772f84b477
--- /dev/null
+++ b/Documentation/DMA-attributes.txt
@@ -0,0 +1,24 @@
+			DMA attributes
+			==============
+
+This document describes the semantics of the DMA attributes that are
+defined in linux/dma-attrs.h.
+
+DMA_ATTR_WRITE_BARRIER
+----------------------
+
+DMA_ATTR_WRITE_BARRIER is a (write) barrier attribute for DMA.  DMA
+to a memory region with the DMA_ATTR_WRITE_BARRIER attribute forces
+all pending DMA writes to complete, and thus provides a mechanism to
+strictly order DMA from a device across all intervening busses and
+bridges.  This barrier is not specific to a particular type of
+interconnect, it applies to the system as a whole, and so its
+implementation must account for the idiosyncracies of the system all
+the way from the DMA device to memory.
+
+As an example of a situation where DMA_ATTR_WRITE_BARRIER would be
+useful, suppose that a device does a DMA write to indicate that data is
+ready and available in memory.  The DMA of the "completion indication"
+could race with data DMA.  Mapping the memory used for completion
+indications with DMA_ATTR_WRITE_BARRIER would prevent the race.
+
diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt
index 31d12e21ff8a..c298a6690e0d 100644
--- a/Documentation/cgroups.txt
+++ b/Documentation/cgroups.txt
@@ -500,8 +500,7 @@ post-attachment activity that requires memory allocations or blocking.
 
 void fork(struct cgroup_subsy *ss, struct task_struct *task)
 
-Called when a task is forked into a cgroup. Also called during
-registration for all existing tasks.
+Called when a task is forked into a cgroup.
 
 void exit(struct cgroup_subsys *ss, struct task_struct *task)
 
diff --git a/Documentation/controllers/devices.txt b/Documentation/controllers/devices.txt
new file mode 100644
index 000000000000..4dcea42432c2
--- /dev/null
+++ b/Documentation/controllers/devices.txt
@@ -0,0 +1,48 @@
+Device Whitelist Controller
+
+1. Description:
+
+Implement a cgroup to track and enforce open and mknod restrictions
+on device files.  A device cgroup associates a device access
+whitelist with each cgroup.  A whitelist entry has 4 fields.
+'type' is a (all), c (char), or b (block).  'all' means it applies
+to all types and all major and minor numbers.  Major and minor are
+either an integer or * for all.  Access is a composition of r
+(read), w (write), and m (mknod).
+
+The root device cgroup starts with rwm to 'all'.  A child device
+cgroup gets a copy of the parent.  Administrators can then remove
+devices from the whitelist or add new entries.  A child cgroup can
+never receive a device access which is denied its parent.  However
+when a device access is removed from a parent it will not also be
+removed from the child(ren).
+
+2. User Interface
+
+An entry is added using devices.allow, and removed using
+devices.deny.  For instance
+
+	echo 'c 1:3 mr' > /cgroups/1/devices.allow
+
+allows cgroup 1 to read and mknod the device usually known as
+/dev/null.  Doing
+
+	echo a > /cgroups/1/devices.deny
+
+will remove the default 'a *:* mrw' entry.
+
+3. Security
+
+Any task can move itself between cgroups.  This clearly won't
+suffice, but we can decide the best way to adequately restrict
+movement as people get some experience with this.  We may just want
+to require CAP_SYS_ADMIN, which at least is a separate bit from
+CAP_MKNOD.  We may want to just refuse moving to a cgroup which
+isn't a descendent of the current one.  Or we may want to use
+CAP_MAC_ADMIN, since we really are trying to lock down root.
+
+CAP_SYS_ADMIN is needed to modify the whitelist or move another
+task to a new cgroup.  (Again we'll probably want to change that).
+
+A cgroup may not be granted more permissions than the cgroup's
+parent has.
diff --git a/Documentation/controllers/resource_counter.txt b/Documentation/controllers/resource_counter.txt
new file mode 100644
index 000000000000..f196ac1d7d25
--- /dev/null
+++ b/Documentation/controllers/resource_counter.txt
@@ -0,0 +1,181 @@
+
+		The Resource Counter
+
+The resource counter, declared at include/linux/res_counter.h,
+is supposed to facilitate the resource management by controllers
+by providing common stuff for accounting.
+
+This "stuff" includes the res_counter structure and routines
+to work with it.
+
+
+
+1. Crucial parts of the res_counter structure
+
+ a. unsigned long long usage
+
+ 	The usage value shows the amount of a resource that is consumed
+	by a group at a given time. The units of measurement should be
+	determined by the controller that uses this counter. E.g. it can
+	be bytes, items or any other unit the controller operates on.
+
+ b. unsigned long long max_usage
+
+ 	The maximal value of the usage over time.
+
+ 	This value is useful when gathering statistical information about
+	the particular group, as it shows the actual resource requirements
+	for a particular group, not just some usage snapshot.
+
+ c. unsigned long long limit
+
+ 	The maximal allowed amount of resource to consume by the group. In
+	case the group requests for more resources, so that the usage value
+	would exceed the limit, the resource allocation is rejected (see
+	the next section).
+
+ d. unsigned long long failcnt
+
+ 	The failcnt stands for "failures counter". This is the number of
+	resource allocation attempts that failed.
+
+ c. spinlock_t lock
+
+ 	Protects changes of the above values.
+
+
+
+2. Basic accounting routines
+
+ a. void res_counter_init(struct res_counter *rc)
+
+ 	Initializes the resource counter. As usual, should be the first
+	routine called for a new counter.
+
+ b. int res_counter_charge[_locked]
+			(struct res_counter *rc, unsigned long val)
+
+	When a resource is about to be allocated it has to be accounted
+	with the appropriate resource counter (controller should determine
+	which one to use on its own). This operation is called "charging".
+
+	This is not very important which operation - resource allocation
+	or charging - is performed first, but
+	  * if the allocation is performed first, this may create a
+	    temporary resource over-usage by the time resource counter is
+	    charged;
+	  * if the charging is performed first, then it should be uncharged
+	    on error path (if the one is called).
+
+ c. void res_counter_uncharge[_locked]
+			(struct res_counter *rc, unsigned long val)
+
+	When a resource is released (freed) it should be de-accounted
+	from the resource counter it was accounted to.  This is called
+	"uncharging".
+
+    The _locked routines imply that the res_counter->lock is taken.
+
+
+ 2.1 Other accounting routines
+
+    There are more routines that may help you with common needs, like
+    checking whether the limit is reached or resetting the max_usage
+    value. They are all declared in include/linux/res_counter.h.
+
+
+
+3. Analyzing the resource counter registrations
+
+ a. If the failcnt value constantly grows, this means that the counter's
+    limit is too tight. Either the group is misbehaving and consumes too
+    many resources, or the configuration is not suitable for the group
+    and the limit should be increased.
+
+ b. The max_usage value can be used to quickly tune the group. One may
+    set the limits to maximal values and either load the container with
+    a common pattern or leave one for a while. After this the max_usage
+    value shows the amount of memory the container would require during
+    its common activity.
+
+    Setting the limit a bit above this value gives a pretty good
+    configuration that works in most of the cases.
+
+ c. If the max_usage is much less than the limit, but the failcnt value
+    is growing, then the group tries to allocate a big chunk of resource
+    at once.
+
+ d. If the max_usage is much less than the limit, but the failcnt value
+    is 0, then this group is given too high limit, that it does not
+    require. It is better to lower the limit a bit leaving more resource
+    for other groups.
+
+
+
+4. Communication with the control groups subsystem (cgroups)
+
+All the resource controllers that are using cgroups and resource counters
+should provide files (in the cgroup filesystem) to work with the resource
+counter fields. They are recommended to adhere to the following rules:
+
+ a. File names
+
+ 	Field name	File name
+	---------------------------------------------------
+	usage		usage_in_<unit_of_measurement>
+	max_usage	max_usage_in_<unit_of_measurement>
+	limit		limit_in_<unit_of_measurement>
+	failcnt		failcnt
+	lock		no file :)
+
+ b. Reading from file should show the corresponding field value in the
+    appropriate format.
+
+ c. Writing to file
+
+ 	Field		Expected behavior
+	----------------------------------
+	usage		prohibited
+	max_usage	reset to usage
+	limit		set the limit
+	failcnt		reset to zero
+
+
+
+5. Usage example
+
+ a. Declare a task group (take a look at cgroups subsystem for this) and
+    fold a res_counter into it
+
+	struct my_group {
+		struct res_counter res;
+
+		<other fields>
+	}
+
+ b. Put hooks in resource allocation/release paths
+
+ 	int alloc_something(...)
+	{
+		if (res_counter_charge(res_counter_ptr, amount) < 0)
+			return -ENOMEM;
+
+		<allocate the resource and return to the caller>
+	}
+
+	void release_something(...)
+	{
+		res_counter_uncharge(res_counter_ptr, amount);
+
+		<release the resource>
+	}
+
+    In order to keep the usage value self-consistent, both the
+    "res_counter_ptr" and the "amount" in release_something() should be
+    the same as they were in the alloc_something() when the releasing
+    resource was allocated.
+
+ c. Provide the way to read res_counter values and set them (the cgroups
+    still can help with it).
+
+ c. Compile and run :)
diff --git a/Documentation/cpu-freq/user-guide.txt b/Documentation/cpu-freq/user-guide.txt
index af3b925ece08..6c442d8426b5 100644
--- a/Documentation/cpu-freq/user-guide.txt
+++ b/Documentation/cpu-freq/user-guide.txt
@@ -154,6 +154,11 @@ scaling_governor,		and by "echoing" the name of another
 				that some governors won't load - they only
 				work on some specific architectures or
 				processors.
+
+cpuinfo_cur_freq :		Current speed of the CPU, in KHz.
+
+scaling_available_frequencies : List of available frequencies, in KHz.
+
 scaling_min_freq and
 scaling_max_freq		show the current "policy limits" (in
 				kHz). By echoing new values into these
@@ -162,6 +167,15 @@ scaling_max_freq		show the current "policy limits" (in
 				first set scaling_max_freq, then
 				scaling_min_freq.
 
+affected_cpus :			List of CPUs that require software coordination
+				of frequency.
+
+related_cpus :			List of CPUs that need some sort of frequency
+				coordination, whether software or hardware.
+
+scaling_driver :		Hardware driver for cpufreq.
+
+scaling_cur_freq :		Current frequency of the CPU, in KHz.
 
 If you have selected the "userspace" governor which allows you to
 set the CPU operating frequency to a specific value, you can read out
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index aa854b9b18cd..fb7b361e6eea 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -171,6 +171,7 @@ files describing that cpuset:
  - memory_migrate flag: if set, move pages to cpusets nodes
  - cpu_exclusive flag: is cpu placement exclusive?
  - mem_exclusive flag: is memory placement exclusive?
+ - mem_hardwall flag:  is memory allocation hardwalled
  - memory_pressure: measure of how much paging pressure in cpuset
 
 In addition, the root cpuset only has the following file:
@@ -222,17 +223,18 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than
 a direct ancestor or descendent, may share any of the same CPUs or
 Memory Nodes.
 
-A cpuset that is mem_exclusive restricts kernel allocations for
-page, buffer and other data commonly shared by the kernel across
-multiple users.  All cpusets, whether mem_exclusive or not, restrict
-allocations of memory for user space.  This enables configuring a
-system so that several independent jobs can share common kernel data,
-such as file system pages, while isolating each jobs user allocation in
-its own cpuset.  To do this, construct a large mem_exclusive cpuset to
-hold all the jobs, and construct child, non-mem_exclusive cpusets for
-each individual job.  Only a small amount of typical kernel memory,
-such as requests from interrupt handlers, is allowed to be taken
-outside even a mem_exclusive cpuset.
+A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users.  All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space.  This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset.  To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from
+interrupt handlers, is allowed to be taken outside even a
+mem_exclusive cpuset.
 
 
 1.5 What is memory_pressure ?
@@ -707,7 +709,7 @@ Now you want to do something with this cpuset.
 
 In this directory you can find several files:
 # ls
-cpus  cpu_exclusive  mems  mem_exclusive  tasks
+cpus  cpu_exclusive  mems  mem_exclusive mem_hardwall  tasks
 
 Reading them will give you information about the state of this cpuset:
 the CPUs and Memory Nodes it can use, the processes that are using
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e5f3d918316f..3ce193f86565 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -627,8 +627,7 @@ and is between 256 and 4096 characters. It is defined in the file
 	eata=		[HW,SCSI]
 
 	edd=		[EDD]
-			Format: {"of[f]" | "sk[ipmbr]"}
-			See comment in arch/i386/boot/edd.S
+			Format: {"off" | "on" | "skip[mbr]"}
 
 	eisa_irq_edge=	[PARISC,HW]
 			See header of drivers/parisc/eisa.c.
@@ -1389,6 +1388,13 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	nr_uarts=	[SERIAL] maximum number of UARTs to be registered.
 
+	olpc_ec_timeout= [OLPC] ms delay when issuing EC commands
+			Rather than timing out after 20 ms if an EC
+			command is not properly ACKed, override the length
+			of the timeout.  We have interrupts disabled while
+			waiting for the ACK, so if this is set too high
+			interrupts *may* be lost!
+
 	opl3=		[HW,OSS]
 			Format: <io>
 
diff --git a/Documentation/keys-request-key.txt b/Documentation/keys-request-key.txt
index 266955d23ee6..09b55e461740 100644
--- a/Documentation/keys-request-key.txt
+++ b/Documentation/keys-request-key.txt
@@ -11,26 +11,29 @@ request_key*():
 
 	struct key *request_key(const struct key_type *type,
 				const char *description,
-				const char *callout_string);
+				const char *callout_info);
 
 or:
 
 	struct key *request_key_with_auxdata(const struct key_type *type,
 					     const char *description,
-					     const char *callout_string,
+					     const char *callout_info,
+					     size_t callout_len,
 					     void *aux);
 
 or:
 
 	struct key *request_key_async(const struct key_type *type,
 				      const char *description,
-				      const char *callout_string);
+				      const char *callout_info,
+				      size_t callout_len);
 
 or:
 
 	struct key *request_key_async_with_auxdata(const struct key_type *type,
 						   const char *description,
-						   const char *callout_string,
+						   const char *callout_info,
+					     	   size_t callout_len,
 						   void *aux);
 
 Or by userspace invoking the request_key system call:
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 51652d39e61c..d5c7a57d1700 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -170,7 +170,8 @@ The key service provides a number of features besides keys:
      amount of description and payload space that can be consumed.
 
      The user can view information on this and other statistics through procfs
-     files.
+     files.  The root user may also alter the quota limits through sysctl files
+     (see the section "New procfs files").
 
      Process-specific and thread-specific keyrings are not counted towards a
      user's quota.
@@ -329,6 +330,27 @@ about the status of the key service:
 	<bytes>/<max>		Key size quota
 
 
+Four new sysctl files have been added also for the purpose of controlling the
+quota limits on keys:
+
+ (*) /proc/sys/kernel/keys/root_maxkeys
+     /proc/sys/kernel/keys/root_maxbytes
+
+     These files hold the maximum number of keys that root may have and the
+     maximum total number of bytes of data that root may have stored in those
+     keys.
+
+ (*) /proc/sys/kernel/keys/maxkeys
+     /proc/sys/kernel/keys/maxbytes
+
+     These files hold the maximum number of keys that each non-root user may
+     have and the maximum total number of bytes of data that each of those
+     users may have stored in their keys.
+
+Root may alter these by writing each new limit as a decimal number string to
+the appropriate file.
+
+
 ===============================
 USERSPACE SYSTEM CALL INTERFACE
 ===============================
@@ -711,6 +733,27 @@ The keyctl syscall functions are:
      The assumed authoritative key is inherited across fork and exec.
 
 
+ (*) Get the LSM security context attached to a key.
+
+	long keyctl(KEYCTL_GET_SECURITY, key_serial_t key, char *buffer,
+		    size_t buflen)
+
+     This function returns a string that represents the LSM security context
+     attached to a key in the buffer provided.
+
+     Unless there's an error, it always returns the amount of data it could
+     produce, even if that's too big for the buffer, but it won't copy more
+     than requested to userspace. If the buffer pointer is NULL then no copy
+     will take place.
+
+     A NUL character is included at the end of the string if the buffer is
+     sufficiently big.  This is included in the returned count.  If no LSM is
+     in force then an empty string will be returned.
+
+     A process must have view permission on the key for this function to be
+     successful.
+
+
 ===============
 KERNEL SERVICES
 ===============
@@ -771,7 +814,7 @@ payload contents" for more information.
 
 	struct key *request_key(const struct key_type *type,
 				const char *description,
-				const char *callout_string);
+				const char *callout_info);
 
     This is used to request a key or keyring with a description that matches
     the description specified according to the key type's match function. This
@@ -793,24 +836,28 @@ payload contents" for more information.
 
 	struct key *request_key_with_auxdata(const struct key_type *type,
 					     const char *description,
-					     const char *callout_string,
+					     const void *callout_info,
+					     size_t callout_len,
 					     void *aux);
 
     This is identical to request_key(), except that the auxiliary data is
-    passed to the key_type->request_key() op if it exists.
+    passed to the key_type->request_key() op if it exists, and the callout_info
+    is a blob of length callout_len, if given (the length may be 0).
 
 
 (*) A key can be requested asynchronously by calling one of:
 
 	struct key *request_key_async(const struct key_type *type,
 				      const char *description,
-				      const char *callout_string);
+				      const void *callout_info,
+				      size_t callout_len);
 
     or:
 
 	struct key *request_key_async_with_auxdata(const struct key_type *type,
 						   const char *description,
-						   const char *callout_string,
+						   const char *callout_info,
+					     	   size_t callout_len,
 					     	   void *aux);
 
     which are asynchronous equivalents of request_key() and
diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt
index 7f60dfe642ca..b152e81da592 100644
--- a/Documentation/oops-tracing.txt
+++ b/Documentation/oops-tracing.txt
@@ -253,6 +253,10 @@ characters, each representing a particular tainted value.
 
   8: 'D' if the kernel has died recently, i.e. there was an OOPS or BUG.
 
+  9: 'A' if the ACPI table has been overridden.
+
+ 10: 'W' if a warning has previously been issued by the kernel.
+
 The primary reason for the 'Tainted: ' string is to tell kernel
 debuggers if this is a clean kernel or if anything unusual has
 occurred.  Tainting is permanent: even if an offending module is
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index 10c8f6922ef4..5ce0952aa065 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -85,6 +85,8 @@ On all -  write a character to /proc/sysrq-trigger.  e.g.:
 'k'     - Secure Access Key (SAK) Kills all programs on the current virtual
           console. NOTE: See important comments below in SAK section.
 
+'l'     - Shows a stack backtrace for all active CPUs.
+
 'm'     - Will dump current memory info to your console.
 
 'n'	- Used to make RT tasks nice-able