summaryrefslogtreecommitdiff
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/DMA-API.txt69
-rw-r--r--Documentation/DMA-attributes.txt24
-rw-r--r--Documentation/cgroups.txt3
-rw-r--r--Documentation/controllers/devices.txt48
-rw-r--r--Documentation/controllers/resource_counter.txt181
-rw-r--r--Documentation/cpu-freq/user-guide.txt14
-rw-r--r--Documentation/cpusets.txt26
-rw-r--r--Documentation/kernel-parameters.txt10
-rw-r--r--Documentation/keys-request-key.txt11
-rw-r--r--Documentation/keys.txt59
-rw-r--r--Documentation/oops-tracing.txt4
-rw-r--r--Documentation/sysrq.txt2
12 files changed, 423 insertions, 28 deletions
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index b939ebb62871..80d150458c80 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -145,7 +145,7 @@ Part Ic - DMA addressing limitations
int
dma_supported(struct device *dev, u64 mask)
int
-pci_dma_supported(struct device *dev, u64 mask)
+pci_dma_supported(struct pci_dev *hwdev, u64 mask)
Checks to see if the device can support DMA to the memory described by
mask.
@@ -189,7 +189,7 @@ dma_addr_t
dma_map_single(struct device *dev, void *cpu_addr, size_t size,
enum dma_data_direction direction)
dma_addr_t
-pci_map_single(struct device *dev, void *cpu_addr, size_t size,
+pci_map_single(struct pci_dev *hwdev, void *cpu_addr, size_t size,
int direction)
Maps a piece of processor virtual memory so it can be accessed by the
@@ -395,6 +395,71 @@ Notes: You must do this:
See also dma_map_single().
+dma_addr_t
+dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+
+void
+dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+
+int
+dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+
+void
+dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+
+The four functions above are just like the counterpart functions
+without the _attrs suffixes, except that they pass an optional
+struct dma_attrs*.
+
+struct dma_attrs encapsulates a set of "dma attributes". For the
+definition of struct dma_attrs see linux/dma-attrs.h.
+
+The interpretation of dma attributes is architecture-specific, and
+each attribute should be documented in Documentation/DMA-attributes.txt.
+
+If struct dma_attrs* is NULL, the semantics of each of these
+functions is identical to those of the corresponding function
+without the _attrs suffix. As a result dma_map_single_attrs()
+can generally replace dma_map_single(), etc.
+
+As an example of the use of the *_attrs functions, here's how
+you could pass an attribute DMA_ATTR_FOO when mapping memory
+for DMA:
+
+#include <linux/dma-attrs.h>
+/* DMA_ATTR_FOO should be defined in linux/dma-attrs.h and
+ * documented in Documentation/DMA-attributes.txt */
+...
+
+ DEFINE_DMA_ATTRS(attrs);
+ dma_set_attr(DMA_ATTR_FOO, &attrs);
+ ....
+ n = dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, &attr);
+ ....
+
+Architectures that care about DMA_ATTR_FOO would check for its
+presence in their implementations of the mapping and unmapping
+routines, e.g.:
+
+void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ ....
+ int foo = dma_get_attr(DMA_ATTR_FOO, attrs);
+ ....
+ if (foo)
+ /* twizzle the frobnozzle */
+ ....
+
Part II - Advanced dma_ usage
-----------------------------
diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt
new file mode 100644
index 000000000000..6d772f84b477
--- /dev/null
+++ b/Documentation/DMA-attributes.txt
@@ -0,0 +1,24 @@
+ DMA attributes
+ ==============
+
+This document describes the semantics of the DMA attributes that are
+defined in linux/dma-attrs.h.
+
+DMA_ATTR_WRITE_BARRIER
+----------------------
+
+DMA_ATTR_WRITE_BARRIER is a (write) barrier attribute for DMA. DMA
+to a memory region with the DMA_ATTR_WRITE_BARRIER attribute forces
+all pending DMA writes to complete, and thus provides a mechanism to
+strictly order DMA from a device across all intervening busses and
+bridges. This barrier is not specific to a particular type of
+interconnect, it applies to the system as a whole, and so its
+implementation must account for the idiosyncracies of the system all
+the way from the DMA device to memory.
+
+As an example of a situation where DMA_ATTR_WRITE_BARRIER would be
+useful, suppose that a device does a DMA write to indicate that data is
+ready and available in memory. The DMA of the "completion indication"
+could race with data DMA. Mapping the memory used for completion
+indications with DMA_ATTR_WRITE_BARRIER would prevent the race.
+
diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt
index 31d12e21ff8a..c298a6690e0d 100644
--- a/Documentation/cgroups.txt
+++ b/Documentation/cgroups.txt
@@ -500,8 +500,7 @@ post-attachment activity that requires memory allocations or blocking.
void fork(struct cgroup_subsy *ss, struct task_struct *task)
-Called when a task is forked into a cgroup. Also called during
-registration for all existing tasks.
+Called when a task is forked into a cgroup.
void exit(struct cgroup_subsys *ss, struct task_struct *task)
diff --git a/Documentation/controllers/devices.txt b/Documentation/controllers/devices.txt
new file mode 100644
index 000000000000..4dcea42432c2
--- /dev/null
+++ b/Documentation/controllers/devices.txt
@@ -0,0 +1,48 @@
+Device Whitelist Controller
+
+1. Description:
+
+Implement a cgroup to track and enforce open and mknod restrictions
+on device files. A device cgroup associates a device access
+whitelist with each cgroup. A whitelist entry has 4 fields.
+'type' is a (all), c (char), or b (block). 'all' means it applies
+to all types and all major and minor numbers. Major and minor are
+either an integer or * for all. Access is a composition of r
+(read), w (write), and m (mknod).
+
+The root device cgroup starts with rwm to 'all'. A child device
+cgroup gets a copy of the parent. Administrators can then remove
+devices from the whitelist or add new entries. A child cgroup can
+never receive a device access which is denied its parent. However
+when a device access is removed from a parent it will not also be
+removed from the child(ren).
+
+2. User Interface
+
+An entry is added using devices.allow, and removed using
+devices.deny. For instance
+
+ echo 'c 1:3 mr' > /cgroups/1/devices.allow
+
+allows cgroup 1 to read and mknod the device usually known as
+/dev/null. Doing
+
+ echo a > /cgroups/1/devices.deny
+
+will remove the default 'a *:* mrw' entry.
+
+3. Security
+
+Any task can move itself between cgroups. This clearly won't
+suffice, but we can decide the best way to adequately restrict
+movement as people get some experience with this. We may just want
+to require CAP_SYS_ADMIN, which at least is a separate bit from
+CAP_MKNOD. We may want to just refuse moving to a cgroup which
+isn't a descendent of the current one. Or we may want to use
+CAP_MAC_ADMIN, since we really are trying to lock down root.
+
+CAP_SYS_ADMIN is needed to modify the whitelist or move another
+task to a new cgroup. (Again we'll probably want to change that).
+
+A cgroup may not be granted more permissions than the cgroup's
+parent has.
diff --git a/Documentation/controllers/resource_counter.txt b/Documentation/controllers/resource_counter.txt
new file mode 100644
index 000000000000..f196ac1d7d25
--- /dev/null
+++ b/Documentation/controllers/resource_counter.txt
@@ -0,0 +1,181 @@
+
+ The Resource Counter
+
+The resource counter, declared at include/linux/res_counter.h,
+is supposed to facilitate the resource management by controllers
+by providing common stuff for accounting.
+
+This "stuff" includes the res_counter structure and routines
+to work with it.
+
+
+
+1. Crucial parts of the res_counter structure
+
+ a. unsigned long long usage
+
+ The usage value shows the amount of a resource that is consumed
+ by a group at a given time. The units of measurement should be
+ determined by the controller that uses this counter. E.g. it can
+ be bytes, items or any other unit the controller operates on.
+
+ b. unsigned long long max_usage
+
+ The maximal value of the usage over time.
+
+ This value is useful when gathering statistical information about
+ the particular group, as it shows the actual resource requirements
+ for a particular group, not just some usage snapshot.
+
+ c. unsigned long long limit
+
+ The maximal allowed amount of resource to consume by the group. In
+ case the group requests for more resources, so that the usage value
+ would exceed the limit, the resource allocation is rejected (see
+ the next section).
+
+ d. unsigned long long failcnt
+
+ The failcnt stands for "failures counter". This is the number of
+ resource allocation attempts that failed.
+
+ c. spinlock_t lock
+
+ Protects changes of the above values.
+
+
+
+2. Basic accounting routines
+
+ a. void res_counter_init(struct res_counter *rc)
+
+ Initializes the resource counter. As usual, should be the first
+ routine called for a new counter.
+
+ b. int res_counter_charge[_locked]
+ (struct res_counter *rc, unsigned long val)
+
+ When a resource is about to be allocated it has to be accounted
+ with the appropriate resource counter (controller should determine
+ which one to use on its own). This operation is called "charging".
+
+ This is not very important which operation - resource allocation
+ or charging - is performed first, but
+ * if the allocation is performed first, this may create a
+ temporary resource over-usage by the time resource counter is
+ charged;
+ * if the charging is performed first, then it should be uncharged
+ on error path (if the one is called).
+
+ c. void res_counter_uncharge[_locked]
+ (struct res_counter *rc, unsigned long val)
+
+ When a resource is released (freed) it should be de-accounted
+ from the resource counter it was accounted to. This is called
+ "uncharging".
+
+ The _locked routines imply that the res_counter->lock is taken.
+
+
+ 2.1 Other accounting routines
+
+ There are more routines that may help you with common needs, like
+ checking whether the limit is reached or resetting the max_usage
+ value. They are all declared in include/linux/res_counter.h.
+
+
+
+3. Analyzing the resource counter registrations
+
+ a. If the failcnt value constantly grows, this means that the counter's
+ limit is too tight. Either the group is misbehaving and consumes too
+ many resources, or the configuration is not suitable for the group
+ and the limit should be increased.
+
+ b. The max_usage value can be used to quickly tune the group. One may
+ set the limits to maximal values and either load the container with
+ a common pattern or leave one for a while. After this the max_usage
+ value shows the amount of memory the container would require during
+ its common activity.
+
+ Setting the limit a bit above this value gives a pretty good
+ configuration that works in most of the cases.
+
+ c. If the max_usage is much less than the limit, but the failcnt value
+ is growing, then the group tries to allocate a big chunk of resource
+ at once.
+
+ d. If the max_usage is much less than the limit, but the failcnt value
+ is 0, then this group is given too high limit, that it does not
+ require. It is better to lower the limit a bit leaving more resource
+ for other groups.
+
+
+
+4. Communication with the control groups subsystem (cgroups)
+
+All the resource controllers that are using cgroups and resource counters
+should provide files (in the cgroup filesystem) to work with the resource
+counter fields. They are recommended to adhere to the following rules:
+
+ a. File names
+
+ Field name File name
+ ---------------------------------------------------
+ usage usage_in_<unit_of_measurement>
+ max_usage max_usage_in_<unit_of_measurement>
+ limit limit_in_<unit_of_measurement>
+ failcnt failcnt
+ lock no file :)
+
+ b. Reading from file should show the corresponding field value in the
+ appropriate format.
+
+ c. Writing to file
+
+ Field Expected behavior
+ ----------------------------------
+ usage prohibited
+ max_usage reset to usage
+ limit set the limit
+ failcnt reset to zero
+
+
+
+5. Usage example
+
+ a. Declare a task group (take a look at cgroups subsystem for this) and
+ fold a res_counter into it
+
+ struct my_group {
+ struct res_counter res;
+
+ <other fields>
+ }
+
+ b. Put hooks in resource allocation/release paths
+
+ int alloc_something(...)
+ {
+ if (res_counter_charge(res_counter_ptr, amount) < 0)
+ return -ENOMEM;
+
+ <allocate the resource and return to the caller>
+ }
+
+ void release_something(...)
+ {
+ res_counter_uncharge(res_counter_ptr, amount);
+
+ <release the resource>
+ }
+
+ In order to keep the usage value self-consistent, both the
+ "res_counter_ptr" and the "amount" in release_something() should be
+ the same as they were in the alloc_something() when the releasing
+ resource was allocated.
+
+ c. Provide the way to read res_counter values and set them (the cgroups
+ still can help with it).
+
+ c. Compile and run :)
diff --git a/Documentation/cpu-freq/user-guide.txt b/Documentation/cpu-freq/user-guide.txt
index af3b925ece08..6c442d8426b5 100644
--- a/Documentation/cpu-freq/user-guide.txt
+++ b/Documentation/cpu-freq/user-guide.txt
@@ -154,6 +154,11 @@ scaling_governor, and by "echoing" the name of another
that some governors won't load - they only
work on some specific architectures or
processors.
+
+cpuinfo_cur_freq : Current speed of the CPU, in KHz.
+
+scaling_available_frequencies : List of available frequencies, in KHz.
+
scaling_min_freq and
scaling_max_freq show the current "policy limits" (in
kHz). By echoing new values into these
@@ -162,6 +167,15 @@ scaling_max_freq show the current "policy limits" (in
first set scaling_max_freq, then
scaling_min_freq.
+affected_cpus : List of CPUs that require software coordination
+ of frequency.
+
+related_cpus : List of CPUs that need some sort of frequency
+ coordination, whether software or hardware.
+
+scaling_driver : Hardware driver for cpufreq.
+
+scaling_cur_freq : Current frequency of the CPU, in KHz.
If you have selected the "userspace" governor which allows you to
set the CPU operating frequency to a specific value, you can read out
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index aa854b9b18cd..fb7b361e6eea 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -171,6 +171,7 @@ files describing that cpuset:
- memory_migrate flag: if set, move pages to cpusets nodes
- cpu_exclusive flag: is cpu placement exclusive?
- mem_exclusive flag: is memory placement exclusive?
+ - mem_hardwall flag: is memory allocation hardwalled
- memory_pressure: measure of how much paging pressure in cpuset
In addition, the root cpuset only has the following file:
@@ -222,17 +223,18 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than
a direct ancestor or descendent, may share any of the same CPUs or
Memory Nodes.
-A cpuset that is mem_exclusive restricts kernel allocations for
-page, buffer and other data commonly shared by the kernel across
-multiple users. All cpusets, whether mem_exclusive or not, restrict
-allocations of memory for user space. This enables configuring a
-system so that several independent jobs can share common kernel data,
-such as file system pages, while isolating each jobs user allocation in
-its own cpuset. To do this, construct a large mem_exclusive cpuset to
-hold all the jobs, and construct child, non-mem_exclusive cpusets for
-each individual job. Only a small amount of typical kernel memory,
-such as requests from interrupt handlers, is allowed to be taken
-outside even a mem_exclusive cpuset.
+A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users. All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space. This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset. To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from
+interrupt handlers, is allowed to be taken outside even a
+mem_exclusive cpuset.
1.5 What is memory_pressure ?
@@ -707,7 +709,7 @@ Now you want to do something with this cpuset.
In this directory you can find several files:
# ls
-cpus cpu_exclusive mems mem_exclusive tasks
+cpus cpu_exclusive mems mem_exclusive mem_hardwall tasks
Reading them will give you information about the state of this cpuset:
the CPUs and Memory Nodes it can use, the processes that are using
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e5f3d918316f..3ce193f86565 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -627,8 +627,7 @@ and is between 256 and 4096 characters. It is defined in the file
eata= [HW,SCSI]
edd= [EDD]
- Format: {"of[f]" | "sk[ipmbr]"}
- See comment in arch/i386/boot/edd.S
+ Format: {"off" | "on" | "skip[mbr]"}
eisa_irq_edge= [PARISC,HW]
See header of drivers/parisc/eisa.c.
@@ -1389,6 +1388,13 @@ and is between 256 and 4096 characters. It is defined in the file
nr_uarts= [SERIAL] maximum number of UARTs to be registered.
+ olpc_ec_timeout= [OLPC] ms delay when issuing EC commands
+ Rather than timing out after 20 ms if an EC
+ command is not properly ACKed, override the length
+ of the timeout. We have interrupts disabled while
+ waiting for the ACK, so if this is set too high
+ interrupts *may* be lost!
+
opl3= [HW,OSS]
Format: <io>
diff --git a/Documentation/keys-request-key.txt b/Documentation/keys-request-key.txt
index 266955d23ee6..09b55e461740 100644
--- a/Documentation/keys-request-key.txt
+++ b/Documentation/keys-request-key.txt
@@ -11,26 +11,29 @@ request_key*():
struct key *request_key(const struct key_type *type,
const char *description,
- const char *callout_string);
+ const char *callout_info);
or:
struct key *request_key_with_auxdata(const struct key_type *type,
const char *description,
- const char *callout_string,
+ const char *callout_info,
+ size_t callout_len,
void *aux);
or:
struct key *request_key_async(const struct key_type *type,
const char *description,
- const char *callout_string);
+ const char *callout_info,
+ size_t callout_len);
or:
struct key *request_key_async_with_auxdata(const struct key_type *type,
const char *description,
- const char *callout_string,
+ const char *callout_info,
+ size_t callout_len,
void *aux);
Or by userspace invoking the request_key system call:
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 51652d39e61c..d5c7a57d1700 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -170,7 +170,8 @@ The key service provides a number of features besides keys:
amount of description and payload space that can be consumed.
The user can view information on this and other statistics through procfs
- files.
+ files. The root user may also alter the quota limits through sysctl files
+ (see the section "New procfs files").
Process-specific and thread-specific keyrings are not counted towards a
user's quota.
@@ -329,6 +330,27 @@ about the status of the key service:
<bytes>/<max> Key size quota
+Four new sysctl files have been added also for the purpose of controlling the
+quota limits on keys:
+
+ (*) /proc/sys/kernel/keys/root_maxkeys
+ /proc/sys/kernel/keys/root_maxbytes
+
+ These files hold the maximum number of keys that root may have and the
+ maximum total number of bytes of data that root may have stored in those
+ keys.
+
+ (*) /proc/sys/kernel/keys/maxkeys
+ /proc/sys/kernel/keys/maxbytes
+
+ These files hold the maximum number of keys that each non-root user may
+ have and the maximum total number of bytes of data that each of those
+ users may have stored in their keys.
+
+Root may alter these by writing each new limit as a decimal number string to
+the appropriate file.
+
+
===============================
USERSPACE SYSTEM CALL INTERFACE
===============================
@@ -711,6 +733,27 @@ The keyctl syscall functions are:
The assumed authoritative key is inherited across fork and exec.
+ (*) Get the LSM security context attached to a key.
+
+ long keyctl(KEYCTL_GET_SECURITY, key_serial_t key, char *buffer,
+ size_t buflen)
+
+ This function returns a string that represents the LSM security context
+ attached to a key in the buffer provided.
+
+ Unless there's an error, it always returns the amount of data it could
+ produce, even if that's too big for the buffer, but it won't copy more
+ than requested to userspace. If the buffer pointer is NULL then no copy
+ will take place.
+
+ A NUL character is included at the end of the string if the buffer is
+ sufficiently big. This is included in the returned count. If no LSM is
+ in force then an empty string will be returned.
+
+ A process must have view permission on the key for this function to be
+ successful.
+
+
===============
KERNEL SERVICES
===============
@@ -771,7 +814,7 @@ payload contents" for more information.
struct key *request_key(const struct key_type *type,
const char *description,
- const char *callout_string);
+ const char *callout_info);
This is used to request a key or keyring with a description that matches
the description specified according to the key type's match function. This
@@ -793,24 +836,28 @@ payload contents" for more information.
struct key *request_key_with_auxdata(const struct key_type *type,
const char *description,
- const char *callout_string,
+ const void *callout_info,
+ size_t callout_len,
void *aux);
This is identical to request_key(), except that the auxiliary data is
- passed to the key_type->request_key() op if it exists.
+ passed to the key_type->request_key() op if it exists, and the callout_info
+ is a blob of length callout_len, if given (the length may be 0).
(*) A key can be requested asynchronously by calling one of:
struct key *request_key_async(const struct key_type *type,
const char *description,
- const char *callout_string);
+ const void *callout_info,
+ size_t callout_len);
or:
struct key *request_key_async_with_auxdata(const struct key_type *type,
const char *description,
- const char *callout_string,
+ const char *callout_info,
+ size_t callout_len,
void *aux);
which are asynchronous equivalents of request_key() and
diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt
index 7f60dfe642ca..b152e81da592 100644
--- a/Documentation/oops-tracing.txt
+++ b/Documentation/oops-tracing.txt
@@ -253,6 +253,10 @@ characters, each representing a particular tainted value.
8: 'D' if the kernel has died recently, i.e. there was an OOPS or BUG.
+ 9: 'A' if the ACPI table has been overridden.
+
+ 10: 'W' if a warning has previously been issued by the kernel.
+
The primary reason for the 'Tainted: ' string is to tell kernel
debuggers if this is a clean kernel or if anything unusual has
occurred. Tainting is permanent: even if an offending module is
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index 10c8f6922ef4..5ce0952aa065 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -85,6 +85,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
'k' - Secure Access Key (SAK) Kills all programs on the current virtual
console. NOTE: See important comments below in SAK section.
+'l' - Shows a stack backtrace for all active CPUs.
+
'm' - Will dump current memory info to your console.
'n' - Used to make RT tasks nice-able