From 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Apr 2005 15:20:36 -0700 Subject: Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! --- Documentation/DocBook/Makefile | 195 +++ Documentation/DocBook/deviceiobook.tmpl | 341 ++++ Documentation/DocBook/gadget.tmpl | 752 +++++++++ Documentation/DocBook/journal-api.tmpl | 333 ++++ Documentation/DocBook/kernel-api.tmpl | 342 ++++ Documentation/DocBook/kernel-hacking.tmpl | 1349 ++++++++++++++++ Documentation/DocBook/kernel-locking.tmpl | 2088 +++++++++++++++++++++++++ Documentation/DocBook/libata.tmpl | 282 ++++ Documentation/DocBook/librs.tmpl | 289 ++++ Documentation/DocBook/lsm.tmpl | 265 ++++ Documentation/DocBook/man/Makefile | 3 + Documentation/DocBook/mcabook.tmpl | 107 ++ Documentation/DocBook/mtdnand.tmpl | 1320 ++++++++++++++++ Documentation/DocBook/procfs-guide.tmpl | 591 +++++++ Documentation/DocBook/procfs_example.c | 224 +++ Documentation/DocBook/scsidrivers.tmpl | 193 +++ Documentation/DocBook/sis900.tmpl | 585 +++++++ Documentation/DocBook/tulip-user.tmpl | 327 ++++ Documentation/DocBook/usb.tmpl | 979 ++++++++++++ Documentation/DocBook/via-audio.tmpl | 597 +++++++ Documentation/DocBook/videobook.tmpl | 1663 ++++++++++++++++++++ Documentation/DocBook/wanbook.tmpl | 99 ++ Documentation/DocBook/writing_usb_driver.tmpl | 419 +++++ Documentation/DocBook/z8530book.tmpl | 385 +++++ 24 files changed, 13728 insertions(+) create mode 100644 Documentation/DocBook/Makefile create mode 100644 Documentation/DocBook/deviceiobook.tmpl create mode 100644 Documentation/DocBook/gadget.tmpl create mode 100644 Documentation/DocBook/journal-api.tmpl create mode 100644 Documentation/DocBook/kernel-api.tmpl create mode 100644 Documentation/DocBook/kernel-hacking.tmpl create mode 100644 Documentation/DocBook/kernel-locking.tmpl create mode 100644 Documentation/DocBook/libata.tmpl create mode 100644 Documentation/DocBook/librs.tmpl create mode 100644 Documentation/DocBook/lsm.tmpl create mode 100644 Documentation/DocBook/man/Makefile create mode 100644 Documentation/DocBook/mcabook.tmpl create mode 100644 Documentation/DocBook/mtdnand.tmpl create mode 100644 Documentation/DocBook/procfs-guide.tmpl create mode 100644 Documentation/DocBook/procfs_example.c create mode 100644 Documentation/DocBook/scsidrivers.tmpl create mode 100644 Documentation/DocBook/sis900.tmpl create mode 100644 Documentation/DocBook/tulip-user.tmpl create mode 100644 Documentation/DocBook/usb.tmpl create mode 100644 Documentation/DocBook/via-audio.tmpl create mode 100644 Documentation/DocBook/videobook.tmpl create mode 100644 Documentation/DocBook/wanbook.tmpl create mode 100644 Documentation/DocBook/writing_usb_driver.tmpl create mode 100644 Documentation/DocBook/z8530book.tmpl (limited to 'Documentation/DocBook') diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile new file mode 100644 index 000000000000..a221039ee4c9 --- /dev/null +++ b/Documentation/DocBook/Makefile @@ -0,0 +1,195 @@ +### +# This makefile is used to generate the kernel documentation, +# primarily based on in-line comments in various source files. +# See Documentation/kernel-doc-nano-HOWTO.txt for instruction in how +# to ducument the SRC - and how to read it. +# To add a new book the only step required is to add the book to the +# list of DOCBOOKS. + +DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml videobook.xml \ + kernel-hacking.xml kernel-locking.xml via-audio.xml \ + deviceiobook.xml procfs-guide.xml tulip-user.xml \ + writing_usb_driver.xml scsidrivers.xml sis900.xml \ + kernel-api.xml journal-api.xml lsm.xml usb.xml \ + gadget.xml libata.xml mtdnand.xml librs.xml + +### +# The build process is as follows (targets): +# (xmldocs) +# file.tmpl --> file.xml +--> file.ps (psdocs) +# +--> file.pdf (pdfdocs) +# +--> DIR=file (htmldocs) +# +--> man/ (mandocs) + +### +# The targets that may be used. +.PHONY: xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs + +BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) +xmldocs: $(BOOKS) +sgmldocs: xmldocs + +PS := $(patsubst %.xml, %.ps, $(BOOKS)) +psdocs: $(PS) + +PDF := $(patsubst %.xml, %.pdf, $(BOOKS)) +pdfdocs: $(PDF) + +HTML := $(patsubst %.xml, %.html, $(BOOKS)) +htmldocs: $(HTML) + +MAN := $(patsubst %.xml, %.9, $(BOOKS)) +mandocs: $(MAN) + +installmandocs: mandocs + $(MAKEMAN) install Documentation/DocBook/man + +### +#External programs used +KERNELDOC = scripts/kernel-doc +DOCPROC = scripts/basic/docproc +SPLITMAN = $(PERL) $(srctree)/scripts/split-man +MAKEMAN = $(PERL) $(srctree)/scripts/makeman + +### +# DOCPROC is used for two purposes: +# 1) To generate a dependency list for a .tmpl file +# 2) To preprocess a .tmpl file and call kernel-doc with +# appropriate parameters. +# The following rules are used to generate the .xml documentation +# required to generate the final targets. (ps, pdf, html). +quiet_cmd_docproc = DOCPROC $@ + cmd_docproc = SRCTREE=$(srctree)/ $(DOCPROC) doc $< >$@ +define rule_docproc + set -e; \ + $(if $($(quiet)cmd_$(1)),echo ' $($(quiet)cmd_$(1))';) \ + $(cmd_$(1)); \ + ( \ + echo 'cmd_$@ := $(cmd_$(1))'; \ + echo $@: `SRCTREE=$(srctree) $(DOCPROC) depend $<`; \ + ) > $(dir $@).$(notdir $@).cmd +endef + +%.xml: %.tmpl FORCE + $(call if_changed_rule,docproc) + +### +#Read in all saved dependency files +cmd_files := $(wildcard $(foreach f,$(BOOKS),$(dir $(f)).$(notdir $(f)).cmd)) + +ifneq ($(cmd_files),) + include $(cmd_files) +endif + +### +# Changes in kernel-doc force a rebuild of all documentation +$(BOOKS): $(KERNELDOC) + +### +# procfs guide uses a .c file as example code. +# This requires an explicit dependency +C-procfs-example = procfs_example.xml +C-procfs-example2 = $(addprefix $(obj)/,$(C-procfs-example)) +$(obj)/procfs-guide.xml: $(C-procfs-example2) + +### +# Rules to generate postscript, PDF and HTML +# db2html creates a directory. Generate a html file used for timestamp + +quiet_cmd_db2ps = DB2PS $@ + cmd_db2ps = db2ps -o $(dir $@) $< +%.ps : %.xml + @(which db2ps > /dev/null 2>&1) || \ + (echo "*** You need to install DocBook stylesheets ***"; \ + exit 1) + $(call cmd,db2ps) + +quiet_cmd_db2pdf = DB2PDF $@ + cmd_db2pdf = db2pdf -o $(dir $@) $< +%.pdf : %.xml + @(which db2pdf > /dev/null 2>&1) || \ + (echo "*** You need to install DocBook stylesheets ***"; \ + exit 1) + $(call cmd,db2pdf) + +quiet_cmd_db2html = DB2HTML $@ + cmd_db2html = db2html -o $(patsubst %.html,%,$@) $< && \ + echo ' \ + Goto $(patsubst %.html,%,$(notdir $@))

' > $@ + +%.html: %.xml + @(which db2html > /dev/null 2>&1) || \ + (echo "*** You need to install DocBook stylesheets ***"; \ + exit 1) + @rm -rf $@ $(patsubst %.html,%,$@) + $(call cmd,db2html) + @if [ ! -z "$(PNG-$(basename $(notdir $@)))" ]; then \ + cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi + +### +# Rule to generate man files - output is placed in the man subdirectory + +%.9: %.xml +ifneq ($(KBUILD_SRC),) + $(Q)mkdir -p $(objtree)/Documentation/DocBook/man +endif + $(SPLITMAN) $< $(objtree)/Documentation/DocBook/man "$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)" + $(MAKEMAN) convert $(objtree)/Documentation/DocBook/man $< + +### +# Rules to generate postscripts and PNG imgages from .fig format files +quiet_cmd_fig2eps = FIG2EPS $@ + cmd_fig2eps = fig2dev -Leps $< $@ + +%.eps: %.fig + @(which fig2dev > /dev/null 2>&1) || \ + (echo "*** You need to install transfig ***"; \ + exit 1) + $(call cmd,fig2eps) + +quiet_cmd_fig2png = FIG2PNG $@ + cmd_fig2png = fig2dev -Lpng $< $@ + +%.png: %.fig + @(which fig2dev > /dev/null 2>&1) || \ + (echo "*** You need to install transfig ***"; \ + exit 1) + $(call cmd,fig2png) + +### +# Rule to convert a .c file to inline XML documentation +%.xml: %.c + @echo ' GEN $@' + @( \ + echo ""; \ + expand --tabs=8 < $< | \ + sed -e "s/&/\\&/g" \ + -e "s//\\>/g"; \ + echo "") > $@ + +### +# Help targets as used by the top-level makefile +dochelp: + @echo ' Linux kernel internal documentation in different formats:' + @echo ' xmldocs (XML DocBook), psdocs (Postscript), pdfdocs (PDF)' + @echo ' htmldocs (HTML), mandocs (man pages, use installmandocs to install)' + +### +# Temporary files left by various tools +clean-files := $(DOCBOOKS) \ + $(patsubst %.xml, %.dvi, $(DOCBOOKS)) \ + $(patsubst %.xml, %.aux, $(DOCBOOKS)) \ + $(patsubst %.xml, %.tex, $(DOCBOOKS)) \ + $(patsubst %.xml, %.log, $(DOCBOOKS)) \ + $(patsubst %.xml, %.out, $(DOCBOOKS)) \ + $(patsubst %.xml, %.ps, $(DOCBOOKS)) \ + $(patsubst %.xml, %.pdf, $(DOCBOOKS)) \ + $(patsubst %.xml, %.html, $(DOCBOOKS)) \ + $(patsubst %.xml, %.9, $(DOCBOOKS)) \ + $(C-procfs-example) + +clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) + +#man put files in man subdir - traverse down +subdir- := man/ diff --git a/Documentation/DocBook/deviceiobook.tmpl b/Documentation/DocBook/deviceiobook.tmpl new file mode 100644 index 000000000000..6f41f2f5c6f6 --- /dev/null +++ b/Documentation/DocBook/deviceiobook.tmpl @@ -0,0 +1,341 @@ + + + + + + Bus-Independent Device Accesses + + + + Matthew + Wilcox + +

+ matthew@wil.cx +
+ + + + + + + Alan + Cox + +
+ alan@redhat.com +
+
+
+
+ + + 2001 + Matthew Wilcox + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + + + + + + + Introduction + + Linux provides an API which abstracts performing IO across all busses + and devices, allowing device drivers to be written independently of + bus type. + + + + + Known Bugs And Assumptions + + None. + + + + + Memory Mapped IO + + Getting Access to the Device + + The most widely supported form of IO is memory mapped IO. + That is, a part of the CPU's address space is interpreted + not as accesses to memory, but as accesses to a device. Some + architectures define devices to be at a fixed address, but most + have some method of discovering devices. The PCI bus walk is a + good example of such a scheme. This document does not cover how + to receive such an address, but assumes you are starting with one. + Physical addresses are of type unsigned long. + + + + This address should not be used directly. Instead, to get an + address suitable for passing to the accessor functions described + below, you should call ioremap. + An address suitable for accessing the device will be returned to you. + + + + After you've finished using the device (say, in your module's + exit routine), call iounmap in order to return + the address space to the kernel. Most architectures allocate new + address space each time you call ioremap, and + they can run out unless you call iounmap. + + + + + Accessing the device + + The part of the interface most used by drivers is reading and + writing memory-mapped registers on the device. Linux provides + interfaces to read and write 8-bit, 16-bit, 32-bit and 64-bit + quantities. Due to a historical accident, these are named byte, + word, long and quad accesses. Both read and write accesses are + supported; there is no prefetch support at this time. + + + + The functions are named readb, + readw, readl, + readq, readb_relaxed, + readw_relaxed, readl_relaxed, + readq_relaxed, writeb, + writew, writel and + writeq. + + + + Some devices (such as framebuffers) would like to use larger + transfers than 8 bytes at a time. For these devices, the + memcpy_toio, memcpy_fromio + and memset_io functions are provided. + Do not use memset or memcpy on IO addresses; they + are not guaranteed to copy data in order. + + + + The read and write functions are defined to be ordered. That is the + compiler is not permitted to reorder the I/O sequence. When the + ordering can be compiler optimised, you can use + __readb and friends to indicate the relaxed ordering. Use + this with care. + + + + While the basic functions are defined to be synchronous with respect + to each other and ordered with respect to each other the busses the + devices sit on may themselves have asynchronicity. In particular many + authors are burned by the fact that PCI bus writes are posted + asynchronously. A driver author must issue a read from the same + device to ensure that writes have occurred in the specific cases the + author cares. This kind of property cannot be hidden from driver + writers in the API. In some cases, the read used to flush the device + may be expected to fail (if the card is resetting, for example). In + that case, the read should be done from config space, which is + guaranteed to soft-fail if the card doesn't respond. + + + + The following is an example of flushing a write to a device when + the driver would like to ensure the write's effects are visible prior + to continuing execution. + + + +static inline void +qla1280_disable_intrs(struct scsi_qla_host *ha) +{ + struct device_reg *reg; + + reg = ha->iobase; + /* disable risc and host interrupts */ + WRT_REG_WORD(&reg->ictrl, 0); + /* + * The following read will ensure that the above write + * has been received by the device before we return from this + * function. + */ + RD_REG_WORD(&reg->ictrl); + ha->flags.ints_enabled = 0; +} + + + + In addition to write posting, on some large multiprocessing systems + (e.g. SGI Challenge, Origin and Altix machines) posted writes won't + be strongly ordered coming from different CPUs. Thus it's important + to properly protect parts of your driver that do memory-mapped writes + with locks and use the mmiowb to make sure they + arrive in the order intended. Issuing a regular readX + will also ensure write ordering, but should only be used + when the driver has to be sure that the write has actually arrived + at the device (not that it's simply ordered with respect to other + writes), since a full readX is a relatively + expensive operation. + + + + Generally, one should use mmiowb prior to + releasing a spinlock that protects regions using writeb + or similar functions that aren't surrounded by + readb calls, which will ensure ordering and flushing. The + following pseudocode illustrates what might occur if write ordering + isn't guaranteed via mmiowb or one of the + readX functions. + + + +CPU A: spin_lock_irqsave(&dev_lock, flags) +CPU A: ... +CPU A: writel(newval, ring_ptr); +CPU A: spin_unlock_irqrestore(&dev_lock, flags) + ... +CPU B: spin_lock_irqsave(&dev_lock, flags) +CPU B: writel(newval2, ring_ptr); +CPU B: ... +CPU B: spin_unlock_irqrestore(&dev_lock, flags) + + + + In the case above, newval2 could be written to ring_ptr before + newval. Fixing it is easy though: + + + +CPU A: spin_lock_irqsave(&dev_lock, flags) +CPU A: ... +CPU A: writel(newval, ring_ptr); +CPU A: mmiowb(); /* ensure no other writes beat us to the device */ +CPU A: spin_unlock_irqrestore(&dev_lock, flags) + ... +CPU B: spin_lock_irqsave(&dev_lock, flags) +CPU B: writel(newval2, ring_ptr); +CPU B: ... +CPU B: mmiowb(); +CPU B: spin_unlock_irqrestore(&dev_lock, flags) + + + + See tg3.c for a real world example of how to use mmiowb + + + + + PCI ordering rules also guarantee that PIO read responses arrive + after any outstanding DMA writes from that bus, since for some devices + the result of a readb call may signal to the + driver that a DMA transaction is complete. In many cases, however, + the driver may want to indicate that the next + readb call has no relation to any previous DMA + writes performed by the device. The driver can use + readb_relaxed for these cases, although only + some platforms will honor the relaxed semantics. Using the relaxed + read functions will provide significant performance benefits on + platforms that support it. The qla2xxx driver provides examples + of how to use readX_relaxed. In many cases, + a majority of the driver's readX calls can + safely be converted to readX_relaxed calls, since + only a few will indicate or depend on DMA completion. + + + + + ISA legacy functions + + On older kernels (2.2 and earlier) the ISA bus could be read or + written with these functions and without ioremap being used. This is + no longer true in Linux 2.4. A set of equivalent functions exist for + easy legacy driver porting. The functions available are prefixed + with 'isa_' and are isa_readb, + isa_writeb, isa_readw, + isa_writew, isa_readl, + isa_writel, isa_memcpy_fromio + and isa_memcpy_toio + + + These functions should not be used in new drivers, and will + eventually be going away. + + + + + + + Port Space Accesses + + Port Space Explained + + + Another form of IO commonly supported is Port Space. This is a + range of addresses separate to the normal memory address space. + Access to these addresses is generally not as fast as accesses + to the memory mapped addresses, and it also has a potentially + smaller address space. + + + + Unlike memory mapped IO, no preparation is required + to access port space. + + + + + Accessing Port Space + + Accesses to this space are provided through a set of functions + which allow 8-bit, 16-bit and 32-bit accesses; also + known as byte, word and long. These functions are + inb, inw, + inl, outb, + outw and outl. + + + + Some variants are provided for these functions. Some devices + require that accesses to their ports are slowed down. This + functionality is provided by appending a _p + to the end of the function. There are also equivalents to memcpy. + The ins and outs + functions copy bytes, words or longs to the given port. + + + + + + + Public Functions Provided +!Einclude/asm-i386/io.h + + + diff --git a/Documentation/DocBook/gadget.tmpl b/Documentation/DocBook/gadget.tmpl new file mode 100644 index 000000000000..a34442436128 --- /dev/null +++ b/Documentation/DocBook/gadget.tmpl @@ -0,0 +1,752 @@ + + + + + + USB Gadget API for Linux + 20 August 2004 + 20 August 2004 + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + + + 2003-2004 + David Brownell + + + + David + Brownell + +
dbrownell@users.sourceforge.net
+
+
+
+ + + +Introduction + +This document presents a Linux-USB "Gadget" +kernel mode +API, for use within peripherals and other USB devices +that embed Linux. +It provides an overview of the API structure, +and shows how that fits into a system development project. +This is the first such API released on Linux to address +a number of important problems, including: + + + Supports USB 2.0, for high speed devices which + can stream data at several dozen megabytes per second. + + Handles devices with dozens of endpoints just as + well as ones with just two fixed-function ones. Gadget drivers + can be written so they're easy to port to new hardware. + + Flexible enough to expose more complex USB device + capabilities such as multiple configurations, multiple interfaces, + composite devices, + and alternate interface settings. + + USB "On-The-Go" (OTG) support, in conjunction + with updates to the Linux-USB host side. + + Sharing data structures and API models with the + Linux-USB host side API. This helps the OTG support, and + looks forward to more-symmetric frameworks (where the same + I/O model is used by both host and device side drivers). + + Minimalist, so it's easier to support new device + controller hardware. I/O processing doesn't imply large + demands for memory or CPU resources. + + + + +Most Linux developers will not be able to use this API, since they +have USB "host" hardware in a PC, workstation, or server. +Linux users with embedded systems are more likely to +have USB peripheral hardware. +To distinguish drivers running inside such hardware from the +more familiar Linux "USB device drivers", +which are host side proxies for the real USB devices, +a different term is used: +the drivers inside the peripherals are "USB gadget drivers". +In USB protocol interactions, the device driver is the master +(or "client driver") +and the gadget driver is the slave (or "function driver"). + + +The gadget API resembles the host side Linux-USB API in that both +use queues of request objects to package I/O buffers, and those requests +may be submitted or canceled. +They share common definitions for the standard USB +Chapter 9 messages, structures, and constants. +Also, both APIs bind and unbind drivers to devices. +The APIs differ in detail, since the host side's current +URB framework exposes a number of implementation details +and assumptions that are inappropriate for a gadget API. +While the model for control transfers and configuration +management is necessarily different (one side is a hardware-neutral master, +the other is a hardware-aware slave), the endpoint I/0 API used here +should also be usable for an overhead-reduced host side API. + + + + +Structure of Gadget Drivers + +A system running inside a USB peripheral +normally has at least three layers inside the kernel to handle +USB protocol processing, and may have additional layers in +user space code. +The "gadget" API is used by the middle layer to interact +with the lowest level (which directly handles hardware). + + +In Linux, from the bottom up, these layers are: + + + + + + USB Controller Driver + + + This is the lowest software level. + It is the only layer that talks to hardware, + through registers, fifos, dma, irqs, and the like. + The <linux/usb_gadget.h> API abstracts + the peripheral controller endpoint hardware. + That hardware is exposed through endpoint objects, which accept + streams of IN/OUT buffers, and through callbacks that interact + with gadget drivers. + Since normal USB devices only have one upstream + port, they only have one of these drivers. + The controller driver can support any number of different + gadget drivers, but only one of them can be used at a time. + + + Examples of such controller hardware include + the PCI-based NetChip 2280 USB 2.0 high speed controller, + the SA-11x0 or PXA-25x UDC (found within many PDAs), + and a variety of other products. + + + + + + Gadget Driver + + + The lower boundary of this driver implements hardware-neutral + USB functions, using calls to the controller driver. + Because such hardware varies widely in capabilities and restrictions, + and is used in embedded environments where space is at a premium, + the gadget driver is often configured at compile time + to work with endpoints supported by one particular controller. + Gadget drivers may be portable to several different controllers, + using conditional compilation. + (Recent kernels substantially simplify the work involved in + supporting new hardware, by autoconfiguring + endpoints automatically for many bulk-oriented drivers.) + Gadget driver responsibilities include: + + + handling setup requests (ep0 protocol responses) + possibly including class-specific functionality + + returning configuration and string descriptors + + (re)setting configurations and interface + altsettings, including enabling and configuring endpoints + + handling life cycle events, such as managing + bindings to hardware, + USB suspend/resume, remote wakeup, + and disconnection from the USB host. + + managing IN and OUT transfers on all currently + enabled endpoints + + + + + Such drivers may be modules of proprietary code, although + that approach is discouraged in the Linux community. + + + + + Upper Level + + + Most gadget drivers have an upper boundary that connects + to some Linux driver or framework in Linux. + Through that boundary flows the data which the gadget driver + produces and/or consumes through protocol transfers over USB. + Examples include: + + + user mode code, using generic (gadgetfs) + or application specific files in + /dev + + networking subsystem (for network gadgets, + like the CDC Ethernet Model gadget driver) + + data capture drivers, perhaps video4Linux or + a scanner driver; or test and measurement hardware. + + input subsystem (for HID gadgets) + + sound subsystem (for audio gadgets) + + file system (for PTP gadgets) + + block i/o subsystem (for usb-storage gadgets) + + ... and more + + + + + Additional Layers + + + Other layers may exist. + These could include kernel layers, such as network protocol stacks, + as well as user mode applications building on standard POSIX + system call APIs such as + open(), close(), + read() and write(). + On newer systems, POSIX Async I/O calls may be an option. + Such user mode code will not necessarily be subject to + the GNU General Public License (GPL). + + + + + + +OTG-capable systems will also need to include a standard Linux-USB +host side stack, +with usbcore, +one or more Host Controller Drivers (HCDs), +USB Device Drivers to support +the OTG "Targeted Peripheral List", +and so forth. +There will also be an OTG Controller Driver, +which is visible to gadget and device driver developers only indirectly. +That helps the host and device side USB controllers implement the +two new OTG protocols (HNP and SRP). +Roles switch (host to peripheral, or vice versa) using HNP +during USB suspend processing, and SRP can be viewed as a +more battery-friendly kind of device wakeup protocol. + + +Over time, reusable utilities are evolving to help make some +gadget driver tasks simpler. +For example, building configuration descriptors from vectors of +descriptors for the configurations interfaces and endpoints is +now automated, and many drivers now use autoconfiguration to +choose hardware endpoints and initialize their descriptors. + +A potential example of particular interest +is code implementing standard USB-IF protocols for +HID, networking, storage, or audio classes. +Some developers are interested in KDB or KGDB hooks, to let +target hardware be remotely debugged. +Most such USB protocol code doesn't need to be hardware-specific, +any more than network protocols like X11, HTTP, or NFS are. +Such gadget-side interface drivers should eventually be combined, +to implement composite devices. + + + + + +Kernel Mode Gadget API + +Gadget drivers declare themselves through a +struct usb_gadget_driver, which is responsible for +most parts of enumeration for a struct usb_gadget. +The response to a set_configuration usually involves +enabling one or more of the struct usb_ep objects +exposed by the gadget, and submitting one or more +struct usb_request buffers to transfer data. +Understand those four data types, and their operations, and +you will understand how this API works. + + +Incomplete Data Type Descriptions + +This documentation was prepared using the standard Linux +kernel docproc tool, which turns text +and in-code comments into SGML DocBook and then into usable +formats such as HTML or PDF. +Other than the "Chapter 9" data types, most of the significant +data types and functions are described here. + + +However, docproc does not understand all the C constructs +that are used, so some relevant information is likely omitted from +what you are reading. +One example of such information is endpoint autoconfiguration. +You'll have to read the header file, and use example source +code (such as that for "Gadget Zero"), to fully understand the API. + + +The part of the API implementing some basic +driver capabilities is specific to the version of the +Linux kernel that's in use. +The 2.6 kernel includes a driver model +framework that has no analogue on earlier kernels; +so those parts of the gadget API are not fully portable. +(They are implemented on 2.4 kernels, but in a different way.) +The driver model state is another part of this API that is +ignored by the kerneldoc tools. + + + +The core API does not expose +every possible hardware feature, only the most widely available ones. +There are significant hardware features, such as device-to-device DMA +(without temporary storage in a memory buffer) +that would be added using hardware-specific APIs. + + +This API allows drivers to use conditional compilation to handle +endpoint capabilities of different hardware, but doesn't require that. +Hardware tends to have arbitrary restrictions, relating to +transfer types, addressing, packet sizes, buffering, and availability. +As a rule, such differences only matter for "endpoint zero" logic +that handles device configuration and management. +The API supports limited run-time +detection of capabilities, through naming conventions for endpoints. +Many drivers will be able to at least partially autoconfigure +themselves. +In particular, driver init sections will often have endpoint +autoconfiguration logic that scans the hardware's list of endpoints +to find ones matching the driver requirements +(relying on those conventions), to eliminate some of the most +common reasons for conditional compilation. + + +Like the Linux-USB host side API, this API exposes +the "chunky" nature of USB messages: I/O requests are in terms +of one or more "packets", and packet boundaries are visible to drivers. +Compared to RS-232 serial protocols, USB resembles +synchronous protocols like HDLC +(N bytes per frame, multipoint addressing, host as the primary +station and devices as secondary stations) +more than asynchronous ones +(tty style: 8 data bits per frame, no parity, one stop bit). +So for example the controller drivers won't buffer +two single byte writes into a single two-byte USB IN packet, +although gadget drivers may do so when they implement +protocols where packet boundaries (and "short packets") +are not significant. + + +Driver Life Cycle + +Gadget drivers make endpoint I/O requests to hardware without +needing to know many details of the hardware, but driver +setup/configuration code needs to handle some differences. +Use the API like this: + + + + +Register a driver for the particular device side +usb controller hardware, +such as the net2280 on PCI (USB 2.0), +sa11x0 or pxa25x as found in Linux PDAs, +and so on. +At this point the device is logically in the USB ch9 initial state +("attached"), drawing no power and not usable +(since it does not yet support enumeration). +Any host should not see the device, since it's not +activated the data line pullup used by the host to +detect a device, even if VBUS power is available. + + +Register a gadget driver that implements some higher level +device function. That will then bind() to a usb_gadget, which +activates the data line pullup sometime after detecting VBUS. + + +The hardware driver can now start enumerating. +The steps it handles are to accept USB power and set_address requests. +Other steps are handled by the gadget driver. +If the gadget driver module is unloaded before the host starts to +enumerate, steps before step 7 are skipped. + + +The gadget driver's setup() call returns usb descriptors, +based both on what the bus interface hardware provides and on the +functionality being implemented. +That can involve alternate settings or configurations, +unless the hardware prevents such operation. +For OTG devices, each configuration descriptor includes +an OTG descriptor. + + +The gadget driver handles the last step of enumeration, +when the USB host issues a set_configuration call. +It enables all endpoints used in that configuration, +with all interfaces in their default settings. +That involves using a list of the hardware's endpoints, enabling each +endpoint according to its descriptor. +It may also involve using usb_gadget_vbus_draw +to let more power be drawn from VBUS, as allowed by that configuration. +For OTG devices, setting a configuration may also involve reporting +HNP capabilities through a user interface. + + +Do real work and perform data transfers, possibly involving +changes to interface settings or switching to new configurations, until the +device is disconnect()ed from the host. +Queue any number of transfer requests to each endpoint. +It may be suspended and resumed several times before being disconnected. +On disconnect, the drivers go back to step 3 (above). + + +When the gadget driver module is being unloaded, +the driver unbind() callback is issued. That lets the controller +driver be unloaded. + + + + +Drivers will normally be arranged so that just loading the +gadget driver module (or statically linking it into a Linux kernel) +allows the peripheral device to be enumerated, but some drivers +will defer enumeration until some higher level component (like +a user mode daemon) enables it. +Note that at this lowest level there are no policies about how +ep0 configuration logic is implemented, +except that it should obey USB specifications. +Such issues are in the domain of gadget drivers, +including knowing about implementation constraints +imposed by some USB controllers +or understanding that composite devices might happen to +be built by integrating reusable components. + + +Note that the lifecycle above can be slightly different +for OTG devices. +Other than providing an additional OTG descriptor in each +configuration, only the HNP-related differences are particularly +visible to driver code. +They involve reporting requirements during the SET_CONFIGURATION +request, and the option to invoke HNP during some suspend callbacks. +Also, SRP changes the semantics of +usb_gadget_wakeup +slightly. + + + + +USB 2.0 Chapter 9 Types and Constants + +Gadget drivers +rely on common USB structures and constants +defined in the +<linux/usb_ch9.h> +header file, which is standard in Linux 2.6 kernels. +These are the same types and constants used by host +side drivers (and usbcore). + + +!Iinclude/linux/usb_ch9.h + + +Core Objects and Methods + +These are declared in +<linux/usb_gadget.h>, +and are used by gadget drivers to interact with +USB peripheral controller drivers. + + + + +!Iinclude/linux/usb_gadget.h + + +Optional Utilities + +The core API is sufficient for writing a USB Gadget Driver, +but some optional utilities are provided to simplify common tasks. +These utilities include endpoint autoconfiguration. + + +!Edrivers/usb/gadget/usbstring.c +!Edrivers/usb/gadget/config.c + + + + + +Peripheral Controller Drivers + +The first hardware supporting this API was the NetChip 2280 +controller, which supports USB 2.0 high speed and is based on PCI. +This is the net2280 driver module. +The driver supports Linux kernel versions 2.4 and 2.6; +contact NetChip Technologies for development boards and product +information. + + +Other hardware working in the "gadget" framework includes: +Intel's PXA 25x and IXP42x series processors +(pxa2xx_udc), +Toshiba TC86c001 "Goku-S" (goku_udc), +Renesas SH7705/7727 (sh_udc), +MediaQ 11xx (mq11xx_udc), +Hynix HMS30C7202 (h7202_udc), +National 9303/4 (n9604_udc), +Texas Instruments OMAP (omap_udc), +Sharp LH7A40x (lh7a40x_udc), +and more. +Most of those are full speed controllers. + + +At this writing, there are people at work on drivers in +this framework for several other USB device controllers, +with plans to make many of them be widely available. + + + + +A partial USB simulator, +the dummy_hcd driver, is available. +It can act like a net2280, a pxa25x, or an sa11x0 in terms +of available endpoints and device speeds; and it simulates +control, bulk, and to some extent interrupt transfers. +That lets you develop some parts of a gadget driver on a normal PC, +without any special hardware, and perhaps with the assistance +of tools such as GDB running with User Mode Linux. +At least one person has expressed interest in adapting that +approach, hooking it up to a simulator for a microcontroller. +Such simulators can help debug subsystems where the runtime hardware +is unfriendly to software development, or is not yet available. + + +Support for other controllers is expected to be developed +and contributed +over time, as this driver framework evolves. + + + + +Gadget Drivers + +In addition to Gadget Zero +(used primarily for testing and development with drivers +for usb controller hardware), other gadget drivers exist. + + +There's an ethernet gadget +driver, which implements one of the most useful +Communications Device Class (CDC) models. +One of the standards for cable modem interoperability even +specifies the use of this ethernet model as one of two +mandatory options. +Gadgets using this code look to a USB host as if they're +an Ethernet adapter. +It provides access to a network where the gadget's CPU is one host, +which could easily be bridging, routing, or firewalling +access to other networks. +Since some hardware can't fully implement the CDC Ethernet +requirements, this driver also implements a "good parts only" +subset of CDC Ethernet. +(That subset doesn't advertise itself as CDC Ethernet, +to avoid creating problems.) + + +Support for Microsoft's RNDIS +protocol has been contributed by Pengutronix and Auerswald GmbH. +This is like CDC Ethernet, but it runs on more slightly USB hardware +(but less than the CDC subset). +However, its main claim to fame is being able to connect directly to +recent versions of Windows, using drivers that Microsoft bundles +and supports, making it much simpler to network with Windows. + + +There is also support for user mode gadget drivers, +using gadgetfs. +This provides a User Mode API that presents +each endpoint as a single file descriptor. I/O is done using +normal read() and read() calls. +Familiar tools like GDB and pthreads can be used to +develop and debug user mode drivers, so that once a robust +controller driver is available many applications for it +won't require new kernel mode software. +Linux 2.6 Async I/O (AIO) +support is available, so that user mode software +can stream data with only slightly more overhead +than a kernel driver. + + +There's a USB Mass Storage class driver, which provides +a different solution for interoperability with systems such +as MS-Windows and MacOS. +That File-backed Storage driver uses a +file or block device as backing store for a drive, +like the loop driver. +The USB host uses the BBB, CB, or CBI versions of the mass +storage class specification, using transparent SCSI commands +to access the data from the backing store. + + +There's a "serial line" driver, useful for TTY style +operation over USB. +The latest version of that driver supports CDC ACM style +operation, like a USB modem, and so on most hardware it can +interoperate easily with MS-Windows. +One interesting use of that driver is in boot firmware (like a BIOS), +which can sometimes use that model with very small systems without +real serial lines. + + +Support for other kinds of gadget is expected to +be developed and contributed +over time, as this driver framework evolves. + + + + +USB On-The-GO (OTG) + +USB OTG support on Linux 2.6 was initially developed +by Texas Instruments for +OMAP 16xx and 17xx +series processors. +Other OTG systems should work in similar ways, but the +hardware level details could be very different. + + +Systems need specialized hardware support to implement OTG, +notably including a special Mini-AB jack +and associated transciever to support Dual-Role +operation: +they can act either as a host, using the standard +Linux-USB host side driver stack, +or as a peripheral, using this "gadget" framework. +To do that, the system software relies on small additions +to those programming interfaces, +and on a new internal component (here called an "OTG Controller") +affecting which driver stack connects to the OTG port. +In each role, the system can re-use the existing pool of +hardware-neutral drivers, layered on top of the controller +driver interfaces (usb_bus or +usb_gadget). +Such drivers need at most minor changes, and most of the calls +added to support OTG can also benefit non-OTG products. + + + + Gadget drivers test the is_otg + flag, and use it to determine whether or not to include + an OTG descriptor in each of their configurations. + + Gadget drivers may need changes to support the + two new OTG protocols, exposed in new gadget attributes + such as b_hnp_enable flag. + HNP support should be reported through a user interface + (two LEDs could suffice), and is triggered in some cases + when the host suspends the peripheral. + SRP support can be user-initiated just like remote wakeup, + probably by pressing the same button. + + On the host side, USB device drivers need + to be taught to trigger HNP at appropriate moments, using + usb_suspend_device(). + That also conserves battery power, which is useful even + for non-OTG configurations. + + Also on the host side, a driver must support the + OTG "Targeted Peripheral List". That's just a whitelist, + used to reject peripherals not supported with a given + Linux OTG host. + This whitelist is product-specific; + each product must modify otg_whitelist.h + to match its interoperability specification. + + + Non-OTG Linux hosts, like PCs and workstations, + normally have some solution for adding drivers, so that + peripherals that aren't recognized can eventually be supported. + That approach is unreasonable for consumer products that may + never have their firmware upgraded, and where it's usually + unrealistic to expect traditional PC/workstation/server kinds + of support model to work. + For example, it's often impractical to change device firmware + once the product has been distributed, so driver bugs can't + normally be fixed if they're found after shipment. + + + + +Additional changes are needed below those hardware-neutral +usb_bus and usb_gadget +driver interfaces; those aren't discussed here in any detail. +Those affect the hardware-specific code for each USB Host or Peripheral +controller, and how the HCD initializes (since OTG can be active only +on a single port). +They also involve what may be called an OTG Controller +Driver, managing the OTG transceiver and the OTG state +machine logic as well as much of the root hub behavior for the +OTG port. +The OTG controller driver needs to activate and deactivate USB +controllers depending on the relevant device role. +Some related changes were needed inside usbcore, so that it +can identify OTG-capable devices and respond appropriately +to HNP or SRP protocols. + + + + +
+ diff --git a/Documentation/DocBook/journal-api.tmpl b/Documentation/DocBook/journal-api.tmpl new file mode 100644 index 000000000000..1ef6f43c6d8f --- /dev/null +++ b/Documentation/DocBook/journal-api.tmpl @@ -0,0 +1,333 @@ + + + + + + The Linux Journalling API + + + Roger + Gammans + +
+ rgammans@computer-surgery.co.uk +
+
+
+
+ + + + Stephen + Tweedie + +
+ sct@redhat.com +
+
+
+
+ + + 2002 + Roger Gammans + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + + Overview + + Details + +The journalling layer is easy to use. You need to +first of all create a journal_t data structure. There are +two calls to do this dependent on how you decide to allocate the physical +media on which the journal resides. The journal_init_inode() call +is for journals stored in filesystem inodes, or the journal_init_dev() +call can be use for journal stored on a raw device (in a continuous range +of blocks). A journal_t is a typedef for a struct pointer, so when +you are finally finished make sure you call journal_destroy() on it +to free up any used kernel memory. + + + +Once you have got your journal_t object you need to 'mount' or load the journal +file, unless of course you haven't initialised it yet - in which case you +need to call journal_create(). + + + +Most of the time however your journal file will already have been created, but +before you load it you must call journal_wipe() to empty the journal file. +Hang on, you say , what if the filesystem wasn't cleanly umount()'d . Well, it is the +job of the client file system to detect this and skip the call to journal_wipe(). + + + +In either case the next call should be to journal_load() which prepares the +journal file for use. Note that journal_wipe(..,0) calls journal_skip_recovery() +for you if it detects any outstanding transactions in the journal and similarly +journal_load() will call journal_recover() if necessary. +I would advise reading fs/ext3/super.c for examples on this stage. +[RGG: Why is the journal_wipe() call necessary - doesn't this needlessly +complicate the API. Or isn't a good idea for the journal layer to hide +dirty mounts from the client fs] + + + +Now you can go ahead and start modifying the underlying +filesystem. Almost. + + + + + +You still need to actually journal your filesystem changes, this +is done by wrapping them into transactions. Additionally you +also need to wrap the modification of each of the the buffers +with calls to the journal layer, so it knows what the modifications +you are actually making are. To do this use journal_start() which +returns a transaction handle. + + + +journal_start() +and its counterpart journal_stop(), which indicates the end of a transaction +are nestable calls, so you can reenter a transaction if necessary, +but remember you must call journal_stop() the same number of times as +journal_start() before the transaction is completed (or more accurately +leaves the the update phase). Ext3/VFS makes use of this feature to simplify +quota support. + + + +Inside each transaction you need to wrap the modifications to the +individual buffers (blocks). Before you start to modify a buffer you +need to call journal_get_{create,write,undo}_access() as appropriate, +this allows the journalling layer to copy the unmodified data if it +needs to. After all the buffer may be part of a previously uncommitted +transaction. +At this point you are at last ready to modify a buffer, and once +you are have done so you need to call journal_dirty_{meta,}data(). +Or if you've asked for access to a buffer you now know is now longer +required to be pushed back on the device you can call journal_forget() +in much the same way as you might have used bforget() in the past. + + + +A journal_flush() may be called at any time to commit and checkpoint +all your transactions. + + + +Then at umount time , in your put_super() (2.4) or write_super() (2.5) +you can then call journal_destroy() to clean up your in-core journal object. + + + + +Unfortunately there a couple of ways the journal layer can cause a deadlock. +The first thing to note is that each task can only have +a single outstanding transaction at any one time, remember nothing +commits until the outermost journal_stop(). This means +you must complete the transaction at the end of each file/inode/address +etc. operation you perform, so that the journalling system isn't re-entered +on another journal. Since transactions can't be nested/batched +across differing journals, and another filesystem other than +yours (say ext3) may be modified in a later syscall. + + + +The second case to bear in mind is that journal_start() can +block if there isn't enough space in the journal for your transaction +(based on the passed nblocks param) - when it blocks it merely(!) needs to +wait for transactions to complete and be committed from other tasks, +so essentially we are waiting for journal_stop(). So to avoid +deadlocks you must treat journal_start/stop() as if they +were semaphores and include them in your semaphore ordering rules to prevent +deadlocks. Note that journal_extend() has similar blocking behaviour to +journal_start() so you can deadlock here just as easily as on journal_start(). + + + +Try to reserve the right number of blocks the first time. ;-). This will +be the maximum number of blocks you are going to touch in this transaction. +I advise having a look at at least ext3_jbd.h to see the basis on which +ext3 uses to make these decisions. + + + +Another wriggle to watch out for is your on-disk block allocation strategy. +why? Because, if you undo a delete, you need to ensure you haven't reused any +of the freed blocks in a later transaction. One simple way of doing this +is make sure any blocks you allocate only have checkpointed transactions +listed against them. Ext3 does this in ext3_test_allocatable(). + + + +Lock is also providing through journal_{un,}lock_updates(), +ext3 uses this when it wants a window with a clean and stable fs for a moment. +eg. + + + + + journal_lock_updates() //stop new stuff happening.. + journal_flush() // checkpoint everything. + ..do stuff on stable fs + journal_unlock_updates() // carry on with filesystem use. + + + +The opportunities for abuse and DOS attacks with this should be obvious, +if you allow unprivileged userspace to trigger codepaths containing these +calls. + + + +A new feature of jbd since 2.5.25 is commit callbacks with the new +journal_callback_set() function you can now ask the journalling layer +to call you back when the transaction is finally committed to disk, so that +you can do some of your own management. The key to this is the journal_callback +struct, this maintains the internal callback information but you can +extend it like this:- + + + struct myfs_callback_s { + //Data structure element required by jbd.. + struct journal_callback for_jbd; + // Stuff for myfs allocated together. + myfs_inode* i_commited; + + } + + + +this would be useful if you needed to know when data was committed to a +particular inode. + + + + + +Summary + +Using the journal is a matter of wrapping the different context changes, +being each mount, each modification (transaction) and each changed buffer +to tell the journalling layer about them. + + + +Here is a some pseudo code to give you an idea of how it works, as +an example. + + + + journal_t* my_jnrl = journal_create(); + journal_init_{dev,inode}(jnrl,...) + if (clean) journal_wipe(); + journal_load(); + + foreach(transaction) { /*transactions must be + completed before + a syscall returns to + userspace*/ + + handle_t * xct=journal_start(my_jnrl); + foreach(bh) { + journal_get_{create,write,undo}_access(xact,bh); + if ( myfs_modify(bh) ) { /* returns true + if makes changes */ + journal_dirty_{meta,}data(xact,bh); + } else { + journal_forget(bh); + } + } + journal_stop(xct); + } + journal_destroy(my_jrnl); + + + + + + + Data Types + + The journalling layer uses typedefs to 'hide' the concrete definitions + of the structures used. As a client of the JBD layer you can + just rely on the using the pointer as a magic cookie of some sort. + + Obviously the hiding is not enforced as this is 'C'. + + Structures +!Iinclude/linux/jbd.h + + + + + Functions + + The functions here are split into two groups those that + affect a journal as a whole, and those which are used to + manage transactions + + Journal Level +!Efs/jbd/journal.c +!Efs/jbd/recovery.c + + Transasction Level +!Efs/jbd/transaction.c + + + + See also + + + + Journaling the Linux ext2fs Filesystem,LinuxExpo 98, Stephen Tweedie + + + + + + + Ext3 Journalling FileSystem , OLS 2000, Dr. Stephen Tweedie + + + + + +
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl new file mode 100644 index 000000000000..1bd20c860285 --- /dev/null +++ b/Documentation/DocBook/kernel-api.tmpl @@ -0,0 +1,342 @@ + + + + + + The Linux Kernel API + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + + + + + + + Driver Basics + Driver Entry and Exit points +!Iinclude/linux/init.h + + + Atomic and pointer manipulation +!Iinclude/asm-i386/atomic.h +!Iinclude/asm-i386/unaligned.h + + + + + + + Data Types + Doubly Linked Lists +!Iinclude/linux/list.h + + + + + Basic C Library Functions + + + When writing drivers, you cannot in general use routines which are + from the C Library. Some of the functions have been found generally + useful and they are listed below. The behaviour of these functions + may vary slightly from those defined by ANSI, and these deviations + are noted in the text. + + + String Conversions +!Ilib/vsprintf.c +!Elib/vsprintf.c + + String Manipulation +!Ilib/string.c +!Elib/string.c + + Bit Operations +!Iinclude/asm-i386/bitops.h + + + + + Memory Management in Linux + The Slab Cache +!Emm/slab.c + + User Space Memory Access +!Iinclude/asm-i386/uaccess.h +!Iarch/i386/lib/usercopy.c + + + + + FIFO Buffer + kfifo interface +!Iinclude/linux/kfifo.h +!Ekernel/kfifo.c + + + + + The proc filesystem + + sysctl interface +!Ekernel/sysctl.c + + + + + The debugfs filesystem + + debugfs interface +!Efs/debugfs/inode.c +!Efs/debugfs/file.c + + + + + The Linux VFS + The Directory Cache +!Efs/dcache.c +!Iinclude/linux/dcache.h + + Inode Handling +!Efs/inode.c +!Efs/bad_inode.c + + Registration and Superblocks +!Efs/super.c + + File Locks +!Efs/locks.c +!Ifs/locks.c + + + + + Linux Networking + Socket Buffer Functions +!Iinclude/linux/skbuff.h +!Enet/core/skbuff.c + + Socket Filter +!Enet/core/filter.c + + Generic Network Statistics +!Iinclude/linux/gen_stats.h +!Enet/core/gen_stats.c +!Enet/core/gen_estimator.c + + + + + Network device support + Driver Support +!Enet/core/dev.c + + 8390 Based Network Cards +!Edrivers/net/8390.c + + Synchronous PPP +!Edrivers/net/wan/syncppp.c + + + + + Module Support + Module Loading +!Ekernel/kmod.c + + Inter Module support + + Refer to the file kernel/module.c for more information. + + + + + + + Hardware Interfaces + Interrupt Handling +!Iarch/i386/kernel/irq.c + + + MTRR Handling +!Earch/i386/kernel/cpu/mtrr/main.c + + PCI Support Library +!Edrivers/pci/pci.c + + PCI Hotplug Support Library +!Edrivers/pci/hotplug/pci_hotplug_core.c + + MCA Architecture + MCA Device Functions + + Refer to the file arch/i386/kernel/mca.c for more information. + + + + MCA Bus DMA +!Iinclude/asm-i386/mca_dma.h + + + + + + The Device File System +!Efs/devfs/base.c + + + + Security Framework +!Esecurity/security.c + + + + Power Management +!Ekernel/power/pm.c + + + + Block Devices +!Edrivers/block/ll_rw_blk.c + + + + Miscellaneous Devices +!Edrivers/char/misc.c + + + + Video4Linux +!Edrivers/media/video/videodev.c + + + + Sound Devices +!Esound/sound_core.c + + + + + 16x50 UART Driver +!Edrivers/serial/serial_core.c +!Edrivers/serial/8250.c + + + + Z85230 Support Library +!Edrivers/net/wan/z85230.c + + + + Frame Buffer Library + + + The frame buffer drivers depend heavily on four data structures. + These structures are declared in include/linux/fb.h. They are + fb_info, fb_var_screeninfo, fb_fix_screeninfo and fb_monospecs. + The last three can be made available to and from userland. + + + + fb_info defines the current state of a particular video card. + Inside fb_info, there exists a fb_ops structure which is a + collection of needed functions to make fbdev and fbcon work. + fb_info is only visible to the kernel. + + + + fb_var_screeninfo is used to describe the features of a video card + that are user defined. With fb_var_screeninfo, things such as + depth and the resolution may be defined. + + + + The next structure is fb_fix_screeninfo. This defines the + properties of a card that are created when a mode is set and can't + be changed otherwise. A good example of this is the start of the + frame buffer memory. This "locks" the address of the frame buffer + memory, so that it cannot be changed or moved. + + + + The last structure is fb_monospecs. In the old API, there was + little importance for fb_monospecs. This allowed for forbidden things + such as setting a mode of 800x600 on a fix frequency monitor. With + the new API, fb_monospecs prevents such things, and if used + correctly, can prevent a monitor from being cooked. fb_monospecs + will not be useful until kernels 2.5.x. + + + Frame Buffer Memory +!Edrivers/video/fbmem.c + + Frame Buffer Console +!Edrivers/video/console/fbcon.c + + Frame Buffer Colormap +!Edrivers/video/fbcmap.c + + + Frame Buffer Video Mode Database +!Idrivers/video/modedb.c +!Edrivers/video/modedb.c + + Frame Buffer Macintosh Video Mode Database +!Idrivers/video/macmodes.c + + Frame Buffer Fonts + + Refer to the file drivers/video/console/fonts.c for more information. + + + + + diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl new file mode 100644 index 000000000000..49a9ef82d575 --- /dev/null +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -0,0 +1,1349 @@ + + + + + + Unreliable Guide To Hacking The Linux Kernel + + + + Paul + Rusty + Russell + +
+ rusty@rustcorp.com.au +
+
+
+
+ + + 2001 + Rusty Russell + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + + + + This is the first release of this document as part of the kernel tarball. + + +
+ + + + + Introduction + + Welcome, gentle reader, to Rusty's Unreliable Guide to Linux + Kernel Hacking. This document describes the common routines and + general requirements for kernel code: its goal is to serve as a + primer for Linux kernel development for experienced C + programmers. I avoid implementation details: that's what the + code is for, and I ignore whole tracts of useful routines. + + + Before you read this, please understand that I never wanted to + write this document, being grossly under-qualified, but I always + wanted to read it, and this was the only way. I hope it will + grow into a compendium of best practice, common starting points + and random information. + + + + + The Players + + + At any time each of the CPUs in a system can be: + + + + + + not associated with any process, serving a hardware interrupt; + + + + + + not associated with any process, serving a softirq, tasklet or bh; + + + + + + running in kernel space, associated with a process; + + + + + + running a process in user space. + + + + + + There is a strict ordering between these: other than the last + category (userspace) each can only be pre-empted by those above. + For example, while a softirq is running on a CPU, no other + softirq will pre-empt it, but a hardware interrupt can. However, + any other CPUs in the system execute independently. + + + + We'll see a number of ways that the user context can block + interrupts, to become truly non-preemptable. + + + + User Context + + + User context is when you are coming in from a system call or + other trap: you can sleep, and you own the CPU (except for + interrupts) until you call schedule(). + In other words, user context (unlike userspace) is not pre-emptable. + + + + + You are always in user context on module load and unload, + and on operations on the block device layer. + + + + + In user context, the current pointer (indicating + the task we are currently executing) is valid, and + in_interrupt() + (include/linux/interrupt.h) is false + . + + + + + Beware that if you have interrupts or bottom halves disabled + (see below), in_interrupt() will return a + false positive. + + + + + + Hardware Interrupts (Hard IRQs) + + + Timer ticks, network cards and + keyboard are examples of real + hardware which produce interrupts at any time. The kernel runs + interrupt handlers, which services the hardware. The kernel + guarantees that this handler is never re-entered: if another + interrupt arrives, it is queued (or dropped). Because it + disables interrupts, this handler has to be fast: frequently it + simply acknowledges the interrupt, marks a `software interrupt' + for execution and exits. + + + + You can tell you are in a hardware interrupt, because + in_irq() returns true. + + + + Beware that this will return a false positive if interrupts are disabled + (see below). + + + + + + Software Interrupt Context: Bottom Halves, Tasklets, softirqs + + + Whenever a system call is about to return to userspace, or a + hardware interrupt handler exits, any `software interrupts' + which are marked pending (usually by hardware interrupts) are + run (kernel/softirq.c). + + + + Much of the real interrupt handling work is done here. Early in + the transition to SMP, there were only `bottom + halves' (BHs), which didn't take advantage of multiple CPUs. Shortly + after we switched from wind-up computers made of match-sticks and snot, + we abandoned this limitation. + + + + include/linux/interrupt.h lists the + different BH's. No matter how many CPUs you have, no two BHs will run at + the same time. This made the transition to SMP simpler, but sucks hard for + scalable performance. A very important bottom half is the timer + BH (include/linux/timer.h): you + can register to have it call functions for you in a given length of time. + + + + 2.3.43 introduced softirqs, and re-implemented the (now + deprecated) BHs underneath them. Softirqs are fully-SMP + versions of BHs: they can run on as many CPUs at once as + required. This means they need to deal with any races in shared + data using their own locks. A bitmask is used to keep track of + which are enabled, so the 32 available softirqs should not be + used up lightly. (Yes, people will + notice). + + + + tasklets (include/linux/interrupt.h) + are like softirqs, except they are dynamically-registrable (meaning you + can have as many as you want), and they also guarantee that any tasklet + will only run on one CPU at any time, although different tasklets can + run simultaneously (unlike different BHs). + + + + The name `tasklet' is misleading: they have nothing to do with `tasks', + and probably more to do with some bad vodka Alexey Kuznetsov had at the + time. + + + + + You can tell you are in a softirq (or bottom half, or tasklet) + using the in_softirq() macro + (include/linux/interrupt.h). + + + + Beware that this will return a false positive if a bh lock (see below) + is held. + + + + + + + Some Basic Rules + + + + No memory protection + + + If you corrupt memory, whether in user context or + interrupt context, the whole machine will crash. Are you + sure you can't do what you want in userspace? + + + + + + No floating point or MMX + + + The FPU context is not saved; even in user + context the FPU state probably won't + correspond with the current process: you would mess with some + user process' FPU state. If you really want + to do this, you would have to explicitly save/restore the full + FPU state (and avoid context switches). It + is generally a bad idea; use fixed point arithmetic first. + + + + + + A rigid stack limit + + + The kernel stack is about 6K in 2.2 (for most + architectures: it's about 14K on the Alpha), and shared + with interrupts so you can't use it all. Avoid deep + recursion and huge local arrays on the stack (allocate + them dynamically instead). + + + + + + The Linux kernel is portable + + + Let's keep it that way. Your code should be 64-bit clean, + and endian-independent. You should also minimize CPU + specific stuff, e.g. inline assembly should be cleanly + encapsulated and minimized to ease porting. Generally it + should be restricted to the architecture-dependent part of + the kernel tree. + + + + + + + + ioctls: Not writing a new system call + + + A system call generally looks like this + + + +asmlinkage long sys_mycall(int arg) +{ + return 0; +} + + + + First, in most cases you don't want to create a new system call. + You create a character device and implement an appropriate ioctl + for it. This is much more flexible than system calls, doesn't have + to be entered in every architecture's + include/asm/unistd.h and + arch/kernel/entry.S file, and is much more + likely to be accepted by Linus. + + + + If all your routine does is read or write some parameter, consider + implementing a sysctl interface instead. + + + + Inside the ioctl you're in user context to a process. When a + error occurs you return a negated errno (see + include/linux/errno.h), + otherwise you return 0. + + + + After you slept you should check if a signal occurred: the + Unix/Linux way of handling signals is to temporarily exit the + system call with the -ERESTARTSYS error. The + system call entry code will switch back to user context, process + the signal handler and then your system call will be restarted + (unless the user disabled that). So you should be prepared to + process the restart, e.g. if you're in the middle of manipulating + some data structure. + + + +if (signal_pending()) + return -ERESTARTSYS; + + + + If you're doing longer computations: first think userspace. If you + really want to do it in kernel you should + regularly check if you need to give up the CPU (remember there is + cooperative multitasking per CPU). Idiom: + + + +cond_resched(); /* Will sleep */ + + + + A short note on interface design: the UNIX system call motto is + "Provide mechanism not policy". + + + + + Recipes for Deadlock + + + You cannot call any routines which may sleep, unless: + + + + + You are in user context. + + + + + + You do not own any spinlocks. + + + + + + You have interrupts enabled (actually, Andi Kleen says + that the scheduling code will enable them for you, but + that's probably not what you wanted). + + + + + + Note that some functions may sleep implicitly: common ones are + the user space access functions (*_user) and memory allocation + functions without GFP_ATOMIC. + + + + You will eventually lock up your box if you break these rules. + + + + Really. + + + + + Common Routines + + + + <function>printk()</function> + <filename class="headerfile">include/linux/kernel.h</filename> + + + + printk() feeds kernel messages to the + console, dmesg, and the syslog daemon. It is useful for debugging + and reporting errors, and can be used inside interrupt context, + but use with caution: a machine which has its console flooded with + printk messages is unusable. It uses a format string mostly + compatible with ANSI C printf, and C string concatenation to give + it a first "priority" argument: + + + +printk(KERN_INFO "i = %u\n", i); + + + + See include/linux/kernel.h; + for other KERN_ values; these are interpreted by syslog as the + level. Special case: for printing an IP address use + + + +__u32 ipaddress; +printk(KERN_INFO "my ip: %d.%d.%d.%d\n", NIPQUAD(ipaddress)); + + + + printk() internally uses a 1K buffer and does + not catch overruns. Make sure that will be enough. + + + + + You will know when you are a real kernel hacker + when you start typoing printf as printk in your user programs :) + + + + + + + + Another sidenote: the original Unix Version 6 sources had a + comment on top of its printf function: "Printf should not be + used for chit-chat". You should follow that advice. + + + + + + + <function>copy_[to/from]_user()</function> + / + <function>get_user()</function> + / + <function>put_user()</function> + <filename class="headerfile">include/asm/uaccess.h</filename> + + + + [SLEEPS] + + + + put_user() and get_user() + are used to get and put single values (such as an int, char, or + long) from and to userspace. A pointer into userspace should + never be simply dereferenced: data should be copied using these + routines. Both return -EFAULT or 0. + + + copy_to_user() and + copy_from_user() are more general: they copy + an arbitrary amount of data to and from userspace. + + + Unlike put_user() and + get_user(), they return the amount of + uncopied data (ie. 0 still means + success). + + + [Yes, this moronic interface makes me cringe. Please submit a + patch and become my hero --RR.] + + + The functions may sleep implicitly. This should never be called + outside user context (it makes no sense), with interrupts + disabled, or a spinlock held. + + + + + <function>kmalloc()</function>/<function>kfree()</function> + <filename class="headerfile">include/linux/slab.h</filename> + + + [MAY SLEEP: SEE BELOW] + + + + These routines are used to dynamically request pointer-aligned + chunks of memory, like malloc and free do in userspace, but + kmalloc() takes an extra flag word. + Important values: + + + + + + + GFP_KERNEL + + + + + May sleep and swap to free memory. Only allowed in user + context, but is the most reliable way to allocate memory. + + + + + + + + GFP_ATOMIC + + + + + Don't sleep. Less reliable than GFP_KERNEL, + but may be called from interrupt context. You should + really have a good out-of-memory + error-handling strategy. + + + + + + + + GFP_DMA + + + + + Allocate ISA DMA lower than 16MB. If you don't know what that + is you don't need it. Very unreliable. + + + + + + + If you see a kmem_grow: Called nonatomically from int + warning message you called a memory allocation function + from interrupt context without GFP_ATOMIC. + You should really fix that. Run, don't walk. + + + + If you are allocating at least PAGE_SIZE + (include/asm/page.h) bytes, + consider using __get_free_pages() + + (include/linux/mm.h). It + takes an order argument (0 for page sized, 1 for double page, 2 + for four pages etc.) and the same memory priority flag word as + above. + + + + If you are allocating more than a page worth of bytes you can use + vmalloc(). It'll allocate virtual memory in + the kernel map. This block is not contiguous in physical memory, + but the MMU makes it look like it is for you + (so it'll only look contiguous to the CPUs, not to external device + drivers). If you really need large physically contiguous memory + for some weird device, you have a problem: it is poorly supported + in Linux because after some time memory fragmentation in a running + kernel makes it hard. The best way is to allocate the block early + in the boot process via the alloc_bootmem() + routine. + + + + Before inventing your own cache of often-used objects consider + using a slab cache in + include/linux/slab.h + + + + + <function>current</function> + <filename class="headerfile">include/asm/current.h</filename> + + + This global variable (really a macro) contains a pointer to + the current task structure, so is only valid in user context. + For example, when a process makes a system call, this will + point to the task structure of the calling process. It is + not NULL in interrupt context. + + + + + <function>udelay()</function>/<function>mdelay()</function> + <filename class="headerfile">include/asm/delay.h</filename> + <filename class="headerfile">include/linux/delay.h</filename> + + + + The udelay() function can be used for small pauses. + Do not use large values with udelay() as you risk + overflow - the helper function mdelay() is useful + here, or even consider schedule_timeout(). + + + + + <function>cpu_to_be32()</function>/<function>be32_to_cpu()</function>/<function>cpu_to_le32()</function>/<function>le32_to_cpu()</function> + <filename class="headerfile">include/asm/byteorder.h</filename> + + + + The cpu_to_be32() family (where the "32" can + be replaced by 64 or 16, and the "be" can be replaced by "le") are + the general way to do endian conversions in the kernel: they + return the converted value. All variations supply the reverse as + well: be32_to_cpu(), etc. + + + + There are two major variations of these functions: the pointer + variation, such as cpu_to_be32p(), which take + a pointer to the given type, and return the converted value. The + other variation is the "in-situ" family, such as + cpu_to_be32s(), which convert value referred + to by the pointer, and return void. + + + + + <function>local_irq_save()</function>/<function>local_irq_restore()</function> + <filename class="headerfile">include/asm/system.h</filename> + + + + These routines disable hard interrupts on the local CPU, and + restore them. They are reentrant; saving the previous state in + their one unsigned long flags argument. If you + know that interrupts are enabled, you can simply use + local_irq_disable() and + local_irq_enable(). + + + + + <function>local_bh_disable()</function>/<function>local_bh_enable()</function> + <filename class="headerfile">include/linux/interrupt.h</filename> + + + These routines disable soft interrupts on the local CPU, and + restore them. They are reentrant; if soft interrupts were + disabled before, they will still be disabled after this pair + of functions has been called. They prevent softirqs, tasklets + and bottom halves from running on the current CPU. + + + + + <function>smp_processor_id</function>() + <filename class="headerfile">include/asm/smp.h</filename> + + + smp_processor_id() returns the current + processor number, between 0 and NR_CPUS (the + maximum number of CPUs supported by Linux, currently 32). These + values are not necessarily continuous. + + + + + <type>__init</type>/<type>__exit</type>/<type>__initdata</type> + <filename class="headerfile">include/linux/init.h</filename> + + + After boot, the kernel frees up a special section; functions + marked with __init and data structures marked with + __initdata are dropped after boot is complete (within + modules this directive is currently ignored). __exit + is used to declare a function which is only required on exit: the + function will be dropped if this file is not compiled as a module. + See the header file for use. Note that it makes no sense for a function + marked with __init to be exported to modules with + EXPORT_SYMBOL() - this will break. + + + Static data structures marked as __initdata must be initialised + (as opposed to ordinary static data which is zeroed BSS) and cannot be + const. + + + + + + <function>__initcall()</function>/<function>module_init()</function> + <filename class="headerfile">include/linux/init.h</filename> + + Many parts of the kernel are well served as a module + (dynamically-loadable parts of the kernel). Using the + module_init() and + module_exit() macros it is easy to write code + without #ifdefs which can operate both as a module or built into + the kernel. + + + + The module_init() macro defines which + function is to be called at module insertion time (if the file is + compiled as a module), or at boot time: if the file is not + compiled as a module the module_init() macro + becomes equivalent to __initcall(), which + through linker magic ensures that the function is called on boot. + + + + The function can return a negative error number to cause + module loading to fail (unfortunately, this has no effect if + the module is compiled into the kernel). For modules, this is + called in user context, with interrupts enabled, and the + kernel lock held, so it can sleep. + + + + + <function>module_exit()</function> + <filename class="headerfile">include/linux/init.h</filename> + + + This macro defines the function to be called at module removal + time (or never, in the case of the file compiled into the + kernel). It will only be called if the module usage count has + reached zero. This function can also sleep, but cannot fail: + everything must be cleaned up by the time it returns. + + + + + + + + Wait Queues + <filename class="headerfile">include/linux/wait.h</filename> + + + [SLEEPS] + + + + A wait queue is used to wait for someone to wake you up when a + certain condition is true. They must be used carefully to ensure + there is no race condition. You declare a + wait_queue_head_t, and then processes which want to + wait for that condition declare a wait_queue_t + referring to themselves, and place that in the queue. + + + + Declaring + + + You declare a wait_queue_head_t using the + DECLARE_WAIT_QUEUE_HEAD() macro, or using the + init_waitqueue_head() routine in your + initialization code. + + + + + Queuing + + + Placing yourself in the waitqueue is fairly complex, because you + must put yourself in the queue before checking the condition. + There is a macro to do this: + wait_event_interruptible() + + include/linux/sched.h The + first argument is the wait queue head, and the second is an + expression which is evaluated; the macro returns + 0 when this expression is true, or + -ERESTARTSYS if a signal is received. + The wait_event() version ignores signals. + + + Do not use the sleep_on() function family - + it is very easy to accidentally introduce races; almost certainly + one of the wait_event() family will do, or a + loop around schedule_timeout(). If you choose + to loop around schedule_timeout() remember + you must set the task state (with + set_current_state()) on each iteration to avoid + busy-looping. + + + + + + Waking Up Queued Tasks + + + Call wake_up() + + include/linux/sched.h;, + which will wake up every process in the queue. The exception is + if one has TASK_EXCLUSIVE set, in which case + the remainder of the queue will not be woken. + + + + + + Atomic Operations + + + Certain operations are guaranteed atomic on all platforms. The + first class of operations work on atomic_t + + include/asm/atomic.h; this + contains a signed integer (at least 24 bits long), and you must use + these functions to manipulate or read atomic_t variables. + atomic_read() and + atomic_set() get and set the counter, + atomic_add(), + atomic_sub(), + atomic_inc(), + atomic_dec(), and + atomic_dec_and_test() (returns + true if it was decremented to zero). + + + + Yes. It returns true (i.e. != 0) if the + atomic variable is zero. + + + + Note that these functions are slower than normal arithmetic, and + so should not be used unnecessarily. On some platforms they + are much slower, like 32-bit Sparc where they use a spinlock. + + + + The second class of atomic operations is atomic bit operations on a + long, defined in + + include/linux/bitops.h. These + operations generally take a pointer to the bit pattern, and a bit + number: 0 is the least significant bit. + set_bit(), clear_bit() + and change_bit() set, clear, and flip the + given bit. test_and_set_bit(), + test_and_clear_bit() and + test_and_change_bit() do the same thing, + except return true if the bit was previously set; these are + particularly useful for very simple locking. + + + + It is possible to call these operations with bit indices greater + than BITS_PER_LONG. The resulting behavior is strange on big-endian + platforms though so it is a good idea not to do this. + + + + Note that the order of bits depends on the architecture, and in + particular, the bitfield passed to these operations must be at + least as large as a long. + + + + + Symbols + + + Within the kernel proper, the normal linking rules apply + (ie. unless a symbol is declared to be file scope with the + static keyword, it can be used anywhere in the + kernel). However, for modules, a special exported symbol table is + kept which limits the entry points to the kernel proper. Modules + can also export symbols. + + + + <function>EXPORT_SYMBOL()</function> + <filename class="headerfile">include/linux/module.h</filename> + + + This is the classic method of exporting a symbol, and it works + for both modules and non-modules. In the kernel all these + declarations are often bundled into a single file to help + genksyms (which searches source files for these declarations). + See the comment on genksyms and Makefiles below. + + + + + <function>EXPORT_SYMBOL_GPL()</function> + <filename class="headerfile">include/linux/module.h</filename> + + + Similar to EXPORT_SYMBOL() except that the + symbols exported by EXPORT_SYMBOL_GPL() can + only be seen by modules with a + MODULE_LICENSE() that specifies a GPL + compatible license. + + + + + + Routines and Conventions + + + Double-linked lists + <filename class="headerfile">include/linux/list.h</filename> + + + There are three sets of linked-list routines in the kernel + headers, but this one seems to be winning out (and Linus has + used it). If you don't have some particular pressing need for + a single list, it's a good choice. In fact, I don't care + whether it's a good choice or not, just use it so we can get + rid of the others. + + + + + Return Conventions + + + For code called in user context, it's very common to defy C + convention, and return 0 for success, + and a negative error number + (eg. -EFAULT) for failure. This can be + unintuitive at first, but it's fairly widespread in the networking + code, for example. + + + + The filesystem code uses ERR_PTR() + + include/linux/fs.h; to + encode a negative error number into a pointer, and + IS_ERR() and PTR_ERR() + to get it back out again: avoids a separate pointer parameter for + the error number. Icky, but in a good way. + + + + + Breaking Compilation + + + Linus and the other developers sometimes change function or + structure names in development kernels; this is not done just to + keep everyone on their toes: it reflects a fundamental change + (eg. can no longer be called with interrupts on, or does extra + checks, or doesn't do checks which were caught before). Usually + this is accompanied by a fairly complete note to the linux-kernel + mailing list; search the archive. Simply doing a global replace + on the file usually makes things worse. + + + + + Initializing structure members + + + The preferred method of initializing structures is to use + designated initialisers, as defined by ISO C99, eg: + + +static struct block_device_operations opt_fops = { + .open = opt_open, + .release = opt_release, + .ioctl = opt_ioctl, + .check_media_change = opt_media_change, +}; + + + This makes it easy to grep for, and makes it clear which + structure fields are set. You should do this because it looks + cool. + + + + + GNU Extensions + + + GNU Extensions are explicitly allowed in the Linux kernel. + Note that some of the more complex ones are not very well + supported, due to lack of general use, but the following are + considered standard (see the GCC info page section "C + Extensions" for more details - Yes, really the info page, the + man page is only a short summary of the stuff in info): + + + + + Inline functions + + + + + Statement expressions (ie. the ({ and }) constructs). + + + + + Declaring attributes of a function / variable / type + (__attribute__) + + + + + typeof + + + + + Zero length arrays + + + + + Macro varargs + + + + + Arithmetic on void pointers + + + + + Non-Constant initializers + + + + + Assembler Instructions (not outside arch/ and include/asm/) + + + + + Function names as strings (__FUNCTION__) + + + + + __builtin_constant_p() + + + + + + Be wary when using long long in the kernel, the code gcc generates for + it is horrible and worse: division and multiplication does not work + on i386 because the GCC runtime functions for it are missing from + the kernel environment. + + + + + + + C++ + + + Using C++ in the kernel is usually a bad idea, because the + kernel does not provide the necessary runtime environment + and the include files are not tested for it. It is still + possible, but not recommended. If you really want to do + this, forget about exceptions at least. + + + + + #if + + + It is generally considered cleaner to use macros in header files + (or at the top of .c files) to abstract away functions rather than + using `#if' pre-processor statements throughout the source code. + + + + + + Putting Your Stuff in the Kernel + + + In order to get your stuff into shape for official inclusion, or + even to make a neat patch, there's administrative work to be + done: + + + + + Figure out whose pond you've been pissing in. Look at the top of + the source files, inside the MAINTAINERS + file, and last of all in the CREDITS file. + You should coordinate with this person to make sure you're not + duplicating effort, or trying something that's already been + rejected. + + + + Make sure you put your name and EMail address at the top of + any files you create or mangle significantly. This is the + first place people will look when they find a bug, or when + they want to make a change. + + + + + + Usually you want a configuration option for your kernel hack. + Edit Config.in in the appropriate directory + (but under arch/ it's called + config.in). The Config Language used is not + bash, even though it looks like bash; the safe way is to use only + the constructs that you already see in + Config.in files (see + Documentation/kbuild/kconfig-language.txt). + It's good to run "make xconfig" at least once to test (because + it's the only one with a static parser). + + + + Variables which can be Y or N use bool followed by a + tagline and the config define name (which must start with + CONFIG_). The tristate function is the same, but + allows the answer M (which defines + CONFIG_foo_MODULE in your source, instead of + CONFIG_FOO) if CONFIG_MODULES + is enabled. + + + + You may well want to make your CONFIG option only visible if + CONFIG_EXPERIMENTAL is enabled: this serves as a + warning to users. There many other fancy things you can do: see + the various Config.in files for ideas. + + + + + + Edit the Makefile: the CONFIG variables are + exported here so you can conditionalize compilation with `ifeq'. + If your file exports symbols then add the names to + export-objs so that genksyms will find them. + + + There is a restriction on the kernel build system that objects + which export symbols must have globally unique names. + If your object does not have a globally unique name then the + standard fix is to move the + EXPORT_SYMBOL() statements to their own + object with a unique name. + This is why several systems have separate exporting objects, + usually suffixed with ksyms. + + + + + + + + Document your option in Documentation/Configure.help. Mention + incompatibilities and issues here. Definitely + end your description with if in doubt, say N + (or, occasionally, `Y'); this is for people who have no + idea what you are talking about. + + + + + + Put yourself in CREDITS if you've done + something noteworthy, usually beyond a single file (your name + should be at the top of the source files anyway). + MAINTAINERS means you want to be consulted + when changes are made to a subsystem, and hear about bugs; it + implies a more-than-passing commitment to some part of the code. + + + + + + Finally, don't forget to read Documentation/SubmittingPatches + and possibly Documentation/SubmittingDrivers. + + + + + + + Kernel Cantrips + + + Some favorites from browsing the source. Feel free to add to this + list. + + + + include/linux/brlock.h: + + +extern inline void br_read_lock (enum brlock_indices idx) +{ + /* + * This causes a link-time bug message if an + * invalid index is used: + */ + if (idx >= __BR_END) + __br_lock_usage_bug(); + + read_lock(&__brlock_array[smp_processor_id()][idx]); +} + + + + include/linux/fs.h: + + +/* + * Kernel pointers have redundant information, so we can use a + * scheme where we can return either an error code or a dentry + * pointer with the same return value. + * + * This should be a per-architecture thing, to allow different + * error and pointer decisions. + */ + #define ERR_PTR(err) ((void *)((long)(err))) + #define PTR_ERR(ptr) ((long)(ptr)) + #define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000)) + + + + include/asm-i386/uaccess.h: + + + +#define copy_to_user(to,from,n) \ + (__builtin_constant_p(n) ? \ + __constant_copy_to_user((to),(from),(n)) : \ + __generic_copy_to_user((to),(from),(n))) + + + + arch/sparc/kernel/head.S: + + + +/* + * Sun people can't spell worth damn. "compatability" indeed. + * At least we *know* we can't spell, and use a spell-checker. + */ + +/* Uh, actually Linus it is I who cannot spell. Too much murky + * Sparc assembly will do this to ya. + */ +C_LABEL(cputypvar): + .asciz "compatability" + +/* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */ + .align 4 +C_LABEL(cputypvar_sun4m): + .asciz "compatible" + + + + arch/sparc/lib/checksum.S: + + + + /* Sun, you just can't beat me, you just can't. Stop trying, + * give up. I'm serious, I am going to kick the living shit + * out of you, game over, lights out. + */ + + + + + Thanks + + + Thanks to Andi Kleen for the idea, answering my questions, fixing + my mistakes, filling content, etc. Philipp Rumpf for more spelling + and clarity fixes, and some excellent non-obvious points. Werner + Almesberger for giving me a great summary of + disable_irq(), and Jes Sorensen and Andrea + Arcangeli added caveats. Michael Elizabeth Chastain for checking + and adding to the Configure section. Telsa Gwynne for teaching me DocBook. + + +
+ diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl new file mode 100644 index 000000000000..90dc2de8e0af --- /dev/null +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -0,0 +1,2088 @@ + + + + + + Unreliable Guide To Locking + + + + Rusty + Russell + +
+ rusty@rustcorp.com.au +
+
+
+
+ + + 2003 + Rusty Russell + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + Introduction + + Welcome, to Rusty's Remarkably Unreliable Guide to Kernel + Locking issues. This document describes the locking systems in + the Linux Kernel in 2.6. + + + With the wide availability of HyperThreading, and preemption in the Linux + Kernel, everyone hacking on the kernel needs to know the + fundamentals of concurrency and locking for + SMP. + + + + + The Problem With Concurrency + + (Skip this if you know what a Race Condition is). + + + In a normal program, you can increment a counter like so: + + + very_important_count++; + + + + This is what they would expect to happen: + + + + Expected Results + + + + + + Instance 1 + Instance 2 + + + + + + read very_important_count (5) + + + + add 1 (6) + + + + write very_important_count (6) + + + + + read very_important_count (6) + + + + add 1 (7) + + + + write very_important_count (7) + + + + +
+ + + This is what might happen: + + + + Possible Results + + + + + Instance 1 + Instance 2 + + + + + + read very_important_count (5) + + + + + read very_important_count (5) + + + add 1 (6) + + + + + add 1 (6) + + + write very_important_count (6) + + + + + write very_important_count (6) + + + +
+ + + Race Conditions and Critical Regions + + This overlap, where the result depends on the + relative timing of multiple tasks, is called a race condition. + The piece of code containing the concurrency issue is called a + critical region. And especially since Linux starting running + on SMP machines, they became one of the major issues in kernel + design and implementation. + + + Preemption can have the same effect, even if there is only one + CPU: by preempting one task during the critical region, we have + exactly the same race condition. In this case the thread which + preempts might run the critical region itself. + + + The solution is to recognize when these simultaneous accesses + occur, and use locks to make sure that only one instance can + enter the critical region at any time. There are many + friendly primitives in the Linux kernel to help you do this. + And then there are the unfriendly primitives, but I'll pretend + they don't exist. + + +
+ + + Locking in the Linux Kernel + + + If I could give you one piece of advice: never sleep with anyone + crazier than yourself. But if I had to give you advice on + locking: keep it simple. + + + + Be reluctant to introduce new locks. + + + + Strangely enough, this last one is the exact reverse of my advice when + you have slept with someone crazier than yourself. + And you should think about getting a big dog. + + + + Two Main Types of Kernel Locks: Spinlocks and Semaphores + + + There are two main types of kernel locks. The fundamental type + is the spinlock + (include/asm/spinlock.h), + which is a very simple single-holder lock: if you can't get the + spinlock, you keep trying (spinning) until you can. Spinlocks are + very small and fast, and can be used anywhere. + + + The second type is a semaphore + (include/asm/semaphore.h): it + can have more than one holder at any time (the number decided at + initialization time), although it is most commonly used as a + single-holder lock (a mutex). If you can't get a semaphore, + your task will put itself on the queue, and be woken up when the + semaphore is released. This means the CPU will do something + else while you are waiting, but there are many cases when you + simply can't sleep (see ), and so + have to use a spinlock instead. + + + Neither type of lock is recursive: see + . + + + + + Locks and Uniprocessor Kernels + + + For kernels compiled without CONFIG_SMP, and + without CONFIG_PREEMPT spinlocks do not exist at + all. This is an excellent design decision: when no-one else can + run at the same time, there is no reason to have a lock. + + + + If the kernel is compiled without CONFIG_SMP, + but CONFIG_PREEMPT is set, then spinlocks + simply disable preemption, which is sufficient to prevent any + races. For most purposes, we can think of preemption as + equivalent to SMP, and not worry about it separately. + + + + You should always test your locking code with CONFIG_SMP + and CONFIG_PREEMPT enabled, even if you don't have an SMP test box, because it + will still catch some kinds of locking bugs. + + + + Semaphores still exist, because they are required for + synchronization between user + contexts, as we will see below. + + + + + Locking Only In User Context + + + If you have a data structure which is only ever accessed from + user context, then you can use a simple semaphore + (linux/asm/semaphore.h) to protect it. This + is the most trivial case: you initialize the semaphore to the number + of resources available (usually 1), and call + down_interruptible() to grab the semaphore, and + up() to release it. There is also a + down(), which should be avoided, because it + will not return if a signal is received. + + + + Example: linux/net/core/netfilter.c allows + registration of new setsockopt() and + getsockopt() calls, with + nf_register_sockopt(). Registration and + de-registration are only done on module load and unload (and boot + time, where there is no concurrency), and the list of registrations + is only consulted for an unknown setsockopt() + or getsockopt() system call. The + nf_sockopt_mutex is perfect to protect this, + especially since the setsockopt and getsockopt calls may well + sleep. + + + + + Locking Between User Context and Softirqs + + + If a softirq shares + data with user context, you have two problems. Firstly, the current + user context can be interrupted by a softirq, and secondly, the + critical region could be entered from another CPU. This is where + spin_lock_bh() + (include/linux/spinlock.h) is + used. It disables softirqs on that CPU, then grabs the lock. + spin_unlock_bh() does the reverse. (The + '_bh' suffix is a historical reference to "Bottom Halves", the + old name for software interrupts. It should really be + called spin_lock_softirq()' in a perfect world). + + + + Note that you can also use spin_lock_irq() + or spin_lock_irqsave() here, which stop + hardware interrupts as well: see . + + + + This works perfectly for UP + as well: the spin lock vanishes, and this macro + simply becomes local_bh_disable() + (include/linux/interrupt.h), which + protects you from the softirq being run. + + + + + Locking Between User Context and Tasklets + + + This is exactly the same as above, because tasklets are actually run + from a softirq. + + + + + Locking Between User Context and Timers + + + This, too, is exactly the same as above, because timers are actually run from + a softirq. From a locking point of view, tasklets and timers + are identical. + + + + + Locking Between Tasklets/Timers + + + Sometimes a tasklet or timer might want to share data with + another tasklet or timer. + + + + The Same Tasklet/Timer + + Since a tasklet is never run on two CPUs at once, you don't + need to worry about your tasklet being reentrant (running + twice at once), even on SMP. + + + + + Different Tasklets/Timers + + If another tasklet/timer wants + to share data with your tasklet or timer , you will both need to use + spin_lock() and + spin_unlock() calls. + spin_lock_bh() is + unnecessary here, as you are already in a tasklet, and + none will be run on the same CPU. + + + + + + Locking Between Softirqs + + + Often a softirq might + want to share data with itself or a tasklet/timer. + + + + The Same Softirq + + + The same softirq can run on the other CPUs: you can use a + per-CPU array (see ) for better + performance. If you're going so far as to use a softirq, + you probably care about scalable performance enough + to justify the extra complexity. + + + + You'll need to use spin_lock() and + spin_unlock() for shared data. + + + + + Different Softirqs + + + You'll need to use spin_lock() and + spin_unlock() for shared data, whether it + be a timer, tasklet, different softirq or the same or another + softirq: any of them could be running on a different CPU. + + + + + + + Hard IRQ Context + + + Hardware interrupts usually communicate with a + tasklet or softirq. Frequently this involves putting work in a + queue, which the softirq will take out. + + + + Locking Between Hard IRQ and Softirqs/Tasklets + + + If a hardware irq handler shares data with a softirq, you have + two concerns. Firstly, the softirq processing can be + interrupted by a hardware interrupt, and secondly, the + critical region could be entered by a hardware interrupt on + another CPU. This is where spin_lock_irq() is + used. It is defined to disable interrupts on that cpu, then grab + the lock. spin_unlock_irq() does the reverse. + + + + The irq handler does not to use + spin_lock_irq(), because the softirq cannot + run while the irq handler is running: it can use + spin_lock(), which is slightly faster. The + only exception would be if a different hardware irq handler uses + the same lock: spin_lock_irq() will stop + that from interrupting us. + + + + This works perfectly for UP as well: the spin lock vanishes, + and this macro simply becomes local_irq_disable() + (include/asm/smp.h), which + protects you from the softirq/tasklet/BH being run. + + + + spin_lock_irqsave() + (include/linux/spinlock.h) is a variant + which saves whether interrupts were on or off in a flags word, + which is passed to spin_unlock_irqrestore(). This + means that the same code can be used inside an hard irq handler (where + interrupts are already off) and in softirqs (where the irq + disabling is required). + + + + Note that softirqs (and hence tasklets and timers) are run on + return from hardware interrupts, so + spin_lock_irq() also stops these. In that + sense, spin_lock_irqsave() is the most + general and powerful locking function. + + + + + Locking Between Two Hard IRQ Handlers + + It is rare to have to share data between two IRQ handlers, but + if you do, spin_lock_irqsave() should be + used: it is architecture-specific whether all interrupts are + disabled inside irq handlers themselves. + + + + + + + Cheat Sheet For Locking + + Pete Zaitcev gives the following summary: + + + + + If you are in a process context (any syscall) and want to + lock other process out, use a semaphore. You can take a semaphore + and sleep (copy_from_user*( or + kmalloc(x,GFP_KERNEL)). + + + + + Otherwise (== data can be touched in an interrupt), use + spin_lock_irqsave() and + spin_unlock_irqrestore(). + + + + + Avoid holding spinlock for more than 5 lines of code and + across any function call (except accessors like + readb). + + + + + + Table of Minimum Requirements + + The following table lists the minimum + locking requirements between various contexts. In some cases, + the same context can only be running on one CPU at a time, so + no locking is required for that context (eg. a particular + thread can only run on one CPU at a time, but if it needs + shares data with another thread, locking is required). + + + Remember the advice above: you can always use + spin_lock_irqsave(), which is a superset + of all other spinlock primitives. + + +Table of Locking Requirements + + + + +IRQ Handler A +IRQ Handler B +Softirq A +Softirq B +Tasklet A +Tasklet B +Timer A +Timer B +User Context A +User Context B + + + +IRQ Handler A +None + + + +IRQ Handler B +spin_lock_irqsave +None + + + +Softirq A +spin_lock_irq +spin_lock_irq +spin_lock + + + +Softirq B +spin_lock_irq +spin_lock_irq +spin_lock +spin_lock + + + +Tasklet A +spin_lock_irq +spin_lock_irq +spin_lock +spin_lock +None + + + +Tasklet B +spin_lock_irq +spin_lock_irq +spin_lock +spin_lock +spin_lock +None + + + +Timer A +spin_lock_irq +spin_lock_irq +spin_lock +spin_lock +spin_lock +spin_lock +None + + + +Timer B +spin_lock_irq +spin_lock_irq +spin_lock +spin_lock +spin_lock +spin_lock +spin_lock +None + + + +User Context A +spin_lock_irq +spin_lock_irq +spin_lock_bh +spin_lock_bh +spin_lock_bh +spin_lock_bh +spin_lock_bh +spin_lock_bh +None + + + +User Context B +spin_lock_irq +spin_lock_irq +spin_lock_bh +spin_lock_bh +spin_lock_bh +spin_lock_bh +spin_lock_bh +spin_lock_bh +down_interruptible +None + + + + +
+
+
+ + + Common Examples + +Let's step through a simple example: a cache of number to name +mappings. The cache keeps a count of how often each of the objects is +used, and when it gets full, throws out the least used one. + + + + + All In User Context + +For our first example, we assume that all operations are in user +context (ie. from system calls), so we can sleep. This means we can +use a semaphore to protect the cache and all the objects within +it. Here's the code: + + + +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <asm/semaphore.h> +#include <asm/errno.h> + +struct object +{ + struct list_head list; + int id; + char name[32]; + int popularity; +}; + +/* Protects the cache, cache_num, and the objects within it */ +static DECLARE_MUTEX(cache_lock); +static LIST_HEAD(cache); +static unsigned int cache_num = 0; +#define MAX_CACHE_SIZE 10 + +/* Must be holding cache_lock */ +static struct object *__cache_find(int id) +{ + struct object *i; + + list_for_each_entry(i, &cache, list) + if (i->id == id) { + i->popularity++; + return i; + } + return NULL; +} + +/* Must be holding cache_lock */ +static void __cache_delete(struct object *obj) +{ + BUG_ON(!obj); + list_del(&obj->list); + kfree(obj); + cache_num--; +} + +/* Must be holding cache_lock */ +static void __cache_add(struct object *obj) +{ + list_add(&obj->list, &cache); + if (++cache_num > MAX_CACHE_SIZE) { + struct object *i, *outcast = NULL; + list_for_each_entry(i, &cache, list) { + if (!outcast || i->popularity < outcast->popularity) + outcast = i; + } + __cache_delete(outcast); + } +} + +int cache_add(int id, const char *name) +{ + struct object *obj; + + if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) + return -ENOMEM; + + strlcpy(obj->name, name, sizeof(obj->name)); + obj->id = id; + obj->popularity = 0; + + down(&cache_lock); + __cache_add(obj); + up(&cache_lock); + return 0; +} + +void cache_delete(int id) +{ + down(&cache_lock); + __cache_delete(__cache_find(id)); + up(&cache_lock); +} + +int cache_find(int id, char *name) +{ + struct object *obj; + int ret = -ENOENT; + + down(&cache_lock); + obj = __cache_find(id); + if (obj) { + ret = 0; + strcpy(name, obj->name); + } + up(&cache_lock); + return ret; +} + + + +Note that we always make sure we have the cache_lock when we add, +delete, or look up the cache: both the cache infrastructure itself and +the contents of the objects are protected by the lock. In this case +it's easy, since we copy the data for the user, and never let them +access the objects directly. + + +There is a slight (and common) optimization here: in +cache_add we set up the fields of the object +before grabbing the lock. This is safe, as no-one else can access it +until we put it in cache. + + + + + Accessing From Interrupt Context + +Now consider the case where cache_find can be +called from interrupt context: either a hardware interrupt or a +softirq. An example would be a timer which deletes object from the +cache. + + +The change is shown below, in standard patch format: the +- are lines which are taken away, and the ++ are lines which are added. + + +--- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100 ++++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100 +@@ -12,7 +12,7 @@ + int popularity; + }; + +-static DECLARE_MUTEX(cache_lock); ++static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED; + static LIST_HEAD(cache); + static unsigned int cache_num = 0; + #define MAX_CACHE_SIZE 10 +@@ -55,6 +55,7 @@ + int cache_add(int id, const char *name) + { + struct object *obj; ++ unsigned long flags; + + if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) + return -ENOMEM; +@@ -63,30 +64,33 @@ + obj->id = id; + obj->popularity = 0; + +- down(&cache_lock); ++ spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); +- up(&cache_lock); ++ spin_unlock_irqrestore(&cache_lock, flags); + return 0; + } + + void cache_delete(int id) + { +- down(&cache_lock); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&cache_lock, flags); + __cache_delete(__cache_find(id)); +- up(&cache_lock); ++ spin_unlock_irqrestore(&cache_lock, flags); + } + + int cache_find(int id, char *name) + { + struct object *obj; + int ret = -ENOENT; ++ unsigned long flags; + +- down(&cache_lock); ++ spin_lock_irqsave(&cache_lock, flags); + obj = __cache_find(id); + if (obj) { + ret = 0; + strcpy(name, obj->name); + } +- up(&cache_lock); ++ spin_unlock_irqrestore(&cache_lock, flags); + return ret; + } + + + +Note that the spin_lock_irqsave will turn off +interrupts if they are on, otherwise does nothing (if we are already +in an interrupt handler), hence these functions are safe to call from +any context. + + +Unfortunately, cache_add calls +kmalloc with the GFP_KERNEL +flag, which is only legal in user context. I have assumed that +cache_add is still only called in user context, +otherwise this should become a parameter to +cache_add. + + + + Exposing Objects Outside This File + +If our objects contained more information, it might not be sufficient +to copy the information in and out: other parts of the code might want +to keep pointers to these objects, for example, rather than looking up +the id every time. This produces two problems. + + +The first problem is that we use the cache_lock to +protect objects: we'd need to make this non-static so the rest of the +code can use it. This makes locking trickier, as it is no longer all +in one place. + + +The second problem is the lifetime problem: if another structure keeps +a pointer to an object, it presumably expects that pointer to remain +valid. Unfortunately, this is only guaranteed while you hold the +lock, otherwise someone might call cache_delete +and even worse, add another object, re-using the same address. + + +As there is only one lock, you can't hold it forever: no-one else would +get any work done. + + +The solution to this problem is to use a reference count: everyone who +has a pointer to the object increases it when they first get the +object, and drops the reference count when they're finished with it. +Whoever drops it to zero knows it is unused, and can actually delete it. + + +Here is the code: + + + +--- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100 ++++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100 +@@ -7,6 +7,7 @@ + struct object + { + struct list_head list; ++ unsigned int refcnt; + int id; + char name[32]; + int popularity; +@@ -17,6 +18,35 @@ + static unsigned int cache_num = 0; + #define MAX_CACHE_SIZE 10 + ++static void __object_put(struct object *obj) ++{ ++ if (--obj->refcnt == 0) ++ kfree(obj); ++} ++ ++static void __object_get(struct object *obj) ++{ ++ obj->refcnt++; ++} ++ ++void object_put(struct object *obj) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&cache_lock, flags); ++ __object_put(obj); ++ spin_unlock_irqrestore(&cache_lock, flags); ++} ++ ++void object_get(struct object *obj) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&cache_lock, flags); ++ __object_get(obj); ++ spin_unlock_irqrestore(&cache_lock, flags); ++} ++ + /* Must be holding cache_lock */ + static struct object *__cache_find(int id) + { +@@ -35,6 +65,7 @@ + { + BUG_ON(!obj); + list_del(&obj->list); ++ __object_put(obj); + cache_num--; + } + +@@ -63,6 +94,7 @@ + strlcpy(obj->name, name, sizeof(obj->name)); + obj->id = id; + obj->popularity = 0; ++ obj->refcnt = 1; /* The cache holds a reference */ + + spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); +@@ -79,18 +111,15 @@ + spin_unlock_irqrestore(&cache_lock, flags); + } + +-int cache_find(int id, char *name) ++struct object *cache_find(int id) + { + struct object *obj; +- int ret = -ENOENT; + unsigned long flags; + + spin_lock_irqsave(&cache_lock, flags); + obj = __cache_find(id); +- if (obj) { +- ret = 0; +- strcpy(name, obj->name); +- } ++ if (obj) ++ __object_get(obj); + spin_unlock_irqrestore(&cache_lock, flags); +- return ret; ++ return obj; + } + + + +We encapsulate the reference counting in the standard 'get' and 'put' +functions. Now we can return the object itself from +cache_find which has the advantage that the user +can now sleep holding the object (eg. to +copy_to_user to name to userspace). + + +The other point to note is that I said a reference should be held for +every pointer to the object: thus the reference count is 1 when first +inserted into the cache. In some versions the framework does not hold +a reference count, but they are more complicated. + + + + Using Atomic Operations For The Reference Count + +In practice, atomic_t would usually be used for +refcnt. There are a number of atomic +operations defined in + +include/asm/atomic.h: these are +guaranteed to be seen atomically from all CPUs in the system, so no +lock is required. In this case, it is simpler than using spinlocks, +although for anything non-trivial using spinlocks is clearer. The +atomic_inc and +atomic_dec_and_test are used instead of the +standard increment and decrement operators, and the lock is no longer +used to protect the reference count itself. + + + +--- cache.c.refcnt 2003-12-09 15:00:35.000000000 +1100 ++++ cache.c.refcnt-atomic 2003-12-11 15:49:42.000000000 +1100 +@@ -7,7 +7,7 @@ + struct object + { + struct list_head list; +- unsigned int refcnt; ++ atomic_t refcnt; + int id; + char name[32]; + int popularity; +@@ -18,33 +18,15 @@ + static unsigned int cache_num = 0; + #define MAX_CACHE_SIZE 10 + +-static void __object_put(struct object *obj) +-{ +- if (--obj->refcnt == 0) +- kfree(obj); +-} +- +-static void __object_get(struct object *obj) +-{ +- obj->refcnt++; +-} +- + void object_put(struct object *obj) + { +- unsigned long flags; +- +- spin_lock_irqsave(&cache_lock, flags); +- __object_put(obj); +- spin_unlock_irqrestore(&cache_lock, flags); ++ if (atomic_dec_and_test(&obj->refcnt)) ++ kfree(obj); + } + + void object_get(struct object *obj) + { +- unsigned long flags; +- +- spin_lock_irqsave(&cache_lock, flags); +- __object_get(obj); +- spin_unlock_irqrestore(&cache_lock, flags); ++ atomic_inc(&obj->refcnt); + } + + /* Must be holding cache_lock */ +@@ -65,7 +47,7 @@ + { + BUG_ON(!obj); + list_del(&obj->list); +- __object_put(obj); ++ object_put(obj); + cache_num--; + } + +@@ -94,7 +76,7 @@ + strlcpy(obj->name, name, sizeof(obj->name)); + obj->id = id; + obj->popularity = 0; +- obj->refcnt = 1; /* The cache holds a reference */ ++ atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ + + spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); +@@ -119,7 +101,7 @@ + spin_lock_irqsave(&cache_lock, flags); + obj = __cache_find(id); + if (obj) +- __object_get(obj); ++ object_get(obj); + spin_unlock_irqrestore(&cache_lock, flags); + return obj; + } + + + + + + Protecting The Objects Themselves + +In these examples, we assumed that the objects (except the reference +counts) never changed once they are created. If we wanted to allow +the name to change, there are three possibilities: + + + + +You can make cache_lock non-static, and tell people +to grab that lock before changing the name in any object. + + + + +You can provide a cache_obj_rename which grabs +this lock and changes the name for the caller, and tell everyone to +use that function. + + + + +You can make the cache_lock protect only the cache +itself, and use another lock to protect the name. + + + + + +Theoretically, you can make the locks as fine-grained as one lock for +every field, for every object. In practice, the most common variants +are: + + + + +One lock which protects the infrastructure (the cache +list in this example) and all the objects. This is what we have done +so far. + + + + +One lock which protects the infrastructure (including the list +pointers inside the objects), and one lock inside the object which +protects the rest of that object. + + + + +Multiple locks to protect the infrastructure (eg. one lock per hash +chain), possibly with a separate per-object lock. + + + + + +Here is the "lock-per-object" implementation: + + +--- cache.c.refcnt-atomic 2003-12-11 15:50:54.000000000 +1100 ++++ cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100 +@@ -6,11 +6,17 @@ + + struct object + { ++ /* These two protected by cache_lock. */ + struct list_head list; ++ int popularity; ++ + atomic_t refcnt; ++ ++ /* Doesn't change once created. */ + int id; ++ ++ spinlock_t lock; /* Protects the name */ + char name[32]; +- int popularity; + }; + + static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED; +@@ -77,6 +84,7 @@ + obj->id = id; + obj->popularity = 0; + atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ ++ spin_lock_init(&obj->lock); + + spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); + + + +Note that I decide that the popularity +count should be protected by the cache_lock rather +than the per-object lock: this is because it (like the +struct list_head inside the object) is +logically part of the infrastructure. This way, I don't need to grab +the lock of every object in __cache_add when +seeking the least popular. + + + +I also decided that the id member is +unchangeable, so I don't need to grab each object lock in +__cache_find() to examine the +id: the object lock is only used by a +caller who wants to read or write the name +field. + + + +Note also that I added a comment describing what data was protected by +which locks. This is extremely important, as it describes the runtime +behavior of the code, and can be hard to gain from just reading. And +as Alan Cox says, Lock data, not code. + + + + + + Common Problems + + Deadlock: Simple and Advanced + + + There is a coding bug where a piece of code tries to grab a + spinlock twice: it will spin forever, waiting for the lock to + be released (spinlocks, rwlocks and semaphores are not + recursive in Linux). This is trivial to diagnose: not a + stay-up-five-nights-talk-to-fluffy-code-bunnies kind of + problem. + + + + For a slightly more complex case, imagine you have a region + shared by a softirq and user context. If you use a + spin_lock() call to protect it, it is + possible that the user context will be interrupted by the softirq + while it holds the lock, and the softirq will then spin + forever trying to get the same lock. + + + + Both of these are called deadlock, and as shown above, it can + occur even with a single CPU (although not on UP compiles, + since spinlocks vanish on kernel compiles with + CONFIG_SMP=n. You'll still get data corruption + in the second example). + + + + This complete lockup is easy to diagnose: on SMP boxes the + watchdog timer or compiling with DEBUG_SPINLOCKS set + (include/linux/spinlock.h) will show this up + immediately when it happens. + + + + A more complex problem is the so-called 'deadly embrace', + involving two or more locks. Say you have a hash table: each + entry in the table is a spinlock, and a chain of hashed + objects. Inside a softirq handler, you sometimes want to + alter an object from one place in the hash to another: you + grab the spinlock of the old hash chain and the spinlock of + the new hash chain, and delete the object from the old one, + and insert it in the new one. + + + + There are two problems here. First, if your code ever + tries to move the object to the same chain, it will deadlock + with itself as it tries to lock it twice. Secondly, if the + same softirq on another CPU is trying to move another object + in the reverse direction, the following could happen: + + + + Consequences + + + + + + CPU 1 + CPU 2 + + + + + + Grab lock A -> OK + Grab lock B -> OK + + + Grab lock B -> spin + Grab lock A -> spin + + + +
+ + + The two CPUs will spin forever, waiting for the other to give up + their lock. It will look, smell, and feel like a crash. + +
+ + + Preventing Deadlock + + + Textbooks will tell you that if you always lock in the same + order, you will never get this kind of deadlock. Practice + will tell you that this approach doesn't scale: when I + create a new lock, I don't understand enough of the kernel + to figure out where in the 5000 lock hierarchy it will fit. + + + + The best locks are encapsulated: they never get exposed in + headers, and are never held around calls to non-trivial + functions outside the same file. You can read through this + code and see that it will never deadlock, because it never + tries to grab another lock while it has that one. People + using your code don't even need to know you are using a + lock. + + + + A classic problem here is when you provide callbacks or + hooks: if you call these with the lock held, you risk simple + deadlock, or a deadly embrace (who knows what the callback + will do?). Remember, the other programmers are out to get + you, so don't do this. + + + + Overzealous Prevention Of Deadlocks + + + Deadlocks are problematic, but not as bad as data + corruption. Code which grabs a read lock, searches a list, + fails to find what it wants, drops the read lock, grabs a + write lock and inserts the object has a race condition. + + + + If you don't see why, please stay the fuck away from my code. + + + + + + Racing Timers: A Kernel Pastime + + + Timers can produce their own special problems with races. + Consider a collection of objects (list, hash, etc) where each + object has a timer which is due to destroy it. + + + + If you want to destroy the entire collection (say on module + removal), you might do the following: + + + + /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE + HUNGARIAN NOTATION */ + spin_lock_bh(&list_lock); + + while (list) { + struct foo *next = list->next; + del_timer(&list->timer); + kfree(list); + list = next; + } + + spin_unlock_bh(&list_lock); + + + + Sooner or later, this will crash on SMP, because a timer can + have just gone off before the spin_lock_bh(), + and it will only get the lock after we + spin_unlock_bh(), and then try to free + the element (which has already been freed!). + + + + This can be avoided by checking the result of + del_timer(): if it returns + 1, the timer has been deleted. + If 0, it means (in this + case) that it is currently running, so we can do: + + + + retry: + spin_lock_bh(&list_lock); + + while (list) { + struct foo *next = list->next; + if (!del_timer(&list->timer)) { + /* Give timer a chance to delete this */ + spin_unlock_bh(&list_lock); + goto retry; + } + kfree(list); + list = next; + } + + spin_unlock_bh(&list_lock); + + + + Another common problem is deleting timers which restart + themselves (by calling add_timer() at the end + of their timer function). Because this is a fairly common case + which is prone to races, you should use del_timer_sync() + (include/linux/timer.h) + to handle this case. It returns the number of times the timer + had to be deleted before we finally stopped it from adding itself back + in. + + + +
+ + + Locking Speed + + +There are three main things to worry about when considering speed of +some code which does locking. First is concurrency: how many things +are going to be waiting while someone else is holding a lock. Second +is the time taken to actually acquire and release an uncontended lock. +Third is using fewer, or smarter locks. I'm assuming that the lock is +used fairly often: otherwise, you wouldn't be concerned about +efficiency. + + +Concurrency depends on how long the lock is usually held: you should +hold the lock for as long as needed, but no longer. In the cache +example, we always create the object without the lock held, and then +grab the lock only when we are ready to insert it in the list. + + +Acquisition times depend on how much damage the lock operations do to +the pipeline (pipeline stalls) and how likely it is that this CPU was +the last one to grab the lock (ie. is the lock cache-hot for this +CPU): on a machine with more CPUs, this likelihood drops fast. +Consider a 700MHz Intel Pentium III: an instruction takes about 0.7ns, +an atomic increment takes about 58ns, a lock which is cache-hot on +this CPU takes 160ns, and a cacheline transfer from another CPU takes +an additional 170 to 360ns. (These figures from Paul McKenney's + Linux +Journal RCU article). + + +These two aims conflict: holding a lock for a short time might be done +by splitting locks into parts (such as in our final per-object-lock +example), but this increases the number of lock acquisitions, and the +results are often slower than having a single lock. This is another +reason to advocate locking simplicity. + + +The third concern is addressed below: there are some methods to reduce +the amount of locking which needs to be done. + + + + Read/Write Lock Variants + + + Both spinlocks and semaphores have read/write variants: + rwlock_t and struct rw_semaphore. + These divide users into two classes: the readers and the writers. If + you are only reading the data, you can get a read lock, but to write to + the data you need the write lock. Many people can hold a read lock, + but a writer must be sole holder. + + + + If your code divides neatly along reader/writer lines (as our + cache code does), and the lock is held by readers for + significant lengths of time, using these locks can help. They + are slightly slower than the normal locks though, so in practice + rwlock_t is not usually worthwhile. + + + + + Avoiding Locks: Read Copy Update + + + There is a special method of read/write locking called Read Copy + Update. Using RCU, the readers can avoid taking a lock + altogether: as we expect our cache to be read more often than + updated (otherwise the cache is a waste of time), it is a + candidate for this optimization. + + + + How do we get rid of read locks? Getting rid of read locks + means that writers may be changing the list underneath the + readers. That is actually quite simple: we can read a linked + list while an element is being added if the writer adds the + element very carefully. For example, adding + new to a single linked list called + list: + + + + new->next = list->next; + wmb(); + list->next = new; + + + + The wmb() is a write memory barrier. It + ensures that the first operation (setting the new element's + next pointer) is complete and will be seen by + all CPUs, before the second operation is (putting the new + element into the list). This is important, since modern + compilers and modern CPUs can both reorder instructions unless + told otherwise: we want a reader to either not see the new + element at all, or see the new element with the + next pointer correctly pointing at the rest of + the list. + + + Fortunately, there is a function to do this for standard + struct list_head lists: + list_add_rcu() + (include/linux/list.h). + + + Removing an element from the list is even simpler: we replace + the pointer to the old element with a pointer to its successor, + and readers will either see it, or skip over it. + + + list->next = old->next; + + + There is list_del_rcu() + (include/linux/list.h) which does this (the + normal version poisons the old object, which we don't want). + + + The reader must also be careful: some CPUs can look through the + next pointer to start reading the contents of + the next element early, but don't realize that the pre-fetched + contents is wrong when the next pointer changes + underneath them. Once again, there is a + list_for_each_entry_rcu() + (include/linux/list.h) to help you. Of + course, writers can just use + list_for_each_entry(), since there cannot + be two simultaneous writers. + + + Our final dilemma is this: when can we actually destroy the + removed element? Remember, a reader might be stepping through + this element in the list right now: it we free this element and + the next pointer changes, the reader will jump + off into garbage and crash. We need to wait until we know that + all the readers who were traversing the list when we deleted the + element are finished. We use call_rcu() to + register a callback which will actually destroy the object once + the readers are finished. + + + But how does Read Copy Update know when the readers are + finished? The method is this: firstly, the readers always + traverse the list inside + rcu_read_lock()/rcu_read_unlock() + pairs: these simply disable preemption so the reader won't go to + sleep while reading the list. + + + RCU then waits until every other CPU has slept at least once: + since readers cannot sleep, we know that any readers which were + traversing the list during the deletion are finished, and the + callback is triggered. The real Read Copy Update code is a + little more optimized than this, but this is the fundamental + idea. + + + +--- cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100 ++++ cache.c.rcupdate 2003-12-11 17:55:14.000000000 +1100 +@@ -1,15 +1,18 @@ + #include <linux/list.h> + #include <linux/slab.h> + #include <linux/string.h> ++#include <linux/rcupdate.h> + #include <asm/semaphore.h> + #include <asm/errno.h> + + struct object + { +- /* These two protected by cache_lock. */ ++ /* This is protected by RCU */ + struct list_head list; + int popularity; + ++ struct rcu_head rcu; ++ + atomic_t refcnt; + + /* Doesn't change once created. */ +@@ -40,7 +43,7 @@ + { + struct object *i; + +- list_for_each_entry(i, &cache, list) { ++ list_for_each_entry_rcu(i, &cache, list) { + if (i->id == id) { + i->popularity++; + return i; +@@ -49,19 +52,25 @@ + return NULL; + } + ++/* Final discard done once we know no readers are looking. */ ++static void cache_delete_rcu(void *arg) ++{ ++ object_put(arg); ++} ++ + /* Must be holding cache_lock */ + static void __cache_delete(struct object *obj) + { + BUG_ON(!obj); +- list_del(&obj->list); +- object_put(obj); ++ list_del_rcu(&obj->list); + cache_num--; ++ call_rcu(&obj->rcu, cache_delete_rcu, obj); + } + + /* Must be holding cache_lock */ + static void __cache_add(struct object *obj) + { +- list_add(&obj->list, &cache); ++ list_add_rcu(&obj->list, &cache); + if (++cache_num > MAX_CACHE_SIZE) { + struct object *i, *outcast = NULL; + list_for_each_entry(i, &cache, list) { +@@ -85,6 +94,7 @@ + obj->popularity = 0; + atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ + spin_lock_init(&obj->lock); ++ INIT_RCU_HEAD(&obj->rcu); + + spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); +@@ -104,12 +114,11 @@ + struct object *cache_find(int id) + { + struct object *obj; +- unsigned long flags; + +- spin_lock_irqsave(&cache_lock, flags); ++ rcu_read_lock(); + obj = __cache_find(id); + if (obj) + object_get(obj); +- spin_unlock_irqrestore(&cache_lock, flags); ++ rcu_read_unlock(); + return obj; + } + + + +Note that the reader will alter the +popularity member in +__cache_find(), and now it doesn't hold a lock. +One solution would be to make it an atomic_t, but for +this usage, we don't really care about races: an approximate result is +good enough, so I didn't change it. + + + +The result is that cache_find() requires no +synchronization with any other functions, so is almost as fast on SMP +as it would be on UP. + + + +There is a furthur optimization possible here: remember our original +cache code, where there were no reference counts and the caller simply +held the lock whenever using the object? This is still possible: if +you hold the lock, noone can delete the object, so you don't need to +get and put the reference count. + + + +Now, because the 'read lock' in RCU is simply disabling preemption, a +caller which always has preemption disabled between calling +cache_find() and +object_put() does not need to actually get and +put the reference count: we could expose +__cache_find() by making it non-static, and +such callers could simply call that. + + +The benefit here is that the reference count is not written to: the +object is not altered in any way, which is much faster on SMP +machines due to caching. + + + + + Per-CPU Data + + + Another technique for avoiding locking which is used fairly + widely is to duplicate information for each CPU. For example, + if you wanted to keep a count of a common condition, you could + use a spin lock and a single counter. Nice and simple. + + + + If that was too slow (it's usually not, but if you've got a + really big machine to test on and can show that it is), you + could instead use a counter for each CPU, then none of them need + an exclusive lock. See DEFINE_PER_CPU(), + get_cpu_var() and + put_cpu_var() + (include/linux/percpu.h). + + + + Of particular use for simple per-cpu counters is the + local_t type, and the + cpu_local_inc() and related functions, + which are more efficient than simple code on some architectures + (include/asm/local.h). + + + + Note that there is no simple, reliable way of getting an exact + value of such a counter, without introducing more locks. This + is not a problem for some uses. + + + + + Data Which Mostly Used By An IRQ Handler + + + If data is always accessed from within the same IRQ handler, you + don't need a lock at all: the kernel already guarantees that the + irq handler will not run simultaneously on multiple CPUs. + + + Manfred Spraul points out that you can still do this, even if + the data is very occasionally accessed in user context or + softirqs/tasklets. The irq handler doesn't use a lock, and + all other accesses are done as so: + + + + spin_lock(&lock); + disable_irq(irq); + ... + enable_irq(irq); + spin_unlock(&lock); + + + The disable_irq() prevents the irq handler + from running (and waits for it to finish if it's currently + running on other CPUs). The spinlock prevents any other + accesses happening at the same time. Naturally, this is slower + than just a spin_lock_irq() call, so it + only makes sense if this type of access happens extremely + rarely. + + + + + + What Functions Are Safe To Call From Interrupts? + + + Many functions in the kernel sleep (ie. call schedule()) + directly or indirectly: you can never call them while holding a + spinlock, or with preemption disabled. This also means you need + to be in user context: calling them from an interrupt is illegal. + + + + Some Functions Which Sleep + + + The most common ones are listed below, but you usually have to + read the code to find out if other calls are safe. If everyone + else who calls it can sleep, you probably need to be able to + sleep, too. In particular, registration and deregistration + functions usually expect to be called from user context, and can + sleep. + + + + + + Accesses to + userspace: + + + + + copy_from_user() + + + + + copy_to_user() + + + + + get_user() + + + + + put_user() + + + + + + + + kmalloc(GFP_KERNEL) + + + + + + down_interruptible() and + down() + + + There is a down_trylock() which can be + used inside interrupt context, as it will not sleep. + up() will also never sleep. + + + + + + + Some Functions Which Don't Sleep + + + Some functions are safe to call from any context, or holding + almost any lock. + + + + + + printk() + + + + + kfree() + + + + + add_timer() and del_timer() + + + + + + + + Further reading + + + + + Documentation/spinlocks.txt: + Linus Torvalds' spinlocking tutorial in the kernel sources. + + + + + + Unix Systems for Modern Architectures: Symmetric + Multiprocessing and Caching for Kernel Programmers: + + + + Curt Schimmel's very good introduction to kernel level + locking (not written for Linux, but nearly everything + applies). The book is expensive, but really worth every + penny to understand SMP locking. [ISBN: 0201633388] + + + + + + + Thanks + + + Thanks to Telsa Gwynne for DocBooking, neatening and adding + style. + + + + Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul + Mackerras, Ruedi Aschwanden, Alan Cox, Manfred Spraul, Tim + Waugh, Pete Zaitcev, James Morris, Robert Love, Paul McKenney, + John Ashby for proofreading, correcting, flaming, commenting. + + + + Thanks to the cabal for having no influence on this document. + + + + + Glossary + + + preemption + + + Prior to 2.5, or when CONFIG_PREEMPT is + unset, processes in user context inside the kernel would not + preempt each other (ie. you had that CPU until you have it up, + except for interrupts). With the addition of + CONFIG_PREEMPT in 2.5.4, this changed: when + in user context, higher priority tasks can "cut in": spinlocks + were changed to disable preemption, even on UP. + + + + + + bh + + + Bottom Half: for historical reasons, functions with + '_bh' in them often now refer to any software interrupt, e.g. + spin_lock_bh() blocks any software interrupt + on the current CPU. Bottom halves are deprecated, and will + eventually be replaced by tasklets. Only one bottom half will be + running at any time. + + + + + + Hardware Interrupt / Hardware IRQ + + + Hardware interrupt request. in_irq() returns + true in a hardware interrupt handler. + + + + + + Interrupt Context + + + Not user context: processing a hardware irq or software irq. + Indicated by the in_interrupt() macro + returning true. + + + + + + SMP + + + Symmetric Multi-Processor: kernels compiled for multiple-CPU + machines. (CONFIG_SMP=y). + + + + + + Software Interrupt / softirq + + + Software interrupt handler. in_irq() returns + false; in_softirq() + returns true. Tasklets and softirqs + both fall into the category of 'software interrupts'. + + + Strictly speaking a softirq is one of up to 32 enumerated software + interrupts which can run on multiple CPUs at once. + Sometimes used to refer to tasklets as + well (ie. all software interrupts). + + + + + + tasklet + + + A dynamically-registrable software interrupt, + which is guaranteed to only run on one CPU at a time. + + + + + + timer + + + A dynamically-registrable software interrupt, which is run at + (or close to) a given time. When running, it is just like a + tasklet (in fact, they are called from the TIMER_SOFTIRQ). + + + + + + UP + + + Uni-Processor: Non-SMP. (CONFIG_SMP=n). + + + + + + User Context + + + The kernel executing on behalf of a particular process (ie. a + system call or trap) or kernel thread. You can tell which + process with the current macro.) Not to + be confused with userspace. Can be interrupted by software or + hardware interrupts. + + + + + + Userspace + + + A process executing its own code outside the kernel. + + + + + +
+ diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl new file mode 100644 index 000000000000..cf2fce7707da --- /dev/null +++ b/Documentation/DocBook/libata.tmpl @@ -0,0 +1,282 @@ + + + + + + libATA Developer's Guide + + + + Jeff + Garzik + + + + + 2003 + Jeff Garzik + + + + + The contents of this file are subject to the Open + Software License version 1.1 that can be found at + http://www.opensource.org/licenses/osl-1.1.txt and is included herein + by reference. + + + + Alternatively, the contents of this file may be used under the terms + of the GNU General Public License version 2 (the "GPL") as distributed + in the kernel source COPYING file, in which case the provisions of + the GPL are applicable instead of the above. If you wish to allow + the use of your version of this file only under the terms of the + GPL and not to allow others to use your version of this file under + the OSL, indicate your decision by deleting the provisions above and + replace them with the notice and other provisions required by the GPL. + If you do not delete the provisions above, a recipient may use your + version of this file under either the OSL or the GPL. + + + + + + + + + Thanks + + The bulk of the ATA knowledge comes thanks to long conversations with + Andre Hedrick (www.linux-ide.org). + + + Thanks to Alan Cox for pointing out similarities + between SATA and SCSI, and in general for motivation to hack on + libata. + + + libata's device detection + method, ata_pio_devchk, and in general all the early probing was + based on extensive study of Hale Landis's probe/reset code in his + ATADRVR driver (www.ata-atapi.com). + + + + + libata Driver API + + struct ata_port_operations + + +void (*port_disable) (struct ata_port *); + + + + Called from ata_bus_probe() and ata_bus_reset() error paths, + as well as when unregistering from the SCSI module (rmmod, hot + unplug). + + + +void (*dev_config) (struct ata_port *, struct ata_device *); + + + + Called after IDENTIFY [PACKET] DEVICE is issued to each device + found. Typically used to apply device-specific fixups prior to + issue of SET FEATURES - XFER MODE, and prior to operation. + + + +void (*set_piomode) (struct ata_port *, struct ata_device *); +void (*set_dmamode) (struct ata_port *, struct ata_device *); +void (*post_set_mode) (struct ata_port *ap); + + + + Hooks called prior to the issue of SET FEATURES - XFER MODE + command. dev->pio_mode is guaranteed to be valid when + ->set_piomode() is called, and dev->dma_mode is guaranteed to be + valid when ->set_dmamode() is called. ->post_set_mode() is + called unconditionally, after the SET FEATURES - XFER MODE + command completes successfully. + + + + ->set_piomode() is always called (if present), but + ->set_dma_mode() is only called if DMA is possible. + + + +void (*tf_load) (struct ata_port *ap, struct ata_taskfile *tf); +void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf); + + + + ->tf_load() is called to load the given taskfile into hardware + registers / DMA buffers. ->tf_read() is called to read the + hardware registers / DMA buffers, to obtain the current set of + taskfile register values. + + + +void (*exec_command)(struct ata_port *ap, struct ata_taskfile *tf); + + + + causes an ATA command, previously loaded with + ->tf_load(), to be initiated in hardware. + + + +u8 (*check_status)(struct ata_port *ap); +void (*dev_select)(struct ata_port *ap, unsigned int device); + + + + Reads the Status ATA shadow register from hardware. On some + hardware, this has the side effect of clearing the interrupt + condition. + + + +void (*dev_select)(struct ata_port *ap, unsigned int device); + + + + Issues the low-level hardware command(s) that causes one of N + hardware devices to be considered 'selected' (active and + available for use) on the ATA bus. + + + +void (*phy_reset) (struct ata_port *ap); + + + + The very first step in the probe phase. Actions vary depending + on the bus type, typically. After waking up the device and probing + for device presence (PATA and SATA), typically a soft reset + (SRST) will be performed. Drivers typically use the helper + functions ata_bus_reset() or sata_phy_reset() for this hook. + + + +void (*bmdma_setup) (struct ata_queued_cmd *qc); +void (*bmdma_start) (struct ata_queued_cmd *qc); + + + + When setting up an IDE BMDMA transaction, these hooks arm + (->bmdma_setup) and fire (->bmdma_start) the hardware's DMA + engine. + + + +void (*qc_prep) (struct ata_queued_cmd *qc); +int (*qc_issue) (struct ata_queued_cmd *qc); + + + + Higher-level hooks, these two hooks can potentially supercede + several of the above taskfile/DMA engine hooks. ->qc_prep is + called after the buffers have been DMA-mapped, and is typically + used to populate the hardware's DMA scatter-gather table. + Most drivers use the standard ata_qc_prep() helper function, but + more advanced drivers roll their own. + + + ->qc_issue is used to make a command active, once the hardware + and S/G tables have been prepared. IDE BMDMA drivers use the + helper function ata_qc_issue_prot() for taskfile protocol-based + dispatch. More advanced drivers roll their own ->qc_issue + implementation, using this as the "issue new ATA command to + hardware" hook. + + + +void (*eng_timeout) (struct ata_port *ap); + + + + This is a high level error handling function, called from the + error handling thread, when a command times out. + + + +irqreturn_t (*irq_handler)(int, void *, struct pt_regs *); +void (*irq_clear) (struct ata_port *); + + + + ->irq_handler is the interrupt handling routine registered with + the system, by libata. ->irq_clear is called during probe just + before the interrupt handler is registered, to be sure hardware + is quiet. + + + +u32 (*scr_read) (struct ata_port *ap, unsigned int sc_reg); +void (*scr_write) (struct ata_port *ap, unsigned int sc_reg, + u32 val); + + + + Read and write standard SATA phy registers. Currently only used + if ->phy_reset hook called the sata_phy_reset() helper function. + + + +int (*port_start) (struct ata_port *ap); +void (*port_stop) (struct ata_port *ap); +void (*host_stop) (struct ata_host_set *host_set); + + + + ->port_start() is called just after the data structures for each + port are initialized. Typically this is used to alloc per-port + DMA buffers / tables / rings, enable DMA engines, and similar + tasks. + + + ->host_stop() is called when the rmmod or hot unplug process + begins. The hook must stop all hardware interrupts, DMA + engines, etc. + + + ->port_stop() is called after ->host_stop(). It's sole function + is to release DMA/memory resources, now that they are no longer + actively being used. + + + + + + + libata Library +!Edrivers/scsi/libata-core.c + + + + libata Core Internals +!Idrivers/scsi/libata-core.c + + + + libata SCSI translation/emulation +!Edrivers/scsi/libata-scsi.c +!Idrivers/scsi/libata-scsi.c + + + + ata_piix Internals +!Idrivers/scsi/ata_piix.c + + + + sata_sil Internals +!Idrivers/scsi/sata_sil.c + + + diff --git a/Documentation/DocBook/librs.tmpl b/Documentation/DocBook/librs.tmpl new file mode 100644 index 000000000000..3ff39bafc00e --- /dev/null +++ b/Documentation/DocBook/librs.tmpl @@ -0,0 +1,289 @@ + + + + + + Reed-Solomon Library Programming Interface + + + + Thomas + Gleixner + +
+ tglx@linutronix.de +
+
+
+
+ + + 2004 + Thomas Gleixner + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + + Introduction + + The generic Reed-Solomon Library provides encoding, decoding + and error correction functions. + + + Reed-Solomon codes are used in communication and storage + applications to ensure data integrity. + + + This documentation is provided for developers who want to utilize + the functions provided by the library. + + + + + Known Bugs And Assumptions + + None. + + + + + Usage + + This chapter provides examples how to use the library. + + + Initializing + + The init function init_rs returns a pointer to a + rs decoder structure, which holds the necessary + information for encoding, decoding and error correction + with the given polynomial. It either uses an existing + matching decoder or creates a new one. On creation all + the lookup tables for fast en/decoding are created. + The function may take a while, so make sure not to + call it in critical code paths. + + +/* the Reed Solomon control structure */ +static struct rs_control *rs_decoder; + +/* Symbolsize is 10 (bits) + * Primitve polynomial is x^10+x^3+1 + * first consecutive root is 0 + * primitve element to generate roots = 1 + * generator polinomial degree (number of roots) = 6 + */ +rs_decoder = init_rs (10, 0x409, 0, 1, 6); + + + + Encoding + + The encoder calculates the Reed-Solomon code over + the given data length and stores the result in + the parity buffer. Note that the parity buffer must + be initialized before calling the encoder. + + + The expanded data can be inverted on the fly by + providing a non zero inversion mask. The expanded data is + XOR'ed with the mask. This is used e.g. for FLASH + ECC, where the all 0xFF is inverted to an all 0x00. + The Reed-Solomon code for all 0x00 is all 0x00. The + code is inverted before storing to FLASH so it is 0xFF + too. This prevent's that reading from an erased FLASH + results in ECC errors. + + + The databytes are expanded to the given symbol size + on the fly. There is no support for encoding continuous + bitstreams with a symbol size != 8 at the moment. If + it is necessary it should be not a big deal to implement + such functionality. + + +/* Parity buffer. Size = number of roots */ +uint16_t par[6]; +/* Initialize the parity buffer */ +memset(par, 0, sizeof(par)); +/* Encode 512 byte in data8. Store parity in buffer par */ +encode_rs8 (rs_decoder, data8, 512, par, 0); + + + + Decoding + + The decoder calculates the syndrome over + the given data length and the received parity symbols + and corrects errors in the data. + + + If a syndrome is available from a hardware decoder + then the syndrome calculation is skipped. + + + The correction of the data buffer can be suppressed + by providing a correction pattern buffer and an error + location buffer to the decoder. The decoder stores the + calculated error location and the correction bitmask + in the given buffers. This is useful for hardware + decoders which use a weird bit ordering scheme. + + + The databytes are expanded to the given symbol size + on the fly. There is no support for decoding continuous + bitstreams with a symbolsize != 8 at the moment. If + it is necessary it should be not a big deal to implement + such functionality. + + + + + Decoding with syndrome calculation, direct data correction + + +/* Parity buffer. Size = number of roots */ +uint16_t par[6]; +uint8_t data[512]; +int numerr; +/* Receive data */ +..... +/* Receive parity */ +..... +/* Decode 512 byte in data8.*/ +numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL); + + + + + + Decoding with syndrome given by hardware decoder, direct data correction + + +/* Parity buffer. Size = number of roots */ +uint16_t par[6], syn[6]; +uint8_t data[512]; +int numerr; +/* Receive data */ +..... +/* Receive parity */ +..... +/* Get syndrome from hardware decoder */ +..... +/* Decode 512 byte in data8.*/ +numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL); + + + + + + Decoding with syndrome given by hardware decoder, no direct data correction. + + + Note: It's not necessary to give data and received parity to the decoder. + + +/* Parity buffer. Size = number of roots */ +uint16_t par[6], syn[6], corr[8]; +uint8_t data[512]; +int numerr, errpos[8]; +/* Receive data */ +..... +/* Receive parity */ +..... +/* Get syndrome from hardware decoder */ +..... +/* Decode 512 byte in data8.*/ +numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr); +for (i = 0; i < numerr; i++) { + do_error_correction_in_your_buffer(errpos[i], corr[i]); +} + + + + + Cleanup + + The function free_rs frees the allocated resources, + if the caller is the last user of the decoder. + + +/* Release resources */ +free_rs(rs_decoder); + + + + + + + Structures + + This chapter contains the autogenerated documentation of the structures which are + used in the Reed-Solomon Library and are relevant for a developer. + +!Iinclude/linux/rslib.h + + + + Public Functions Provided + + This chapter contains the autogenerated documentation of the Reed-Solomon functions + which are exported. + +!Elib/reed_solomon/reed_solomon.c + + + + Credits + + The library code for encoding and decoding was written by Phil Karn. + + + Copyright 2002, Phil Karn, KA9Q + May be used under the terms of the GNU General Public License (GPL) + + + The wrapper functions and interfaces are written by Thomas Gleixner + + + Many users have provided bugfixes, improvements and helping hands for testing. + Thanks a lot. + + + The following people have contributed to this document: + + + Thomas Gleixnertglx@linutronix.de + + +
diff --git a/Documentation/DocBook/lsm.tmpl b/Documentation/DocBook/lsm.tmpl new file mode 100644 index 000000000000..f63822195871 --- /dev/null +++ b/Documentation/DocBook/lsm.tmpl @@ -0,0 +1,265 @@ + + + +
+ + Linux Security Modules: General Security Hooks for Linux + + + Stephen + Smalley + + NAI Labs +
ssmalley@nai.com
+
+
+ + Timothy + Fraser + + NAI Labs +
tfraser@nai.com
+
+
+ + Chris + Vance + + NAI Labs +
cvance@nai.com
+
+
+
+
+ +Introduction + + +In March 2001, the National Security Agency (NSA) gave a presentation +about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel +Summit. SELinux is an implementation of flexible and fine-grained +nondiscretionary access controls in the Linux kernel, originally +implemented as its own particular kernel patch. Several other +security projects (e.g. RSBAC, Medusa) have also developed flexible +access control architectures for the Linux kernel, and various +projects have developed particular access control models for Linux +(e.g. LIDS, DTE, SubDomain). Each project has developed and +maintained its own kernel patch to support its security needs. + + + +In response to the NSA presentation, Linus Torvalds made a set of +remarks that described a security framework he would be willing to +consider for inclusion in the mainstream Linux kernel. He described a +general framework that would provide a set of security hooks to +control operations on kernel objects and a set of opaque security +fields in kernel data structures for maintaining security attributes. +This framework could then be used by loadable kernel modules to +implement any desired model of security. Linus also suggested the +possibility of migrating the Linux capabilities code into such a +module. + + + +The Linux Security Modules (LSM) project was started by WireX to +develop such a framework. LSM is a joint development effort by +several security projects, including Immunix, SELinux, SGI and Janus, +and several individuals, including Greg Kroah-Hartman and James +Morris, to develop a Linux kernel patch that implements this +framework. The patch is currently tracking the 2.4 series and is +targeted for integration into the 2.5 development series. This +technical report provides an overview of the framework and the example +capabilities security module provided by the LSM kernel patch. + + + + +LSM Framework + + +The LSM kernel patch provides a general kernel framework to support +security modules. In particular, the LSM framework is primarily +focused on supporting access control modules, although future +development is likely to address other security needs such as +auditing. By itself, the framework does not provide any additional +security; it merely provides the infrastructure to support security +modules. The LSM kernel patch also moves most of the capabilities +logic into an optional security module, with the system defaulting +to the traditional superuser logic. This capabilities module +is discussed further in . + + + +The LSM kernel patch adds security fields to kernel data structures +and inserts calls to hook functions at critical points in the kernel +code to manage the security fields and to perform access control. It +also adds functions for registering and unregistering security +modules, and adds a general security system call +to support new system calls for security-aware applications. + + + +The LSM security fields are simply void* pointers. For +process and program execution security information, security fields +were added to struct task_struct and +struct linux_binprm. For filesystem security +information, a security field was added to +struct super_block. For pipe, file, and socket +security information, security fields were added to +struct inode and +struct file. For packet and network device security +information, security fields were added to +struct sk_buff and +struct net_device. For System V IPC security +information, security fields were added to +struct kern_ipc_perm and +struct msg_msg; additionally, the definitions +for struct msg_msg, struct +msg_queue, and struct +shmid_kernel were moved to header files +(include/linux/msg.h and +include/linux/shm.h as appropriate) to allow +the security modules to use these definitions. + + + +Each LSM hook is a function pointer in a global table, +security_ops. This table is a +security_operations structure as defined by +include/linux/security.h. Detailed documentation +for each hook is included in this header file. At present, this +structure consists of a collection of substructures that group related +hooks based on the kernel object (e.g. task, inode, file, sk_buff, +etc) as well as some top-level hook function pointers for system +operations. This structure is likely to be flattened in the future +for performance. The placement of the hook calls in the kernel code +is described by the "called:" lines in the per-hook documentation in +the header file. The hook calls can also be easily found in the +kernel code by looking for the string "security_ops->". + + + + +Linus mentioned per-process security hooks in his original remarks as a +possible alternative to global security hooks. However, if LSM were +to start from the perspective of per-process hooks, then the base +framework would have to deal with how to handle operations that +involve multiple processes (e.g. kill), since each process might have +its own hook for controlling the operation. This would require a +general mechanism for composing hooks in the base framework. +Additionally, LSM would still need global hooks for operations that +have no process context (e.g. network input operations). +Consequently, LSM provides global security hooks, but a security +module is free to implement per-process hooks (where that makes sense) +by storing a security_ops table in each process' security field and +then invoking these per-process hooks from the global hooks. +The problem of composition is thus deferred to the module. + + + +The global security_ops table is initialized to a set of hook +functions provided by a dummy security module that provides +traditional superuser logic. A register_security +function (in security/security.c) is provided to +allow a security module to set security_ops to refer to its own hook +functions, and an unregister_security function is +provided to revert security_ops to the dummy module hooks. This +mechanism is used to set the primary security module, which is +responsible for making the final decision for each hook. + + + +LSM also provides a simple mechanism for stacking additional security +modules with the primary security module. It defines +register_security and +unregister_security hooks in the +security_operations structure and provides +mod_reg_security and +mod_unreg_security functions that invoke these +hooks after performing some sanity checking. A security module can +call these functions in order to stack with other modules. However, +the actual details of how this stacking is handled are deferred to the +module, which can implement these hooks in any way it wishes +(including always returning an error if it does not wish to support +stacking). In this manner, LSM again defers the problem of +composition to the module. + + + +Although the LSM hooks are organized into substructures based on +kernel object, all of the hooks can be viewed as falling into two +major categories: hooks that are used to manage the security fields +and hooks that are used to perform access control. Examples of the +first category of hooks include the +alloc_security and +free_security hooks defined for each kernel data +structure that has a security field. These hooks are used to allocate +and free security structures for kernel objects. The first category +of hooks also includes hooks that set information in the security +field after allocation, such as the post_lookup +hook in struct inode_security_ops. This hook +is used to set security information for inodes after successful lookup +operations. An example of the second category of hooks is the +permission hook in +struct inode_security_ops. This hook checks +permission when accessing an inode. + + + + +LSM Capabilities Module + + +The LSM kernel patch moves most of the existing POSIX.1e capabilities +logic into an optional security module stored in the file +security/capability.c. This change allows +users who do not want to use capabilities to omit this code entirely +from their kernel, instead using the dummy module for traditional +superuser logic or any other module that they desire. This change +also allows the developers of the capabilities logic to maintain and +enhance their code more freely, without needing to integrate patches +back into the base kernel. + + + +In addition to moving the capabilities logic, the LSM kernel patch +could move the capability-related fields from the kernel data +structures into the new security fields managed by the security +modules. However, at present, the LSM kernel patch leaves the +capability fields in the kernel data structures. In his original +remarks, Linus suggested that this might be preferable so that other +security modules can be easily stacked with the capabilities module +without needing to chain multiple security structures on the security field. +It also avoids imposing extra overhead on the capabilities module +to manage the security fields. However, the LSM framework could +certainly support such a move if it is determined to be desirable, +with only a few additional changes described below. + + + +At present, the capabilities logic for computing process capabilities +on execve and set*uid, +checking capabilities for a particular process, saving and checking +capabilities for netlink messages, and handling the +capget and capset system +calls have been moved into the capabilities module. There are still a +few locations in the base kernel where capability-related fields are +directly examined or modified, but the current version of the LSM +patch does allow a security module to completely replace the +assignment and testing of capabilities. These few locations would +need to be changed if the capability-related fields were moved into +the security field. The following is a list of known locations that +still perform such direct examination or modification of +capability-related fields: + +fs/open.c:sys_access +fs/lockd/host.c:nlm_bind_host +fs/nfsd/auth.c:nfsd_setuser +fs/proc/array.c:task_cap + + + + + +
diff --git a/Documentation/DocBook/man/Makefile b/Documentation/DocBook/man/Makefile new file mode 100644 index 000000000000..4fb7ea0f7ac8 --- /dev/null +++ b/Documentation/DocBook/man/Makefile @@ -0,0 +1,3 @@ +# Rules are put in Documentation/DocBook + +clean-files := *.9.gz *.sgml manpage.links manpage.refs diff --git a/Documentation/DocBook/mcabook.tmpl b/Documentation/DocBook/mcabook.tmpl new file mode 100644 index 000000000000..4367f4642f3d --- /dev/null +++ b/Documentation/DocBook/mcabook.tmpl @@ -0,0 +1,107 @@ + + + + + + MCA Driver Programming Interface + + + + Alan + Cox + +
+ alan@redhat.com +
+
+
+ + David + Weinehall + + + Chris + Beauregard + +
+ + + 2000 + Alan Cox + David Weinehall + Chris Beauregard + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + + Introduction + + The MCA bus functions provide a generalised interface to find MCA + bus cards, to claim them for a driver, and to read and manipulate POS + registers without being aware of the motherboard internals or + certain deep magic specific to onboard devices. + + + The basic interface to the MCA bus devices is the slot. Each slot + is numbered and virtual slot numbers are assigned to the internal + devices. Using a pci_dev as other busses do does not really make + sense in the MCA context as the MCA bus resources require card + specific interpretation. + + + Finally the MCA bus functions provide a parallel set of DMA + functions mimicing the ISA bus DMA functions as closely as possible, + although also supporting the additional DMA functionality on the + MCA bus controllers. + + + + Known Bugs And Assumptions + + None. + + + + + Public Functions Provided +!Earch/i386/kernel/mca.c + + + + DMA Functions Provided +!Iinclude/asm-i386/mca_dma.h + + +
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl new file mode 100644 index 000000000000..6e463d0db266 --- /dev/null +++ b/Documentation/DocBook/mtdnand.tmpl @@ -0,0 +1,1320 @@ + + + + + + MTD NAND Driver Programming Interface + + + + Thomas + Gleixner + +
+ tglx@linutronix.de +
+
+
+
+ + + 2004 + Thomas Gleixner + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + + Introduction + + The generic NAND driver supports almost all NAND and AG-AND based + chips and connects them to the Memory Technology Devices (MTD) + subsystem of the Linux Kernel. + + + This documentation is provided for developers who want to implement + board drivers or filesystem drivers suitable for NAND devices. + + + + + Known Bugs And Assumptions + + None. + + + + + Documentation hints + + The function and structure docs are autogenerated. Each function and + struct member has a short description which is marked with an [XXX] identifier. + The following chapters explain the meaning of those identifiers. + + + Function identifiers [XXX] + + The functions are marked with [XXX] identifiers in the short + comment. The identifiers explain the usage and scope of the + functions. Following identifiers are used: + + + + [MTD Interface] + These functions provide the interface to the MTD kernel API. + They are not replacable and provide functionality + which is complete hardware independent. + + + [NAND Interface] + These functions are exported and provide the interface to the NAND kernel API. + + + [GENERIC] + Generic functions are not replacable and provide functionality + which is complete hardware independent. + + + [DEFAULT] + Default functions provide hardware related functionality which is suitable + for most of the implementations. These functions can be replaced by the + board driver if neccecary. Those functions are called via pointers in the + NAND chip description structure. The board driver can set the functions which + should be replaced by board dependend functions before calling nand_scan(). + If the function pointer is NULL on entry to nand_scan() then the pointer + is set to the default function which is suitable for the detected chip type. + + + + + Struct member identifiers [XXX] + + The struct members are marked with [XXX] identifiers in the + comment. The identifiers explain the usage and scope of the + members. Following identifiers are used: + + + + [INTERN] + These members are for NAND driver internal use only and must not be + modified. Most of these values are calculated from the chip geometry + information which is evaluated during nand_scan(). + + + [REPLACEABLE] + Replaceable members hold hardware related functions which can be + provided by the board driver. The board driver can set the functions which + should be replaced by board dependend functions before calling nand_scan(). + If the function pointer is NULL on entry to nand_scan() then the pointer + is set to the default function which is suitable for the detected chip type. + + + [BOARDSPECIFIC] + Board specific members hold hardware related information which must + be provided by the board driver. The board driver must set the function + pointers and datafields before calling nand_scan(). + + + [OPTIONAL] + Optional members can hold information relevant for the board driver. The + generic NAND driver code does not use this information. + + + + + + + Basic board driver + + For most boards it will be sufficient to provide just the + basic functions and fill out some really board dependend + members in the nand chip description structure. + See drivers/mtd/nand/skeleton for reference. + + + Basic defines + + At least you have to provide a mtd structure and + a storage for the ioremap'ed chip address. + You can allocate the mtd structure using kmalloc + or you can allocate it statically. + In case of static allocation you have to allocate + a nand_chip structure too. + + + Kmalloc based example + + +static struct mtd_info *board_mtd; +static unsigned long baseaddr; + + + Static example + + +static struct mtd_info board_mtd; +static struct nand_chip board_chip; +static unsigned long baseaddr; + + + + Partition defines + + If you want to divide your device into parititions, then + enable the configuration switch CONFIG_MTD_PARITIONS and define + a paritioning scheme suitable to your board. + + +#define NUM_PARTITIONS 2 +static struct mtd_partition partition_info[] = { + { .name = "Flash partition 1", + .offset = 0, + .size = 8 * 1024 * 1024 }, + { .name = "Flash partition 2", + .offset = MTDPART_OFS_NEXT, + .size = MTDPART_SIZ_FULL }, +}; + + + + Hardware control function + + The hardware control function provides access to the + control pins of the NAND chip(s). + The access can be done by GPIO pins or by address lines. + If you use address lines, make sure that the timing + requirements are met. + + + GPIO based example + + +static void board_hwcontrol(struct mtd_info *mtd, int cmd) +{ + switch(cmd){ + case NAND_CTL_SETCLE: /* Set CLE pin high */ break; + case NAND_CTL_CLRCLE: /* Set CLE pin low */ break; + case NAND_CTL_SETALE: /* Set ALE pin high */ break; + case NAND_CTL_CLRALE: /* Set ALE pin low */ break; + case NAND_CTL_SETNCE: /* Set nCE pin low */ break; + case NAND_CTL_CLRNCE: /* Set nCE pin high */ break; + } +} + + + Address lines based example. It's assumed that the + nCE pin is driven by a chip select decoder. + + +static void board_hwcontrol(struct mtd_info *mtd, int cmd) +{ + struct nand_chip *this = (struct nand_chip *) mtd->priv; + switch(cmd){ + case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break; + case NAND_CTL_CLRCLE: this->IO_ADDR_W &= ~CLE_ADRR_BIT; break; + case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT; break; + case NAND_CTL_CLRALE: this->IO_ADDR_W &= ~ALE_ADRR_BIT; break; + } +} + + + + Device ready function + + If the hardware interface has the ready busy pin of the NAND chip connected to a + GPIO or other accesible I/O pin, this function is used to read back the state of the + pin. The function has no arguments and should return 0, if the device is busy (R/B pin + is low) and 1, if the device is ready (R/B pin is high). + If the hardware interface does not give access to the ready busy pin, then + the function must not be defined and the function pointer this->dev_ready is set to NULL. + + + + Init function + + The init function allocates memory and sets up all the board + specific parameters and function pointers. When everything + is set up nand_scan() is called. This function tries to + detect and identify then chip. If a chip is found all the + internal data fields are initialized accordingly. + The structure(s) have to be zeroed out first and then filled with the neccecary + information about the device. + + +int __init board_init (void) +{ + struct nand_chip *this; + int err = 0; + + /* Allocate memory for MTD device structure and private data */ + board_mtd = kmalloc (sizeof(struct mtd_info) + sizeof (struct nand_chip), GFP_KERNEL); + if (!board_mtd) { + printk ("Unable to allocate NAND MTD device structure.\n"); + err = -ENOMEM; + goto out; + } + + /* Initialize structures */ + memset ((char *) board_mtd, 0, sizeof(struct mtd_info) + sizeof(struct nand_chip)); + + /* map physical adress */ + baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024); + if(!baseaddr){ + printk("Ioremap to access NAND chip failed\n"); + err = -EIO; + goto out_mtd; + } + + /* Get pointer to private data */ + this = (struct nand_chip *) (); + /* Link the private data with the MTD structure */ + board_mtd->priv = this; + + /* Set address of NAND IO lines */ + this->IO_ADDR_R = baseaddr; + this->IO_ADDR_W = baseaddr; + /* Reference hardware control function */ + this->hwcontrol = board_hwcontrol; + /* Set command delay time, see datasheet for correct value */ + this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY; + /* Assign the device ready function, if available */ + this->dev_ready = board_dev_ready; + this->eccmode = NAND_ECC_SOFT; + + /* Scan to find existance of the device */ + if (nand_scan (board_mtd, 1)) { + err = -ENXIO; + goto out_ior; + } + + add_mtd_partitions(board_mtd, partition_info, NUM_PARTITIONS); + goto out; + +out_ior: + iounmap((void *)baseaddr); +out_mtd: + kfree (board_mtd); +out: + return err; +} +module_init(board_init); + + + + Exit function + + The exit function is only neccecary if the driver is + compiled as a module. It releases all resources which + are held by the chip driver and unregisters the partitions + in the MTD layer. + + +#ifdef MODULE +static void __exit board_cleanup (void) +{ + /* Release resources, unregister device */ + nand_release (board_mtd); + + /* unmap physical adress */ + iounmap((void *)baseaddr); + + /* Free the MTD device structure */ + kfree (board_mtd); +} +module_exit(board_cleanup); +#endif + + + + + + Advanced board driver functions + + This chapter describes the advanced functionality of the NAND + driver. For a list of functions which can be overridden by the board + driver see the documentation of the nand_chip structure. + + + Multiple chip control + + The nand driver can control chip arrays. Therefor the + board driver must provide an own select_chip function. This + function must (de)select the requested chip. + The function pointer in the nand_chip structure must + be set before calling nand_scan(). The maxchip parameter + of nand_scan() defines the maximum number of chips to + scan for. Make sure that the select_chip function can + handle the requested number of chips. + + + The nand driver concatenates the chips to one virtual + chip and provides this virtual chip to the MTD layer. + + + Note: The driver can only handle linear chip arrays + of equally sized chips. There is no support for + parallel arrays which extend the buswidth. + + + GPIO based example + + +static void board_select_chip (struct mtd_info *mtd, int chip) +{ + /* Deselect all chips, set all nCE pins high */ + GPIO(BOARD_NAND_NCE) |= 0xff; + if (chip >= 0) + GPIO(BOARD_NAND_NCE) &= ~ (1 << chip); +} + + + Address lines based example. + Its assumed that the nCE pins are connected to an + address decoder. + + +static void board_select_chip (struct mtd_info *mtd, int chip) +{ + struct nand_chip *this = (struct nand_chip *) mtd->priv; + + /* Deselect all chips */ + this->IO_ADDR_R &= ~BOARD_NAND_ADDR_MASK; + this->IO_ADDR_W &= ~BOARD_NAND_ADDR_MASK; + switch (chip) { + case 0: + this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0; + this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0; + break; + .... + case n: + this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn; + this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn; + break; + } +} + + + + Hardware ECC support + + Functions and constants + + The nand driver supports three different types of + hardware ECC. + + NAND_ECC_HW3_256 + Hardware ECC generator providing 3 bytes ECC per + 256 byte. + + NAND_ECC_HW3_512 + Hardware ECC generator providing 3 bytes ECC per + 512 byte. + + NAND_ECC_HW6_512 + Hardware ECC generator providing 6 bytes ECC per + 512 byte. + + NAND_ECC_HW8_512 + Hardware ECC generator providing 6 bytes ECC per + 512 byte. + + + If your hardware generator has a different functionality + add it at the appropriate place in nand_base.c + + + The board driver must provide following functions: + + enable_hwecc + This function is called before reading / writing to + the chip. Reset or initialize the hardware generator + in this function. The function is called with an + argument which let you distinguish between read + and write operations. + + calculate_ecc + This function is called after read / write from / to + the chip. Transfer the ECC from the hardware to + the buffer. If the option NAND_HWECC_SYNDROME is set + then the function is only called on write. See below. + + correct_data + In case of an ECC error this function is called for + error detection and correction. Return 1 respectively 2 + in case the error can be corrected. If the error is + not correctable return -1. If your hardware generator + matches the default algorithm of the nand_ecc software + generator then use the correction function provided + by nand_ecc instead of implementing duplicated code. + + + + + + Hardware ECC with syndrome calculation + + Many hardware ECC implementations provide Reed-Solomon + codes and calculate an error syndrome on read. The syndrome + must be converted to a standard Reed-Solomon syndrome + before calling the error correction code in the generic + Reed-Solomon library. + + + The ECC bytes must be placed immidiately after the data + bytes in order to make the syndrome generator work. This + is contrary to the usual layout used by software ECC. The + seperation of data and out of band area is not longer + possible. The nand driver code handles this layout and + the remaining free bytes in the oob area are managed by + the autoplacement code. Provide a matching oob-layout + in this case. See rts_from4.c and diskonchip.c for + implementation reference. In those cases we must also + use bad block tables on FLASH, because the ECC layout is + interferring with the bad block marker positions. + See bad block table support for details. + + + + + Bad block table support + + Most NAND chips mark the bad blocks at a defined + position in the spare area. Those blocks must + not be erased under any circumstances as the bad + block information would be lost. + It is possible to check the bad block mark each + time when the blocks are accessed by reading the + spare area of the first page in the block. This + is time consuming so a bad block table is used. + + + The nand driver supports various types of bad block + tables. + + Per device + The bad block table contains all bad block information + of the device which can consist of multiple chips. + + Per chip + A bad block table is used per chip and contains the + bad block information for this particular chip. + + Fixed offset + The bad block table is located at a fixed offset + in the chip (device). This applies to various + DiskOnChip devices. + + Automatic placed + The bad block table is automatically placed and + detected either at the end or at the beginning + of a chip (device) + + Mirrored tables + The bad block table is mirrored on the chip (device) to + allow updates of the bad block table without data loss. + + + + + nand_scan() calls the function nand_default_bbt(). + nand_default_bbt() selects appropriate default + bad block table desriptors depending on the chip information + which was retrieved by nand_scan(). + + + The standard policy is scanning the device for bad + blocks and build a ram based bad block table which + allows faster access than always checking the + bad block information on the flash chip itself. + + + Flash based tables + + It may be desired or neccecary to keep a bad block table in FLASH. + For AG-AND chips this is mandatory, as they have no factory marked + bad blocks. They have factory marked good blocks. The marker pattern + is erased when the block is erased to be reused. So in case of + powerloss before writing the pattern back to the chip this block + would be lost and added to the bad blocks. Therefor we scan the + chip(s) when we detect them the first time for good blocks and + store this information in a bad block table before erasing any + of the blocks. + + + The blocks in which the tables are stored are procteted against + accidental access by marking them bad in the memory bad block + table. The bad block table managment functions are allowed + to circumvernt this protection. + + + The simplest way to activate the FLASH based bad block table support + is to set the option NAND_USE_FLASH_BBT in the option field of + the nand chip structure before calling nand_scan(). For AG-AND + chips is this done by default. + This activates the default FLASH based bad block table functionality + of the NAND driver. The default bad block table options are + + Store bad block table per chip + Use 2 bits per block + Automatic placement at the end of the chip + Use mirrored tables with version numbers + Reserve 4 blocks at the end of the chip + + + + + User defined tables + + User defined tables are created by filling out a + nand_bbt_descr structure and storing the pointer in the + nand_chip structure member bbt_td before calling nand_scan(). + If a mirror table is neccecary a second structure must be + created and a pointer to this structure must be stored + in bbt_md inside the nand_chip structure. If the bbt_md + member is set to NULL then only the main table is used + and no scan for the mirrored table is performed. + + + The most important field in the nand_bbt_descr structure + is the options field. The options define most of the + table properties. Use the predefined constants from + nand.h to define the options. + + Number of bits per block + The supported number of bits is 1, 2, 4, 8. + Table per chip + Setting the constant NAND_BBT_PERCHIP selects that + a bad block table is managed for each chip in a chip array. + If this option is not set then a per device bad block table + is used. + Table location is absolute + Use the option constant NAND_BBT_ABSPAGE and + define the absolute page number where the bad block + table starts in the field pages. If you have selected bad block + tables per chip and you have a multi chip array then the start page + must be given for each chip in the chip array. Note: there is no scan + for a table ident pattern performed, so the fields + pattern, veroffs, offs, len can be left uninitialized + Table location is automatically detected + The table can either be located in the first or the last good + blocks of the chip (device). Set NAND_BBT_LASTBLOCK to place + the bad block table at the end of the chip (device). The + bad block tables are marked and identified by a pattern which + is stored in the spare area of the first page in the block which + holds the bad block table. Store a pointer to the pattern + in the pattern field. Further the length of the pattern has to be + stored in len and the offset in the spare area must be given + in the offs member of the nand_bbt_descr stucture. For mirrored + bad block tables different patterns are mandatory. + Table creation + Set the option NAND_BBT_CREATE to enable the table creation + if no table can be found during the scan. Usually this is done only + once if a new chip is found. + Table write support + Set the option NAND_BBT_WRITE to enable the table write support. + This allows the update of the bad block table(s) in case a block has + to be marked bad due to wear. The MTD interface function block_markbad + is calling the update function of the bad block table. If the write + support is enabled then the table is updated on FLASH. + + Note: Write support should only be enabled for mirrored tables with + version control. + + Table version control + Set the option NAND_BBT_VERSION to enable the table version control. + It's highly recommended to enable this for mirrored tables with write + support. It makes sure that the risk of loosing the bad block + table information is reduced to the loss of the information about the + one worn out block which should be marked bad. The version is stored in + 4 consecutive bytes in the spare area of the device. The position of + the version number is defined by the member veroffs in the bad block table + descriptor. + Save block contents on write + + In case that the block which holds the bad block table does contain + other useful information, set the option NAND_BBT_SAVECONTENT. When + the bad block table is written then the whole block is read the bad + block table is updated and the block is erased and everything is + written back. If this option is not set only the bad block table + is written and everything else in the block is ignored and erased. + + Number of reserved blocks + + For automatic placement some blocks must be reserved for + bad block table storage. The number of reserved blocks is defined + in the maxblocks member of the babd block table description structure. + Reserving 4 blocks for mirrored tables should be a reasonable number. + This also limits the number of blocks which are scanned for the bad + block table ident pattern. + + + + + + + Spare area (auto)placement + + The nand driver implements different possibilities for + placement of filesystem data in the spare area, + + Placement defined by fs driver + Automatic placement + + The default placement function is automatic placement. The + nand driver has built in default placement schemes for the + various chiptypes. If due to hardware ECC functionality the + default placement does not fit then the board driver can + provide a own placement scheme. + + + File system drivers can provide a own placement scheme which + is used instead of the default placement scheme. + + + Placement schemes are defined by a nand_oobinfo structure + +struct nand_oobinfo { + int useecc; + int eccbytes; + int eccpos[24]; + int oobfree[8][2]; +}; + + + useecc + The useecc member controls the ecc and placement function. The header + file include/mtd/mtd-abi.h contains constants to select ecc and + placement. MTD_NANDECC_OFF switches off the ecc complete. This is + not recommended and available for testing and diagnosis only. + MTD_NANDECC_PLACE selects caller defined placement, MTD_NANDECC_AUTOPLACE + selects automatic placement. + + eccbytes + The eccbytes member defines the number of ecc bytes per page. + + eccpos + The eccpos array holds the byte offsets in the spare area where + the ecc codes are placed. + + oobfree + The oobfree array defines the areas in the spare area which can be + used for automatic placement. The information is given in the format + {offset, size}. offset defines the start of the usable area, size the + length in bytes. More than one area can be defined. The list is terminated + by an {0, 0} entry. + + + + + Placement defined by fs driver + + The calling function provides a pointer to a nand_oobinfo + structure which defines the ecc placement. For writes the + caller must provide a spare area buffer along with the + data buffer. The spare area buffer size is (number of pages) * + (size of spare area). For reads the buffer size is + (number of pages) * ((size of spare area) + (number of ecc + steps per page) * sizeof (int)). The driver stores the + result of the ecc check for each tuple in the spare buffer. + The storage sequence is + + + <spare data page 0><ecc result 0>...<ecc result n> + + + ... + + + <spare data page n><ecc result 0>...<ecc result n> + + + This is a legacy mode used by YAFFS1. + + + If the spare area buffer is NULL then only the ECC placement is + done according to the given scheme in the nand_oobinfo structure. + + + + Automatic placement + + Automatic placement uses the built in defaults to place the + ecc bytes in the spare area. If filesystem data have to be stored / + read into the spare area then the calling function must provide a + buffer. The buffer size per page is determined by the oobfree array in + the nand_oobinfo structure. + + + If the spare area buffer is NULL then only the ECC placement is + done according to the default builtin scheme. + + + + User space placement selection + + All non ecc functions like mtd->read and mtd->write use an internal + structure, which can be set by an ioctl. This structure is preset + to the autoplacement default. + + ioctl (fd, MEMSETOOBSEL, oobsel); + + oobsel is a pointer to a user supplied structure of type + nand_oobconfig. The contents of this structure must match the + criteria of the filesystem, which will be used. See an example in utils/nandwrite.c. + + + + + Spare area autoplacement default schemes + + 256 byte pagesize + + +Offset +Content +Comment + + +0x00 +ECC byte 0 +Error correction code byte 0 + + +0x01 +ECC byte 1 +Error correction code byte 1 + + +0x02 +ECC byte 2 +Error correction code byte 2 + + +0x03 +Autoplace 0 + + + +0x04 +Autoplace 1 + + + +0x05 +Bad block marker +If any bit in this byte is zero, then this block is bad. +This applies only to the first page in a block. In the remaining +pages this byte is reserved + + +0x06 +Autoplace 2 + + + +0x07 +Autoplace 3 + + + + + + 512 byte pagesize + + +Offset +Content +Comment + + +0x00 +ECC byte 0 +Error correction code byte 0 of the lower 256 Byte data in +this page + + +0x01 +ECC byte 1 +Error correction code byte 1 of the lower 256 Bytes of data +in this page + + +0x02 +ECC byte 2 +Error correction code byte 2 of the lower 256 Bytes of data +in this page + + +0x03 +ECC byte 3 +Error correction code byte 0 of the upper 256 Bytes of data +in this page + + +0x04 +reserved +reserved + + +0x05 +Bad block marker +If any bit in this byte is zero, then this block is bad. +This applies only to the first page in a block. In the remaining +pages this byte is reserved + + +0x06 +ECC byte 4 +Error correction code byte 1 of the upper 256 Bytes of data +in this page + + +0x07 +ECC byte 5 +Error correction code byte 2 of the upper 256 Bytes of data +in this page + + +0x08 - 0x0F +Autoplace 0 - 7 + + + + + + 2048 byte pagesize + + +Offset +Content +Comment + + +0x00 +Bad block marker +If any bit in this byte is zero, then this block is bad. +This applies only to the first page in a block. In the remaining +pages this byte is reserved + + +0x01 +Reserved +Reserved + + +0x02-0x27 +Autoplace 0 - 37 + + + +0x28 +ECC byte 0 +Error correction code byte 0 of the first 256 Byte data in +this page + + +0x29 +ECC byte 1 +Error correction code byte 1 of the first 256 Bytes of data +in this page + + +0x2A +ECC byte 2 +Error correction code byte 2 of the first 256 Bytes data in +this page + + +0x2B +ECC byte 3 +Error correction code byte 0 of the second 256 Bytes of data +in this page + + +0x2C +ECC byte 4 +Error correction code byte 1 of the second 256 Bytes of data +in this page + + +0x2D +ECC byte 5 +Error correction code byte 2 of the second 256 Bytes of data +in this page + + +0x2E +ECC byte 6 +Error correction code byte 0 of the third 256 Bytes of data +in this page + + +0x2F +ECC byte 7 +Error correction code byte 1 of the third 256 Bytes of data +in this page + + +0x30 +ECC byte 8 +Error correction code byte 2 of the third 256 Bytes of data +in this page + + +0x31 +ECC byte 9 +Error correction code byte 0 of the fourth 256 Bytes of data +in this page + + +0x32 +ECC byte 10 +Error correction code byte 1 of the fourth 256 Bytes of data +in this page + + +0x33 +ECC byte 11 +Error correction code byte 2 of the fourth 256 Bytes of data +in this page + + +0x34 +ECC byte 12 +Error correction code byte 0 of the fifth 256 Bytes of data +in this page + + +0x35 +ECC byte 13 +Error correction code byte 1 of the fifth 256 Bytes of data +in this page + + +0x36 +ECC byte 14 +Error correction code byte 2 of the fifth 256 Bytes of data +in this page + + +0x37 +ECC byte 15 +Error correction code byte 0 of the sixt 256 Bytes of data +in this page + + +0x38 +ECC byte 16 +Error correction code byte 1 of the sixt 256 Bytes of data +in this page + + +0x39 +ECC byte 17 +Error correction code byte 2 of the sixt 256 Bytes of data +in this page + + +0x3A +ECC byte 18 +Error correction code byte 0 of the seventh 256 Bytes of +data in this page + + +0x3B +ECC byte 19 +Error correction code byte 1 of the seventh 256 Bytes of +data in this page + + +0x3C +ECC byte 20 +Error correction code byte 2 of the seventh 256 Bytes of +data in this page + + +0x3D +ECC byte 21 +Error correction code byte 0 of the eigth 256 Bytes of data +in this page + + +0x3E +ECC byte 22 +Error correction code byte 1 of the eigth 256 Bytes of data +in this page + + +0x3F +ECC byte 23 +Error correction code byte 2 of the eigth 256 Bytes of data +in this page + + + + + + + + Filesystem support + + The NAND driver provides all neccecary functions for a + filesystem via the MTD interface. + + + Filesystems must be aware of the NAND pecularities and + restrictions. One major restrictions of NAND Flash is, that you cannot + write as often as you want to a page. The consecutive writes to a page, + before erasing it again, are restricted to 1-3 writes, depending on the + manufacturers specifications. This applies similar to the spare area. + + + Therefor NAND aware filesystems must either write in page size chunks + or hold a writebuffer to collect smaller writes until they sum up to + pagesize. Available NAND aware filesystems: JFFS2, YAFFS. + + + The spare area usage to store filesystem data is controlled by + the spare area placement functionality which is described in one + of the earlier chapters. + + + + Tools + + The MTD project provides a couple of helpful tools to handle NAND Flash. + + flasherase, flasheraseall: Erase and format FLASH partitions + nandwrite: write filesystem images to NAND FLASH + nanddump: dump the contents of a NAND FLASH partitions + + + + These tools are aware of the NAND restrictions. Please use those tools + instead of complaining about errors which are caused by non NAND aware + access methods. + + + + + Constants + + This chapter describes the constants which might be relevant for a driver developer. + + + Chip option constants + + Constants for chip id table + + These constants are defined in nand.h. They are ored together to describe + the chip functionality. + +/* Chip can not auto increment pages */ +#define NAND_NO_AUTOINCR 0x00000001 +/* Buswitdh is 16 bit */ +#define NAND_BUSWIDTH_16 0x00000002 +/* Device supports partial programming without padding */ +#define NAND_NO_PADDING 0x00000004 +/* Chip has cache program function */ +#define NAND_CACHEPRG 0x00000008 +/* Chip has copy back function */ +#define NAND_COPYBACK 0x00000010 +/* AND Chip which has 4 banks and a confusing page / block + * assignment. See Renesas datasheet for further information */ +#define NAND_IS_AND 0x00000020 +/* Chip has a array of 4 pages which can be read without + * additional ready /busy waits */ +#define NAND_4PAGE_ARRAY 0x00000040 + + + + + Constants for runtime options + + These constants are defined in nand.h. They are ored together to describe + the functionality. + +/* Use a flash based bad block table. This option is parsed by the + * default bad block table function (nand_default_bbt). */ +#define NAND_USE_FLASH_BBT 0x00010000 +/* The hw ecc generator provides a syndrome instead a ecc value on read + * This can only work if we have the ecc bytes directly behind the + * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */ +#define NAND_HWECC_SYNDROME 0x00020000 + + + + + + + ECC selection constants + + Use these constants to select the ECC algorithm. + +/* No ECC. Usage is not recommended ! */ +#define NAND_ECC_NONE 0 +/* Software ECC 3 byte ECC per 256 Byte data */ +#define NAND_ECC_SOFT 1 +/* Hardware ECC 3 byte ECC per 256 Byte data */ +#define NAND_ECC_HW3_256 2 +/* Hardware ECC 3 byte ECC per 512 Byte data */ +#define NAND_ECC_HW3_512 3 +/* Hardware ECC 6 byte ECC per 512 Byte data */ +#define NAND_ECC_HW6_512 4 +/* Hardware ECC 6 byte ECC per 512 Byte data */ +#define NAND_ECC_HW8_512 6 + + + + + + Hardware control related constants + + These constants describe the requested hardware access function when + the boardspecific hardware control function is called + +/* Select the chip by setting nCE to low */ +#define NAND_CTL_SETNCE 1 +/* Deselect the chip by setting nCE to high */ +#define NAND_CTL_CLRNCE 2 +/* Select the command latch by setting CLE to high */ +#define NAND_CTL_SETCLE 3 +/* Deselect the command latch by setting CLE to low */ +#define NAND_CTL_CLRCLE 4 +/* Select the address latch by setting ALE to high */ +#define NAND_CTL_SETALE 5 +/* Deselect the address latch by setting ALE to low */ +#define NAND_CTL_CLRALE 6 +/* Set write protection by setting WP to high. Not used! */ +#define NAND_CTL_SETWP 7 +/* Clear write protection by setting WP to low. Not used! */ +#define NAND_CTL_CLRWP 8 + + + + + + Bad block table related constants + + These constants describe the options used for bad block + table descriptors. + +/* Options for the bad block table descriptors */ + +/* The number of bits used per block in the bbt on the device */ +#define NAND_BBT_NRBITS_MSK 0x0000000F +#define NAND_BBT_1BIT 0x00000001 +#define NAND_BBT_2BIT 0x00000002 +#define NAND_BBT_4BIT 0x00000004 +#define NAND_BBT_8BIT 0x00000008 +/* The bad block table is in the last good block of the device */ +#define NAND_BBT_LASTBLOCK 0x00000010 +/* The bbt is at the given page, else we must scan for the bbt */ +#define NAND_BBT_ABSPAGE 0x00000020 +/* The bbt is at the given page, else we must scan for the bbt */ +#define NAND_BBT_SEARCH 0x00000040 +/* bbt is stored per chip on multichip devices */ +#define NAND_BBT_PERCHIP 0x00000080 +/* bbt has a version counter at offset veroffs */ +#define NAND_BBT_VERSION 0x00000100 +/* Create a bbt if none axists */ +#define NAND_BBT_CREATE 0x00000200 +/* Search good / bad pattern through all pages of a block */ +#define NAND_BBT_SCANALLPAGES 0x00000400 +/* Scan block empty during good / bad block scan */ +#define NAND_BBT_SCANEMPTY 0x00000800 +/* Write bbt if neccecary */ +#define NAND_BBT_WRITE 0x00001000 +/* Read and write back block contents when writing bbt */ +#define NAND_BBT_SAVECONTENT 0x00002000 + + + + + + + + Structures + + This chapter contains the autogenerated documentation of the structures which are + used in the NAND driver and might be relevant for a driver developer. Each + struct member has a short description which is marked with an [XXX] identifier. + See the chapter "Documentation hints" for an explanation. + +!Iinclude/linux/mtd/nand.h + + + + Public Functions Provided + + This chapter contains the autogenerated documentation of the NAND kernel API functions + which are exported. Each function has a short description which is marked with an [XXX] identifier. + See the chapter "Documentation hints" for an explanation. + +!Edrivers/mtd/nand/nand_base.c +!Edrivers/mtd/nand/nand_bbt.c +!Edrivers/mtd/nand/nand_ecc.c + + + + Internal Functions Provided + + This chapter contains the autogenerated documentation of the NAND driver internal functions. + Each function has a short description which is marked with an [XXX] identifier. + See the chapter "Documentation hints" for an explanation. + The functions marked with [DEFAULT] might be relevant for a board driver developer. + +!Idrivers/mtd/nand/nand_base.c +!Idrivers/mtd/nand/nand_bbt.c +!Idrivers/mtd/nand/nand_ecc.c + + + + Credits + + The following people have contributed to the NAND driver: + + Steven J. Hillsjhill@realitydiluted.com + David Woodhousedwmw2@infradead.org + Thomas Gleixnertglx@linutronix.de + + A lot of users have provided bugfixes, improvements and helping hands for testing. + Thanks a lot. + + + The following people have contributed to this document: + + Thomas Gleixnertglx@linutronix.de + + + +
diff --git a/Documentation/DocBook/procfs-guide.tmpl b/Documentation/DocBook/procfs-guide.tmpl new file mode 100644 index 000000000000..45cad23efefa --- /dev/null +++ b/Documentation/DocBook/procfs-guide.tmpl @@ -0,0 +1,591 @@ + + +]> + + + + Linux Kernel Procfs Guide + + + + Erik + (J.A.K.) + Mouw + + Delft University of Technology + Faculty of Information Technology and Systems +
+ J.A.K.Mouw@its.tudelft.nl + PO BOX 5031 + 2600 GA + Delft + The Netherlands +
+
+
+
+ + + + 1.0  + May 30, 2001 + Initial revision posted to linux-kernel + + + 1.1  + June 3, 2001 + Revised after comments from linux-kernel + + + + + 2001 + Erik Mouw + + + + + + This documentation is free software; you can redistribute it + and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This documentation is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + PURPOSE. See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + + + + + + + + + Preface + + + This guide describes the use of the procfs file system from + within the Linux kernel. The idea to write this guide came up on + the #kernelnewbies IRC channel (see http://www.kernelnewbies.org/), + when Jeff Garzik explained the use of procfs and forwarded me a + message Alexander Viro wrote to the linux-kernel mailing list. I + agreed to write it up nicely, so here it is. + + + + I'd like to thank Jeff Garzik + jgarzik@pobox.com and Alexander Viro + viro@parcelfarce.linux.theplanet.co.uk for their input, + Tim Waugh twaugh@redhat.com for his Selfdocbook, + and Marc Joosen marcj@historia.et.tudelft.nl for + proofreading. + + + + This documentation was written while working on the LART + computing board (http://www.lart.tudelft.nl/), + which is sponsored by the Mobile Multi-media Communications + (http://www.mmc.tudelft.nl/) + and Ubiquitous Communications (http://www.ubicom.tudelft.nl/) + projects. + + + + Erik + + + + + + + + Introduction + + + The /proc file system + (procfs) is a special file system in the linux kernel. It's a + virtual file system: it is not associated with a block device + but exists only in memory. The files in the procfs are there to + allow userland programs access to certain information from the + kernel (like process information in /proc/[0-9]+/), but also for debug + purposes (like /proc/ksyms). + + + + This guide describes the use of the procfs file system from + within the Linux kernel. It starts by introducing all relevant + functions to manage the files within the file system. After that + it shows how to communicate with userland, and some tips and + tricks will be pointed out. Finally a complete example will be + shown. + + + + Note that the files in /proc/sys are sysctl files: they + don't belong to procfs and are governed by a completely + different API described in the Kernel API book. + + + + + + + + Managing procfs entries + + + This chapter describes the functions that various kernel + components use to populate the procfs with files, symlinks, + device nodes, and directories. + + + + A minor note before we start: if you want to use any of the + procfs functions, be sure to include the correct header file! + This should be one of the first lines in your code: + + + +#include <linux/proc_fs.h> + + + + + + + Creating a regular file + + + + struct proc_dir_entry* create_proc_entry + const char* name + mode_t mode + struct proc_dir_entry* parent + + + + + This function creates a regular file with the name + name, file mode + mode in the directory + parent. To create a file in the root of + the procfs, use NULL as + parent parameter. When successful, the + function will return a pointer to the freshly created + struct proc_dir_entry; otherwise it + will return NULL. describes how to do something useful with + regular files. + + + + Note that it is specifically supported that you can pass a + path that spans multiple directories. For example + create_proc_entry("drivers/via0/info") + will create the via0 + directory if necessary, with standard + 0755 permissions. + + + + If you only want to be able to read the file, the function + create_proc_read_entry described in may be used to create and initialise + the procfs entry in one single call. + + + + + + + + Creating a symlink + + + + struct proc_dir_entry* + proc_symlink const + char* name + struct proc_dir_entry* + parent const + char* dest + + + + + This creates a symlink in the procfs directory + parent that points from + name to + dest. This translates in userland to + ln -s dest + name. + + + + + Creating a directory + + + + struct proc_dir_entry* proc_mkdir + const char* name + struct proc_dir_entry* parent + + + + + Create a directory name in the procfs + directory parent. + + + + + + + + Removing an entry + + + + void remove_proc_entry + const char* name + struct proc_dir_entry* parent + + + + + Removes the entry name in the directory + parent from the procfs. Entries are + removed by their name, not by the + struct proc_dir_entry returned by the + various create functions. Note that this function doesn't + recursively remove entries. + + + + Be sure to free the data entry from + the struct proc_dir_entry before + remove_proc_entry is called (that is: if + there was some data allocated, of + course). See for more information + on using the data entry. + + + + + + + + + Communicating with userland + + + Instead of reading (or writing) information directly from + kernel memory, procfs works with call back + functions for files: functions that are called when + a specific file is being read or written. Such functions have + to be initialised after the procfs file is created by setting + the read_proc and/or + write_proc fields in the + struct proc_dir_entry* that the + function create_proc_entry returned: + + + +struct proc_dir_entry* entry; + +entry->read_proc = read_proc_foo; +entry->write_proc = write_proc_foo; + + + + If you only want to use a the + read_proc, the function + create_proc_read_entry described in may be used to create and initialise the + procfs entry in one single call. + + + + + + Reading data + + + The read function is a call back function that allows userland + processes to read data from the kernel. The read function + should have the following format: + + + + + int read_func + char* page + char** start + off_t off + int count + int* eof + void* data + + + + + The read function should write its information into the + page. For proper use, the function + should start writing at an offset of + off in page and + write at most count bytes, but because + most read functions are quite simple and only return a small + amount of information, these two parameters are usually + ignored (it breaks pagers like more and + less, but cat still + works). + + + + If the off and + count parameters are properly used, + eof should be used to signal that the + end of the file has been reached by writing + 1 to the memory location + eof points to. + + + + The parameter start doesn't seem to be + used anywhere in the kernel. The data + parameter can be used to create a single call back function for + several files, see . + + + + The read_func function must return the + number of bytes written into the page. + + + + shows how to use a read call back + function. + + + + + + + + Writing data + + + The write call back function allows a userland process to write + data to the kernel, so it has some kind of control over the + kernel. The write function should have the following format: + + + + + int write_func + struct file* file + const char* buffer + unsigned long count + void* data + + + + + The write function should read count + bytes at maximum from the buffer. Note + that the buffer doesn't live in the + kernel's memory space, so it should first be copied to kernel + space with copy_from_user. The + file parameter is usually + ignored. shows how to use the + data parameter. + + + + Again, shows how to use this call back + function. + + + + + + + + A single call back for many files + + + When a large number of almost identical files is used, it's + quite inconvenient to use a separate call back function for + each file. A better approach is to have a single call back + function that distinguishes between the files by using the + data field in struct + proc_dir_entry. First of all, the + data field has to be initialised: + + + +struct proc_dir_entry* entry; +struct my_file_data *file_data; + +file_data = kmalloc(sizeof(struct my_file_data), GFP_KERNEL); +entry->data = file_data; + + + + The data field is a void + *, so it can be initialised with anything. + + + + Now that the data field is set, the + read_proc and + write_proc can use it to distinguish + between files because they get it passed into their + data parameter: + + + +int foo_read_func(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + if(data == file_data) { + /* special case for this file */ + } else { + /* normal processing */ + } + + return len; +} + + + + Be sure to free the data data field + when removing the procfs entry. + + + + + + + + + Tips and tricks + + + + + + Convenience functions + + + + struct proc_dir_entry* create_proc_read_entry + const char* name + mode_t mode + struct proc_dir_entry* parent + read_proc_t* read_proc + void* data + + + + + This function creates a regular file in exactly the same way + as create_proc_entry from does, but also allows to set the read + function read_proc in one call. This + function can set the data as well, like + explained in . + + + + + + + Modules + + + If procfs is being used from within a module, be sure to set + the owner field in the + struct proc_dir_entry to + THIS_MODULE. + + + +struct proc_dir_entry* entry; + +entry->owner = THIS_MODULE; + + + + + + + + Mode and ownership + + + Sometimes it is useful to change the mode and/or ownership of + a procfs entry. Here is an example that shows how to achieve + that: + + + +struct proc_dir_entry* entry; + +entry->mode = S_IWUSR |S_IRUSR | S_IRGRP | S_IROTH; +entry->uid = 0; +entry->gid = 100; + + + + + + + + + + Example + + + +&procfsexample; + + +
diff --git a/Documentation/DocBook/procfs_example.c b/Documentation/DocBook/procfs_example.c new file mode 100644 index 000000000000..7064084c1c5e --- /dev/null +++ b/Documentation/DocBook/procfs_example.c @@ -0,0 +1,224 @@ +/* + * procfs_example.c: an example proc interface + * + * Copyright (C) 2001, Erik Mouw (J.A.K.Mouw@its.tudelft.nl) + * + * This file accompanies the procfs-guide in the Linux kernel + * source. Its main use is to demonstrate the concepts and + * functions described in the guide. + * + * This software has been developed while working on the LART + * computing board (http://www.lart.tudelft.nl/), which is + * sponsored by the Mobile Multi-media Communications + * (http://www.mmc.tudelft.nl/) and Ubiquitous Communications + * (http://www.ubicom.tudelft.nl/) projects. + * + * The author can be reached at: + * + * Erik Mouw + * Information and Communication Theory Group + * Faculty of Information Technology and Systems + * Delft University of Technology + * P.O. Box 5031 + * 2600 GA Delft + * The Netherlands + * + * + * This program is free software; you can redistribute + * it and/or modify it under the terms of the GNU General + * Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place, + * Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include +#include + + +#define MODULE_VERS "1.0" +#define MODULE_NAME "procfs_example" + +#define FOOBAR_LEN 8 + +struct fb_data_t { + char name[FOOBAR_LEN + 1]; + char value[FOOBAR_LEN + 1]; +}; + + +static struct proc_dir_entry *example_dir, *foo_file, + *bar_file, *jiffies_file, *symlink; + + +struct fb_data_t foo_data, bar_data; + + +static int proc_read_jiffies(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = sprintf(page, "jiffies = %ld\n", + jiffies); + + return len; +} + + +static int proc_read_foobar(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + struct fb_data_t *fb_data = (struct fb_data_t *)data; + + /* DON'T DO THAT - buffer overruns are bad */ + len = sprintf(page, "%s = '%s'\n", + fb_data->name, fb_data->value); + + return len; +} + + +static int proc_write_foobar(struct file *file, + const char *buffer, + unsigned long count, + void *data) +{ + int len; + struct fb_data_t *fb_data = (struct fb_data_t *)data; + + if(count > FOOBAR_LEN) + len = FOOBAR_LEN; + else + len = count; + + if(copy_from_user(fb_data->value, buffer, len)) + return -EFAULT; + + fb_data->value[len] = '\0'; + + return len; +} + + +static int __init init_procfs_example(void) +{ + int rv = 0; + + /* create directory */ + example_dir = proc_mkdir(MODULE_NAME, NULL); + if(example_dir == NULL) { + rv = -ENOMEM; + goto out; + } + + example_dir->owner = THIS_MODULE; + + /* create jiffies using convenience function */ + jiffies_file = create_proc_read_entry("jiffies", + 0444, example_dir, + proc_read_jiffies, + NULL); + if(jiffies_file == NULL) { + rv = -ENOMEM; + goto no_jiffies; + } + + jiffies_file->owner = THIS_MODULE; + + /* create foo and bar files using same callback + * functions + */ + foo_file = create_proc_entry("foo", 0644, example_dir); + if(foo_file == NULL) { + rv = -ENOMEM; + goto no_foo; + } + + strcpy(foo_data.name, "foo"); + strcpy(foo_data.value, "foo"); + foo_file->data = &foo_data; + foo_file->read_proc = proc_read_foobar; + foo_file->write_proc = proc_write_foobar; + foo_file->owner = THIS_MODULE; + + bar_file = create_proc_entry("bar", 0644, example_dir); + if(bar_file == NULL) { + rv = -ENOMEM; + goto no_bar; + } + + strcpy(bar_data.name, "bar"); + strcpy(bar_data.value, "bar"); + bar_file->data = &bar_data; + bar_file->read_proc = proc_read_foobar; + bar_file->write_proc = proc_write_foobar; + bar_file->owner = THIS_MODULE; + + /* create symlink */ + symlink = proc_symlink("jiffies_too", example_dir, + "jiffies"); + if(symlink == NULL) { + rv = -ENOMEM; + goto no_symlink; + } + + symlink->owner = THIS_MODULE; + + /* everything OK */ + printk(KERN_INFO "%s %s initialised\n", + MODULE_NAME, MODULE_VERS); + return 0; + +no_symlink: + remove_proc_entry("tty", example_dir); +no_tty: + remove_proc_entry("bar", example_dir); +no_bar: + remove_proc_entry("foo", example_dir); +no_foo: + remove_proc_entry("jiffies", example_dir); +no_jiffies: + remove_proc_entry(MODULE_NAME, NULL); +out: + return rv; +} + + +static void __exit cleanup_procfs_example(void) +{ + remove_proc_entry("jiffies_too", example_dir); + remove_proc_entry("tty", example_dir); + remove_proc_entry("bar", example_dir); + remove_proc_entry("foo", example_dir); + remove_proc_entry("jiffies", example_dir); + remove_proc_entry(MODULE_NAME, NULL); + + printk(KERN_INFO "%s %s removed\n", + MODULE_NAME, MODULE_VERS); +} + + +module_init(init_procfs_example); +module_exit(cleanup_procfs_example); + +MODULE_AUTHOR("Erik Mouw"); +MODULE_DESCRIPTION("procfs examples"); diff --git a/Documentation/DocBook/scsidrivers.tmpl b/Documentation/DocBook/scsidrivers.tmpl new file mode 100644 index 000000000000..d058e65daf19 --- /dev/null +++ b/Documentation/DocBook/scsidrivers.tmpl @@ -0,0 +1,193 @@ + + + + + + SCSI Subsystem Interfaces + + + + Douglas + Gilbert + +
+ dgilbert@interlog.com +
+
+
+
+ 2003-08-11 + + + 2002 + 2003 + Douglas Gilbert + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + + +
+ + + + + Introduction + +This document outlines the interface between the Linux scsi mid level +and lower level drivers. Lower level drivers are variously called HBA +(host bus adapter) drivers, host drivers (HD) or pseudo adapter drivers. +The latter alludes to the fact that a lower level driver may be a +bridge to another IO subsystem (and the "ide-scsi" driver is an example +of this). There can be many lower level drivers active in a running +system, but only one per hardware type. For example, the aic7xxx driver +controls adaptec controllers based on the 7xxx chip series. Most lower +level drivers can control one or more scsi hosts (a.k.a. scsi initiators). + + +This document can been found in an ASCII text file in the linux kernel +source: Documentation/scsi/scsi_mid_low_api.txt . +It currently hold a little more information than this document. The +drivers/scsi/hosts.h and +drivers/scsi/scsi.h headers contain descriptions of members +of important structures for the scsi subsystem. + + + + + Driver structure + +Traditionally a lower level driver for the scsi subsystem has been +at least two files in the drivers/scsi directory. For example, a +driver called "xyz" has a header file "xyz.h" and a source file +"xyz.c". [Actually there is no good reason why this couldn't all +be in one file.] Some drivers that have been ported to several operating +systems (e.g. aic7xxx which has separate files for generic and +OS-specific code) have more than two files. Such drivers tend to have +their own directory under the drivers/scsi directory. + + +scsi_module.c is normally included at the end of a lower +level driver. For it to work a declaration like this is needed before +it is included: + + static Scsi_Host_Template driver_template = DRIVER_TEMPLATE; + /* DRIVER_TEMPLATE should contain pointers to supported interface + functions. Scsi_Host_Template is defined hosts.h */ + #include "scsi_module.c" + + + +The scsi_module.c assumes the name "driver_template" is appropriately +defined. It contains 2 functions: + + + init_this_scsi_driver() called during builtin and module driver + initialization: invokes mid level's scsi_register_host() + + + exit_this_scsi_driver() called during closedown: invokes + mid level's scsi_unregister_host() + + + + +When a new, lower level driver is being added to Linux, the following +files (all found in the drivers/scsi directory) will need some attention: +Makefile, Config.help and Config.in . It is probably best to look at what +an existing lower level driver does in this regard. + + + + + Interface Functions +!EDocumentation/scsi/scsi_mid_low_api.txt + + + + Locks + +Each Scsi_Host instance has a spin_lock called Scsi_Host::default_lock +which is initialized in scsi_register() [found in hosts.c]. Within the +same function the Scsi_Host::host_lock pointer is initialized to point +at default_lock with the scsi_assign_lock() function. Thereafter +lock and unlock operations performed by the mid level use the +Scsi_Host::host_lock pointer. + + +Lower level drivers can override the use of Scsi_Host::default_lock by +using scsi_assign_lock(). The earliest opportunity to do this would +be in the detect() function after it has invoked scsi_register(). It +could be replaced by a coarser grain lock (e.g. per driver) or a +lock of equal granularity (i.e. per host). Using finer grain locks +(e.g. per scsi device) may be possible by juggling locks in +queuecommand(). + + + + + Changes since lk 2.4 series + +io_request_lock has been replaced by several finer grained locks. The lock +relevant to lower level drivers is Scsi_Host::host_lock and there is one +per scsi host. + + +The older error handling mechanism has been removed. This means the +lower level interface functions abort() and reset() have been removed. + + +In the 2.4 series the scsi subsystem configuration descriptions were +aggregated with the configuration descriptions from all other Linux +subsystems in the Documentation/Configure.help file. In the 2.5 series, +the scsi subsystem now has its own (much smaller) drivers/scsi/Config.help +file. + + + + + Credits + +The following people have contributed to this document: + + +Mike Anderson andmike@us.ibm.com + + +James Bottomley James.Bottomley@steeleye.com + + +Patrick Mansfield patmans@us.ibm.com + + + + + +
diff --git a/Documentation/DocBook/sis900.tmpl b/Documentation/DocBook/sis900.tmpl new file mode 100644 index 000000000000..6c2cbac93c3f --- /dev/null +++ b/Documentation/DocBook/sis900.tmpl @@ -0,0 +1,585 @@ + + + + + + + +SiS 900/7016 Fast Ethernet Device Driver + + + +Ollie +Lho + + + +Lei Chun +Chang + + + +Document Revision: 0.3 for SiS900 driver v1.06 & v1.07 +November 16, 2000 + + + 1999 + Silicon Integrated System Corp. + + + + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + + + + +This document gives some information on installation and usage of SiS 900/7016 +device driver under Linux. + + + + + + + + + Introduction + + +This document describes the revision 1.06 and 1.07 of SiS 900/7016 Fast Ethernet +device driver under Linux. The driver is developed by Silicon Integrated +System Corp. and distributed freely under the GNU General Public License (GPL). +The driver can be compiled as a loadable module and used under Linux kernel +version 2.2.x. (rev. 1.06) +With minimal changes, the driver can also be used under 2.3.x and 2.4.x kernel +(rev. 1.07), please see +. If you are intended to +use the driver for earlier kernels, you are on your own. + + + +The driver is tested with usual TCP/IP applications including +FTP, Telnet, Netscape etc. and is used constantly by the developers. + + + +Please send all comments/fixes/questions to +Lei-Chun Chang. + + + + + Changes + + +Changes made in Revision 1.07 + + + + +Separation of sis900.c and sis900.h in order to move most +constant definition to sis900.h (many of those constants were +corrected) + + + + + +Clean up PCI detection, the pci-scan from Donald Becker were not used, +just simple pci_find_*. + + + + + +MII detection is modified to support multiple mii transceiver. + + + + + +Bugs in read_eeprom, mdio_* were removed. + + + + + +Lot of sis900 irrelevant comments were removed/changed and +more comments were added to reflect the real situation. + + + + + +Clean up of physical/virtual address space mess in buffer +descriptors. + + + + + +Better transmit/receive error handling. + + + + + +The driver now uses zero-copy single buffer management +scheme to improve performance. + + + + + +Names of variables were changed to be more consistent. + + + + + +Clean up of auo-negotiation and timer code. + + + + + +Automatic detection and change of PHY on the fly. + + + + + +Bug in mac probing fixed. + + + + + +Fix 630E equalier problem by modifying the equalizer workaround rule. + + + + + +Support for ICS1893 10/100 Interated PHYceiver. + + + + + +Support for media select by ifconfig. + + + + + +Added kernel-doc extratable documentation. + + + + + + + + + Tested Environment + + +This driver is developed on the following hardware + + + + + +Intel Celeron 500 with SiS 630 (rev 02) chipset + + + + + +SiS 900 (rev 01) and SiS 7016/7014 Fast Ethernet Card + + + + + +and tested with these software environments + + + + + +Red Hat Linux version 6.2 + + + + + +Linux kernel version 2.4.0 + + + + + +Netscape version 4.6 + + + + + +NcFTP 3.0.0 beta 18 + + + + + +Samba version 2.0.3 + + + + + + + + + + +Files in This Package + + +In the package you can find these files: + + + + + + +sis900.c + + +Driver source file in C + + + + + +sis900.h + + +Header file for sis900.c + + + + + +sis900.sgml + + +DocBook SGML source of the document + + + + + +sis900.txt + + +Driver document in plain text + + + + + + + + + + Installation + + +Silicon Integrated System Corp. is cooperating closely with core Linux Kernel +developers. The revisions of SiS 900 driver are distributed by the usuall channels +for kernel tar files and patches. Those kernel tar files for official kernel and +patches for kernel pre-release can be download at +official kernel ftp site +and its mirrors. +The 1.06 revision can be found in kernel version later than 2.3.15 and pre-2.2.14, +and 1.07 revision can be found in kernel version 2.4.0. +If you have no prior experience in networking under Linux, please read +Ethernet HOWTO and +Networking HOWTO available from +Linux Documentation Project (LDP). + + + +The driver is bundled in release later than 2.2.11 and 2.3.15 so this +is the most easy case. +Be sure you have the appropriate packages for compiling kernel source. +Those packages are listed in Document/Changes in kernel source +distribution. If you have to install the driver other than those bundled +in kernel release, you should have your driver file +sis900.c and sis900.h +copied into /usr/src/linux/drivers/net/ first. +There are two alternative ways to install the driver + + + +Building the driver as loadable module + + +To build the driver as a loadable kernel module you have to reconfigure +the kernel to activate network support by + + + +make menuconfig + + + +Choose Loadable module support --->, +then select Enable loadable module support. + + + +Choose Network Device Support --->, select +Ethernet (10 or 100Mbit). +Then select EISA, VLB, PCI and on board controllers, +and choose SiS 900/7016 PCI Fast Ethernet Adapter support +to M. + + + +After reconfiguring the kernel, you can make the driver module by + + + +make modules + + + +The driver should be compiled with no errors. After compiling the driver, +the driver can be installed to proper place by + + + +make modules_install + + + +Load the driver into kernel by + + + +insmod sis900 + + + +When loading the driver into memory, some information message can be view by + + + + +dmesg + + +or + + +cat /var/log/message + + + + +If the driver is loaded properly you will have messages similar to this: + + + +sis900.c: v1.07.06 11/07/2000 +eth0: SiS 900 PCI Fast Ethernet at 0xd000, IRQ 10, 00:00:e8:83:7f:a4. +eth0: SiS 900 Internal MII PHY transceiver found at address 1. +eth0: Using SiS 900 Internal MII PHY as default + + + +showing the version of the driver and the results of probing routine. + + + +Once the driver is loaded, network can be brought up by + + + +/sbin/ifconfig eth0 IPADDR broadcast BROADCAST netmask NETMASK media TYPE + + + +where IPADDR, BROADCAST, NETMASK are your IP address, broadcast address and +netmask respectively. TYPE is used to set medium type used by the device. +Typical values are "10baseT"(twisted-pair 10Mbps Ethernet) or "100baseT" +(twisted-pair 100Mbps Ethernet). For more information on how to configure +network interface, please refer to +Networking HOWTO. + + + +The link status is also shown by kernel messages. For example, after the +network interface is activated, you may have the message: + + + +eth0: Media Link On 100mbps full-duplex + + + +If you try to unplug the twist pair (TP) cable you will get + + + +eth0: Media Link Off + + + +indicating that the link is failed. + + + + +Building the driver into kernel + + +If you want to make the driver into kernel, choose Y +rather than M on +SiS 900/7016 PCI Fast Ethernet Adapter support +when configuring the kernel. Build the kernel image in the usual way + + + +make clean + +make bzlilo + + + +Next time the system reboot, you have the driver in memory. + + + + + + + Known Problems and Bugs + + +There are some known problems and bugs. If you find any other bugs please +mail to lcchang@sis.com.tw + + + + + +AM79C901 HomePNA PHY is not thoroughly tested, there may be some +bugs in the on the fly change of transceiver. + + + + + +A bug is hidden somewhere in the receive buffer management code, +the bug causes NULL pointer reference in the kernel. This fault is +caught before bad things happen and reported with the message: + + +eth0: NULL pointer encountered in Rx ring, skipping + + +which can be viewed with dmesg or +cat /var/log/message. + + + + + +The media type change from 10Mbps to 100Mbps twisted-pair ethernet +by ifconfig causes the media link down. + + + + + + + + + Revision History + + + + + + +November 13, 2000, Revision 1.07, seventh release, 630E problem fixed +and further clean up. + + + + + +November 4, 1999, Revision 1.06, Second release, lots of clean up +and optimization. + + + + + +August 8, 1999, Revision 1.05, Initial Public Release + + + + + + + + + Acknowledgements + + +This driver was originally derived form +Donald Becker's +pci-skeleton and +rtl8139 drivers. Donald also provided various suggestion +regarded with improvements made in revision 1.06. + + + +The 1.05 revision was created by +Jim Huang, AMD 79c901 +support was added by Chin-Shan Li. + + + + +List of Functions +!Idrivers/net/sis900.c + + + diff --git a/Documentation/DocBook/tulip-user.tmpl b/Documentation/DocBook/tulip-user.tmpl new file mode 100644 index 000000000000..6520d7a1b132 --- /dev/null +++ b/Documentation/DocBook/tulip-user.tmpl @@ -0,0 +1,327 @@ + + + + + + Tulip Driver User's Guide + + + + Jeff + Garzik + +
+ jgarzik@pobox.com +
+
+
+
+ + + 2001 + Jeff Garzik + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + + Introduction + +The Tulip Ethernet Card Driver +is maintained by Jeff Garzik (jgarzik@pobox.com). + + + +The Tulip driver was developed by Donald Becker and changed by +Jeff Garzik, Takashi Manabe and a cast of thousands. + + + +For 2.4.x and later kernels, the Linux Tulip driver is available at +http://sourceforge.net/projects/tulip/ + + + + This driver is for the Digital "Tulip" Ethernet adapter interface. + It should work with most DEC 21*4*-based chips/ethercards, as well as + with work-alike chips from Lite-On (PNIC) and Macronix (MXIC) and ASIX. + + + + The original author may be reached as becker@scyld.com, or C/O + Scyld Computing Corporation, + 410 Severn Ave., Suite 210, + Annapolis MD 21403 + + + + Additional information on Donald Becker's tulip.c + is available at http://www.scyld.com/network/tulip.html + + + + + + Driver Compatibility + + +This device driver is designed for the DECchip "Tulip", Digital's +single-chip ethernet controllers for PCI (now owned by Intel). +Supported members of the family +are the 21040, 21041, 21140, 21140A, 21142, and 21143. Similar work-alike +chips from Lite-On, Macronics, ASIX, Compex and other listed below are also +supported. + + + +These chips are used on at least 140 unique PCI board designs. The great +number of chips and board designs supported is the reason for the +driver size and complexity. Almost of the increasing complexity is in the +board configuration and media selection code. There is very little +increasing in the operational critical path length. + + + + + Board-specific Settings + + +PCI bus devices are configured by the system at boot time, so no jumpers +need to be set on the board. The system BIOS preferably should assign the +PCI INTA signal to an otherwise unused system IRQ line. + + + +Some boards have EEPROMs tables with default media entry. The factory default +is usually "autoselect". This should only be overridden when using +transceiver connections without link beat e.g. 10base2 or AUI, or (rarely!) +for forcing full-duplex when used with old link partners that do not do +autonegotiation. + + + + + Driver Operation + +Ring buffers + + +The Tulip can use either ring buffers or lists of Tx and Rx descriptors. +This driver uses statically allocated rings of Rx and Tx descriptors, set at +compile time by RX/TX_RING_SIZE. This version of the driver allocates skbuffs +for the Rx ring buffers at open() time and passes the skb->data field to the +Tulip as receive data buffers. When an incoming frame is less than +RX_COPYBREAK bytes long, a fresh skbuff is allocated and the frame is +copied to the new skbuff. When the incoming frame is larger, the skbuff is +passed directly up the protocol stack and replaced by a newly allocated +skbuff. + + + +The RX_COPYBREAK value is chosen to trade-off the memory wasted by +using a full-sized skbuff for small frames vs. the copying costs of larger +frames. For small frames the copying cost is negligible (esp. considering +that we are pre-loading the cache with immediately useful header +information). For large frames the copying cost is non-trivial, and the +larger copy might flush the cache of useful data. A subtle aspect of this +choice is that the Tulip only receives into longword aligned buffers, thus +the IP header at offset 14 isn't longword aligned for further processing. +Copied frames are put into the new skbuff at an offset of "+2", thus copying +has the beneficial effect of aligning the IP header and preloading the +cache. + + + + +Synchronization + +The driver runs as two independent, single-threaded flows of control. One +is the send-packet routine, which enforces single-threaded use by the +dev->tbusy flag. The other thread is the interrupt handler, which is single +threaded by the hardware and other software. + + + +The send packet thread has partial control over the Tx ring and 'dev->tbusy' +flag. It sets the tbusy flag whenever it's queuing a Tx packet. If the next +queue slot is empty, it clears the tbusy flag when finished otherwise it sets +the 'tp->tx_full' flag. + + + +The interrupt handler has exclusive control over the Rx ring and records stats +from the Tx ring. (The Tx-done interrupt can't be selectively turned off, so +we can't avoid the interrupt overhead by having the Tx routine reap the Tx +stats.) After reaping the stats, it marks the queue entry as empty by setting +the 'base' to zero. Iff the 'tp->tx_full' flag is set, it clears both the +tx_full and tbusy flags. + + + + + + + + Errata + + +The old DEC databooks were light on details. +The 21040 databook claims that CSR13, CSR14, and CSR15 should each be the last +register of the set CSR12-15 written. Hmmm, now how is that possible? + + + +The DEC SROM format is very badly designed not precisely defined, leading to +part of the media selection junkheap below. Some boards do not have EEPROM +media tables and need to be patched up. Worse, other boards use the DEC +design kit media table when it isn't correct for their board. + + + +We cannot use MII interrupts because there is no defined GPIO pin to attach +them. The MII transceiver status is polled using an kernel timer. + + + + + Driver Change History + + Version 0.9.14 (February 20, 2001) + + Fix PNIC problems (Manfred Spraul) + Add new PCI id for Accton comet + Support Davicom tulips + Fix oops in eeprom parsing + Enable workarounds for early PCI chipsets + IA64, hppa csr0 support + Support media types 5, 6 + Interpret a bit more of the 21142 SROM extended media type 3 + Add missing delay in eeprom reading + + + + Version 0.9.11 (November 3, 2000) + + Eliminate extra bus accesses when sharing interrupts (prumpf) + Barrier following ownership descriptor bit flip (prumpf) + Endianness fixes for >14 addresses in setup frames (prumpf) + Report link beat to kernel/userspace via netif_carrier_*. (kuznet) + Better spinlocking in set_rx_mode. + Fix I/O resource request failure error messages (DaveM catch) + Handle DMA allocation failure. + + + + Version 0.9.10 (September 6, 2000) + + Simple interrupt mitigation (via jamal) + More PCI ids + + + + Version 0.9.9 (August 11, 2000) + + More PCI ids + + + + Version 0.9.8 (July 13, 2000) + + Correct signed/unsigned comparison for dummy frame index + Remove outdated references to struct enet_statistics + + + + Version 0.9.7 (June 17, 2000) + + Timer cleanups (Andrew Morton) + Alpha compile fix (somebody?) + + + + Version 0.9.6 (May 31, 2000) + + Revert 21143-related support flag patch + Add HPPA/media-table debugging printk + + + + Version 0.9.5 (May 30, 2000) + + HPPA support (willy@puffingroup) + CSR6 bits and tulip.h cleanup (Chris Smith) + Improve debugging messages a bit + Add delay after CSR13 write in t21142_start_nway + Remove unused ETHER_STATS code + Convert 'extern inline' to 'static inline' in tulip.h (Chris Smith) + Update DS21143 support flags in tulip_chip_info[] + Use spin_lock_irq, not _irqsave/restore, in tulip_start_xmit() + Add locking to set_rx_mode() + Fix race with chip setting DescOwned bit (Hal Murray) + Request 100% of PIO and MMIO resource space assigned to card + Remove error message from pci_enable_device failure + + + + Version 0.9.4.3 (April 14, 2000) + + mod_timer fix (Hal Murray) + PNIC2 resuscitation (Chris Smith) + + + + Version 0.9.4.2 (March 21, 2000) + + Fix 21041 CSR7, CSR13/14/15 handling + Merge some PCI ids from tulip 0.91x + Merge some HAS_xxx flags and flag settings from tulip 0.91x + asm/io.h fix (submitted by many) and cleanup + s/HAS_NWAY143/HAS_NWAY/ + Cleanup 21041 mode reporting + Small code cleanups + + + + Version 0.9.4.1 (March 18, 2000) + + Finish PCI DMA conversion (davem) + Do not netif_start_queue() at end of tulip_tx_timeout() (kuznet) + PCI DMA fix (kuznet) + eeprom.c code cleanup + Remove Xircom Tulip crud + + + + +
diff --git a/Documentation/DocBook/usb.tmpl b/Documentation/DocBook/usb.tmpl new file mode 100644 index 000000000000..f3ef0bf435e9 --- /dev/null +++ b/Documentation/DocBook/usb.tmpl @@ -0,0 +1,979 @@ + + + + + + The Linux-USB Host Side API + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + + + + + + + Introduction to USB on Linux + + A Universal Serial Bus (USB) is used to connect a host, + such as a PC or workstation, to a number of peripheral + devices. USB uses a tree structure, with the host at the + root (the system's master), hubs as interior nodes, and + peripheral devices as leaves (and slaves). + Modern PCs support several such trees of USB devices, usually + one USB 2.0 tree (480 Mbit/sec each) with + a few USB 1.1 trees (12 Mbit/sec each) that are used when you + connect a USB 1.1 device directly to the machine's "root hub". + + + That master/slave asymmetry was designed in part for + ease of use. It is not physically possible to assemble + (legal) USB cables incorrectly: all upstream "to-the-host" + connectors are the rectangular type, matching the sockets on + root hubs, and the downstream type are the squarish type + (or they are built in to the peripheral). + Software doesn't need to deal with distributed autoconfiguration + since the pre-designated master node manages all that. + At the electrical level, bus protocol overhead is reduced by + eliminating arbitration and moving scheduling into host software. + + + USB 1.0 was announced in January 1996, and was revised + as USB 1.1 (with improvements in hub specification and + support for interrupt-out transfers) in September 1998. + USB 2.0 was released in April 2000, including high speed + transfers and transaction translating hubs (used for USB 1.1 + and 1.0 backward compatibility). + + + USB support was added to Linux early in the 2.2 kernel series + shortly before the 2.3 development forked off. Updates + from 2.3 were regularly folded back into 2.2 releases, bringing + new features such as /sbin/hotplug support, + more drivers, and more robustness. + The 2.5 kernel series continued such improvements, and also + worked on USB 2.0 support, + higher performance, + better consistency between host controller drivers, + API simplification (to make bugs less likely), + and providing internal "kerneldoc" documentation. + + + Linux can run inside USB devices as well as on + the hosts that control the devices. + Because the Linux 2.x USB support evolved to support mass market + platforms such as Apple Macintosh or PC-compatible systems, + it didn't address design concerns for those types of USB systems. + So it can't be used inside mass-market PDAs, or other peripherals. + USB device drivers running inside those Linux peripherals + don't do the same things as the ones running inside hosts, + and so they've been given a different name: + they're called gadget drivers. + This document does not present gadget drivers. + + + + + + USB Host-Side API Model + + Within the kernel, + host-side drivers for USB devices talk to the "usbcore" APIs. + There are two types of public "usbcore" APIs, targetted at two different + layers of USB driver. Those are + general purpose drivers, exposed through + driver frameworks such as block, character, or network devices; + and drivers that are part of the core, + which are involved in managing a USB bus. + Such core drivers include the hub driver, + which manages trees of USB devices, and several different kinds + of host controller driver (HCD), + which control individual busses. + + + The device model seen by USB drivers is relatively complex. + + + + + USB supports four kinds of data transfer + (control, bulk, interrupt, and isochronous). Two transfer + types use bandwidth as it's available (control and bulk), + while the other two types of transfer (interrupt and isochronous) + are scheduled to provide guaranteed bandwidth. + + + The device description model includes one or more + "configurations" per device, only one of which is active at a time. + Devices that are capable of high speed operation must also support + full speed configurations, along with a way to ask about the + "other speed" configurations that might be used. + + + Configurations have one or more "interface", each + of which may have "alternate settings". Interfaces may be + standardized by USB "Class" specifications, or may be specific to + a vendor or device. + + USB device drivers actually bind to interfaces, not devices. + Think of them as "interface drivers", though you + may not see many devices where the distinction is important. + Most USB devices are simple, with only one configuration, + one interface, and one alternate setting. + + + Interfaces have one or more "endpoints", each of + which supports one type and direction of data transfer such as + "bulk out" or "interrupt in". The entire configuration may have + up to sixteen endpoints in each direction, allocated as needed + among all the interfaces. + + + Data transfer on USB is packetized; each endpoint + has a maximum packet size. + Drivers must often be aware of conventions such as flagging the end + of bulk transfers using "short" (including zero length) packets. + + + The Linux USB API supports synchronous calls for + control and bulk messaging. + It also supports asynchnous calls for all kinds of data transfer, + using request structures called "URBs" (USB Request Blocks). + + + + + Accordingly, the USB Core API exposed to device drivers + covers quite a lot of territory. You'll probably need to consult + the USB 2.0 specification, available online from www.usb.org at + no cost, as well as class or device specifications. + + + The only host-side drivers that actually touch hardware + (reading/writing registers, handling IRQs, and so on) are the HCDs. + In theory, all HCDs provide the same functionality through the same + API. In practice, that's becoming more true on the 2.5 kernels, + but there are still differences that crop up especially with + fault handling. Different controllers don't necessarily report + the same aspects of failures, and recovery from faults (including + software-induced ones like unlinking an URB) isn't yet fully + consistent. + Device driver authors should make a point of doing disconnect + testing (while the device is active) with each different host + controller driver, to make sure drivers don't have bugs of + their own as well as to make sure they aren't relying on some + HCD-specific behavior. + (You will need external USB 1.1 and/or + USB 2.0 hubs to perform all those tests.) + + + + +USB-Standard Types + + In <linux/usb_ch9.h> you will find + the USB data types defined in chapter 9 of the USB specification. + These data types are used throughout USB, and in APIs including + this host side API, gadget APIs, and usbfs. + + +!Iinclude/linux/usb_ch9.h + + + +Host-Side Data Types and Macros + + The host side API exposes several layers to drivers, some of + which are more necessary than others. + These support lifecycle models for host side drivers + and devices, and support passing buffers through usbcore to + some HCD that performs the I/O for the device driver. + + + +!Iinclude/linux/usb.h + + + + USB Core APIs + + There are two basic I/O models in the USB API. + The most elemental one is asynchronous: drivers submit requests + in the form of an URB, and the URB's completion callback + handle the next step. + All USB transfer types support that model, although there + are special cases for control URBs (which always have setup + and status stages, but may not have a data stage) and + isochronous URBs (which allow large packets and include + per-packet fault reports). + Built on top of that is synchronous API support, where a + driver calls a routine that allocates one or more URBs, + submits them, and waits until they complete. + There are synchronous wrappers for single-buffer control + and bulk transfers (which are awkward to use in some + driver disconnect scenarios), and for scatterlist based + streaming i/o (bulk or interrupt). + + + USB drivers need to provide buffers that can be + used for DMA, although they don't necessarily need to + provide the DMA mapping themselves. + There are APIs to use used when allocating DMA buffers, + which can prevent use of bounce buffers on some systems. + In some cases, drivers may be able to rely on 64bit DMA + to eliminate another kind of bounce buffer. + + +!Edrivers/usb/core/urb.c +!Edrivers/usb/core/message.c +!Edrivers/usb/core/file.c +!Edrivers/usb/core/usb.c +!Edrivers/usb/core/hub.c + + + Host Controller APIs + + These APIs are only for use by host controller drivers, + most of which implement standard register interfaces such as + EHCI, OHCI, or UHCI. + UHCI was one of the first interfaces, designed by Intel and + also used by VIA; it doesn't do much in hardware. + OHCI was designed later, to have the hardware do more work + (bigger transfers, tracking protocol state, and so on). + EHCI was designed with USB 2.0; its design has features that + resemble OHCI (hardware does much more work) as well as + UHCI (some parts of ISO support, TD list processing). + + + There are host controllers other than the "big three", + although most PCI based controllers (and a few non-PCI based + ones) use one of those interfaces. + Not all host controllers use DMA; some use PIO, and there + is also a simulator. + + + The same basic APIs are available to drivers for all + those controllers. + For historical reasons they are in two layers: + struct usb_bus is a rather thin + layer that became available in the 2.2 kernels, while + struct usb_hcd is a more featureful + layer (available in later 2.4 kernels and in 2.5) that + lets HCDs share common code, to shrink driver size + and significantly reduce hcd-specific behaviors. + + +!Edrivers/usb/core/hcd.c +!Edrivers/usb/core/hcd-pci.c +!Edrivers/usb/core/buffer.c + + + + The USB Filesystem (usbfs) + + This chapter presents the Linux usbfs. + You may prefer to avoid writing new kernel code for your + USB driver; that's the problem that usbfs set out to solve. + User mode device drivers are usually packaged as applications + or libraries, and may use usbfs through some programming library + that wraps it. Such libraries include + libusb + for C/C++, and + jUSB for Java. + + + Unfinished + This particular documentation is incomplete, + especially with respect to the asynchronous mode. + As of kernel 2.5.66 the code and this (new) documentation + need to be cross-reviewed. + + + + Configure usbfs into Linux kernels by enabling the + USB filesystem option (CONFIG_USB_DEVICEFS), + and you get basic support for user mode USB device drivers. + Until relatively recently it was often (confusingly) called + usbdevfs although it wasn't solving what + devfs was. + Every USB device will appear in usbfs, regardless of whether or + not it has a kernel driver; but only devices with kernel drivers + show up in devfs. + + + + What files are in "usbfs"? + + Conventionally mounted at + /proc/bus/usb, usbfs + features include: + + /proc/bus/usb/devices + ... a text file + showing each of the USB devices on known to the kernel, + and their configuration descriptors. + You can also poll() this to learn about new devices. + + /proc/bus/usb/BBB/DDD + ... magic files + exposing the each device's configuration descriptors, and + supporting a series of ioctls for making device requests, + including I/O to devices. (Purely for access by programs.) + + + + + Each bus is given a number (BBB) based on when it was + enumerated; within each bus, each device is given a similar + number (DDD). + Those BBB/DDD paths are not "stable" identifiers; + expect them to change even if you always leave the devices + plugged in to the same hub port. + Don't even think of saving these in application + configuration files. + Stable identifiers are available, for user mode applications + that want to use them. HID and networking devices expose + these stable IDs, so that for example you can be sure that + you told the right UPS to power down its second server. + "usbfs" doesn't (yet) expose those IDs. + + + + + + Mounting and Access Control + + There are a number of mount options for usbfs, which will + be of most interest to you if you need to override the default + access control policy. + That policy is that only root may read or write device files + (/proc/bus/BBB/DDD) although anyone may read + the devices + or drivers files. + I/O requests to the device also need the CAP_SYS_RAWIO capability, + + + The significance of that is that by default, all user mode + device drivers need super-user privileges. + You can change modes or ownership in a driver setup + when the device hotplugs, or maye just start the + driver right then, as a privileged server (or some activity + within one). + That's the most secure approach for multi-user systems, + but for single user systems ("trusted" by that user) + it's more convenient just to grant everyone all access + (using the devmode=0666 option) + so the driver can start whenever it's needed. + + + The mount options for usbfs, usable in /etc/fstab or + in command line invocations of mount, are: + + + + busgid=NNNNN + Controls the GID used for the + /proc/bus/usb/BBB + directories. (Default: 0) + busmode=MMM + Controls the file mode used for the + /proc/bus/usb/BBB + directories. (Default: 0555) + + busuid=NNNNN + Controls the UID used for the + /proc/bus/usb/BBB + directories. (Default: 0) + + devgid=NNNNN + Controls the GID used for the + /proc/bus/usb/BBB/DDD + files. (Default: 0) + devmode=MMM + Controls the file mode used for the + /proc/bus/usb/BBB/DDD + files. (Default: 0644) + devuid=NNNNN + Controls the UID used for the + /proc/bus/usb/BBB/DDD + files. (Default: 0) + + listgid=NNNNN + Controls the GID used for the + /proc/bus/usb/devices and drivers files. + (Default: 0) + listmode=MMM + Controls the file mode used for the + /proc/bus/usb/devices and drivers files. + (Default: 0444) + listuid=NNNNN + Controls the UID used for the + /proc/bus/usb/devices and drivers files. + (Default: 0) + + + + + Note that many Linux distributions hard-wire the mount options + for usbfs in their init scripts, such as + /etc/rc.d/rc.sysinit, + rather than making it easy to set this per-system + policy in /etc/fstab. + + + + + + /proc/bus/usb/devices + + This file is handy for status viewing tools in user + mode, which can scan the text format and ignore most of it. + More detailed device status (including class and vendor + status) is available from device-specific files. + For information about the current format of this file, + see the + Documentation/usb/proc_usb_info.txt + file in your Linux kernel sources. + + + Otherwise the main use for this file from programs + is to poll() it to get notifications of usb devices + as they're plugged or unplugged. + To see what changed, you'd need to read the file and + compare "before" and "after" contents, scan the filesystem, + or see its hotplug event. + + + + + + /proc/bus/usb/BBB/DDD + + Use these files in one of these basic ways: + + + They can be read, + producing first the device descriptor + (18 bytes) and then the descriptors for the current configuration. + See the USB 2.0 spec for details about those binary data formats. + You'll need to convert most multibyte values from little endian + format to your native host byte order, although a few of the + fields in the device descriptor (both of the BCD-encoded fields, + and the vendor and product IDs) will be byteswapped for you. + Note that configuration descriptors include descriptors for + interfaces, altsettings, endpoints, and maybe additional + class descriptors. + + + Perform USB operations using + ioctl() requests to make endpoint I/O + requests (synchronously or asynchronously) or manage + the device. + These requests need the CAP_SYS_RAWIO capability, + as well as filesystem access permissions. + Only one ioctl request can be made on one of these + device files at a time. + This means that if you are synchronously reading an endpoint + from one thread, you won't be able to write to a different + endpoint from another thread until the read completes. + This works for half duplex protocols, + but otherwise you'd use asynchronous i/o requests. + + + + + + + Life Cycle of User Mode Drivers + + Such a driver first needs to find a device file + for a device it knows how to handle. + Maybe it was told about it because a + /sbin/hotplug event handling agent + chose that driver to handle the new device. + Or maybe it's an application that scans all the + /proc/bus/usb device files, and ignores most devices. + In either case, it should read() all + the descriptors from the device file, + and check them against what it knows how to handle. + It might just reject everything except a particular + vendor and product ID, or need a more complex policy. + + + Never assume there will only be one such device + on the system at a time! + If your code can't handle more than one device at + a time, at least detect when there's more than one, and + have your users choose which device to use. + + + Once your user mode driver knows what device to use, + it interacts with it in either of two styles. + The simple style is to make only control requests; some + devices don't need more complex interactions than those. + (An example might be software using vendor-specific control + requests for some initialization or configuration tasks, + with a kernel driver for the rest.) + + + More likely, you need a more complex style driver: + one using non-control endpoints, reading or writing data + and claiming exclusive use of an interface. + Bulk transfers are easiest to use, + but only their sibling interrupt transfers + work with low speed devices. + Both interrupt and isochronous transfers + offer service guarantees because their bandwidth is reserved. + Such "periodic" transfers are awkward to use through usbfs, + unless you're using the asynchronous calls. However, interrupt + transfers can also be used in a synchronous "one shot" style. + + + Your user-mode driver should never need to worry + about cleaning up request state when the device is + disconnected, although it should close its open file + descriptors as soon as it starts seeing the ENODEV + errors. + + + + + The ioctl() Requests + + To use these ioctls, you need to include the following + headers in your userspace program: +#include <linux/usb.h> +#include <linux/usbdevice_fs.h> +#include <asm/byteorder.h> + The standard USB device model requests, from "Chapter 9" of + the USB 2.0 specification, are automatically included from + the <linux/usb_ch9.h> header. + + + Unless noted otherwise, the ioctl requests + described here will + update the modification time on the usbfs file to which + they are applied (unless they fail). + A return of zero indicates success; otherwise, a + standard USB error code is returned. (These are + documented in + Documentation/usb/error-codes.txt + in your kernel sources.) + + + Each of these files multiplexes access to several + I/O streams, one per endpoint. + Each device has one control endpoint (endpoint zero) + which supports a limited RPC style RPC access. + Devices are configured + by khubd (in the kernel) setting a device-wide + configuration that affects things + like power consumption and basic functionality. + The endpoints are part of USB interfaces, + which may have altsettings + affecting things like which endpoints are available. + Many devices only have a single configuration and interface, + so drivers for them will ignore configurations and altsettings. + + + + + Management/Status Requests + + A number of usbfs requests don't deal very directly + with device I/O. + They mostly relate to device management and status. + These are all synchronous requests. + + + + + USBDEVFS_CLAIMINTERFACE + This is used to force usbfs to + claim a specific interface, + which has not previously been claimed by usbfs or any other + kernel driver. + The ioctl parameter is an integer holding the number of + the interface (bInterfaceNumber from descriptor). + + Note that if your driver doesn't claim an interface + before trying to use one of its endpoints, and no + other driver has bound to it, then the interface is + automatically claimed by usbfs. + + This claim will be released by a RELEASEINTERFACE ioctl, + or by closing the file descriptor. + File modification time is not updated by this request. + + + USBDEVFS_CONNECTINFO + Says whether the device is lowspeed. + The ioctl parameter points to a structure like this: +struct usbdevfs_connectinfo { + unsigned int devnum; + unsigned char slow; +}; + File modification time is not updated by this request. + + You can't tell whether a "not slow" + device is connected at high speed (480 MBit/sec) + or just full speed (12 MBit/sec). + You should know the devnum value already, + it's the DDD value of the device file name. + + + USBDEVFS_GETDRIVER + Returns the name of the kernel driver + bound to a given interface (a string). Parameter + is a pointer to this structure, which is modified: +struct usbdevfs_getdriver { + unsigned int interface; + char driver[USBDEVFS_MAXDRIVERNAME + 1]; +}; + File modification time is not updated by this request. + + + USBDEVFS_IOCTL + Passes a request from userspace through + to a kernel driver that has an ioctl entry in the + struct usb_driver it registered. +struct usbdevfs_ioctl { + int ifno; + int ioctl_code; + void *data; +}; + +/* user mode call looks like this. + * 'request' becomes the driver->ioctl() 'code' parameter. + * the size of 'param' is encoded in 'request', and that data + * is copied to or from the driver->ioctl() 'buf' parameter. + */ +static int +usbdev_ioctl (int fd, int ifno, unsigned request, void *param) +{ + struct usbdevfs_ioctl wrapper; + + wrapper.ifno = ifno; + wrapper.ioctl_code = request; + wrapper.data = param; + + return ioctl (fd, USBDEVFS_IOCTL, &wrapper); +} + File modification time is not updated by this request. + + This request lets kernel drivers talk to user mode code + through filesystem operations even when they don't create + a charactor or block special device. + It's also been used to do things like ask devices what + device special file should be used. + Two pre-defined ioctls are used + to disconnect and reconnect kernel drivers, so + that user mode code can completely manage binding + and configuration of devices. + + + USBDEVFS_RELEASEINTERFACE + This is used to release the claim usbfs + made on interface, either implicitly or because of a + USBDEVFS_CLAIMINTERFACE call, before the file + descriptor is closed. + The ioctl parameter is an integer holding the number of + the interface (bInterfaceNumber from descriptor); + File modification time is not updated by this request. + + No security check is made to ensure + that the task which made the claim is the one + which is releasing it. + This means that user mode driver may interfere + other ones. + + + USBDEVFS_RESETEP + Resets the data toggle value for an endpoint + (bulk or interrupt) to DATA0. + The ioctl parameter is an integer endpoint number + (1 to 15, as identified in the endpoint descriptor), + with USB_DIR_IN added if the device's endpoint sends + data to the host. + + Avoid using this request. + It should probably be removed. + Using it typically means the device and driver will lose + toggle synchronization. If you really lost synchronization, + you likely need to completely handshake with the device, + using a request like CLEAR_HALT + or SET_INTERFACE. + + + + + + + + Synchronous I/O Support + + Synchronous requests involve the kernel blocking + until until the user mode request completes, either by + finishing successfully or by reporting an error. + In most cases this is the simplest way to use usbfs, + although as noted above it does prevent performing I/O + to more than one endpoint at a time. + + + + + USBDEVFS_BULK + Issues a bulk read or write request to the + device. + The ioctl parameter is a pointer to this structure: +struct usbdevfs_bulktransfer { + unsigned int ep; + unsigned int len; + unsigned int timeout; /* in milliseconds */ + void *data; +}; + The "ep" value identifies a + bulk endpoint number (1 to 15, as identified in an endpoint + descriptor), + masked with USB_DIR_IN when referring to an endpoint which + sends data to the host from the device. + The length of the data buffer is identified by "len"; + Recent kernels support requests up to about 128KBytes. + FIXME say how read length is returned, + and how short reads are handled.. + + + USBDEVFS_CLEAR_HALT + Clears endpoint halt (stall) and + resets the endpoint toggle. This is only + meaningful for bulk or interrupt endpoints. + The ioctl parameter is an integer endpoint number + (1 to 15, as identified in an endpoint descriptor), + masked with USB_DIR_IN when referring to an endpoint which + sends data to the host from the device. + + Use this on bulk or interrupt endpoints which have + stalled, returning -EPIPE status + to a data transfer request. + Do not issue the control request directly, since + that could invalidate the host's record of the + data toggle. + + + USBDEVFS_CONTROL + Issues a control request to the device. + The ioctl parameter points to a structure like this: +struct usbdevfs_ctrltransfer { + __u8 bRequestType; + __u8 bRequest; + __u16 wValue; + __u16 wIndex; + __u16 wLength; + __u32 timeout; /* in milliseconds */ + void *data; +}; + + The first eight bytes of this structure are the contents + of the SETUP packet to be sent to the device; see the + USB 2.0 specification for details. + The bRequestType value is composed by combining a + USB_TYPE_* value, a USB_DIR_* value, and a + USB_RECIP_* value (from + <linux/usb.h>). + If wLength is nonzero, it describes the length of the data + buffer, which is either written to the device + (USB_DIR_OUT) or read from the device (USB_DIR_IN). + + At this writing, you can't transfer more than 4 KBytes + of data to or from a device; usbfs has a limit, and + some host controller drivers have a limit. + (That's not usually a problem.) + Also there's no way to say it's + not OK to get a short read back from the device. + + + USBDEVFS_RESET + Does a USB level device reset. + The ioctl parameter is ignored. + After the reset, this rebinds all device interfaces. + File modification time is not updated by this request. + + Avoid using this call + until some usbcore bugs get fixed, + since it does not fully synchronize device, interface, + and driver (not just usbfs) state. + + + USBDEVFS_SETINTERFACE + Sets the alternate setting for an + interface. The ioctl parameter is a pointer to a + structure like this: +struct usbdevfs_setinterface { + unsigned int interface; + unsigned int altsetting; +}; + File modification time is not updated by this request. + + Those struct members are from some interface descriptor + applying to the the current configuration. + The interface number is the bInterfaceNumber value, and + the altsetting number is the bAlternateSetting value. + (This resets each endpoint in the interface.) + + + USBDEVFS_SETCONFIGURATION + Issues the + usb_set_configuration call + for the device. + The parameter is an integer holding the number of + a configuration (bConfigurationValue from descriptor). + File modification time is not updated by this request. + + Avoid using this call + until some usbcore bugs get fixed, + since it does not fully synchronize device, interface, + and driver (not just usbfs) state. + + + + + + + Asynchronous I/O Support + + As mentioned above, there are situations where it may be + important to initiate concurrent operations from user mode code. + This is particularly important for periodic transfers + (interrupt and isochronous), but it can be used for other + kinds of USB requests too. + In such cases, the asynchronous requests described here + are essential. Rather than submitting one request and having + the kernel block until it completes, the blocking is separate. + + + These requests are packaged into a structure that + resembles the URB used by kernel device drivers. + (No POSIX Async I/O support here, sorry.) + It identifies the endpoint type (USBDEVFS_URB_TYPE_*), + endpoint (number, masked with USB_DIR_IN as appropriate), + buffer and length, and a user "context" value serving to + uniquely identify each request. + (It's usually a pointer to per-request data.) + Flags can modify requests (not as many as supported for + kernel drivers). + + + Each request can specify a realtime signal number + (between SIGRTMIN and SIGRTMAX, inclusive) to request a + signal be sent when the request completes. + + + When usbfs returns these urbs, the status value + is updated, and the buffer may have been modified. + Except for isochronous transfers, the actual_length is + updated to say how many bytes were transferred; if the + USBDEVFS_URB_DISABLE_SPD flag is set + ("short packets are not OK"), if fewer bytes were read + than were requested then you get an error report. + + +struct usbdevfs_iso_packet_desc { + unsigned int length; + unsigned int actual_length; + unsigned int status; +}; + +struct usbdevfs_urb { + unsigned char type; + unsigned char endpoint; + int status; + unsigned int flags; + void *buffer; + int buffer_length; + int actual_length; + int start_frame; + int number_of_packets; + int error_count; + unsigned int signr; + void *usercontext; + struct usbdevfs_iso_packet_desc iso_frame_desc[]; +}; + + For these asynchronous requests, the file modification + time reflects when the request was initiated. + This contrasts with their use with the synchronous requests, + where it reflects when requests complete. + + + + + USBDEVFS_DISCARDURB + + TBS + File modification time is not updated by this request. + + + + USBDEVFS_DISCSIGNAL + + TBS + File modification time is not updated by this request. + + + + USBDEVFS_REAPURB + + TBS + File modification time is not updated by this request. + + + + USBDEVFS_REAPURBNDELAY + + TBS + File modification time is not updated by this request. + + + + USBDEVFS_SUBMITURB + + TBS + + + + + + + + + + + + diff --git a/Documentation/DocBook/via-audio.tmpl b/Documentation/DocBook/via-audio.tmpl new file mode 100644 index 000000000000..36e642147d6b --- /dev/null +++ b/Documentation/DocBook/via-audio.tmpl @@ -0,0 +1,597 @@ + + + + + + Via 686 Audio Driver for Linux + + + + Jeff + Garzik + + + + + 1999-2001 + Jeff Garzik + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + + + + + + + Introduction + + The Via VT82C686A "super southbridge" chips contain + AC97-compatible audio logic which features dual 16-bit stereo + PCM sound channels (full duplex), plus a third PCM channel intended for use + in hardware-assisted FM synthesis. + + + The current Linux kernel audio driver for this family of chips + supports audio playback and recording, but hardware-assisted + FM features, and hardware buffer direct-access (mmap) + support are not yet available. + + + This driver supports any Linux kernel version after 2.4.10. + + + Please send bug reports to the mailing list linux-via@gtf.org. + To subscribe, e-mail majordomo@gtf.org with + + + subscribe linux-via + + + in the body of the message. + + + + + Driver Installation + + To use this audio driver, select the + CONFIG_SOUND_VIA82CXXX option in the section Sound during kernel configuration. + Follow the usual kernel procedures for rebuilding the kernel, + or building and installing driver modules. + + + To make this driver the default audio driver, you can add the + following to your /etc/conf.modules file: + + + alias sound via82cxxx_audio + + + Note that soundcore and ac97_codec support modules + are also required for working audio, in addition to + the via82cxxx_audio module itself. + + + + + Submitting a bug report + Description of problem + + Describe the application you were using to play/record sound, and how + to reproduce the problem. + + + Diagnostic output + + Obtain the via-audio-diag diagnostics program from + http://sf.net/projects/gkernel/ and provide a dump of the + audio chip's registers while the problem is occurring. Sample command line: + + + ./via-audio-diag -aps > diag-output.txt + + + Driver debug output + + Define VIA_DEBUG at the beginning of the driver, then capture and email + the kernel log output. This can be viewed in the system kernel log (if + enabled), or via the dmesg program. Sample command line: + + + dmesg > /tmp/dmesg-output.txt + + + Bigger kernel message buffer + + If you wish to increase the size of the buffer displayed by dmesg, then + change the LOG_BUF_LEN macro at the top of linux/kernel/printk.c, recompile + your kernel, and pass the LOG_BUF_LEN value to dmesg. Sample command line with + LOG_BUF_LEN == 32768: + + + dmesg -s 32768 > /tmp/dmesg-output.txt + + + + + + Known Bugs And Assumptions + + + Low volume + + + Volume too low on many systems. Workaround: use mixer program + such as xmixer to increase volume. + + + + + + + + + + Thanks + + Via for providing e-mail support, specs, and NDA'd source code. + + + MandrakeSoft for providing hacking time. + + + AC97 mixer interface fixes and debugging by Ron Cemer roncemer@gte.net. + + + Rui Sousa rui.sousa@conexant.com, for bugfixing + MMAP support, and several other notable fixes that resulted from + his hard work and testing. + + + Adrian Cox adrian@humboldt.co.uk, for bugfixing + MMAP support, and several other notable fixes that resulted from + his hard work and testing. + + + Thomas Sailer for further bugfixes. + + + + + Random Notes + + Two /proc pseudo-files provide diagnostic information. This is generally + not useful to most users. Power users can disable CONFIG_SOUND_VIA82CXXX_PROCFS, + and remove the /proc support code. Once + version 2.0.0 is released, the /proc support code will be disabled by + default. Available /proc pseudo-files: + + + /proc/driver/via/0/info + /proc/driver/via/0/ac97 + + + This driver by default supports all PCI audio devices which report + a vendor id of 0x1106, and a device id of 0x3058. Subsystem vendor + and device ids are not examined. + + + GNU indent formatting options: + +-kr -i8 -ts8 -br -ce -bap -sob -l80 -pcs -cs -ss -bs -di1 -nbc -lp -psl + + + + Via has graciously donated e-mail support and source code to help further + the development of this driver. Their assistance has been invaluable + in the design and coding of the next major version of this driver. + + + The Via audio chip apparently provides a second PCM scatter-gather + DMA channel just for FM data, but does not have a full hardware MIDI + processor. I haven't put much thought towards a solution here, but it + might involve using SoftOSS midi wave table, or simply disabling MIDI + support altogether and using the FM PCM channel as a second (input? output?) + + + + + Driver ChangeLog + + +Version 1.9.1 + + + + + DSP read/write bugfixes from Thomas Sailer. + + + + + + Add new PCI id for single-channel use of Via 8233. + + + + + + Other bug fixes, tweaks, new ioctls. + + + + + + + +Version 1.1.15 + + + + + Support for variable fragment size and variable fragment number (Rui + Sousa) + + + + + + Fixes for the SPEED, STEREO, CHANNELS, FMT ioctls when in read & + write mode (Rui Sousa) + + + + + + Mmaped sound is now fully functional. (Rui Sousa) + + + + + + Make sure to enable PCI device before reading any of its PCI + config information. (fixes potential hotplug problems) + + + + + + Clean up code a bit and add more internal function documentation. + + + + + + AC97 codec access fixes (Adrian Cox) + + + + + + Big endian fixes (Adrian Cox) + + + + + + MIDI support (Adrian Cox) + + + + + + Detect and report locked-rate AC97 codecs. If your hardware only + supports 48Khz (locked rate), then your recording/playback software + must upsample or downsample accordingly. The hardware cannot do it. + + + + + + Use new pci_request_regions and pci_disable_device functions in + kernel 2.4.6. + + + + + + + +Version 1.1.14 + + + + + Use VM_RESERVE when available, to eliminate unnecessary page faults. + + + + + + +Version 1.1.12 + + + + + mmap bug fixes from Linus. + + + + + + +Version 1.1.11 + + + + + Many more bug fixes. mmap enabled by default, but may still be buggy. + + + + + + Uses new and spiffy method of mmap'ing the DMA buffer, based + on a suggestion from Linus. + + + + + + +Version 1.1.10 + + + + + Many bug fixes. mmap enabled by default, but may still be buggy. + + + + + + +Version 1.1.9 + + + + + Redesign and rewrite audio playback implementation. (faster and smaller, hopefully) + + + + + + Implement recording and full duplex (DSP_CAP_DUPLEX) support. + + + + + + Make procfs support optional. + + + + + + Quick interrupt status check, to lessen overhead in interrupt + sharing situations. + + + + + + Add mmap(2) support. Disabled for now, it is still buggy and experimental. + + + + + + Surround all syscalls with a semaphore for cheap and easy SMP protection. + + + + + + Fix bug in channel shutdown (hardware channel reset) code. + + + + + + Remove unnecessary spinlocks (better performance). + + + + + + Eliminate "unknown AFMT" message by using a different method + of selecting the best AFMT_xxx sound sample format for use. + + + + + + Support for realtime hardware pointer position reporting + (DSP_CAP_REALTIME, SNDCTL_DSP_GETxPTR ioctls) + + + + + + Support for capture/playback triggering + (DSP_CAP_TRIGGER, SNDCTL_DSP_SETTRIGGER ioctls) + + + + + + SNDCTL_DSP_SETDUPLEX and SNDCTL_DSP_POST ioctls now handled. + + + + + + Rewrite open(2) and close(2) logic to allow only one user at + a time. All other open(2) attempts will sleep until they succeed. + FIXME: open(O_RDONLY) and open(O_WRONLY) should be allowed to succeed. + + + + + + Reviewed code to ensure that SMP and multiple audio devices + are fully supported. + + + + + + + +Version 1.1.8 + + + + + Clean up interrupt handler output. Fixes the following kernel error message: + + + unhandled interrupt ... + + + + + + Convert documentation to DocBook, so that PDF, HTML and PostScript (.ps) output is readily + available. + + + + + + + +Version 1.1.7 + + + + + Fix module unload bug where mixer device left registered + after driver exit + + + + + + +Version 1.1.6 + + + + + Rewrite via_set_rate to mimic ALSA basic AC97 rate setting + + + + + Remove much dead code + + + + + Complete spin_lock_irqsave -> spin_lock_irq conversion in via_dsp_ioctl + + + + + Fix build problem in via_dsp_ioctl + + + + + Optimize included headers to eliminate headers found in linux/sound + + + + + + +Version 1.1.5 + + + + + Disable some overly-verbose debugging code + + + + + Remove unnecessary sound locks + + + + + Fix some ioctls for better time resolution + + + + + Begin spin_lock_irqsave -> spin_lock_irq conversion in via_dsp_ioctl + + + + + + +Version 1.1.4 + + + + + Completed rewrite of driver. Eliminated SoundBlaster compatibility + completely, and now uses the much-faster scatter-gather DMA engine. + + + + + + + + + Internal Functions +!Isound/oss/via82cxxx_audio.c + + + + + diff --git a/Documentation/DocBook/videobook.tmpl b/Documentation/DocBook/videobook.tmpl new file mode 100644 index 000000000000..3ec6c875588a --- /dev/null +++ b/Documentation/DocBook/videobook.tmpl @@ -0,0 +1,1663 @@ + + + + + + Video4Linux Programming + + + + Alan + Cox + +
+ alan@redhat.com +
+
+
+
+ + + 2000 + Alan Cox + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + + Introduction + + Parts of this document first appeared in Linux Magazine under a + ninety day exclusivity. + + + Video4Linux is intended to provide a common programming interface + for the many TV and capture cards now on the market, as well as + parallel port and USB video cameras. Radio, teletext decoders and + vertical blanking data interfaces are also provided. + + + + Radio Devices + + There are a wide variety of radio interfaces available for PC's, and these + are generally very simple to program. The biggest problem with supporting + such devices is normally extracting documentation from the vendor. + + + The radio interface supports a simple set of control ioctls standardised + across all radio and tv interfaces. It does not support read or write, which + are used for video streams. The reason radio cards do not allow you to read + the audio stream into an application is that without exception they provide + a connection on to a soundcard. Soundcards can be used to read the radio + data just fine. + + + Registering Radio Devices + + The Video4linux core provides an interface for registering devices. The + first step in writing our radio card driver is to register it. + + + + +static struct video_device my_radio +{ + "My radio", + VID_TYPE_TUNER, + VID_HARDWARE_MYRADIO, + radio_open. + radio_close, + NULL, /* no read */ + NULL, /* no write */ + NULL, /* no poll */ + radio_ioctl, + NULL, /* no special init function */ + NULL /* no private data */ +}; + + + + + This declares our video4linux device driver interface. The VID_TYPE_ value + defines what kind of an interface we are, and defines basic capabilities. + + + The only defined value relevant for a radio card is VID_TYPE_TUNER which + indicates that the device can be tuned. Clearly our radio is going to have some + way to change channel so it is tuneable. + + + The VID_HARDWARE_ types are unique to each device. Numbers are assigned by + alan@redhat.com when device drivers are going to be released. Until then you + can pull a suitably large number out of your hat and use it. 10000 should be + safe for a very long time even allowing for the huge number of vendors + making new and different radio cards at the moment. + + + We declare an open and close routine, but we do not need read or write, + which are used to read and write video data to or from the card itself. As + we have no read or write there is no poll function. + + + The private initialise function is run when the device is registered. In + this driver we've already done all the work needed. The final pointer is a + private data pointer that can be used by the device driver to attach and + retrieve private data structures. We set this field "priv" to NULL for + the moment. + + + Having the structure defined is all very well but we now need to register it + with the kernel. + + + + +static int io = 0x320; + +int __init myradio_init(struct video_init *v) +{ + if(!request_region(io, MY_IO_SIZE, "myradio")) + { + printk(KERN_ERR + "myradio: port 0x%03X is in use.\n", io); + return -EBUSY; + } + + if(video_device_register(&my_radio, VFL_TYPE_RADIO)==-1) { + release_region(io, MY_IO_SIZE); + return -EINVAL; + } + return 0; +} + + + + The first stage of the initialisation, as is normally the case, is to check + that the I/O space we are about to fiddle with doesn't belong to some other + driver. If it is we leave well alone. If the user gives the address of the + wrong device then we will spot this. These policies will generally avoid + crashing the machine. + + + Now we ask the Video4Linux layer to register the device for us. We hand it + our carefully designed video_device structure and also tell it which group + of devices we want it registered with. In this case VFL_TYPE_RADIO. + + + The types available are + + Device Types + + + + VFL_TYPE_RADIO/dev/radio{n} + + Radio devices are assigned in this block. As with all of these + selections the actual number assignment is done by the video layer + accordijng to what is free. + + VFL_TYPE_GRABBER/dev/video{n} + Video capture devices and also -- counter-intuitively for the name -- + hardware video playback devices such as MPEG2 cards. + + VFL_TYPE_VBI/dev/vbi{n} + The VBI devices capture the hidden lines on a television picture + that carry further information like closed caption data, teletext + (primarily in Europe) and now Intercast and the ATVEC internet + television encodings. + + VFL_TYPE_VTX/dev/vtx[n} + VTX is 'Videotext' also known as 'Teletext'. This is a system for + sending numbered, 40x25, mostly textual page images over the hidden + lines. Unlike the /dev/vbi interfaces, this is for 'smart' decoder + chips. (The use of the word smart here has to be taken in context, + the smartest teletext chips are fairly dumb pieces of technology). + + + + +
+ + We are most definitely a radio. + + + Finally we allocate our I/O space so that nobody treads on us and return 0 + to signify general happiness with the state of the universe. + +
+ + Opening And Closing The Radio + + + The functions we declared in our video_device are mostly very simple. + Firstly we can drop in what is basically standard code for open and close. + + + + +static int users = 0; + +static int radio_open(stuct video_device *dev, int flags) +{ + if(users) + return -EBUSY; + users++; + return 0; +} + + + + At open time we need to do nothing but check if someone else is also using + the radio card. If nobody is using it we make a note that we are using it, + then we ensure that nobody unloads our driver on us. + + + + +static int radio_close(struct video_device *dev) +{ + users--; +} + + + + At close time we simply need to reduce the user count and allow the module + to become unloadable. + + + If you are sharp you will have noticed neither the open nor the close + routines attempt to reset or change the radio settings. This is intentional. + It allows an application to set up the radio and exit. It avoids a user + having to leave an application running all the time just to listen to the + radio. + + + + The Ioctl Interface + + This leaves the ioctl routine, without which the driver will not be + terribly useful to anyone. + + + + +static int radio_ioctl(struct video_device *dev, unsigned int cmd, void *arg) +{ + switch(cmd) + { + case VIDIOCGCAP: + { + struct video_capability v; + v.type = VID_TYPE_TUNER; + v.channels = 1; + v.audios = 1; + v.maxwidth = 0; + v.minwidth = 0; + v.maxheight = 0; + v.minheight = 0; + strcpy(v.name, "My Radio"); + if(copy_to_user(arg, &v, sizeof(v))) + return -EFAULT; + return 0; + } + + + + VIDIOCGCAP is the first ioctl all video4linux devices must support. It + allows the applications to find out what sort of a card they have found and + to figure out what they want to do about it. The fields in the structure are + + struct video_capability fields + + + + nameThe device text name. This is intended for the user. + + channelsThe number of different channels you can tune on + this card. It could even by zero for a card that has + no tuning capability. For our simple FM radio it is 1. + An AM/FM radio would report 2. + + audiosThe number of audio inputs on this device. For our + radio there is only one audio input. + + minwidth,minheightThe smallest size the card is capable of capturing + images in. We set these to zero. Radios do not + capture pictures + + maxwidth,maxheightThe largest image size the card is capable of + capturing. For our radio we report 0. + + + typeThis reports the capabilities of the device, and + matches the field we filled in in the struct + video_device when registering. + + + +
+ + Having filled in the fields, we use copy_to_user to copy the structure into + the users buffer. If the copy fails we return an EFAULT to the application + so that it knows it tried to feed us garbage. + + + The next pair of ioctl operations select which tuner is to be used and let + the application find the tuner properties. We have only a single FM band + tuner in our example device. + + + + + case VIDIOCGTUNER: + { + struct video_tuner v; + if(copy_from_user(&v, arg, sizeof(v))!=0) + return -EFAULT; + if(v.tuner) + return -EINVAL; + v.rangelow=(87*16000); + v.rangehigh=(108*16000); + v.flags = VIDEO_TUNER_LOW; + v.mode = VIDEO_MODE_AUTO; + v.signal = 0xFFFF; + strcpy(v.name, "FM"); + if(copy_to_user(&v, arg, sizeof(v))!=0) + return -EFAULT; + return 0; + } + + + + The VIDIOCGTUNER ioctl allows applications to query a tuner. The application + sets the tuner field to the tuner number it wishes to query. The query does + not change the tuner that is being used, it merely enquires about the tuner + in question. + + + We have exactly one tuner so after copying the user buffer to our temporary + structure we complain if they asked for a tuner other than tuner 0. + + + The video_tuner structure has the following fields + + struct video_tuner fields + + + + int tunerThe number of the tuner in question + + char name[32]A text description of this tuner. "FM" will do fine. + This is intended for the application. + + u32 flags + Tuner capability flags + + + u16 modeThe current reception mode + + + u16 signalThe signal strength scaled between 0 and 65535. If + a device cannot tell the signal strength it should + report 65535. Many simple cards contain only a + signal/no signal bit. Such cards will report either + 0 or 65535. + + + u32 rangelow, rangehigh + The range of frequencies supported by the radio + or TV. It is scaled according to the VIDEO_TUNER_LOW + flag. + + + + +
+ + struct video_tuner flags + + + + VIDEO_TUNER_PALA PAL TV tuner + + VIDEO_TUNER_NTSCAn NTSC (US) TV tuner + + VIDEO_TUNER_SECAMA SECAM (French) TV tuner + + VIDEO_TUNER_LOW + The tuner frequency is scaled in 1/16th of a KHz + steps. If not it is in 1/16th of a MHz steps + + + VIDEO_TUNER_NORMThe tuner can set its format + + VIDEO_TUNER_STEREO_ONThe tuner is currently receiving a stereo signal + + + +
+ + struct video_tuner modes + + + + VIDEO_MODE_PALPAL Format + + VIDEO_MODE_NTSCNTSC Format (USA) + + VIDEO_MODE_SECAMFrench Format + + VIDEO_MODE_AUTOA device that does not need to do + TV format switching + + + +
+ + The settings for the radio card are thus fairly simple. We report that we + are a tuner called "FM" for FM radio. In order to get the best tuning + resolution we report VIDEO_TUNER_LOW and select tuning to 1/16th of KHz. Its + unlikely our card can do that resolution but it is a fair bet the card can + do better than 1/16th of a MHz. VIDEO_TUNER_LOW is appropriate to almost all + radio usage. + + + We report that the tuner automatically handles deciding what format it is + receiving - true enough as it only handles FM radio. Our example card is + also incapable of detecting stereo or signal strengths so it reports a + strength of 0xFFFF (maximum) and no stereo detected. + + + To finish off we set the range that can be tuned to be 87-108Mhz, the normal + FM broadcast radio range. It is important to find out what the card is + actually capable of tuning. It is easy enough to simply use the FM broadcast + range. Unfortunately if you do this you will discover the FM broadcast + ranges in the USA, Europe and Japan are all subtly different and some users + cannot receive all the stations they wish. + + + The application also needs to be able to set the tuner it wishes to use. In + our case, with a single tuner this is rather simple to arrange. + + + + case VIDIOCSTUNER: + { + struct video_tuner v; + if(copy_from_user(&v, arg, sizeof(v))) + return -EFAULT; + if(v.tuner != 0) + return -EINVAL; + return 0; + } + + + + We copy the user supplied structure into kernel memory so we can examine it. + If the user has selected a tuner other than zero we reject the request. If + they wanted tuner 0 then, surprisingly enough, that is the current tuner already. + + + The next two ioctls we need to provide are to get and set the frequency of + the radio. These both use an unsigned long argument which is the frequency. + The scale of the frequency depends on the VIDEO_TUNER_LOW flag as I + mentioned earlier on. Since we have VIDEO_TUNER_LOW set this will be in + 1/16ths of a KHz. + + + +static unsigned long current_freq; + + + + case VIDIOCGFREQ: + if(copy_to_user(arg, &current_freq, + sizeof(unsigned long)) + return -EFAULT; + return 0; + + + + Querying the frequency in our case is relatively simple. Our radio card is + too dumb to let us query the signal strength so we remember our setting if + we know it. All we have to do is copy it to the user. + + + + + case VIDIOCSFREQ: + { + u32 freq; + if(copy_from_user(arg, &freq, + sizeof(unsigned long))!=0) + return -EFAULT; + if(hardware_set_freq(freq)<0) + return -EINVAL; + current_freq = freq; + return 0; + } + + + + Setting the frequency is a little more complex. We begin by copying the + desired frequency into kernel space. Next we call a hardware specific routine + to set the radio up. This might be as simple as some scaling and a few + writes to an I/O port. For most radio cards it turns out a good deal more + complicated and may involve programming things like a phase locked loop on + the card. This is what documentation is for. + + + The final set of operations we need to provide for our radio are the + volume controls. Not all radio cards can even do volume control. After all + there is a perfectly good volume control on the sound card. We will assume + our radio card has a simple 4 step volume control. + + + There are two ioctls with audio we need to support + + + +static int current_volume=0; + + case VIDIOCGAUDIO: + { + struct video_audio v; + if(copy_from_user(&v, arg, sizeof(v))) + return -EFAULT; + if(v.audio != 0) + return -EINVAL; + v.volume = 16384*current_volume; + v.step = 16384; + strcpy(v.name, "Radio"); + v.mode = VIDEO_SOUND_MONO; + v.balance = 0; + v.base = 0; + v.treble = 0; + + if(copy_to_user(arg. &v, sizeof(v))) + return -EFAULT; + return 0; + } + + + + Much like the tuner we start by copying the user structure into kernel + space. Again we check if the user has asked for a valid audio input. We have + only input 0 and we punt if they ask for another input. + + + Then we fill in the video_audio structure. This has the following format + + struct video_audio fields + + + + audioThe input the user wishes to query + + volumeThe volume setting on a scale of 0-65535 + + baseThe base level on a scale of 0-65535 + + trebleThe treble level on a scale of 0-65535 + + flagsThe features this audio device supports + + + nameA text name to display to the user. We picked + "Radio" as it explains things quite nicely. + + modeThe current reception mode for the audio + + We report MONO because our card is too stupid to know if it is in + mono or stereo. + + + balanceThe stereo balance on a scale of 0-65535, 32768 is + middle. + + stepThe step by which the volume control jumps. This is + used to help make it easy for applications to set + slider behaviour. + + + +
+ + struct video_audio flags + + + + VIDEO_AUDIO_MUTEThe audio is currently muted. We + could fake this in our driver but we + choose not to bother. + + VIDEO_AUDIO_MUTABLEThe input has a mute option + + VIDEO_AUDIO_TREBLEThe input has a treble control + + VIDEO_AUDIO_BASSThe input has a base control + + + +
+ + struct video_audio modes + + + + VIDEO_SOUND_MONOMono sound + + VIDEO_SOUND_STEREOStereo sound + + VIDEO_SOUND_LANG1Alternative language 1 (TV specific) + + VIDEO_SOUND_LANG2Alternative language 2 (TV specific) + + + +
+ + Having filled in the structure we copy it back to user space. + + + The VIDIOCSAUDIO ioctl allows the user to set the audio parameters in the + video_audio structure. The driver does its best to honour the request. + + + + case VIDIOCSAUDIO: + { + struct video_audio v; + if(copy_from_user(&v, arg, sizeof(v))) + return -EFAULT; + if(v.audio) + return -EINVAL; + current_volume = v/16384; + hardware_set_volume(current_volume); + return 0; + } + + + + In our case there is very little that the user can set. The volume is + basically the limit. Note that we could pretend to have a mute feature + by rewriting this to + + + + case VIDIOCSAUDIO: + { + struct video_audio v; + if(copy_from_user(&v, arg, sizeof(v))) + return -EFAULT; + if(v.audio) + return -EINVAL; + current_volume = v/16384; + if(v.flags&VIDEO_AUDIO_MUTE) + hardware_set_volume(0); + else + hardware_set_volume(current_volume); + current_muted = v.flags & + VIDEO_AUDIO_MUTE; + return 0; + } + + + + This with the corresponding changes to the VIDIOCGAUDIO code to report the + state of the mute flag we save and to report the card has a mute function, + will allow applications to use a mute facility with this card. It is + questionable whether this is a good idea however. User applications can already + fake this themselves and kernel space is precious. + + + We now have a working radio ioctl handler. So we just wrap up the function + + + + + } + return -ENOIOCTLCMD; +} + + + + and pass the Video4Linux layer back an error so that it knows we did not + understand the request we got passed. + +
+ + Module Wrapper + + Finally we add in the usual module wrapping and the driver is done. + + + +#ifndef MODULE + +static int io = 0x300; + +#else + +static int io = -1; + +#endif + +MODULE_AUTHOR("Alan Cox"); +MODULE_DESCRIPTION("A driver for an imaginary radio card."); +module_param(io, int, 0444); +MODULE_PARM_DESC(io, "I/O address of the card."); + +static int __init init(void) +{ + if(io==-1) + { + printk(KERN_ERR + "You must set an I/O address with io=0x???\n"); + return -EINVAL; + } + return myradio_init(NULL); +} + +static void __exit cleanup(void) +{ + video_unregister_device(&my_radio); + release_region(io, MY_IO_SIZE); +} + +module_init(init); +module_exit(cleanup); + + + + In this example we set the IO base by default if the driver is compiled into + the kernel: you can still set it using "my_radio.irq" if this file is called my_radio.c. For the module we require the + user sets the parameter. We set io to a nonsense port (-1) so that we can + tell if the user supplied an io parameter or not. + + + We use MODULE_ defines to give an author for the card driver and a + description. We also use them to declare that io is an integer and it is the + address of the card, and can be read by anyone from sysfs. + + + The clean-up routine unregisters the video_device we registered, and frees + up the I/O space. Note that the unregister takes the actual video_device + structure as its argument. Unlike the file operations structure which can be + shared by all instances of a device a video_device structure as an actual + instance of the device. If you are registering multiple radio devices you + need to fill in one structure per device (most likely by setting up a + template and copying it to each of the actual device structures). + + +
+ + Video Capture Devices + + Video Capture Device Types + + The video capture devices share the same interfaces as radio devices. In + order to explain the video capture interface I will use the example of a + camera that has no tuners or audio input. This keeps the example relatively + clean. To get both combine the two driver examples. + + + Video capture devices divide into four categories. A little technology + backgrounder. Full motion video even at television resolution (which is + actually fairly low) is pretty resource-intensive. You are continually + passing megabytes of data every second from the capture card to the display. + several alternative approaches have emerged because copying this through the + processor and the user program is a particularly bad idea . + + + The first is to add the television image onto the video output directly. + This is also how some 3D cards work. These basic cards can generally drop the + video into any chosen rectangle of the display. Cards like this, which + include most mpeg1 cards that used the feature connector, aren't very + friendly in a windowing environment. They don't understand windows or + clipping. The video window is always on the top of the display. + + + Chroma keying is a technique used by cards to get around this. It is an old + television mixing trick where you mark all the areas you wish to replace + with a single clear colour that isn't used in the image - TV people use an + incredibly bright blue while computing people often use a particularly + virulent purple. Bright blue occurs on the desktop. Anyone with virulent + purple windows has another problem besides their TV overlay. + + + The third approach is to copy the data from the capture card to the video + card, but to do it directly across the PCI bus. This relieves the processor + from doing the work but does require some smartness on the part of the video + capture chip, as well as a suitable video card. Programming this kind of + card and more so debugging it can be extremely tricky. There are some quite + complicated interactions with the display and you may also have to cope with + various chipset bugs that show up when PCI cards start talking to each + other. + + + To keep our example fairly simple we will assume a card that supports + overlaying a flat rectangular image onto the frame buffer output, and which + can also capture stuff into processor memory. + + + + Registering Video Capture Devices + + This time we need to add more functions for our camera device. + + +static struct video_device my_camera +{ + "My Camera", + VID_TYPE_OVERLAY|VID_TYPE_SCALES|\ + VID_TYPE_CAPTURE|VID_TYPE_CHROMAKEY, + VID_HARDWARE_MYCAMERA, + camera_open. + camera_close, + camera_read, /* no read */ + NULL, /* no write */ + camera_poll, /* no poll */ + camera_ioctl, + NULL, /* no special init function */ + NULL /* no private data */ +}; + + + We need a read() function which is used for capturing data from + the card, and we need a poll function so that a driver can wait for the next + frame to be captured. + + + We use the extra video capability flags that did not apply to the + radio interface. The video related flags are + + Capture Capabilities + + + +VID_TYPE_CAPTUREWe support image capture + +VID_TYPE_TELETEXTA teletext capture device (vbi{n]) + +VID_TYPE_OVERLAYThe image can be directly overlaid onto the + frame buffer + +VID_TYPE_CHROMAKEYChromakey can be used to select which parts + of the image to display + +VID_TYPE_CLIPPINGIt is possible to give the board a list of + rectangles to draw around. + +VID_TYPE_FRAMERAMThe video capture goes into the video memory + and actually changes it. Applications need + to know this so they can clean up after the + card + +VID_TYPE_SCALESThe image can be scaled to various sizes, + rather than being a single fixed size. + +VID_TYPE_MONOCHROMEThe capture will be monochrome. This isn't a + complete answer to the question since a mono + camera on a colour capture card will still + produce mono output. + +VID_TYPE_SUBCAPTUREThe card allows only part of its field of + view to be captured. This enables + applications to avoid copying all of a large + image into memory when only some section is + relevant. + + + +
+ + We set VID_TYPE_CAPTURE so that we are seen as a capture card, + VID_TYPE_CHROMAKEY so the application knows it is time to draw in virulent + purple, and VID_TYPE_SCALES because we can be resized. + + + Our setup is fairly similar. This time we also want an interrupt line + for the 'frame captured' signal. Not all cards have this so some of them + cannot handle poll(). + + + + +static int io = 0x320; +static int irq = 11; + +int __init mycamera_init(struct video_init *v) +{ + if(!request_region(io, MY_IO_SIZE, "mycamera")) + { + printk(KERN_ERR + "mycamera: port 0x%03X is in use.\n", io); + return -EBUSY; + } + + if(video_device_register(&my_camera, + VFL_TYPE_GRABBER)==-1) { + release_region(io, MY_IO_SIZE); + return -EINVAL; + } + return 0; +} + + + + This is little changed from the needs of the radio card. We specify + VFL_TYPE_GRABBER this time as we want to be allocated a /dev/video name. + +
+ + Opening And Closing The Capture Device + + + +static int users = 0; + +static int camera_open(stuct video_device *dev, int flags) +{ + if(users) + return -EBUSY; + if(request_irq(irq, camera_irq, 0, "camera", dev)<0) + return -EBUSY; + users++; + return 0; +} + + +static int camera_close(struct video_device *dev) +{ + users--; + free_irq(irq, dev); +} + + + The open and close routines are also quite similar. The only real change is + that we now request an interrupt for the camera device interrupt line. If we + cannot get the interrupt we report EBUSY to the application and give up. + + + + Interrupt Handling + + Our example handler is for an ISA bus device. If it was PCI you would be + able to share the interrupt and would have set SA_SHIRQ to indicate a + shared IRQ. We pass the device pointer as the interrupt routine argument. We + don't need to since we only support one card but doing this will make it + easier to upgrade the driver for multiple devices in the future. + + + Our interrupt routine needs to do little if we assume the card can simply + queue one frame to be read after it captures it. + + + + +static struct wait_queue *capture_wait; +static int capture_ready = 0; + +static void camera_irq(int irq, void *dev_id, + struct pt_regs *regs) +{ + capture_ready=1; + wake_up_interruptible(&capture_wait); +} + + + The interrupt handler is nice and simple for this card as we are assuming + the card is buffering the frame for us. This means we have little to do but + wake up anybody interested. We also set a capture_ready flag, as we may + capture a frame before an application needs it. In this case we need to know + that a frame is ready. If we had to collect the frame on the interrupt life + would be more complex. + + + The two new routines we need to supply are camera_read which returns a + frame, and camera_poll which waits for a frame to become ready. + + + + +static int camera_poll(struct video_device *dev, + struct file *file, struct poll_table *wait) +{ + poll_wait(file, &capture_wait, wait); + if(capture_read) + return POLLIN|POLLRDNORM; + return 0; +} + + + + Our wait queue for polling is the capture_wait queue. This will cause the + task to be woken up by our camera_irq routine. We check capture_read to see + if there is an image present and if so report that it is readable. + + + + Reading The Video Image + + + +static long camera_read(struct video_device *dev, char *buf, + unsigned long count) +{ + struct wait_queue wait = { current, NULL }; + u8 *ptr; + int len; + int i; + + add_wait_queue(&capture_wait, &wait); + + while(!capture_ready) + { + if(file->flags&O_NDELAY) + { + remove_wait_queue(&capture_wait, &wait); + current->state = TASK_RUNNING; + return -EWOULDBLOCK; + } + if(signal_pending(current)) + { + remove_wait_queue(&capture_wait, &wait); + current->state = TASK_RUNNING; + return -ERESTARTSYS; + } + schedule(); + current->state = TASK_INTERRUPTIBLE; + } + remove_wait_queue(&capture_wait, &wait); + current->state = TASK_RUNNING; + + + + The first thing we have to do is to ensure that the application waits until + the next frame is ready. The code here is almost identical to the mouse code + we used earlier in this chapter. It is one of the common building blocks of + Linux device driver code and probably one which you will find occurs in any + drivers you write. + + + We wait for a frame to be ready, or for a signal to interrupt our waiting. If a + signal occurs we need to return from the system call so that the signal can + be sent to the application itself. We also check to see if the user actually + wanted to avoid waiting - ie if they are using non-blocking I/O and have other things + to get on with. + + + Next we copy the data from the card to the user application. This is rarely + as easy as our example makes out. We will add capture_w, and capture_h here + to hold the width and height of the captured image. We assume the card only + supports 24bit RGB for now. + + + + + + capture_ready = 0; + + ptr=(u8 *)buf; + len = capture_w * 3 * capture_h; /* 24bit RGB */ + + if(len>count) + len=count; /* Doesn't all fit */ + + for(i=0; i<len; i++) + { + put_user(inb(io+IMAGE_DATA), ptr); + ptr++; + } + + hardware_restart_capture(); + + return i; +} + + + + For a real hardware device you would try to avoid the loop with put_user(). + Each call to put_user() has a time overhead checking whether the accesses to user + space are allowed. It would be better to read a line into a temporary buffer + then copy this to user space in one go. + + + Having captured the image and put it into user space we can kick the card to + get the next frame acquired. + + + + Video Ioctl Handling + + As with the radio driver the major control interface is via the ioctl() + function. Video capture devices support the same tuner calls as a radio + device and also support additional calls to control how the video functions + are handled. In this simple example the card has no tuners to avoid making + the code complex. + + + + + +static int camera_ioctl(struct video_device *dev, unsigned int cmd, void *arg) +{ + switch(cmd) + { + case VIDIOCGCAP: + { + struct video_capability v; + v.type = VID_TYPE_CAPTURE|\ + VID_TYPE_CHROMAKEY|\ + VID_TYPE_SCALES|\ + VID_TYPE_OVERLAY; + v.channels = 1; + v.audios = 0; + v.maxwidth = 640; + v.minwidth = 16; + v.maxheight = 480; + v.minheight = 16; + strcpy(v.name, "My Camera"); + if(copy_to_user(arg, &v, sizeof(v))) + return -EFAULT; + return 0; + } + + + + + The first ioctl we must support and which all video capture and radio + devices are required to support is VIDIOCGCAP. This behaves exactly the same + as with a radio device. This time, however, we report the extra capabilities + we outlined earlier on when defining our video_dev structure. + + + We now set the video flags saying that we support overlay, capture, + scaling and chromakey. We also report size limits - our smallest image is + 16x16 pixels, our largest is 640x480. + + + To keep things simple we report no audio and no tuning capabilities at all. + + + + case VIDIOCGCHAN: + { + struct video_channel v; + if(copy_from_user(&v, arg, sizeof(v))) + return -EFAULT; + if(v.channel != 0) + return -EINVAL; + v.flags = 0; + v.tuners = 0; + v.type = VIDEO_TYPE_CAMERA; + v.norm = VIDEO_MODE_AUTO; + strcpy(v.name, "Camera Input");break; + if(copy_to_user(&v, arg, sizeof(v))) + return -EFAULT; + return 0; + } + + + + + This follows what is very much the standard way an ioctl handler looks + in Linux. We copy the data into a kernel space variable and we check that the + request is valid (in this case that the input is 0). Finally we copy the + camera info back to the user. + + + The VIDIOCGCHAN ioctl allows a user to ask about video channels (that is + inputs to the video card). Our example card has a single camera input. The + fields in the structure are + + struct video_channel fields + + + + + channelThe channel number we are selecting + + nameThe name for this channel. This is intended + to describe the port to the user. + Appropriate names are therefore things like + "Camera" "SCART input" + + flagsChannel properties + + typeInput type + + normThe current television encoding being used + if relevant for this channel. + + + + +
+ struct video_channel flags + + + + VIDEO_VC_TUNERChannel has a tuner. + + VIDEO_VC_AUDIOChannel has audio. + + + +
+ struct video_channel types + + + + VIDEO_TYPE_TVTelevision input. + + VIDEO_TYPE_CAMERAFixed camera input. + + 0Type is unknown. + + + +
+ struct video_channel norms + + + + VIDEO_MODE_PALPAL encoded Television + + VIDEO_MODE_NTSCNTSC (US) encoded Television + + VIDEO_MODE_SECAMSECAM (French) Television + + VIDEO_MODE_AUTOAutomatic switching, or format does not + matter + + + +
+ + The corresponding VIDIOCSCHAN ioctl allows a user to change channel and to + request the norm is changed - for example to switch between a PAL or an NTSC + format camera. + + + + + case VIDIOCSCHAN: + { + struct video_channel v; + if(copy_from_user(&v, arg, sizeof(v))) + return -EFAULT; + if(v.channel != 0) + return -EINVAL; + if(v.norm != VIDEO_MODE_AUTO) + return -EINVAL; + return 0; + } + + + + + The implementation of this call in our driver is remarkably easy. Because we + are assuming fixed format hardware we need only check that the user has not + tried to change anything. + + + The user also needs to be able to configure and adjust the picture they are + seeing. This is much like adjusting a television set. A user application + also needs to know the palette being used so that it knows how to display + the image that has been captured. The VIDIOCGPICT and VIDIOCSPICT ioctl + calls provide this information. + + + + + case VIDIOCGPICT + { + struct video_picture v; + v.brightness = hardware_brightness(); + v.hue = hardware_hue(); + v.colour = hardware_saturation(); + v.contrast = hardware_brightness(); + /* Not settable */ + v.whiteness = 32768; + v.depth = 24; /* 24bit */ + v.palette = VIDEO_PALETTE_RGB24; + if(copy_to_user(&v, arg, + sizeof(v))) + return -EFAULT; + return 0; + } + + + + + The brightness, hue, color, and contrast provide the picture controls that + are akin to a conventional television. Whiteness provides additional + control for greyscale images. All of these values are scaled between 0-65535 + and have 32768 as the mid point setting. The scaling means that applications + do not have to worry about the capability range of the hardware but can let + it make a best effort attempt. + + + Our depth is 24, as this is in bits. We will be returning RGB24 format. This + has one byte of red, then one of green, then one of blue. This then repeats + for every other pixel in the image. The other common formats the interface + defines are + + Framebuffer Encodings + + + + GREYLinear greyscale. This is for simple cameras and the + like + + RGB565The top 5 bits hold 32 red levels, the next six bits + hold green and the low 5 bits hold blue. + + RGB555The top bit is clear. The red green and blue levels + each occupy five bits. + + + +
+ + Additional modes are support for YUV capture formats. These are common for + TV and video conferencing applications. + + + The VIDIOCSPICT ioctl allows a user to set some of the picture parameters. + Exactly which ones are supported depends heavily on the card itself. It is + possible to support many modes and effects in software. In general doing + this in the kernel is a bad idea. Video capture is a performance-sensitive + application and the programs can often do better if they aren't being + 'helped' by an overkeen driver writer. Thus for our device we will report + RGB24 only and refuse to allow a change. + + + + + case VIDIOCSPICT: + { + struct video_picture v; + if(copy_from_user(&v, arg, sizeof(v))) + return -EFAULT; + if(v.depth!=24 || + v.palette != VIDEO_PALETTE_RGB24) + return -EINVAL; + set_hardware_brightness(v.brightness); + set_hardware_hue(v.hue); + set_hardware_saturation(v.colour); + set_hardware_brightness(v.contrast); + return 0; + } + + + + + We check the user has not tried to change the palette or the depth. We do + not want to carry out some of the changes and then return an error. This may + confuse the application which will be assuming no change occurred. + + + In much the same way as you need to be able to set the picture controls to + get the right capture images, many cards need to know what they are + displaying onto when generating overlay output. In some cases getting this + wrong even makes a nasty mess or may crash the computer. For that reason + the VIDIOCSBUF ioctl used to set up the frame buffer information may well + only be usable by root. + + + We will assume our card is one of the old ISA devices with feature connector + and only supports a couple of standard video modes. Very common for older + cards although the PCI devices are way smarter than this. + + + + +static struct video_buffer capture_fb; + + case VIDIOCGFBUF: + { + if(copy_to_user(arg, &capture_fb, + sizeof(capture_fb))) + return -EFAULT; + return 0; + + } + + + + + We keep the frame buffer information in the format the ioctl uses. This + makes it nice and easy to work with in the ioctl calls. + + + + case VIDIOCSFBUF: + { + struct video_buffer v; + + if(!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if(copy_from_user(&v, arg, sizeof(v))) + return -EFAULT; + if(v.width!=320 && v.width!=640) + return -EINVAL; + if(v.height!=200 && v.height!=240 + && v.height!=400 + && v.height !=480) + return -EINVAL; + memcpy(&capture_fb, &v, sizeof(v)); + hardware_set_fb(&v); + return 0; + } + + + + + + The capable() function checks a user has the required capability. The Linux + operating system has a set of about 30 capabilities indicating privileged + access to services. The default set up gives the superuser (uid 0) all of + them and nobody else has any. + + + We check that the user has the SYS_ADMIN capability, that is they are + allowed to operate as the machine administrator. We don't want anyone but + the administrator making a mess of the display. + + + Next we check for standard PC video modes (320 or 640 wide with either + EGA or VGA depths). If the mode is not a standard video mode we reject it as + not supported by our card. If the mode is acceptable we save it so that + VIDIOCFBUF will give the right answer next time it is called. The + hardware_set_fb() function is some undescribed card specific function to + program the card for the desired mode. + + + Before the driver can display an overlay window it needs to know where the + window should be placed, and also how large it should be. If the card + supports clipping it needs to know which rectangles to omit from the + display. The video_window structure is used to describe the way the image + should be displayed. + + struct video_window fields + + + + widthThe width in pixels of the desired image. The card + may use a smaller size if this size is not available + + heightThe height of the image. The card may use a smaller + size if this size is not available. + + x The X position of the top left of the window. This + is in pixels relative to the left hand edge of the + picture. Not all cards can display images aligned on + any pixel boundary. If the position is unsuitable + the card adjusts the image right and reduces the + width. + + y The Y position of the top left of the window. This + is counted in pixels relative to the top edge of the + picture. As with the width if the card cannot + display starting on this line it will adjust the + values. + + chromakeyThe colour (expressed in RGB32 format) for the + chromakey colour if chroma keying is being used. + + clipsAn array of rectangles that must not be drawn + over. + + clipcountThe number of clips in this array. + + + +
+ + Each clip is a struct video_clip which has the following fields + + video_clip fields + + + + x, yCo-ordinates relative to the display + + width, heightWidth and height in pixels + + nextA spare field for the application to use + + + +
+ + The driver is required to ensure it always draws in the area requested or a smaller area, and that it never draws in any of the areas that are clipped. + This may well mean it has to leave alone. small areas the application wished to be + drawn. + + + Our example card uses chromakey so does not have to address most of the + clipping. We will add a video_window structure to our global variables to + remember our parameters, as we did with the frame buffer. + + + + + case VIDIOCGWIN: + { + if(copy_to_user(arg, &capture_win, + sizeof(capture_win))) + return -EFAULT; + return 0; + } + + + case VIDIOCSWIN: + { + struct video_window v; + if(copy_from_user(&v, arg, sizeof(v))) + return -EFAULT; + if(v.width > 640 || v.height > 480) + return -EINVAL; + if(v.width < 16 || v.height < 16) + return -EINVAL; + hardware_set_key(v.chromakey); + hardware_set_window(v); + memcpy(&capture_win, &v, sizeof(v)); + capture_w = v.width; + capture_h = v.height; + return 0; + } + + + + + Because we are using Chromakey our setup is fairly simple. Mostly we have to + check the values are sane and load them into the capture card. + + + With all the setup done we can now turn on the actual capture/overlay. This + is done with the VIDIOCCAPTURE ioctl. This takes a single integer argument + where 0 is on and 1 is off. + + + + + case VIDIOCCAPTURE: + { + int v; + if(get_user(v, (int *)arg)) + return -EFAULT; + if(v==0) + hardware_capture_off(); + else + { + if(capture_fb.width == 0 + || capture_w == 0) + return -EINVAL; + hardware_capture_on(); + } + return 0; + } + + + + + We grab the flag from user space and either enable or disable according to + its value. There is one small corner case we have to consider here. Suppose + that the capture was requested before the video window or the frame buffer + had been set up. In those cases there will be unconfigured fields in our + card data, as well as unconfigured hardware settings. We check for this case and + return an error if the frame buffer or the capture window width is zero. + + + + + default: + return -ENOIOCTLCMD; + } +} + + + + We don't need to support any other ioctls, so if we get this far, it is time + to tell the video layer that we don't now what the user is talking about. + +
+ + Other Functionality + + The Video4Linux layer supports additional features, including a high + performance mmap() based capture mode and capturing part of the image. + These features are out of the scope of the book. You should however have enough + example code to implement most simple video4linux devices for radio and TV + cards. + + +
+ + Known Bugs And Assumptions + + + Multiple Opens + + + The driver assumes multiple opens should not be allowed. A driver + can work around this but not cleanly. + + + + API Deficiencies + + + The existing API poorly reflects compression capable devices. There + are plans afoot to merge V4L, V4L2 and some other ideas into a + better interface. + + + + + + + + + Public Functions Provided +!Edrivers/media/video/videodev.c + + +
diff --git a/Documentation/DocBook/wanbook.tmpl b/Documentation/DocBook/wanbook.tmpl new file mode 100644 index 000000000000..9eebcc304de4 --- /dev/null +++ b/Documentation/DocBook/wanbook.tmpl @@ -0,0 +1,99 @@ + + + + + + Synchronous PPP and Cisco HDLC Programming Guide + + + + Alan + Cox + +
+ alan@redhat.com +
+
+
+
+ + + 2000 + Alan Cox + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + + Introduction + + The syncppp drivers in Linux provide a fairly complete + implementation of Cisco HDLC and a minimal implementation of + PPP. The longer term goal is to switch the PPP layer to the + generic PPP interface that is new in Linux 2.3.x. The API should + remain unchanged when this is done, but support will then be + available for IPX, compression and other PPP features + + + + Known Bugs And Assumptions + + + PPP is minimal + + + The current PPP implementation is very basic, although sufficient + for most wan usages. + + + + Cisco HDLC Quirks + + + Currently we do not end all packets with the correct Cisco multicast + or unicast flags. Nothing appears to mind too much but this should + be corrected. + + + + + + + + + Public Functions Provided +!Edrivers/net/wan/syncppp.c + + +
diff --git a/Documentation/DocBook/writing_usb_driver.tmpl b/Documentation/DocBook/writing_usb_driver.tmpl new file mode 100644 index 000000000000..51f3bfb6fb6e --- /dev/null +++ b/Documentation/DocBook/writing_usb_driver.tmpl @@ -0,0 +1,419 @@ + + + + + + Writing USB Device Drivers + + + + Greg + Kroah-Hartman + +
+ greg@kroah.com +
+
+
+
+ + + 2001-2002 + Greg Kroah-Hartman + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + + + This documentation is based on an article published in + Linux Journal Magazine, October 2001, Issue 90. + + +
+ + + + + Introduction + + The Linux USB subsystem has grown from supporting only two different + types of devices in the 2.2.7 kernel (mice and keyboards), to over 20 + different types of devices in the 2.4 kernel. Linux currently supports + almost all USB class devices (standard types of devices like keyboards, + mice, modems, printers and speakers) and an ever-growing number of + vendor-specific devices (such as USB to serial converters, digital + cameras, Ethernet devices and MP3 players). For a full list of the + different USB devices currently supported, see Resources. + + + The remaining kinds of USB devices that do not have support on Linux are + almost all vendor-specific devices. Each vendor decides to implement a + custom protocol to talk to their device, so a custom driver usually needs + to be created. Some vendors are open with their USB protocols and help + with the creation of Linux drivers, while others do not publish them, and + developers are forced to reverse-engineer. See Resources for some links + to handy reverse-engineering tools. + + + Because each different protocol causes a new driver to be created, I have + written a generic USB driver skeleton, modeled after the pci-skeleton.c + file in the kernel source tree upon which many PCI network drivers have + been based. This USB skeleton can be found at drivers/usb/usb-skeleton.c + in the kernel source tree. In this article I will walk through the basics + of the skeleton driver, explaining the different pieces and what needs to + be done to customize it to your specific device. + + + + + Linux USB Basics + + If you are going to write a Linux USB driver, please become familiar with + the USB protocol specification. It can be found, along with many other + useful documents, at the USB home page (see Resources). An excellent + introduction to the Linux USB subsystem can be found at the USB Working + Devices List (see Resources). It explains how the Linux USB subsystem is + structured and introduces the reader to the concept of USB urbs, which + are essential to USB drivers. + + + The first thing a Linux USB driver needs to do is register itself with + the Linux USB subsystem, giving it some information about which devices + the driver supports and which functions to call when a device supported + by the driver is inserted or removed from the system. All of this + information is passed to the USB subsystem in the usb_driver structure. + The skeleton driver declares a usb_driver as: + + +static struct usb_driver skel_driver = { + .name = "skeleton", + .probe = skel_probe, + .disconnect = skel_disconnect, + .fops = &skel_fops, + .minor = USB_SKEL_MINOR_BASE, + .id_table = skel_table, +}; + + + The variable name is a string that describes the driver. It is used in + informational messages printed to the system log. The probe and + disconnect function pointers are called when a device that matches the + information provided in the id_table variable is either seen or removed. + + + The fops and minor variables are optional. Most USB drivers hook into + another kernel subsystem, such as the SCSI, network or TTY subsystem. + These types of drivers register themselves with the other kernel + subsystem, and any user-space interactions are provided through that + interface. But for drivers that do not have a matching kernel subsystem, + such as MP3 players or scanners, a method of interacting with user space + is needed. The USB subsystem provides a way to register a minor device + number and a set of file_operations function pointers that enable this + user-space interaction. The skeleton driver needs this kind of interface, + so it provides a minor starting number and a pointer to its + file_operations functions. + + + The USB driver is then registered with a call to usb_register, usually in + the driver's init function, as shown here: + + +static int __init usb_skel_init(void) +{ + int result; + + /* register this driver with the USB subsystem */ + result = usb_register(&skel_driver); + if (result < 0) { + err("usb_register failed for the "__FILE__ "driver." + "Error number %d", result); + return -1; + } + + return 0; +} +module_init(usb_skel_init); + + + When the driver is unloaded from the system, it needs to unregister + itself with the USB subsystem. This is done with the usb_unregister + function: + + +static void __exit usb_skel_exit(void) +{ + /* deregister this driver with the USB subsystem */ + usb_deregister(&skel_driver); +} +module_exit(usb_skel_exit); + + + To enable the linux-hotplug system to load the driver automatically when + the device is plugged in, you need to create a MODULE_DEVICE_TABLE. The + following code tells the hotplug scripts that this module supports a + single device with a specific vendor and product ID: + + +/* table of devices that work with this driver */ +static struct usb_device_id skel_table [] = { + { USB_DEVICE(USB_SKEL_VENDOR_ID, USB_SKEL_PRODUCT_ID) }, + { } /* Terminating entry */ +}; +MODULE_DEVICE_TABLE (usb, skel_table); + + + There are other macros that can be used in describing a usb_device_id for + drivers that support a whole class of USB drivers. See usb.h for more + information on this. + + + + + Device operation + + When a device is plugged into the USB bus that matches the device ID + pattern that your driver registered with the USB core, the probe function + is called. The usb_device structure, interface number and the interface ID + are passed to the function: + + +static int skel_probe(struct usb_interface *interface, + const struct usb_device_id *id) + + + The driver now needs to verify that this device is actually one that it + can accept. If so, it returns 0. + If not, or if any error occurs during initialization, an errorcode + (such as -ENOMEM or -ENODEV) + is returned from the probe function. + + + In the skeleton driver, we determine what end points are marked as bulk-in + and bulk-out. We create buffers to hold the data that will be sent and + received from the device, and a USB urb to write data to the device is + initialized. + + + Conversely, when the device is removed from the USB bus, the disconnect + function is called with the device pointer. The driver needs to clean any + private data that has been allocated at this time and to shut down any + pending urbs that are in the USB system. The driver also unregisters + itself from the devfs subsystem with the call: + + +/* remove our devfs node */ +devfs_unregister(skel->devfs); + + + Now that the device is plugged into the system and the driver is bound to + the device, any of the functions in the file_operations structure that + were passed to the USB subsystem will be called from a user program trying + to talk to the device. The first function called will be open, as the + program tries to open the device for I/O. We increment our private usage + count and save off a pointer to our internal structure in the file + structure. This is done so that future calls to file operations will + enable the driver to determine which device the user is addressing. All + of this is done with the following code: + + +/* increment our usage count for the module */ +++skel->open_count; + +/* save our object in the file's private structure */ +file->private_data = dev; + + + After the open function is called, the read and write functions are called + to receive and send data to the device. In the skel_write function, we + receive a pointer to some data that the user wants to send to the device + and the size of the data. The function determines how much data it can + send to the device based on the size of the write urb it has created (this + size depends on the size of the bulk out end point that the device has). + Then it copies the data from user space to kernel space, points the urb to + the data and submits the urb to the USB subsystem. This can be shown in + he following code: + + +/* we can only write as much as 1 urb will hold */ +bytes_written = (count > skel->bulk_out_size) ? skel->bulk_out_size : count; + +/* copy the data from user space into our urb */ +copy_from_user(skel->write_urb->transfer_buffer, buffer, bytes_written); + +/* set up our urb */ +usb_fill_bulk_urb(skel->write_urb, + skel->dev, + usb_sndbulkpipe(skel->dev, skel->bulk_out_endpointAddr), + skel->write_urb->transfer_buffer, + bytes_written, + skel_write_bulk_callback, + skel); + +/* send the data out the bulk port */ +result = usb_submit_urb(skel->write_urb); +if (result) { + err("Failed submitting write urb, error %d", result); +} + + + When the write urb is filled up with the proper information using the + usb_fill_bulk_urb function, we point the urb's completion callback to call our + own skel_write_bulk_callback function. This function is called when the + urb is finished by the USB subsystem. The callback function is called in + interrupt context, so caution must be taken not to do very much processing + at that time. Our implementation of skel_write_bulk_callback merely + reports if the urb was completed successfully or not and then returns. + + + The read function works a bit differently from the write function in that + we do not use an urb to transfer data from the device to the driver. + Instead we call the usb_bulk_msg function, which can be used to send or + receive data from a device without having to create urbs and handle + urb completion callback functions. We call the usb_bulk_msg function, + giving it a buffer into which to place any data received from the device + and a timeout value. If the timeout period expires without receiving any + data from the device, the function will fail and return an error message. + This can be shown with the following code: + + +/* do an immediate bulk read to get data from the device */ +retval = usb_bulk_msg (skel->dev, + usb_rcvbulkpipe (skel->dev, + skel->bulk_in_endpointAddr), + skel->bulk_in_buffer, + skel->bulk_in_size, + &count, HZ*10); +/* if the read was successful, copy the data to user space */ +if (!retval) { + if (copy_to_user (buffer, skel->bulk_in_buffer, count)) + retval = -EFAULT; + else + retval = count; +} + + + The usb_bulk_msg function can be very useful for doing single reads or + writes to a device; however, if you need to read or write constantly to a + device, it is recommended to set up your own urbs and submit them to the + USB subsystem. + + + When the user program releases the file handle that it has been using to + talk to the device, the release function in the driver is called. In this + function we decrement our private usage count and wait for possible + pending writes: + + +/* decrement our usage count for the device */ +--skel->open_count; + + + One of the more difficult problems that USB drivers must be able to handle + smoothly is the fact that the USB device may be removed from the system at + any point in time, even if a program is currently talking to it. It needs + to be able to shut down any current reads and writes and notify the + user-space programs that the device is no longer there. The following + code (function skel_delete) + is an example of how to do this: + +static inline void skel_delete (struct usb_skel *dev) +{ + if (dev->bulk_in_buffer != NULL) + kfree (dev->bulk_in_buffer); + if (dev->bulk_out_buffer != NULL) + usb_buffer_free (dev->udev, dev->bulk_out_size, + dev->bulk_out_buffer, + dev->write_urb->transfer_dma); + if (dev->write_urb != NULL) + usb_free_urb (dev->write_urb); + kfree (dev); +} + + + If a program currently has an open handle to the device, we reset the flag + device_present. For + every read, write, release and other functions that expect a device to be + present, the driver first checks this flag to see if the device is + still present. If not, it releases that the device has disappeared, and a + -ENODEV error is returned to the user-space program. When the release + function is eventually called, it determines if there is no device + and if not, it does the cleanup that the skel_disconnect + function normally does if there are no open files on the device (see + Listing 5). + + + + + Isochronous Data + + This usb-skeleton driver does not have any examples of interrupt or + isochronous data being sent to or from the device. Interrupt data is sent + almost exactly as bulk data is, with a few minor exceptions. Isochronous + data works differently with continuous streams of data being sent to or + from the device. The audio and video camera drivers are very good examples + of drivers that handle isochronous data and will be useful if you also + need to do this. + + + + + Conclusion + + Writing Linux USB device drivers is not a difficult task as the + usb-skeleton driver shows. This driver, combined with the other current + USB drivers, should provide enough examples to help a beginning author + create a working driver in a minimal amount of time. The linux-usb-devel + mailing list archives also contain a lot of helpful information. + + + + + Resources + + The Linux USB Project: http://www.linux-usb.org/ + + + Linux Hotplug Project: http://linux-hotplug.sourceforge.net/ + + + Linux USB Working Devices List: http://www.qbik.ch/usb/devices/ + + + linux-usb-devel Mailing List Archives: http://marc.theaimsgroup.com/?l=linux-usb-devel + + + Programming Guide for Linux USB Device Drivers: http://usb.cs.tum.edu/usbdoc + + + USB Home Page: http://www.usb.org + + + +
diff --git a/Documentation/DocBook/z8530book.tmpl b/Documentation/DocBook/z8530book.tmpl new file mode 100644 index 000000000000..a507876447aa --- /dev/null +++ b/Documentation/DocBook/z8530book.tmpl @@ -0,0 +1,385 @@ + + + + + + Z8530 Programming Guide + + + + Alan + Cox + +
+ alan@redhat.com +
+
+
+
+ + + 2000 + Alan Cox + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + + Introduction + + The Z85x30 family synchronous/asynchronous controller chips are + used on a large number of cheap network interface cards. The + kernel provides a core interface layer that is designed to make + it easy to provide WAN services using this chip. + + + The current driver only support synchronous operation. Merging the + asynchronous driver support into this code to allow any Z85x30 + device to be used as both a tty interface and as a synchronous + controller is a project for Linux post the 2.4 release + + + The support code handles most common card configurations and + supports running both Cisco HDLC and Synchronous PPP. With extra + glue the frame relay and X.25 protocols can also be used with this + driver. + + + + + Driver Modes + + The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices + in three different modes. Each mode can be applied to an individual + channel on the chip (each chip has two channels). + + + The PIO synchronous mode supports the most common Z8530 wiring. Here + the chip is interface to the I/O and interrupt facilities of the + host machine but not to the DMA subsystem. When running PIO the + Z8530 has extremely tight timing requirements. Doing high speeds, + even with a Z85230 will be tricky. Typically you should expect to + achieve at best 9600 baud with a Z8C530 and 64Kbits with a Z85230. + + + The DMA mode supports the chip when it is configured to use dual DMA + channels on an ISA bus. The better cards tend to support this mode + of operation for a single channel. With DMA running the Z85230 tops + out when it starts to hit ISA DMA constraints at about 512Kbits. It + is worth noting here that many PC machines hang or crash when the + chip is driven fast enough to hold the ISA bus solid. + + + Transmit DMA mode uses a single DMA channel. The DMA channel is used + for transmission as the transmit FIFO is smaller than the receive + FIFO. it gives better performance than pure PIO mode but is nowhere + near as ideal as pure DMA mode. + + + + + Using the Z85230 driver + + The Z85230 driver provides the back end interface to your board. To + configure a Z8530 interface you need to detect the board and to + identify its ports and interrupt resources. It is also your problem + to verify the resources are available. + + + Having identified the chip you need to fill in a struct z8530_dev, + which describes each chip. This object must exist until you finally + shutdown the board. Firstly zero the active field. This ensures + nothing goes off without you intending it. The irq field should + be set to the interrupt number of the chip. (Each chip has a single + interrupt source rather than each channel). You are responsible + for allocating the interrupt line. The interrupt handler should be + set to z8530_interrupt. The device id should + be set to the z8530_dev structure pointer. Whether the interrupt can + be shared or not is board dependent, and up to you to initialise. + + + The structure holds two channel structures. + Initialise chanA.ctrlio and chanA.dataio with the address of the + control and data ports. You can or this with Z8530_PORT_SLEEP to + indicate your interface needs the 5uS delay for chip settling done + in software. The PORT_SLEEP option is architecture specific. Other + flags may become available on future platforms, eg for MMIO. + Initialise the chanA.irqs to &z8530_nop to start the chip up + as disabled and discarding interrupt events. This ensures that + stray interrupts will be mopped up and not hang the bus. Set + chanA.dev to point to the device structure itself. The + private and name field you may use as you wish. The private field + is unused by the Z85230 layer. The name is used for error reporting + and it may thus make sense to make it match the network name. + + + Repeat the same operation with the B channel if your chip has + both channels wired to something useful. This isn't always the + case. If it is not wired then the I/O values do not matter, but + you must initialise chanB.dev. + + + If your board has DMA facilities then initialise the txdma and + rxdma fields for the relevant channels. You must also allocate the + ISA DMA channels and do any necessary board level initialisation + to configure them. The low level driver will do the Z8530 and + DMA controller programming but not board specific magic. + + + Having initialised the device you can then call + z8530_init. This will probe the chip and + reset it into a known state. An identification sequence is then + run to identify the chip type. If the checks fail to pass the + function returns a non zero error code. Typically this indicates + that the port given is not valid. After this call the + type field of the z8530_dev structure is initialised to either + Z8530, Z85C30 or Z85230 according to the chip found. + + + Once you have called z8530_init you can also make use of the utility + function z8530_describe. This provides a + consistent reporting format for the Z8530 devices, and allows all + the drivers to provide consistent reporting. + + + + + Attaching Network Interfaces + + If you wish to use the network interface facilities of the driver, + then you need to attach a network device to each channel that is + present and in use. In addition to use the SyncPPP and Cisco HDLC + you need to follow some additional plumbing rules. They may seem + complex but a look at the example hostess_sv11 driver should + reassure you. + + + The network device used for each channel should be pointed to by + the netdevice field of each channel. The dev-> priv field of the + network device points to your private data - you will need to be + able to find your ppp device from this. In addition to use the + sync ppp layer the private data must start with a void * pointer + to the syncppp structures. + + + The way most drivers approach this particular problem is to + create a structure holding the Z8530 device definition and + put that and the syncppp pointer into the private field of + the network device. The network device fields of the channels + then point back to the network devices. The ppp_device can also + be put in the private structure conveniently. + + + If you wish to use the synchronous ppp then you need to attach + the syncppp layer to the network device. You should do this before + you register the network device. The + sppp_attach requires that the first void * + pointer in your private data is pointing to an empty struct + ppp_device. The function fills in the initial data for the + ppp/hdlc layer. + + + Before you register your network device you will also need to + provide suitable handlers for most of the network device callbacks. + See the network device documentation for more details on this. + + + + + Configuring And Activating The Port + + The Z85230 driver provides helper functions and tables to load the + port registers on the Z8530 chips. When programming the register + settings for a channel be aware that the documentation recommends + initialisation orders. Strange things happen when these are not + followed. + + + z8530_channel_load takes an array of + pairs of initialisation values in an array of u8 type. The first + value is the Z8530 register number. Add 16 to indicate the alternate + register bank on the later chips. The array is terminated by a 255. + + + The driver provides a pair of public tables. The + z8530_hdlc_kilostream table is for the UK 'Kilostream' service and + also happens to cover most other end host configurations. The + z8530_hdlc_kilostream_85230 table is the same configuration using + the enhancements of the 85230 chip. The configuration loaded is + standard NRZ encoded synchronous data with HDLC bitstuffing. All + of the timing is taken from the other end of the link. + + + When writing your own tables be aware that the driver internally + tracks register values. It may need to reload values. You should + therefore be sure to set registers 1-7, 9-11, 14 and 15 in all + configurations. Where the register settings depend on DMA selection + the driver will update the bits itself when you open or close. + Loading a new table with the interface open is not recommended. + + + There are three standard configurations supported by the core + code. In PIO mode the interface is programmed up to use + interrupt driven PIO. This places high demands on the host processor + to avoid latency. The driver is written to take account of latency + issues but it cannot avoid latencies caused by other drivers, + notably IDE in PIO mode. Because the drivers allocate buffers you + must also prevent MTU changes while the port is open. + + + Once the port is open it will call the rx_function of each channel + whenever a completed packet arrived. This is invoked from + interrupt context and passes you the channel and a network + buffer (struct sk_buff) holding the data. The data includes + the CRC bytes so most users will want to trim the last two + bytes before processing the data. This function is very timing + critical. When you wish to simply discard data the support + code provides the function z8530_null_rx + to discard the data. + + + To active PIO mode sending and receiving the + z8530_sync_open is called. This expects to be passed + the network device and the channel. Typically this is called from + your network device open callback. On a failure a non zero error + status is returned. The z8530_sync_close + function shuts down a PIO channel. This must be done before the + channel is opened again and before the driver shuts down + and unloads. + + + The ideal mode of operation is dual channel DMA mode. Here the + kernel driver will configure the board for DMA in both directions. + The driver also handles ISA DMA issues such as controller + programming and the memory range limit for you. This mode is + activated by calling the z8530_sync_dma_open + function. On failure a non zero error value is returned. + Once this mode is activated it can be shut down by calling the + z8530_sync_dma_close. You must call the close + function matching the open mode you used. + + + The final supported mode uses a single DMA channel to drive the + transmit side. As the Z85C30 has a larger FIFO on the receive + channel this tends to increase the maximum speed a little. + This is activated by calling the z8530_sync_txdma_open + . This returns a non zero error code on failure. The + z8530_sync_txdma_close function closes down + the Z8530 interface from this mode. + + + + + Network Layer Functions + + The Z8530 layer provides functions to queue packets for + transmission. The driver internally buffers the frame currently + being transmitted and one further frame (in order to keep back + to back transmission running). Any further buffering is up to + the caller. + + + The function z8530_queue_xmit takes a network + buffer in sk_buff format and queues it for transmission. The + caller must provide the entire packet with the exception of the + bitstuffing and CRC. This is normally done by the caller via + the syncppp interface layer. It returns 0 if the buffer has been + queued and non zero values for queue full. If the function accepts + the buffer it becomes property of the Z8530 layer and the caller + should not free it. + + + The function z8530_get_stats returns a pointer + to an internally maintained per interface statistics block. This + provides most of the interface code needed to implement the network + layer get_stats callback. + + + + + Porting The Z8530 Driver + + The Z8530 driver is written to be portable. In DMA mode it makes + assumptions about the use of ISA DMA. These are probably warranted + in most cases as the Z85230 in particular was designed to glue to PC + type machines. The PIO mode makes no real assumptions. + + + Should you need to retarget the Z8530 driver to another architecture + the only code that should need changing are the port I/O functions. + At the moment these assume PC I/O port accesses. This may not be + appropriate for all platforms. Replacing + z8530_read_port and z8530_write_port + is intended to be all that is required to port this + driver layer. + + + + + Known Bugs And Assumptions + + + Interrupt Locking + + + The locking in the driver is done via the global cli/sti lock. This + makes for relatively poor SMP performance. Switching this to use a + per device spin lock would probably materially improve performance. + + + + Occasional Failures + + + We have reports of occasional failures when run for very long + periods of time and the driver starts to receive junk frames. At + the moment the cause of this is not clear. + + + + + + + + + Public Functions Provided +!Edrivers/net/wan/z85230.c + + + + Internal Functions +!Idrivers/net/wan/z85230.c + + +
-- cgit v1.2.3