diff options
569 files changed, 11804 insertions, 5206 deletions
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index d3290c46af51..aa38cc5692a0 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -46,7 +46,7 @@ <sect1><title>Atomic and pointer manipulation</title> !Iinclude/asm-x86/atomic_32.h -!Iinclude/asm-x86/unaligned_32.h +!Iinclude/asm-x86/unaligned.h </sect1> <sect1><title>Delaying, scheduling, and timer routines</title> diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt new file mode 100644 index 000000000000..eda40fd39cad --- /dev/null +++ b/Documentation/accounting/cgroupstats.txt @@ -0,0 +1,27 @@ +Control Groupstats is inspired by the discussion at +http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as +suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263. + +Per cgroup statistics infrastructure re-uses code from the taskstats +interface. A new set of cgroup operations are registered with commands +and attributes specific to cgroups. It should be very easy to +extend per cgroup statistics, by adding members to the cgroupstats +structure. + +The current model for cgroupstats is a pull, a push model (to post +statistics on interesting events), should be very easy to add. Currently +user space requests for statistics by passing the cgroup path. +Statistics about the state of all the tasks in the cgroup is returned to +user space. + +NOTE: We currently rely on delay accounting for extracting information +about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this +information will not be available. + +To extract cgroup statistics a utility very similar to getdelays.c +has been developed, the sample output of the utility is shown below + +~/balbir/cgroupstats # ./getdelays -C "/cgroup/a" +sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0 +~/balbir/cgroupstats # ./getdelays -C "/cgroup" +sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2 diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index 552cabac0608..da42ab414c48 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -87,30 +87,7 @@ changes occur: This is used primarily during fault processing. -5) void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) - - The software page tables for address space 'mm' for virtual - addresses in the range 'start' to 'end-1' are being torn down. - - Some platforms cache the lowest level of the software page tables - in a linear virtually mapped array, to make TLB miss processing - more efficient. On such platforms, since the TLB is caching the - software page table structure, it needs to be flushed when parts - of the software page table tree are unlinked/freed. - - Sparc64 is one example of a platform which does this. - - Usually, when munmap()'ing an area of user virtual address - space, the kernel leaves the page table parts around and just - marks the individual pte's as invalid. However, if very large - portions of the address space are unmapped, the kernel frees up - those portions of the software page tables to prevent potential - excessive kernel memory usage caused by erratic mmap/mmunmap - sequences. It is at these times that flush_tlb_pgtables will - be invoked. - -6) void update_mmu_cache(struct vm_area_struct *vma, +5) void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t pte) At the end of every page fault, this routine is invoked to @@ -123,7 +100,7 @@ changes occur: translations for software managed TLB configurations. The sparc64 port currently does this. -7) void tlb_migrate_finish(struct mm_struct *mm) +6) void tlb_migrate_finish(struct mm_struct *mm) This interface is called at the end of an explicit process migration. This interface provides a hook diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt new file mode 100644 index 000000000000..98a26f81fa75 --- /dev/null +++ b/Documentation/cgroups.txt @@ -0,0 +1,545 @@ + CGROUPS + ------- + +Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt + +Original copyright statements from cpusets.txt: +Portions Copyright (C) 2004 BULL SA. +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +Modified by Paul Jackson <pj@sgi.com> +Modified by Christoph Lameter <clameter@sgi.com> + +CONTENTS: +========= + +1. Control Groups + 1.1 What are cgroups ? + 1.2 Why are cgroups needed ? + 1.3 How are cgroups implemented ? + 1.4 What does notify_on_release do ? + 1.5 How do I use cgroups ? +2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Attaching processes +3. Kernel API + 3.1 Overview + 3.2 Synchronization + 3.3 Subsystem API +4. Questions + +1. Control Groups +========== + +1.1 What are cgroups ? +---------------------- + +Control Groups provide a mechanism for aggregating/partitioning sets of +tasks, and all their future children, into hierarchical groups with +specialized behaviour. + +Definitions: + +A *cgroup* associates a set of tasks with a set of parameters for one +or more subsystems. + +A *subsystem* is a module that makes use of the task grouping +facilities provided by cgroups to treat groups of tasks in +particular ways. A subsystem is typically a "resource controller" that +schedules a resource or applies per-cgroup limits, but it may be +anything that wants to act on a group of processes, e.g. a +virtualization subsystem. + +A *hierarchy* is a set of cgroups arranged in a tree, such that +every task in the system is in exactly one of the cgroups in the +hierarchy, and a set of subsystems; each subsystem has system-specific +state attached to each cgroup in the hierarchy. Each hierarchy has +an instance of the cgroup virtual filesystem associated with it. + +At any one time there may be multiple active hierachies of task +cgroups. Each hierarchy is a partition of all tasks in the system. + +User level code may create and destroy cgroups by name in an +instance of the cgroup virtual file system, specify and query to +which cgroup a task is assigned, and list the task pids assigned to +a cgroup. Those creations and assignments only affect the hierarchy +associated with that instance of the cgroup file system. + +On their own, the only use for cgroups is for simple job +tracking. The intention is that other subsystems hook into the generic +cgroup support to provide new attributes for cgroups, such as +accounting/limiting the resources which processes in a cgroup can +access. For example, cpusets (see Documentation/cpusets.txt) allows +you to associate a set of CPUs and a set of memory nodes with the +tasks in each cgroup. + +1.2 Why are cgroups needed ? +---------------------------- + +There are multiple efforts to provide process aggregations in the +Linux kernel, mainly for resource tracking purposes. Such efforts +include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server +namespaces. These all require the basic notion of a +grouping/partitioning of processes, with newly forked processes ending +in the same group (cgroup) as their parent process. + +The kernel cgroup patch provides the minimum essential kernel +mechanisms required to efficiently implement such groups. It has +minimal impact on the system fast paths, and provides hooks for +specific subsystems such as cpusets to provide additional behaviour as +desired. + +Multiple hierarchy support is provided to allow for situations where +the division of tasks into cgroups is distinctly different for +different subsystems - having parallel hierarchies allows each +hierarchy to be a natural division of tasks, without having to handle +complex combinations of tasks that would be present if several +unrelated subsystems needed to be forced into the same tree of +cgroups. + +At one extreme, each resource controller or subsystem could be in a +separate hierarchy; at the other extreme, all subsystems +would be attached to the same hierarchy. + +As an example of a scenario (originally proposed by vatsa@in.ibm.com) +that can benefit from multiple hierarchies, consider a large +university server with various users - students, professors, system +tasks etc. The resource planning for this server could be along the +following lines: + + CPU : Top cpuset + / \ + CPUSet1 CPUSet2 + | | + (Profs) (Students) + + In addition (system tasks) are attached to topcpuset (so + that they can run anywhere) with a limit of 20% + + Memory : Professors (50%), students (30%), system (20%) + + Disk : Prof (50%), students (30%), system (20%) + + Network : WWW browsing (20%), Network File System (60%), others (20%) + / \ + Prof (15%) students (5%) + +Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go +into NFS network class. + +At the same time firefox/lynx will share an appropriate CPU/Memory class +depending on who launched it (prof/student). + +With the ability to classify tasks differently for different resources +(by putting those resource subsystems in different hierarchies) then +the admin can easily set up a script which receives exec notifications +and depending on who is launching the browser he can + + # echo browser_pid > /mnt/<restype>/<userclass>/tasks + +With only a single hierarchy, he now would potentially have to create +a separate cgroup for every browser launched and associate it with +approp network and other resource class. This may lead to +proliferation of such cgroups. + +Also lets say that the administrator would like to give enhanced network +access temporarily to a student's browser (since it is night and the user +wants to do online gaming :) OR give one of the students simulation +apps enhanced CPU power, + +With ability to write pids directly to resource classes, its just a +matter of : + + # echo pid > /mnt/network/<new_class>/tasks + (after some time) + # echo pid > /mnt/network/<orig_class>/tasks + +Without this ability, he would have to split the cgroup into +multiple separate ones and then associate the new cgroups with the +new resource classes. + + + +1.3 How are cgroups implemented ? +--------------------------------- + +Control Groups extends the kernel as follows: + + - Each task in the system has a reference-counted pointer to a + css_set. + + - A css_set contains a set of reference-counted pointers to + cgroup_subsys_state objects, one for each cgroup subsystem + registered in the system. There is no direct link from a task to + the cgroup of which it's a member in each hierarchy, but this + can be determined by following pointers through the + cgroup_subsys_state objects. This is because accessing the + subsystem state is something that's expected to happen frequently + and in performance-critical code, whereas operations that require a + task's actual cgroup assignments (in particular, moving between + cgroups) are less common. A linked list runs through the cg_list + field of each task_struct using the css_set, anchored at + css_set->tasks. + + - A cgroup hierarchy filesystem can be mounted for browsing and + manipulation from user space. + + - You can list all the tasks (by pid) attached to any cgroup. + +The implementation of cgroups requires a few, simple hooks +into the rest of the kernel, none in performance critical paths: + + - in init/main.c, to initialize the root cgroups and initial + css_set at system boot. + + - in fork and exit, to attach and detach a task from its css_set. + +In addition a new file system, of type "cgroup" may be mounted, to +enable browsing and modifying the cgroups presently known to the +kernel. When mounting a cgroup hierarchy, you may specify a +comma-separated list of subsystems to mount as the filesystem mount +options. By default, mounting the cgroup filesystem attempts to +mount a hierarchy containing all registered subsystems. + +If an active hierarchy with exactly the same set of subsystems already +exists, it will be reused for the new mount. If no existing hierarchy +matches, and any of the requested subsystems are in use in an existing +hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy +is activated, associated with the requested subsystems. + +It's not currently possible to bind a new subsystem to an active +cgroup hierarchy, or to unbind a subsystem from an active cgroup +hierarchy. This may be possible in future, but is fraught with nasty +error-recovery issues. + +When a cgroup filesystem is unmounted, if there are any +child cgroups created below the top-level cgroup, that hierarchy +will remain active even though unmounted; if there are no +child cgroups then the hierarchy will be deactivated. + +No new system calls are added for cgroups - all support for +querying and modifying cgroups is via this cgroup file system. + +Each task under /proc has an added file named 'cgroup' displaying, +for each active hierarchy, the subsystem names and the cgroup name +as the path relative to the root of the cgroup file system. + +Each cgroup is represented by a directory in the cgroup file system +containing the following files describing that cgroup: + + - tasks: list of tasks (by pid) attached to that cgroup + - notify_on_release flag: run /sbin/cgroup_release_agent on exit? + +Other subsystems such as cpusets may add additional files in each +cgroup dir + +New cgroups are created using the mkdir system call or shell +command. The properties of a cgroup, such as its flags, are +modified by writing to the appropriate file in that cgroups +directory, as listed above. + +The named hierarchical structure of nested cgroups allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cgroup allows organizing the work load +on a system into related sets of tasks. A task may be re-attached to +any other cgroup, if allowed by the permissions on the necessary +cgroup file system directories. + +When a task is moved from one cgroup to another, it gets a new +css_set pointer - if there's an already existing css_set with the +desired collection of cgroups then that group is reused, else a new +css_set is allocated. Note that the current implementation uses a +linear search to locate an appropriate existing css_set, so isn't +very efficient. A future version will use a hash table for better +performance. + +To allow access from a cgroup to the css_sets (and hence tasks) +that comprise it, a set of cg_cgroup_link objects form a lattice; +each cg_cgroup_link is linked into a list of cg_cgroup_links for +a single cgroup on its cont_link_list field, and a list of +cg_cgroup_links for a single css_set on its cg_link_list. + +Thus the set of tasks in a cgroup can be listed by iterating over +each css_set that references the cgroup, and sub-iterating over +each css_set's task set. + +The use of a Linux virtual file system (vfs) to represent the +cgroup hierarchy provides for a familiar permission and name space +for cgroups, with a minimum of additional kernel code. + +1.4 What does notify_on_release do ? +------------------------------------ + +*** notify_on_release is disabled in the current patch set. It will be +*** reactivated in a future patch in a less-intrusive manner + +If the notify_on_release flag is enabled (1) in a cgroup, then +whenever the last task in the cgroup leaves (exits or attaches to +some other cgroup) and the last child cgroup of that cgroup +is removed, then the kernel runs the command specified by the contents +of the "release_agent" file in that hierarchy's root directory, +supplying the pathname (relative to the mount point of the cgroup +file system) of the abandoned cgroup. This enables automatic +removal of abandoned cgroups. The default value of +notify_on_release in the root cgroup at system boot is disabled +(0). The default value of other cgroups at creation is the current +value of their parents notify_on_release setting. The default value of +a cgroup hierarchy's release_agent path is empty. + +1.5 How do I use cgroups ? +-------------------------- + +To start a new job that is to be contained within a cgroup, using +the "cpuset" cgroup subsystem, the steps are something like: + + 1) mkdir /dev/cgroup + 2) mount -t cgroup -ocpuset cpuset /dev/cgroup + 3) Create the new cgroup by doing mkdir's and write's (or echo's) in + the /dev/cgroup virtual file system. + 4) Start a task that will be the "founding father" of the new job. + 5) Attach that task to the new cgroup by writing its pid to the + /dev/cgroup tasks file for that cgroup. + 6) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cgroup +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cgroup: + + mount -t cgroup cpuset -ocpuset /dev/cgroup + cd /dev/cgroup + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpus + /bin/echo 1 > mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cgroup Charlie + # The next line should display '/Charlie' + cat /proc/self/cgroup + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using the cgroups can be done through the cgroup +virtual filesystem. + +To mount a cgroup hierarchy will all available subsystems, type: +# mount -t cgroup xxx /dev/cgroup + +The "xxx" is not interpreted by the cgroup code, but will appear in +/proc/mounts so may be any useful identifying string that you like. + +To mount a cgroup hierarchy with just the cpuset and numtasks +subsystems, type: +# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup + +To change the set of subsystems bound to a mounted hierarchy, just +remount with different options: + +# mount -o remount,cpuset,ns /dev/cgroup + +Note that changing the set of subsystems is currently only supported +when the hierarchy consists of a single (root) cgroup. Supporting +the ability to arbitrarily bind/unbind subsystems from an existing +cgroup hierarchy is intended to be implemented in the future. + +Then under /dev/cgroup you can find a tree that corresponds to the +tree of the cgroups in the system. For instance, /dev/cgroup +is the cgroup that holds the whole system. + +If you want to create a new cgroup under /dev/cgroup: +# cd /dev/cgroup +# mkdir my_cgroup + +Now you want to do something with this cgroup. +# cd my_cgroup + +In this directory you can find several files: +# ls +notify_on_release release_agent tasks +(plus whatever files are added by the attached subsystems) + +Now attach your shell to this cgroup: +# /bin/echo $$ > tasks + +You can also create cgroups inside your cgroup by using mkdir in this +directory. +# mkdir my_sub_cs + +To remove a cgroup, just use rmdir: +# rmdir my_sub_cs + +This will fail if the cgroup is in use (has cgroups inside, or +has processes attached, or is held alive by other subsystem-specific +reference). + +2.2 Attaching processes +----------------------- + +# /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another: + +# /bin/echo PID1 > tasks +# /bin/echo PID2 > tasks + ... +# /bin/echo PIDn > tasks + +3. Kernel API +============= + +3.1 Overview +------------ + +Each kernel subsystem that wants to hook into the generic cgroup +system needs to create a cgroup_subsys object. This contains +various methods, which are callbacks from the cgroup system, along +with a subsystem id which will be assigned by the cgroup system. + +Other fields in the cgroup_subsys object include: + +- subsys_id: a unique array index for the subsystem, indicating which + entry in cgroup->subsys[] this subsystem should be + managing. Initialized by cgroup_register_subsys(); prior to this + it should be initialized to -1 + +- hierarchy: an index indicating which hierarchy, if any, this + subsystem is currently attached to. If this is -1, then the + subsystem is not attached to any hierarchy, and all tasks should be + considered to be members of the subsystem's top_cgroup. It should + be initialized to -1. + +- name: should be initialized to a unique subsystem name prior to + calling cgroup_register_subsystem. Should be no longer than + MAX_CGROUP_TYPE_NAMELEN + +Each cgroup object created by the system has an array of pointers, +indexed by subsystem id; this pointer is entirely managed by the +subsystem; the generic cgroup code will never touch this pointer. + +3.2 Synchronization +------------------- + +There is a global mutex, cgroup_mutex, used by the cgroup +system. This should be taken by anything that wants to modify a +cgroup. It may also be taken to prevent cgroups from being +modified, but more specific locks may be more appropriate in that +situation. + +See kernel/cgroup.c for more details. + +Subsystems can take/release the cgroup_mutex via the functions +cgroup_lock()/cgroup_unlock(), and can +take/release the callback_mutex via the functions +cgroup_lock()/cgroup_unlock(). + +Accessing a task's cgroup pointer may be done in the following ways: +- while holding cgroup_mutex +- while holding the task's alloc_lock (via task_lock()) +- inside an rcu_read_lock() section via rcu_dereference() + +3.3 Subsystem API +-------------------------- + +Each subsystem should: + +- add an entry in linux/cgroup_subsys.h +- define a cgroup_subsys object called <name>_subsys + +Each subsystem may export the following methods. The only mandatory +methods are create/destroy. Any others that are null are presumed to +be successful no-ops. + +struct cgroup_subsys_state *create(struct cgroup *cont) +LL=cgroup_mutex + +Called to create a subsystem state object for a cgroup. The +subsystem should allocate its subsystem state object for the passed +cgroup, returning a pointer to the new object on success or a +negative error code. On success, the subsystem pointer should point to +a structure of type cgroup_subsys_state (typically embedded in a +larger subsystem-specific object), which will be initialized by the +cgroup system. Note that this will be called at initialization to +create the root subsystem state for this subsystem; this case can be +identified by the passed cgroup object having a NULL parent (since +it's the root of the hierarchy) and may be an appropriate place for +initialization code. + +void destroy(struct cgroup *cont) +LL=cgroup_mutex + +The cgroup system is about to destroy the passed cgroup; the +subsystem should do any necessary cleanup + +int can_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct task_struct *task) +LL=cgroup_mutex + +Called prior to moving a task into a cgroup; if the subsystem +returns an error, this will abort the attach operation. If a NULL +task is passed, then a successful result indicates that *any* +unspecified task can be moved into the cgroup. Note that this isn't +called on a fork. If this method returns 0 (success) then this should +remain valid while the caller holds cgroup_mutex. + +void attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct cgroup *old_cont, struct task_struct *task) +LL=cgroup_mutex + + +Called after the task has been attached to the cgroup, to allow any +post-attachment activity that requires memory allocations or blocking. + +void fork(struct cgroup_subsy *ss, struct task_struct *task) +LL=callback_mutex, maybe read_lock(tasklist_lock) + +Called when a task is forked into a cgroup. Also called during +registration for all existing tasks. + +void exit(struct cgroup_subsys *ss, struct task_struct *task) +LL=callback_mutex + +Called during task exit + +int populate(struct cgroup_subsys *ss, struct cgroup *cont) +LL=none + +Called after creation of a cgroup to allow a subsystem to populate +the cgroup directory with file entries. The subsystem should make +calls to cgroup_add_file() with objects of type cftype (see +include/linux/cgroup.h for details). Note that although this +method can return an error code, the error code is currently not +always handled well. + +void post_clone(struct cgroup_subsys *ss, struct cgroup *cont) + +Called at the end of cgroup_clone() to do any paramater +initialization which might be required before a task could attach. For +example in cpusets, no task may attach before 'cpus' and 'mems' are set +up. + +void bind(struct cgroup_subsys *ss, struct cgroup *root) +LL=callback_mutex + +Called when a cgroup subsystem is rebound to a different hierarchy +and root cgroup. Currently this will only involve movement between +the default hierarchy (which never has sub-cgroups) and a hierarchy +that is being created/destroyed (and hence has no sub-cgroups). + +4. Questions +============ + +Q: what's up with this '/bin/echo' ? +A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cgroup file system, you won't be + able to tell whether a command succeeded or failed. + +Q: When I attach processes, only the first of the line gets really attached ! +A: We can only return one error code per call to write(). So you should also + put only ONE pid. + diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index b6d24c22274b..a741f658a3c9 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt @@ -220,7 +220,9 @@ A: The following happen, listed in no particular order :-) CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the CPU is being offlined while tasks are frozen due to a suspend operation in progress -- All process is migrated away from this outgoing CPU to a new CPU +- All processes are migrated away from this outgoing CPU to new CPUs. + The new CPU is chosen from each process' current cpuset, which may be + a subset of all online CPUs. - All interrupts targeted to this CPU is migrated to a new CPU - timers/bottom half/task lets are also migrated to a new CPU - Once all services are migrated, kernel calls an arch specific routine diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index ec9de6917f01..141bef1c8599 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt @@ -7,6 +7,7 @@ Written by Simon.Derr@bull.net Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. Modified by Paul Jackson <pj@sgi.com> Modified by Christoph Lameter <clameter@sgi.com> +Modified by Paul Menage <menage@google.com> CONTENTS: ========= @@ -16,9 +17,9 @@ CONTENTS: 1.2 Why are cpusets needed ? 1.3 How are cpusets implemented ? 1.4 What are exclusive cpusets ? - 1.5 What does notify_on_release do ? - 1.6 What is memory_pressure ? - 1.7 What is memory spread ? + 1.5 What is memory_pressure ? + 1.6 What is memory spread ? + 1.7 What is sched_load_balance ? 1.8 How do I use cpusets ? 2. Usage Examples and Syntax 2.1 Basic Usage @@ -44,18 +45,19 @@ hierarchy visible in a virtual file system. These are the essential hooks, beyond what is already present, required to manage dynamic job placement on large systems. -Each task has a pointer to a cpuset. Multiple tasks may reference -the same cpuset. Requests by a task, using the sched_setaffinity(2) -system call to include CPUs in its CPU affinity mask, and using the -mbind(2) and set_mempolicy(2) system calls to include Memory Nodes -in its memory policy, are both filtered through that tasks cpuset, -filtering out any CPUs or Memory Nodes not in that cpuset. The -scheduler will not schedule a task on a CPU that is not allowed in -its cpus_allowed vector, and the kernel page allocator will not -allocate a page on a node that is not allowed in the requesting tasks -mems_allowed vector. - -User level code may create and destroy cpusets by name in the cpuset +Cpusets use the generic cgroup subsystem described in +Documentation/cgroup.txt. + +Requests by a task, using the sched_setaffinity(2) system call to +include CPUs in its CPU affinity mask, and using the mbind(2) and +set_mempolicy(2) system calls to include Memory Nodes in its memory +policy, are both filtered through that tasks cpuset, filtering out any +CPUs or Memory Nodes not in that cpuset. The scheduler will not +schedule a task on a CPU that is not allowed in its cpus_allowed +vector, and the kernel page allocator will not allocate a page on a +node that is not allowed in the requesting tasks mems_allowed vector. + +User level code may create and destroy cpusets by name in the cgroup virtual file system, manage the attributes and permissions of these cpusets and which CPUs and Memory Nodes are assigned to each cpuset, specify and query to which cpuset a task is assigned, and list the @@ -115,7 +117,7 @@ Cpusets extends these two mechanisms as follows: - Cpusets are sets of allowed CPUs and Memory Nodes, known to the kernel. - Each task in the system is attached to a cpuset, via a pointer - in the task structure to a reference counted cpuset structure. + in the task structure to a reference counted cgroup structure. - Calls to sched_setaffinity are filtered to just those CPUs allowed in that tasks cpuset. - Calls to mbind and set_mempolicy are filtered to just @@ -145,15 +147,10 @@ into the rest of the kernel, none in performance critical paths: - in page_alloc.c, to restrict memory to allowed nodes. - in vmscan.c, to restrict page recovery to the current cpuset. -In addition a new file system, of type "cpuset" may be mounted, -typically at /dev/cpuset, to enable browsing and modifying the cpusets -presently known to the kernel. No new system calls are added for -cpusets - all support for querying and modifying cpusets is via -this cpuset file system. - -Each task under /proc has an added file named 'cpuset', displaying -the cpuset name, as the path relative to the root of the cpuset file -system. +You should mount the "cgroup" filesystem type in order to enable +browsing and modifying the cpusets presently known to the kernel. No +new system calls are added for cpusets - all support for querying and +modifying cpusets is via this cpuset file system. The /proc/<pid>/status file for each task has two added lines, displaying the tasks cpus_allowed (on which CPUs it may be scheduled) @@ -163,16 +160,15 @@ in the format seen in the following example: Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff Mems_allowed: ffffffff,ffffffff -Each cpuset is represented by a directory in the cpuset file system -containing the following files describing that cpuset: +Each cpuset is represented by a directory in the cgroup file system +containing (on top of the standard cgroup files) the following +files describing that cpuset: - cpus: list of CPUs in that cpuset - mems: list of Memory Nodes in that cpuset - memory_migrate flag: if set, move pages to cpusets nodes - cpu_exclusive flag: is cpu placement exclusive? - mem_exclusive flag: is memory placement exclusive? - - tasks: list of tasks (by pid) attached to that cpuset - - notify_on_release flag: run /sbin/cpuset_release_agent on exit? - memory_pressure: measure of how much paging pressure in cpuset In addition, the root cpuset only has the following file: @@ -237,21 +233,7 @@ such as requests from interrupt handlers, is allowed to be taken outside even a mem_exclusive cpuset. -1.5 What does notify_on_release do ? ------------------------------------- - -If the notify_on_release flag is enabled (1) in a cpuset, then whenever -the last task in the cpuset leaves (exits or attaches to some other -cpuset) and the last child cpuset of that cpuset is removed, then -the kernel runs the command /sbin/cpuset_release_agent, supplying the -pathname (relative to the mount point of the cpuset file system) of the -abandoned cpuset. This enables automatic removal of abandoned cpusets. -The default value of notify_on_release in the root cpuset at system -boot is disabled (0). The default value of other cpusets at creation -is the current value of their parents notify_on_release setting. - - -1.6 What is memory_pressure ? +1.5 What is memory_pressure ? ----------------------------- The memory_pressure of a cpuset provides a simple per-cpuset metric of the rate that the tasks in a cpuset are attempting to free up in @@ -308,7 +290,7 @@ the tasks in the cpuset, in units of reclaims attempted per second, times 1000. -1.7 What is memory spread ? +1.6 What is memory spread ? --------------------------- There are two boolean flag files per cpuset that control where the kernel allocates pages for the file system buffers and related in @@ -378,6 +360,142 @@ policy, especially for jobs that might have one thread reading in the data set, the memory allocation across the nodes in the jobs cpuset can become very uneven. +1.7 What is sched_load_balance ? +-------------------------------- + +The kernel scheduler (kernel/sched.c) automatically load balances +tasks. If one CPU is underutilized, kernel code running on that +CPU will look for tasks on other more overloaded CPUs and move those +tasks to itself, within the constraints of such placement mechanisms +as cpusets and sched_setaffinity. + +The algorithmic cost of load balancing and its impact on key shared +kernel data structures such as the task list increases more than +linearly with the number of CPUs being balanced. So the scheduler +has support to partition the systems CPUs into a number of sched +domains such that it only load balances within each sched domain. +Each sched domain covers some subset of the CPUs in the system; +no two sched domains overlap; some CPUs might not be in any sched +domain and hence won't be load balanced. + +Put simply, it costs less to balance between two smaller sched domains +than one big one, but doing so means that overloads in one of the +two domains won't be load balanced to the other one. + +By default, there is one sched domain covering all CPUs, except those +marked isolated using the kernel boot time "isolcpus=" argument. + +This default load balancing across all CPUs is not well suited for +the following two situations: + 1) On large systems, load balancing across many CPUs is expensive. + If the system is managed using cpusets to place independent jobs + on separate sets of CPUs, full load balancing is unnecessary. + 2) Systems supporting realtime on some CPUs need to minimize + system overhead on those CPUs, including avoiding task load + balancing if that is not needed. + +When the per-cpuset flag "sched_load_balance" is enabled (the default +setting), it requests that all the CPUs in that cpusets allowed 'cpus' +be contained in a single sched domain, ensuring that load balancing +can move a task (not otherwised pinned, as by sched_setaffinity) +from any CPU in that cpuset to any other. + +When the per-cpuset flag "sched_load_balance" is disabled, then the +scheduler will avoid load balancing across the CPUs in that cpuset, +--except-- in so far as is necessary because some overlapping cpuset +has "sched_load_balance" enabled. + +So, for example, if the top cpuset has the flag "sched_load_balance" +enabled, then the scheduler will have one sched domain covering all +CPUs, and the setting of the "sched_load_balance" flag in any other +cpusets won't matter, as we're already fully load balancing. + +Therefore in the above two situations, the top cpuset flag +"sched_load_balance" should be disabled, and only some of the smaller, +child cpusets have this flag enabled. + +When doing this, you don't usually want to leave any unpinned tasks in +the top cpuset that might use non-trivial amounts of CPU, as such tasks +may be artificially constrained to some subset of CPUs, depending on +the particulars of this flag setting in descendent cpusets. Even if +such a task could use spare CPU cycles in some other CPUs, the kernel +scheduler might not consider the possibility of load balancing that +task to that underused CPU. + +Of course, tasks pinned to a particular CPU can be left in a cpuset +that disables "sched_load_balance" as those tasks aren't going anywhere +else anyway. + +There is an impedance mismatch here, between cpusets and sched domains. +Cpusets are hierarchical and nest. Sched domains are flat; they don't +overlap and each CPU is in at most one sched domain. + +It is necessary for sched domains to be flat because load balancing +across partially overlapping sets of CPUs would risk unstable dynamics +that would be beyond our understanding. So if each of two partially +overlapping cpusets enables the flag 'sched_load_balance', then we +form a single sched domain that is a superset of both. We won't move +a task to a CPU outside it cpuset, but the scheduler load balancing +code might waste some compute cycles considering that possibility. + +This mismatch is why there is not a simple one-to-one relation +between which cpusets have the flag "sched_load_balance" enabled, +and the sched domain configuration. If a cpuset enables the flag, it +will get balancing across all its CPUs, but if it disables the flag, +it will only be assured of no load balancing if no other overlapping +cpuset enables the flag. + +If two cpusets have partially overlapping 'cpus' allowed, and only +one of them has this flag enabled, then the other may find its +tasks only partially load balanced, just on the overlapping CPUs. +This is just the general case of the top_cpuset example given a few +paragraphs above. In the general case, as in the top cpuset case, +don't leave tasks that might use non-trivial amounts of CPU in +such partially load balanced cpusets, as they may be artificially +constrained to some subset of the CPUs allowed to them, for lack of +load balancing to the other CPUs. + +1.7.1 sched_load_balance implementation details. +------------------------------------------------ + +The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary +to most cpuset flags.) When enabled for a cpuset, the kernel will +ensure that it can load balance across all the CPUs in that cpuset +(makes sure that all the CPUs in the cpus_allowed of that cpuset are +in the same sched domain.) + +If two overlapping cpusets both have 'sched_load_balance' enabled, +then they will be (must be) both in the same sched domain. + +If, as is the default, the top cpuset has 'sched_load_balance' enabled, +then by the above that means there is a single sched domain covering +the whole system, regardless of any other cpuset settings. + +The kernel commits to user space that it will avoid load balancing +where it can. It will pick as fine a granularity partition of sched +domains as it can while still providing load balancing for any set +of CPUs allowed to a cpuset having 'sched_load_balance' enabled. + +The internal kernel cpuset to scheduler interface passes from the +cpuset code to the scheduler code a partition of the load balanced +CPUs in the system. This partition is a set of subsets (represented +as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all +the CPUs that must be load balanced. + +Whenever the 'sched_load_balance' flag changes, or CPUs come or go +from a cpuset with this flag enabled, or a cpuset with this flag +enabled is removed, the cpuset code builds a new such partition and +passes it to the scheduler sched domain setup code, to have the sched +domains rebuilt as necessary. + +This partition exactly defines what sched domains the scheduler should +setup - one sched domain for each element (cpumask_t) in the partition. + +The scheduler remembers the currently active sched domain partitions. +When the scheduler routine partition_sched_domains() is invoked from +the cpuset code to update these sched domains, it compares the new +partition requested with the current, and updates its sched domains, +removing the old and adding the new, for each change. 1.8 How do I use cpusets ? -------------------------- @@ -469,7 +587,7 @@ than stress the kernel. To start a new job that is to be contained within a cpuset, the steps are: 1) mkdir /dev/cpuset - 2) mount -t cpuset none /dev/cpuset + 2) mount -t cgroup -ocpuset cpuset /dev/cpuset 3) Create the new cpuset by doing mkdir's and write's (or echo's) in the /dev/cpuset virtual file system. 4) Start a task that will be the "founding father" of the new job. @@ -481,7 +599,7 @@ For example, the following sequence of commands will setup a cpuset named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, and then start a subshell 'sh' in that cpuset: - mount -t cpuset none /dev/cpuset + mount -t cgroup -ocpuset cpuset /dev/cpuset cd /dev/cpuset mkdir Charlie cd Charlie @@ -513,7 +631,7 @@ Creating, modifying, using the cpusets can be done through the cpuset virtual filesystem. To mount it, type: -# mount -t cpuset none /dev/cpuset +# mount -t cgroup -o cpuset cpuset /dev/cpuset Then under /dev/cpuset you can find a tree that corresponds to the tree of the cpusets in the system. For instance, /dev/cpuset @@ -556,6 +674,18 @@ To remove a cpuset, just use rmdir: This will fail if the cpuset is in use (has cpusets inside, or has processes attached). +Note that for legacy reasons, the "cpuset" filesystem exists as a +wrapper around the cgroup filesystem. + +The command + +mount -t cpuset X /dev/cpuset + +is equivalent to + +mount -t cgroup -ocpuset X /dev/cpuset +echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent + 2.2 Adding/removing cpus ------------------------ diff --git a/Documentation/input/input-programming.txt b/Documentation/input/input-programming.txt index d9d523099bb7..4d932dc66098 100644 --- a/Documentation/input/input-programming.txt +++ b/Documentation/input/input-programming.txt @@ -42,8 +42,8 @@ static int __init button_init(void) goto err_free_irq; } - button_dev->evbit[0] = BIT(EV_KEY); - button_dev->keybit[LONG(BTN_0)] = BIT(BTN_0); + button_dev->evbit[0] = BIT_MASK(EV_KEY); + button_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0); error = input_register_device(button_dev); if (error) { @@ -217,14 +217,15 @@ If you don't need absfuzz and absflat, you can set them to zero, which mean that the thing is precise and always returns to exactly the center position (if it has any). -1.4 NBITS(), LONG(), BIT() +1.4 BITS_TO_LONGS(), BIT_WORD(), BIT_MASK() ~~~~~~~~~~~~~~~~~~~~~~~~~~ -These three macros from input.h help some bitfield computations: +These three macros from bitops.h help some bitfield computations: - NBITS(x) - returns the length of a bitfield array in longs for x bits - LONG(x) - returns the index in the array in longs for bit x - BIT(x) - returns the index in a long for bit x + BITS_TO_LONGS(x) - returns the length of a bitfield array in longs for + x bits + BIT_WORD(x) - returns the index in the array in longs for bit x + BIT_MASK(x) - returns the index in a long for bit x 1.5 The id* and name fields ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt index 1b37b28cc234..d0ac72cc19ff 100644 --- a/Documentation/kdump/kdump.txt +++ b/Documentation/kdump/kdump.txt @@ -231,6 +231,32 @@ Dump-capture kernel config options (Arch Dependent, ia64) any space below the alignment point will be wasted. +Extended crashkernel syntax +=========================== + +While the "crashkernel=size[@offset]" syntax is sufficient for most +configurations, sometimes it's handy to have the reserved memory dependent +on the value of System RAM -- that's mostly for distributors that pre-setup +the kernel command line to avoid a unbootable system after some memory has +been removed from the machine. + +The syntax is: + + crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset] + range=start-[end] + +For example: + + crashkernel=512M-2G:64M,2G-:128M + +This would mean: + + 1) if the RAM is smaller than 512M, then don't reserve anything + (this is the "rescue" case) + 2) if the RAM size is between 512M and 2G, then reserve 64M + 3) if the RAM size is larger than 2G, then reserve 128M + + Boot into System Kernel ======================= diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 189df0bcab99..0a3fed445249 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -497,6 +497,13 @@ and is between 256 and 4096 characters. It is defined in the file [KNL] Reserve a chunk of physical memory to hold a kernel to switch to with kexec on panic. + crashkernel=range1:size1[,range2:size2,...][@offset] + [KNL] Same as above, but depends on the memory + in the running system. The syntax of range is + start-[end] where start and end are both + a memory unit (amount[KMG]). See also + Documentation/kdump/kdump.txt for a example. + cs4232= [HW,OSS] Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq> diff --git a/Documentation/markers.txt b/Documentation/markers.txt new file mode 100644 index 000000000000..295a71bc301e --- /dev/null +++ b/Documentation/markers.txt @@ -0,0 +1,81 @@ + Using the Linux Kernel Markers + + Mathieu Desnoyers + + +This document introduces Linux Kernel Markers and their use. It provides +examples of how to insert markers in the kernel and connect probe functions to +them and provides some examples of probe functions. + + +* Purpose of markers + +A marker placed in code provides a hook to call a function (probe) that you can +provide at runtime. A marker can be "on" (a probe is connected to it) or "off" +(no probe is attached). When a marker is "off" it has no effect, except for +adding a tiny time penalty (checking a condition for a branch) and space +penalty (adding a few bytes for the function call at the end of the +instrumented function and adds a data structure in a separate section). When a +marker is "on", the function you provide is called each time the marker is +executed, in the execution context of the caller. When the function provided +ends its execution, it returns to the caller (continuing from the marker site). + +You can put markers at important locations in the code. Markers are +lightweight hooks that can pass an arbitrary number of parameters, +described in a printk-like format string, to the attached probe function. + +They can be used for tracing and performance accounting. + + +* Usage + +In order to use the macro trace_mark, you should include linux/marker.h. + +#include <linux/marker.h> + +And, + +trace_mark(subsystem_event, "%d %s", someint, somestring); +Where : +- subsystem_event is an identifier unique to your event + - subsystem is the name of your subsystem. + - event is the name of the event to mark. +- "%d %s" is the formatted string for the serializer. +- someint is an integer. +- somestring is a char pointer. + +Connecting a function (probe) to a marker is done by providing a probe (function +to call) for the specific marker through marker_probe_register() and can be +activated by calling marker_arm(). Marker deactivation can be done by calling +marker_disarm() as many times as marker_arm() has been called. Removing a probe +is done through marker_probe_unregister(); it will disarm the probe and make +sure there is no caller left using the probe when it returns. Probe removal is +preempt-safe because preemption is disabled around the probe call. See the +"Probe example" section below for a sample probe module. + +The marker mechanism supports inserting multiple instances of the same marker. +Markers can be put in inline functions, inlined static functions, and +unrolled loops as well as regular functions. + +The naming scheme "subsystem_event" is suggested here as a convention intended +to limit collisions. Marker names are global to the kernel: they are considered +as being the same whether they are in the core kernel image or in modules. +Conflicting format strings for markers with the same name will cause the markers +to be detected to have a different format string not to be armed and will output +a printk warning which identifies the inconsistency: + +"Format mismatch for probe probe_name (format), marker (format)" + + +* Probe / marker example + +See the example provided in samples/markers/src + +Compile them with your kernel. + +Run, as root : +modprobe marker-example (insmod order is not important) +modprobe probe-example +cat /proc/marker-example (returns an expected error) +rmmod marker-example probe-example +dmesg @@ -774,6 +774,9 @@ vmlinux: $(vmlinux-lds) $(vmlinux-init) $(vmlinux-main) $(kallsyms.o) vmlinux.o ifdef CONFIG_HEADERS_CHECK $(Q)$(MAKE) -f $(srctree)/Makefile headers_check endif +ifdef CONFIG_SAMPLES + $(Q)$(MAKE) $(build)=samples +endif $(call vmlinux-modpost) $(call if_changed_rule,vmlinux__) $(Q)rm -f .old_version diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 2a85dc33907c..4c002ba37e50 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -654,7 +654,7 @@ source "drivers/Kconfig" source "fs/Kconfig" -source "arch/alpha/oprofile/Kconfig" +source "kernel/Kconfig.instrumentation" source "arch/alpha/Kconfig.debug" diff --git a/arch/alpha/kernel/semaphore.c b/arch/alpha/kernel/semaphore.c index 8c8aaa205eae..8d2982aa1b8d 100644 --- a/arch/alpha/kernel/semaphore.c +++ b/arch/alpha/kernel/semaphore.c @@ -69,7 +69,7 @@ __down_failed(struct semaphore *sem) #ifdef CONFIG_DEBUG_SEMAPHORE printk("%s(%d): down failed(%p)\n", - tsk->comm, tsk->pid, sem); + tsk->comm, task_pid_nr(tsk), sem); #endif tsk->state = TASK_UNINTERRUPTIBLE; @@ -98,7 +98,7 @@ __down_failed(struct semaphore *sem) #ifdef CONFIG_DEBUG_SEMAPHORE printk("%s(%d): down acquired(%p)\n", - tsk->comm, tsk->pid, sem); + tsk->comm, task_pid_nr(tsk), sem); #endif } @@ -111,7 +111,7 @@ __down_failed_interruptible(struct semaphore *sem) #ifdef CONFIG_DEBUG_SEMAPHORE printk("%s(%d): down failed(%p)\n", - tsk->comm, tsk->pid, sem); + tsk->comm, task_pid_nr(tsk), sem); #endif tsk->state = TASK_INTERRUPTIBLE; @@ -139,7 +139,7 @@ __down_failed_interruptible(struct semaphore *sem) #ifdef CONFIG_DEBUG_SEMAPHORE printk("%s(%d): down %s(%p)\n", - current->comm, current->pid, + current->comm, task_pid_nr(current), (ret < 0 ? "interrupted" : "acquired"), sem); #endif return ret; @@ -168,7 +168,7 @@ down(struct semaphore *sem) #endif #ifdef CONFIG_DEBUG_SEMAPHORE printk("%s(%d): down(%p) <count=%d> from %p\n", - current->comm, current->pid, sem, + current->comm, task_pid_nr(current), sem, atomic_read(&sem->count), __builtin_return_address(0)); #endif __down(sem); @@ -182,7 +182,7 @@ down_interruptible(struct semaphore *sem) #endif #ifdef CONFIG_DEBUG_SEMAPHORE printk("%s(%d): down(%p) <count=%d> from %p\n", - current->comm, current->pid, sem, + current->comm, task_pid_nr(current), sem, atomic_read(&sem->count), __builtin_return_address(0)); #endif return __down_interruptible(sem); @@ -201,7 +201,7 @@ down_trylock(struct semaphore *sem) #ifdef CONFIG_DEBUG_SEMAPHORE printk("%s(%d): down_trylock %s from %p\n", - current->comm, current->pid, + current->comm, task_pid_nr(current), ret ? "failed" : "acquired", __builtin_return_address(0)); #endif @@ -217,7 +217,7 @@ up(struct semaphore *sem) #endif #ifdef CONFIG_DEBUG_SEMAPHORE printk("%s(%d): up(%p) <count=%d> from %p\n", - current->comm, current->pid, sem, + current->comm, task_pid_nr(current), sem, atomic_read(&sem->count), __builtin_return_address(0)); #endif __up(sem); diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index ec0f05e0d8ff..2dc7f9fed213 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -182,7 +182,7 @@ die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) #ifdef CONFIG_SMP printk("CPU %d ", hard_smp_processor_id()); #endif - printk("%s(%d): %s %ld\n", current->comm, current->pid, str, err); + printk("%s(%d): %s %ld\n", current->comm, task_pid_nr(current), str, err); dik_show_regs(regs, r9_15); add_taint(TAINT_DIE); dik_show_trace((unsigned long *)(regs+1)); @@ -646,7 +646,7 @@ got_exception: lock_kernel(); printk("%s(%d): unhandled unaligned exception\n", - current->comm, current->pid); + current->comm, task_pid_nr(current)); printk("pc = [<%016lx>] ra = [<%016lx>] ps = %04lx\n", pc, una_reg(26), regs->ps); @@ -786,7 +786,7 @@ do_entUnaUser(void __user * va, unsigned long opcode, } if (++cnt < 5) { printk("%s(%d): unaligned trap at %016lx: %p %lx %ld\n", - current->comm, current->pid, + current->comm, task_pid_nr(current), regs->pc - 4, va, opcode, reg); } last_time = jiffies; diff --git a/arch/alpha/lib/fls.c b/arch/alpha/lib/fls.c index 7ad84ea0acf8..32afaa3fa686 100644 --- a/arch/alpha/lib/fls.c +++ b/arch/alpha/lib/fls.c @@ -3,7 +3,7 @@ */ #include <linux/module.h> -#include <asm/bitops.h> +#include <linux/bitops.h> /* This is fls(x)-1, except zero is held to zero. This allows most efficent input into extbl, plus it allows easy handling of fls(0)=0. */ diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 25154df3055a..4829f96585b1 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -188,13 +188,13 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* We ran out of memory, or some other thing happened to us that made us unable to handle the page fault gracefully. */ out_of_memory: - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; } printk(KERN_ALERT "VM: killing process %s(%d)\n", - current->comm, current->pid); + current->comm, task_pid_nr(current)); if (!user_mode(regs)) goto no_context; do_group_exit(SIGKILL); diff --git a/arch/alpha/oprofile/Kconfig b/arch/alpha/oprofile/Kconfig deleted file mode 100644 index 5ade19801b97..000000000000 --- a/arch/alpha/oprofile/Kconfig +++ /dev/null @@ -1,23 +0,0 @@ - -menu "Profiling support" - depends on EXPERIMENTAL - -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - -endmenu - diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0a0c88d0039c..4cee938df01e 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1068,7 +1068,7 @@ endmenu source "fs/Kconfig" -source "arch/arm/oprofile/Kconfig" +source "kernel/Kconfig.instrumentation" source "arch/arm/Kconfig.debug" diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 93b7f8e22dcc..4f1a03124a74 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -265,7 +265,7 @@ void __show_regs(struct pt_regs *regs) void show_regs(struct pt_regs * regs) { printk("\n"); - printk("Pid: %d, comm: %20s\n", current->pid, current->comm); + printk("Pid: %d, comm: %20s\n", task_pid_nr(current), current->comm); __show_regs(regs); __backtrace(); } diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 5feee722ea98..4b05dc5c1023 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -382,16 +382,16 @@ static void clear_breakpoint(struct task_struct *task, struct debug_entry *bp) if (ret != 2 || old_insn.thumb != BREAKINST_THUMB) printk(KERN_ERR "%s:%d: corrupted Thumb breakpoint at " - "0x%08lx (0x%04x)\n", task->comm, task->pid, - addr, old_insn.thumb); + "0x%08lx (0x%04x)\n", task->comm, + task_pid_nr(task), addr, old_insn.thumb); } else { ret = swap_insn(task, addr & ~3, &old_insn.arm, &bp->insn.arm, 4); if (ret != 4 || old_insn.arm != BREAKINST_ARM) printk(KERN_ERR "%s:%d: corrupted ARM breakpoint at " - "0x%08lx (0x%08x)\n", task->comm, task->pid, - addr, old_insn.arm); + "0x%08lx (0x%08x)\n", task->comm, + task_pid_nr(task), addr, old_insn.arm); } } diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 8ad47619c079..4764bd9ccee8 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -223,7 +223,7 @@ static void __die(const char *str, int err, struct thread_info *thread, struct p print_modules(); __show_regs(regs); printk("Process %s (pid: %d, stack limit = 0x%p)\n", - tsk->comm, tsk->pid, thread + 1); + tsk->comm, task_pid_nr(tsk), thread + 1); if (!user_mode(regs) || in_interrupt()) { dump_mem("Stack: ", regs->ARM_sp, @@ -337,7 +337,7 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs) #ifdef CONFIG_DEBUG_USER if (user_debug & UDBG_UNDEFINED) { printk(KERN_INFO "%s (%d): undefined instruction: pc=%p\n", - current->comm, current->pid, pc); + current->comm, task_pid_nr(current), pc); dump_instr(regs); } #endif @@ -388,7 +388,7 @@ static int bad_syscall(int n, struct pt_regs *regs) #ifdef CONFIG_DEBUG_USER if (user_debug & UDBG_SYSCALL) { printk(KERN_ERR "[%d] %s: obsolete system call %08x.\n", - current->pid, current->comm, n); + task_pid_nr(current), current->comm, n); dump_instr(regs); } #endif @@ -565,7 +565,7 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs) */ if (user_debug & UDBG_SYSCALL) { printk("[%d] %s: arm syscall %d\n", - current->pid, current->comm, no); + task_pid_nr(current), current->comm, no); dump_instr(regs); if (user_mode(regs)) { __show_regs(regs); @@ -642,7 +642,7 @@ baddataabort(int code, unsigned long instr, struct pt_regs *regs) #ifdef CONFIG_DEBUG_USER if (user_debug & UDBG_BADABORT) { printk(KERN_ERR "[%d] %s: bad data abort: code %d instr 0x%08lx\n", - current->pid, current->comm, code, instr); + task_pid_nr(current), current->comm, code, instr); dump_instr(regs); show_pte(current->mm, addr); } diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index 074b7cb07743..e162cca5917f 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -757,7 +757,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if (ai_usermode & 1) printk("Alignment trap: %s (%d) PC=0x%08lx Instr=0x%0*lx " "Address=0x%08lx FSR 0x%03x\n", current->comm, - current->pid, instrptr, + task_pid_nr(current), instrptr, thumb_mode(regs) ? 4 : 8, thumb_mode(regs) ? tinstr : instr, addr, fsr); diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 59ed1d05b71b..a8a7dab757eb 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -197,7 +197,7 @@ survive: return fault; out_of_memory: - if (!is_init(tsk)) + if (!is_global_init(tsk)) goto out; /* diff --git a/arch/arm/oprofile/Kconfig b/arch/arm/oprofile/Kconfig deleted file mode 100644 index afd93ad02feb..000000000000 --- a/arch/arm/oprofile/Kconfig +++ /dev/null @@ -1,42 +0,0 @@ - -menu "Profiling support" - depends on EXPERIMENTAL - -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - -if OPROFILE - -config OPROFILE_ARMV6 - bool - depends on CPU_V6 && !SMP - default y - select OPROFILE_ARM11_CORE - -config OPROFILE_MPCORE - bool - depends on CPU_V6 && SMP - default y - select OPROFILE_ARM11_CORE - -config OPROFILE_ARM11_CORE - bool - -endif - -endmenu - diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c index 9a73ce7eb50f..8a7caf8e7b45 100644 --- a/arch/avr32/kernel/traps.c +++ b/arch/avr32/kernel/traps.c @@ -89,7 +89,7 @@ void _exception(long signr, struct pt_regs *regs, int code, * generate the same exception over and over again and we get * nowhere. Better to kill it and let the kernel panic. */ - if (is_init(current)) { + if (is_global_init(current)) { __sighandler_t handler; spin_lock_irq(¤t->sighand->siglock); diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index 11472f8701bd..6560cb18b4e3 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -160,7 +160,7 @@ bad_area: if (exception_trace && printk_ratelimit()) printk("%s%s[%d]: segfault at %08lx pc %08lx " "sp %08lx ecr %lu\n", - is_init(tsk) ? KERN_EMERG : KERN_INFO, + is_global_init(tsk) ? KERN_EMERG : KERN_INFO, tsk->comm, tsk->pid, address, regs->pc, regs->sp, ecr); _exception(SIGSEGV, regs, code, address); @@ -209,7 +209,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; @@ -231,7 +231,7 @@ do_sigbus: if (exception_trace) printk("%s%s[%d]: bus error at %08lx pc %08lx " "sp %08lx ecr %lu\n", - is_init(tsk) ? KERN_EMERG : KERN_INFO, + is_global_init(tsk) ? KERN_EMERG : KERN_INFO, tsk->comm, tsk->pid, address, regs->pc, regs->sp, ecr); diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index aa9db3073312..4c5ca9d5e40f 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -1012,7 +1012,7 @@ source "drivers/Kconfig" source "fs/Kconfig" -source "arch/blackfin/oprofile/Kconfig" +source "kernel/Kconfig.instrumentation" menu "Kernel hacking" diff --git a/arch/blackfin/oprofile/Kconfig b/arch/blackfin/oprofile/Kconfig deleted file mode 100644 index 0a2fd999c941..000000000000 --- a/arch/blackfin/oprofile/Kconfig +++ /dev/null @@ -1,29 +0,0 @@ -menu "Profiling support" -depends on EXPERIMENTAL - -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - -config HARDWARE_PM - tristate "Hardware Performance Monitor Profiling" - depends on PROFILING - help - take use of hardware performance monitor to profiling the kernel - and application. - - If unsure, say N. - -endmenu diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 6b4d026a00a1..21900a9378bb 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -196,6 +196,8 @@ source "sound/Kconfig" source "drivers/usb/Kconfig" +source "kernel/Kconfig.instrumentation" + source "arch/cris/Kconfig.debug" source "security/Kconfig" diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 74eef7111f2b..43153e767bb1 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -375,6 +375,8 @@ source "drivers/Kconfig" source "fs/Kconfig" +source "kernel/Kconfig.instrumentation" + source "arch/frv/Kconfig.debug" source "security/Kconfig" diff --git a/arch/frv/kernel/irq-mb93091.c b/arch/frv/kernel/irq-mb93091.c index ad753c1e9b8f..9e38f99bbab8 100644 --- a/arch/frv/kernel/irq-mb93091.c +++ b/arch/frv/kernel/irq-mb93091.c @@ -17,10 +17,10 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/irq.h> +#include <linux/bitops.h> #include <asm/io.h> #include <asm/system.h> -#include <asm/bitops.h> #include <asm/delay.h> #include <asm/irq.h> #include <asm/irc-regs.h> diff --git a/arch/frv/kernel/irq-mb93093.c b/arch/frv/kernel/irq-mb93093.c index e0983f6926ed..3c2752ca9775 100644 --- a/arch/frv/kernel/irq-mb93093.c +++ b/arch/frv/kernel/irq-mb93093.c @@ -17,10 +17,10 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/irq.h> +#include <linux/bitops.h> #include <asm/io.h> #include <asm/system.h> -#include <asm/bitops.h> #include <asm/delay.h> #include <asm/irq.h> #include <asm/irc-regs.h> diff --git a/arch/frv/kernel/irq-mb93493.c b/arch/frv/kernel/irq-mb93493.c index c157eeff871d..7754c7338e4b 100644 --- a/arch/frv/kernel/irq-mb93493.c +++ b/arch/frv/kernel/irq-mb93493.c @@ -17,10 +17,10 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/irq.h> +#include <linux/bitops.h> #include <asm/io.h> #include <asm/system.h> -#include <asm/bitops.h> #include <asm/delay.h> #include <asm/irq.h> #include <asm/irc-regs.h> diff --git a/arch/frv/kernel/irq.c b/arch/frv/kernel/irq.c index c7e59dcadee4..7ddb69089ed4 100644 --- a/arch/frv/kernel/irq.c +++ b/arch/frv/kernel/irq.c @@ -24,12 +24,12 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/module.h> +#include <linux/bitops.h> #include <asm/atomic.h> #include <asm/io.h> #include <asm/smp.h> #include <asm/system.h> -#include <asm/bitops.h> #include <asm/uaccess.h> #include <asm/pgalloc.h> #include <asm/delay.h> diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index e35f74e6e505..e2e9f57abe2e 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -223,6 +223,8 @@ endmenu source "fs/Kconfig" +source "kernel/Kconfig.instrumentation" + source "arch/h8300/Kconfig.debug" source "security/Kconfig" diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index b84d5050e92e..04be7a7d090f 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -1256,31 +1256,6 @@ source "drivers/Kconfig" source "fs/Kconfig" -menuconfig INSTRUMENTATION - bool "Instrumentation Support" - default y - ---help--- - Say Y here to get to see options related to performance measurement, - debugging, and testing. This option alone does not add any kernel code. - - If you say N, all options in this submenu will be skipped and disabled. - -if INSTRUMENTATION - -source "arch/x86/oprofile/Kconfig" - -config KPROBES - bool "Kprobes" - depends on KALLSYMS && MODULES - help - Kprobes allows you to trap at almost any kernel address and - execute a callback function. register_kprobe() establishes - a probepoint and specifies the callback. Kprobes is useful - for kernel debugging, non-intrusive instrumentation and testing. - If in doubt, say "N". - -endif # INSTRUMENTATION - source "arch/i386/Kconfig.debug" source "security/Kconfig" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index c60532d93c54..c89108e9770d 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -592,20 +592,7 @@ config IRQ_PER_CPU source "arch/ia64/hp/sim/Kconfig" -menu "Instrumentation Support" - -source "arch/ia64/oprofile/Kconfig" - -config KPROBES - bool "Kprobes" - depends on KALLSYMS && MODULES - help - Kprobes allows you to trap at almost any kernel address and - execute a callback function. register_kprobe() establishes - a probepoint and specifies the callback. Kprobes is useful - for kernel debugging, non-intrusive instrumentation and testing. - If in doubt, say "N". -endmenu +source "kernel/Kconfig.instrumentation" source "arch/ia64/Kconfig.debug" diff --git a/arch/ia64/configs/sn2_defconfig b/arch/ia64/configs/sn2_defconfig index 449d3e75bfc2..75fd90dc76a3 100644 --- a/arch/ia64/configs/sn2_defconfig +++ b/arch/ia64/configs/sn2_defconfig @@ -26,6 +26,7 @@ CONFIG_TASK_IO_ACCOUNTING=y # CONFIG_AUDIT is not set # CONFIG_IKCONFIG is not set CONFIG_LOG_BUF_SHIFT=20 +CONFIG_CGROUPS=y CONFIG_CPUSETS=y CONFIG_SYSFS_DEPRECATED=y CONFIG_RELAY=y diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index a3405b3c1eef..d025a22eb225 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -773,7 +773,7 @@ emulate_mmap (struct file *file, unsigned long start, unsigned long len, int pro if (flags & MAP_SHARED) printk(KERN_INFO "%s(%d): emulate_mmap() can't share head (addr=0x%lx)\n", - current->comm, current->pid, start); + current->comm, task_pid_nr(current), start); ret = mmap_subpage(file, start, min(PAGE_ALIGN(start), end), prot, flags, off); if (IS_ERR((void *) ret)) @@ -786,7 +786,7 @@ emulate_mmap (struct file *file, unsigned long start, unsigned long len, int pro if (flags & MAP_SHARED) printk(KERN_INFO "%s(%d): emulate_mmap() can't share tail (end=0x%lx)\n", - current->comm, current->pid, end); + current->comm, task_pid_nr(current), end); ret = mmap_subpage(file, max(start, PAGE_START(end)), end, prot, flags, (off + len) - offset_in_page(end)); if (IS_ERR((void *) ret)) @@ -816,7 +816,7 @@ emulate_mmap (struct file *file, unsigned long start, unsigned long len, int pro if ((flags & MAP_SHARED) && !is_congruent) printk(KERN_INFO "%s(%d): emulate_mmap() can't share contents of incongruent mmap " - "(addr=0x%lx,off=0x%llx)\n", current->comm, current->pid, start, off); + "(addr=0x%lx,off=0x%llx)\n", current->comm, task_pid_nr(current), start, off); DBG("mmap_body: mapping [0x%lx-0x%lx) %s with poff 0x%llx\n", pstart, pend, is_congruent ? "congruent" : "not congruent", poff); diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 73ca86d03810..8e4894b205e2 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -967,7 +967,7 @@ find_memmap_space (void) * to use. We can allocate partial granules only if the unavailable * parts exist, and are WB. */ -void +unsigned long efi_memmap_init(unsigned long *s, unsigned long *e) { struct kern_memdesc *k, *prev = NULL; @@ -1084,6 +1084,8 @@ efi_memmap_init(unsigned long *s, unsigned long *e) /* reserve the memory we are using for kern_memmap */ *s = (u64)kern_memmap; *e = (u64)++k; + + return total_mem; } void diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index f55fa07849c4..59169bf7145f 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -158,14 +158,14 @@ */ #define PROTECT_CTX(c, f) \ do { \ - DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, current->pid)); \ + DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, task_pid_nr(current))); \ spin_lock_irqsave(&(c)->ctx_lock, f); \ - DPRINT(("spinlocked ctx %p by [%d]\n", c, current->pid)); \ + DPRINT(("spinlocked ctx %p by [%d]\n", c, task_pid_nr(current))); \ } while(0) #define UNPROTECT_CTX(c, f) \ do { \ - DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, current->pid)); \ + DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, task_pid_nr(current))); \ spin_unlock_irqrestore(&(c)->ctx_lock, f); \ } while(0) @@ -227,12 +227,12 @@ #ifdef PFM_DEBUGGING #define DPRINT(a) \ do { \ - if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ + if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), task_pid_nr(current)); printk a; } \ } while (0) #define DPRINT_ovfl(a) \ do { \ - if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ + if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), task_pid_nr(current)); printk a; } \ } while (0) #endif @@ -913,7 +913,7 @@ pfm_mask_monitoring(struct task_struct *task) unsigned long mask, val, ovfl_mask; int i; - DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid)); + DPRINT_ovfl(("masking monitoring for [%d]\n", task_pid_nr(task))); ovfl_mask = pmu_conf->ovfl_val; /* @@ -992,12 +992,12 @@ pfm_restore_monitoring(struct task_struct *task) ovfl_mask = pmu_conf->ovfl_val; if (task != current) { - printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid); + printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task_pid_nr(task), task_pid_nr(current)); return; } if (ctx->ctx_state != PFM_CTX_MASKED) { printk(KERN_ERR "perfmon.%d: task[%d] current[%d] invalid state=%d\n", __LINE__, - task->pid, current->pid, ctx->ctx_state); + task_pid_nr(task), task_pid_nr(current), ctx->ctx_state); return; } psr = pfm_get_psr(); @@ -1051,7 +1051,8 @@ pfm_restore_monitoring(struct task_struct *task) if ((mask & 0x1) == 0UL) continue; ctx->th_pmcs[i] = ctx->ctx_pmcs[i]; ia64_set_pmc(i, ctx->th_pmcs[i]); - DPRINT(("[%d] pmc[%d]=0x%lx\n", task->pid, i, ctx->th_pmcs[i])); + DPRINT(("[%d] pmc[%d]=0x%lx\n", + task_pid_nr(task), i, ctx->th_pmcs[i])); } ia64_srlz_d(); @@ -1370,7 +1371,7 @@ pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu) error_conflict: DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n", - pfm_sessions.pfs_sys_session[cpu]->pid, + task_pid_nr(pfm_sessions.pfs_sys_session[cpu]), cpu)); abort: UNLOCK_PFS(flags); @@ -1442,7 +1443,7 @@ pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long siz /* sanity checks */ if (task->mm == NULL || size == 0UL || vaddr == NULL) { - printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task->pid, task->mm); + printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task_pid_nr(task), task->mm); return -EINVAL; } @@ -1459,7 +1460,7 @@ pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long siz up_write(&task->mm->mmap_sem); if (r !=0) { - printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task->pid, vaddr, size); + printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task_pid_nr(task), vaddr, size); } DPRINT(("do_unmap(%p, %lu)=%d\n", vaddr, size, r)); @@ -1501,7 +1502,7 @@ pfm_free_smpl_buffer(pfm_context_t *ctx) return 0; invalid_free: - printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", current->pid); + printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", task_pid_nr(current)); return -EINVAL; } #endif @@ -1547,13 +1548,13 @@ pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) unsigned long flags; DECLARE_WAITQUEUE(wait, current); if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", task_pid_nr(current)); return -EINVAL; } ctx = (pfm_context_t *)filp->private_data; if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", task_pid_nr(current)); return -EINVAL; } @@ -1607,7 +1608,7 @@ pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) PROTECT_CTX(ctx, flags); } - DPRINT(("[%d] back to running ret=%ld\n", current->pid, ret)); + DPRINT(("[%d] back to running ret=%ld\n", task_pid_nr(current), ret)); set_current_state(TASK_RUNNING); remove_wait_queue(&ctx->ctx_msgq_wait, &wait); @@ -1616,7 +1617,7 @@ pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) ret = -EINVAL; msg = pfm_get_next_msg(ctx); if (msg == NULL) { - printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, current->pid); + printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, task_pid_nr(current)); goto abort_locked; } @@ -1647,13 +1648,13 @@ pfm_poll(struct file *filp, poll_table * wait) unsigned int mask = 0; if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", task_pid_nr(current)); return 0; } ctx = (pfm_context_t *)filp->private_data; if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", task_pid_nr(current)); return 0; } @@ -1692,7 +1693,7 @@ pfm_do_fasync(int fd, struct file *filp, pfm_context_t *ctx, int on) ret = fasync_helper (fd, filp, on, &ctx->ctx_async_queue); DPRINT(("pfm_fasync called by [%d] on ctx_fd=%d on=%d async_queue=%p ret=%d\n", - current->pid, + task_pid_nr(current), fd, on, ctx->ctx_async_queue, ret)); @@ -1707,13 +1708,13 @@ pfm_fasync(int fd, struct file *filp, int on) int ret; if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", task_pid_nr(current)); return -EBADF; } ctx = (pfm_context_t *)filp->private_data; if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", task_pid_nr(current)); return -EBADF; } /* @@ -1759,7 +1760,7 @@ pfm_syswide_force_stop(void *info) if (owner != ctx->ctx_task) { printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected owner [%d] instead of [%d]\n", smp_processor_id(), - owner->pid, ctx->ctx_task->pid); + task_pid_nr(owner), task_pid_nr(ctx->ctx_task)); return; } if (GET_PMU_CTX() != ctx) { @@ -1769,7 +1770,7 @@ pfm_syswide_force_stop(void *info) return; } - DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), ctx->ctx_task->pid)); + DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), task_pid_nr(ctx->ctx_task))); /* * the context is already protected in pfm_close(), we simply * need to mask interrupts to avoid a PMU interrupt race on @@ -1821,7 +1822,7 @@ pfm_flush(struct file *filp, fl_owner_t id) ctx = (pfm_context_t *)filp->private_data; if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", task_pid_nr(current)); return -EBADF; } @@ -1969,7 +1970,7 @@ pfm_close(struct inode *inode, struct file *filp) ctx = (pfm_context_t *)filp->private_data; if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", task_pid_nr(current)); return -EBADF; } @@ -2066,7 +2067,7 @@ pfm_close(struct inode *inode, struct file *filp) */ ctx->ctx_state = PFM_CTX_ZOMBIE; - DPRINT(("zombie ctx for [%d]\n", task->pid)); + DPRINT(("zombie ctx for [%d]\n", task_pid_nr(task))); /* * cannot free the context on the spot. deferred until * the task notices the ZOMBIE state @@ -2472,7 +2473,7 @@ pfm_setup_buffer_fmt(struct task_struct *task, struct file *filp, pfm_context_t /* invoke and lock buffer format, if found */ fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id); if (fmt == NULL) { - DPRINT(("[%d] cannot find buffer format\n", task->pid)); + DPRINT(("[%d] cannot find buffer format\n", task_pid_nr(task))); return -EINVAL; } @@ -2483,7 +2484,7 @@ pfm_setup_buffer_fmt(struct task_struct *task, struct file *filp, pfm_context_t ret = pfm_buf_fmt_validate(fmt, task, ctx_flags, cpu, fmt_arg); - DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task->pid, ctx_flags, cpu, fmt_arg, ret)); + DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task_pid_nr(task), ctx_flags, cpu, fmt_arg, ret)); if (ret) goto error; @@ -2605,23 +2606,23 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) * no kernel task or task not owner by caller */ if (task->mm == NULL) { - DPRINT(("task [%d] has not memory context (kernel thread)\n", task->pid)); + DPRINT(("task [%d] has not memory context (kernel thread)\n", task_pid_nr(task))); return -EPERM; } if (pfm_bad_permissions(task)) { - DPRINT(("no permission to attach to [%d]\n", task->pid)); + DPRINT(("no permission to attach to [%d]\n", task_pid_nr(task))); return -EPERM; } /* * cannot block in self-monitoring mode */ if (CTX_OVFL_NOBLOCK(ctx) == 0 && task == current) { - DPRINT(("cannot load a blocking context on self for [%d]\n", task->pid)); + DPRINT(("cannot load a blocking context on self for [%d]\n", task_pid_nr(task))); return -EINVAL; } if (task->exit_state == EXIT_ZOMBIE) { - DPRINT(("cannot attach to zombie task [%d]\n", task->pid)); + DPRINT(("cannot attach to zombie task [%d]\n", task_pid_nr(task))); return -EBUSY; } @@ -2631,7 +2632,7 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) if (task == current) return 0; if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { - DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state)); + DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task_pid_nr(task), task->state)); return -EBUSY; } /* @@ -3512,7 +3513,7 @@ pfm_use_debug_registers(struct task_struct *task) if (pmu_conf->use_rr_dbregs == 0) return 0; - DPRINT(("called for [%d]\n", task->pid)); + DPRINT(("called for [%d]\n", task_pid_nr(task))); /* * do it only once @@ -3543,7 +3544,7 @@ pfm_use_debug_registers(struct task_struct *task) DPRINT(("ptrace_use_dbregs=%u sys_use_dbregs=%u by [%d] ret = %d\n", pfm_sessions.pfs_ptrace_use_dbregs, pfm_sessions.pfs_sys_use_dbregs, - task->pid, ret)); + task_pid_nr(task), ret)); UNLOCK_PFS(flags); @@ -3568,7 +3569,7 @@ pfm_release_debug_registers(struct task_struct *task) LOCK_PFS(flags); if (pfm_sessions.pfs_ptrace_use_dbregs == 0) { - printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid); + printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task_pid_nr(task)); ret = -1; } else { pfm_sessions.pfs_ptrace_use_dbregs--; @@ -3620,7 +3621,7 @@ pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) /* sanity check */ if (unlikely(task == NULL)) { - printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", current->pid); + printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", task_pid_nr(current)); return -EINVAL; } @@ -3629,7 +3630,7 @@ pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) fmt = ctx->ctx_buf_fmt; DPRINT(("restarting self %d ovfl=0x%lx\n", - task->pid, + task_pid_nr(task), ctx->ctx_ovfl_regs[0])); if (CTX_HAS_SMPL(ctx)) { @@ -3653,11 +3654,11 @@ pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET); if (rst_ctrl.bits.mask_monitoring == 0) { - DPRINT(("resuming monitoring for [%d]\n", task->pid)); + DPRINT(("resuming monitoring for [%d]\n", task_pid_nr(task))); if (state == PFM_CTX_MASKED) pfm_restore_monitoring(task); } else { - DPRINT(("keeping monitoring stopped for [%d]\n", task->pid)); + DPRINT(("keeping monitoring stopped for [%d]\n", task_pid_nr(task))); // cannot use pfm_stop_monitoring(task, regs); } @@ -3714,10 +3715,10 @@ pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) * "self-monitoring". */ if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) { - DPRINT(("unblocking [%d] \n", task->pid)); + DPRINT(("unblocking [%d] \n", task_pid_nr(task))); complete(&ctx->ctx_restart_done); } else { - DPRINT(("[%d] armed exit trap\n", task->pid)); + DPRINT(("[%d] armed exit trap\n", task_pid_nr(task))); ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET; @@ -3805,7 +3806,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_ * don't bother if we are loaded and task is being debugged */ if (is_loaded && (thread->flags & IA64_THREAD_DBG_VALID) != 0) { - DPRINT(("debug registers already in use for [%d]\n", task->pid)); + DPRINT(("debug registers already in use for [%d]\n", task_pid_nr(task))); return -EBUSY; } @@ -3846,7 +3847,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_ * is shared by all processes running on it */ if (first_time && can_access_pmu) { - DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid)); + DPRINT(("[%d] clearing ibrs, dbrs\n", task_pid_nr(task))); for (i=0; i < pmu_conf->num_ibrs; i++) { ia64_set_ibr(i, 0UL); ia64_dv_serialize_instruction(); @@ -4035,7 +4036,7 @@ pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) return -EBUSY; } DPRINT(("task [%d] ctx_state=%d is_system=%d\n", - PFM_CTX_TASK(ctx)->pid, + task_pid_nr(PFM_CTX_TASK(ctx)), state, is_system)); /* @@ -4093,7 +4094,7 @@ pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) * monitoring disabled in kernel at next reschedule */ ctx->ctx_saved_psr_up = 0; - DPRINT(("task=[%d]\n", task->pid)); + DPRINT(("task=[%d]\n", task_pid_nr(task))); } return 0; } @@ -4298,11 +4299,12 @@ pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) if (is_system) { if (pfm_sessions.pfs_ptrace_use_dbregs) { - DPRINT(("cannot load [%d] dbregs in use\n", task->pid)); + DPRINT(("cannot load [%d] dbregs in use\n", + task_pid_nr(task))); ret = -EBUSY; } else { pfm_sessions.pfs_sys_use_dbregs++; - DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs)); + DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task_pid_nr(task), pfm_sessions.pfs_sys_use_dbregs)); set_dbregs = 1; } } @@ -4394,7 +4396,7 @@ pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) /* allow user level control */ ia64_psr(regs)->sp = 0; - DPRINT(("clearing psr.sp for [%d]\n", task->pid)); + DPRINT(("clearing psr.sp for [%d]\n", task_pid_nr(task))); SET_LAST_CPU(ctx, smp_processor_id()); INC_ACTIVATION(); @@ -4429,7 +4431,7 @@ pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) */ SET_PMU_OWNER(task, ctx); - DPRINT(("context loaded on PMU for [%d]\n", task->pid)); + DPRINT(("context loaded on PMU for [%d]\n", task_pid_nr(task))); } else { /* * when not current, task MUST be stopped, so this is safe @@ -4493,7 +4495,7 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg int prev_state, is_system; int ret; - DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1)); + DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task_pid_nr(task) : -1)); prev_state = ctx->ctx_state; is_system = ctx->ctx_fl_system; @@ -4568,7 +4570,7 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg */ ia64_psr(regs)->sp = 1; - DPRINT(("setting psr.sp for [%d]\n", task->pid)); + DPRINT(("setting psr.sp for [%d]\n", task_pid_nr(task))); } /* * save PMDs to context @@ -4608,7 +4610,7 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg ctx->ctx_fl_can_restart = 0; ctx->ctx_fl_going_zombie = 0; - DPRINT(("disconnected [%d] from context\n", task->pid)); + DPRINT(("disconnected [%d] from context\n", task_pid_nr(task))); return 0; } @@ -4631,7 +4633,7 @@ pfm_exit_thread(struct task_struct *task) PROTECT_CTX(ctx, flags); - DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task->pid)); + DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task_pid_nr(task))); state = ctx->ctx_state; switch(state) { @@ -4640,13 +4642,13 @@ pfm_exit_thread(struct task_struct *task) * only comes to this function if pfm_context is not NULL, i.e., cannot * be in unloaded state */ - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task->pid); + printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task_pid_nr(task)); break; case PFM_CTX_LOADED: case PFM_CTX_MASKED: ret = pfm_context_unload(ctx, NULL, 0, regs); if (ret) { - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret); + printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task_pid_nr(task), state, ret); } DPRINT(("ctx unloaded for current state was %d\n", state)); @@ -4655,12 +4657,12 @@ pfm_exit_thread(struct task_struct *task) case PFM_CTX_ZOMBIE: ret = pfm_context_unload(ctx, NULL, 0, regs); if (ret) { - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret); + printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task_pid_nr(task), state, ret); } free_ok = 1; break; default: - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task->pid, state); + printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task_pid_nr(task), state); break; } UNPROTECT_CTX(ctx, flags); @@ -4744,7 +4746,7 @@ recheck: DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n", ctx->ctx_fd, state, - task->pid, + task_pid_nr(task), task->state, PFM_CMD_STOPPED(cmd))); /* @@ -4791,7 +4793,7 @@ recheck: */ if (PFM_CMD_STOPPED(cmd)) { if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { - DPRINT(("[%d] task not in stopped state\n", task->pid)); + DPRINT(("[%d] task not in stopped state\n", task_pid_nr(task))); return -EBUSY; } /* @@ -4884,7 +4886,7 @@ restart_args: * limit abuse to min page size */ if (unlikely(sz > PFM_MAX_ARGSIZE)) { - printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", current->pid, sz); + printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", task_pid_nr(current), sz); return -E2BIG; } @@ -5031,11 +5033,11 @@ pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs) { int ret; - DPRINT(("entering for [%d]\n", current->pid)); + DPRINT(("entering for [%d]\n", task_pid_nr(current))); ret = pfm_context_unload(ctx, NULL, 0, regs); if (ret) { - printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", current->pid, ret); + printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", task_pid_nr(current), ret); } /* @@ -5072,7 +5074,7 @@ pfm_handle_work(void) ctx = PFM_GET_CTX(current); if (ctx == NULL) { - printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid); + printk(KERN_ERR "perfmon: [%d] has no PFM context\n", task_pid_nr(current)); return; } @@ -5269,7 +5271,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s " "used_pmds=0x%lx\n", pmc0, - task ? task->pid: -1, + task ? task_pid_nr(task): -1, (regs ? regs->cr_iip : 0), CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking", ctx->ctx_used_pmds[0])); @@ -5458,7 +5460,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str } DPRINT_ovfl(("owner [%d] pending=%ld reason=%u ovfl_pmds=0x%lx ovfl_notify=0x%lx masked=%d\n", - GET_PMU_OWNER() ? GET_PMU_OWNER()->pid : -1, + GET_PMU_OWNER() ? task_pid_nr(GET_PMU_OWNER()) : -1, PFM_GET_WORK_PENDING(task), ctx->ctx_fl_trap_reason, ovfl_pmds, @@ -5483,7 +5485,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str sanity_check: printk(KERN_ERR "perfmon: CPU%d overflow handler [%d] pmc0=0x%lx\n", smp_processor_id(), - task ? task->pid : -1, + task ? task_pid_nr(task) : -1, pmc0); return; @@ -5516,7 +5518,7 @@ stop_monitoring: * * Overall pretty hairy stuff.... */ - DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task->pid: -1)); + DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task_pid_nr(task): -1)); pfm_clear_psr_up(); ia64_psr(regs)->up = 0; ia64_psr(regs)->sp = 1; @@ -5577,13 +5579,13 @@ pfm_do_interrupt_handler(int irq, void *arg, struct pt_regs *regs) report_spurious1: printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n", - this_cpu, task->pid); + this_cpu, task_pid_nr(task)); pfm_unfreeze_pmu(); return -1; report_spurious2: printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n", this_cpu, - task->pid); + task_pid_nr(task)); pfm_unfreeze_pmu(); return -1; } @@ -5870,7 +5872,8 @@ pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs) ia64_psr(regs)->sp = 1; if (GET_PMU_OWNER() == task) { - DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid)); + DPRINT(("cleared ownership for [%d]\n", + task_pid_nr(ctx->ctx_task))); SET_PMU_OWNER(NULL, NULL); } @@ -5882,7 +5885,7 @@ pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs) task->thread.pfm_context = NULL; task->thread.flags &= ~IA64_THREAD_PM_VALID; - DPRINT(("force cleanup for [%d]\n", task->pid)); + DPRINT(("force cleanup for [%d]\n", task_pid_nr(task))); } @@ -6426,7 +6429,7 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) if (PMD_IS_COUNTING(i)) { DPRINT(("[%d] pmd[%d] ctx_pmd=0x%lx hw_pmd=0x%lx\n", - task->pid, + task_pid_nr(task), i, ctx->ctx_pmds[i].val, val & ovfl_val)); @@ -6448,11 +6451,11 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) */ if (pmc0 & (1UL << i)) { val += 1 + ovfl_val; - DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i)); + DPRINT(("[%d] pmd[%d] overflowed\n", task_pid_nr(task), i)); } } - DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, i, val, pmd_val)); + DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task_pid_nr(task), i, val, pmd_val)); if (is_self) ctx->th_pmds[i] = pmd_val; @@ -6793,14 +6796,14 @@ dump_pmu_state(const char *from) printk("CPU%d from %s() current [%d] iip=0x%lx %s\n", this_cpu, from, - current->pid, + task_pid_nr(current), regs->cr_iip, current->comm); task = GET_PMU_OWNER(); ctx = GET_PMU_CTX(); - printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task->pid : -1, ctx); + printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task_pid_nr(task) : -1, ctx); psr = pfm_get_psr(); @@ -6848,7 +6851,7 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs) { struct thread_struct *thread; - DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task->pid)); + DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task_pid_nr(task))); thread = &task->thread; diff --git a/arch/ia64/kernel/perfmon_default_smpl.c b/arch/ia64/kernel/perfmon_default_smpl.c index ff80eab83b38..a7af1cb419f9 100644 --- a/arch/ia64/kernel/perfmon_default_smpl.c +++ b/arch/ia64/kernel/perfmon_default_smpl.c @@ -44,11 +44,11 @@ default_validate(struct task_struct *task, unsigned int flags, int cpu, void *da int ret = 0; if (data == NULL) { - DPRINT(("[%d] no argument passed\n", task->pid)); + DPRINT(("[%d] no argument passed\n", task_pid_nr(task))); return -EINVAL; } - DPRINT(("[%d] validate flags=0x%x CPU%d\n", task->pid, flags, cpu)); + DPRINT(("[%d] validate flags=0x%x CPU%d\n", task_pid_nr(task), flags, cpu)); /* * must hold at least the buffer header + one minimally sized entry @@ -88,7 +88,7 @@ default_init(struct task_struct *task, void *buf, unsigned int flags, int cpu, v hdr->hdr_count = 0UL; DPRINT(("[%d] buffer=%p buf_size=%lu hdr_size=%lu hdr_version=%u cur_offs=%lu\n", - task->pid, + task_pid_nr(task), buf, hdr->hdr_buf_size, sizeof(*hdr), @@ -245,7 +245,7 @@ default_restart(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, stru static int default_exit(struct task_struct *task, void *buf, struct pt_regs *regs) { - DPRINT(("[%d] exit(%p)\n", task->pid, buf)); + DPRINT(("[%d] exit(%p)\n", task_pid_nr(task), buf)); return 0; } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index c613fc0e91cc..2418289ee5ca 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -105,7 +105,8 @@ show_regs (struct pt_regs *regs) unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; print_modules(); - printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm); + printk("\nPid: %d, CPU %d, comm: %20s\n", task_pid_nr(current), + smp_processor_id(), current->comm); printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", regs->cr_ipsr, regs->cr_ifs, ip, print_tainted()); print_symbol("ip is at %s\n", ip); diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index c5cfcfa4c87c..cbf67f1aa291 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -208,6 +208,48 @@ static int __init register_memory(void) __initcall(register_memory); + +#ifdef CONFIG_KEXEC +static void __init setup_crashkernel(unsigned long total, int *n) +{ + unsigned long long base = 0, size = 0; + int ret; + + ret = parse_crashkernel(boot_command_line, total, + &size, &base); + if (ret == 0 && size > 0) { + if (!base) { + sort_regions(rsvd_region, *n); + base = kdump_find_rsvd_region(size, + rsvd_region, *n); + } + if (base != ~0UL) { + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(size >> 20), + (unsigned long)(base >> 20), + (unsigned long)(total >> 20)); + rsvd_region[*n].start = + (unsigned long)__va(base); + rsvd_region[*n].end = + (unsigned long)__va(base + size); + (*n)++; + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + } + efi_memmap_res.start = ia64_boot_param->efi_memmap; + efi_memmap_res.end = efi_memmap_res.start + + ia64_boot_param->efi_memmap_size; + boot_param_res.start = __pa(ia64_boot_param); + boot_param_res.end = boot_param_res.start + + sizeof(*ia64_boot_param); +} +#else +static inline void __init setup_crashkernel(unsigned long total, int *n) +{} +#endif + /** * reserve_memory - setup reserved memory areas * @@ -219,6 +261,7 @@ void __init reserve_memory (void) { int n = 0; + unsigned long total_memory; /* * none of the entries in this table overlap @@ -254,50 +297,11 @@ reserve_memory (void) n++; #endif - efi_memmap_init(&rsvd_region[n].start, &rsvd_region[n].end); + total_memory = efi_memmap_init(&rsvd_region[n].start, &rsvd_region[n].end); n++; -#ifdef CONFIG_KEXEC - /* crashkernel=size@offset specifies the size to reserve for a crash - * kernel. If offset is 0, then it is determined automatically. - * By reserving this memory we guarantee that linux never set's it - * up as a DMA target.Useful for holding code to do something - * appropriate after a kernel panic. - */ - { - char *from = strstr(boot_command_line, "crashkernel="); - unsigned long base, size; - if (from) { - size = memparse(from + 12, &from); - if (*from == '@') - base = memparse(from+1, &from); - else - base = 0; - if (size) { - if (!base) { - sort_regions(rsvd_region, n); - base = kdump_find_rsvd_region(size, - rsvd_region, n); - } - if (base != ~0UL) { - rsvd_region[n].start = - (unsigned long)__va(base); - rsvd_region[n].end = - (unsigned long)__va(base + size); - n++; - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - } - } - efi_memmap_res.start = ia64_boot_param->efi_memmap; - efi_memmap_res.end = efi_memmap_res.start + - ia64_boot_param->efi_memmap_size; - boot_param_res.start = __pa(ia64_boot_param); - boot_param_res.end = boot_param_res.start + - sizeof(*ia64_boot_param); - } -#endif + setup_crashkernel(total_memory, &n); + /* end of memory marker */ rsvd_region[n].start = ~0UL; rsvd_region[n].end = ~0UL; diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index aeec8184e862..cdb64cc4d9c8 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -227,7 +227,7 @@ ia64_rt_sigreturn (struct sigscratch *scr) si.si_signo = SIGSEGV; si.si_errno = 0; si.si_code = SI_KERNEL; - si.si_pid = current->pid; + si.si_pid = task_pid_vnr(current); si.si_uid = current->uid; si.si_addr = sc; force_sig_info(SIGSEGV, &si, current); @@ -332,7 +332,7 @@ force_sigsegv_info (int sig, void __user *addr) si.si_signo = SIGSEGV; si.si_errno = 0; si.si_code = SI_KERNEL; - si.si_pid = current->pid; + si.si_pid = task_pid_vnr(current); si.si_uid = current->uid; si.si_addr = addr; force_sig_info(SIGSEGV, &si, current); diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index 3aeaf15e468b..78d65cb947d2 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -61,7 +61,7 @@ die (const char *str, struct pt_regs *regs, long err) if (++die.lock_owner_depth < 3) { printk("%s[%d]: %s %ld [%d]\n", - current->comm, current->pid, str, err, ++die_counter); + current->comm, task_pid_nr(current), str, err, ++die_counter); (void) notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); show_regs(regs); } else @@ -315,7 +315,7 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) last.time = current_jiffies + 5 * HZ; printk(KERN_WARNING "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n", - current->comm, current->pid, regs->cr_iip + ia64_psr(regs)->ri, isr); + current->comm, task_pid_nr(current), regs->cr_iip + ia64_psr(regs)->ri, isr); } } } @@ -453,7 +453,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, if (code == 8) { # ifdef CONFIG_IA64_PRINT_HAZARDS printk("%s[%d]: possible hazard @ ip=%016lx (pr = %016lx)\n", - current->comm, current->pid, + current->comm, task_pid_nr(current), regs.cr_iip + ia64_psr(®s)->ri, regs.pr); # endif return; diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c index fe6aa5a9f8fa..2173de9fe917 100644 --- a/arch/ia64/kernel/unaligned.c +++ b/arch/ia64/kernel/unaligned.c @@ -1340,7 +1340,8 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) size_t len; len = sprintf(buf, "%s(%d): unaligned access to 0x%016lx, " - "ip=0x%016lx\n\r", current->comm, current->pid, + "ip=0x%016lx\n\r", current->comm, + task_pid_nr(current), ifa, regs->cr_iip + ipsr->ri); /* * Don't call tty_write_message() if we're in the kernel; we might @@ -1363,7 +1364,7 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) "administrator\n" "echo 0 > /proc/sys/kernel/ignore-" "unaligned-usertrap to re-enable\n", - current->comm, current->pid); + current->comm, task_pid_nr(current)); } } } else { diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 32f26253c4e8..7571076a16a1 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -274,7 +274,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 3e10152abbf0..c6c19bf11bec 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -127,8 +127,8 @@ ia64_init_addr_space (void) vma->vm_mm = current->mm; vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; - vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7]; vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); down_write(¤t->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(¤t->mm->mmap_sem); diff --git a/arch/ia64/oprofile/Kconfig b/arch/ia64/oprofile/Kconfig deleted file mode 100644 index 97271ab484dc..000000000000 --- a/arch/ia64/oprofile/Kconfig +++ /dev/null @@ -1,20 +0,0 @@ -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - Due to firmware bugs, you may need to use the "nohalt" boot - option if you're using OProfile with the hardware performance - counters. - - If unsure, say N. - diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index bd5fe76401f1..ab9a264cb194 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -426,7 +426,7 @@ source "drivers/Kconfig" source "fs/Kconfig" -source "arch/m32r/oprofile/Kconfig" +source "kernel/Kconfig.instrumentation" source "arch/m32r/Kconfig.debug" diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index 97e0b1c0830e..89ba4a0b5d51 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -196,7 +196,7 @@ static void show_registers(struct pt_regs *regs) printk("SPI: %08lx\n", sp); } printk("Process %s (pid: %d, process nr: %d, stackpage=%08lx)", - current->comm, current->pid, 0xffff & i, 4096+(unsigned long)current); + current->comm, task_pid_nr(current), 0xffff & i, 4096+(unsigned long)current); /* * When in-kernel, we also print out the stack and code at the diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 70a766aad3e0..4a71df4c1b30 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -271,7 +271,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(tsk)) { + if (is_global_init(tsk)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/m32r/oprofile/Kconfig b/arch/m32r/oprofile/Kconfig deleted file mode 100644 index 19d37730b664..000000000000 --- a/arch/m32r/oprofile/Kconfig +++ /dev/null @@ -1,23 +0,0 @@ - -menu "Profiling support" - depends on EXPERIMENTAL - -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - -endmenu - diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 20a9c08e59c3..01dee84f840a 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -683,6 +683,8 @@ endmenu source "fs/Kconfig" +source "kernel/Kconfig.instrumentation" + source "arch/m68k/Kconfig.debug" source "security/Kconfig" diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 4e2752a0e89b..97f556fa4932 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -900,7 +900,7 @@ void show_registers(struct pt_regs *regs) regs->d4, regs->d5, regs->a0, regs->a1); printk("Process %s (pid: %d, task=%p)\n", - current->comm, current->pid, current); + current->comm, task_pid_nr(current), current); addr = (unsigned long)&fp->un; printk("Frame format=%X ", regs->format); switch (regs->format) { @@ -1038,7 +1038,7 @@ void bad_super_trap (struct frame *fp) fp->un.fmtb.daddr, space_names[ssw & DFC], fp->ptregs.pc); } - printk ("Current process id is %d\n", current->pid); + printk ("Current process id is %d\n", task_pid_nr(current)); die_if_kernel("BAD KERNEL TRAP", &fp->ptregs, 0); } diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index eaa618681159..f493f03231d5 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -180,7 +180,7 @@ good_area: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig index 185906b54cb0..f52c627bdadd 100644 --- a/arch/m68knommu/Kconfig +++ b/arch/m68knommu/Kconfig @@ -696,6 +696,8 @@ source "drivers/Kconfig" source "fs/Kconfig" +source "kernel/Kconfig.instrumentation" + source "arch/m68knommu/Kconfig.debug" source "security/Kconfig" diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index cb027580cd1d..4dc142d394a3 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2005,7 +2005,7 @@ source "drivers/Kconfig" source "fs/Kconfig" -source "arch/mips/oprofile/Kconfig" +source "kernel/Kconfig.instrumentation" source "arch/mips/Kconfig.debug" diff --git a/arch/mips/au1000/pb1200/irqmap.c b/arch/mips/au1000/pb1200/irqmap.c index 5f48b0603796..bdf00e2a35e4 100644 --- a/arch/mips/au1000/pb1200/irqmap.c +++ b/arch/mips/au1000/pb1200/irqmap.c @@ -36,8 +36,8 @@ #include <linux/slab.h> #include <linux/random.h> #include <linux/delay.h> +#include <linux/bitops.h> -#include <asm/bitops.h> #include <asm/bootinfo.h> #include <asm/io.h> #include <asm/mipsregs.h> diff --git a/arch/mips/basler/excite/excite_irq.c b/arch/mips/basler/excite/excite_irq.c index 1ecab6350421..4903e067916b 100644 --- a/arch/mips/basler/excite/excite_irq.c +++ b/arch/mips/basler/excite/excite_irq.c @@ -29,7 +29,7 @@ #include <linux/timex.h> #include <linux/slab.h> #include <linux/random.h> -#include <asm/bitops.h> +#include <linux/bitops.h> #include <asm/bootinfo.h> #include <asm/io.h> #include <asm/irq.h> diff --git a/arch/mips/configs/ip27_defconfig b/arch/mips/configs/ip27_defconfig index 49bcc58929ba..892d4c38fd0d 100644 --- a/arch/mips/configs/ip27_defconfig +++ b/arch/mips/configs/ip27_defconfig @@ -175,6 +175,7 @@ CONFIG_POSIX_MQUEUE=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=15 +CONFIG_CGROUPS=y CONFIG_CPUSETS=y CONFIG_SYSFS_DEPRECATED=y CONFIG_RELAY=y diff --git a/arch/mips/configs/sb1250-swarm_defconfig b/arch/mips/configs/sb1250-swarm_defconfig index 3ed991ae0ebe..49dfcef2518c 100644 --- a/arch/mips/configs/sb1250-swarm_defconfig +++ b/arch/mips/configs/sb1250-swarm_defconfig @@ -196,6 +196,7 @@ CONFIG_SYSVIPC_SYSCTL=y # CONFIG_UTS_NS is not set # CONFIG_AUDIT is not set # CONFIG_IKCONFIG is not set +CONFIG_CGROUPS=y CONFIG_CPUSETS=y CONFIG_SYSFS_DEPRECATED=y CONFIG_RELAY=y diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c index b997af713eb3..7852c7cdf29e 100644 --- a/arch/mips/kernel/irixelf.c +++ b/arch/mips/kernel/irixelf.c @@ -1172,8 +1172,8 @@ static int irix_core_dump(long signr, struct pt_regs *regs, struct file *file, u prstatus.pr_sighold = current->blocked.sig[0]; psinfo.pr_pid = prstatus.pr_pid = current->pid; psinfo.pr_ppid = prstatus.pr_ppid = current->parent->pid; - psinfo.pr_pgrp = prstatus.pr_pgrp = process_group(current); - psinfo.pr_sid = prstatus.pr_sid = process_session(current); + psinfo.pr_pgrp = prstatus.pr_pgrp = task_pgrp_nr(current); + psinfo.pr_sid = prstatus.pr_sid = task_session_nr(current); if (current->pid == current->tgid) { /* * This is the record for the group leader. Add in the diff --git a/arch/mips/kernel/irixsig.c b/arch/mips/kernel/irixsig.c index 85c2e389edd6..a0a91056fda7 100644 --- a/arch/mips/kernel/irixsig.c +++ b/arch/mips/kernel/irixsig.c @@ -609,7 +609,7 @@ repeat: p = list_entry(_p, struct task_struct, sibling); if ((type == IRIX_P_PID) && p->pid != pid) continue; - if ((type == IRIX_P_PGID) && process_group(p) != pid) + if ((type == IRIX_P_PGID) && task_pgrp_nr(p) != pid) continue; if ((p->exit_signal != SIGCHLD)) continue; diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c index ee7790d9debe..4c477c7ff74a 100644 --- a/arch/mips/kernel/sysirix.c +++ b/arch/mips/kernel/sysirix.c @@ -763,11 +763,11 @@ asmlinkage int irix_setpgrp(int flags) printk("[%s:%d] setpgrp(%d) ", current->comm, current->pid, flags); #endif if(!flags) - error = process_group(current); + error = task_pgrp_nr(current); else error = sys_setsid(); #ifdef DEBUG_PROCGRPS - printk("returning %d\n", process_group(current)); + printk("returning %d\n", task_pgrp_nr(current)); #endif return error; diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 7b78d137259f..fa500787152d 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -314,7 +314,7 @@ void show_registers(const struct pt_regs *regs) __show_regs(regs); print_modules(); printk("Process %s (pid: %d, threadinfo=%p, task=%p)\n", - current->comm, current->pid, current_thread_info(), current); + current->comm, task_pid_nr(current), current_thread_info(), current); show_stacktrace(current, regs); show_code((unsigned int __user *) regs->cp0_epc); printk("\n"); diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 5699c7713e2f..fa636fc6b7b9 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -173,7 +173,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(tsk)) { + if (is_global_init(tsk)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/mips/oprofile/Kconfig b/arch/mips/oprofile/Kconfig deleted file mode 100644 index fb6f235348b0..000000000000 --- a/arch/mips/oprofile/Kconfig +++ /dev/null @@ -1,23 +0,0 @@ - -menu "Profiling support" - depends on EXPERIMENTAL - -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING && !MIPS_MT_SMTC && EXPERIMENTAL - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - -endmenu - diff --git a/arch/mips/tx4938/common/setup.c b/arch/mips/tx4938/common/setup.c index ab4082267553..04f009ccb0eb 100644 --- a/arch/mips/tx4938/common/setup.c +++ b/arch/mips/tx4938/common/setup.c @@ -24,7 +24,7 @@ #include <linux/slab.h> #include <linux/random.h> #include <linux/irq.h> -#include <asm/bitops.h> +#include <linux/bitops.h> #include <asm/bootinfo.h> #include <asm/io.h> #include <asm/irq.h> diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 3d73545e8c48..b8ef1787a191 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -267,7 +267,7 @@ source "drivers/Kconfig" source "fs/Kconfig" -source "arch/parisc/oprofile/Kconfig" +source "kernel/Kconfig.instrumentation" source "arch/parisc/Kconfig.debug" diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index fb35ebc0c4da..2ce3806f02e1 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -181,7 +181,7 @@ give_sigsegv: si.si_signo = SIGSEGV; si.si_errno = 0; si.si_code = SI_KERNEL; - si.si_pid = current->pid; + si.si_pid = task_pid_vnr(current); si.si_uid = current->uid; si.si_addr = &frame->uc; force_sig_info(SIGSEGV, &si, current); diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index bbf029a184ac..99fd56939afa 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -219,7 +219,7 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err) return; /* STFU */ printk(KERN_CRIT "%s (pid %d): %s (code %ld) at " RFMT "\n", - current->comm, current->pid, str, err, regs->iaoq[0]); + current->comm, task_pid_nr(current), str, err, regs->iaoq[0]); #ifdef PRINT_USER_FAULTS /* XXX for debugging only */ show_regs(regs); @@ -252,7 +252,7 @@ KERN_CRIT " || ||\n"); if (err) printk(KERN_CRIT "%s (pid %d): %s (code %ld)\n", - current->comm, current->pid, str, err); + current->comm, task_pid_nr(current), str, err); /* Wot's wrong wif bein' racy? */ if (current->thread.flags & PARISC_KERNEL_DEATH) { @@ -317,7 +317,7 @@ static void handle_break(struct pt_regs *regs) if (unlikely(iir != GDB_BREAK_INSN)) { printk(KERN_DEBUG "break %d,%d: pid=%d command='%s'\n", iir & 31, (iir>>13) & ((1<<13)-1), - current->pid, current->comm); + task_pid_nr(current), current->comm); show_regs(regs); } #endif @@ -747,7 +747,7 @@ void handle_interruption(int code, struct pt_regs *regs) if (user_mode(regs)) { #ifdef PRINT_USER_FAULTS printk(KERN_DEBUG "\nhandle_interruption() pid=%d command='%s'\n", - current->pid, current->comm); + task_pid_nr(current), current->comm); show_regs(regs); #endif /* SIGBUS, for lack of a better one. */ @@ -772,7 +772,7 @@ void handle_interruption(int code, struct pt_regs *regs) else printk(KERN_DEBUG "User Fault (long pointer) (fault %d) ", code); - printk("pid=%d command='%s'\n", current->pid, current->comm); + printk("pid=%d command='%s'\n", task_pid_nr(current), current->comm); show_regs(regs); #endif si.si_signo = SIGSEGV; diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c index 347bb922e6d0..aebf3c168871 100644 --- a/arch/parisc/kernel/unaligned.c +++ b/arch/parisc/kernel/unaligned.c @@ -469,7 +469,7 @@ void handle_unaligned(struct pt_regs *regs) && ++unaligned_count < 5) { char buf[256]; sprintf(buf, "%s(%d): unaligned access to 0x" RFMT " at ip=0x" RFMT "\n", - current->comm, current->pid, regs->ior, regs->iaoq[0]); + current->comm, task_pid_nr(current), regs->ior, regs->iaoq[0]); printk(KERN_WARNING "%s", buf); #ifdef DEBUG_UNALIGNED show_regs(regs); diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 1c091b415cd9..b2e3e9a8cece 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -211,7 +211,7 @@ bad_area: #ifdef PRINT_USER_FAULTS printk(KERN_DEBUG "\n"); printk(KERN_DEBUG "do_page_fault() pid=%d command='%s' type=%lu address=0x%08lx\n", - tsk->pid, tsk->comm, code, address); + task_pid_nr(tsk), tsk->comm, code, address); if (vma) { printk(KERN_DEBUG "vm_start = 0x%08lx, vm_end = 0x%08lx\n", vma->vm_start, vma->vm_end); diff --git a/arch/parisc/oprofile/Kconfig b/arch/parisc/oprofile/Kconfig deleted file mode 100644 index 5ade19801b97..000000000000 --- a/arch/parisc/oprofile/Kconfig +++ /dev/null @@ -1,23 +0,0 @@ - -menu "Profiling support" - depends on EXPERIMENTAL - -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - -endmenu - diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3763f681ce4c..18f397ca05ef 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -669,20 +669,7 @@ source "arch/powerpc/sysdev/qe_lib/Kconfig" source "lib/Kconfig" -menu "Instrumentation Support" - -source "arch/powerpc/oprofile/Kconfig" - -config KPROBES - bool "Kprobes" - depends on !BOOKE && !4xx && KALLSYMS && MODULES - help - Kprobes allows you to trap at almost any kernel address and - execute a callback function. register_kprobe() establishes - a probepoint and specifies the callback. Kprobes is useful - for kernel debugging, non-intrusive instrumentation and testing. - If in doubt, say "N". -endmenu +source "kernel/Kconfig.instrumentation" source "arch/powerpc/Kconfig.debug" diff --git a/arch/powerpc/configs/cell_defconfig b/arch/powerpc/configs/cell_defconfig index 8b47c846421c..dcd7c02727c2 100644 --- a/arch/powerpc/configs/cell_defconfig +++ b/arch/powerpc/configs/cell_defconfig @@ -68,6 +68,7 @@ CONFIG_SYSVIPC_SYSCTL=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=15 +CONFIG_CGROUPS=y CONFIG_CPUSETS=y CONFIG_SYSFS_DEPRECATED=y # CONFIG_RELAY is not set diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index bb8d4e46f0c5..05582af50c5b 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -71,6 +71,7 @@ CONFIG_TASK_DELAY_ACCT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=17 +CONFIG_CGROUPS=y CONFIG_CPUSETS=y CONFIG_SYSFS_DEPRECATED=y CONFIG_RELAY=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index c09eb8cfbe71..62a38406b62f 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -71,6 +71,7 @@ CONFIG_AUDITSYSCALL=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=17 +CONFIG_CGROUPS=y CONFIG_CPUSETS=y CONFIG_SYSFS_DEPRECATED=y # CONFIG_RELAY is not set diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index e60a0c544d63..c0c8e8c3ced9 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -61,45 +61,39 @@ NORET_TYPE void machine_kexec(struct kimage *image) for(;;); } -static int __init early_parse_crashk(char *p) -{ - unsigned long size; - - if (!p) - return 1; - - size = memparse(p, &p); - - if (*p == '@') - crashk_res.start = memparse(p + 1, &p); - else - crashk_res.start = KDUMP_KERNELBASE; - - crashk_res.end = crashk_res.start + size - 1; - - return 0; -} -early_param("crashkernel", early_parse_crashk); - void __init reserve_crashkernel(void) { - unsigned long size; + unsigned long long crash_size, crash_base; + int ret; + + /* this is necessary because of lmb_phys_mem_size() */ + lmb_analyze(); + + /* use common parsing */ + ret = parse_crashkernel(boot_command_line, lmb_phys_mem_size(), + &crash_size, &crash_base); + if (ret == 0 && crash_size > 0) { + if (crash_base == 0) + crash_base = KDUMP_KERNELBASE; + crashk_res.start = crash_base; + } else { + /* handle the device tree */ + crash_size = crashk_res.end - crashk_res.start + 1; + } - if (crashk_res.start == 0) + if (crash_size == 0) return; /* We might have got these values via the command line or the * device tree, either way sanitise them now. */ - size = crashk_res.end - crashk_res.start + 1; - if (crashk_res.start != KDUMP_KERNELBASE) printk("Crash kernel location must be 0x%x\n", KDUMP_KERNELBASE); crashk_res.start = KDUMP_KERNELBASE; - size = PAGE_ALIGN(size); - crashk_res.end = crashk_res.start + size - 1; + crash_size = PAGE_ALIGN(crash_size); + crashk_res.end = crashk_res.start + crash_size - 1; /* Crash kernel trumps memory limit */ if (memory_limit && memory_limit <= crashk_res.end) { @@ -108,7 +102,13 @@ void __init reserve_crashkernel(void) memory_limit); } - lmb_reserve(crashk_res.start, size); + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crashk_res.start >> 20), + (unsigned long)(lmb_phys_mem_size() >> 20)); + + lmb_reserve(crashk_res.start, crash_size); } int overlaps_crashkernel(unsigned long start, unsigned long size) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ea6ad7a2a7e3..b9d88374f14f 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -459,7 +459,7 @@ void show_regs(struct pt_regs * regs) printk("DAR: "REG", DSISR: "REG"\n", regs->dar, regs->dsisr); #endif printk("TASK = %p[%d] '%s' THREAD: %p", - current, current->pid, current->comm, task_thread_info(current)); + current, task_pid_nr(current), current->comm, task_thread_info(current)); #ifdef CONFIG_SMP printk(" CPU: %d", smp_processor_id()); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index bf9e39c6e296..59c464e26f38 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -201,7 +201,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) * generate the same exception over and over again and we get * nowhere. Better to kill it and let the kernel panic. */ - if (is_init(current)) { + if (is_global_init(current)) { __sighandler_t handler; spin_lock_irq(¤t->sighand->siglock); @@ -881,7 +881,7 @@ void nonrecoverable_exception(struct pt_regs *regs) void trace_syscall(struct pt_regs *regs) { printk("Task: %p(%d), PC: %08lX/%08lX, Syscall: %3ld, Result: %s%ld %s\n", - current, current->pid, regs->nip, regs->link, regs->gpr[0], + current, task_pid_nr(current), regs->nip, regs->link, regs->gpr[0], regs->ccr&0x10000000?"Error=":"", regs->gpr[3], print_tainted()); } diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index ab3546c5ac3a..a18fda361cc0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -375,7 +375,7 @@ bad_area_nosemaphore: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/powerpc/oprofile/Kconfig b/arch/powerpc/oprofile/Kconfig deleted file mode 100644 index 7089e79689b9..000000000000 --- a/arch/powerpc/oprofile/Kconfig +++ /dev/null @@ -1,24 +0,0 @@ -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - -config OPROFILE_CELL - bool "OProfile for Cell Broadband Engine" - depends on (SPU_FS = y && OPROFILE = m) || (SPU_FS = y && OPROFILE = y) || (SPU_FS = m && OPROFILE = m) - default y - help - Profiling of Cell BE SPUs requires special support enabled - by this option. diff --git a/arch/powerpc/platforms/maple/setup.c b/arch/powerpc/platforms/maple/setup.c index 354c05861629..144177d77cf1 100644 --- a/arch/powerpc/platforms/maple/setup.c +++ b/arch/powerpc/platforms/maple/setup.c @@ -41,13 +41,13 @@ #include <linux/root_dev.h> #include <linux/serial.h> #include <linux/smp.h> +#include <linux/bitops.h> #include <asm/processor.h> #include <asm/sections.h> #include <asm/prom.h> #include <asm/system.h> #include <asm/pgtable.h> -#include <asm/bitops.h> #include <asm/io.h> #include <asm/kexec.h> #include <asm/pci-bridge.h> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 3a393c7f390e..a1ab25c7082f 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -332,7 +332,7 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err) err->disposition == RTAS_DISP_NOT_RECOVERED && err->target == RTAS_TARGET_MEMORY && err->type == RTAS_TYPE_ECC_UNCORR && - !(current->pid == 0 || is_init(current))) { + !(current->pid == 0 || is_global_init(current))) { /* Kill off a user process with an ECC error */ printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n", current->pid); diff --git a/arch/ppc/Kconfig b/arch/ppc/Kconfig index 607925c8a99e..6473fa7cb4b9 100644 --- a/arch/ppc/Kconfig +++ b/arch/ppc/Kconfig @@ -1317,7 +1317,7 @@ endmenu source "lib/Kconfig" -source "arch/powerpc/oprofile/Kconfig" +source "kernel/Kconfig.instrumentation" source "arch/ppc/Kconfig.debug" diff --git a/arch/ppc/kernel/traps.c b/arch/ppc/kernel/traps.c index 3f3b292eb773..c78568905c3b 100644 --- a/arch/ppc/kernel/traps.c +++ b/arch/ppc/kernel/traps.c @@ -121,7 +121,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) * generate the same exception over and over again and we get * nowhere. Better to kill it and let the kernel panic. */ - if (is_init(current)) { + if (is_global_init(current)) { __sighandler_t handler; spin_lock_irq(¤t->sighand->siglock); diff --git a/arch/ppc/mm/fault.c b/arch/ppc/mm/fault.c index 94913ddcf76e..254c23b755e6 100644 --- a/arch/ppc/mm/fault.c +++ b/arch/ppc/mm/fault.c @@ -290,7 +290,7 @@ bad_area: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/ppc/platforms/chestnut.c b/arch/ppc/platforms/chestnut.c index 248684f50dd9..dcd6070b85eb 100644 --- a/arch/ppc/platforms/chestnut.c +++ b/arch/ppc/platforms/chestnut.c @@ -49,7 +49,6 @@ extern void gen550_progress(char *, unsigned short); extern void gen550_init(int, struct uart_port *); extern void mv64360_pcibios_fixup(mv64x60_handle_t *bh); -#define BIT(x) (1<<x) #define CHESTNUT_PRESERVE_MASK (BIT(MV64x60_CPU2DEV_0_WIN) | \ BIT(MV64x60_CPU2DEV_1_WIN) | \ BIT(MV64x60_CPU2DEV_2_WIN) | \ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b71132166f60..4ec716d8c1a6 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -529,21 +529,7 @@ source "drivers/Kconfig" source "fs/Kconfig" -menu "Instrumentation Support" - -source "arch/s390/oprofile/Kconfig" - -config KPROBES - bool "Kprobes (EXPERIMENTAL)" - depends on EXPERIMENTAL && MODULES - help - Kprobes allows you to trap at almost any kernel address and - execute a callback function. register_kprobe() establishes - a probepoint and specifies the callback. Kprobes is useful - for kernel debugging, non-intrusive instrumentation and testing. - If in doubt, say "N". - -endmenu +source "kernel/Kconfig.instrumentation" source "arch/s390/Kconfig.debug" diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index abb447a3e472..70c57378f426 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -166,7 +166,7 @@ void show_regs(struct pt_regs *regs) printk("CPU: %d %s\n", task_thread_info(tsk)->cpu, print_tainted()); printk("Process %s (pid: %d, task: %p, ksp: %p)\n", - current->comm, current->pid, (void *) tsk, + current->comm, task_pid_nr(current), (void *) tsk, (void *) tsk->thread.ksp); show_registers(regs); diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c index 60604b2819b2..b159a9d65680 100644 --- a/arch/s390/lib/uaccess_pt.c +++ b/arch/s390/lib/uaccess_pt.c @@ -64,7 +64,7 @@ out: out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 14c241ccdd4d..2456b52ed068 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -211,7 +211,7 @@ static int do_out_of_memory(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm = tsk->mm; up_read(&mm->mmap_sem); - if (is_init(tsk)) { + if (is_global_init(tsk)) { yield(); down_read(&mm->mmap_sem); return 1; diff --git a/arch/s390/oprofile/Kconfig b/arch/s390/oprofile/Kconfig deleted file mode 100644 index 208220a5f23f..000000000000 --- a/arch/s390/oprofile/Kconfig +++ /dev/null @@ -1,22 +0,0 @@ - -menu "Profiling support" - -config PROFILING - bool "Profiling support" - help - Say Y here to enable profiling support mechanisms used by - profilers such as readprofile or OProfile. - - -config OPROFILE - tristate "OProfile system profiling" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - -endmenu - diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 44982c1dfa23..247f8a65e733 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -758,7 +758,7 @@ source "drivers/Kconfig" source "fs/Kconfig" -source "arch/sh/oprofile/Kconfig" +source "kernel/Kconfig.instrumentation" source "arch/sh/Kconfig.debug" diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c index 790ed69b8666..5c17de51987e 100644 --- a/arch/sh/kernel/machine_kexec.c +++ b/arch/sh/kernel/machine_kexec.c @@ -104,24 +104,3 @@ NORET_TYPE void machine_kexec(struct kimage *image) (*rnk)(page_list, reboot_code_buffer, image->start, vbr_reg); } -/* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never sets it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ -static int __init parse_crashkernel(char *arg) -{ - unsigned long size, base; - size = memparse(arg, &arg); - if (*arg == '@') { - base = memparse(arg+1, &arg); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - return 0; -} -early_param("crashkernel", parse_crashkernel); diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index b4469992d6b2..6d7f2b07e491 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -121,7 +121,7 @@ void machine_power_off(void) void show_regs(struct pt_regs * regs) { printk("\n"); - printk("Pid : %d, Comm: %20s\n", current->pid, current->comm); + printk("Pid : %d, Comm: %20s\n", task_pid_nr(current), current->comm); print_symbol("PC is at %s\n", instruction_pointer(regs)); printk("PC : %08lx SP : %08lx SR : %08lx ", regs->pc, regs->regs[15], regs->sr); diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index b3027a6775b9..b749403f6b38 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -128,6 +128,37 @@ static void __init register_bootmem_low_pages(void) free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(pages)); } +#ifdef CONFIG_KEXEC +static void __init reserve_crashkernel(void) +{ + unsigned long long free_mem; + unsigned long long crash_size, crash_base; + int ret; + + free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; + + ret = parse_crashkernel(boot_command_line, free_mem, + &crash_size, &crash_base); + if (ret == 0 && crash_size) { + if (crash_base > 0) { + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(free_mem >> 20)); + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + reserve_bootmem(crash_base, crash_size); + } else + printk(KERN_INFO "crashkernel reservation failed - " + "you have to specify a base address\n"); + } +} +#else +static inline void __init reserve_crashkernel(void) +{} +#endif + void __init setup_bootmem_allocator(unsigned long free_pfn) { unsigned long bootmap_size; @@ -189,11 +220,8 @@ void __init setup_bootmem_allocator(unsigned long free_pfn) } } #endif -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) - reserve_bootmem(crashk_res.start, - crashk_res.end - crashk_res.start + 1); -#endif + + reserve_crashkernel(); } #ifndef CONFIG_NEED_MULTIPLE_NODES diff --git a/arch/sh/kernel/signal.c b/arch/sh/kernel/signal.c index 2f42442cf164..ca754fd42437 100644 --- a/arch/sh/kernel/signal.c +++ b/arch/sh/kernel/signal.c @@ -382,7 +382,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, set_fs(USER_DS); pr_debug("SIG deliver (%s:%d): sp=%p pc=%08lx pr=%08lx\n", - current->comm, current->pid, frame, regs->pc, regs->pr); + current->comm, task_pid_nr(current), frame, regs->pc, regs->pr); flush_cache_sigtramp(regs->pr); @@ -462,7 +462,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, set_fs(USER_DS); pr_debug("SIG deliver (%s:%d): sp=%p pc=%08lx pr=%08lx\n", - current->comm, current->pid, frame, regs->pc, regs->pr); + current->comm, task_pid_nr(current), frame, regs->pc, regs->pr); flush_cache_sigtramp(regs->pr); diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index dcb46e71da1c..cf99111cb33f 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -95,8 +95,8 @@ void die(const char * str, struct pt_regs * regs, long err) print_modules(); show_regs(regs); - printk("Process: %s (pid: %d, stack limit = %p)\n", - current->comm, current->pid, task_stack_page(current) + 1); + printk("Process: %s (pid: %d, stack limit = %p)\n", current->comm, + task_pid_nr(current), task_stack_page(current) + 1); if (!user_mode(regs) || in_interrupt()) dump_mem("Stack: ", regs->regs[15], THREAD_SIZE + @@ -386,7 +386,8 @@ static int handle_unaligned_access(u16 instruction, struct pt_regs *regs) printk(KERN_NOTICE "Fixing up unaligned userspace access " "in \"%s\" pid=%d pc=0x%p ins=0x%04hx\n", - current->comm,current->pid,(u16*)regs->pc,instruction); + current->comm, task_pid_nr(current), + (u16 *)regs->pc, instruction); } ret = -EFAULT; diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 4729668ce5bf..f33cedb353fc 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -207,7 +207,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/sh/oprofile/Kconfig b/arch/sh/oprofile/Kconfig deleted file mode 100644 index 5ade19801b97..000000000000 --- a/arch/sh/oprofile/Kconfig +++ /dev/null @@ -1,23 +0,0 @@ - -menu "Profiling support" - depends on EXPERIMENTAL - -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - -endmenu - diff --git a/arch/sh64/Kconfig b/arch/sh64/Kconfig index b3327ce8e82f..ba204bac49df 100644 --- a/arch/sh64/Kconfig +++ b/arch/sh64/Kconfig @@ -284,7 +284,7 @@ source "drivers/Kconfig" source "fs/Kconfig" -source "arch/sh64/oprofile/Kconfig" +source "kernel/Kconfig.instrumentation" source "arch/sh64/Kconfig.debug" diff --git a/arch/sh64/kernel/traps.c b/arch/sh64/kernel/traps.c index 9d0d58fb29fa..c03101fab467 100644 --- a/arch/sh64/kernel/traps.c +++ b/arch/sh64/kernel/traps.c @@ -764,7 +764,7 @@ static int misaligned_fixup(struct pt_regs *regs) --user_mode_unaligned_fixup_count; /* Only do 'count' worth of these reports, to remove a potential DoS against syslog */ printk("Fixing up unaligned userspace access in \"%s\" pid=%d pc=0x%08x ins=0x%08lx\n", - current->comm, current->pid, (__u32)regs->pc, opcode); + current->comm, task_pid_nr(current), (__u32)regs->pc, opcode); } else #endif if (!user_mode(regs) && (kernel_mode_unaligned_fixup_count > 0)) { @@ -774,7 +774,7 @@ static int misaligned_fixup(struct pt_regs *regs) (__u32)regs->pc, opcode); } else { printk("Fixing up unaligned kernelspace access in \"%s\" pid=%d pc=0x%08x ins=0x%08lx\n", - current->comm, current->pid, (__u32)regs->pc, opcode); + current->comm, task_pid_nr(current), (__u32)regs->pc, opcode); } } diff --git a/arch/sh64/mm/fault.c b/arch/sh64/mm/fault.c index dd81c669c79b..7c79a1ba8059 100644 --- a/arch/sh64/mm/fault.c +++ b/arch/sh64/mm/fault.c @@ -81,7 +81,7 @@ static inline void print_vma(struct vm_area_struct *vma) static inline void print_task(struct task_struct *tsk) { - printk("Task pid %d\n", tsk->pid); + printk("Task pid %d\n", task_pid_nr(tsk)); } static pte_t *lookup_pte(struct mm_struct *mm, unsigned long address) @@ -272,13 +272,13 @@ bad_area: * usermode, so only need a few */ count++; printk("user mode bad_area address=%08lx pid=%d (%s) pc=%08lx\n", - address, current->pid, current->comm, + address, task_pid_nr(current), current->comm, (unsigned long) regs->pc); #if 0 show_regs(regs); #endif } - if (is_init(tsk)) { + if (is_global_init(tsk)) { panic("INIT had user mode bad_area\n"); } tsk->thread.address = address; @@ -320,14 +320,14 @@ no_context: * us unable to handle the page fault gracefully. */ out_of_memory: - if (is_init(current)) { + if (is_global_init(current)) { panic("INIT out of memory\n"); yield(); goto survive; } printk("fault:Out of memory\n"); up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/sh64/oprofile/Kconfig b/arch/sh64/oprofile/Kconfig deleted file mode 100644 index 19d37730b664..000000000000 --- a/arch/sh64/oprofile/Kconfig +++ /dev/null @@ -1,23 +0,0 @@ - -menu "Profiling support" - depends on EXPERIMENTAL - -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - -endmenu - diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index c0f4ba109daa..527adc808ad6 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -320,11 +320,7 @@ endmenu source "fs/Kconfig" -menu "Instrumentation Support" - -source "arch/sparc/oprofile/Kconfig" - -endmenu +source "kernel/Kconfig.instrumentation" source "arch/sparc/Kconfig.debug" diff --git a/arch/sparc/kernel/ptrace.c b/arch/sparc/kernel/ptrace.c index 003f8eed32f4..fe562db475e9 100644 --- a/arch/sparc/kernel/ptrace.c +++ b/arch/sparc/kernel/ptrace.c @@ -155,7 +155,7 @@ static inline void read_sunos_user(struct pt_regs *regs, unsigned long offset, /* Rest of them are completely unsupported. */ default: printk("%s [%d]: Wants to read user offset %ld\n", - current->comm, current->pid, offset); + current->comm, task_pid_nr(current), offset); pt_error_return(regs, EIO); return; } @@ -222,7 +222,7 @@ static inline void write_sunos_user(struct pt_regs *regs, unsigned long offset, /* Rest of them are completely unsupported or "no-touch". */ default: printk("%s [%d]: Wants to write user offset %ld\n", - current->comm, current->pid, offset); + current->comm, task_pid_nr(current), offset); goto failure; } success: diff --git a/arch/sparc/kernel/sys_sparc.c b/arch/sparc/kernel/sys_sparc.c index 6c0221e9a9f5..42bf09db9a81 100644 --- a/arch/sparc/kernel/sys_sparc.c +++ b/arch/sparc/kernel/sys_sparc.c @@ -357,7 +357,7 @@ c_sys_nis_syscall (struct pt_regs *regs) if (count++ > 5) return -ENOSYS; printk ("%s[%d]: Unimplemented SPARC system call %d\n", - current->comm, current->pid, (int)regs->u_regs[1]); + current->comm, task_pid_nr(current), (int)regs->u_regs[1]); #ifdef DEBUG_UNIMP_SYSCALL show_regs (regs); #endif diff --git a/arch/sparc/kernel/sys_sunos.c b/arch/sparc/kernel/sys_sunos.c index f807172cab0e..28c187c5d9fd 100644 --- a/arch/sparc/kernel/sys_sunos.c +++ b/arch/sparc/kernel/sys_sunos.c @@ -866,7 +866,7 @@ asmlinkage int sunos_killpg(int pgrp, int sig) rcu_read_lock(); ret = -EINVAL; if (pgrp > 0) - ret = kill_pgrp(find_pid(pgrp), sig, 0); + ret = kill_pgrp(find_vpid(pgrp), sig, 0); rcu_read_unlock(); return ret; diff --git a/arch/sparc/kernel/traps.c b/arch/sparc/kernel/traps.c index 3bc3bff51e08..d404e7994527 100644 --- a/arch/sparc/kernel/traps.c +++ b/arch/sparc/kernel/traps.c @@ -38,7 +38,7 @@ struct trap_trace_entry trapbuf[1024]; void syscall_trace_entry(struct pt_regs *regs) { - printk("%s[%d]: ", current->comm, current->pid); + printk("%s[%d]: ", current->comm, task_pid_nr(current)); printk("scall<%d> (could be %d)\n", (int) regs->u_regs[UREG_G1], (int) regs->u_regs[UREG_I0]); } @@ -99,7 +99,7 @@ void die_if_kernel(char *str, struct pt_regs *regs) " /_| \\__/ |_\\\n" " \\__U_/\n"); - printk("%s(%d): %s [#%d]\n", current->comm, current->pid, str, ++die_counter); + printk("%s(%d): %s [#%d]\n", current->comm, task_pid_nr(current), str, ++die_counter); show_regs(regs); add_taint(TAINT_DIE); diff --git a/arch/sparc/oprofile/Kconfig b/arch/sparc/oprofile/Kconfig deleted file mode 100644 index d8a84088471a..000000000000 --- a/arch/sparc/oprofile/Kconfig +++ /dev/null @@ -1,17 +0,0 @@ -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 59c4d752d286..c7a74e376985 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -460,20 +460,7 @@ source "drivers/fc4/Kconfig" source "fs/Kconfig" -menu "Instrumentation Support" - -source "arch/sparc64/oprofile/Kconfig" - -config KPROBES - bool "Kprobes (EXPERIMENTAL)" - depends on KALLSYMS && EXPERIMENTAL && MODULES - help - Kprobes allows you to trap at almost any kernel address and - execute a callback function. register_kprobe() establishes - a probepoint and specifies the callback. Kprobes is useful - for kernel debugging, non-intrusive instrumentation and testing. - If in doubt, say "N". -endmenu +source "kernel/Kconfig.instrumentation" source "arch/sparc64/Kconfig.debug" diff --git a/arch/sparc64/kernel/sys_sunos32.c b/arch/sparc64/kernel/sys_sunos32.c index 8f7a06e2c7e7..170d6ca8de6f 100644 --- a/arch/sparc64/kernel/sys_sunos32.c +++ b/arch/sparc64/kernel/sys_sunos32.c @@ -831,7 +831,7 @@ asmlinkage int sunos_killpg(int pgrp, int sig) rcu_read_lock(); ret = -EINVAL; if (pgrp > 0) - ret = kill_pgrp(find_pid(pgrp), sig, 0); + ret = kill_pgrp(find_vpid(pgrp), sig, 0); rcu_read_unlock(); return ret; diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c index 34573a55b6e5..e9c7e4f07abf 100644 --- a/arch/sparc64/kernel/traps.c +++ b/arch/sparc64/kernel/traps.c @@ -2225,7 +2225,7 @@ void die_if_kernel(char *str, struct pt_regs *regs) " /_| \\__/ |_\\\n" " \\__U_/\n"); - printk("%s(%d): %s [#%d]\n", current->comm, current->pid, str, ++die_counter); + printk("%s(%d): %s [#%d]\n", current->comm, task_pid_nr(current), str, ++die_counter); notify_die(DIE_OOPS, str, regs, 0, 255, SIGSEGV); __asm__ __volatile__("flushw"); __show_regs(regs); diff --git a/arch/sparc64/oprofile/Kconfig b/arch/sparc64/oprofile/Kconfig deleted file mode 100644 index d8a84088471a..000000000000 --- a/arch/sparc64/oprofile/Kconfig +++ /dev/null @@ -1,17 +0,0 @@ -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c index 3b67de7455f1..c86cb3091a8e 100644 --- a/arch/sparc64/solaris/misc.c +++ b/arch/sparc64/solaris/misc.c @@ -415,7 +415,7 @@ asmlinkage int solaris_procids(int cmd, s32 pid, s32 pgid) switch (cmd) { case 0: /* getpgrp */ - return process_group(current); + return task_pgrp_nr(current); case 1: /* setpgrp */ { int (*sys_setpgid)(pid_t,pid_t) = @@ -426,7 +426,7 @@ asmlinkage int solaris_procids(int cmd, s32 pid, s32 pgid) ret = sys_setpgid(0, 0); if (ret) return ret; proc_clear_tty(current); - return process_group(current); + return task_pgrp_nr(current); } case 2: /* getsid */ { diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 740d8a922e48..d8925d285573 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -289,4 +289,6 @@ config INPUT bool default n +source "kernel/Kconfig.instrumentation" + source "arch/um/Kconfig.debug" diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index bd060551e619..cb3321f8e0a9 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -108,7 +108,7 @@ out_nosemaphore: * us unable to handle the page fault gracefully. */ out_of_memory: - if (is_init(current)) { + if (is_global_init(current)) { up_read(&mm->mmap_sem); yield(); down_read(&mm->mmap_sem); diff --git a/arch/um/sys-x86_64/sysrq.c b/arch/um/sys-x86_64/sysrq.c index ce3e07fcf283..765444031819 100644 --- a/arch/um/sys-x86_64/sysrq.c +++ b/arch/um/sys-x86_64/sysrq.c @@ -15,8 +15,8 @@ void __show_regs(struct pt_regs * regs) { printk("\n"); print_modules(); - printk("Pid: %d, comm: %.20s %s %s\n", - current->pid, current->comm, print_tainted(), init_utsname()->release); + printk("Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current), + current->comm, print_tainted(), init_utsname()->release); printk("RIP: %04lx:[<%016lx>] ", PT_REGS_CS(regs) & 0xffff, PT_REGS_RIP(regs)); printk("\nRSP: %016lx EFLAGS: %08lx\n", PT_REGS_RSP(regs), diff --git a/arch/v850/Kconfig b/arch/v850/Kconfig index ace479ab273f..b6a50b8b38de 100644 --- a/arch/v850/Kconfig +++ b/arch/v850/Kconfig @@ -331,6 +331,8 @@ source "sound/Kconfig" source "drivers/usb/Kconfig" +source "kernel/Kconfig.instrumentation" + source "arch/v850/Kconfig.debug" source "security/Kconfig" diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 32e75d0731a9..72d0c56c1b48 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -47,6 +47,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!kdump_buf_page) { printk(KERN_WARNING "Kdump: Kdump buffer page not" " allocated\n"); + kunmap_atomic(vaddr, KM_PTE0); return -EFAULT; } copy_page(kdump_buf_page, vaddr); diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 3c86b979a40a..d58039e8de74 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -288,7 +288,8 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat request_resource(res, code_resource); request_resource(res, data_resource); #ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); + if (crashk_res.start != crashk_res.end) + request_resource(res, &crashk_res); #endif } } diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index e422b8159f69..57616865d8a0 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -226,7 +226,8 @@ void __init e820_reserve_resources(void) request_resource(res, &code_resource); request_resource(res, &data_resource); #ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); + if (crashk_res.start != crashk_res.end) + request_resource(res, &crashk_res); #endif } } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8459ca64bc2f..11b935f4f886 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -149,28 +149,6 @@ NORET_TYPE void machine_kexec(struct kimage *image) image->start, cpu_has_pae); } -/* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never sets it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ -static int __init parse_crashkernel(char *arg) -{ - unsigned long size, base; - size = memparse(arg, &arg); - if (*arg == '@') { - base = memparse(arg+1, &arg); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - return 0; -} -early_param("crashkernel", parse_crashkernel); - void arch_crash_save_vmcoreinfo(void) { #ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 7450b69710b5..0d8577f05422 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -231,33 +231,6 @@ NORET_TYPE void machine_kexec(struct kimage *image) image->start); } -/* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ -static int __init setup_crashkernel(char *arg) -{ - unsigned long size, base; - char *p; - if (!arg) - return -EINVAL; - size = memparse(arg, &p); - if (arg == p) - return -EINVAL; - if (*p == '@') { - base = memparse(p+1, &p); - /* FIXME: Do I want a sanity check to validate the - * memory range? Yes you do, but it's too early for - * e820 -AK */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - return 0; -} -early_param("crashkernel", setup_crashkernel); - void arch_crash_save_vmcoreinfo(void) { #ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 097aeafce5ff..044a47745a5c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -301,7 +301,7 @@ void show_regs(struct pt_regs * regs) unsigned long d0, d1, d2, d3, d6, d7; printk("\n"); - printk("Pid: %d, comm: %20s\n", current->pid, current->comm); + printk("Pid: %d, comm: %20s\n", task_pid_nr(current), current->comm); printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); print_symbol("EIP is at %s\n", regs->eip); diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index b87a6fd5ba48..978dc0196a0f 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -378,6 +378,49 @@ extern unsigned long __init setup_memory(void); extern void zone_sizes_init(void); #endif /* !CONFIG_NEED_MULTIPLE_NODES */ +static inline unsigned long long get_total_mem(void) +{ + unsigned long long total; + + total = max_low_pfn - min_low_pfn; +#ifdef CONFIG_HIGHMEM + total += highend_pfn - highstart_pfn; +#endif + + return total << PAGE_SHIFT; +} + +#ifdef CONFIG_KEXEC +static void __init reserve_crashkernel(void) +{ + unsigned long long total_mem; + unsigned long long crash_size, crash_base; + int ret; + + total_mem = get_total_mem(); + + ret = parse_crashkernel(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret == 0 && crash_size > 0) { + if (crash_base > 0) { + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + reserve_bootmem(crash_base, crash_size); + } else + printk(KERN_INFO "crashkernel reservation failed - " + "you have to specify a base address\n"); + } +} +#else +static inline void __init reserve_crashkernel(void) +{} +#endif + void __init setup_bootmem_allocator(void) { unsigned long bootmap_size; @@ -453,11 +496,7 @@ void __init setup_bootmem_allocator(void) } } #endif -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) - reserve_bootmem(crashk_res.start, - crashk_res.end - crashk_res.start + 1); -#endif + reserve_crashkernel(); } /* diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 5a19f0cc5b67..cdcba6975226 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -191,6 +191,37 @@ static inline void copy_edd(void) } #endif +#ifdef CONFIG_KEXEC +static void __init reserve_crashkernel(void) +{ + unsigned long long free_mem; + unsigned long long crash_size, crash_base; + int ret; + + free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; + + ret = parse_crashkernel(boot_command_line, free_mem, + &crash_size, &crash_base); + if (ret == 0 && crash_size) { + if (crash_base > 0) { + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(free_mem >> 20)); + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + reserve_bootmem(crash_base, crash_size); + } else + printk(KERN_INFO "crashkernel reservation failed - " + "you have to specify a base address\n"); + } +} +#else +static inline void __init reserve_crashkernel(void) +{} +#endif + #define EBDA_ADDR_POINTER 0x40E unsigned __initdata ebda_addr; @@ -357,13 +388,7 @@ void __init setup_arch(char **cmdline_p) } } #endif -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) { - reserve_bootmem_generic(crashk_res.start, - crashk_res.end - crashk_res.start + 1); - } -#endif - + reserve_crashkernel(); paging_init(); #ifdef CONFIG_PCI diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 0d79df3c5631..6dc394b87255 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -200,8 +200,8 @@ badframe: if (show_unhandled_signals && printk_ratelimit()) printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" " esp:%lx oeax:%lx\n", - current->pid > 1 ? KERN_INFO : KERN_EMERG, - current->comm, current->pid, frame, regs->eip, + task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, + current->comm, task_pid_nr(current), frame, regs->eip, regs->esp, regs->orig_eax); force_sig(SIGSEGV, current); diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index b132d3957dfc..1e9d57256eb1 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -316,7 +316,7 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n", regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", - TASK_COMM_LEN, current->comm, current->pid, + TASK_COMM_LEN, current->comm, task_pid_nr(current), current_thread_info(), current, task_thread_info(current)); /* * When in-kernel, we also print out the stack and code at the @@ -622,7 +622,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, printk_ratelimit()) printk(KERN_INFO "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", - current->comm, current->pid, + current->comm, task_pid_nr(current), regs->eip, regs->esp, error_code); force_sig(SIGSEGV, current); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 9f38b12b4af1..8bab2b2efaff 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -748,7 +748,7 @@ survive: retval = get_user_pages(current, current->mm, (unsigned long )to, 1, 1, 0, &pg, NULL); - if (retval == -ENOMEM && is_init(current)) { + if (retval == -ENOMEM && is_global_init(current)) { up_read(¤t->mm->mmap_sem); congestion_wait(WRITE, HZ/50); goto survive; diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c index 6555c3d14371..4d3e538c57ab 100644 --- a/arch/x86/mm/fault_32.c +++ b/arch/x86/mm/fault_32.c @@ -471,8 +471,8 @@ bad_area_nosemaphore: printk_ratelimit()) { printk("%s%s[%d]: segfault at %08lx eip %08lx " "esp %08lx error %lx\n", - tsk->pid > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, tsk->pid, address, regs->eip, + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, regs->eip, regs->esp, error_code); } tsk->thread.cr2 = address; @@ -587,7 +587,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(tsk)) { + if (is_global_init(tsk)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c index 5e0e54906c48..5149ac136a5d 100644 --- a/arch/x86/mm/fault_64.c +++ b/arch/x86/mm/fault_64.c @@ -554,7 +554,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); goto again; } diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 78cb68f2ebbd..d2521942e5bd 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -799,21 +799,6 @@ source "drivers/firmware/Kconfig" source fs/Kconfig -menu "Instrumentation Support" - -source "arch/x86/oprofile/Kconfig" - -config KPROBES - bool "Kprobes" - depends on KALLSYMS && MODULES - help - Kprobes allows you to trap at almost any kernel address and - execute a callback function. register_kprobe() establishes - a probepoint and specifies the callback. Kprobes is useful - for kernel debugging, non-intrusive instrumentation and testing. - If in doubt, say "N". -endmenu - source "arch/x86_64/Kconfig.debug" source "security/Kconfig" diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 7fbb44bea37f..85ffbb491490 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -251,6 +251,8 @@ config EMBEDDED_RAMDISK_IMAGE provide one yourself. endmenu +source "kernel/Kconfig.instrumentation" + source "arch/xtensa/Kconfig.debug" source "security/Kconfig" diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 8be99c777d9d..397bcd6ad08d 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -176,7 +176,7 @@ void do_unhandled(struct pt_regs *regs, unsigned long exccause) printk("Caught unhandled exception in '%s' " "(pid = %d, pc = %#010lx) - should not happen\n" "\tEXCCAUSE is %ld\n", - current->comm, current->pid, regs->pc, exccause); + current->comm, task_pid_nr(current), regs->pc, exccause); force_sig(SIGILL, current); } @@ -228,7 +228,7 @@ do_illegal_instruction(struct pt_regs *regs) /* If in user mode, send SIGILL signal to current process. */ printk("Illegal Instruction in '%s' (pid = %d, pc = %#010lx)\n", - current->comm, current->pid, regs->pc); + current->comm, task_pid_nr(current), regs->pc); force_sig(SIGILL, current); } @@ -254,7 +254,7 @@ do_unaligned_user (struct pt_regs *regs) current->thread.error_code = -3; printk("Unaligned memory access to %08lx in '%s' " "(pid = %d, pc = %#010lx)\n", - regs->excvaddr, current->comm, current->pid, regs->pc); + regs->excvaddr, current->comm, task_pid_nr(current), regs->pc); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = BUS_ADRALN; diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 2f842859948f..33f366be323f 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -145,7 +145,7 @@ bad_area: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 3935469e3662..8025d646ab30 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3367,7 +3367,7 @@ void submit_bio(int rw, struct bio *bio) if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", - current->comm, current->pid, + current->comm, task_pid_nr(current), (rw & WRITE) ? "WRITE" : "READ", (unsigned long long)bio->bi_sector, bdevname(bio->bi_bdev,b)); @@ -3739,7 +3739,7 @@ EXPORT_SYMBOL(end_dequeued_request); /** * end_request - end I/O on the current segment of the request - * @rq: the request being processed + * @req: the request being processed * @uptodate: error value or 0/1 uptodate flag * * Description: diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 2e79a3395ecf..301e832e6961 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -434,18 +434,18 @@ static int acpi_button_add(struct acpi_device *device) switch (button->type) { case ACPI_BUTTON_TYPE_POWER: case ACPI_BUTTON_TYPE_POWERF: - input->evbit[0] = BIT(EV_KEY); + input->evbit[0] = BIT_MASK(EV_KEY); set_bit(KEY_POWER, input->keybit); break; case ACPI_BUTTON_TYPE_SLEEP: case ACPI_BUTTON_TYPE_SLEEPF: - input->evbit[0] = BIT(EV_KEY); + input->evbit[0] = BIT_MASK(EV_KEY); set_bit(KEY_SLEEP, input->keybit); break; case ACPI_BUTTON_TYPE_LID: - input->evbit[0] = BIT(EV_SW); + input->evbit[0] = BIT_MASK(EV_SW); set_bit(SW_LID, input->swbit); break; } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index cb136a919f2a..ac4a0cb217ab 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -188,7 +188,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, if (signal_pending(current)) { siginfo_t info; printk(KERN_WARNING "nbd (pid %d: %s) got signal %d\n", - current->pid, current->comm, + task_pid_nr(current), current->comm, dequeue_signal_lock(current, ¤t->blocked, &info)); result = -EINTR; sock_shutdown(lo, !send); diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index d70745c84250..af0561053167 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -1107,7 +1107,7 @@ int open_for_data(struct cdrom_device_info * cdi) is the default case! */ cdinfo(CD_OPEN, "bummer. wrong media type.\n"); cdinfo(CD_WARNING, "pid %d must open device O_NONBLOCK!\n", - (unsigned int)current->pid); + (unsigned int)task_pid_nr(current)); ret=-EMEDIUMTYPE; goto clean_up_and_return; } diff --git a/drivers/char/drm/drm_bufs.c b/drivers/char/drm/drm_bufs.c index 856774fbe025..d24a6c2c2c24 100644 --- a/drivers/char/drm/drm_bufs.c +++ b/drivers/char/drm/drm_bufs.c @@ -1456,7 +1456,7 @@ int drm_freebufs(struct drm_device *dev, void *data, buf = dma->buflist[idx]; if (buf->file_priv != file_priv) { DRM_ERROR("Process %d freeing buffer not owned\n", - current->pid); + task_pid_nr(current)); return -EINVAL; } drm_free_buffer(dev, buf); diff --git a/drivers/char/drm/drm_drv.c b/drivers/char/drm/drm_drv.c index 72668b15e5ce..44a46268b02b 100644 --- a/drivers/char/drm/drm_drv.c +++ b/drivers/char/drm/drm_drv.c @@ -463,7 +463,7 @@ int drm_ioctl(struct inode *inode, struct file *filp, ++file_priv->ioctl_count; DRM_DEBUG("pid=%d, cmd=0x%02x, nr=0x%02x, dev 0x%lx, auth=%d\n", - current->pid, cmd, nr, + task_pid_nr(current), cmd, nr, (long)old_encode_dev(file_priv->head->device), file_priv->authenticated); diff --git a/drivers/char/drm/drm_fops.c b/drivers/char/drm/drm_fops.c index f383fc37190c..3992f73299cc 100644 --- a/drivers/char/drm/drm_fops.c +++ b/drivers/char/drm/drm_fops.c @@ -234,7 +234,7 @@ static int drm_open_helper(struct inode *inode, struct file *filp, if (!drm_cpu_valid()) return -EINVAL; - DRM_DEBUG("pid = %d, minor = %d\n", current->pid, minor); + DRM_DEBUG("pid = %d, minor = %d\n", task_pid_nr(current), minor); priv = drm_alloc(sizeof(*priv), DRM_MEM_FILES); if (!priv) @@ -244,7 +244,7 @@ static int drm_open_helper(struct inode *inode, struct file *filp, filp->private_data = priv; priv->filp = filp; priv->uid = current->euid; - priv->pid = current->pid; + priv->pid = task_pid_nr(current); priv->minor = minor; priv->head = drm_heads[minor]; priv->ioctl_count = 0; @@ -339,7 +339,8 @@ int drm_release(struct inode *inode, struct file *filp) */ DRM_DEBUG("pid = %d, device = 0x%lx, open_count = %d\n", - current->pid, (long)old_encode_dev(file_priv->head->device), + task_pid_nr(current), + (long)old_encode_dev(file_priv->head->device), dev->open_count); if (dev->driver->reclaim_buffers_locked && dev->lock.hw_lock) { diff --git a/drivers/char/drm/drm_lock.c b/drivers/char/drm/drm_lock.c index c6b73e744d67..bea2a7d5b2b2 100644 --- a/drivers/char/drm/drm_lock.c +++ b/drivers/char/drm/drm_lock.c @@ -58,12 +58,12 @@ int drm_lock(struct drm_device *dev, void *data, struct drm_file *file_priv) if (lock->context == DRM_KERNEL_CONTEXT) { DRM_ERROR("Process %d using kernel context %d\n", - current->pid, lock->context); + task_pid_nr(current), lock->context); return -EINVAL; } DRM_DEBUG("%d (pid %d) requests lock (0x%08x), flags = 0x%08x\n", - lock->context, current->pid, + lock->context, task_pid_nr(current), dev->lock.hw_lock->lock, lock->flags); if (drm_core_check_feature(dev, DRIVER_DMA_QUEUE)) @@ -153,7 +153,7 @@ int drm_unlock(struct drm_device *dev, void *data, struct drm_file *file_priv) if (lock->context == DRM_KERNEL_CONTEXT) { DRM_ERROR("Process %d using kernel context %d\n", - current->pid, lock->context); + task_pid_nr(current), lock->context); return -EINVAL; } diff --git a/drivers/char/drm/drm_os_linux.h b/drivers/char/drm/drm_os_linux.h index 114e54e0f61b..76e44ac94fb5 100644 --- a/drivers/char/drm/drm_os_linux.h +++ b/drivers/char/drm/drm_os_linux.h @@ -7,7 +7,7 @@ #include <linux/delay.h> /** Current process ID */ -#define DRM_CURRENTPID current->pid +#define DRM_CURRENTPID task_pid_nr(current) #define DRM_SUSER(p) capable(CAP_SYS_ADMIN) #define DRM_UDELAY(d) udelay(d) /** Read a byte from a MMIO region */ diff --git a/drivers/char/drm/i810_dma.c b/drivers/char/drm/i810_dma.c index 8e841bdee6dc..eb381a7c5bee 100644 --- a/drivers/char/drm/i810_dma.c +++ b/drivers/char/drm/i810_dma.c @@ -1024,7 +1024,7 @@ static int i810_getbuf(struct drm_device *dev, void *data, retcode = i810_dma_get_buffer(dev, d, file_priv); DRM_DEBUG("i810_dma: %d returning %d, granted = %d\n", - current->pid, retcode, d->granted); + task_pid_nr(current), retcode, d->granted); sarea_priv->last_dispatch = (int)hw_status[5]; diff --git a/drivers/char/drm/i830_dma.c b/drivers/char/drm/i830_dma.c index 43a1f78712d6..69a363edb0d2 100644 --- a/drivers/char/drm/i830_dma.c +++ b/drivers/char/drm/i830_dma.c @@ -1409,7 +1409,7 @@ static int i830_getbuf(struct drm_device *dev, void *data, retcode = i830_dma_get_buffer(dev, d, file_priv); DRM_DEBUG("i830_dma: %d returning %d, granted = %d\n", - current->pid, retcode, d->granted); + task_pid_nr(current), retcode, d->granted); sarea_priv->last_dispatch = (int)hw_status[5]; diff --git a/drivers/char/esp.c b/drivers/char/esp.c index 2e7ae42a5503..0f8fb135da53 100644 --- a/drivers/char/esp.c +++ b/drivers/char/esp.c @@ -58,10 +58,10 @@ #include <linux/mm.h> #include <linux/init.h> #include <linux/delay.h> +#include <linux/bitops.h> #include <asm/system.h> #include <asm/io.h> -#include <asm/bitops.h> #include <asm/dma.h> #include <linux/slab.h> diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index 212276affa1f..fc54d234507a 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -42,6 +42,7 @@ #include <linux/sysrq.h> #include <linux/input.h> #include <linux/reboot.h> +#include <linux/notifier.h> extern void ctrl_alt_del(void); @@ -81,7 +82,8 @@ void compute_shiftstate(void); typedef void (k_handler_fn)(struct vc_data *vc, unsigned char value, char up_flag); static k_handler_fn K_HANDLERS; -static k_handler_fn *k_handler[16] = { K_HANDLERS }; +k_handler_fn *k_handler[16] = { K_HANDLERS }; +EXPORT_SYMBOL_GPL(k_handler); #define FN_HANDLERS\ fn_null, fn_enter, fn_show_ptregs, fn_show_mem,\ @@ -127,7 +129,7 @@ int shift_state = 0; */ static struct input_handler kbd_handler; -static unsigned long key_down[NBITS(KEY_MAX)]; /* keyboard key bitmap */ +static unsigned long key_down[BITS_TO_LONGS(KEY_CNT)]; /* keyboard key bitmap */ static unsigned char shift_down[NR_SHIFT]; /* shift state counters.. */ static int dead_key_next; static int npadch = -1; /* -1 or number assembled on pad */ @@ -160,6 +162,23 @@ static int sysrq_alt_use; static int sysrq_alt; /* + * Notifier list for console keyboard events + */ +static ATOMIC_NOTIFIER_HEAD(keyboard_notifier_list); + +int register_keyboard_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&keyboard_notifier_list, nb); +} +EXPORT_SYMBOL_GPL(register_keyboard_notifier); + +int unregister_keyboard_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&keyboard_notifier_list, nb); +} +EXPORT_SYMBOL_GPL(unregister_keyboard_notifier); + +/* * Translation of scancodes to keycodes. We set them on only the first * keyboard in the list that accepts the scancode and keycode. * Explanation for not choosing the first attached keyboard anymore: @@ -1130,6 +1149,7 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) unsigned char type, raw_mode; struct tty_struct *tty; int shift_final; + struct keyboard_notifier_param param = { .vc = vc, .value = keycode, .down = down }; tty = vc->vc_tty; @@ -1217,10 +1237,11 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) return; } - shift_final = (shift_state | kbd->slockstate) ^ kbd->lockstate; + param.shift = shift_final = (shift_state | kbd->slockstate) ^ kbd->lockstate; key_map = key_maps[shift_final]; - if (!key_map) { + if (atomic_notifier_call_chain(&keyboard_notifier_list, KBD_KEYCODE, ¶m) == NOTIFY_STOP || !key_map) { + atomic_notifier_call_chain(&keyboard_notifier_list, KBD_UNBOUND_KEYCODE, ¶m); compute_shiftstate(); kbd->slockstate = 0; return; @@ -1237,6 +1258,9 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) type = KTYP(keysym); if (type < 0xf0) { + param.value = keysym; + if (atomic_notifier_call_chain(&keyboard_notifier_list, KBD_UNICODE, ¶m) == NOTIFY_STOP) + return; if (down && !raw_mode) to_utf8(vc, keysym); return; @@ -1244,9 +1268,6 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) type -= 0xf0; - if (raw_mode && type != KT_SPEC && type != KT_SHIFT) - return; - if (type == KT_LETTER) { type = KT_LATIN; if (vc_kbd_led(kbd, VC_CAPSLOCK)) { @@ -1255,9 +1276,18 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) keysym = key_map[keycode]; } } + param.value = keysym; + + if (atomic_notifier_call_chain(&keyboard_notifier_list, KBD_KEYSYM, ¶m) == NOTIFY_STOP) + return; + + if (raw_mode && type != KT_SPEC && type != KT_SHIFT) + return; (*k_handler[type])(vc, keysym & 0xff, !down); + atomic_notifier_call_chain(&keyboard_notifier_list, KBD_POST_KEYSYM, ¶m); + if (type != KT_SLOCK) kbd->slockstate = 0; } @@ -1347,12 +1377,12 @@ static void kbd_start(struct input_handle *handle) static const struct input_device_id kbd_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT, - .evbit = { BIT(EV_KEY) }, + .evbit = { BIT_MASK(EV_KEY) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT, - .evbit = { BIT(EV_SND) }, + .evbit = { BIT_MASK(EV_SND) }, }, { }, /* Terminating entry */ diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c index 661aca0e155d..fd0abef7ee08 100644 --- a/drivers/char/mxser.c +++ b/drivers/char/mxser.c @@ -56,11 +56,11 @@ #include <linux/mm.h> #include <linux/delay.h> #include <linux/pci.h> +#include <linux/bitops.h> #include <asm/system.h> #include <asm/io.h> #include <asm/irq.h> -#include <asm/bitops.h> #include <asm/uaccess.h> #include "mxser.h" diff --git a/drivers/char/mxser_new.c b/drivers/char/mxser_new.c index 854dbf59eb68..081c84c7b548 100644 --- a/drivers/char/mxser_new.c +++ b/drivers/char/mxser_new.c @@ -39,11 +39,11 @@ #include <linux/mm.h> #include <linux/delay.h> #include <linux/pci.h> +#include <linux/bitops.h> #include <asm/system.h> #include <asm/io.h> #include <asm/irq.h> -#include <asm/bitops.h> #include <asm/uaccess.h> #include "mxser_new.h" diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c index 859858561ab6..9782cb4d30dc 100644 --- a/drivers/char/sonypi.c +++ b/drivers/char/sonypi.c @@ -1178,9 +1178,9 @@ static int __devinit sonypi_create_input_devices(void) jog_dev->id.bustype = BUS_ISA; jog_dev->id.vendor = PCI_VENDOR_ID_SONY; - jog_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - jog_dev->keybit[LONG(BTN_MOUSE)] = BIT(BTN_MIDDLE); - jog_dev->relbit[0] = BIT(REL_WHEEL); + jog_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + jog_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_MIDDLE); + jog_dev->relbit[0] = BIT_MASK(REL_WHEEL); sonypi_device.input_key_dev = key_dev = input_allocate_device(); if (!key_dev) { @@ -1193,7 +1193,7 @@ static int __devinit sonypi_create_input_devices(void) key_dev->id.vendor = PCI_VENDOR_ID_SONY; /* Initialize the Input Drivers: special keys */ - key_dev->evbit[0] = BIT(EV_KEY); + key_dev->evbit[0] = BIT_MASK(EV_KEY); for (i = 0; sonypi_inputkeys[i].sonypiev; i++) if (sonypi_inputkeys[i].inputev) set_bit(sonypi_inputkeys[i].inputev, key_dev->keybit); diff --git a/drivers/char/sx.c b/drivers/char/sx.c index 85a23283dff5..a6e1c9ba1217 100644 --- a/drivers/char/sx.c +++ b/drivers/char/sx.c @@ -1467,7 +1467,7 @@ static int sx_open(struct tty_struct *tty, struct file *filp) line = tty->index; sx_dprintk(SX_DEBUG_OPEN, "%d: opening line %d. tty=%p ctty=%p, " - "np=%d)\n", current->pid, line, tty, + "np=%d)\n", task_pid_nr(current), line, tty, current->signal->tty, sx_nports); if ((line < 0) || (line >= SX_NPORTS) || (line >= sx_nports)) diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 78d14935f2b8..de60e1ea4fb3 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -251,7 +251,7 @@ static void send_sig_all(int sig) struct task_struct *p; for_each_process(p) { - if (p->mm && !is_init(p)) + if (p->mm && !is_global_init(p)) /* Not swapper, init nor kernel thread */ force_sig(sig, p); } diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 9c867cf6de64..13a53575a016 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -103,6 +103,7 @@ #include <linux/selection.h> #include <linux/kmod.h> +#include <linux/nsproxy.h> #undef TTY_DEBUG_HANGUP @@ -3107,7 +3108,7 @@ static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t */ if (tty == real_tty && current->signal->tty != real_tty) return -ENOTTY; - return put_user(pid_nr(real_tty->pgrp), p); + return put_user(pid_vnr(real_tty->pgrp), p); } /** @@ -3141,7 +3142,7 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t if (pgrp_nr < 0) return -EINVAL; rcu_read_lock(); - pgrp = find_pid(pgrp_nr); + pgrp = find_vpid(pgrp_nr); retval = -ESRCH; if (!pgrp) goto out_unlock; @@ -3178,7 +3179,7 @@ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t _ return -ENOTTY; if (!real_tty->session) return -ENOTTY; - return put_user(pid_nr(real_tty->session), p); + return put_user(pid_vnr(real_tty->session), p); } /** @@ -3528,8 +3529,8 @@ void __do_SAK(struct tty_struct *tty) /* Kill the entire session */ do_each_pid_task(session, PIDTYPE_SID, p) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): process_session(p)==tty->session\n", - p->pid, p->comm); + " (%s): task_session_nr(p)==tty->session\n", + task_pid_nr(p), p->comm); send_sig(SIGKILL, p, 1); } while_each_pid_task(session, PIDTYPE_SID, p); /* Now kill any processes that happen to have the @@ -3538,8 +3539,8 @@ void __do_SAK(struct tty_struct *tty) do_each_thread(g, p) { if (p->signal->tty == tty) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): process_session(p)==tty->session\n", - p->pid, p->comm); + " (%s): task_session_nr(p)==tty->session\n", + task_pid_nr(p), p->comm); send_sig(SIGKILL, p, 1); continue; } @@ -3559,7 +3560,7 @@ void __do_SAK(struct tty_struct *tty) filp->private_data == tty) { printk(KERN_NOTICE "SAK: killed process %d" " (%s): fd#%d opened to the tty\n", - p->pid, p->comm, i); + task_pid_nr(p), p->comm, i); force_sig(SIGKILL, p); break; } diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 1764c67b585f..7a5badfb7d84 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -99,6 +99,7 @@ #include <linux/pm.h> #include <linux/font.h> #include <linux/bitops.h> +#include <linux/notifier.h> #include <asm/io.h> #include <asm/system.h> @@ -223,6 +224,35 @@ enum { }; /* + * Notifier list for console events. + */ +static ATOMIC_NOTIFIER_HEAD(vt_notifier_list); + +int register_vt_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&vt_notifier_list, nb); +} +EXPORT_SYMBOL_GPL(register_vt_notifier); + +int unregister_vt_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&vt_notifier_list, nb); +} +EXPORT_SYMBOL_GPL(unregister_vt_notifier); + +static void notify_write(struct vc_data *vc, unsigned int unicode) +{ + struct vt_notifier_param param = { .vc = vc, unicode = unicode }; + atomic_notifier_call_chain(&vt_notifier_list, VT_WRITE, ¶m); +} + +static void notify_update(struct vc_data *vc) +{ + struct vt_notifier_param param = { .vc = vc }; + atomic_notifier_call_chain(&vt_notifier_list, VT_UPDATE, ¶m); +} + +/* * Low-Level Functions */ @@ -718,6 +748,7 @@ int vc_allocate(unsigned int currcons) /* return 0 on success */ return -ENXIO; if (!vc_cons[currcons].d) { struct vc_data *vc; + struct vt_notifier_param param; /* prevent users from taking too much memory */ if (currcons >= MAX_NR_USER_CONSOLES && !capable(CAP_SYS_RESOURCE)) @@ -729,7 +760,7 @@ int vc_allocate(unsigned int currcons) /* return 0 on success */ /* although the numbers above are not valid since long ago, the point is still up-to-date and the comment still has its value even if only as a historical artifact. --mj, July 1998 */ - vc = kzalloc(sizeof(struct vc_data), GFP_KERNEL); + param.vc = vc = kzalloc(sizeof(struct vc_data), GFP_KERNEL); if (!vc) return -ENOMEM; vc_cons[currcons].d = vc; @@ -746,6 +777,7 @@ int vc_allocate(unsigned int currcons) /* return 0 on success */ } vc->vc_kmalloced = 1; vc_init(vc, vc->vc_rows, vc->vc_cols, 1); + atomic_notifier_call_chain(&vt_notifier_list, VT_ALLOCATE, ¶m); } return 0; } @@ -907,6 +939,8 @@ void vc_deallocate(unsigned int currcons) if (vc_cons_allocated(currcons)) { struct vc_data *vc = vc_cons[currcons].d; + struct vt_notifier_param param = { .vc = vc }; + atomic_notifier_call_chain(&vt_notifier_list, VT_DEALLOCATE, ¶m); vc->vc_sw->con_deinit(vc); put_pid(vc->vt_pid); module_put(vc->vc_sw->owner); @@ -1019,6 +1053,7 @@ static void lf(struct vc_data *vc) vc->vc_pos += vc->vc_size_row; } vc->vc_need_wrap = 0; + notify_write(vc, '\n'); } static void ri(struct vc_data *vc) @@ -1039,6 +1074,7 @@ static inline void cr(struct vc_data *vc) { vc->vc_pos -= vc->vc_x << 1; vc->vc_need_wrap = vc->vc_x = 0; + notify_write(vc, '\r'); } static inline void bs(struct vc_data *vc) @@ -1047,6 +1083,7 @@ static inline void bs(struct vc_data *vc) vc->vc_pos -= 2; vc->vc_x--; vc->vc_need_wrap = 0; + notify_write(vc, '\b'); } } @@ -1593,6 +1630,7 @@ static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, int c) break; } vc->vc_pos += (vc->vc_x << 1); + notify_write(vc, '\t'); return; case 10: case 11: case 12: lf(vc); @@ -2252,6 +2290,7 @@ rescan_last_byte: tc = conv_uni_to_pc(vc, ' '); /* A space is printed in the second column */ if (tc < 0) tc = ' '; } + notify_write(vc, c); if (inverse) { FLUSH @@ -2274,6 +2313,7 @@ rescan_last_byte: release_console_sem(); out: + notify_update(vc); return n; #undef FLUSH } @@ -2317,6 +2357,7 @@ static void console_callback(struct work_struct *ignored) do_blank_screen(0); blank_timer_expired = 0; } + notify_update(vc_cons[fg_console].d); release_console_sem(); } @@ -2418,6 +2459,7 @@ static void vt_console_print(struct console *co, const char *b, unsigned count) continue; } scr_writew((vc->vc_attr << 8) + c, (unsigned short *)vc->vc_pos); + notify_write(vc, c); cnt++; if (myx == vc->vc_cols - 1) { vc->vc_need_wrap = 1; @@ -2436,6 +2478,7 @@ static void vt_console_print(struct console *co, const char *b, unsigned count) } } set_cursor(vc); + notify_update(vc); quit: clear_bit(0, &printing); diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h index e80af67664cc..2d23e304f5ec 100644 --- a/drivers/edac/edac_core.h +++ b/drivers/edac/edac_core.h @@ -94,8 +94,6 @@ extern int edac_debug_level; #endif /* !CONFIG_EDAC_DEBUG */ -#define BIT(x) (1 << (x)) - #define PCI_VEND_DEV(vend, dev) PCI_VENDOR_ID_ ## vend, \ PCI_DEVICE_ID_ ## vend ## _ ## dev diff --git a/drivers/edac/pasemi_edac.c b/drivers/edac/pasemi_edac.c index e66cdd42a392..9007d0677220 100644 --- a/drivers/edac/pasemi_edac.c +++ b/drivers/edac/pasemi_edac.c @@ -270,6 +270,7 @@ static void __devexit pasemi_edac_remove(struct pci_dev *pdev) static const struct pci_device_id pasemi_edac_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_PASEMI, 0xa00a) }, + { } }; MODULE_DEVICE_TABLE(pci, pasemi_edac_pci_tbl); diff --git a/drivers/firmware/dcdbas.h b/drivers/firmware/dcdbas.h index dcdba0f1b32c..87bc3417de27 100644 --- a/drivers/firmware/dcdbas.h +++ b/drivers/firmware/dcdbas.h @@ -17,7 +17,6 @@ #define _DCDBAS_H_ #include <linux/device.h> -#include <linux/input.h> #include <linux/sysfs.h> #include <linux/types.h> diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index a702e2f6da7d..1ca6f4635eeb 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -113,13 +113,13 @@ static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t if (count > HID_MIN_BUFFER_SIZE) { printk(KERN_WARNING "hidraw: pid %d passed too large report\n", - current->pid); + task_pid_nr(current)); return -EINVAL; } if (count < 2) { printk(KERN_WARNING "hidraw: pid %d passed too short report\n", - current->pid); + task_pid_nr(current)); return -EINVAL; } diff --git a/drivers/hid/usbhid/usbkbd.c b/drivers/hid/usbhid/usbkbd.c index b76b02f7b52d..775a1ef28a29 100644 --- a/drivers/hid/usbhid/usbkbd.c +++ b/drivers/hid/usbhid/usbkbd.c @@ -274,8 +274,11 @@ static int usb_kbd_probe(struct usb_interface *iface, input_set_drvdata(input_dev, kbd); - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_LED) | BIT(EV_REP); - input_dev->ledbit[0] = BIT(LED_NUML) | BIT(LED_CAPSL) | BIT(LED_SCROLLL) | BIT(LED_COMPOSE) | BIT(LED_KANA); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_LED) | + BIT_MASK(EV_REP); + input_dev->ledbit[0] = BIT_MASK(LED_NUML) | BIT_MASK(LED_CAPSL) | + BIT_MASK(LED_SCROLLL) | BIT_MASK(LED_COMPOSE) | + BIT_MASK(LED_KANA); for (i = 0; i < 255; i++) set_bit(usb_kbd_keycode[i], input_dev->keybit); diff --git a/drivers/hid/usbhid/usbmouse.c b/drivers/hid/usbhid/usbmouse.c index 5345c73bcf62..f8ad6910d3d9 100644 --- a/drivers/hid/usbhid/usbmouse.c +++ b/drivers/hid/usbhid/usbmouse.c @@ -173,11 +173,13 @@ static int usb_mouse_probe(struct usb_interface *intf, const struct usb_device_i usb_to_input_id(dev, &input_dev->id); input_dev->dev.parent = &intf->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - input_dev->keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | BIT(BTN_RIGHT) | BIT(BTN_MIDDLE); - input_dev->relbit[0] = BIT(REL_X) | BIT(REL_Y); - input_dev->keybit[LONG(BTN_MOUSE)] |= BIT(BTN_SIDE) | BIT(BTN_EXTRA); - input_dev->relbit[0] |= BIT(REL_WHEEL); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_MIDDLE); + input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); + input_dev->keybit[BIT_WORD(BTN_MOUSE)] |= BIT_MASK(BTN_SIDE) | + BIT_MASK(BTN_EXTRA); + input_dev->relbit[0] |= BIT_MASK(REL_WHEEL); input_set_drvdata(input_dev, mouse); diff --git a/drivers/hwmon/applesmc.c b/drivers/hwmon/applesmc.c index 4879125b4cdc..1001d2e122a2 100644 --- a/drivers/hwmon/applesmc.c +++ b/drivers/hwmon/applesmc.c @@ -1099,7 +1099,7 @@ static int applesmc_create_accelerometer(void) idev->name = "applesmc"; idev->id.bustype = BUS_HOST; idev->dev.parent = &pdev->dev; - idev->evbit[0] = BIT(EV_ABS); + idev->evbit[0] = BIT_MASK(EV_ABS); input_set_abs_params(idev, ABS_X, -256, 256, APPLESMC_INPUT_FUZZ, APPLESMC_INPUT_FLAT); input_set_abs_params(idev, ABS_Y, diff --git a/drivers/hwmon/hdaps.c b/drivers/hwmon/hdaps.c index 8a7ae03aeee4..bab5fd2e4dfd 100644 --- a/drivers/hwmon/hdaps.c +++ b/drivers/hwmon/hdaps.c @@ -574,7 +574,7 @@ static int __init hdaps_init(void) idev = hdaps_idev->input; idev->name = "hdaps"; idev->dev.parent = &pdev->dev; - idev->evbit[0] = BIT(EV_ABS); + idev->evbit[0] = BIT_MASK(EV_ABS); input_set_abs_params(idev, ABS_X, -256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT); input_set_abs_params(idev, ABS_Y, diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c index 00fad11733ad..6426a61f8d4d 100644 --- a/drivers/i2c/busses/i2c-pxa.c +++ b/drivers/i2c/busses/i2c-pxa.c @@ -85,7 +85,7 @@ struct bits { const char *set; const char *unset; }; -#define BIT(m, s, u) { .mask = m, .set = s, .unset = u } +#define PXA_BIT(m, s, u) { .mask = m, .set = s, .unset = u } static inline void decode_bits(const char *prefix, const struct bits *bits, int num, u32 val) @@ -100,17 +100,17 @@ decode_bits(const char *prefix, const struct bits *bits, int num, u32 val) } static const struct bits isr_bits[] = { - BIT(ISR_RWM, "RX", "TX"), - BIT(ISR_ACKNAK, "NAK", "ACK"), - BIT(ISR_UB, "Bsy", "Rdy"), - BIT(ISR_IBB, "BusBsy", "BusRdy"), - BIT(ISR_SSD, "SlaveStop", NULL), - BIT(ISR_ALD, "ALD", NULL), - BIT(ISR_ITE, "TxEmpty", NULL), - BIT(ISR_IRF, "RxFull", NULL), - BIT(ISR_GCAD, "GenCall", NULL), - BIT(ISR_SAD, "SlaveAddr", NULL), - BIT(ISR_BED, "BusErr", NULL), + PXA_BIT(ISR_RWM, "RX", "TX"), + PXA_BIT(ISR_ACKNAK, "NAK", "ACK"), + PXA_BIT(ISR_UB, "Bsy", "Rdy"), + PXA_BIT(ISR_IBB, "BusBsy", "BusRdy"), + PXA_BIT(ISR_SSD, "SlaveStop", NULL), + PXA_BIT(ISR_ALD, "ALD", NULL), + PXA_BIT(ISR_ITE, "TxEmpty", NULL), + PXA_BIT(ISR_IRF, "RxFull", NULL), + PXA_BIT(ISR_GCAD, "GenCall", NULL), + PXA_BIT(ISR_SAD, "SlaveAddr", NULL), + PXA_BIT(ISR_BED, "BusErr", NULL), }; static void decode_ISR(unsigned int val) @@ -120,21 +120,21 @@ static void decode_ISR(unsigned int val) } static const struct bits icr_bits[] = { - BIT(ICR_START, "START", NULL), - BIT(ICR_STOP, "STOP", NULL), - BIT(ICR_ACKNAK, "ACKNAK", NULL), - BIT(ICR_TB, "TB", NULL), - BIT(ICR_MA, "MA", NULL), - BIT(ICR_SCLE, "SCLE", "scle"), - BIT(ICR_IUE, "IUE", "iue"), - BIT(ICR_GCD, "GCD", NULL), - BIT(ICR_ITEIE, "ITEIE", NULL), - BIT(ICR_IRFIE, "IRFIE", NULL), - BIT(ICR_BEIE, "BEIE", NULL), - BIT(ICR_SSDIE, "SSDIE", NULL), - BIT(ICR_ALDIE, "ALDIE", NULL), - BIT(ICR_SADIE, "SADIE", NULL), - BIT(ICR_UR, "UR", "ur"), + PXA_BIT(ICR_START, "START", NULL), + PXA_BIT(ICR_STOP, "STOP", NULL), + PXA_BIT(ICR_ACKNAK, "ACKNAK", NULL), + PXA_BIT(ICR_TB, "TB", NULL), + PXA_BIT(ICR_MA, "MA", NULL), + PXA_BIT(ICR_SCLE, "SCLE", "scle"), + PXA_BIT(ICR_IUE, "IUE", "iue"), + PXA_BIT(ICR_GCD, "GCD", NULL), + PXA_BIT(ICR_ITEIE, "ITEIE", NULL), + PXA_BIT(ICR_IRFIE, "IRFIE", NULL), + PXA_BIT(ICR_BEIE, "BEIE", NULL), + PXA_BIT(ICR_SSDIE, "SSDIE", NULL), + PXA_BIT(ICR_ALDIE, "ALDIE", NULL), + PXA_BIT(ICR_SADIE, "SADIE", NULL), + PXA_BIT(ICR_UR, "UR", "ur"), }; static void decode_ICR(unsigned int val) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 5c8b008676fb..32eaa3f80515 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -47,12 +47,12 @@ #include <linux/device.h> #include <linux/kmod.h> #include <linux/scatterlist.h> +#include <linux/bitops.h> #include <asm/byteorder.h> #include <asm/irq.h> #include <asm/uaccess.h> #include <asm/io.h> -#include <asm/bitops.h> static int __ide_end_request(ide_drive_t *drive, struct request *rq, int uptodate, unsigned int nr_bytes) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 93644f82592c..d08fb30768bc 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2797,11 +2797,12 @@ static void cma_remove_one(struct ib_device *device) static int cma_init(void) { - int ret, low, high; + int ret, low, high, remaining; get_random_bytes(&next_port, sizeof next_port); inet_get_local_port_range(&low, &high); - next_port = ((unsigned int) next_port % (high - low)) + low; + remaining = (high - low) + 1; + next_port = ((unsigned int) next_port % remaining) + low; cma_wq = create_singlethread_workqueue("rdma_cm"); if (!cma_wq) diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 1d62c8b88e12..e5b4e9bfbdc5 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -495,7 +495,7 @@ static unsigned int evdev_poll(struct file *file, poll_table *wait) #ifdef CONFIG_COMPAT #define BITS_PER_LONG_COMPAT (sizeof(compat_long_t) * 8) -#define NBITS_COMPAT(x) ((((x) - 1) / BITS_PER_LONG_COMPAT) + 1) +#define BITS_TO_LONGS_COMPAT(x) ((((x) - 1) / BITS_PER_LONG_COMPAT) + 1) #ifdef __BIG_ENDIAN static int bits_to_user(unsigned long *bits, unsigned int maxbit, @@ -504,7 +504,7 @@ static int bits_to_user(unsigned long *bits, unsigned int maxbit, int len, i; if (compat) { - len = NBITS_COMPAT(maxbit) * sizeof(compat_long_t); + len = BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t); if (len > maxlen) len = maxlen; @@ -515,7 +515,7 @@ static int bits_to_user(unsigned long *bits, unsigned int maxbit, sizeof(compat_long_t))) return -EFAULT; } else { - len = NBITS(maxbit) * sizeof(long); + len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; @@ -530,8 +530,8 @@ static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len = compat ? - NBITS_COMPAT(maxbit) * sizeof(compat_long_t) : - NBITS(maxbit) * sizeof(long); + BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t) : + BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; @@ -545,7 +545,7 @@ static int bits_to_user(unsigned long *bits, unsigned int maxbit, static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { - int len = NBITS(maxbit) * sizeof(long); + int len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; diff --git a/drivers/input/input.c b/drivers/input/input.c index 2f2b020cd629..307c7b5c2b33 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -584,10 +584,10 @@ static int input_default_setkeycode(struct input_dev *dev, #define MATCH_BIT(bit, max) \ - for (i = 0; i < NBITS(max); i++) \ + for (i = 0; i < BITS_TO_LONGS(max); i++) \ if ((id->bit[i] & dev->bit[i]) != id->bit[i]) \ break; \ - if (i != NBITS(max)) \ + if (i != BITS_TO_LONGS(max)) \ continue; static const struct input_device_id *input_match_device(const struct input_device_id *id, @@ -698,7 +698,7 @@ static void input_seq_print_bitmap(struct seq_file *seq, const char *name, { int i; - for (i = NBITS(max) - 1; i > 0; i--) + for (i = BITS_TO_LONGS(max) - 1; i > 0; i--) if (bitmap[i]) break; @@ -892,7 +892,7 @@ static int input_print_modalias_bits(char *buf, int size, len += snprintf(buf, max(size, 0), "%c", name); for (i = min_bit; i < max_bit; i++) - if (bm[LONG(i)] & BIT(i)) + if (bm[BIT_WORD(i)] & BIT_MASK(i)) len += snprintf(buf + len, max(size - len, 0), "%X,", i); return len; } @@ -991,7 +991,7 @@ static int input_print_bitmap(char *buf, int buf_size, unsigned long *bitmap, int i; int len = 0; - for (i = NBITS(max) - 1; i > 0; i--) + for (i = BITS_TO_LONGS(max) - 1; i > 0; i--) if (bitmap[i]) break; diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c index 2b201f9aa024..22b2789ef58a 100644 --- a/drivers/input/joydev.c +++ b/drivers/input/joydev.c @@ -844,8 +844,8 @@ static const struct input_device_id joydev_blacklist[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT(EV_KEY) }, - .keybit = { [LONG(BTN_TOUCH)] = BIT(BTN_TOUCH) }, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, }, /* Avoid itouchpads, touchscreens and tablets */ { } /* Terminating entry */ }; @@ -854,20 +854,20 @@ static const struct input_device_id joydev_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, - .evbit = { BIT(EV_ABS) }, - .absbit = { BIT(ABS_X) }, + .evbit = { BIT_MASK(EV_ABS) }, + .absbit = { BIT_MASK(ABS_X) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, - .evbit = { BIT(EV_ABS) }, - .absbit = { BIT(ABS_WHEEL) }, + .evbit = { BIT_MASK(EV_ABS) }, + .absbit = { BIT_MASK(ABS_WHEEL) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, - .evbit = { BIT(EV_ABS) }, - .absbit = { BIT(ABS_THROTTLE) }, + .evbit = { BIT_MASK(EV_ABS) }, + .absbit = { BIT_MASK(ABS_THROTTLE) }, }, { } /* Terminating entry */ }; diff --git a/drivers/input/joystick/a3d.c b/drivers/input/joystick/a3d.c index ff701ab10d74..52ba16f487c7 100644 --- a/drivers/input/joystick/a3d.c +++ b/drivers/input/joystick/a3d.c @@ -326,14 +326,19 @@ static int a3d_connect(struct gameport *gameport, struct gameport_driver *drv) a3d->length = 33; - input_dev->evbit[0] |= BIT(EV_ABS) | BIT(EV_KEY) | BIT(EV_REL); - input_dev->relbit[0] |= BIT(REL_X) | BIT(REL_Y); - input_dev->absbit[0] |= BIT(ABS_X) | BIT(ABS_Y) | BIT(ABS_THROTTLE) | BIT(ABS_RUDDER) - | BIT(ABS_HAT0X) | BIT(ABS_HAT0Y) | BIT(ABS_HAT1X) | BIT(ABS_HAT1Y); - input_dev->keybit[LONG(BTN_MOUSE)] |= BIT(BTN_RIGHT) | BIT(BTN_LEFT) | BIT(BTN_MIDDLE) - | BIT(BTN_SIDE) | BIT(BTN_EXTRA); - input_dev->keybit[LONG(BTN_JOYSTICK)] |= BIT(BTN_TRIGGER) | BIT(BTN_THUMB) | BIT(BTN_TOP) - | BIT(BTN_PINKIE); + input_dev->evbit[0] |= BIT_MASK(EV_ABS) | BIT_MASK(EV_KEY) | + BIT_MASK(EV_REL); + input_dev->relbit[0] |= BIT_MASK(REL_X) | BIT_MASK(REL_Y); + input_dev->absbit[0] |= BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) | + BIT_MASK(ABS_THROTTLE) | BIT_MASK(ABS_RUDDER) | + BIT_MASK(ABS_HAT0X) | BIT_MASK(ABS_HAT0Y) | + BIT_MASK(ABS_HAT1X) | BIT_MASK(ABS_HAT1Y); + input_dev->keybit[BIT_WORD(BTN_MOUSE)] |= BIT_MASK(BTN_RIGHT) | + BIT_MASK(BTN_LEFT) | BIT_MASK(BTN_MIDDLE) | + BIT_MASK(BTN_SIDE) | BIT_MASK(BTN_EXTRA); + input_dev->keybit[BIT_WORD(BTN_JOYSTICK)] |= + BIT_MASK(BTN_TRIGGER) | BIT_MASK(BTN_THUMB) | + BIT_MASK(BTN_TOP) | BIT_MASK(BTN_PINKIE); a3d_read(a3d, data); @@ -348,9 +353,10 @@ static int a3d_connect(struct gameport *gameport, struct gameport_driver *drv) } else { a3d->length = 29; - input_dev->evbit[0] |= BIT(EV_KEY) | BIT(EV_REL); - input_dev->relbit[0] |= BIT(REL_X) | BIT(REL_Y); - input_dev->keybit[LONG(BTN_MOUSE)] |= BIT(BTN_RIGHT) | BIT(BTN_LEFT) | BIT(BTN_MIDDLE); + input_dev->evbit[0] |= BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + input_dev->relbit[0] |= BIT_MASK(REL_X) | BIT_MASK(REL_Y); + input_dev->keybit[BIT_WORD(BTN_MOUSE)] |= BIT_MASK(BTN_RIGHT) | + BIT_MASK(BTN_LEFT) | BIT_MASK(BTN_MIDDLE); a3d_read(a3d, data); diff --git a/drivers/input/joystick/adi.c b/drivers/input/joystick/adi.c index 28140c4a110d..d1ca8a14950f 100644 --- a/drivers/input/joystick/adi.c +++ b/drivers/input/joystick/adi.c @@ -431,7 +431,7 @@ static int adi_init_input(struct adi *adi, struct adi_port *port, int half) input_dev->open = adi_open; input_dev->close = adi_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (i = 0; i < adi->axes10 + adi->axes8 + (adi->hats + (adi->pad != -1)) * 2; i++) set_bit(adi->abs[i], input_dev->absbit); diff --git a/drivers/input/joystick/amijoy.c b/drivers/input/joystick/amijoy.c index b0f5541ec3e6..5cf9f3610e67 100644 --- a/drivers/input/joystick/amijoy.c +++ b/drivers/input/joystick/amijoy.c @@ -137,9 +137,10 @@ static int __init amijoy_init(void) amijoy_dev[i]->open = amijoy_open; amijoy_dev[i]->close = amijoy_close; - amijoy_dev[i]->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - amijoy_dev[i]->absbit[0] = BIT(ABS_X) | BIT(ABS_Y); - amijoy_dev[i]->keybit[LONG(BTN_LEFT)] = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT); + amijoy_dev[i]->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + amijoy_dev[i]->absbit[0] = BIT_MASK(ABS_X) | BIT_MASK(ABS_Y); + amijoy_dev[i]->keybit[BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); for (j = 0; j < 2; j++) { amijoy_dev[i]->absmin[ABS_X + j] = -1; amijoy_dev[i]->absmax[ABS_X + j] = 1; diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index bdd157c1ebf8..15739880afc6 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -456,7 +456,7 @@ static int analog_init_device(struct analog_port *port, struct analog *analog, i input_dev->open = analog_open; input_dev->close = analog_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (i = j = 0; i < 4; i++) if (analog->mask & (1 << i)) { diff --git a/drivers/input/joystick/cobra.c b/drivers/input/joystick/cobra.c index d3352a849b85..55646a6d89f5 100644 --- a/drivers/input/joystick/cobra.c +++ b/drivers/input/joystick/cobra.c @@ -218,7 +218,7 @@ static int cobra_connect(struct gameport *gameport, struct gameport_driver *drv) input_dev->open = cobra_open; input_dev->close = cobra_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); input_set_abs_params(input_dev, ABS_X, -1, 1, 0, 0); input_set_abs_params(input_dev, ABS_Y, -1, 1, 0, 0); for (j = 0; cobra_btn[j]; j++) diff --git a/drivers/input/joystick/db9.c b/drivers/input/joystick/db9.c index b069ee18e353..27fc475bd3a1 100644 --- a/drivers/input/joystick/db9.c +++ b/drivers/input/joystick/db9.c @@ -631,7 +631,7 @@ static struct db9 __init *db9_probe(int parport, int mode) input_dev->open = db9_open; input_dev->close = db9_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (j = 0; j < db9_mode->n_buttons; j++) set_bit(db9_mode->buttons[j], input_dev->keybit); for (j = 0; j < db9_mode->n_axis; j++) { diff --git a/drivers/input/joystick/gamecon.c b/drivers/input/joystick/gamecon.c index 1a452e0e5f25..df2a9d02ca6c 100644 --- a/drivers/input/joystick/gamecon.c +++ b/drivers/input/joystick/gamecon.c @@ -653,12 +653,12 @@ static int __init gc_setup_pad(struct gc *gc, int idx, int pad_type) input_dev->close = gc_close; if (pad_type != GC_SNESMOUSE) { - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (i = 0; i < 2; i++) input_set_abs_params(input_dev, ABS_X + i, -1, 1, 0, 0); } else - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); gc->pads[0] |= gc_status_bit[idx]; gc->pads[pad_type] |= gc_status_bit[idx]; diff --git a/drivers/input/joystick/gf2k.c b/drivers/input/joystick/gf2k.c index d514aebf7554..1f6302c0eb3f 100644 --- a/drivers/input/joystick/gf2k.c +++ b/drivers/input/joystick/gf2k.c @@ -315,7 +315,7 @@ static int gf2k_connect(struct gameport *gameport, struct gameport_driver *drv) input_dev->open = gf2k_open; input_dev->close = gf2k_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (i = 0; i < gf2k_axes[gf2k->id]; i++) set_bit(gf2k_abs[i], input_dev->absbit); diff --git a/drivers/input/joystick/grip.c b/drivers/input/joystick/grip.c index 73eb5ab6f140..fd3853ab1aad 100644 --- a/drivers/input/joystick/grip.c +++ b/drivers/input/joystick/grip.c @@ -370,7 +370,7 @@ static int grip_connect(struct gameport *gameport, struct gameport_driver *drv) input_dev->open = grip_open; input_dev->close = grip_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (j = 0; (t = grip_abs[grip->mode[i]][j]) >= 0; j++) { diff --git a/drivers/input/joystick/grip_mp.c b/drivers/input/joystick/grip_mp.c index 4ed3a3eadf19..c57e21d68c00 100644 --- a/drivers/input/joystick/grip_mp.c +++ b/drivers/input/joystick/grip_mp.c @@ -606,7 +606,7 @@ static int register_slot(int slot, struct grip_mp *grip) input_dev->open = grip_open; input_dev->close = grip_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (j = 0; (t = grip_abs[port->mode][j]) >= 0; j++) input_set_abs_params(input_dev, t, -1, 1, 0, 0); diff --git a/drivers/input/joystick/guillemot.c b/drivers/input/joystick/guillemot.c index d4e8073caf27..aa6bfb3fb8cd 100644 --- a/drivers/input/joystick/guillemot.c +++ b/drivers/input/joystick/guillemot.c @@ -238,7 +238,7 @@ static int guillemot_connect(struct gameport *gameport, struct gameport_driver * input_dev->open = guillemot_open; input_dev->close = guillemot_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (i = 0; (t = guillemot->type->abs[i]) >= 0; i++) input_set_abs_params(input_dev, t, 0, 255, 0, 0); diff --git a/drivers/input/joystick/iforce/iforce-main.c b/drivers/input/joystick/iforce/iforce-main.c index 682244b1c042..6f826b37d9aa 100644 --- a/drivers/input/joystick/iforce/iforce-main.c +++ b/drivers/input/joystick/iforce/iforce-main.c @@ -389,7 +389,8 @@ int iforce_init_device(struct iforce *iforce) * Set input device bitfields and ranges. */ - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS) | BIT(EV_FF_STATUS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) | + BIT_MASK(EV_FF_STATUS); for (i = 0; iforce->type->btn[i] >= 0; i++) set_bit(iforce->type->btn[i], input_dev->keybit); diff --git a/drivers/input/joystick/iforce/iforce.h b/drivers/input/joystick/iforce/iforce.h index 40a853ac21c7..a964a7cfd210 100644 --- a/drivers/input/joystick/iforce/iforce.h +++ b/drivers/input/joystick/iforce/iforce.h @@ -62,13 +62,13 @@ #define FF_CORE_IS_PLAYED 3 /* Effect is currently being played */ #define FF_CORE_SHOULD_PLAY 4 /* User wants the effect to be played */ #define FF_CORE_UPDATE 5 /* Effect is being updated */ -#define FF_MODCORE_MAX 5 +#define FF_MODCORE_CNT 6 struct iforce_core_effect { /* Information about where modifiers are stored in the device's memory */ struct resource mod1_chunk; struct resource mod2_chunk; - unsigned long flags[NBITS(FF_MODCORE_MAX)]; + unsigned long flags[BITS_TO_LONGS(FF_MODCORE_CNT)]; }; #define FF_CMD_EFFECT 0x010e diff --git a/drivers/input/joystick/interact.c b/drivers/input/joystick/interact.c index 1aec1e9d7c59..bc8ea95dfd0e 100644 --- a/drivers/input/joystick/interact.c +++ b/drivers/input/joystick/interact.c @@ -269,7 +269,7 @@ static int interact_connect(struct gameport *gameport, struct gameport_driver *d input_dev->open = interact_open; input_dev->close = interact_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (i = 0; (t = interact_type[interact->type].abs[i]) >= 0; i++) { set_bit(t, input_dev->absbit); diff --git a/drivers/input/joystick/magellan.c b/drivers/input/joystick/magellan.c index b35604ee43ae..54e676948ebb 100644 --- a/drivers/input/joystick/magellan.c +++ b/drivers/input/joystick/magellan.c @@ -170,7 +170,7 @@ static int magellan_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (i = 0; i < 9; i++) set_bit(magellan_buttons[i], input_dev->keybit); diff --git a/drivers/input/joystick/sidewinder.c b/drivers/input/joystick/sidewinder.c index 2adf73f63c94..7b4865fdee54 100644 --- a/drivers/input/joystick/sidewinder.c +++ b/drivers/input/joystick/sidewinder.c @@ -758,7 +758,7 @@ static int sw_connect(struct gameport *gameport, struct gameport_driver *drv) input_dev->open = sw_open; input_dev->close = sw_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (j = 0; (bits = sw_bit[sw->type][j]); j++) { code = sw_abs[sw->type][j]; diff --git a/drivers/input/joystick/spaceball.c b/drivers/input/joystick/spaceball.c index abb7c4cf54ad..d4087fd49656 100644 --- a/drivers/input/joystick/spaceball.c +++ b/drivers/input/joystick/spaceball.c @@ -228,18 +228,23 @@ static int spaceball_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); switch (id) { case SPACEBALL_4000FLX: case SPACEBALL_4000FLX_L: - input_dev->keybit[LONG(BTN_0)] |= BIT(BTN_9); - input_dev->keybit[LONG(BTN_A)] |= BIT(BTN_A) | BIT(BTN_B) | BIT(BTN_C) | BIT(BTN_MODE); + input_dev->keybit[BIT_WORD(BTN_0)] |= BIT_MASK(BTN_9); + input_dev->keybit[BIT_WORD(BTN_A)] |= BIT_MASK(BTN_A) | + BIT_MASK(BTN_B) | BIT_MASK(BTN_C) | + BIT_MASK(BTN_MODE); default: - input_dev->keybit[LONG(BTN_0)] |= BIT(BTN_2) | BIT(BTN_3) | BIT(BTN_4) - | BIT(BTN_5) | BIT(BTN_6) | BIT(BTN_7) | BIT(BTN_8); + input_dev->keybit[BIT_WORD(BTN_0)] |= BIT_MASK(BTN_2) | + BIT_MASK(BTN_3) | BIT_MASK(BTN_4) | + BIT_MASK(BTN_5) | BIT_MASK(BTN_6) | + BIT_MASK(BTN_7) | BIT_MASK(BTN_8); case SPACEBALL_3003C: - input_dev->keybit[LONG(BTN_0)] |= BIT(BTN_1) | BIT(BTN_8); + input_dev->keybit[BIT_WORD(BTN_0)] |= BIT_MASK(BTN_1) | + BIT_MASK(BTN_8); } for (i = 0; i < 3; i++) { diff --git a/drivers/input/joystick/spaceorb.c b/drivers/input/joystick/spaceorb.c index c4937f1e837c..f7ce4004f4ba 100644 --- a/drivers/input/joystick/spaceorb.c +++ b/drivers/input/joystick/spaceorb.c @@ -185,7 +185,7 @@ static int spaceorb_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (i = 0; i < 6; i++) set_bit(spaceorb_buttons[i], input_dev->keybit); diff --git a/drivers/input/joystick/stinger.c b/drivers/input/joystick/stinger.c index 8581ee991d4e..baa10b2f7ba1 100644 --- a/drivers/input/joystick/stinger.c +++ b/drivers/input/joystick/stinger.c @@ -156,10 +156,11 @@ static int stinger_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_A)] = BIT(BTN_A) | BIT(BTN_B) | BIT(BTN_C) | BIT(BTN_X) | - BIT(BTN_Y) | BIT(BTN_Z) | BIT(BTN_TL) | BIT(BTN_TR) | - BIT(BTN_START) | BIT(BTN_SELECT); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_A)] = BIT_MASK(BTN_A) | BIT_MASK(BTN_B) | + BIT_MASK(BTN_C) | BIT_MASK(BTN_X) | BIT_MASK(BTN_Y) | + BIT_MASK(BTN_Z) | BIT_MASK(BTN_TL) | BIT_MASK(BTN_TR) | + BIT_MASK(BTN_START) | BIT_MASK(BTN_SELECT); input_set_abs_params(input_dev, ABS_X, -64, 64, 0, 4); input_set_abs_params(input_dev, ABS_Y, -64, 64, 0, 4); diff --git a/drivers/input/joystick/tmdc.c b/drivers/input/joystick/tmdc.c index 3b36ee04f726..0feeb8acb532 100644 --- a/drivers/input/joystick/tmdc.c +++ b/drivers/input/joystick/tmdc.c @@ -333,7 +333,7 @@ static int tmdc_setup_port(struct tmdc *tmdc, int idx, unsigned char *data) input_dev->open = tmdc_open; input_dev->close = tmdc_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); for (i = 0; i < port->absc && i < TMDC_ABS; i++) if (port->abs[i] >= 0) diff --git a/drivers/input/joystick/turbografx.c b/drivers/input/joystick/turbografx.c index 8381c6f14373..bbebd4e2ad7f 100644 --- a/drivers/input/joystick/turbografx.c +++ b/drivers/input/joystick/turbografx.c @@ -229,7 +229,7 @@ static struct tgfx __init *tgfx_probe(int parport, int *n_buttons, int n_devs) input_dev->open = tgfx_open; input_dev->close = tgfx_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); input_set_abs_params(input_dev, ABS_X, -1, 1, 0, 0); input_set_abs_params(input_dev, ABS_Y, -1, 1, 0, 0); diff --git a/drivers/input/joystick/twidjoy.c b/drivers/input/joystick/twidjoy.c index c91504ec38eb..1085c841fec4 100644 --- a/drivers/input/joystick/twidjoy.c +++ b/drivers/input/joystick/twidjoy.c @@ -207,7 +207,7 @@ static int twidjoy_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); input_set_abs_params(input_dev, ABS_X, -50, 50, 4, 4); input_set_abs_params(input_dev, ABS_Y, -50, 50, 4, 4); diff --git a/drivers/input/joystick/warrior.c b/drivers/input/joystick/warrior.c index 4e85f72eefd7..e928b6e3724a 100644 --- a/drivers/input/joystick/warrior.c +++ b/drivers/input/joystick/warrior.c @@ -162,9 +162,11 @@ static int warrior_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_TRIGGER)] = BIT(BTN_TRIGGER) | BIT(BTN_THUMB) | BIT(BTN_TOP) | BIT(BTN_TOP2); - input_dev->relbit[0] = BIT(REL_DIAL); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) | + BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_TRIGGER)] = BIT_MASK(BTN_TRIGGER) | + BIT_MASK(BTN_THUMB) | BIT_MASK(BTN_TOP) | BIT_MASK(BTN_TOP2); + input_dev->relbit[0] = BIT_MASK(REL_DIAL); input_set_abs_params(input_dev, ABS_X, -64, 64, 0, 8); input_set_abs_params(input_dev, ABS_Y, -64, 64, 0, 8); input_set_abs_params(input_dev, ABS_THROTTLE, -112, 112, 0, 0); diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 623629a69b03..6dd375825a14 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -658,7 +658,7 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id input_dev->open = xpad_open; input_dev->close = xpad_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); /* set up buttons */ for (i = 0; xpad_btn[i] >= 0; i++) diff --git a/drivers/input/keyboard/aaed2000_kbd.c b/drivers/input/keyboard/aaed2000_kbd.c index 63d6ead6b877..72abc196ce66 100644 --- a/drivers/input/keyboard/aaed2000_kbd.c +++ b/drivers/input/keyboard/aaed2000_kbd.c @@ -125,7 +125,7 @@ static int __devinit aaedkbd_probe(struct platform_device *pdev) input_dev->id.version = 0x0100; input_dev->dev.parent = &pdev->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); input_dev->keycode = aaedkbd->keycode; input_dev->keycodesize = sizeof(unsigned char); input_dev->keycodemax = ARRAY_SIZE(aaedkbd_keycode); diff --git a/drivers/input/keyboard/amikbd.c b/drivers/input/keyboard/amikbd.c index c67e84ec2d6a..81bf7562aca0 100644 --- a/drivers/input/keyboard/amikbd.c +++ b/drivers/input/keyboard/amikbd.c @@ -209,7 +209,7 @@ static int __init amikbd_init(void) amikbd_dev->id.product = 0x0001; amikbd_dev->id.version = 0x0100; - amikbd_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); + amikbd_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); for (i = 0; i < 0x78; i++) set_bit(i, amikbd_dev->keybit); diff --git a/drivers/input/keyboard/atakbd.c b/drivers/input/keyboard/atakbd.c index a1800151b6ce..4e92100c56a8 100644 --- a/drivers/input/keyboard/atakbd.c +++ b/drivers/input/keyboard/atakbd.c @@ -237,7 +237,7 @@ static int __init atakbd_init(void) atakbd_dev->id.product = 0x0001; atakbd_dev->id.version = 0x0100; - atakbd_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); + atakbd_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); atakbd_dev->keycode = atakbd_keycode; atakbd_dev->keycodesize = sizeof(unsigned char); atakbd_dev->keycodemax = ARRAY_SIZE(atakbd_keycode); diff --git a/drivers/input/keyboard/atkbd.c b/drivers/input/keyboard/atkbd.c index 41fc3d03b6eb..b39c5b31e620 100644 --- a/drivers/input/keyboard/atkbd.c +++ b/drivers/input/keyboard/atkbd.c @@ -900,27 +900,32 @@ static void atkbd_set_device_attrs(struct atkbd *atkbd) input_set_drvdata(input_dev, atkbd); - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP) | BIT(EV_MSC); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP) | + BIT_MASK(EV_MSC); if (atkbd->write) { - input_dev->evbit[0] |= BIT(EV_LED); - input_dev->ledbit[0] = BIT(LED_NUML) | BIT(LED_CAPSL) | BIT(LED_SCROLLL); + input_dev->evbit[0] |= BIT_MASK(EV_LED); + input_dev->ledbit[0] = BIT_MASK(LED_NUML) | + BIT_MASK(LED_CAPSL) | BIT_MASK(LED_SCROLLL); } if (atkbd->extra) - input_dev->ledbit[0] |= BIT(LED_COMPOSE) | BIT(LED_SUSPEND) | - BIT(LED_SLEEP) | BIT(LED_MUTE) | BIT(LED_MISC); + input_dev->ledbit[0] |= BIT_MASK(LED_COMPOSE) | + BIT_MASK(LED_SUSPEND) | BIT_MASK(LED_SLEEP) | + BIT_MASK(LED_MUTE) | BIT_MASK(LED_MISC); if (!atkbd->softrepeat) { input_dev->rep[REP_DELAY] = 250; input_dev->rep[REP_PERIOD] = 33; } - input_dev->mscbit[0] = atkbd->softraw ? BIT(MSC_SCAN) : BIT(MSC_RAW) | BIT(MSC_SCAN); + input_dev->mscbit[0] = atkbd->softraw ? BIT_MASK(MSC_SCAN) : + BIT_MASK(MSC_RAW) | BIT_MASK(MSC_SCAN); if (atkbd->scroll) { - input_dev->evbit[0] |= BIT(EV_REL); - input_dev->relbit[0] = BIT(REL_WHEEL) | BIT(REL_HWHEEL); + input_dev->evbit[0] |= BIT_MASK(EV_REL); + input_dev->relbit[0] = BIT_MASK(REL_WHEEL) | + BIT_MASK(REL_HWHEEL); set_bit(BTN_MIDDLE, input_dev->keybit); } diff --git a/drivers/input/keyboard/corgikbd.c b/drivers/input/keyboard/corgikbd.c index 6578bfff644b..790fed368aae 100644 --- a/drivers/input/keyboard/corgikbd.c +++ b/drivers/input/keyboard/corgikbd.c @@ -325,7 +325,8 @@ static int __init corgikbd_probe(struct platform_device *pdev) input_dev->id.version = 0x0100; input_dev->dev.parent = &pdev->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP) | BIT(EV_PWR) | BIT(EV_SW); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP) | + BIT_MASK(EV_PWR) | BIT_MASK(EV_SW); input_dev->keycode = corgikbd->keycode; input_dev->keycodesize = sizeof(unsigned char); input_dev->keycodemax = ARRAY_SIZE(corgikbd_keycode); diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index e2a3293bc67e..3eddf52a0bba 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -62,7 +62,7 @@ static int __devinit gpio_keys_probe(struct platform_device *pdev) platform_set_drvdata(pdev, input); - input->evbit[0] = BIT(EV_KEY); + input->evbit[0] = BIT_MASK(EV_KEY); input->name = pdev->name; input->phys = "gpio-keys/input0"; diff --git a/drivers/input/keyboard/hil_kbd.c b/drivers/input/keyboard/hil_kbd.c index cdd254f2e6c7..adbf29f0169d 100644 --- a/drivers/input/keyboard/hil_kbd.c +++ b/drivers/input/keyboard/hil_kbd.c @@ -323,8 +323,9 @@ static int hil_kbd_connect(struct serio *serio, struct serio_driver *drv) goto bail2; } - kbd->dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); - kbd->dev->ledbit[0] = BIT(LED_NUML) | BIT(LED_CAPSL) | BIT(LED_SCROLLL); + kbd->dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); + kbd->dev->ledbit[0] = BIT_MASK(LED_NUML) | BIT_MASK(LED_CAPSL) | + BIT_MASK(LED_SCROLLL); kbd->dev->keycodemax = HIL_KEYCODES_SET1_TBLSIZE; kbd->dev->keycodesize = sizeof(hil_kbd_set1[0]); kbd->dev->keycode = hil_kbd_set1; diff --git a/drivers/input/keyboard/hilkbd.c b/drivers/input/keyboard/hilkbd.c index 499b6974457f..50d80ecf0b80 100644 --- a/drivers/input/keyboard/hilkbd.c +++ b/drivers/input/keyboard/hilkbd.c @@ -266,8 +266,9 @@ hil_keyb_init(void) if (hphilkeyb_keycode[i] != KEY_RESERVED) set_bit(hphilkeyb_keycode[i], hil_dev.dev->keybit); - hil_dev.dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); - hil_dev.dev->ledbit[0] = BIT(LED_NUML) | BIT(LED_CAPSL) | BIT(LED_SCROLLL); + hil_dev.dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); + hil_dev.dev->ledbit[0] = BIT_MASK(LED_NUML) | BIT_MASK(LED_CAPSL) | + BIT_MASK(LED_SCROLLL); hil_dev.dev->keycodemax = HIL_KEYCODES_SET1_TBLSIZE; hil_dev.dev->keycodesize= sizeof(hphilkeyb_keycode[0]); hil_dev.dev->keycode = hphilkeyb_keycode; diff --git a/drivers/input/keyboard/locomokbd.c b/drivers/input/keyboard/locomokbd.c index 7a41b271f222..5a0ca18d6755 100644 --- a/drivers/input/keyboard/locomokbd.c +++ b/drivers/input/keyboard/locomokbd.c @@ -233,7 +233,7 @@ static int locomokbd_probe(struct locomo_dev *dev) input_dev->id.version = 0x0100; input_dev->dev.parent = &dev->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); input_dev->keycode = locomokbd->keycode; input_dev->keycodesize = sizeof(unsigned char); input_dev->keycodemax = ARRAY_SIZE(locomokbd_keycode); diff --git a/drivers/input/keyboard/newtonkbd.c b/drivers/input/keyboard/newtonkbd.c index b97a41e3ee56..48d1cab0aa1c 100644 --- a/drivers/input/keyboard/newtonkbd.c +++ b/drivers/input/keyboard/newtonkbd.c @@ -106,7 +106,7 @@ static int nkbd_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); input_dev->keycode = nkbd->keycode; input_dev->keycodesize = sizeof(unsigned char); input_dev->keycodemax = ARRAY_SIZE(nkbd_keycode); diff --git a/drivers/input/keyboard/pxa27x_keyboard.c b/drivers/input/keyboard/pxa27x_keyboard.c index b7061aa38816..bdd64ee4c5c8 100644 --- a/drivers/input/keyboard/pxa27x_keyboard.c +++ b/drivers/input/keyboard/pxa27x_keyboard.c @@ -183,8 +183,9 @@ static int __devinit pxakbd_probe(struct platform_device *pdev) input_dev->close = pxakbd_close; input_dev->dev.parent = &pdev->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP) | BIT(EV_REL); - input_dev->relbit[LONG(REL_WHEEL)] = BIT(REL_WHEEL); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP) | + BIT_MASK(EV_REL); + input_dev->relbit[BIT_WORD(REL_WHEEL)] = BIT_MASK(REL_WHEEL); for (row = 0; row < pdata->nr_rows; row++) { for (col = 0; col < pdata->nr_cols; col++) { int code = pdata->keycodes[row][col]; diff --git a/drivers/input/keyboard/spitzkbd.c b/drivers/input/keyboard/spitzkbd.c index 41b80385476c..410d78a774d0 100644 --- a/drivers/input/keyboard/spitzkbd.c +++ b/drivers/input/keyboard/spitzkbd.c @@ -381,7 +381,8 @@ static int __init spitzkbd_probe(struct platform_device *dev) input_dev->id.product = 0x0001; input_dev->id.version = 0x0100; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP) | BIT(EV_PWR) | BIT(EV_SW); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP) | + BIT_MASK(EV_PWR) | BIT_MASK(EV_SW); input_dev->keycode = spitzkbd->keycode; input_dev->keycodesize = sizeof(unsigned char); input_dev->keycodemax = ARRAY_SIZE(spitzkbd_keycode); diff --git a/drivers/input/keyboard/stowaway.c b/drivers/input/keyboard/stowaway.c index b44b0684d543..7437219370b1 100644 --- a/drivers/input/keyboard/stowaway.c +++ b/drivers/input/keyboard/stowaway.c @@ -110,7 +110,7 @@ static int skbd_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); input_dev->keycode = skbd->keycode; input_dev->keycodesize = sizeof(unsigned char); input_dev->keycodemax = ARRAY_SIZE(skbd_keycode); diff --git a/drivers/input/keyboard/sunkbd.c b/drivers/input/keyboard/sunkbd.c index 1d4e39624cfe..be0f5d19d023 100644 --- a/drivers/input/keyboard/sunkbd.c +++ b/drivers/input/keyboard/sunkbd.c @@ -277,9 +277,11 @@ static int sunkbd_connect(struct serio *serio, struct serio_driver *drv) input_dev->event = sunkbd_event; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_LED) | BIT(EV_SND) | BIT(EV_REP); - input_dev->ledbit[0] = BIT(LED_CAPSL) | BIT(LED_COMPOSE) | BIT(LED_SCROLLL) | BIT(LED_NUML); - input_dev->sndbit[0] = BIT(SND_CLICK) | BIT(SND_BELL); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_LED) | + BIT_MASK(EV_SND) | BIT_MASK(EV_REP); + input_dev->ledbit[0] = BIT_MASK(LED_CAPSL) | BIT_MASK(LED_COMPOSE) | + BIT_MASK(LED_SCROLLL) | BIT_MASK(LED_NUML); + input_dev->sndbit[0] = BIT_MASK(SND_CLICK) | BIT_MASK(SND_BELL); input_dev->keycode = sunkbd->keycode; input_dev->keycodesize = sizeof(unsigned char); diff --git a/drivers/input/keyboard/xtkbd.c b/drivers/input/keyboard/xtkbd.c index f3a56eb58ed1..152a2c070508 100644 --- a/drivers/input/keyboard/xtkbd.c +++ b/drivers/input/keyboard/xtkbd.c @@ -110,7 +110,7 @@ static int xtkbd_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); input_dev->keycode = xtkbd->keycode; input_dev->keycodesize = sizeof(unsigned char); input_dev->keycodemax = ARRAY_SIZE(xtkbd_keycode); diff --git a/drivers/input/misc/ati_remote.c b/drivers/input/misc/ati_remote.c index 471aab206443..3a7937481ad8 100644 --- a/drivers/input/misc/ati_remote.c +++ b/drivers/input/misc/ati_remote.c @@ -662,10 +662,10 @@ static void ati_remote_input_init(struct ati_remote *ati_remote) struct input_dev *idev = ati_remote->idev; int i; - idev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - idev->keybit[LONG(BTN_MOUSE)] = ( BIT(BTN_LEFT) | BIT(BTN_RIGHT) | - BIT(BTN_SIDE) | BIT(BTN_EXTRA) ); - idev->relbit[0] = BIT(REL_X) | BIT(REL_Y); + idev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + idev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_SIDE) | BIT_MASK(BTN_EXTRA); + idev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); for (i = 0; ati_remote_tbl[i].kind != KIND_END; i++) if (ati_remote_tbl[i].type == EV_KEY) set_bit(ati_remote_tbl[i].code, idev->keybit); diff --git a/drivers/input/misc/ati_remote2.c b/drivers/input/misc/ati_remote2.c index 1031543e5c3f..f2709b82485c 100644 --- a/drivers/input/misc/ati_remote2.c +++ b/drivers/input/misc/ati_remote2.c @@ -346,9 +346,10 @@ static int ati_remote2_input_init(struct ati_remote2 *ar2) ar2->idev = idev; input_set_drvdata(idev, ar2); - idev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP) | BIT(EV_REL); - idev->keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | BIT(BTN_RIGHT); - idev->relbit[0] = BIT(REL_X) | BIT(REL_Y); + idev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP) | BIT_MASK(EV_REL); + idev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_RIGHT); + idev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); for (i = 0; ati_remote2_key_table[i].key_code != KEY_RESERVED; i++) set_bit(ati_remote2_key_table[i].key_code, idev->keybit); diff --git a/drivers/input/misc/atlas_btns.c b/drivers/input/misc/atlas_btns.c index e43e92fd9e23..4e3ad657ed80 100644 --- a/drivers/input/misc/atlas_btns.c +++ b/drivers/input/misc/atlas_btns.c @@ -81,7 +81,7 @@ static int atlas_acpi_button_add(struct acpi_device *device) input_dev->name = "Atlas ACPI button driver"; input_dev->phys = "ASIM0000/atlas/input0"; input_dev->id.bustype = BUS_HOST; - input_dev->evbit[LONG(EV_KEY)] = BIT(EV_KEY); + input_dev->evbit[BIT_WORD(EV_KEY)] = BIT_MASK(EV_KEY); set_bit(KEY_F1, input_dev->keybit); set_bit(KEY_F2, input_dev->keybit); diff --git a/drivers/input/misc/cobalt_btns.c b/drivers/input/misc/cobalt_btns.c index 064b07936019..1aef97ed5e84 100644 --- a/drivers/input/misc/cobalt_btns.c +++ b/drivers/input/misc/cobalt_btns.c @@ -104,7 +104,7 @@ static int __devinit cobalt_buttons_probe(struct platform_device *pdev) input->id.bustype = BUS_HOST; input->cdev.dev = &pdev->dev; - input->evbit[0] = BIT(EV_KEY); + input->evbit[0] = BIT_MASK(EV_KEY); for (i = 0; i < ARRAY_SIZE(buttons_map); i++) { set_bit(buttons_map[i].keycode, input->keybit); buttons_map[i].count = 0; diff --git a/drivers/input/misc/ixp4xx-beeper.c b/drivers/input/misc/ixp4xx-beeper.c index e759944041ab..d2ade7443b7d 100644 --- a/drivers/input/misc/ixp4xx-beeper.c +++ b/drivers/input/misc/ixp4xx-beeper.c @@ -109,8 +109,8 @@ static int __devinit ixp4xx_spkr_probe(struct platform_device *dev) input_dev->id.version = 0x0100; input_dev->dev.parent = &dev->dev; - input_dev->evbit[0] = BIT(EV_SND); - input_dev->sndbit[0] = BIT(SND_BELL) | BIT(SND_TONE); + input_dev->evbit[0] = BIT_MASK(EV_SND); + input_dev->sndbit[0] = BIT_MASK(SND_BELL) | BIT_MASK(SND_TONE); input_dev->event = ixp4xx_spkr_event; err = request_irq(IRQ_IXP4XX_TIMER2, &ixp4xx_spkr_interrupt, diff --git a/drivers/input/misc/keyspan_remote.c b/drivers/input/misc/keyspan_remote.c index 1bffc9fa98c2..fd74347047dd 100644 --- a/drivers/input/misc/keyspan_remote.c +++ b/drivers/input/misc/keyspan_remote.c @@ -497,7 +497,7 @@ static int keyspan_probe(struct usb_interface *interface, const struct usb_devic usb_to_input_id(udev, &input_dev->id); input_dev->dev.parent = &interface->dev; - input_dev->evbit[0] = BIT(EV_KEY); /* We will only report KEY events. */ + input_dev->evbit[0] = BIT_MASK(EV_KEY); /* We will only report KEY events. */ for (i = 0; i < ARRAY_SIZE(keyspan_key_table); i++) if (keyspan_key_table[i] != KEY_RESERVED) set_bit(keyspan_key_table[i], input_dev->keybit); diff --git a/drivers/input/misc/m68kspkr.c b/drivers/input/misc/m68kspkr.c index e9f26e766b4d..0c64d9bb718e 100644 --- a/drivers/input/misc/m68kspkr.c +++ b/drivers/input/misc/m68kspkr.c @@ -65,8 +65,8 @@ static int __devinit m68kspkr_probe(struct platform_device *dev) input_dev->id.version = 0x0100; input_dev->dev.parent = &dev->dev; - input_dev->evbit[0] = BIT(EV_SND); - input_dev->sndbit[0] = BIT(SND_BELL) | BIT(SND_TONE); + input_dev->evbit[0] = BIT_MASK(EV_SND); + input_dev->sndbit[0] = BIT_MASK(SND_BELL) | BIT_MASK(SND_TONE); input_dev->event = m68kspkr_event; err = input_register_device(input_dev); diff --git a/drivers/input/misc/pcspkr.c b/drivers/input/misc/pcspkr.c index c19f77fbaf2a..4941a9e61e90 100644 --- a/drivers/input/misc/pcspkr.c +++ b/drivers/input/misc/pcspkr.c @@ -86,8 +86,8 @@ static int __devinit pcspkr_probe(struct platform_device *dev) pcspkr_dev->id.version = 0x0100; pcspkr_dev->dev.parent = &dev->dev; - pcspkr_dev->evbit[0] = BIT(EV_SND); - pcspkr_dev->sndbit[0] = BIT(SND_BELL) | BIT(SND_TONE); + pcspkr_dev->evbit[0] = BIT_MASK(EV_SND); + pcspkr_dev->sndbit[0] = BIT_MASK(SND_BELL) | BIT_MASK(SND_TONE); pcspkr_dev->event = pcspkr_event; err = input_register_device(pcspkr_dev); diff --git a/drivers/input/misc/powermate.c b/drivers/input/misc/powermate.c index 448a470d28f2..7a7b8c7b9633 100644 --- a/drivers/input/misc/powermate.c +++ b/drivers/input/misc/powermate.c @@ -363,10 +363,11 @@ static int powermate_probe(struct usb_interface *intf, const struct usb_device_i input_dev->event = powermate_input_event; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL) | BIT(EV_MSC); - input_dev->keybit[LONG(BTN_0)] = BIT(BTN_0); - input_dev->relbit[LONG(REL_DIAL)] = BIT(REL_DIAL); - input_dev->mscbit[LONG(MSC_PULSELED)] = BIT(MSC_PULSELED); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) | + BIT_MASK(EV_MSC); + input_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0); + input_dev->relbit[BIT_WORD(REL_DIAL)] = BIT_MASK(REL_DIAL); + input_dev->mscbit[BIT_WORD(MSC_PULSELED)] = BIT_MASK(MSC_PULSELED); /* get a handle to the interrupt data pipe */ pipe = usb_rcvintpipe(udev, endpoint->bEndpointAddress); diff --git a/drivers/input/misc/sparcspkr.c b/drivers/input/misc/sparcspkr.c index e36ec1d92be8..a3637d870880 100644 --- a/drivers/input/misc/sparcspkr.c +++ b/drivers/input/misc/sparcspkr.c @@ -115,8 +115,8 @@ static int __devinit sparcspkr_probe(struct device *dev) input_dev->id.version = 0x0100; input_dev->dev.parent = dev; - input_dev->evbit[0] = BIT(EV_SND); - input_dev->sndbit[0] = BIT(SND_BELL) | BIT(SND_TONE); + input_dev->evbit[0] = BIT_MASK(EV_SND); + input_dev->sndbit[0] = BIT_MASK(SND_BELL) | BIT_MASK(SND_TONE); input_dev->event = state->event; diff --git a/drivers/input/misc/yealink.c b/drivers/input/misc/yealink.c index ab15880fd566..46279ef2b649 100644 --- a/drivers/input/misc/yealink.c +++ b/drivers/input/misc/yealink.c @@ -945,7 +945,7 @@ static int usb_probe(struct usb_interface *intf, const struct usb_device_id *id) /* input_dev->event = input_ev; TODO */ /* register available key events */ - input_dev->evbit[0] = BIT(EV_KEY); + input_dev->evbit[0] = BIT_MASK(EV_KEY); for (i = 0; i < 256; i++) { int k = map_p1k_to_key(i); if (k >= 0) { diff --git a/drivers/input/mouse/alps.c b/drivers/input/mouse/alps.c index 64d70a9b714c..2b5ed119c9a9 100644 --- a/drivers/input/mouse/alps.c +++ b/drivers/input/mouse/alps.c @@ -455,24 +455,25 @@ int alps_init(struct psmouse *psmouse) if (alps_hw_init(psmouse, &version)) goto init_fail; - dev1->evbit[LONG(EV_KEY)] |= BIT(EV_KEY); - dev1->keybit[LONG(BTN_TOUCH)] |= BIT(BTN_TOUCH); - dev1->keybit[LONG(BTN_TOOL_FINGER)] |= BIT(BTN_TOOL_FINGER); - dev1->keybit[LONG(BTN_LEFT)] |= BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT); + dev1->evbit[BIT_WORD(EV_KEY)] |= BIT_MASK(EV_KEY); + dev1->keybit[BIT_WORD(BTN_TOUCH)] |= BIT_MASK(BTN_TOUCH); + dev1->keybit[BIT_WORD(BTN_TOOL_FINGER)] |= BIT_MASK(BTN_TOOL_FINGER); + dev1->keybit[BIT_WORD(BTN_LEFT)] |= BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); - dev1->evbit[LONG(EV_ABS)] |= BIT(EV_ABS); + dev1->evbit[BIT_WORD(EV_ABS)] |= BIT_MASK(EV_ABS); input_set_abs_params(dev1, ABS_X, 0, 1023, 0, 0); input_set_abs_params(dev1, ABS_Y, 0, 767, 0, 0); input_set_abs_params(dev1, ABS_PRESSURE, 0, 127, 0, 0); if (priv->i->flags & ALPS_WHEEL) { - dev1->evbit[LONG(EV_REL)] |= BIT(EV_REL); - dev1->relbit[LONG(REL_WHEEL)] |= BIT(REL_WHEEL); + dev1->evbit[BIT_WORD(EV_REL)] |= BIT_MASK(EV_REL); + dev1->relbit[BIT_WORD(REL_WHEEL)] |= BIT_MASK(REL_WHEEL); } if (priv->i->flags & (ALPS_FW_BK_1 | ALPS_FW_BK_2)) { - dev1->keybit[LONG(BTN_FORWARD)] |= BIT(BTN_FORWARD); - dev1->keybit[LONG(BTN_BACK)] |= BIT(BTN_BACK); + dev1->keybit[BIT_WORD(BTN_FORWARD)] |= BIT_MASK(BTN_FORWARD); + dev1->keybit[BIT_WORD(BTN_BACK)] |= BIT_MASK(BTN_BACK); } snprintf(priv->phys, sizeof(priv->phys), "%s/input1", psmouse->ps2dev.serio->phys); @@ -483,9 +484,10 @@ int alps_init(struct psmouse *psmouse) dev2->id.product = PSMOUSE_ALPS; dev2->id.version = 0x0000; - dev2->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - dev2->relbit[LONG(REL_X)] |= BIT(REL_X) | BIT(REL_Y); - dev2->keybit[LONG(BTN_LEFT)] |= BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT); + dev2->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + dev2->relbit[BIT_WORD(REL_X)] |= BIT_MASK(REL_X) | BIT_MASK(REL_Y); + dev2->keybit[BIT_WORD(BTN_LEFT)] |= BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); if (input_register_device(priv->dev2)) goto init_fail; diff --git a/drivers/input/mouse/amimouse.c b/drivers/input/mouse/amimouse.c index 239a0e16d91a..a185ac78a42c 100644 --- a/drivers/input/mouse/amimouse.c +++ b/drivers/input/mouse/amimouse.c @@ -111,9 +111,10 @@ static int __init amimouse_init(void) amimouse_dev->id.product = 0x0002; amimouse_dev->id.version = 0x0100; - amimouse_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - amimouse_dev->relbit[0] = BIT(REL_X) | BIT(REL_Y); - amimouse_dev->keybit[LONG(BTN_LEFT)] = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT); + amimouse_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + amimouse_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); + amimouse_dev->keybit[BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); amimouse_dev->open = amimouse_open; amimouse_dev->close = amimouse_close; diff --git a/drivers/input/mouse/atarimouse.c b/drivers/input/mouse/atarimouse.c index c8c7244b48a1..98a3561d4b05 100644 --- a/drivers/input/mouse/atarimouse.c +++ b/drivers/input/mouse/atarimouse.c @@ -137,9 +137,10 @@ static int __init atamouse_init(void) atamouse_dev->id.product = 0x0002; atamouse_dev->id.version = 0x0100; - atamouse_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - atamouse_dev->relbit[0] = BIT(REL_X) | BIT(REL_Y); - atamouse_dev->keybit[LONG(BTN_LEFT)] = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT); + atamouse_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + atamouse_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); + atamouse_dev->keybit[BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); atamouse_dev->open = atamouse_open; atamouse_dev->close = atamouse_close; diff --git a/drivers/input/mouse/hil_ptr.c b/drivers/input/mouse/hil_ptr.c index 449bf4dcbbcc..27f88fbb7136 100644 --- a/drivers/input/mouse/hil_ptr.c +++ b/drivers/input/mouse/hil_ptr.c @@ -298,12 +298,12 @@ static int hil_ptr_connect(struct serio *serio, struct serio_driver *driver) idd = ptr->idd + 1; txt = "unknown"; if ((did & HIL_IDD_DID_TYPE_MASK) == HIL_IDD_DID_TYPE_REL) { - ptr->dev->evbit[0] = BIT(EV_REL); + ptr->dev->evbit[0] = BIT_MASK(EV_REL); txt = "relative"; } if ((did & HIL_IDD_DID_TYPE_MASK) == HIL_IDD_DID_TYPE_ABS) { - ptr->dev->evbit[0] = BIT(EV_ABS); + ptr->dev->evbit[0] = BIT_MASK(EV_ABS); txt = "absolute"; } if (!ptr->dev->evbit[0]) @@ -311,7 +311,7 @@ static int hil_ptr_connect(struct serio *serio, struct serio_driver *driver) ptr->nbtn = HIL_IDD_NUM_BUTTONS(idd); if (ptr->nbtn) - ptr->dev->evbit[0] |= BIT(EV_KEY); + ptr->dev->evbit[0] |= BIT_MASK(EV_KEY); naxsets = HIL_IDD_NUM_AXSETS(*idd); ptr->naxes = HIL_IDD_NUM_AXES_PER_SET(*idd); diff --git a/drivers/input/mouse/inport.c b/drivers/input/mouse/inport.c index 79b624fe8994..655a39217432 100644 --- a/drivers/input/mouse/inport.c +++ b/drivers/input/mouse/inport.c @@ -163,9 +163,10 @@ static int __init inport_init(void) inport_dev->id.product = 0x0001; inport_dev->id.version = 0x0100; - inport_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - inport_dev->keybit[LONG(BTN_LEFT)] = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT); - inport_dev->relbit[0] = BIT(REL_X) | BIT(REL_Y); + inport_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + inport_dev->keybit[BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); + inport_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); inport_dev->open = inport_open; inport_dev->close = inport_close; diff --git a/drivers/input/mouse/lifebook.c b/drivers/input/mouse/lifebook.c index d7de4c53b3d8..9ec57d80186e 100644 --- a/drivers/input/mouse/lifebook.c +++ b/drivers/input/mouse/lifebook.c @@ -270,9 +270,10 @@ static int lifebook_create_relative_device(struct psmouse *psmouse) dev2->id.version = 0x0000; dev2->dev.parent = &psmouse->ps2dev.serio->dev; - dev2->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - dev2->relbit[LONG(REL_X)] = BIT(REL_X) | BIT(REL_Y); - dev2->keybit[LONG(BTN_LEFT)] = BIT(BTN_LEFT) | BIT(BTN_RIGHT); + dev2->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + dev2->relbit[BIT_WORD(REL_X)] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); + dev2->keybit[BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_RIGHT); error = input_register_device(priv->dev2); if (error) @@ -295,9 +296,9 @@ int lifebook_init(struct psmouse *psmouse) if (lifebook_absolute_mode(psmouse)) return -1; - dev1->evbit[0] = BIT(EV_ABS) | BIT(EV_KEY); + dev1->evbit[0] = BIT_MASK(EV_ABS) | BIT_MASK(EV_KEY); dev1->relbit[0] = 0; - dev1->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + dev1->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(dev1, ABS_X, 0, max_coord, 0, 0); input_set_abs_params(dev1, ABS_Y, 0, max_coord, 0, 0); diff --git a/drivers/input/mouse/logibm.c b/drivers/input/mouse/logibm.c index 26c3b2e2ca94..b23a4f3ea5cd 100644 --- a/drivers/input/mouse/logibm.c +++ b/drivers/input/mouse/logibm.c @@ -156,9 +156,10 @@ static int __init logibm_init(void) logibm_dev->id.product = 0x0001; logibm_dev->id.version = 0x0100; - logibm_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - logibm_dev->keybit[LONG(BTN_LEFT)] = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT); - logibm_dev->relbit[0] = BIT(REL_X) | BIT(REL_Y); + logibm_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + logibm_dev->keybit[BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); + logibm_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); logibm_dev->open = logibm_open; logibm_dev->close = logibm_close; diff --git a/drivers/input/mouse/pc110pad.c b/drivers/input/mouse/pc110pad.c index 05d992e514f0..8991ab0b4fe3 100644 --- a/drivers/input/mouse/pc110pad.c +++ b/drivers/input/mouse/pc110pad.c @@ -144,9 +144,9 @@ static int __init pc110pad_init(void) pc110pad_dev->id.product = 0x0001; pc110pad_dev->id.version = 0x0100; - pc110pad_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - pc110pad_dev->absbit[0] = BIT(ABS_X) | BIT(ABS_Y); - pc110pad_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + pc110pad_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + pc110pad_dev->absbit[0] = BIT_MASK(ABS_X) | BIT_MASK(ABS_Y); + pc110pad_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); pc110pad_dev->absmax[ABS_X] = 0x1ff; pc110pad_dev->absmax[ABS_Y] = 0x0ff; diff --git a/drivers/input/mouse/psmouse-base.c b/drivers/input/mouse/psmouse-base.c index 073525756532..da316d13d7f5 100644 --- a/drivers/input/mouse/psmouse-base.c +++ b/drivers/input/mouse/psmouse-base.c @@ -1115,9 +1115,10 @@ static int psmouse_switch_protocol(struct psmouse *psmouse, const struct psmouse input_dev->dev.parent = &psmouse->ps2dev.serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - input_dev->keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT); - input_dev->relbit[0] = BIT(REL_X) | BIT(REL_Y); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); + input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); psmouse->set_rate = psmouse_set_rate; psmouse->set_resolution = psmouse_set_resolution; diff --git a/drivers/input/mouse/rpcmouse.c b/drivers/input/mouse/rpcmouse.c index 355efd0423e7..18a48636ba4a 100644 --- a/drivers/input/mouse/rpcmouse.c +++ b/drivers/input/mouse/rpcmouse.c @@ -78,9 +78,10 @@ static int __init rpcmouse_init(void) rpcmouse_dev->id.product = 0x0001; rpcmouse_dev->id.version = 0x0100; - rpcmouse_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - rpcmouse_dev->keybit[LONG(BTN_LEFT)] = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT); - rpcmouse_dev->relbit[0] = BIT(REL_X) | BIT(REL_Y); + rpcmouse_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + rpcmouse_dev->keybit[BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); + rpcmouse_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); rpcmouse_lastx = (short) iomd_readl(IOMD_MOUSEX); rpcmouse_lasty = (short) iomd_readl(IOMD_MOUSEY); diff --git a/drivers/input/mouse/sermouse.c b/drivers/input/mouse/sermouse.c index 77b8ee2b9651..ed917bfd086a 100644 --- a/drivers/input/mouse/sermouse.c +++ b/drivers/input/mouse/sermouse.c @@ -268,9 +268,10 @@ static int sermouse_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - input_dev->keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | BIT(BTN_RIGHT); - input_dev->relbit[0] = BIT(REL_X) | BIT(REL_Y); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_RIGHT); + input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); if (c & 0x01) set_bit(BTN_MIDDLE, input_dev->keybit); if (c & 0x02) set_bit(BTN_SIDE, input_dev->keybit); diff --git a/drivers/input/mouse/touchkit_ps2.c b/drivers/input/mouse/touchkit_ps2.c index 7b977fd23571..3fadb2accac0 100644 --- a/drivers/input/mouse/touchkit_ps2.c +++ b/drivers/input/mouse/touchkit_ps2.c @@ -85,7 +85,7 @@ int touchkit_ps2_detect(struct psmouse *psmouse, int set_properties) return -ENODEV; if (set_properties) { - dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); + dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); set_bit(BTN_TOUCH, dev->keybit); input_set_abs_params(dev, ABS_X, 0, TOUCHKIT_MAX_XC, 0, 0); input_set_abs_params(dev, ABS_Y, 0, TOUCHKIT_MAX_YC, 0, 0); diff --git a/drivers/input/mousedev.c b/drivers/input/mousedev.c index 79146d6ed2ab..78c3ea75da2a 100644 --- a/drivers/input/mousedev.c +++ b/drivers/input/mousedev.c @@ -998,34 +998,36 @@ static const struct input_device_id mousedev_ids[] = { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_RELBIT, - .evbit = { BIT(EV_KEY) | BIT(EV_REL) }, - .keybit = { [LONG(BTN_LEFT)] = BIT(BTN_LEFT) }, - .relbit = { BIT(REL_X) | BIT(REL_Y) }, + .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) }, + .keybit = { [BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) }, + .relbit = { BIT_MASK(REL_X) | BIT_MASK(REL_Y) }, }, /* A mouse like device, at least one button, two relative axes */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_RELBIT, - .evbit = { BIT(EV_KEY) | BIT(EV_REL) }, - .relbit = { BIT(REL_WHEEL) }, + .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) }, + .relbit = { BIT_MASK(REL_WHEEL) }, }, /* A separate scrollwheel */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, - .evbit = { BIT(EV_KEY) | BIT(EV_ABS) }, - .keybit = { [LONG(BTN_TOUCH)] = BIT(BTN_TOUCH) }, - .absbit = { BIT(ABS_X) | BIT(ABS_Y) }, + .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, + .keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, + .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) }, }, /* A tablet like device, at least touch detection, two absolute axes */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, - .evbit = { BIT(EV_KEY) | BIT(EV_ABS) }, - .keybit = { [LONG(BTN_TOOL_FINGER)] = BIT(BTN_TOOL_FINGER) }, - .absbit = { BIT(ABS_X) | BIT(ABS_Y) | BIT(ABS_PRESSURE) | - BIT(ABS_TOOL_WIDTH) }, + .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, + .keybit = { [BIT_WORD(BTN_TOOL_FINGER)] = + BIT_MASK(BTN_TOOL_FINGER) }, + .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) | + BIT_MASK(ABS_PRESSURE) | + BIT_MASK(ABS_TOOL_WIDTH) }, }, /* A touchpad */ { }, /* Terminating entry */ diff --git a/drivers/input/tablet/acecad.c b/drivers/input/tablet/acecad.c index dd2310458c46..b973d0ef6d16 100644 --- a/drivers/input/tablet/acecad.c +++ b/drivers/input/tablet/acecad.c @@ -192,10 +192,14 @@ static int usb_acecad_probe(struct usb_interface *intf, const struct usb_device_ input_dev->open = usb_acecad_open; input_dev->close = usb_acecad_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input_dev->absbit[0] = BIT(ABS_X) | BIT(ABS_Y) | BIT(ABS_PRESSURE); - input_dev->keybit[LONG(BTN_LEFT)] = BIT(BTN_LEFT) | BIT(BTN_RIGHT) | BIT(BTN_MIDDLE); - input_dev->keybit[LONG(BTN_DIGI)] = BIT(BTN_TOOL_PEN) |BIT(BTN_TOUCH) | BIT(BTN_STYLUS) | BIT(BTN_STYLUS2); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->absbit[0] = BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) | + BIT_MASK(ABS_PRESSURE); + input_dev->keybit[BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_MIDDLE); + input_dev->keybit[BIT_WORD(BTN_DIGI)] = BIT_MASK(BTN_TOOL_PEN) | + BIT_MASK(BTN_TOUCH) | BIT_MASK(BTN_STYLUS) | + BIT_MASK(BTN_STYLUS2); switch (id->driver_info) { case 0: diff --git a/drivers/input/tablet/gtco.c b/drivers/input/tablet/gtco.c index b2ca10f2fe0e..d2c6da264722 100644 --- a/drivers/input/tablet/gtco.c +++ b/drivers/input/tablet/gtco.c @@ -573,10 +573,12 @@ static void gtco_setup_caps(struct input_dev *inputdev) struct gtco *device = input_get_drvdata(inputdev); /* Which events */ - inputdev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS) | BIT(EV_MSC); + inputdev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) | + BIT_MASK(EV_MSC); /* Misc event menu block */ - inputdev->mscbit[0] = BIT(MSC_SCAN)|BIT(MSC_SERIAL)|BIT(MSC_RAW) ; + inputdev->mscbit[0] = BIT_MASK(MSC_SCAN) | BIT_MASK(MSC_SERIAL) | + BIT_MASK(MSC_RAW); /* Absolute values based on HID report info */ input_set_abs_params(inputdev, ABS_X, device->min_X, device->max_X, diff --git a/drivers/input/tablet/kbtab.c b/drivers/input/tablet/kbtab.c index 91e6d00d4a43..1182fc133167 100644 --- a/drivers/input/tablet/kbtab.c +++ b/drivers/input/tablet/kbtab.c @@ -153,10 +153,13 @@ static int kbtab_probe(struct usb_interface *intf, const struct usb_device_id *i input_dev->open = kbtab_open; input_dev->close = kbtab_close; - input_dev->evbit[0] |= BIT(EV_KEY) | BIT(EV_ABS) | BIT(EV_MSC); - input_dev->keybit[LONG(BTN_LEFT)] |= BIT(BTN_LEFT) | BIT(BTN_RIGHT) | BIT(BTN_MIDDLE); - input_dev->keybit[LONG(BTN_DIGI)] |= BIT(BTN_TOOL_PEN) | BIT(BTN_TOUCH); - input_dev->mscbit[0] |= BIT(MSC_SERIAL); + input_dev->evbit[0] |= BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) | + BIT_MASK(EV_MSC); + input_dev->keybit[BIT_WORD(BTN_LEFT)] |= BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_MIDDLE); + input_dev->keybit[BIT_WORD(BTN_DIGI)] |= BIT_MASK(BTN_TOOL_PEN) | + BIT_MASK(BTN_TOUCH); + input_dev->mscbit[0] |= BIT_MASK(MSC_SERIAL); input_set_abs_params(input_dev, ABS_X, 0, 0x2000, 4, 0); input_set_abs_params(input_dev, ABS_Y, 0, 0x1750, 4, 0); input_set_abs_params(input_dev, ABS_PRESSURE, 0, 0xff, 0, 0); diff --git a/drivers/input/tablet/wacom_sys.c b/drivers/input/tablet/wacom_sys.c index 064e123c9b76..d64b1ea136b3 100644 --- a/drivers/input/tablet/wacom_sys.c +++ b/drivers/input/tablet/wacom_sys.c @@ -140,48 +140,58 @@ static void wacom_close(struct input_dev *dev) void input_dev_mo(struct input_dev *input_dev, struct wacom_wac *wacom_wac) { - input_dev->keybit[LONG(BTN_LEFT)] |= BIT(BTN_1) | BIT(BTN_5); + input_dev->keybit[BIT_WORD(BTN_LEFT)] |= BIT_MASK(BTN_1) | + BIT_MASK(BTN_5); input_set_abs_params(input_dev, ABS_WHEEL, 0, 71, 0, 0); } void input_dev_g4(struct input_dev *input_dev, struct wacom_wac *wacom_wac) { - input_dev->evbit[0] |= BIT(EV_MSC); - input_dev->mscbit[0] |= BIT(MSC_SERIAL); - input_dev->keybit[LONG(BTN_DIGI)] |= BIT(BTN_TOOL_FINGER); - input_dev->keybit[LONG(BTN_LEFT)] |= BIT(BTN_0) | BIT(BTN_4); + input_dev->evbit[0] |= BIT_MASK(EV_MSC); + input_dev->mscbit[0] |= BIT_MASK(MSC_SERIAL); + input_dev->keybit[BIT_WORD(BTN_DIGI)] |= BIT_MASK(BTN_TOOL_FINGER); + input_dev->keybit[BIT_WORD(BTN_LEFT)] |= BIT_MASK(BTN_0) | + BIT_MASK(BTN_4); } void input_dev_g(struct input_dev *input_dev, struct wacom_wac *wacom_wac) { - input_dev->evbit[0] |= BIT(EV_REL); - input_dev->relbit[0] |= BIT(REL_WHEEL); - input_dev->keybit[LONG(BTN_LEFT)] |= BIT(BTN_LEFT) | BIT(BTN_RIGHT) | BIT(BTN_MIDDLE); - input_dev->keybit[LONG(BTN_DIGI)] |= BIT(BTN_TOOL_RUBBER) | BIT(BTN_TOOL_MOUSE) | BIT(BTN_STYLUS2); + input_dev->evbit[0] |= BIT_MASK(EV_REL); + input_dev->relbit[0] |= BIT_MASK(REL_WHEEL); + input_dev->keybit[BIT_WORD(BTN_LEFT)] |= BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_MIDDLE); + input_dev->keybit[BIT_WORD(BTN_DIGI)] |= BIT_MASK(BTN_TOOL_RUBBER) | + BIT_MASK(BTN_TOOL_MOUSE) | BIT_MASK(BTN_STYLUS2); input_set_abs_params(input_dev, ABS_DISTANCE, 0, wacom_wac->features->distance_max, 0, 0); } void input_dev_i3s(struct input_dev *input_dev, struct wacom_wac *wacom_wac) { - input_dev->keybit[LONG(BTN_DIGI)] |= BIT(BTN_TOOL_FINGER); - input_dev->keybit[LONG(BTN_LEFT)] |= BIT(BTN_0) | BIT(BTN_1) | BIT(BTN_2) | BIT(BTN_3); + input_dev->keybit[BIT_WORD(BTN_DIGI)] |= BIT_MASK(BTN_TOOL_FINGER); + input_dev->keybit[BIT_WORD(BTN_LEFT)] |= BIT_MASK(BTN_0) | + BIT_MASK(BTN_1) | BIT_MASK(BTN_2) | BIT_MASK(BTN_3); input_set_abs_params(input_dev, ABS_RX, 0, 4096, 0, 0); } void input_dev_i3(struct input_dev *input_dev, struct wacom_wac *wacom_wac) { - input_dev->keybit[LONG(BTN_LEFT)] |= BIT(BTN_4) | BIT(BTN_5) | BIT(BTN_6) | BIT(BTN_7); + input_dev->keybit[BIT_WORD(BTN_LEFT)] |= BIT_MASK(BTN_4) | + BIT_MASK(BTN_5) | BIT_MASK(BTN_6) | BIT_MASK(BTN_7); input_set_abs_params(input_dev, ABS_RY, 0, 4096, 0, 0); } void input_dev_i(struct input_dev *input_dev, struct wacom_wac *wacom_wac) { - input_dev->evbit[0] |= BIT(EV_MSC) | BIT(EV_REL); - input_dev->mscbit[0] |= BIT(MSC_SERIAL); - input_dev->relbit[0] |= BIT(REL_WHEEL); - input_dev->keybit[LONG(BTN_LEFT)] |= BIT(BTN_LEFT) | BIT(BTN_RIGHT) | BIT(BTN_MIDDLE) | BIT(BTN_SIDE) | BIT(BTN_EXTRA); - input_dev->keybit[LONG(BTN_DIGI)] |= BIT(BTN_TOOL_RUBBER) | BIT(BTN_TOOL_MOUSE) | BIT(BTN_TOOL_BRUSH) - | BIT(BTN_TOOL_PENCIL) | BIT(BTN_TOOL_AIRBRUSH) | BIT(BTN_TOOL_LENS) | BIT(BTN_STYLUS2); + input_dev->evbit[0] |= BIT_MASK(EV_MSC) | BIT_MASK(EV_REL); + input_dev->mscbit[0] |= BIT_MASK(MSC_SERIAL); + input_dev->relbit[0] |= BIT_MASK(REL_WHEEL); + input_dev->keybit[BIT_WORD(BTN_LEFT)] |= BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_MIDDLE) | + BIT_MASK(BTN_SIDE) | BIT_MASK(BTN_EXTRA); + input_dev->keybit[BIT_WORD(BTN_DIGI)] |= BIT_MASK(BTN_TOOL_RUBBER) | + BIT_MASK(BTN_TOOL_MOUSE) | BIT_MASK(BTN_TOOL_BRUSH) | + BIT_MASK(BTN_TOOL_PENCIL) | BIT_MASK(BTN_TOOL_AIRBRUSH) | + BIT_MASK(BTN_TOOL_LENS) | BIT_MASK(BTN_STYLUS2); input_set_abs_params(input_dev, ABS_DISTANCE, 0, wacom_wac->features->distance_max, 0, 0); input_set_abs_params(input_dev, ABS_WHEEL, 0, 1023, 0, 0); input_set_abs_params(input_dev, ABS_TILT_X, 0, 127, 0, 0); @@ -192,12 +202,13 @@ void input_dev_i(struct input_dev *input_dev, struct wacom_wac *wacom_wac) void input_dev_pl(struct input_dev *input_dev, struct wacom_wac *wacom_wac) { - input_dev->keybit[LONG(BTN_DIGI)] |= BIT(BTN_STYLUS2) | BIT(BTN_TOOL_RUBBER); + input_dev->keybit[BIT_WORD(BTN_DIGI)] |= BIT_MASK(BTN_STYLUS2) | + BIT_MASK(BTN_TOOL_RUBBER); } void input_dev_pt(struct input_dev *input_dev, struct wacom_wac *wacom_wac) { - input_dev->keybit[LONG(BTN_DIGI)] |= BIT(BTN_TOOL_RUBBER); + input_dev->keybit[BIT_WORD(BTN_DIGI)] |= BIT_MASK(BTN_TOOL_RUBBER); } static int wacom_probe(struct usb_interface *intf, const struct usb_device_id *id) @@ -243,12 +254,13 @@ static int wacom_probe(struct usb_interface *intf, const struct usb_device_id *i input_dev->open = wacom_open; input_dev->close = wacom_close; - input_dev->evbit[0] |= BIT(EV_KEY) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_DIGI)] |= BIT(BTN_TOOL_PEN) | BIT(BTN_TOUCH) | BIT(BTN_STYLUS); + input_dev->evbit[0] |= BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_DIGI)] |= BIT_MASK(BTN_TOOL_PEN) | + BIT_MASK(BTN_TOUCH) | BIT_MASK(BTN_STYLUS); input_set_abs_params(input_dev, ABS_X, 0, wacom_wac->features->x_max, 4, 0); input_set_abs_params(input_dev, ABS_Y, 0, wacom_wac->features->y_max, 4, 0); input_set_abs_params(input_dev, ABS_PRESSURE, 0, wacom_wac->features->pressure_max, 0, 0); - input_dev->absbit[LONG(ABS_MISC)] |= BIT(ABS_MISC); + input_dev->absbit[BIT_WORD(ABS_MISC)] |= BIT_MASK(ABS_MISC); wacom_init_input_dev(input_dev, wacom_wac); diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c index 51ae4fb7d123..f59aecf5ec15 100644 --- a/drivers/input/touchscreen/ads7846.c +++ b/drivers/input/touchscreen/ads7846.c @@ -917,8 +917,8 @@ static int __devinit ads7846_probe(struct spi_device *spi) input_dev->phys = ts->phys; input_dev->dev.parent = &spi->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(input_dev, ABS_X, pdata->x_min ? : 0, pdata->x_max ? : MAX_12BIT, diff --git a/drivers/input/touchscreen/corgi_ts.c b/drivers/input/touchscreen/corgi_ts.c index e6a31d118786..b1b2e07bf080 100644 --- a/drivers/input/touchscreen/corgi_ts.c +++ b/drivers/input/touchscreen/corgi_ts.c @@ -302,8 +302,8 @@ static int __init corgits_probe(struct platform_device *pdev) input_dev->id.version = 0x0100; input_dev->dev.parent = &pdev->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(input_dev, ABS_X, X_AXIS_MIN, X_AXIS_MAX, 0, 0); input_set_abs_params(input_dev, ABS_Y, Y_AXIS_MIN, Y_AXIS_MAX, 0, 0); input_set_abs_params(input_dev, ABS_PRESSURE, PRESSURE_MIN, PRESSURE_MAX, 0, 0); diff --git a/drivers/input/touchscreen/elo.c b/drivers/input/touchscreen/elo.c index 557d781719f1..d20689cdbd5d 100644 --- a/drivers/input/touchscreen/elo.c +++ b/drivers/input/touchscreen/elo.c @@ -320,8 +320,8 @@ static int elo_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); serio_set_drvdata(serio, elo); err = serio_open(serio, drv); diff --git a/drivers/input/touchscreen/fujitsu_ts.c b/drivers/input/touchscreen/fujitsu_ts.c index daf7a4afc935..80b21800355f 100644 --- a/drivers/input/touchscreen/fujitsu_ts.c +++ b/drivers/input/touchscreen/fujitsu_ts.c @@ -122,8 +122,8 @@ static int fujitsu_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.vendor = SERIO_FUJITSU; input_dev->id.product = 0; input_dev->id.version = 0x0100; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(input_dev, ABS_X, 0, 4096, 0, 0); input_set_abs_params(input_dev, ABS_Y, 0, 4096, 0, 0); diff --git a/drivers/input/touchscreen/gunze.c b/drivers/input/touchscreen/gunze.c index 39d602600d7c..a48a15868c4a 100644 --- a/drivers/input/touchscreen/gunze.c +++ b/drivers/input/touchscreen/gunze.c @@ -137,8 +137,8 @@ static int gunze_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.product = 0x0051; input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(input_dev, ABS_X, 24, 1000, 0, 0); input_set_abs_params(input_dev, ABS_Y, 24, 1000, 0, 0); diff --git a/drivers/input/touchscreen/h3600_ts_input.c b/drivers/input/touchscreen/h3600_ts_input.c index 09ed7803cb8f..2ae6c6016a86 100644 --- a/drivers/input/touchscreen/h3600_ts_input.c +++ b/drivers/input/touchscreen/h3600_ts_input.c @@ -373,8 +373,9 @@ static int h3600ts_connect(struct serio *serio, struct serio_driver *drv) input_dev->event = h3600ts_event; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS) | BIT(EV_LED) | BIT(EV_PWR); - input_dev->ledbit[0] = BIT(LED_SLEEP); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) | + BIT_MASK(EV_LED) | BIT_MASK(EV_PWR); + input_dev->ledbit[0] = BIT_MASK(LED_SLEEP); input_set_abs_params(input_dev, ABS_X, 60, 985, 0, 0); input_set_abs_params(input_dev, ABS_Y, 35, 1024, 0, 0); diff --git a/drivers/input/touchscreen/hp680_ts_input.c b/drivers/input/touchscreen/hp680_ts_input.c index 1a15475aedfc..c38d4e0f95c6 100644 --- a/drivers/input/touchscreen/hp680_ts_input.c +++ b/drivers/input/touchscreen/hp680_ts_input.c @@ -81,8 +81,8 @@ static int __init hp680_ts_init(void) if (!hp680_ts_dev) return -ENOMEM; - hp680_ts_dev->evbit[0] = BIT(EV_ABS) | BIT(EV_KEY); - hp680_ts_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + hp680_ts_dev->evbit[0] = BIT_MASK(EV_ABS) | BIT_MASK(EV_KEY); + hp680_ts_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(hp680_ts_dev, ABS_X, HP680_TS_ABS_X_MIN, HP680_TS_ABS_X_MAX, 0, 0); diff --git a/drivers/input/touchscreen/mk712.c b/drivers/input/touchscreen/mk712.c index 44140feeffc5..80a658868706 100644 --- a/drivers/input/touchscreen/mk712.c +++ b/drivers/input/touchscreen/mk712.c @@ -186,8 +186,8 @@ static int __init mk712_init(void) mk712_dev->open = mk712_open; mk712_dev->close = mk712_close; - mk712_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - mk712_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + mk712_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + mk712_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(mk712_dev, ABS_X, 0, 0xfff, 88, 0); input_set_abs_params(mk712_dev, ABS_Y, 0, 0xfff, 88, 0); diff --git a/drivers/input/touchscreen/mtouch.c b/drivers/input/touchscreen/mtouch.c index 4ec3b1f940c8..9077228418b7 100644 --- a/drivers/input/touchscreen/mtouch.c +++ b/drivers/input/touchscreen/mtouch.c @@ -151,8 +151,8 @@ static int mtouch_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.product = 0; input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(mtouch->dev, ABS_X, MTOUCH_MIN_XC, MTOUCH_MAX_XC, 0, 0); input_set_abs_params(mtouch->dev, ABS_Y, MTOUCH_MIN_YC, MTOUCH_MAX_YC, 0, 0); diff --git a/drivers/input/touchscreen/penmount.c b/drivers/input/touchscreen/penmount.c index f2c0d3c7149c..c7f9cebebbb6 100644 --- a/drivers/input/touchscreen/penmount.c +++ b/drivers/input/touchscreen/penmount.c @@ -113,8 +113,8 @@ static int pm_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(pm->dev, ABS_X, 0, 0x3ff, 0, 0); input_set_abs_params(pm->dev, ABS_Y, 0, 0x3ff, 0, 0); diff --git a/drivers/input/touchscreen/touchright.c b/drivers/input/touchscreen/touchright.c index 3def7bb1df44..3a5c142c2a78 100644 --- a/drivers/input/touchscreen/touchright.c +++ b/drivers/input/touchscreen/touchright.c @@ -125,8 +125,8 @@ static int tr_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.product = 0; input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(tr->dev, ABS_X, TR_MIN_XC, TR_MAX_XC, 0, 0); input_set_abs_params(tr->dev, ABS_Y, TR_MIN_YC, TR_MAX_YC, 0, 0); diff --git a/drivers/input/touchscreen/touchwin.c b/drivers/input/touchscreen/touchwin.c index ac4bdcf18666..763a656a59f8 100644 --- a/drivers/input/touchscreen/touchwin.c +++ b/drivers/input/touchscreen/touchwin.c @@ -132,8 +132,8 @@ static int tw_connect(struct serio *serio, struct serio_driver *drv) input_dev->id.product = 0; input_dev->id.version = 0x0100; input_dev->dev.parent = &serio->dev; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(tw->dev, ABS_X, TW_MIN_XC, TW_MAX_XC, 0, 0); input_set_abs_params(tw->dev, ABS_Y, TW_MIN_YC, TW_MAX_YC, 0, 0); diff --git a/drivers/input/touchscreen/ucb1400_ts.c b/drivers/input/touchscreen/ucb1400_ts.c index 89373b01d8f5..7549939b9535 100644 --- a/drivers/input/touchscreen/ucb1400_ts.c +++ b/drivers/input/touchscreen/ucb1400_ts.c @@ -517,7 +517,7 @@ static int ucb1400_ts_probe(struct device *dev) idev->id.product = id; idev->open = ucb1400_ts_open; idev->close = ucb1400_ts_close; - idev->evbit[0] = BIT(EV_ABS); + idev->evbit[0] = BIT_MASK(EV_ABS); ucb1400_adc_enable(ucb); x_res = ucb1400_ts_read_xres(ucb); diff --git a/drivers/input/touchscreen/usbtouchscreen.c b/drivers/input/touchscreen/usbtouchscreen.c index 9fb3d5c30999..5f34b78d5ddb 100644 --- a/drivers/input/touchscreen/usbtouchscreen.c +++ b/drivers/input/touchscreen/usbtouchscreen.c @@ -868,8 +868,8 @@ static int usbtouch_probe(struct usb_interface *intf, input_dev->open = usbtouch_open; input_dev->close = usbtouch_close; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(input_dev, ABS_X, type->min_xc, type->max_xc, 0, 0); input_set_abs_params(input_dev, ABS_Y, type->min_yc, type->max_yc, 0, 0); if (type->max_press) diff --git a/drivers/isdn/hardware/avm/b1dma.c b/drivers/isdn/hardware/avm/b1dma.c index 428872b653e9..669f6f67449c 100644 --- a/drivers/isdn/hardware/avm/b1dma.c +++ b/drivers/isdn/hardware/avm/b1dma.c @@ -486,11 +486,13 @@ static void b1dma_handle_rx(avmcard *card) card->name); } else { memcpy(skb_put(skb, MsgLen), card->msgbuf, MsgLen); - if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_CONF) + if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_CONF) { + spin_lock(&card->lock); capilib_data_b3_conf(&cinfo->ncci_head, ApplId, - CAPIMSG_NCCI(skb->data), - CAPIMSG_MSGID(skb->data)); - + CAPIMSG_NCCI(skb->data), + CAPIMSG_MSGID(skb->data)); + spin_unlock(&card->lock); + } capi_ctr_handle_message(ctrl, ApplId, skb); } break; @@ -500,9 +502,9 @@ static void b1dma_handle_rx(avmcard *card) ApplId = _get_word(&p); NCCI = _get_word(&p); WindowSize = _get_word(&p); - + spin_lock(&card->lock); capilib_new_ncci(&cinfo->ncci_head, ApplId, NCCI, WindowSize); - + spin_unlock(&card->lock); break; case RECEIVE_FREE_NCCI: @@ -510,9 +512,11 @@ static void b1dma_handle_rx(avmcard *card) ApplId = _get_word(&p); NCCI = _get_word(&p); - if (NCCI != 0xffffffff) + if (NCCI != 0xffffffff) { + spin_lock(&card->lock); capilib_free_ncci(&cinfo->ncci_head, ApplId, NCCI); - + spin_unlock(&card->lock); + } break; case RECEIVE_START: @@ -751,10 +755,10 @@ void b1dma_reset_ctr(struct capi_ctr *ctrl) spin_lock_irqsave(&card->lock, flags); b1dma_reset(card); - spin_unlock_irqrestore(&card->lock, flags); memset(cinfo->version, 0, sizeof(cinfo->version)); capilib_release(&cinfo->ncci_head); + spin_unlock_irqrestore(&card->lock, flags); capi_ctr_reseted(ctrl); } @@ -803,8 +807,11 @@ void b1dma_release_appl(struct capi_ctr *ctrl, u16 appl) avmcard *card = cinfo->card; struct sk_buff *skb; void *p; + unsigned long flags; + spin_lock_irqsave(&card->lock, flags); capilib_release_appl(&cinfo->ncci_head, appl); + spin_unlock_irqrestore(&card->lock, flags); skb = alloc_skb(7, GFP_ATOMIC); if (!skb) { @@ -832,10 +839,13 @@ u16 b1dma_send_message(struct capi_ctr *ctrl, struct sk_buff *skb) u16 retval = CAPI_NOERROR; if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_REQ) { + unsigned long flags; + spin_lock_irqsave(&card->lock, flags); retval = capilib_data_b3_req(&cinfo->ncci_head, CAPIMSG_APPID(skb->data), CAPIMSG_NCCI(skb->data), CAPIMSG_MSGID(skb->data)); + spin_unlock_irqrestore(&card->lock, flags); } if (retval == CAPI_NOERROR) b1dma_queue_tx(card, skb); diff --git a/drivers/isdn/hardware/avm/c4.c b/drivers/isdn/hardware/avm/c4.c index 8710cf6214d9..4bbbbe688077 100644 --- a/drivers/isdn/hardware/avm/c4.c +++ b/drivers/isdn/hardware/avm/c4.c @@ -678,7 +678,9 @@ static irqreturn_t c4_handle_interrupt(avmcard *card) for (i=0; i < card->nr_controllers; i++) { avmctrl_info *cinfo = &card->ctrlinfo[i]; memset(cinfo->version, 0, sizeof(cinfo->version)); + spin_lock_irqsave(&card->lock, flags); capilib_release(&cinfo->ncci_head); + spin_unlock_irqrestore(&card->lock, flags); capi_ctr_reseted(&cinfo->capi_ctrl); } card->nlogcontr = 0; diff --git a/drivers/isdn/hardware/avm/t1isa.c b/drivers/isdn/hardware/avm/t1isa.c index c925020fe9b7..6130724e46e7 100644 --- a/drivers/isdn/hardware/avm/t1isa.c +++ b/drivers/isdn/hardware/avm/t1isa.c @@ -180,8 +180,8 @@ static irqreturn_t t1isa_interrupt(int interrupt, void *devptr) ApplId = (unsigned) b1_get_word(card->port); MsgLen = t1_get_slice(card->port, card->msgbuf); - spin_unlock_irqrestore(&card->lock, flags); if (!(skb = alloc_skb(MsgLen, GFP_ATOMIC))) { + spin_unlock_irqrestore(&card->lock, flags); printk(KERN_ERR "%s: incoming packet dropped\n", card->name); } else { @@ -190,7 +190,7 @@ static irqreturn_t t1isa_interrupt(int interrupt, void *devptr) capilib_data_b3_conf(&cinfo->ncci_head, ApplId, CAPIMSG_NCCI(skb->data), CAPIMSG_MSGID(skb->data)); - + spin_unlock_irqrestore(&card->lock, flags); capi_ctr_handle_message(ctrl, ApplId, skb); } break; @@ -200,21 +200,17 @@ static irqreturn_t t1isa_interrupt(int interrupt, void *devptr) ApplId = b1_get_word(card->port); NCCI = b1_get_word(card->port); WindowSize = b1_get_word(card->port); - spin_unlock_irqrestore(&card->lock, flags); - capilib_new_ncci(&cinfo->ncci_head, ApplId, NCCI, WindowSize); - + spin_unlock_irqrestore(&card->lock, flags); break; case RECEIVE_FREE_NCCI: ApplId = b1_get_word(card->port); NCCI = b1_get_word(card->port); - spin_unlock_irqrestore(&card->lock, flags); - if (NCCI != 0xffffffff) capilib_free_ncci(&cinfo->ncci_head, ApplId, NCCI); - + spin_unlock_irqrestore(&card->lock, flags); break; case RECEIVE_START: @@ -333,13 +329,16 @@ static void t1isa_reset_ctr(struct capi_ctr *ctrl) avmctrl_info *cinfo = (avmctrl_info *)(ctrl->driverdata); avmcard *card = cinfo->card; unsigned int port = card->port; + unsigned long flags; t1_disable_irq(port); b1_reset(port); b1_reset(port); memset(cinfo->version, 0, sizeof(cinfo->version)); + spin_lock_irqsave(&card->lock, flags); capilib_release(&cinfo->ncci_head); + spin_unlock_irqrestore(&card->lock, flags); capi_ctr_reseted(ctrl); } @@ -466,29 +465,26 @@ static u16 t1isa_send_message(struct capi_ctr *ctrl, struct sk_buff *skb) u8 subcmd = CAPIMSG_SUBCOMMAND(skb->data); u16 dlen, retval; + spin_lock_irqsave(&card->lock, flags); if (CAPICMD(cmd, subcmd) == CAPI_DATA_B3_REQ) { retval = capilib_data_b3_req(&cinfo->ncci_head, CAPIMSG_APPID(skb->data), CAPIMSG_NCCI(skb->data), CAPIMSG_MSGID(skb->data)); - if (retval != CAPI_NOERROR) + if (retval != CAPI_NOERROR) { + spin_unlock_irqrestore(&card->lock, flags); return retval; - + } dlen = CAPIMSG_DATALEN(skb->data); - spin_lock_irqsave(&card->lock, flags); b1_put_byte(port, SEND_DATA_B3_REQ); t1_put_slice(port, skb->data, len); t1_put_slice(port, skb->data + len, dlen); - spin_unlock_irqrestore(&card->lock, flags); } else { - - spin_lock_irqsave(&card->lock, flags); b1_put_byte(port, SEND_MESSAGE); t1_put_slice(port, skb->data, len); - spin_unlock_irqrestore(&card->lock, flags); } - + spin_unlock_irqrestore(&card->lock, flags); dev_kfree_skb_any(skb); return CAPI_NOERROR; } diff --git a/drivers/isdn/sc/debug.h b/drivers/isdn/sc/debug.h deleted file mode 100644 index e9db96ede4b2..000000000000 --- a/drivers/isdn/sc/debug.h +++ /dev/null @@ -1,19 +0,0 @@ -/* $Id: debug.h,v 1.2.8.1 2001/09/23 22:24:59 kai Exp $ - * - * Copyright (C) 1996 SpellCaster Telecommunications Inc. - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - * For more information, please contact gpl-info@spellcast.com or write: - * - * SpellCaster Telecommunications Inc. - * 5621 Finch Avenue East, Unit #3 - * Scarborough, Ontario Canada - * M1B 2T9 - * +1 (416) 297-8565 - * +1 (416) 297-6433 Facsimile - */ - -#define REQUEST_IRQ(a,b,c,d,e) request_irq(a,b,c,d,e) -#define FREE_IRQ(a,b) free_irq(a,b) diff --git a/drivers/isdn/sc/includes.h b/drivers/isdn/sc/includes.h index 5286e0c810a9..4766e5b77378 100644 --- a/drivers/isdn/sc/includes.h +++ b/drivers/isdn/sc/includes.h @@ -14,4 +14,3 @@ #include <linux/timer.h> #include <linux/wait.h> #include <linux/isdnif.h> -#include "debug.h" diff --git a/drivers/isdn/sc/init.c b/drivers/isdn/sc/init.c index 0bf76344a0d5..d09c854cfac7 100644 --- a/drivers/isdn/sc/init.c +++ b/drivers/isdn/sc/init.c @@ -404,7 +404,7 @@ static void __exit sc_exit(void) /* * Release the IRQ */ - FREE_IRQ(sc_adapter[i]->interrupt, NULL); + free_irq(sc_adapter[i]->interrupt, NULL); /* * Reset for a clean start diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c index 2766e4fc4ea8..883da72b5368 100644 --- a/drivers/macintosh/adbhid.c +++ b/drivers/macintosh/adbhid.c @@ -791,8 +791,10 @@ adbhid_input_register(int id, int default_id, int original_handler_id, if (hid->keycode[i]) set_bit(hid->keycode[i], input_dev->keybit); - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_LED) | BIT(EV_REP); - input_dev->ledbit[0] = BIT(LED_SCROLLL) | BIT(LED_CAPSL) | BIT(LED_NUML); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_LED) | + BIT_MASK(EV_REP); + input_dev->ledbit[0] = BIT_MASK(LED_SCROLLL) | + BIT_MASK(LED_CAPSL) | BIT_MASK(LED_NUML); input_dev->event = adbhid_kbd_event; input_dev->keycodemax = KEY_FN; input_dev->keycodesize = sizeof(hid->keycode[0]); @@ -801,16 +803,18 @@ adbhid_input_register(int id, int default_id, int original_handler_id, case ADB_MOUSE: sprintf(hid->name, "ADB mouse"); - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - input_dev->keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT); - input_dev->relbit[0] = BIT(REL_X) | BIT(REL_Y); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); + input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); break; case ADB_MISC: switch (original_handler_id) { case 0x02: /* Adjustable keyboard button device */ sprintf(hid->name, "ADB adjustable keyboard buttons"); - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | + BIT_MASK(EV_REP); set_bit(KEY_SOUND, input_dev->keybit); set_bit(KEY_MUTE, input_dev->keybit); set_bit(KEY_VOLUMEUP, input_dev->keybit); @@ -818,7 +822,8 @@ adbhid_input_register(int id, int default_id, int original_handler_id, break; case 0x1f: /* Powerbook button device */ sprintf(hid->name, "ADB Powerbook buttons"); - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | + BIT_MASK(EV_REP); set_bit(KEY_MUTE, input_dev->keybit); set_bit(KEY_VOLUMEUP, input_dev->keybit); set_bit(KEY_VOLUMEDOWN, input_dev->keybit); diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 33dee3a773ed..89302309da92 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -117,9 +117,10 @@ static int emumousebtn_input_register(void) emumousebtn->id.product = 0x0001; emumousebtn->id.version = 0x0100; - emumousebtn->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - emumousebtn->keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT); - emumousebtn->relbit[0] = BIT(REL_X) | BIT(REL_Y); + emumousebtn->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + emumousebtn->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); + emumousebtn->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); ret = input_register_device(emumousebtn); if (ret) diff --git a/drivers/md/md.c b/drivers/md/md.c index c059ae6f37e5..808cd9549456 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4717,7 +4717,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, void md_unregister_thread(mdk_thread_t *thread) { - dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); + dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); kfree(thread); diff --git a/drivers/media/dvb/cinergyT2/cinergyT2.c b/drivers/media/dvb/cinergyT2/cinergyT2.c index 5a12b5679556..154a7ce7cb82 100644 --- a/drivers/media/dvb/cinergyT2/cinergyT2.c +++ b/drivers/media/dvb/cinergyT2/cinergyT2.c @@ -820,7 +820,7 @@ static int cinergyt2_register_rc(struct cinergyt2 *cinergyt2) input_dev->name = DRIVER_NAME " remote control"; input_dev->phys = cinergyt2->phys; - input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); for (i = 0; i < ARRAY_SIZE(rc_keys); i += 3) set_bit(rc_keys[i + 2], input_dev->keybit); input_dev->keycodesize = 0; diff --git a/drivers/media/dvb/dvb-usb/dvb-usb-remote.c b/drivers/media/dvb/dvb-usb/dvb-usb-remote.c index 7b9f35bfb4f0..c0c2c22ddd83 100644 --- a/drivers/media/dvb/dvb-usb/dvb-usb-remote.c +++ b/drivers/media/dvb/dvb-usb/dvb-usb-remote.c @@ -106,7 +106,7 @@ int dvb_usb_remote_init(struct dvb_usb_device *d) if (!input_dev) return -ENOMEM; - input_dev->evbit[0] = BIT(EV_KEY); + input_dev->evbit[0] = BIT_MASK(EV_KEY); input_dev->name = "IR-receiver inside an USB DVB receiver"; input_dev->phys = d->rc_phys; usb_to_input_id(d->udev, &input_dev->id); diff --git a/drivers/media/dvb/ttpci/av7110_ir.c b/drivers/media/dvb/ttpci/av7110_ir.c index 5d19c402dad1..a283e1de83fa 100644 --- a/drivers/media/dvb/ttpci/av7110_ir.c +++ b/drivers/media/dvb/ttpci/av7110_ir.c @@ -27,7 +27,7 @@ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/kernel.h> -#include <asm/bitops.h> +#include <linux/bitops.h> #include "av7110.h" #include "av7110_hw.h" diff --git a/drivers/media/dvb/ttusb-dec/ttusb_dec.c b/drivers/media/dvb/ttusb-dec/ttusb_dec.c index 5e691fd79904..1ec981d98b91 100644 --- a/drivers/media/dvb/ttusb-dec/ttusb_dec.c +++ b/drivers/media/dvb/ttusb-dec/ttusb_dec.c @@ -1198,7 +1198,7 @@ static int ttusb_init_rc( struct ttusb_dec *dec) input_dev->name = "ttusb_dec remote control"; input_dev->phys = dec->rc_phys; - input_dev->evbit[0] = BIT(EV_KEY); + input_dev->evbit[0] = BIT_MASK(EV_KEY); input_dev->keycodesize = sizeof(u16); input_dev->keycodemax = 0x1a; input_dev->keycode = rc_keys; diff --git a/drivers/media/video/usbvideo/konicawc.c b/drivers/media/video/usbvideo/konicawc.c index 491505d6fdee..3e93f8058770 100644 --- a/drivers/media/video/usbvideo/konicawc.c +++ b/drivers/media/video/usbvideo/konicawc.c @@ -238,8 +238,8 @@ static void konicawc_register_input(struct konicawc *cam, struct usb_device *dev usb_to_input_id(dev, &input_dev->id); input_dev->dev.parent = &dev->dev; - input_dev->evbit[0] = BIT(EV_KEY); - input_dev->keybit[LONG(BTN_0)] = BIT(BTN_0); + input_dev->evbit[0] = BIT_MASK(EV_KEY); + input_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0); input_dev->private = cam; diff --git a/drivers/media/video/usbvideo/quickcam_messenger.c b/drivers/media/video/usbvideo/quickcam_messenger.c index dd1a6d6bbc9e..d847273eeba0 100644 --- a/drivers/media/video/usbvideo/quickcam_messenger.c +++ b/drivers/media/video/usbvideo/quickcam_messenger.c @@ -102,8 +102,8 @@ static void qcm_register_input(struct qcm *cam, struct usb_device *dev) usb_to_input_id(dev, &input_dev->id); input_dev->dev.parent = &dev->dev; - input_dev->evbit[0] = BIT(EV_KEY); - input_dev->keybit[LONG(BTN_0)] = BIT(BTN_0); + input_dev->evbit[0] = BIT_MASK(EV_KEY); + input_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0); input_dev->private = cam; diff --git a/drivers/media/video/zoran_driver.c b/drivers/media/video/zoran_driver.c index 1c14fa2bd411..419e5af78533 100644 --- a/drivers/media/video/zoran_driver.c +++ b/drivers/media/video/zoran_driver.c @@ -1285,7 +1285,7 @@ zoran_open (struct inode *inode, } dprintk(1, KERN_INFO "%s: zoran_open(%s, pid=[%d]), users(-)=%d\n", - ZR_DEVNAME(zr), current->comm, current->pid, zr->user); + ZR_DEVNAME(zr), current->comm, task_pid_nr(current), zr->user); /* now, create the open()-specific file_ops struct */ fh = kzalloc(sizeof(struct zoran_fh), GFP_KERNEL); @@ -1358,7 +1358,7 @@ zoran_close (struct inode *inode, struct zoran *zr = fh->zr; dprintk(1, KERN_INFO "%s: zoran_close(%s, pid=[%d]), users(+)=%d\n", - ZR_DEVNAME(zr), current->comm, current->pid, zr->user); + ZR_DEVNAME(zr), current->comm, task_pid_nr(current), zr->user); /* kernel locks (fs/device.c), so don't do that ourselves * (prevents deadlocks) */ diff --git a/drivers/misc/ibmasm/remote.c b/drivers/misc/ibmasm/remote.c index 0550ce075fc4..1d9defb1a10c 100644 --- a/drivers/misc/ibmasm/remote.c +++ b/drivers/misc/ibmasm/remote.c @@ -226,9 +226,9 @@ int ibmasm_init_remote_input_dev(struct service_processor *sp) mouse_dev->id.product = pdev->device; mouse_dev->id.version = 1; mouse_dev->dev.parent = sp->dev; - mouse_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - mouse_dev->keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | - BIT(BTN_RIGHT) | BIT(BTN_MIDDLE); + mouse_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + mouse_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_MIDDLE); set_bit(BTN_TOUCH, mouse_dev->keybit); mouse_dev->name = "ibmasm RSA I remote mouse"; input_set_abs_params(mouse_dev, ABS_X, 0, MOUSE_X_MAX, 0, 0); @@ -239,7 +239,7 @@ int ibmasm_init_remote_input_dev(struct service_processor *sp) keybd_dev->id.product = pdev->device; keybd_dev->id.version = 2; keybd_dev->dev.parent = sp->dev; - keybd_dev->evbit[0] = BIT(EV_KEY); + keybd_dev->evbit[0] = BIT_MASK(EV_KEY); keybd_dev->name = "ibmasm RSA I remote keyboard"; for (i = 0; i < XLATE_SIZE; i++) { diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c index 5108b7c576df..cd221fd0fb94 100644 --- a/drivers/misc/phantom.c +++ b/drivers/misc/phantom.c @@ -9,6 +9,7 @@ * You need an userspace library to cooperate with this driver. It (and other * info) may be obtained here: * http://www.fi.muni.cz/~xslaby/phantom.html + * or alternatively, you might use OpenHaptics provided by Sensable. */ #include <linux/kernel.h> @@ -24,13 +25,14 @@ #include <asm/atomic.h> #include <asm/io.h> -#define PHANTOM_VERSION "n0.9.5" +#define PHANTOM_VERSION "n0.9.7" #define PHANTOM_MAX_MINORS 8 #define PHN_IRQCTL 0x4c /* irq control in caddr space */ #define PHB_RUNNING 1 +#define PHB_NOT_OH 2 static struct class *phantom_class; static int phantom_major; @@ -47,7 +49,11 @@ struct phantom_device { struct cdev cdev; struct mutex open_lock; - spinlock_t ioctl_lock; + spinlock_t regs_lock; + + /* used in NOT_OH mode */ + struct phm_regs oregs; + u32 ctl_reg; }; static unsigned char phantom_devices[PHANTOM_MAX_MINORS]; @@ -82,6 +88,7 @@ static long phantom_ioctl(struct file *file, unsigned int cmd, struct phm_regs rs; struct phm_reg r; void __user *argp = (void __user *)arg; + unsigned long flags; unsigned int i; if (_IOC_TYPE(cmd) != PH_IOC_MAGIC || @@ -96,32 +103,45 @@ static long phantom_ioctl(struct file *file, unsigned int cmd, if (r.reg > 7) return -EINVAL; - spin_lock(&dev->ioctl_lock); + spin_lock_irqsave(&dev->regs_lock, flags); if (r.reg == PHN_CONTROL && (r.value & PHN_CTL_IRQ) && phantom_status(dev, dev->status | PHB_RUNNING)){ - spin_unlock(&dev->ioctl_lock); + spin_unlock_irqrestore(&dev->regs_lock, flags); return -ENODEV; } pr_debug("phantom: writing %x to %u\n", r.value, r.reg); + + /* preserve amp bit (don't allow to change it when in NOT_OH) */ + if (r.reg == PHN_CONTROL && (dev->status & PHB_NOT_OH)) { + r.value &= ~PHN_CTL_AMP; + r.value |= dev->ctl_reg & PHN_CTL_AMP; + dev->ctl_reg = r.value; + } + iowrite32(r.value, dev->iaddr + r.reg); ioread32(dev->iaddr); /* PCI posting */ if (r.reg == PHN_CONTROL && !(r.value & PHN_CTL_IRQ)) phantom_status(dev, dev->status & ~PHB_RUNNING); - spin_unlock(&dev->ioctl_lock); + spin_unlock_irqrestore(&dev->regs_lock, flags); break; case PHN_SET_REGS: if (copy_from_user(&rs, argp, sizeof(rs))) return -EFAULT; pr_debug("phantom: SRS %u regs %x\n", rs.count, rs.mask); - spin_lock(&dev->ioctl_lock); - for (i = 0; i < min(rs.count, 8U); i++) - if ((1 << i) & rs.mask) - iowrite32(rs.values[i], dev->oaddr + i); - ioread32(dev->iaddr); /* PCI posting */ - spin_unlock(&dev->ioctl_lock); + spin_lock_irqsave(&dev->regs_lock, flags); + if (dev->status & PHB_NOT_OH) + memcpy(&dev->oregs, &rs, sizeof(rs)); + else { + u32 m = min(rs.count, 8U); + for (i = 0; i < m; i++) + if (rs.mask & BIT(i)) + iowrite32(rs.values[i], dev->oaddr + i); + ioread32(dev->iaddr); /* PCI posting */ + } + spin_unlock_irqrestore(&dev->regs_lock, flags); break; case PHN_GET_REG: if (copy_from_user(&r, argp, sizeof(r))) @@ -135,20 +155,35 @@ static long phantom_ioctl(struct file *file, unsigned int cmd, if (copy_to_user(argp, &r, sizeof(r))) return -EFAULT; break; - case PHN_GET_REGS: + case PHN_GET_REGS: { + u32 m; + if (copy_from_user(&rs, argp, sizeof(rs))) return -EFAULT; + m = min(rs.count, 8U); + pr_debug("phantom: GRS %u regs %x\n", rs.count, rs.mask); - spin_lock(&dev->ioctl_lock); - for (i = 0; i < min(rs.count, 8U); i++) - if ((1 << i) & rs.mask) + spin_lock_irqsave(&dev->regs_lock, flags); + for (i = 0; i < m; i++) + if (rs.mask & BIT(i)) rs.values[i] = ioread32(dev->iaddr + i); - spin_unlock(&dev->ioctl_lock); + spin_unlock_irqrestore(&dev->regs_lock, flags); if (copy_to_user(argp, &rs, sizeof(rs))) return -EFAULT; break; + } case PHN_NOT_OH: + spin_lock_irqsave(&dev->regs_lock, flags); + if (dev->status & PHB_RUNNING) { + printk(KERN_ERR "phantom: you need to set NOT_OH " + "before you start the device!\n"); + spin_unlock_irqrestore(&dev->regs_lock, flags); + return -EINVAL; + } + dev->status |= PHB_NOT_OH; + spin_unlock_irqrestore(&dev->regs_lock, flags); + break; default: return -ENOTTY; } @@ -171,8 +206,11 @@ static int phantom_open(struct inode *inode, struct file *file) return -EINVAL; } + WARN_ON(dev->status & PHB_NOT_OH); + file->private_data = dev; + atomic_set(&dev->counter, 0); dev->opened++; mutex_unlock(&dev->open_lock); @@ -187,6 +225,7 @@ static int phantom_release(struct inode *inode, struct file *file) dev->opened = 0; phantom_status(dev, dev->status & ~PHB_RUNNING); + dev->status &= ~PHB_NOT_OH; mutex_unlock(&dev->open_lock); @@ -220,12 +259,32 @@ static struct file_operations phantom_file_ops = { static irqreturn_t phantom_isr(int irq, void *data) { struct phantom_device *dev = data; + unsigned int i; + u32 ctl; - if (!(ioread32(dev->iaddr + PHN_CONTROL) & PHN_CTL_IRQ)) + spin_lock(&dev->regs_lock); + ctl = ioread32(dev->iaddr + PHN_CONTROL); + if (!(ctl & PHN_CTL_IRQ)) { + spin_unlock(&dev->regs_lock); return IRQ_NONE; + } iowrite32(0, dev->iaddr); iowrite32(0xc0, dev->iaddr); + + if (dev->status & PHB_NOT_OH) { + struct phm_regs *r = &dev->oregs; + u32 m = min(r->count, 8U); + + for (i = 0; i < m; i++) + if (r->mask & BIT(i)) + iowrite32(r->values[i], dev->oaddr + i); + + dev->ctl_reg ^= PHN_CTL_AMP; + iowrite32(dev->ctl_reg, dev->iaddr + PHN_CONTROL); + } + spin_unlock(&dev->regs_lock); + ioread32(dev->iaddr); /* PCI posting */ atomic_inc(&dev->counter); @@ -297,7 +356,7 @@ static int __devinit phantom_probe(struct pci_dev *pdev, } mutex_init(&pht->open_lock); - spin_lock_init(&pht->ioctl_lock); + spin_lock_init(&pht->regs_lock); init_waitqueue_head(&pht->wait); cdev_init(&pht->cdev, &phantom_file_ops); pht->cdev.owner = THIS_MODULE; @@ -378,6 +437,8 @@ static int phantom_suspend(struct pci_dev *pdev, pm_message_t state) iowrite32(0, dev->caddr + PHN_IRQCTL); ioread32(dev->caddr + PHN_IRQCTL); /* PCI posting */ + synchronize_irq(pdev->irq); + return 0; } diff --git a/drivers/misc/sony-laptop.c b/drivers/misc/sony-laptop.c index e73a71f04bb4..86da96becd28 100644 --- a/drivers/misc/sony-laptop.c +++ b/drivers/misc/sony-laptop.c @@ -411,9 +411,9 @@ static int sony_laptop_setup_input(void) jog_dev->id.bustype = BUS_ISA; jog_dev->id.vendor = PCI_VENDOR_ID_SONY; - jog_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - jog_dev->keybit[LONG(BTN_MOUSE)] = BIT(BTN_MIDDLE); - jog_dev->relbit[0] = BIT(REL_WHEEL); + jog_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + jog_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_MIDDLE); + jog_dev->relbit[0] = BIT_MASK(REL_WHEEL); error = input_register_device(jog_dev); if (error) diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index a4f1bf33164a..6330c8cc72b5 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -1309,7 +1309,7 @@ static int ubi_thread(void *u) struct ubi_device *ubi = u; ubi_msg("background thread \"%s\" started, PID %d", - ubi->bgt_name, current->pid); + ubi->bgt_name, task_pid_nr(current)); set_freezable(); for (;;) { diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 96cee4badd28..da767d3d5af5 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -26,7 +26,7 @@ #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/dma-mapping.h> -#include <asm/bitops.h> +#include <linux/bitops.h> #include <asm/io.h> #include <asm/irq.h> #include <linux/delay.h> diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 7a045a37056e..084f0292ea6e 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -126,7 +126,7 @@ static struct aggregator *__get_active_agg(struct aggregator *aggregator); // ================= main 802.3ad protocol functions ================== static int ad_lacpdu_send(struct port *port); -static int ad_marker_send(struct port *port, struct marker *marker); +static int ad_marker_send(struct port *port, struct bond_marker *marker); static void ad_mux_machine(struct port *port); static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port); static void ad_tx_machine(struct port *port); @@ -139,8 +139,8 @@ static void ad_initialize_port(struct port *port, int lacp_fast); static void ad_initialize_lacpdu(struct lacpdu *Lacpdu); static void ad_enable_collecting_distributing(struct port *port); static void ad_disable_collecting_distributing(struct port *port); -static void ad_marker_info_received(struct marker *marker_info, struct port *port); -static void ad_marker_response_received(struct marker *marker, struct port *port); +static void ad_marker_info_received(struct bond_marker *marker_info, struct port *port); +static void ad_marker_response_received(struct bond_marker *marker, struct port *port); ///////////////////////////////////////////////////////////////////////////////// @@ -889,12 +889,12 @@ static int ad_lacpdu_send(struct port *port) * Returns: 0 on success * < 0 on error */ -static int ad_marker_send(struct port *port, struct marker *marker) +static int ad_marker_send(struct port *port, struct bond_marker *marker) { struct slave *slave = port->slave; struct sk_buff *skb; - struct marker_header *marker_header; - int length = sizeof(struct marker_header); + struct bond_marker_header *marker_header; + int length = sizeof(struct bond_marker_header); struct mac_addr lacpdu_multicast_address = AD_MULTICAST_LACPDU_ADDR; skb = dev_alloc_skb(length + 16); @@ -909,7 +909,7 @@ static int ad_marker_send(struct port *port, struct marker *marker) skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = PKT_TYPE_LACPDU; - marker_header = (struct marker_header *)skb_put(skb, length); + marker_header = (struct bond_marker_header *)skb_put(skb, length); marker_header->ad_header.destination_address = lacpdu_multicast_address; /* Note: source addres is set to be the member's PERMANENT address, because we use it @@ -1709,7 +1709,7 @@ static void ad_disable_collecting_distributing(struct port *port) */ static void ad_marker_info_send(struct port *port) { - struct marker marker; + struct bond_marker marker; u16 index; // fill the marker PDU with the appropriate values @@ -1742,13 +1742,14 @@ static void ad_marker_info_send(struct port *port) * @port: the port we're looking at * */ -static void ad_marker_info_received(struct marker *marker_info,struct port *port) +static void ad_marker_info_received(struct bond_marker *marker_info, + struct port *port) { - struct marker marker; + struct bond_marker marker; // copy the received marker data to the response marker //marker = *marker_info; - memcpy(&marker, marker_info, sizeof(struct marker)); + memcpy(&marker, marker_info, sizeof(struct bond_marker)); // change the marker subtype to marker response marker.tlv_type=AD_MARKER_RESPONSE_SUBTYPE; // send the marker response @@ -1767,7 +1768,8 @@ static void ad_marker_info_received(struct marker *marker_info,struct port *port * response for marker PDU's, in this stage, but only to respond to marker * information. */ -static void ad_marker_response_received(struct marker *marker, struct port *port) +static void ad_marker_response_received(struct bond_marker *marker, + struct port *port) { marker=NULL; // just to satisfy the compiler port=NULL; // just to satisfy the compiler @@ -2164,15 +2166,15 @@ static void bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave, u case AD_TYPE_MARKER: // No need to convert fields to Little Endian since we don't use the marker's fields. - switch (((struct marker *)lacpdu)->tlv_type) { + switch (((struct bond_marker *)lacpdu)->tlv_type) { case AD_MARKER_INFORMATION_SUBTYPE: dprintk("Received Marker Information on port %d\n", port->actor_port_number); - ad_marker_info_received((struct marker *)lacpdu, port); + ad_marker_info_received((struct bond_marker *)lacpdu, port); break; case AD_MARKER_RESPONSE_SUBTYPE: dprintk("Received Marker Response on port %d\n", port->actor_port_number); - ad_marker_response_received((struct marker *)lacpdu, port); + ad_marker_response_received((struct bond_marker *)lacpdu, port); break; default: diff --git a/drivers/net/bonding/bond_3ad.h b/drivers/net/bonding/bond_3ad.h index 862952fa6fd9..f16557264944 100644 --- a/drivers/net/bonding/bond_3ad.h +++ b/drivers/net/bonding/bond_3ad.h @@ -92,7 +92,7 @@ typedef enum { typedef enum { AD_MARKER_INFORMATION_SUBTYPE = 1, // marker imformation subtype AD_MARKER_RESPONSE_SUBTYPE // marker response subtype -} marker_subtype_t; +} bond_marker_subtype_t; // timers types(43.4.9 in the 802.3ad standard) typedef enum { @@ -148,7 +148,7 @@ typedef struct lacpdu_header { } lacpdu_header_t; // Marker Protocol Data Unit(PDU) structure(43.5.3.2 in the 802.3ad standard) -typedef struct marker { +typedef struct bond_marker { u8 subtype; // = 0x02 (marker PDU) u8 version_number; // = 0x01 u8 tlv_type; // = 0x01 (marker information) @@ -161,12 +161,12 @@ typedef struct marker { u8 tlv_type_terminator; // = 0x00 u8 terminator_length; // = 0x00 u8 reserved_90[90]; // = 0 -} marker_t; +} bond_marker_t; -typedef struct marker_header { +typedef struct bond_marker_header { struct ad_header ad_header; - struct marker marker; -} marker_header_t; + struct bond_marker marker; +} bond_marker_header_t; #pragma pack() diff --git a/drivers/net/cris/eth_v10.c b/drivers/net/cris/eth_v10.c index 314b2f68f78f..edd6828f0a78 100644 --- a/drivers/net/cris/eth_v10.c +++ b/drivers/net/cris/eth_v10.c @@ -234,6 +234,7 @@ #include <linux/spinlock.h> #include <linux/errno.h> #include <linux/init.h> +#include <linux/bitops.h> #include <linux/if.h> #include <linux/mii.h> @@ -247,7 +248,6 @@ #include <asm/irq.h> #include <asm/dma.h> #include <asm/system.h> -#include <asm/bitops.h> #include <asm/ethernet.h> #include <asm/cache.h> diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h index 044261703381..2a3df145850d 100644 --- a/drivers/net/cxgb3/adapter.h +++ b/drivers/net/cxgb3/adapter.h @@ -41,9 +41,9 @@ #include <linux/timer.h> #include <linux/cache.h> #include <linux/mutex.h> +#include <linux/bitops.h> #include "t3cdev.h" #include <asm/semaphore.h> -#include <asm/bitops.h> #include <asm/io.h> typedef irqreturn_t(*intr_handler_t) (int, void *); diff --git a/drivers/net/eth16i.c b/drivers/net/eth16i.c index 243fc6b354b5..e3dd8b136908 100644 --- a/drivers/net/eth16i.c +++ b/drivers/net/eth16i.c @@ -170,7 +170,6 @@ static char *version = /* Few macros */ -#define BIT(a) ( (1 << (a)) ) #define BITSET(ioaddr, bnum) ((outb(((inb(ioaddr)) | (bnum)), ioaddr))) #define BITCLR(ioaddr, bnum) ((outb(((inb(ioaddr)) & (~(bnum))), ioaddr))) diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c index bc02e4694804..11b83dae00ac 100644 --- a/drivers/net/hamradio/dmascc.c +++ b/drivers/net/hamradio/dmascc.c @@ -21,6 +21,7 @@ #include <linux/module.h> +#include <linux/bitops.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/if_arp.h> @@ -35,7 +36,6 @@ #include <linux/sockios.h> #include <linux/workqueue.h> #include <asm/atomic.h> -#include <asm/bitops.h> #include <asm/dma.h> #include <asm/io.h> #include <asm/irq.h> diff --git a/drivers/net/mac89x0.c b/drivers/net/mac89x0.c index 30854f094965..a19b5958cee9 100644 --- a/drivers/net/mac89x0.c +++ b/drivers/net/mac89x0.c @@ -99,9 +99,9 @@ static char *version = #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/delay.h> +#include <linux/bitops.h> #include <asm/system.h> -#include <asm/bitops.h> #include <asm/io.h> #include <asm/hwtest.h> #include <asm/macints.h> diff --git a/drivers/net/meth.h b/drivers/net/meth.h index ea3b8fc86d1e..a78dc1ca8c29 100644 --- a/drivers/net/meth.h +++ b/drivers/net/meth.h @@ -28,9 +28,6 @@ #define RX_BUFFER_OFFSET (sizeof(rx_status_vector)+2) /* staus vector + 2 bytes of padding */ #define RX_BUCKET_SIZE 256 -#undef BIT -#define BIT(x) (1UL << (x)) - /* For more detailed explanations of what each field menas, see Nick's great comments to #defines below (or docs, if you are lucky enough toget hold of them :)*/ diff --git a/drivers/net/s2io-regs.h b/drivers/net/s2io-regs.h index aef66e2d98d2..01f08d726ace 100644 --- a/drivers/net/s2io-regs.h +++ b/drivers/net/s2io-regs.h @@ -20,17 +20,17 @@ struct XENA_dev_config { /* General Control-Status Registers */ u64 general_int_status; -#define GEN_INTR_TXPIC BIT(0) -#define GEN_INTR_TXDMA BIT(1) -#define GEN_INTR_TXMAC BIT(2) -#define GEN_INTR_TXXGXS BIT(3) -#define GEN_INTR_TXTRAFFIC BIT(8) -#define GEN_INTR_RXPIC BIT(32) -#define GEN_INTR_RXDMA BIT(33) -#define GEN_INTR_RXMAC BIT(34) -#define GEN_INTR_MC BIT(35) -#define GEN_INTR_RXXGXS BIT(36) -#define GEN_INTR_RXTRAFFIC BIT(40) +#define GEN_INTR_TXPIC s2BIT(0) +#define GEN_INTR_TXDMA s2BIT(1) +#define GEN_INTR_TXMAC s2BIT(2) +#define GEN_INTR_TXXGXS s2BIT(3) +#define GEN_INTR_TXTRAFFIC s2BIT(8) +#define GEN_INTR_RXPIC s2BIT(32) +#define GEN_INTR_RXDMA s2BIT(33) +#define GEN_INTR_RXMAC s2BIT(34) +#define GEN_INTR_MC s2BIT(35) +#define GEN_INTR_RXXGXS s2BIT(36) +#define GEN_INTR_RXTRAFFIC s2BIT(40) #define GEN_ERROR_INTR GEN_INTR_TXPIC | GEN_INTR_RXPIC | \ GEN_INTR_TXDMA | GEN_INTR_RXDMA | \ GEN_INTR_TXMAC | GEN_INTR_RXMAC | \ @@ -54,36 +54,36 @@ struct XENA_dev_config { u64 adapter_status; -#define ADAPTER_STATUS_TDMA_READY BIT(0) -#define ADAPTER_STATUS_RDMA_READY BIT(1) -#define ADAPTER_STATUS_PFC_READY BIT(2) -#define ADAPTER_STATUS_TMAC_BUF_EMPTY BIT(3) -#define ADAPTER_STATUS_PIC_QUIESCENT BIT(5) -#define ADAPTER_STATUS_RMAC_REMOTE_FAULT BIT(6) -#define ADAPTER_STATUS_RMAC_LOCAL_FAULT BIT(7) +#define ADAPTER_STATUS_TDMA_READY s2BIT(0) +#define ADAPTER_STATUS_RDMA_READY s2BIT(1) +#define ADAPTER_STATUS_PFC_READY s2BIT(2) +#define ADAPTER_STATUS_TMAC_BUF_EMPTY s2BIT(3) +#define ADAPTER_STATUS_PIC_QUIESCENT s2BIT(5) +#define ADAPTER_STATUS_RMAC_REMOTE_FAULT s2BIT(6) +#define ADAPTER_STATUS_RMAC_LOCAL_FAULT s2BIT(7) #define ADAPTER_STATUS_RMAC_PCC_IDLE vBIT(0xFF,8,8) #define ADAPTER_STATUS_RMAC_PCC_FOUR_IDLE vBIT(0x0F,8,8) #define ADAPTER_STATUS_RC_PRC_QUIESCENT vBIT(0xFF,16,8) -#define ADAPTER_STATUS_MC_DRAM_READY BIT(24) -#define ADAPTER_STATUS_MC_QUEUES_READY BIT(25) -#define ADAPTER_STATUS_M_PLL_LOCK BIT(30) -#define ADAPTER_STATUS_P_PLL_LOCK BIT(31) +#define ADAPTER_STATUS_MC_DRAM_READY s2BIT(24) +#define ADAPTER_STATUS_MC_QUEUES_READY s2BIT(25) +#define ADAPTER_STATUS_M_PLL_LOCK s2BIT(30) +#define ADAPTER_STATUS_P_PLL_LOCK s2BIT(31) u64 adapter_control; -#define ADAPTER_CNTL_EN BIT(7) -#define ADAPTER_EOI_TX_ON BIT(15) -#define ADAPTER_LED_ON BIT(23) +#define ADAPTER_CNTL_EN s2BIT(7) +#define ADAPTER_EOI_TX_ON s2BIT(15) +#define ADAPTER_LED_ON s2BIT(23) #define ADAPTER_UDPI(val) vBIT(val,36,4) -#define ADAPTER_WAIT_INT BIT(48) -#define ADAPTER_ECC_EN BIT(55) +#define ADAPTER_WAIT_INT s2BIT(48) +#define ADAPTER_ECC_EN s2BIT(55) u64 serr_source; -#define SERR_SOURCE_PIC BIT(0) -#define SERR_SOURCE_TXDMA BIT(1) -#define SERR_SOURCE_RXDMA BIT(2) -#define SERR_SOURCE_MAC BIT(3) -#define SERR_SOURCE_MC BIT(4) -#define SERR_SOURCE_XGXS BIT(5) +#define SERR_SOURCE_PIC s2BIT(0) +#define SERR_SOURCE_TXDMA s2BIT(1) +#define SERR_SOURCE_RXDMA s2BIT(2) +#define SERR_SOURCE_MAC s2BIT(3) +#define SERR_SOURCE_MC s2BIT(4) +#define SERR_SOURCE_XGXS s2BIT(5) #define SERR_SOURCE_ANY (SERR_SOURCE_PIC | \ SERR_SOURCE_TXDMA | \ SERR_SOURCE_RXDMA | \ @@ -101,41 +101,41 @@ struct XENA_dev_config { #define PCI_MODE_PCIX_M2_66 0x5 #define PCI_MODE_PCIX_M2_100 0x6 #define PCI_MODE_PCIX_M2_133 0x7 -#define PCI_MODE_UNSUPPORTED BIT(0) -#define PCI_MODE_32_BITS BIT(8) -#define PCI_MODE_UNKNOWN_MODE BIT(9) +#define PCI_MODE_UNSUPPORTED s2BIT(0) +#define PCI_MODE_32_BITS s2BIT(8) +#define PCI_MODE_UNKNOWN_MODE s2BIT(9) u8 unused_0[0x800 - 0x128]; /* PCI-X Controller registers */ u64 pic_int_status; u64 pic_int_mask; -#define PIC_INT_TX BIT(0) -#define PIC_INT_FLSH BIT(1) -#define PIC_INT_MDIO BIT(2) -#define PIC_INT_IIC BIT(3) -#define PIC_INT_GPIO BIT(4) -#define PIC_INT_RX BIT(32) +#define PIC_INT_TX s2BIT(0) +#define PIC_INT_FLSH s2BIT(1) +#define PIC_INT_MDIO s2BIT(2) +#define PIC_INT_IIC s2BIT(3) +#define PIC_INT_GPIO s2BIT(4) +#define PIC_INT_RX s2BIT(32) u64 txpic_int_reg; u64 txpic_int_mask; -#define PCIX_INT_REG_ECC_SG_ERR BIT(0) -#define PCIX_INT_REG_ECC_DB_ERR BIT(1) -#define PCIX_INT_REG_FLASHR_R_FSM_ERR BIT(8) -#define PCIX_INT_REG_FLASHR_W_FSM_ERR BIT(9) -#define PCIX_INT_REG_INI_TX_FSM_SERR BIT(10) -#define PCIX_INT_REG_INI_TXO_FSM_ERR BIT(11) -#define PCIX_INT_REG_TRT_FSM_SERR BIT(13) -#define PCIX_INT_REG_SRT_FSM_SERR BIT(14) -#define PCIX_INT_REG_PIFR_FSM_SERR BIT(15) -#define PCIX_INT_REG_WRC_TX_SEND_FSM_SERR BIT(21) -#define PCIX_INT_REG_RRC_TX_REQ_FSM_SERR BIT(23) -#define PCIX_INT_REG_INI_RX_FSM_SERR BIT(48) -#define PCIX_INT_REG_RA_RX_FSM_SERR BIT(50) +#define PCIX_INT_REG_ECC_SG_ERR s2BIT(0) +#define PCIX_INT_REG_ECC_DB_ERR s2BIT(1) +#define PCIX_INT_REG_FLASHR_R_FSM_ERR s2BIT(8) +#define PCIX_INT_REG_FLASHR_W_FSM_ERR s2BIT(9) +#define PCIX_INT_REG_INI_TX_FSM_SERR s2BIT(10) +#define PCIX_INT_REG_INI_TXO_FSM_ERR s2BIT(11) +#define PCIX_INT_REG_TRT_FSM_SERR s2BIT(13) +#define PCIX_INT_REG_SRT_FSM_SERR s2BIT(14) +#define PCIX_INT_REG_PIFR_FSM_SERR s2BIT(15) +#define PCIX_INT_REG_WRC_TX_SEND_FSM_SERR s2BIT(21) +#define PCIX_INT_REG_RRC_TX_REQ_FSM_SERR s2BIT(23) +#define PCIX_INT_REG_INI_RX_FSM_SERR s2BIT(48) +#define PCIX_INT_REG_RA_RX_FSM_SERR s2BIT(50) /* -#define PCIX_INT_REG_WRC_RX_SEND_FSM_SERR BIT(52) -#define PCIX_INT_REG_RRC_RX_REQ_FSM_SERR BIT(54) -#define PCIX_INT_REG_RRC_RX_SPLIT_FSM_SERR BIT(58) +#define PCIX_INT_REG_WRC_RX_SEND_FSM_SERR s2BIT(52) +#define PCIX_INT_REG_RRC_RX_REQ_FSM_SERR s2BIT(54) +#define PCIX_INT_REG_RRC_RX_SPLIT_FSM_SERR s2BIT(58) */ u64 txpic_alarms; u64 rxpic_int_reg; @@ -144,92 +144,92 @@ struct XENA_dev_config { u64 flsh_int_reg; u64 flsh_int_mask; -#define PIC_FLSH_INT_REG_CYCLE_FSM_ERR BIT(63) -#define PIC_FLSH_INT_REG_ERR BIT(62) +#define PIC_FLSH_INT_REG_CYCLE_FSM_ERR s2BIT(63) +#define PIC_FLSH_INT_REG_ERR s2BIT(62) u64 flash_alarms; u64 mdio_int_reg; u64 mdio_int_mask; -#define MDIO_INT_REG_MDIO_BUS_ERR BIT(0) -#define MDIO_INT_REG_DTX_BUS_ERR BIT(8) -#define MDIO_INT_REG_LASI BIT(39) +#define MDIO_INT_REG_MDIO_BUS_ERR s2BIT(0) +#define MDIO_INT_REG_DTX_BUS_ERR s2BIT(8) +#define MDIO_INT_REG_LASI s2BIT(39) u64 mdio_alarms; u64 iic_int_reg; u64 iic_int_mask; -#define IIC_INT_REG_BUS_FSM_ERR BIT(4) -#define IIC_INT_REG_BIT_FSM_ERR BIT(5) -#define IIC_INT_REG_CYCLE_FSM_ERR BIT(6) -#define IIC_INT_REG_REQ_FSM_ERR BIT(7) -#define IIC_INT_REG_ACK_ERR BIT(8) +#define IIC_INT_REG_BUS_FSM_ERR s2BIT(4) +#define IIC_INT_REG_BIT_FSM_ERR s2BIT(5) +#define IIC_INT_REG_CYCLE_FSM_ERR s2BIT(6) +#define IIC_INT_REG_REQ_FSM_ERR s2BIT(7) +#define IIC_INT_REG_ACK_ERR s2BIT(8) u64 iic_alarms; u8 unused4[0x08]; u64 gpio_int_reg; -#define GPIO_INT_REG_DP_ERR_INT BIT(0) -#define GPIO_INT_REG_LINK_DOWN BIT(1) -#define GPIO_INT_REG_LINK_UP BIT(2) +#define GPIO_INT_REG_DP_ERR_INT s2BIT(0) +#define GPIO_INT_REG_LINK_DOWN s2BIT(1) +#define GPIO_INT_REG_LINK_UP s2BIT(2) u64 gpio_int_mask; -#define GPIO_INT_MASK_LINK_DOWN BIT(1) -#define GPIO_INT_MASK_LINK_UP BIT(2) +#define GPIO_INT_MASK_LINK_DOWN s2BIT(1) +#define GPIO_INT_MASK_LINK_UP s2BIT(2) u64 gpio_alarms; u8 unused5[0x38]; u64 tx_traffic_int; -#define TX_TRAFFIC_INT_n(n) BIT(n) +#define TX_TRAFFIC_INT_n(n) s2BIT(n) u64 tx_traffic_mask; u64 rx_traffic_int; -#define RX_TRAFFIC_INT_n(n) BIT(n) +#define RX_TRAFFIC_INT_n(n) s2BIT(n) u64 rx_traffic_mask; /* PIC Control registers */ u64 pic_control; -#define PIC_CNTL_RX_ALARM_MAP_1 BIT(0) +#define PIC_CNTL_RX_ALARM_MAP_1 s2BIT(0) #define PIC_CNTL_SHARED_SPLITS(n) vBIT(n,11,5) u64 swapper_ctrl; -#define SWAPPER_CTRL_PIF_R_FE BIT(0) -#define SWAPPER_CTRL_PIF_R_SE BIT(1) -#define SWAPPER_CTRL_PIF_W_FE BIT(8) -#define SWAPPER_CTRL_PIF_W_SE BIT(9) -#define SWAPPER_CTRL_TXP_FE BIT(16) -#define SWAPPER_CTRL_TXP_SE BIT(17) -#define SWAPPER_CTRL_TXD_R_FE BIT(18) -#define SWAPPER_CTRL_TXD_R_SE BIT(19) -#define SWAPPER_CTRL_TXD_W_FE BIT(20) -#define SWAPPER_CTRL_TXD_W_SE BIT(21) -#define SWAPPER_CTRL_TXF_R_FE BIT(22) -#define SWAPPER_CTRL_TXF_R_SE BIT(23) -#define SWAPPER_CTRL_RXD_R_FE BIT(32) -#define SWAPPER_CTRL_RXD_R_SE BIT(33) -#define SWAPPER_CTRL_RXD_W_FE BIT(34) -#define SWAPPER_CTRL_RXD_W_SE BIT(35) -#define SWAPPER_CTRL_RXF_W_FE BIT(36) -#define SWAPPER_CTRL_RXF_W_SE BIT(37) -#define SWAPPER_CTRL_XMSI_FE BIT(40) -#define SWAPPER_CTRL_XMSI_SE BIT(41) -#define SWAPPER_CTRL_STATS_FE BIT(48) -#define SWAPPER_CTRL_STATS_SE BIT(49) +#define SWAPPER_CTRL_PIF_R_FE s2BIT(0) +#define SWAPPER_CTRL_PIF_R_SE s2BIT(1) +#define SWAPPER_CTRL_PIF_W_FE s2BIT(8) +#define SWAPPER_CTRL_PIF_W_SE s2BIT(9) +#define SWAPPER_CTRL_TXP_FE s2BIT(16) +#define SWAPPER_CTRL_TXP_SE s2BIT(17) +#define SWAPPER_CTRL_TXD_R_FE s2BIT(18) +#define SWAPPER_CTRL_TXD_R_SE s2BIT(19) +#define SWAPPER_CTRL_TXD_W_FE s2BIT(20) +#define SWAPPER_CTRL_TXD_W_SE s2BIT(21) +#define SWAPPER_CTRL_TXF_R_FE s2BIT(22) +#define SWAPPER_CTRL_TXF_R_SE s2BIT(23) +#define SWAPPER_CTRL_RXD_R_FE s2BIT(32) +#define SWAPPER_CTRL_RXD_R_SE s2BIT(33) +#define SWAPPER_CTRL_RXD_W_FE s2BIT(34) +#define SWAPPER_CTRL_RXD_W_SE s2BIT(35) +#define SWAPPER_CTRL_RXF_W_FE s2BIT(36) +#define SWAPPER_CTRL_RXF_W_SE s2BIT(37) +#define SWAPPER_CTRL_XMSI_FE s2BIT(40) +#define SWAPPER_CTRL_XMSI_SE s2BIT(41) +#define SWAPPER_CTRL_STATS_FE s2BIT(48) +#define SWAPPER_CTRL_STATS_SE s2BIT(49) u64 pif_rd_swapper_fb; #define IF_RD_SWAPPER_FB 0x0123456789ABCDEF u64 scheduled_int_ctrl; -#define SCHED_INT_CTRL_TIMER_EN BIT(0) -#define SCHED_INT_CTRL_ONE_SHOT BIT(1) +#define SCHED_INT_CTRL_TIMER_EN s2BIT(0) +#define SCHED_INT_CTRL_ONE_SHOT s2BIT(1) #define SCHED_INT_CTRL_INT2MSI(val) vBIT(val,10,6) #define SCHED_INT_PERIOD TBD u64 txreqtimeout; #define TXREQTO_VAL(val) vBIT(val,0,32) -#define TXREQTO_EN BIT(63) +#define TXREQTO_EN s2BIT(63) u64 statsreqtimeout; #define STATREQTO_VAL(n) TBD -#define STATREQTO_EN BIT(63) +#define STATREQTO_EN s2BIT(63) u64 read_retry_delay; u64 read_retry_acceleration; @@ -255,10 +255,10 @@ struct XENA_dev_config { /* Automated statistics collection */ u64 stat_cfg; -#define STAT_CFG_STAT_EN BIT(0) -#define STAT_CFG_ONE_SHOT_EN BIT(1) -#define STAT_CFG_STAT_NS_EN BIT(8) -#define STAT_CFG_STAT_RO BIT(9) +#define STAT_CFG_STAT_EN s2BIT(0) +#define STAT_CFG_ONE_SHOT_EN s2BIT(1) +#define STAT_CFG_STAT_NS_EN s2BIT(8) +#define STAT_CFG_STAT_RO s2BIT(9) #define STAT_TRSF_PER(n) TBD #define PER_SEC 0x208d5 #define SET_UPDT_PERIOD(n) vBIT((PER_SEC*n),32,32) @@ -290,18 +290,18 @@ struct XENA_dev_config { #define I2C_CONTROL_DEV_ID(id) vBIT(id,1,3) #define I2C_CONTROL_ADDR(addr) vBIT(addr,5,11) #define I2C_CONTROL_BYTE_CNT(cnt) vBIT(cnt,22,2) -#define I2C_CONTROL_READ BIT(24) -#define I2C_CONTROL_NACK BIT(25) +#define I2C_CONTROL_READ s2BIT(24) +#define I2C_CONTROL_NACK s2BIT(25) #define I2C_CONTROL_CNTL_START vBIT(0xE,28,4) #define I2C_CONTROL_CNTL_END(val) (val & vBIT(0x1,28,4)) #define I2C_CONTROL_GET_DATA(val) (u32)(val & 0xFFFFFFFF) #define I2C_CONTROL_SET_DATA(val) vBIT(val,32,32) u64 gpio_control; -#define GPIO_CTRL_GPIO_0 BIT(8) +#define GPIO_CTRL_GPIO_0 s2BIT(8) u64 misc_control; -#define FAULT_BEHAVIOUR BIT(0) -#define EXT_REQ_EN BIT(1) +#define FAULT_BEHAVIOUR s2BIT(0) +#define EXT_REQ_EN s2BIT(1) #define MISC_LINK_STABILITY_PRD(val) vBIT(val,29,3) u8 unused7_1[0x230 - 0x208]; @@ -317,29 +317,29 @@ struct XENA_dev_config { /* TxDMA registers */ u64 txdma_int_status; u64 txdma_int_mask; -#define TXDMA_PFC_INT BIT(0) -#define TXDMA_TDA_INT BIT(1) -#define TXDMA_PCC_INT BIT(2) -#define TXDMA_TTI_INT BIT(3) -#define TXDMA_LSO_INT BIT(4) -#define TXDMA_TPA_INT BIT(5) -#define TXDMA_SM_INT BIT(6) +#define TXDMA_PFC_INT s2BIT(0) +#define TXDMA_TDA_INT s2BIT(1) +#define TXDMA_PCC_INT s2BIT(2) +#define TXDMA_TTI_INT s2BIT(3) +#define TXDMA_LSO_INT s2BIT(4) +#define TXDMA_TPA_INT s2BIT(5) +#define TXDMA_SM_INT s2BIT(6) u64 pfc_err_reg; -#define PFC_ECC_SG_ERR BIT(7) -#define PFC_ECC_DB_ERR BIT(15) -#define PFC_SM_ERR_ALARM BIT(23) -#define PFC_MISC_0_ERR BIT(31) -#define PFC_MISC_1_ERR BIT(32) -#define PFC_PCIX_ERR BIT(39) +#define PFC_ECC_SG_ERR s2BIT(7) +#define PFC_ECC_DB_ERR s2BIT(15) +#define PFC_SM_ERR_ALARM s2BIT(23) +#define PFC_MISC_0_ERR s2BIT(31) +#define PFC_MISC_1_ERR s2BIT(32) +#define PFC_PCIX_ERR s2BIT(39) u64 pfc_err_mask; u64 pfc_err_alarm; u64 tda_err_reg; #define TDA_Fn_ECC_SG_ERR vBIT(0xff,0,8) #define TDA_Fn_ECC_DB_ERR vBIT(0xff,8,8) -#define TDA_SM0_ERR_ALARM BIT(22) -#define TDA_SM1_ERR_ALARM BIT(23) -#define TDA_PCIX_ERR BIT(39) +#define TDA_SM0_ERR_ALARM s2BIT(22) +#define TDA_SM1_ERR_ALARM s2BIT(23) +#define TDA_PCIX_ERR s2BIT(39) u64 tda_err_mask; u64 tda_err_alarm; @@ -351,40 +351,40 @@ struct XENA_dev_config { #define PCC_SM_ERR_ALARM vBIT(0xff,32,8) #define PCC_WR_ERR_ALARM vBIT(0xff,40,8) #define PCC_N_SERR vBIT(0xff,48,8) -#define PCC_6_COF_OV_ERR BIT(56) -#define PCC_7_COF_OV_ERR BIT(57) -#define PCC_6_LSO_OV_ERR BIT(58) -#define PCC_7_LSO_OV_ERR BIT(59) +#define PCC_6_COF_OV_ERR s2BIT(56) +#define PCC_7_COF_OV_ERR s2BIT(57) +#define PCC_6_LSO_OV_ERR s2BIT(58) +#define PCC_7_LSO_OV_ERR s2BIT(59) #define PCC_ENABLE_FOUR vBIT(0x0F,0,8) u64 pcc_err_mask; u64 pcc_err_alarm; u64 tti_err_reg; -#define TTI_ECC_SG_ERR BIT(7) -#define TTI_ECC_DB_ERR BIT(15) -#define TTI_SM_ERR_ALARM BIT(23) +#define TTI_ECC_SG_ERR s2BIT(7) +#define TTI_ECC_DB_ERR s2BIT(15) +#define TTI_SM_ERR_ALARM s2BIT(23) u64 tti_err_mask; u64 tti_err_alarm; u64 lso_err_reg; -#define LSO6_SEND_OFLOW BIT(12) -#define LSO7_SEND_OFLOW BIT(13) -#define LSO6_ABORT BIT(14) -#define LSO7_ABORT BIT(15) -#define LSO6_SM_ERR_ALARM BIT(22) -#define LSO7_SM_ERR_ALARM BIT(23) +#define LSO6_SEND_OFLOW s2BIT(12) +#define LSO7_SEND_OFLOW s2BIT(13) +#define LSO6_ABORT s2BIT(14) +#define LSO7_ABORT s2BIT(15) +#define LSO6_SM_ERR_ALARM s2BIT(22) +#define LSO7_SM_ERR_ALARM s2BIT(23) u64 lso_err_mask; u64 lso_err_alarm; u64 tpa_err_reg; -#define TPA_TX_FRM_DROP BIT(7) -#define TPA_SM_ERR_ALARM BIT(23) +#define TPA_TX_FRM_DROP s2BIT(7) +#define TPA_SM_ERR_ALARM s2BIT(23) u64 tpa_err_mask; u64 tpa_err_alarm; u64 sm_err_reg; -#define SM_SM_ERR_ALARM BIT(15) +#define SM_SM_ERR_ALARM s2BIT(15) u64 sm_err_mask; u64 sm_err_alarm; @@ -397,7 +397,7 @@ struct XENA_dev_config { #define X_MAX_FIFOS 8 #define X_FIFO_MAX_LEN 0x1FFF /*8191 */ u64 tx_fifo_partition_0; -#define TX_FIFO_PARTITION_EN BIT(0) +#define TX_FIFO_PARTITION_EN s2BIT(0) #define TX_FIFO_PARTITION_0_PRI(val) vBIT(val,5,3) #define TX_FIFO_PARTITION_0_LEN(val) vBIT(val,19,13) #define TX_FIFO_PARTITION_1_PRI(val) vBIT(val,37,3) @@ -437,16 +437,16 @@ struct XENA_dev_config { u64 tx_w_round_robin_4; u64 tti_command_mem; -#define TTI_CMD_MEM_WE BIT(7) -#define TTI_CMD_MEM_STROBE_NEW_CMD BIT(15) -#define TTI_CMD_MEM_STROBE_BEING_EXECUTED BIT(15) +#define TTI_CMD_MEM_WE s2BIT(7) +#define TTI_CMD_MEM_STROBE_NEW_CMD s2BIT(15) +#define TTI_CMD_MEM_STROBE_BEING_EXECUTED s2BIT(15) #define TTI_CMD_MEM_OFFSET(n) vBIT(n,26,6) u64 tti_data1_mem; #define TTI_DATA1_MEM_TX_TIMER_VAL(n) vBIT(n,6,26) #define TTI_DATA1_MEM_TX_TIMER_AC_CI(n) vBIT(n,38,2) -#define TTI_DATA1_MEM_TX_TIMER_AC_EN BIT(38) -#define TTI_DATA1_MEM_TX_TIMER_CI_EN BIT(39) +#define TTI_DATA1_MEM_TX_TIMER_AC_EN s2BIT(38) +#define TTI_DATA1_MEM_TX_TIMER_CI_EN s2BIT(39) #define TTI_DATA1_MEM_TX_URNG_A(n) vBIT(n,41,7) #define TTI_DATA1_MEM_TX_URNG_B(n) vBIT(n,49,7) #define TTI_DATA1_MEM_TX_URNG_C(n) vBIT(n,57,7) @@ -459,11 +459,11 @@ struct XENA_dev_config { /* Tx Protocol assist */ u64 tx_pa_cfg; -#define TX_PA_CFG_IGNORE_FRM_ERR BIT(1) -#define TX_PA_CFG_IGNORE_SNAP_OUI BIT(2) -#define TX_PA_CFG_IGNORE_LLC_CTRL BIT(3) -#define TX_PA_CFG_IGNORE_L2_ERR BIT(6) -#define RX_PA_CFG_STRIP_VLAN_TAG BIT(15) +#define TX_PA_CFG_IGNORE_FRM_ERR s2BIT(1) +#define TX_PA_CFG_IGNORE_SNAP_OUI s2BIT(2) +#define TX_PA_CFG_IGNORE_LLC_CTRL s2BIT(3) +#define TX_PA_CFG_IGNORE_L2_ERR s2BIT(6) +#define RX_PA_CFG_STRIP_VLAN_TAG s2BIT(15) /* Recent add, used only debug purposes. */ u64 pcc_enable; @@ -477,31 +477,31 @@ struct XENA_dev_config { /* RxDMA Registers */ u64 rxdma_int_status; u64 rxdma_int_mask; -#define RXDMA_INT_RC_INT_M BIT(0) -#define RXDMA_INT_RPA_INT_M BIT(1) -#define RXDMA_INT_RDA_INT_M BIT(2) -#define RXDMA_INT_RTI_INT_M BIT(3) +#define RXDMA_INT_RC_INT_M s2BIT(0) +#define RXDMA_INT_RPA_INT_M s2BIT(1) +#define RXDMA_INT_RDA_INT_M s2BIT(2) +#define RXDMA_INT_RTI_INT_M s2BIT(3) u64 rda_err_reg; #define RDA_RXDn_ECC_SG_ERR vBIT(0xFF,0,8) #define RDA_RXDn_ECC_DB_ERR vBIT(0xFF,8,8) -#define RDA_FRM_ECC_SG_ERR BIT(23) -#define RDA_FRM_ECC_DB_N_AERR BIT(31) -#define RDA_SM1_ERR_ALARM BIT(38) -#define RDA_SM0_ERR_ALARM BIT(39) -#define RDA_MISC_ERR BIT(47) -#define RDA_PCIX_ERR BIT(55) -#define RDA_RXD_ECC_DB_SERR BIT(63) +#define RDA_FRM_ECC_SG_ERR s2BIT(23) +#define RDA_FRM_ECC_DB_N_AERR s2BIT(31) +#define RDA_SM1_ERR_ALARM s2BIT(38) +#define RDA_SM0_ERR_ALARM s2BIT(39) +#define RDA_MISC_ERR s2BIT(47) +#define RDA_PCIX_ERR s2BIT(55) +#define RDA_RXD_ECC_DB_SERR s2BIT(63) u64 rda_err_mask; u64 rda_err_alarm; u64 rc_err_reg; #define RC_PRCn_ECC_SG_ERR vBIT(0xFF,0,8) #define RC_PRCn_ECC_DB_ERR vBIT(0xFF,8,8) -#define RC_FTC_ECC_SG_ERR BIT(23) -#define RC_FTC_ECC_DB_ERR BIT(31) +#define RC_FTC_ECC_SG_ERR s2BIT(23) +#define RC_FTC_ECC_DB_ERR s2BIT(31) #define RC_PRCn_SM_ERR_ALARM vBIT(0xFF,32,8) -#define RC_FTC_SM_ERR_ALARM BIT(47) +#define RC_FTC_SM_ERR_ALARM s2BIT(47) #define RC_RDA_FAIL_WR_Rn vBIT(0xFF,48,8) u64 rc_err_mask; u64 rc_err_alarm; @@ -517,18 +517,18 @@ struct XENA_dev_config { u64 prc_pcix_err_alarm; u64 rpa_err_reg; -#define RPA_ECC_SG_ERR BIT(7) -#define RPA_ECC_DB_ERR BIT(15) -#define RPA_FLUSH_REQUEST BIT(22) -#define RPA_SM_ERR_ALARM BIT(23) -#define RPA_CREDIT_ERR BIT(31) +#define RPA_ECC_SG_ERR s2BIT(7) +#define RPA_ECC_DB_ERR s2BIT(15) +#define RPA_FLUSH_REQUEST s2BIT(22) +#define RPA_SM_ERR_ALARM s2BIT(23) +#define RPA_CREDIT_ERR s2BIT(31) u64 rpa_err_mask; u64 rpa_err_alarm; u64 rti_err_reg; -#define RTI_ECC_SG_ERR BIT(7) -#define RTI_ECC_DB_ERR BIT(15) -#define RTI_SM_ERR_ALARM BIT(23) +#define RTI_ECC_SG_ERR s2BIT(7) +#define RTI_ECC_DB_ERR s2BIT(15) +#define RTI_SM_ERR_ALARM s2BIT(23) u64 rti_err_mask; u64 rti_err_alarm; @@ -568,49 +568,49 @@ struct XENA_dev_config { #endif u64 prc_rxd0_n[RX_MAX_RINGS]; u64 prc_ctrl_n[RX_MAX_RINGS]; -#define PRC_CTRL_RC_ENABLED BIT(7) -#define PRC_CTRL_RING_MODE (BIT(14)|BIT(15)) +#define PRC_CTRL_RC_ENABLED s2BIT(7) +#define PRC_CTRL_RING_MODE (s2BIT(14)|s2BIT(15)) #define PRC_CTRL_RING_MODE_1 vBIT(0,14,2) #define PRC_CTRL_RING_MODE_3 vBIT(1,14,2) #define PRC_CTRL_RING_MODE_5 vBIT(2,14,2) #define PRC_CTRL_RING_MODE_x vBIT(3,14,2) -#define PRC_CTRL_NO_SNOOP (BIT(22)|BIT(23)) -#define PRC_CTRL_NO_SNOOP_DESC BIT(22) -#define PRC_CTRL_NO_SNOOP_BUFF BIT(23) -#define PRC_CTRL_BIMODAL_INTERRUPT BIT(37) -#define PRC_CTRL_GROUP_READS BIT(38) +#define PRC_CTRL_NO_SNOOP (s2BIT(22)|s2BIT(23)) +#define PRC_CTRL_NO_SNOOP_DESC s2BIT(22) +#define PRC_CTRL_NO_SNOOP_BUFF s2BIT(23) +#define PRC_CTRL_BIMODAL_INTERRUPT s2BIT(37) +#define PRC_CTRL_GROUP_READS s2BIT(38) #define PRC_CTRL_RXD_BACKOFF_INTERVAL(val) vBIT(val,40,24) u64 prc_alarm_action; -#define PRC_ALARM_ACTION_RR_R0_STOP BIT(3) -#define PRC_ALARM_ACTION_RW_R0_STOP BIT(7) -#define PRC_ALARM_ACTION_RR_R1_STOP BIT(11) -#define PRC_ALARM_ACTION_RW_R1_STOP BIT(15) -#define PRC_ALARM_ACTION_RR_R2_STOP BIT(19) -#define PRC_ALARM_ACTION_RW_R2_STOP BIT(23) -#define PRC_ALARM_ACTION_RR_R3_STOP BIT(27) -#define PRC_ALARM_ACTION_RW_R3_STOP BIT(31) -#define PRC_ALARM_ACTION_RR_R4_STOP BIT(35) -#define PRC_ALARM_ACTION_RW_R4_STOP BIT(39) -#define PRC_ALARM_ACTION_RR_R5_STOP BIT(43) -#define PRC_ALARM_ACTION_RW_R5_STOP BIT(47) -#define PRC_ALARM_ACTION_RR_R6_STOP BIT(51) -#define PRC_ALARM_ACTION_RW_R6_STOP BIT(55) -#define PRC_ALARM_ACTION_RR_R7_STOP BIT(59) -#define PRC_ALARM_ACTION_RW_R7_STOP BIT(63) +#define PRC_ALARM_ACTION_RR_R0_STOP s2BIT(3) +#define PRC_ALARM_ACTION_RW_R0_STOP s2BIT(7) +#define PRC_ALARM_ACTION_RR_R1_STOP s2BIT(11) +#define PRC_ALARM_ACTION_RW_R1_STOP s2BIT(15) +#define PRC_ALARM_ACTION_RR_R2_STOP s2BIT(19) +#define PRC_ALARM_ACTION_RW_R2_STOP s2BIT(23) +#define PRC_ALARM_ACTION_RR_R3_STOP s2BIT(27) +#define PRC_ALARM_ACTION_RW_R3_STOP s2BIT(31) +#define PRC_ALARM_ACTION_RR_R4_STOP s2BIT(35) +#define PRC_ALARM_ACTION_RW_R4_STOP s2BIT(39) +#define PRC_ALARM_ACTION_RR_R5_STOP s2BIT(43) +#define PRC_ALARM_ACTION_RW_R5_STOP s2BIT(47) +#define PRC_ALARM_ACTION_RR_R6_STOP s2BIT(51) +#define PRC_ALARM_ACTION_RW_R6_STOP s2BIT(55) +#define PRC_ALARM_ACTION_RR_R7_STOP s2BIT(59) +#define PRC_ALARM_ACTION_RW_R7_STOP s2BIT(63) /* Receive traffic interrupts */ u64 rti_command_mem; -#define RTI_CMD_MEM_WE BIT(7) -#define RTI_CMD_MEM_STROBE BIT(15) -#define RTI_CMD_MEM_STROBE_NEW_CMD BIT(15) -#define RTI_CMD_MEM_STROBE_CMD_BEING_EXECUTED BIT(15) +#define RTI_CMD_MEM_WE s2BIT(7) +#define RTI_CMD_MEM_STROBE s2BIT(15) +#define RTI_CMD_MEM_STROBE_NEW_CMD s2BIT(15) +#define RTI_CMD_MEM_STROBE_CMD_BEING_EXECUTED s2BIT(15) #define RTI_CMD_MEM_OFFSET(n) vBIT(n,29,3) u64 rti_data1_mem; #define RTI_DATA1_MEM_RX_TIMER_VAL(n) vBIT(n,3,29) -#define RTI_DATA1_MEM_RX_TIMER_AC_EN BIT(38) -#define RTI_DATA1_MEM_RX_TIMER_CI_EN BIT(39) +#define RTI_DATA1_MEM_RX_TIMER_AC_EN s2BIT(38) +#define RTI_DATA1_MEM_RX_TIMER_CI_EN s2BIT(39) #define RTI_DATA1_MEM_RX_URNG_A(n) vBIT(n,41,7) #define RTI_DATA1_MEM_RX_URNG_B(n) vBIT(n,49,7) #define RTI_DATA1_MEM_RX_URNG_C(n) vBIT(n,57,7) @@ -622,10 +622,10 @@ struct XENA_dev_config { #define RTI_DATA2_MEM_RX_UFC_D(n) vBIT(n,48,16) u64 rx_pa_cfg; -#define RX_PA_CFG_IGNORE_FRM_ERR BIT(1) -#define RX_PA_CFG_IGNORE_SNAP_OUI BIT(2) -#define RX_PA_CFG_IGNORE_LLC_CTRL BIT(3) -#define RX_PA_CFG_IGNORE_L2_ERR BIT(6) +#define RX_PA_CFG_IGNORE_FRM_ERR s2BIT(1) +#define RX_PA_CFG_IGNORE_SNAP_OUI s2BIT(2) +#define RX_PA_CFG_IGNORE_LLC_CTRL s2BIT(3) +#define RX_PA_CFG_IGNORE_L2_ERR s2BIT(6) u64 unused_11_1; @@ -641,64 +641,64 @@ struct XENA_dev_config { /* Media Access Controller Register */ u64 mac_int_status; u64 mac_int_mask; -#define MAC_INT_STATUS_TMAC_INT BIT(0) -#define MAC_INT_STATUS_RMAC_INT BIT(1) +#define MAC_INT_STATUS_TMAC_INT s2BIT(0) +#define MAC_INT_STATUS_RMAC_INT s2BIT(1) u64 mac_tmac_err_reg; -#define TMAC_ECC_SG_ERR BIT(7) -#define TMAC_ECC_DB_ERR BIT(15) -#define TMAC_TX_BUF_OVRN BIT(23) -#define TMAC_TX_CRI_ERR BIT(31) -#define TMAC_TX_SM_ERR BIT(39) -#define TMAC_DESC_ECC_SG_ERR BIT(47) -#define TMAC_DESC_ECC_DB_ERR BIT(55) +#define TMAC_ECC_SG_ERR s2BIT(7) +#define TMAC_ECC_DB_ERR s2BIT(15) +#define TMAC_TX_BUF_OVRN s2BIT(23) +#define TMAC_TX_CRI_ERR s2BIT(31) +#define TMAC_TX_SM_ERR s2BIT(39) +#define TMAC_DESC_ECC_SG_ERR s2BIT(47) +#define TMAC_DESC_ECC_DB_ERR s2BIT(55) u64 mac_tmac_err_mask; u64 mac_tmac_err_alarm; u64 mac_rmac_err_reg; -#define RMAC_RX_BUFF_OVRN BIT(0) -#define RMAC_FRM_RCVD_INT BIT(1) -#define RMAC_UNUSED_INT BIT(2) -#define RMAC_RTS_PNUM_ECC_SG_ERR BIT(5) -#define RMAC_RTS_DS_ECC_SG_ERR BIT(6) -#define RMAC_RD_BUF_ECC_SG_ERR BIT(7) -#define RMAC_RTH_MAP_ECC_SG_ERR BIT(8) -#define RMAC_RTH_SPDM_ECC_SG_ERR BIT(9) -#define RMAC_RTS_VID_ECC_SG_ERR BIT(10) -#define RMAC_DA_SHADOW_ECC_SG_ERR BIT(11) -#define RMAC_RTS_PNUM_ECC_DB_ERR BIT(13) -#define RMAC_RTS_DS_ECC_DB_ERR BIT(14) -#define RMAC_RD_BUF_ECC_DB_ERR BIT(15) -#define RMAC_RTH_MAP_ECC_DB_ERR BIT(16) -#define RMAC_RTH_SPDM_ECC_DB_ERR BIT(17) -#define RMAC_RTS_VID_ECC_DB_ERR BIT(18) -#define RMAC_DA_SHADOW_ECC_DB_ERR BIT(19) -#define RMAC_LINK_STATE_CHANGE_INT BIT(31) -#define RMAC_RX_SM_ERR BIT(39) -#define RMAC_SINGLE_ECC_ERR (BIT(5) | BIT(6) | BIT(7) |\ - BIT(8) | BIT(9) | BIT(10)|\ - BIT(11)) -#define RMAC_DOUBLE_ECC_ERR (BIT(13) | BIT(14) | BIT(15) |\ - BIT(16) | BIT(17) | BIT(18)|\ - BIT(19)) +#define RMAC_RX_BUFF_OVRN s2BIT(0) +#define RMAC_FRM_RCVD_INT s2BIT(1) +#define RMAC_UNUSED_INT s2BIT(2) +#define RMAC_RTS_PNUM_ECC_SG_ERR s2BIT(5) +#define RMAC_RTS_DS_ECC_SG_ERR s2BIT(6) +#define RMAC_RD_BUF_ECC_SG_ERR s2BIT(7) +#define RMAC_RTH_MAP_ECC_SG_ERR s2BIT(8) +#define RMAC_RTH_SPDM_ECC_SG_ERR s2BIT(9) +#define RMAC_RTS_VID_ECC_SG_ERR s2BIT(10) +#define RMAC_DA_SHADOW_ECC_SG_ERR s2BIT(11) +#define RMAC_RTS_PNUM_ECC_DB_ERR s2BIT(13) +#define RMAC_RTS_DS_ECC_DB_ERR s2BIT(14) +#define RMAC_RD_BUF_ECC_DB_ERR s2BIT(15) +#define RMAC_RTH_MAP_ECC_DB_ERR s2BIT(16) +#define RMAC_RTH_SPDM_ECC_DB_ERR s2BIT(17) +#define RMAC_RTS_VID_ECC_DB_ERR s2BIT(18) +#define RMAC_DA_SHADOW_ECC_DB_ERR s2BIT(19) +#define RMAC_LINK_STATE_CHANGE_INT s2BIT(31) +#define RMAC_RX_SM_ERR s2BIT(39) +#define RMAC_SINGLE_ECC_ERR (s2BIT(5) | s2BIT(6) | s2BIT(7) |\ + s2BIT(8) | s2BIT(9) | s2BIT(10)|\ + s2BIT(11)) +#define RMAC_DOUBLE_ECC_ERR (s2BIT(13) | s2BIT(14) | s2BIT(15) |\ + s2BIT(16) | s2BIT(17) | s2BIT(18)|\ + s2BIT(19)) u64 mac_rmac_err_mask; u64 mac_rmac_err_alarm; u8 unused14[0x100 - 0x40]; u64 mac_cfg; -#define MAC_CFG_TMAC_ENABLE BIT(0) -#define MAC_CFG_RMAC_ENABLE BIT(1) -#define MAC_CFG_LAN_NOT_WAN BIT(2) -#define MAC_CFG_TMAC_LOOPBACK BIT(3) -#define MAC_CFG_TMAC_APPEND_PAD BIT(4) -#define MAC_CFG_RMAC_STRIP_FCS BIT(5) -#define MAC_CFG_RMAC_STRIP_PAD BIT(6) -#define MAC_CFG_RMAC_PROM_ENABLE BIT(7) -#define MAC_RMAC_DISCARD_PFRM BIT(8) -#define MAC_RMAC_BCAST_ENABLE BIT(9) -#define MAC_RMAC_ALL_ADDR_ENABLE BIT(10) +#define MAC_CFG_TMAC_ENABLE s2BIT(0) +#define MAC_CFG_RMAC_ENABLE s2BIT(1) +#define MAC_CFG_LAN_NOT_WAN s2BIT(2) +#define MAC_CFG_TMAC_LOOPBACK s2BIT(3) +#define MAC_CFG_TMAC_APPEND_PAD s2BIT(4) +#define MAC_CFG_RMAC_STRIP_FCS s2BIT(5) +#define MAC_CFG_RMAC_STRIP_PAD s2BIT(6) +#define MAC_CFG_RMAC_PROM_ENABLE s2BIT(7) +#define MAC_RMAC_DISCARD_PFRM s2BIT(8) +#define MAC_RMAC_BCAST_ENABLE s2BIT(9) +#define MAC_RMAC_ALL_ADDR_ENABLE s2BIT(10) #define MAC_RMAC_INVLD_IPG_THR(val) vBIT(val,16,8) u64 tmac_avg_ipg; @@ -710,14 +710,14 @@ struct XENA_dev_config { #define RMAC_MAX_PYLD_LEN_JUMBO_DEF vBIT(9600,2,14) u64 rmac_err_cfg; -#define RMAC_ERR_FCS BIT(0) -#define RMAC_ERR_FCS_ACCEPT BIT(1) -#define RMAC_ERR_TOO_LONG BIT(1) -#define RMAC_ERR_TOO_LONG_ACCEPT BIT(1) -#define RMAC_ERR_RUNT BIT(2) -#define RMAC_ERR_RUNT_ACCEPT BIT(2) -#define RMAC_ERR_LEN_MISMATCH BIT(3) -#define RMAC_ERR_LEN_MISMATCH_ACCEPT BIT(3) +#define RMAC_ERR_FCS s2BIT(0) +#define RMAC_ERR_FCS_ACCEPT s2BIT(1) +#define RMAC_ERR_TOO_LONG s2BIT(1) +#define RMAC_ERR_TOO_LONG_ACCEPT s2BIT(1) +#define RMAC_ERR_RUNT s2BIT(2) +#define RMAC_ERR_RUNT_ACCEPT s2BIT(2) +#define RMAC_ERR_LEN_MISMATCH s2BIT(3) +#define RMAC_ERR_LEN_MISMATCH_ACCEPT s2BIT(3) u64 rmac_cfg_key; #define RMAC_CFG_KEY(val) vBIT(val,0,16) @@ -728,15 +728,15 @@ struct XENA_dev_config { #define MAC_MC_ADDR_START_OFFSET 16 #define MAC_MC_ALL_MC_ADDR_OFFSET 63 /* enables all multicast pkts */ u64 rmac_addr_cmd_mem; -#define RMAC_ADDR_CMD_MEM_WE BIT(7) +#define RMAC_ADDR_CMD_MEM_WE s2BIT(7) #define RMAC_ADDR_CMD_MEM_RD 0 -#define RMAC_ADDR_CMD_MEM_STROBE_NEW_CMD BIT(15) -#define RMAC_ADDR_CMD_MEM_STROBE_CMD_EXECUTING BIT(15) +#define RMAC_ADDR_CMD_MEM_STROBE_NEW_CMD s2BIT(15) +#define RMAC_ADDR_CMD_MEM_STROBE_CMD_EXECUTING s2BIT(15) #define RMAC_ADDR_CMD_MEM_OFFSET(n) vBIT(n,26,6) u64 rmac_addr_data0_mem; #define RMAC_ADDR_DATA0_MEM_ADDR(n) vBIT(n,0,48) -#define RMAC_ADDR_DATA0_MEM_USER BIT(48) +#define RMAC_ADDR_DATA0_MEM_USER s2BIT(48) u64 rmac_addr_data1_mem; #define RMAC_ADDR_DATA1_MEM_MASK(n) vBIT(n,0,48) @@ -753,10 +753,10 @@ struct XENA_dev_config { u64 tmac_ipg_cfg; u64 rmac_pause_cfg; -#define RMAC_PAUSE_GEN BIT(0) -#define RMAC_PAUSE_GEN_ENABLE BIT(0) -#define RMAC_PAUSE_RX BIT(1) -#define RMAC_PAUSE_RX_ENABLE BIT(1) +#define RMAC_PAUSE_GEN s2BIT(0) +#define RMAC_PAUSE_GEN_ENABLE s2BIT(0) +#define RMAC_PAUSE_RX s2BIT(1) +#define RMAC_PAUSE_RX_ENABLE s2BIT(1) #define RMAC_PAUSE_HG_PTIME_DEF vBIT(0xFFFF,16,16) #define RMAC_PAUSE_HG_PTIME(val) vBIT(val,16,16) @@ -787,29 +787,29 @@ struct XENA_dev_config { #define MAX_DIX_MAP 4 u64 rts_dix_map_n[MAX_DIX_MAP]; #define RTS_DIX_MAP_ETYPE(val) vBIT(val,0,16) -#define RTS_DIX_MAP_SCW(val) BIT(val,21) +#define RTS_DIX_MAP_SCW(val) s2BIT(val,21) u64 rts_q_alternates; u64 rts_default_q; u64 rts_ctrl; -#define RTS_CTRL_IGNORE_SNAP_OUI BIT(2) -#define RTS_CTRL_IGNORE_LLC_CTRL BIT(3) +#define RTS_CTRL_IGNORE_SNAP_OUI s2BIT(2) +#define RTS_CTRL_IGNORE_LLC_CTRL s2BIT(3) u64 rts_pn_cam_ctrl; -#define RTS_PN_CAM_CTRL_WE BIT(7) -#define RTS_PN_CAM_CTRL_STROBE_NEW_CMD BIT(15) -#define RTS_PN_CAM_CTRL_STROBE_BEING_EXECUTED BIT(15) +#define RTS_PN_CAM_CTRL_WE s2BIT(7) +#define RTS_PN_CAM_CTRL_STROBE_NEW_CMD s2BIT(15) +#define RTS_PN_CAM_CTRL_STROBE_BEING_EXECUTED s2BIT(15) #define RTS_PN_CAM_CTRL_OFFSET(n) vBIT(n,24,8) u64 rts_pn_cam_data; -#define RTS_PN_CAM_DATA_TCP_SELECT BIT(7) +#define RTS_PN_CAM_DATA_TCP_SELECT s2BIT(7) #define RTS_PN_CAM_DATA_PORT(val) vBIT(val,8,16) #define RTS_PN_CAM_DATA_SCW(val) vBIT(val,24,8) u64 rts_ds_mem_ctrl; -#define RTS_DS_MEM_CTRL_WE BIT(7) -#define RTS_DS_MEM_CTRL_STROBE_NEW_CMD BIT(15) -#define RTS_DS_MEM_CTRL_STROBE_CMD_BEING_EXECUTED BIT(15) +#define RTS_DS_MEM_CTRL_WE s2BIT(7) +#define RTS_DS_MEM_CTRL_STROBE_NEW_CMD s2BIT(15) +#define RTS_DS_MEM_CTRL_STROBE_CMD_BEING_EXECUTED s2BIT(15) #define RTS_DS_MEM_CTRL_OFFSET(n) vBIT(n,26,6) u64 rts_ds_mem_data; #define RTS_DS_MEM_DATA(n) vBIT(n,0,8) @@ -823,23 +823,23 @@ struct XENA_dev_config { /* memory controller registers */ u64 mc_int_status; -#define MC_INT_STATUS_MC_INT BIT(0) +#define MC_INT_STATUS_MC_INT s2BIT(0) u64 mc_int_mask; -#define MC_INT_MASK_MC_INT BIT(0) +#define MC_INT_MASK_MC_INT s2BIT(0) u64 mc_err_reg; -#define MC_ERR_REG_ECC_DB_ERR_L BIT(14) -#define MC_ERR_REG_ECC_DB_ERR_U BIT(15) -#define MC_ERR_REG_MIRI_ECC_DB_ERR_0 BIT(18) -#define MC_ERR_REG_MIRI_ECC_DB_ERR_1 BIT(20) -#define MC_ERR_REG_MIRI_CRI_ERR_0 BIT(22) -#define MC_ERR_REG_MIRI_CRI_ERR_1 BIT(23) -#define MC_ERR_REG_SM_ERR BIT(31) -#define MC_ERR_REG_ECC_ALL_SNG (BIT(2) | BIT(3) | BIT(4) | BIT(5) |\ - BIT(17) | BIT(19)) -#define MC_ERR_REG_ECC_ALL_DBL (BIT(10) | BIT(11) | BIT(12) |\ - BIT(13) | BIT(18) | BIT(20)) -#define PLL_LOCK_N BIT(39) +#define MC_ERR_REG_ECC_DB_ERR_L s2BIT(14) +#define MC_ERR_REG_ECC_DB_ERR_U s2BIT(15) +#define MC_ERR_REG_MIRI_ECC_DB_ERR_0 s2BIT(18) +#define MC_ERR_REG_MIRI_ECC_DB_ERR_1 s2BIT(20) +#define MC_ERR_REG_MIRI_CRI_ERR_0 s2BIT(22) +#define MC_ERR_REG_MIRI_CRI_ERR_1 s2BIT(23) +#define MC_ERR_REG_SM_ERR s2BIT(31) +#define MC_ERR_REG_ECC_ALL_SNG (s2BIT(2) | s2BIT(3) | s2BIT(4) | s2BIT(5) |\ + s2BIT(17) | s2BIT(19)) +#define MC_ERR_REG_ECC_ALL_DBL (s2BIT(10) | s2BIT(11) | s2BIT(12) |\ + s2BIT(13) | s2BIT(18) | s2BIT(20)) +#define PLL_LOCK_N s2BIT(39) u64 mc_err_mask; u64 mc_err_alarm; @@ -857,8 +857,8 @@ struct XENA_dev_config { #define RX_QUEUE_CFG_Q7_SZ(n) vBIT(n,56,8) u64 mc_rldram_mrs; -#define MC_RLDRAM_QUEUE_SIZE_ENABLE BIT(39) -#define MC_RLDRAM_MRS_ENABLE BIT(47) +#define MC_RLDRAM_QUEUE_SIZE_ENABLE s2BIT(39) +#define MC_RLDRAM_MRS_ENABLE s2BIT(47) u64 mc_rldram_interleave; @@ -871,11 +871,11 @@ struct XENA_dev_config { u64 mc_rldram_ref_per; u8 unused20[0x220 - 0x208]; u64 mc_rldram_test_ctrl; -#define MC_RLDRAM_TEST_MODE BIT(47) -#define MC_RLDRAM_TEST_WRITE BIT(7) -#define MC_RLDRAM_TEST_GO BIT(15) -#define MC_RLDRAM_TEST_DONE BIT(23) -#define MC_RLDRAM_TEST_PASS BIT(31) +#define MC_RLDRAM_TEST_MODE s2BIT(47) +#define MC_RLDRAM_TEST_WRITE s2BIT(7) +#define MC_RLDRAM_TEST_GO s2BIT(15) +#define MC_RLDRAM_TEST_DONE s2BIT(23) +#define MC_RLDRAM_TEST_PASS s2BIT(31) u8 unused21[0x240 - 0x228]; u64 mc_rldram_test_add; @@ -888,7 +888,7 @@ struct XENA_dev_config { u8 unused24_1[0x360 - 0x308]; u64 mc_rldram_ctrl; -#define MC_RLDRAM_ENABLE_ODT BIT(7) +#define MC_RLDRAM_ENABLE_ODT s2BIT(7) u8 unused24_2[0x640 - 0x368]; u64 mc_rldram_ref_per_herc; @@ -906,24 +906,24 @@ struct XENA_dev_config { /* XGXS control registers */ u64 xgxs_int_status; -#define XGXS_INT_STATUS_TXGXS BIT(0) -#define XGXS_INT_STATUS_RXGXS BIT(1) +#define XGXS_INT_STATUS_TXGXS s2BIT(0) +#define XGXS_INT_STATUS_RXGXS s2BIT(1) u64 xgxs_int_mask; -#define XGXS_INT_MASK_TXGXS BIT(0) -#define XGXS_INT_MASK_RXGXS BIT(1) +#define XGXS_INT_MASK_TXGXS s2BIT(0) +#define XGXS_INT_MASK_RXGXS s2BIT(1) u64 xgxs_txgxs_err_reg; -#define TXGXS_ECC_SG_ERR BIT(7) -#define TXGXS_ECC_DB_ERR BIT(15) -#define TXGXS_ESTORE_UFLOW BIT(31) -#define TXGXS_TX_SM_ERR BIT(39) +#define TXGXS_ECC_SG_ERR s2BIT(7) +#define TXGXS_ECC_DB_ERR s2BIT(15) +#define TXGXS_ESTORE_UFLOW s2BIT(31) +#define TXGXS_TX_SM_ERR s2BIT(39) u64 xgxs_txgxs_err_mask; u64 xgxs_txgxs_err_alarm; u64 xgxs_rxgxs_err_reg; -#define RXGXS_ESTORE_OFLOW BIT(7) -#define RXGXS_RX_SM_ERR BIT(39) +#define RXGXS_ESTORE_OFLOW s2BIT(7) +#define RXGXS_RX_SM_ERR s2BIT(39) u64 xgxs_rxgxs_err_mask; u64 xgxs_rxgxs_err_alarm; @@ -942,10 +942,10 @@ struct XENA_dev_config { #define SPI_CONTROL_BYTECNT(cnt) vBIT(cnt,29,3) #define SPI_CONTROL_CMD(cmd) vBIT(cmd,32,8) #define SPI_CONTROL_ADDR(addr) vBIT(addr,40,24) -#define SPI_CONTROL_SEL1 BIT(4) -#define SPI_CONTROL_REQ BIT(7) -#define SPI_CONTROL_NACK BIT(5) -#define SPI_CONTROL_DONE BIT(6) +#define SPI_CONTROL_SEL1 s2BIT(4) +#define SPI_CONTROL_REQ s2BIT(7) +#define SPI_CONTROL_NACK s2BIT(5) +#define SPI_CONTROL_DONE s2BIT(6) u64 spi_data; #define SPI_DATA_WRITE(data,len) vBIT(data,0,len) }; diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c index 22e4054d4fcb..b8c0e7b4ca1c 100644 --- a/drivers/net/s2io.c +++ b/drivers/net/s2io.c @@ -1716,7 +1716,7 @@ static int init_nic(struct s2io_nic *nic) MISC_LINK_STABILITY_PRD(3); writeq(val64, &bar0->misc_control); val64 = readq(&bar0->pic_control2); - val64 &= ~(BIT(13)|BIT(14)|BIT(15)); + val64 &= ~(s2BIT(13)|s2BIT(14)|s2BIT(15)); writeq(val64, &bar0->pic_control2); } if (strstr(nic->product_name, "CX4")) { @@ -2427,7 +2427,7 @@ static int fill_rx_buffers(struct s2io_nic *nic, int ring_no) } if ((rxdp->Control_1 & RXD_OWN_XENA) && ((nic->rxd_mode == RXD_MODE_3B) && - (rxdp->Control_2 & BIT(0)))) { + (rxdp->Control_2 & s2BIT(0)))) { mac_control->rings[ring_no].rx_curr_put_info. offset = off; goto end; @@ -2540,7 +2540,7 @@ static int fill_rx_buffers(struct s2io_nic *nic, int ring_no) rxdp->Control_2 |= SET_BUFFER2_SIZE_3 (dev->mtu + 4); } - rxdp->Control_2 |= BIT(0); + rxdp->Control_2 |= s2BIT(0); } rxdp->Host_Control = (unsigned long) (skb); if (alloc_tab & ((1 << rxsync_frequency) - 1)) @@ -3377,7 +3377,7 @@ static void s2io_reset(struct s2io_nic * sp) pci_write_config_dword(sp->pdev, 0x68, 0x7C); /* Clearing PCI_STATUS error reflected here */ - writeq(BIT(62), &bar0->txpic_int_reg); + writeq(s2BIT(62), &bar0->txpic_int_reg); } /* Reset device statistics maintained by OS */ @@ -3575,7 +3575,7 @@ static int wait_for_msix_trans(struct s2io_nic *nic, int i) do { val64 = readq(&bar0->xmsi_access); - if (!(val64 & BIT(15))) + if (!(val64 & s2BIT(15))) break; mdelay(1); cnt++; @@ -3597,7 +3597,7 @@ static void restore_xmsi_data(struct s2io_nic *nic) for (i=0; i < MAX_REQUESTED_MSI_X; i++) { writeq(nic->msix_info[i].addr, &bar0->xmsi_address); writeq(nic->msix_info[i].data, &bar0->xmsi_data); - val64 = (BIT(7) | BIT(15) | vBIT(i, 26, 6)); + val64 = (s2BIT(7) | s2BIT(15) | vBIT(i, 26, 6)); writeq(val64, &bar0->xmsi_access); if (wait_for_msix_trans(nic, i)) { DBG_PRINT(ERR_DBG, "failed in %s\n", __FUNCTION__); @@ -3614,7 +3614,7 @@ static void store_xmsi_data(struct s2io_nic *nic) /* Store and display */ for (i=0; i < MAX_REQUESTED_MSI_X; i++) { - val64 = (BIT(15) | vBIT(i, 26, 6)); + val64 = (s2BIT(15) | vBIT(i, 26, 6)); writeq(val64, &bar0->xmsi_access); if (wait_for_msix_trans(nic, i)) { DBG_PRINT(ERR_DBG, "failed in %s\n", __FUNCTION__); @@ -4634,7 +4634,7 @@ static void s2io_updt_stats(struct s2io_nic *sp) do { udelay(100); val64 = readq(&bar0->stat_cfg); - if (!(val64 & BIT(0))) + if (!(val64 & s2BIT(0))) break; cnt++; if (cnt == 5) diff --git a/drivers/net/s2io.h b/drivers/net/s2io.h index f6b45565304f..cc1797a071aa 100644 --- a/drivers/net/s2io.h +++ b/drivers/net/s2io.h @@ -14,7 +14,7 @@ #define _S2IO_H #define TBD 0 -#define BIT(loc) (0x8000000000000000ULL >> (loc)) +#define s2BIT(loc) (0x8000000000000000ULL >> (loc)) #define vBIT(val, loc, sz) (((u64)val) << (64-loc-sz)) #define INV(d) ((d&0xff)<<24) | (((d>>8)&0xff)<<16) | (((d>>16)&0xff)<<8)| ((d>>24)&0xff) @@ -473,42 +473,42 @@ struct TxFIFO_element { u64 List_Control; #define TX_FIFO_LAST_TXD_NUM( val) vBIT(val,0,8) -#define TX_FIFO_FIRST_LIST BIT(14) -#define TX_FIFO_LAST_LIST BIT(15) +#define TX_FIFO_FIRST_LIST s2BIT(14) +#define TX_FIFO_LAST_LIST s2BIT(15) #define TX_FIFO_FIRSTNLAST_LIST vBIT(3,14,2) -#define TX_FIFO_SPECIAL_FUNC BIT(23) -#define TX_FIFO_DS_NO_SNOOP BIT(31) -#define TX_FIFO_BUFF_NO_SNOOP BIT(30) +#define TX_FIFO_SPECIAL_FUNC s2BIT(23) +#define TX_FIFO_DS_NO_SNOOP s2BIT(31) +#define TX_FIFO_BUFF_NO_SNOOP s2BIT(30) }; /* Tx descriptor structure */ struct TxD { u64 Control_1; /* bit mask */ -#define TXD_LIST_OWN_XENA BIT(7) -#define TXD_T_CODE (BIT(12)|BIT(13)|BIT(14)|BIT(15)) +#define TXD_LIST_OWN_XENA s2BIT(7) +#define TXD_T_CODE (s2BIT(12)|s2BIT(13)|s2BIT(14)|s2BIT(15)) #define TXD_T_CODE_OK(val) (|(val & TXD_T_CODE)) #define GET_TXD_T_CODE(val) ((val & TXD_T_CODE)<<12) -#define TXD_GATHER_CODE (BIT(22) | BIT(23)) -#define TXD_GATHER_CODE_FIRST BIT(22) -#define TXD_GATHER_CODE_LAST BIT(23) -#define TXD_TCP_LSO_EN BIT(30) -#define TXD_UDP_COF_EN BIT(31) -#define TXD_UFO_EN BIT(31) | BIT(30) +#define TXD_GATHER_CODE (s2BIT(22) | s2BIT(23)) +#define TXD_GATHER_CODE_FIRST s2BIT(22) +#define TXD_GATHER_CODE_LAST s2BIT(23) +#define TXD_TCP_LSO_EN s2BIT(30) +#define TXD_UDP_COF_EN s2BIT(31) +#define TXD_UFO_EN s2BIT(31) | s2BIT(30) #define TXD_TCP_LSO_MSS(val) vBIT(val,34,14) #define TXD_UFO_MSS(val) vBIT(val,34,14) #define TXD_BUFFER0_SIZE(val) vBIT(val,48,16) u64 Control_2; -#define TXD_TX_CKO_CONTROL (BIT(5)|BIT(6)|BIT(7)) -#define TXD_TX_CKO_IPV4_EN BIT(5) -#define TXD_TX_CKO_TCP_EN BIT(6) -#define TXD_TX_CKO_UDP_EN BIT(7) -#define TXD_VLAN_ENABLE BIT(15) +#define TXD_TX_CKO_CONTROL (s2BIT(5)|s2BIT(6)|s2BIT(7)) +#define TXD_TX_CKO_IPV4_EN s2BIT(5) +#define TXD_TX_CKO_TCP_EN s2BIT(6) +#define TXD_TX_CKO_UDP_EN s2BIT(7) +#define TXD_VLAN_ENABLE s2BIT(15) #define TXD_VLAN_TAG(val) vBIT(val,16,16) #define TXD_INT_NUMBER(val) vBIT(val,34,6) -#define TXD_INT_TYPE_PER_LIST BIT(47) -#define TXD_INT_TYPE_UTILZ BIT(46) +#define TXD_INT_TYPE_PER_LIST s2BIT(47) +#define TXD_INT_TYPE_UTILZ s2BIT(46) #define TXD_SET_MARKER vBIT(0x6,0,4) u64 Buffer_Pointer; @@ -525,14 +525,14 @@ struct list_info_hold { struct RxD_t { u64 Host_Control; /* reserved for host */ u64 Control_1; -#define RXD_OWN_XENA BIT(7) -#define RXD_T_CODE (BIT(12)|BIT(13)|BIT(14)|BIT(15)) +#define RXD_OWN_XENA s2BIT(7) +#define RXD_T_CODE (s2BIT(12)|s2BIT(13)|s2BIT(14)|s2BIT(15)) #define RXD_FRAME_PROTO vBIT(0xFFFF,24,8) -#define RXD_FRAME_PROTO_IPV4 BIT(27) -#define RXD_FRAME_PROTO_IPV6 BIT(28) -#define RXD_FRAME_IP_FRAG BIT(29) -#define RXD_FRAME_PROTO_TCP BIT(30) -#define RXD_FRAME_PROTO_UDP BIT(31) +#define RXD_FRAME_PROTO_IPV4 s2BIT(27) +#define RXD_FRAME_PROTO_IPV6 s2BIT(28) +#define RXD_FRAME_IP_FRAG s2BIT(29) +#define RXD_FRAME_PROTO_TCP s2BIT(30) +#define RXD_FRAME_PROTO_UDP s2BIT(31) #define TCP_OR_UDP_FRAME (RXD_FRAME_PROTO_TCP | RXD_FRAME_PROTO_UDP) #define RXD_GET_L3_CKSUM(val) ((u16)(val>> 16) & 0xFFFF) #define RXD_GET_L4_CKSUM(val) ((u16)(val) & 0xFFFF) @@ -998,26 +998,26 @@ static inline void SPECIAL_REG_WRITE(u64 val, void __iomem *addr, int order) /* Interrupt masks for the general interrupt mask register */ #define DISABLE_ALL_INTRS 0xFFFFFFFFFFFFFFFFULL -#define TXPIC_INT_M BIT(0) -#define TXDMA_INT_M BIT(1) -#define TXMAC_INT_M BIT(2) -#define TXXGXS_INT_M BIT(3) -#define TXTRAFFIC_INT_M BIT(8) -#define PIC_RX_INT_M BIT(32) -#define RXDMA_INT_M BIT(33) -#define RXMAC_INT_M BIT(34) -#define MC_INT_M BIT(35) -#define RXXGXS_INT_M BIT(36) -#define RXTRAFFIC_INT_M BIT(40) +#define TXPIC_INT_M s2BIT(0) +#define TXDMA_INT_M s2BIT(1) +#define TXMAC_INT_M s2BIT(2) +#define TXXGXS_INT_M s2BIT(3) +#define TXTRAFFIC_INT_M s2BIT(8) +#define PIC_RX_INT_M s2BIT(32) +#define RXDMA_INT_M s2BIT(33) +#define RXMAC_INT_M s2BIT(34) +#define MC_INT_M s2BIT(35) +#define RXXGXS_INT_M s2BIT(36) +#define RXTRAFFIC_INT_M s2BIT(40) /* PIC level Interrupts TODO*/ /* DMA level Inressupts */ -#define TXDMA_PFC_INT_M BIT(0) -#define TXDMA_PCC_INT_M BIT(2) +#define TXDMA_PFC_INT_M s2BIT(0) +#define TXDMA_PCC_INT_M s2BIT(2) /* PFC block interrupts */ -#define PFC_MISC_ERR_1 BIT(0) /* Interrupt to indicate FIFO full */ +#define PFC_MISC_ERR_1 s2BIT(0) /* Interrupt to indicate FIFO full */ /* PCC block interrupts. */ #define PCC_FB_ECC_ERR vBIT(0xff, 16, 8) /* Interrupt to indicate diff --git a/drivers/net/spider_net.c b/drivers/net/spider_net.c index fab055ffcc90..571060a3c91e 100644 --- a/drivers/net/spider_net.c +++ b/drivers/net/spider_net.c @@ -46,7 +46,7 @@ #include <linux/vmalloc.h> #include <linux/wait.h> #include <linux/workqueue.h> -#include <asm/bitops.h> +#include <linux/bitops.h> #include <asm/pci-bridge.h> #include <net/checksum.h> diff --git a/drivers/net/tulip/uli526x.c b/drivers/net/tulip/uli526x.c index 76e55612430b..a7afeea156bd 100644 --- a/drivers/net/tulip/uli526x.c +++ b/drivers/net/tulip/uli526x.c @@ -34,9 +34,9 @@ #include <linux/delay.h> #include <linux/spinlock.h> #include <linux/dma-mapping.h> +#include <linux/bitops.h> #include <asm/processor.h> -#include <asm/bitops.h> #include <asm/io.h> #include <asm/dma.h> #include <asm/uaccess.h> diff --git a/drivers/net/wireless/bcm43xx/bcm43xx_leds.c b/drivers/net/wireless/bcm43xx/bcm43xx_leds.c index 8f198befba39..cb51dc51cce6 100644 --- a/drivers/net/wireless/bcm43xx/bcm43xx_leds.c +++ b/drivers/net/wireless/bcm43xx/bcm43xx_leds.c @@ -29,7 +29,7 @@ #include "bcm43xx_radio.h" #include "bcm43xx.h" -#include <asm/bitops.h> +#include <linux/bitops.h> static void bcm43xx_led_changestate(struct bcm43xx_led *led) diff --git a/drivers/net/wireless/hostap/hostap_common.h b/drivers/net/wireless/hostap/hostap_common.h index ceb7f1e5e9e0..517f89845144 100644 --- a/drivers/net/wireless/hostap/hostap_common.h +++ b/drivers/net/wireless/hostap/hostap_common.h @@ -4,9 +4,6 @@ #include <linux/types.h> #include <linux/if_ether.h> -#define BIT(x) (1 << (x)) - - /* IEEE 802.11 defines */ /* Information Element IDs */ diff --git a/drivers/net/wireless/hostap/hostap_ioctl.c b/drivers/net/wireless/hostap/hostap_ioctl.c index 40f516d42c5e..d8f5efcfcab9 100644 --- a/drivers/net/wireless/hostap/hostap_ioctl.c +++ b/drivers/net/wireless/hostap/hostap_ioctl.c @@ -2920,7 +2920,7 @@ static int prism2_ioctl_priv_monitor(struct net_device *dev, int *i) printk(KERN_DEBUG "%s: process %d (%s) used deprecated iwpriv monitor " "- update software to use iwconfig mode monitor\n", - dev->name, current->pid, current->comm); + dev->name, task_pid_nr(current), current->comm); /* Backward compatibility code - this can be removed at some point */ diff --git a/drivers/pcmcia/m32r_pcc.c b/drivers/pcmcia/m32r_pcc.c index 67d28ee80f22..c5e0d89c3ece 100644 --- a/drivers/pcmcia/m32r_pcc.c +++ b/drivers/pcmcia/m32r_pcc.c @@ -22,9 +22,9 @@ #include <linux/workqueue.h> #include <linux/interrupt.h> #include <linux/platform_device.h> +#include <linux/bitops.h> #include <asm/irq.h> #include <asm/io.h> -#include <asm/bitops.h> #include <asm/system.h> #include <asm/addrspace.h> diff --git a/drivers/pcmcia/m8xx_pcmcia.c b/drivers/pcmcia/m8xx_pcmcia.c index b01985498460..d182760f035b 100644 --- a/drivers/pcmcia/m8xx_pcmcia.c +++ b/drivers/pcmcia/m8xx_pcmcia.c @@ -48,9 +48,9 @@ #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/fsl_devices.h> +#include <linux/bitops.h> #include <asm/io.h> -#include <asm/bitops.h> #include <asm/system.h> #include <asm/time.h> #include <asm/mpc8xx.h> diff --git a/drivers/ps3/ps3av.c b/drivers/ps3/ps3av.c index 397f4ce849dc..87b3493d88e5 100644 --- a/drivers/ps3/ps3av.c +++ b/drivers/ps3/ps3av.c @@ -729,7 +729,7 @@ static void ps3av_monitor_info_dump(const struct ps3av_pkt_av_get_monitor_info * static const struct ps3av_monitor_quirk { const char *monitor_name; - u32 clear_60, clear_50, clear_vesa; + u32 clear_60; } ps3av_monitor_quirks[] = { { .monitor_name = "DELL 2007WFP", @@ -757,10 +757,6 @@ static void ps3av_fixup_monitor_info(struct ps3av_info_monitor *info) quirk->monitor_name); info->res_60.res_bits &= ~quirk->clear_60; info->res_60.native &= ~quirk->clear_60; - info->res_50.res_bits &= ~quirk->clear_50; - info->res_50.native &= ~quirk->clear_50; - info->res_vesa.res_bits &= ~quirk->clear_vesa; - info->res_vesa.native &= ~quirk->clear_vesa; break; } } diff --git a/drivers/ps3/vuart.c b/drivers/ps3/vuart.c index bea25a1391ee..9dea585ef806 100644 --- a/drivers/ps3/vuart.c +++ b/drivers/ps3/vuart.c @@ -22,11 +22,11 @@ #include <linux/module.h> #include <linux/interrupt.h> #include <linux/workqueue.h> +#include <linux/bitops.h> #include <asm/ps3.h> #include <asm/firmware.h> #include <asm/lv1call.h> -#include <asm/bitops.h> #include "vuart.h" diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c index e4bf68ca96f7..2fd49edcc712 100644 --- a/drivers/rtc/rtc-pl031.c +++ b/drivers/rtc/rtc-pl031.c @@ -21,11 +21,11 @@ #include <linux/interrupt.h> #include <linux/string.h> #include <linux/pm.h> +#include <linux/bitops.h> #include <linux/amba/bus.h> #include <asm/io.h> -#include <asm/bitops.h> #include <asm/hardware.h> #include <asm/irq.h> #include <asm/rtc.h> diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index 0918b787c4dd..6f1e9a9804bc 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -29,8 +29,8 @@ #include <linux/interrupt.h> #include <linux/string.h> #include <linux/pm.h> +#include <linux/bitops.h> -#include <asm/bitops.h> #include <asm/hardware.h> #include <asm/irq.h> #include <asm/rtc.h> diff --git a/drivers/s390/cio/idset.c b/drivers/s390/cio/idset.c index 16ea828e99f7..ef7bc0a125ef 100644 --- a/drivers/s390/cio/idset.c +++ b/drivers/s390/cio/idset.c @@ -6,7 +6,7 @@ */ #include <linux/slab.h> -#include <asm/bitops.h> +#include <linux/bitops.h> #include "idset.h" #include "css.h" diff --git a/drivers/s390/net/claw.c b/drivers/s390/net/claw.c index 399695f7b1af..3561982749e3 100644 --- a/drivers/s390/net/claw.c +++ b/drivers/s390/net/claw.c @@ -59,13 +59,13 @@ * 1.15 Changed for 2.6 Kernel No longer compiles on 2.4 or lower * 1.25 Added Packing support */ -#include <asm/bitops.h> #include <asm/ccwdev.h> #include <asm/ccwgroup.h> #include <asm/debug.h> #include <asm/idals.h> #include <asm/io.h> +#include <linux/bitops.h> #include <linux/ctype.h> #include <linux/delay.h> #include <linux/errno.h> diff --git a/drivers/scsi/FlashPoint.c b/drivers/scsi/FlashPoint.c index a7f916c0c9cd..1c9078191d9e 100644 --- a/drivers/scsi/FlashPoint.c +++ b/drivers/scsi/FlashPoint.c @@ -25,9 +25,6 @@ #define FAILURE 0xFFFFFFFFL -#define BIT(x) ((unsigned char)(1<<(x))) /* single-bit mask in bit position x */ -#define BITW(x) ((unsigned short)(1<<(x))) /* single-bit mask in bit position x */ - struct sccb; typedef void (*CALL_BK_FN) (struct sccb *); @@ -374,9 +371,9 @@ typedef struct SCCBscam_info { #define SCAM_ENABLED BIT(2) #define SCAM_LEVEL2 BIT(3) -#define RENEGO_ENA BITW(10) -#define CONNIO_ENA BITW(11) -#define GREEN_PC_ENA BITW(12) +#define RENEGO_ENA BIT(10) +#define CONNIO_ENA BIT(11) +#define GREEN_PC_ENA BIT(12) #define AUTO_RATE_00 00 #define AUTO_RATE_05 01 @@ -511,23 +508,23 @@ typedef struct SCCBscam_info { #define hp_intena 0x40 -#define RESET BITW(7) -#define PROG_HLT BITW(6) -#define PARITY BITW(5) -#define FIFO BITW(4) -#define SEL BITW(3) -#define SCAM_SEL BITW(2) -#define RSEL BITW(1) -#define TIMEOUT BITW(0) -#define BUS_FREE BITW(15) -#define XFER_CNT_0 BITW(14) -#define PHASE BITW(13) -#define IUNKWN BITW(12) -#define ICMD_COMP BITW(11) -#define ITICKLE BITW(10) -#define IDO_STRT BITW(9) -#define ITAR_DISC BITW(8) -#define AUTO_INT (BITW(12)+BITW(11)+BITW(10)+BITW(9)+BITW(8)) +#define RESET BIT(7) +#define PROG_HLT BIT(6) +#define PARITY BIT(5) +#define FIFO BIT(4) +#define SEL BIT(3) +#define SCAM_SEL BIT(2) +#define RSEL BIT(1) +#define TIMEOUT BIT(0) +#define BUS_FREE BIT(15) +#define XFER_CNT_0 BIT(14) +#define PHASE BIT(13) +#define IUNKWN BIT(12) +#define ICMD_COMP BIT(11) +#define ITICKLE BIT(10) +#define IDO_STRT BIT(9) +#define ITAR_DISC BIT(8) +#define AUTO_INT (BIT(12)+BIT(11)+BIT(10)+BIT(9)+BIT(8)) #define CLR_ALL_INT 0xFFFF #define CLR_ALL_INT_1 0xFF00 @@ -674,37 +671,37 @@ typedef struct SCCBscam_info { #define BIOS_DATA_OFFSET 0x60 #define BIOS_RELATIVE_CARD 0x64 -#define AR3 (BITW(9) + BITW(8)) -#define SDATA BITW(10) +#define AR3 (BIT(9) + BIT(8)) +#define SDATA BIT(10) -#define CRD_OP BITW(11) /* Cmp Reg. w/ Data */ +#define CRD_OP BIT(11) /* Cmp Reg. w/ Data */ -#define CRR_OP BITW(12) /* Cmp Reg. w. Reg. */ +#define CRR_OP BIT(12) /* Cmp Reg. w. Reg. */ -#define CPE_OP (BITW(14)+BITW(11)) /* Cmp SCSI phs & Branch EQ */ +#define CPE_OP (BIT(14)+BIT(11)) /* Cmp SCSI phs & Branch EQ */ -#define CPN_OP (BITW(14)+BITW(12)) /* Cmp SCSI phs & Branch NOT EQ */ +#define CPN_OP (BIT(14)+BIT(12)) /* Cmp SCSI phs & Branch NOT EQ */ #define ADATA_OUT 0x00 -#define ADATA_IN BITW(8) -#define ACOMMAND BITW(10) -#define ASTATUS (BITW(10)+BITW(8)) -#define AMSG_OUT (BITW(10)+BITW(9)) -#define AMSG_IN (BITW(10)+BITW(9)+BITW(8)) +#define ADATA_IN BIT(8) +#define ACOMMAND BIT(10) +#define ASTATUS (BIT(10)+BIT(8)) +#define AMSG_OUT (BIT(10)+BIT(9)) +#define AMSG_IN (BIT(10)+BIT(9)+BIT(8)) -#define BRH_OP BITW(13) /* Branch */ +#define BRH_OP BIT(13) /* Branch */ #define ALWAYS 0x00 -#define EQUAL BITW(8) -#define NOT_EQ BITW(9) +#define EQUAL BIT(8) +#define NOT_EQ BIT(9) -#define TCB_OP (BITW(13)+BITW(11)) /* Test condition & branch */ +#define TCB_OP (BIT(13)+BIT(11)) /* Test condition & branch */ -#define FIFO_0 BITW(10) +#define FIFO_0 BIT(10) -#define MPM_OP BITW(15) /* Match phase and move data */ +#define MPM_OP BIT(15) /* Match phase and move data */ -#define MRR_OP BITW(14) /* Move DReg. to Reg. */ +#define MRR_OP BIT(14) /* Move DReg. to Reg. */ #define S_IDREG (BIT(2)+BIT(1)+BIT(0)) @@ -712,9 +709,9 @@ typedef struct SCCBscam_info { #define D_AR1 BIT(0) #define D_BUCKET (BIT(2) + BIT(1) + BIT(0)) -#define RAT_OP (BITW(14)+BITW(13)+BITW(11)) +#define RAT_OP (BIT(14)+BIT(13)+BIT(11)) -#define SSI_OP (BITW(15)+BITW(11)) +#define SSI_OP (BIT(15)+BIT(11)) #define SSI_ITAR_DISC (ITAR_DISC >> 8) #define SSI_IDO_STRT (IDO_STRT >> 8) diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 30905cebefbb..a5763c6e9362 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -521,7 +521,7 @@ config SCSI_DPT_I2O config SCSI_ADVANSYS tristate "AdvanSys SCSI support" - depends on SCSI + depends on SCSI && VIRT_TO_BUS depends on ISA || EISA || PCI help This is a driver for all SCSI host adapters manufactured by diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index fa7ba64483fb..252d1806467f 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -47,9 +47,9 @@ #include <linux/scatterlist.h> #include <linux/delay.h> #include <linux/mutex.h> +#include <linux/bitops.h> #include <asm/io.h> -#include <asm/bitops.h> #include <asm/uaccess.h> #include <scsi/scsi.h> diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c index 7ef0afc3cd68..5f3a0d7b18de 100644 --- a/drivers/scsi/libsas/sas_discover.c +++ b/drivers/scsi/libsas/sas_discover.c @@ -285,7 +285,7 @@ static void sas_discover_domain(struct work_struct *work) dev = port->port_dev; SAS_DPRINTK("DOING DISCOVERY on port %d, pid:%d\n", port->id, - current->pid); + task_pid_nr(current)); switch (dev->dev_type) { case SAS_END_DEV: @@ -320,7 +320,7 @@ static void sas_discover_domain(struct work_struct *work) } SAS_DPRINTK("DONE DISCOVERY on port %d, pid:%d, result:%d\n", port->id, - current->pid, error); + task_pid_nr(current), error); } static void sas_revalidate_domain(struct work_struct *work) @@ -334,12 +334,12 @@ static void sas_revalidate_domain(struct work_struct *work) &port->disc.pending); SAS_DPRINTK("REVALIDATING DOMAIN on port %d, pid:%d\n", port->id, - current->pid); + task_pid_nr(current)); if (port->port_dev) res = sas_ex_revalidate_domain(port->port_dev); SAS_DPRINTK("done REVALIDATING DOMAIN on port %d, pid:%d, res 0x%x\n", - port->id, current->pid, res); + port->id, task_pid_nr(current), res); } /* ---------- Events ---------- */ diff --git a/drivers/scsi/nsp32.h b/drivers/scsi/nsp32.h index a976e8193d16..6715ecb3bfca 100644 --- a/drivers/scsi/nsp32.h +++ b/drivers/scsi/nsp32.h @@ -69,11 +69,6 @@ typedef u32 u32_le; typedef u16 u16_le; /* - * MACRO - */ -#define BIT(x) (1UL << (x)) - -/* * BASIC Definitions */ #ifndef TRUE diff --git a/drivers/scsi/pcmcia/nsp_cs.h b/drivers/scsi/pcmcia/nsp_cs.h index b7f0fa246413..98397559c53b 100644 --- a/drivers/scsi/pcmcia/nsp_cs.h +++ b/drivers/scsi/pcmcia/nsp_cs.h @@ -24,7 +24,6 @@ /************************************ * Some useful macros... */ -#define BIT(x) (1L << (x)) /* SCSI initiator must be ID 7 */ #define NSP_INITIATOR_ID 7 diff --git a/drivers/scsi/qla4xxx/ql4_fw.h b/drivers/scsi/qla4xxx/ql4_fw.h index 9bb3d1d2a925..fe415ec85655 100644 --- a/drivers/scsi/qla4xxx/ql4_fw.h +++ b/drivers/scsi/qla4xxx/ql4_fw.h @@ -671,7 +671,7 @@ struct continuation_t1_entry { #define ET_CONTINUE ET_CONT_T1 /* Marker entry structure*/ -struct marker_entry { +struct qla4_marker_entry { struct qla4_header hdr; /* 00-03 */ uint32_t system_defined; /* 04-07 */ diff --git a/drivers/scsi/qla4xxx/ql4_iocb.c b/drivers/scsi/qla4xxx/ql4_iocb.c index 5006ecb3ef5e..e4461b5d767a 100644 --- a/drivers/scsi/qla4xxx/ql4_iocb.c +++ b/drivers/scsi/qla4xxx/ql4_iocb.c @@ -69,7 +69,7 @@ static int qla4xxx_get_req_pkt(struct scsi_qla_host *ha, static int qla4xxx_send_marker_iocb(struct scsi_qla_host *ha, struct ddb_entry *ddb_entry, int lun) { - struct marker_entry *marker_entry; + struct qla4_marker_entry *marker_entry; unsigned long flags = 0; uint8_t status = QLA_SUCCESS; diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c index 72229df9dc11..40604a092921 100644 --- a/drivers/serial/amba-pl011.c +++ b/drivers/serial/amba-pl011.c @@ -263,15 +263,15 @@ static unsigned int pl01x_get_mctrl(struct uart_port *port) unsigned int result = 0; unsigned int status = readw(uap->port.membase + UART01x_FR); -#define BIT(uartbit, tiocmbit) \ +#define TIOCMBIT(uartbit, tiocmbit) \ if (status & uartbit) \ result |= tiocmbit - BIT(UART01x_FR_DCD, TIOCM_CAR); - BIT(UART01x_FR_DSR, TIOCM_DSR); - BIT(UART01x_FR_CTS, TIOCM_CTS); - BIT(UART011_FR_RI, TIOCM_RNG); -#undef BIT + TIOCMBIT(UART01x_FR_DCD, TIOCM_CAR); + TIOCMBIT(UART01x_FR_DSR, TIOCM_DSR); + TIOCMBIT(UART01x_FR_CTS, TIOCM_CTS); + TIOCMBIT(UART011_FR_RI, TIOCM_RNG); +#undef TIOCMBIT return result; } @@ -282,18 +282,18 @@ static void pl011_set_mctrl(struct uart_port *port, unsigned int mctrl) cr = readw(uap->port.membase + UART011_CR); -#define BIT(tiocmbit, uartbit) \ +#define TIOCMBIT(tiocmbit, uartbit) \ if (mctrl & tiocmbit) \ cr |= uartbit; \ else \ cr &= ~uartbit - BIT(TIOCM_RTS, UART011_CR_RTS); - BIT(TIOCM_DTR, UART011_CR_DTR); - BIT(TIOCM_OUT1, UART011_CR_OUT1); - BIT(TIOCM_OUT2, UART011_CR_OUT2); - BIT(TIOCM_LOOP, UART011_CR_LBE); -#undef BIT + TIOCMBIT(TIOCM_RTS, UART011_CR_RTS); + TIOCMBIT(TIOCM_DTR, UART011_CR_DTR); + TIOCMBIT(TIOCM_OUT1, UART011_CR_OUT1); + TIOCMBIT(TIOCM_OUT2, UART011_CR_OUT2); + TIOCMBIT(TIOCM_LOOP, UART011_CR_LBE); +#undef TIOCMBIT writew(cr, uap->port.membase + UART011_CR); } diff --git a/drivers/serial/crisv10.c b/drivers/serial/crisv10.c index 7e8724d3571f..f523cdf4b02b 100644 --- a/drivers/serial/crisv10.c +++ b/drivers/serial/crisv10.c @@ -442,11 +442,11 @@ static char *serial_version = "$Revision: 1.25 $"; #include <asm/uaccess.h> #include <linux/kernel.h> #include <linux/mutex.h> +#include <linux/bitops.h> #include <asm/io.h> #include <asm/irq.h> #include <asm/system.h> -#include <asm/bitops.h> #include <linux/delay.h> #include <asm/arch/svinto.h> diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index f013b4012c9a..1f4f6d02fe25 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -460,7 +460,7 @@ static int checkintf(struct dev_state *ps, unsigned int ifnum) return 0; /* if not yet claimed, claim it for the driver */ dev_warn(&ps->dev->dev, "usbfs: process %d (%s) did not claim interface %u before use\n", - current->pid, current->comm, ifnum); + task_pid_nr(current), current->comm, ifnum); return claimintf(ps, ifnum); } diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c index 73726c570a6e..1d174dcb3ac9 100644 --- a/drivers/usb/gadget/file_storage.c +++ b/drivers/usb/gadget/file_storage.c @@ -4006,7 +4006,7 @@ static int __init fsg_bind(struct usb_gadget *gadget) DBG(fsg, "removable=%d, stall=%d, buflen=%u\n", mod_data.removable, mod_data.can_stall, mod_data.buflen); - DBG(fsg, "I/O thread pid: %d\n", fsg->thread_task->pid); + DBG(fsg, "I/O thread pid: %d\n", task_pid_nr(fsg->thread_task)); set_bit(REGISTERED, &fsg->atomic_bitflags); diff --git a/drivers/video/cyber2000fb.c b/drivers/video/cyber2000fb.c index 9bb2cbfe4a3d..5fb8675e0d6b 100644 --- a/drivers/video/cyber2000fb.c +++ b/drivers/video/cyber2000fb.c @@ -62,7 +62,7 @@ struct cfb_info { struct display_switch *dispsw; struct display *display; struct pci_dev *dev; - unsigned char __iomem *region; + unsigned char __iomem *region; unsigned char __iomem *regs; u_int id; int func_use_count; @@ -97,11 +97,11 @@ MODULE_PARM_DESC(default_font, "Default font name"); /* * Our access methods. */ -#define cyber2000fb_writel(val,reg,cfb) writel(val, (cfb)->regs + (reg)) -#define cyber2000fb_writew(val,reg,cfb) writew(val, (cfb)->regs + (reg)) -#define cyber2000fb_writeb(val,reg,cfb) writeb(val, (cfb)->regs + (reg)) +#define cyber2000fb_writel(val, reg, cfb) writel(val, (cfb)->regs + (reg)) +#define cyber2000fb_writew(val, reg, cfb) writew(val, (cfb)->regs + (reg)) +#define cyber2000fb_writeb(val, reg, cfb) writeb(val, (cfb)->regs + (reg)) -#define cyber2000fb_readb(reg,cfb) readb((cfb)->regs + (reg)) +#define cyber2000fb_readb(reg, cfb) readb((cfb)->regs + (reg)) static inline void cyber2000_crtcw(unsigned int reg, unsigned int val, struct cfb_info *cfb) @@ -221,12 +221,8 @@ cyber2000fb_copyarea(struct fb_info *info, const struct fb_copyarea *region) static void cyber2000fb_imageblit(struct fb_info *info, const struct fb_image *image) { -// struct cfb_info *cfb = (struct cfb_info *)info; - -// if (!(cfb->fb.var.accel_flags & FB_ACCELF_TEXT)) { - cfb_imageblit(info, image); - return; -// } + cfb_imageblit(info, image); + return; } static int cyber2000fb_sync(struct fb_info *info) @@ -277,12 +273,12 @@ cyber2000fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, /* * Pseudocolour: - * 8 8 + * 8 8 * pixel --/--+--/--> red lut --> red dac - * | 8 - * +--/--> green lut --> green dac - * | 8 - * +--/--> blue lut --> blue dac + * | 8 + * +--/--> green lut --> green dac + * | 8 + * +--/--> blue lut --> blue dac */ case FB_VISUAL_PSEUDOCOLOR: if (regno >= NR_PALETTE) @@ -292,9 +288,9 @@ cyber2000fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, green >>= 8; blue >>= 8; - cfb->palette[regno].red = red; + cfb->palette[regno].red = red; cfb->palette[regno].green = green; - cfb->palette[regno].blue = blue; + cfb->palette[regno].blue = blue; cyber2000fb_writeb(regno, 0x3c8, cfb); cyber2000fb_writeb(red, 0x3c9, cfb); @@ -304,12 +300,12 @@ cyber2000fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, /* * Direct colour: - * n rl - * pixel --/--+--/--> red lut --> red dac - * | gl - * +--/--> green lut --> green dac - * | bl - * +--/--> blue lut --> blue dac + * n rl + * pixel --/--+--/--> red lut --> red dac + * | gl + * +--/--> green lut --> green dac + * | bl + * +--/--> blue lut --> blue dac * n = bpp, rl = red length, gl = green length, bl = blue length */ case FB_VISUAL_DIRECTCOLOR: @@ -325,9 +321,11 @@ cyber2000fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, * to the high 6 bits of the LUT. */ cyber2000fb_writeb(regno << 2, 0x3c8, cfb); - cyber2000fb_writeb(cfb->palette[regno >> 1].red, 0x3c9, cfb); + cyber2000fb_writeb(cfb->palette[regno >> 1].red, + 0x3c9, cfb); cyber2000fb_writeb(green, 0x3c9, cfb); - cyber2000fb_writeb(cfb->palette[regno >> 1].blue, 0x3c9, cfb); + cyber2000fb_writeb(cfb->palette[regno >> 1].blue, + 0x3c9, cfb); green = cfb->palette[regno << 3].green; @@ -335,9 +333,9 @@ cyber2000fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, } if (var->green.length >= 5 && regno < 32) { - cfb->palette[regno << 3].red = red; + cfb->palette[regno << 3].red = red; cfb->palette[regno << 3].green = green; - cfb->palette[regno << 3].blue = blue; + cfb->palette[regno << 3].blue = blue; /* * The 5 bits of each colour component are @@ -351,9 +349,9 @@ cyber2000fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, } if (var->green.length == 4 && regno < 16) { - cfb->palette[regno << 4].red = red; + cfb->palette[regno << 4].red = red; cfb->palette[regno << 4].green = green; - cfb->palette[regno << 4].blue = blue; + cfb->palette[regno << 4].blue = blue; /* * The 5 bits of each colour component are @@ -377,12 +375,12 @@ cyber2000fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, /* * True colour: - * n rl - * pixel --/--+--/--> red dac - * | gl - * +--/--> green dac - * | bl - * +--/--> blue dac + * n rl + * pixel --/--+--/--> red dac + * | gl + * +--/--> green dac + * | bl + * +--/--> blue dac * n = bpp, rl = red length, gl = green length, bl = blue length */ case FB_VISUAL_TRUECOLOR: @@ -494,9 +492,9 @@ static void cyber2000fb_set_timing(struct cfb_info *cfb, struct par_info *hw) /* PLL registers */ cyber2000_grphw(EXT_DCLK_MULT, hw->clock_mult, cfb); - cyber2000_grphw(EXT_DCLK_DIV, hw->clock_div, cfb); + cyber2000_grphw(EXT_DCLK_DIV, hw->clock_div, cfb); cyber2000_grphw(EXT_MCLK_MULT, cfb->mclk_mult, cfb); - cyber2000_grphw(EXT_MCLK_DIV, cfb->mclk_div, cfb); + cyber2000_grphw(EXT_MCLK_DIV, cfb->mclk_div, cfb); cyber2000_grphw(0x90, 0x01, cfb); cyber2000_grphw(0xb9, 0x80, cfb); cyber2000_grphw(0xb9, 0x00, cfb); @@ -515,8 +513,8 @@ static void cyber2000fb_set_timing(struct cfb_info *cfb, struct par_info *hw) /* * Set up accelerator registers */ - cyber2000fb_writew(hw->width, CO_REG_SRC_WIDTH, cfb); - cyber2000fb_writew(hw->width, CO_REG_DEST_WIDTH, cfb); + cyber2000fb_writew(hw->width, CO_REG_SRC_WIDTH, cfb); + cyber2000fb_writew(hw->width, CO_REG_DEST_WIDTH, cfb); cyber2000fb_writeb(hw->co_pixfmt, CO_REG_PIXFMT, cfb); } @@ -549,15 +547,15 @@ cyber2000fb_decode_crtc(struct par_info *hw, struct cfb_info *cfb, { u_int Htotal, Hblankend, Hsyncend; u_int Vtotal, Vdispend, Vblankstart, Vblankend, Vsyncstart, Vsyncend; -#define BIT(v,b1,m,b2) (((v >> b1) & m) << b2) +#define ENCODE_BIT(v, b1, m, b2) ((((v) >> (b1)) & (m)) << (b2)) hw->crtc[13] = hw->pitch; hw->crtc[17] = 0xe3; hw->crtc[14] = 0; hw->crtc[8] = 0; - Htotal = var->xres + var->right_margin + - var->hsync_len + var->left_margin; + Htotal = var->xres + var->right_margin + + var->hsync_len + var->left_margin; if (Htotal > 2080) return -EINVAL; @@ -567,15 +565,15 @@ cyber2000fb_decode_crtc(struct par_info *hw, struct cfb_info *cfb, hw->crtc[2] = var->xres >> 3; hw->crtc[4] = (var->xres + var->right_margin) >> 3; - Hblankend = (Htotal - 4*8) >> 3; + Hblankend = (Htotal - 4 * 8) >> 3; - hw->crtc[3] = BIT(Hblankend, 0, 0x1f, 0) | - BIT(1, 0, 0x01, 7); + hw->crtc[3] = ENCODE_BIT(Hblankend, 0, 0x1f, 0) | + ENCODE_BIT(1, 0, 0x01, 7); Hsyncend = (var->xres + var->right_margin + var->hsync_len) >> 3; - hw->crtc[5] = BIT(Hsyncend, 0, 0x1f, 0) | - BIT(Hblankend, 5, 0x01, 7); + hw->crtc[5] = ENCODE_BIT(Hsyncend, 0, 0x1f, 0) | + ENCODE_BIT(Hblankend, 5, 0x01, 7); Vdispend = var->yres - 1; Vsyncstart = var->yres + var->lower_margin; @@ -590,20 +588,20 @@ cyber2000fb_decode_crtc(struct par_info *hw, struct cfb_info *cfb, Vblankend = Vtotal - 10; hw->crtc[6] = Vtotal; - hw->crtc[7] = BIT(Vtotal, 8, 0x01, 0) | - BIT(Vdispend, 8, 0x01, 1) | - BIT(Vsyncstart, 8, 0x01, 2) | - BIT(Vblankstart,8, 0x01, 3) | - BIT(1, 0, 0x01, 4) | - BIT(Vtotal, 9, 0x01, 5) | - BIT(Vdispend, 9, 0x01, 6) | - BIT(Vsyncstart, 9, 0x01, 7); - hw->crtc[9] = BIT(0, 0, 0x1f, 0) | - BIT(Vblankstart,9, 0x01, 5) | - BIT(1, 0, 0x01, 6); + hw->crtc[7] = ENCODE_BIT(Vtotal, 8, 0x01, 0) | + ENCODE_BIT(Vdispend, 8, 0x01, 1) | + ENCODE_BIT(Vsyncstart, 8, 0x01, 2) | + ENCODE_BIT(Vblankstart, 8, 0x01, 3) | + ENCODE_BIT(1, 0, 0x01, 4) | + ENCODE_BIT(Vtotal, 9, 0x01, 5) | + ENCODE_BIT(Vdispend, 9, 0x01, 6) | + ENCODE_BIT(Vsyncstart, 9, 0x01, 7); + hw->crtc[9] = ENCODE_BIT(0, 0, 0x1f, 0) | + ENCODE_BIT(Vblankstart, 9, 0x01, 5) | + ENCODE_BIT(1, 0, 0x01, 6); hw->crtc[10] = Vsyncstart; - hw->crtc[11] = BIT(Vsyncend, 0, 0x0f, 0) | - BIT(1, 0, 0x01, 7); + hw->crtc[11] = ENCODE_BIT(Vsyncend, 0, 0x0f, 0) | + ENCODE_BIT(1, 0, 0x01, 7); hw->crtc[12] = Vdispend; hw->crtc[15] = Vblankstart; hw->crtc[16] = Vblankend; @@ -615,10 +613,10 @@ cyber2000fb_decode_crtc(struct par_info *hw, struct cfb_info *cfb, * 4=LINECOMP:10 5-IVIDEO 6=FIXCNT */ hw->crtc_ofl = - BIT(Vtotal, 10, 0x01, 0) | - BIT(Vdispend, 10, 0x01, 1) | - BIT(Vsyncstart, 10, 0x01, 2) | - BIT(Vblankstart,10, 0x01, 3) | + ENCODE_BIT(Vtotal, 10, 0x01, 0) | + ENCODE_BIT(Vdispend, 10, 0x01, 1) | + ENCODE_BIT(Vsyncstart, 10, 0x01, 2) | + ENCODE_BIT(Vblankstart, 10, 0x01, 3) | EXT_CRT_VRTOFL_LINECOMP10; /* woody: set the interlaced bit... */ @@ -750,11 +748,11 @@ cyber2000fb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) var->red.msb_right = 0; var->green.msb_right = 0; var->blue.msb_right = 0; + var->transp.offset = 0; + var->transp.length = 0; switch (var->bits_per_pixel) { case 8: /* PSEUDOCOLOUR, 256 */ - var->transp.offset = 0; - var->transp.length = 0; var->red.offset = 0; var->red.length = 8; var->green.offset = 0; @@ -766,8 +764,6 @@ cyber2000fb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) case 16:/* DIRECTCOLOUR, 64k or 32k */ switch (var->green.length) { case 6: /* RGB565, 64k */ - var->transp.offset = 0; - var->transp.length = 0; var->red.offset = 11; var->red.length = 5; var->green.offset = 5; @@ -778,8 +774,6 @@ cyber2000fb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) default: case 5: /* RGB555, 32k */ - var->transp.offset = 0; - var->transp.length = 0; var->red.offset = 10; var->red.length = 5; var->green.offset = 5; @@ -802,8 +796,6 @@ cyber2000fb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) break; case 24:/* TRUECOLOUR, 16m */ - var->transp.offset = 0; - var->transp.length = 0; var->red.offset = 16; var->red.length = 8; var->green.offset = 8; @@ -830,7 +822,7 @@ cyber2000fb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) mem = var->xres_virtual * var->yres_virtual * (var->bits_per_pixel / 8); if (mem > cfb->fb.fix.smem_len) var->yres_virtual = cfb->fb.fix.smem_len * 8 / - (var->bits_per_pixel * var->xres_virtual); + (var->bits_per_pixel * var->xres_virtual); if (var->yres > var->yres_virtual) var->yres = var->yres_virtual; @@ -921,7 +913,7 @@ static int cyber2000fb_set_par(struct fb_info *info) hw.fetch <<= 1; hw.fetch += 1; - cfb->fb.fix.line_length = var->xres_virtual * var->bits_per_pixel / 8; + cfb->fb.fix.line_length = var->xres_virtual * var->bits_per_pixel / 8; /* * Same here - if the size of the video mode exceeds the @@ -952,7 +944,6 @@ static int cyber2000fb_set_par(struct fb_info *info) return 0; } - /* * Pan or Wrap the Display */ @@ -1002,15 +993,15 @@ static int cyber2000fb_blank(int blank, struct fb_info *info) switch (blank) { case FB_BLANK_POWERDOWN: /* powerdown - both sync lines down */ sync = EXT_SYNC_CTL_VS_0 | EXT_SYNC_CTL_HS_0; - break; + break; case FB_BLANK_HSYNC_SUSPEND: /* hsync off */ sync = EXT_SYNC_CTL_VS_NORMAL | EXT_SYNC_CTL_HS_0; - break; + break; case FB_BLANK_VSYNC_SUSPEND: /* vsync off */ sync = EXT_SYNC_CTL_VS_0 | EXT_SYNC_CTL_HS_NORMAL; break; - case FB_BLANK_NORMAL: /* soft blank */ - default: /* unblank */ + case FB_BLANK_NORMAL: /* soft blank */ + default: /* unblank */ break; } @@ -1018,7 +1009,8 @@ static int cyber2000fb_blank(int blank, struct fb_info *info) if (blank <= 1) { /* turn on ramdacs */ - cfb->ramdac_powerdown &= ~(RAMDAC_DACPWRDN | RAMDAC_BYPASS | RAMDAC_RAMPWRDN); + cfb->ramdac_powerdown &= ~(RAMDAC_DACPWRDN | RAMDAC_BYPASS | + RAMDAC_RAMPWRDN); cyber2000fb_write_ramdac_ctrl(cfb); } @@ -1043,7 +1035,8 @@ static int cyber2000fb_blank(int blank, struct fb_info *info) if (blank >= 2) { /* turn off ramdacs */ - cfb->ramdac_powerdown |= RAMDAC_DACPWRDN | RAMDAC_BYPASS | RAMDAC_RAMPWRDN; + cfb->ramdac_powerdown |= RAMDAC_DACPWRDN | RAMDAC_BYPASS | + RAMDAC_RAMPWRDN; cyber2000fb_write_ramdac_ctrl(cfb); } @@ -1068,7 +1061,7 @@ static struct fb_ops cyber2000fb_ops = { * of this driver. It is here solely at the moment to support the other * CyberPro modules external to this driver. */ -static struct cfb_info *int_cfb_info; +static struct cfb_info *int_cfb_info; /* * Enable access to the extended registers @@ -1085,6 +1078,7 @@ void cyber2000fb_enable_extregs(struct cfb_info *cfb) cyber2000_grphw(EXT_FUNC_CTL, old, cfb); } } +EXPORT_SYMBOL(cyber2000fb_enable_extregs); /* * Disable access to the extended registers @@ -1104,11 +1098,13 @@ void cyber2000fb_disable_extregs(struct cfb_info *cfb) else cfb->func_use_count -= 1; } +EXPORT_SYMBOL(cyber2000fb_disable_extregs); void cyber2000fb_get_fb_var(struct cfb_info *cfb, struct fb_var_screeninfo *var) { memcpy(var, &cfb->fb.var, sizeof(struct fb_var_screeninfo)); } +EXPORT_SYMBOL(cyber2000fb_get_fb_var); /* * Attach a capture/tv driver to the core CyberX0X0 driver. @@ -1122,13 +1118,15 @@ int cyber2000fb_attach(struct cyberpro_info *info, int idx) info->fb_size = int_cfb_info->fb.fix.smem_len; info->enable_extregs = cyber2000fb_enable_extregs; info->disable_extregs = cyber2000fb_disable_extregs; - info->info = int_cfb_info; + info->info = int_cfb_info; - strlcpy(info->dev_name, int_cfb_info->fb.fix.id, sizeof(info->dev_name)); + strlcpy(info->dev_name, int_cfb_info->fb.fix.id, + sizeof(info->dev_name)); } return int_cfb_info != NULL; } +EXPORT_SYMBOL(cyber2000fb_attach); /* * Detach a capture/tv driver from the core CyberX0X0 driver. @@ -1136,12 +1134,7 @@ int cyber2000fb_attach(struct cyberpro_info *info, int idx) void cyber2000fb_detach(int idx) { } - -EXPORT_SYMBOL(cyber2000fb_attach); EXPORT_SYMBOL(cyber2000fb_detach); -EXPORT_SYMBOL(cyber2000fb_enable_extregs); -EXPORT_SYMBOL(cyber2000fb_disable_extregs); -EXPORT_SYMBOL(cyber2000fb_get_fb_var); /* * These parameters give @@ -1205,7 +1198,7 @@ static void cyberpro_init_hw(struct cfb_info *cfb) int i; for (i = 0; i < sizeof(igs_regs); i += 2) - cyber2000_grphw(igs_regs[i], igs_regs[i+1], cfb); + cyber2000_grphw(igs_regs[i], igs_regs[i + 1], cfb); if (cfb->id == ID_CYBERPRO_5000) { unsigned char val; @@ -1215,8 +1208,8 @@ static void cyberpro_init_hw(struct cfb_info *cfb) } } -static struct cfb_info * __devinit -cyberpro_alloc_fb_info(unsigned int id, char *name) +static struct cfb_info __devinit *cyberpro_alloc_fb_info(unsigned int id, + char *name) { struct cfb_info *cfb; @@ -1228,9 +1221,9 @@ cyberpro_alloc_fb_info(unsigned int id, char *name) cfb->id = id; if (id == ID_CYBERPRO_5000) - cfb->ref_ps = 40690; // 24.576 MHz + cfb->ref_ps = 40690; /* 24.576 MHz */ else - cfb->ref_ps = 69842; // 14.31818 MHz (69841?) + cfb->ref_ps = 69842; /* 14.31818 MHz (69841?) */ cfb->divisors[0] = 1; cfb->divisors[1] = 2; @@ -1282,8 +1275,7 @@ cyberpro_alloc_fb_info(unsigned int id, char *name) return cfb; } -static void -cyberpro_free_fb_info(struct cfb_info *cfb) +static void cyberpro_free_fb_info(struct cfb_info *cfb) { if (cfb) { /* @@ -1300,8 +1292,7 @@ cyberpro_free_fb_info(struct cfb_info *cfb) * video=cyber2000:font:fontname */ #ifndef MODULE -static int -cyber2000fb_setup(char *options) +static int cyber2000fb_setup(char *options) { char *opt; @@ -1315,7 +1306,8 @@ cyber2000fb_setup(char *options) if (strncmp(opt, "font:", 5) == 0) { static char default_font_storage[40]; - strlcpy(default_font_storage, opt + 5, sizeof(default_font_storage)); + strlcpy(default_font_storage, opt + 5, + sizeof(default_font_storage)); default_font = default_font_storage; continue; } @@ -1354,10 +1346,18 @@ static int __devinit cyberpro_common_probe(struct cfb_info *cfb) * Determine the size of the memory. */ switch (cfb->mem_ctl2 & MEM_CTL2_SIZE_MASK) { - case MEM_CTL2_SIZE_4MB: smem_size = 0x00400000; break; - case MEM_CTL2_SIZE_2MB: smem_size = 0x00200000; break; - case MEM_CTL2_SIZE_1MB: smem_size = 0x00100000; break; - default: smem_size = 0x00100000; break; + case MEM_CTL2_SIZE_4MB: + smem_size = 0x00400000; + break; + case MEM_CTL2_SIZE_2MB: + smem_size = 0x00200000; + break; + case MEM_CTL2_SIZE_1MB: + smem_size = 0x00100000; + break; + default: + smem_size = 0x00100000; + break; } cfb->fb.fix.smem_len = smem_size; @@ -1366,8 +1366,8 @@ static int __devinit cyberpro_common_probe(struct cfb_info *cfb) err = -EINVAL; if (!fb_find_mode(&cfb->fb.var, &cfb->fb, NULL, NULL, 0, - &cyber2000fb_default_mode, 8)) { - printk("%s: no valid mode found\n", cfb->fb.fix.id); + &cyber2000fb_default_mode, 8)) { + printk(KERN_ERR "%s: no valid mode found\n", cfb->fb.fix.id); goto failed; } @@ -1377,7 +1377,7 @@ static int __devinit cyberpro_common_probe(struct cfb_info *cfb) if (cfb->fb.var.yres_virtual < cfb->fb.var.yres) cfb->fb.var.yres_virtual = cfb->fb.var.yres; -// fb_set_var(&cfb->fb.var, -1, &cfb->fb); +/* fb_set_var(&cfb->fb.var, -1, &cfb->fb); */ /* * Calculate the hsync and vsync frequencies. Note that @@ -1425,20 +1425,20 @@ static void cyberpro_common_resume(struct cfb_info *cfb) #include <asm/arch/hardware.h> -static int __devinit -cyberpro_vl_probe(void) +static int __devinit cyberpro_vl_probe(void) { struct cfb_info *cfb; int err = -ENOMEM; - if (!request_mem_region(FB_START,FB_SIZE,"CyberPro2010")) return err; + if (!request_mem_region(FB_START, FB_SIZE, "CyberPro2010")) + return err; cfb = cyberpro_alloc_fb_info(ID_CYBERPRO_2010, "CyberPro2010"); if (!cfb) goto failed_release; cfb->dev = NULL; - cfb->region = ioremap(FB_START,FB_SIZE); + cfb->region = ioremap(FB_START, FB_SIZE); if (!cfb->region) goto failed_ioremap; @@ -1475,7 +1475,7 @@ failed: failed_ioremap: cyberpro_free_fb_info(cfb); failed_release: - release_mem_region(FB_START,FB_SIZE); + release_mem_region(FB_START, FB_SIZE); return err; } @@ -1538,7 +1538,8 @@ static int cyberpro_pci_enable_mmio(struct cfb_info *cfb) * Allow the CyberPro to accept PCI burst accesses */ if (cfb->id == ID_CYBERPRO_2010) { - printk(KERN_INFO "%s: NOT enabling PCI bursts\n", cfb->fb.fix.id); + printk(KERN_INFO "%s: NOT enabling PCI bursts\n", + cfb->fb.fix.id); } else { val = cyber2000_grphr(EXT_BUS_CTL, cfb); if (!(val & EXT_BUS_CTL_PCIBURST_WRITE)) { @@ -1688,9 +1689,10 @@ static int cyberpro_pci_resume(struct pci_dev *dev) } static struct pci_device_id cyberpro_pci_table[] = { -// Not yet -// { PCI_VENDOR_ID_INTERG, PCI_DEVICE_ID_INTERG_1682, -// PCI_ANY_ID, PCI_ANY_ID, 0, 0, ID_IGA_1682 }, +/* Not yet + * { PCI_VENDOR_ID_INTERG, PCI_DEVICE_ID_INTERG_1682, + * PCI_ANY_ID, PCI_ANY_ID, 0, 0, ID_IGA_1682 }, + */ { PCI_VENDOR_ID_INTERG, PCI_DEVICE_ID_INTERG_2000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ID_CYBERPRO_2000 }, { PCI_VENDOR_ID_INTERG, PCI_DEVICE_ID_INTERG_2010, @@ -1700,7 +1702,7 @@ static struct pci_device_id cyberpro_pci_table[] = { { 0, } }; -MODULE_DEVICE_TABLE(pci,cyberpro_pci_table); +MODULE_DEVICE_TABLE(pci, cyberpro_pci_table); static struct pci_driver cyberpro_driver = { .name = "CyberPro", diff --git a/drivers/video/pnx4008/sdum.h b/drivers/video/pnx4008/sdum.h index e8c5dcdd8813..189c3d641383 100644 --- a/drivers/video/pnx4008/sdum.h +++ b/drivers/video/pnx4008/sdum.h @@ -77,9 +77,6 @@ #define CONF_DIRTYDETECTION_OFF (0x600) #define CONF_DIRTYDETECTION_ON (0x601) -/* Set the corresponding bit. */ -#define BIT(n) (0x1U << (n)) - struct dumchannel_uf { int channelnr; u32 *dirty; diff --git a/drivers/watchdog/at91rm9200_wdt.c b/drivers/watchdog/at91rm9200_wdt.c index 38bd37372599..a684b1e87372 100644 --- a/drivers/watchdog/at91rm9200_wdt.c +++ b/drivers/watchdog/at91rm9200_wdt.c @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/bitops.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/init.h> @@ -19,7 +20,6 @@ #include <linux/platform_device.h> #include <linux/types.h> #include <linux/watchdog.h> -#include <asm/bitops.h> #include <asm/uaccess.h> #include <asm/arch/at91_st.h> diff --git a/drivers/watchdog/ks8695_wdt.c b/drivers/watchdog/ks8695_wdt.c index 7150fb945eaf..e3a29c302309 100644 --- a/drivers/watchdog/ks8695_wdt.c +++ b/drivers/watchdog/ks8695_wdt.c @@ -8,6 +8,7 @@ * published by the Free Software Foundation. */ +#include <linux/bitops.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/init.h> @@ -18,7 +19,6 @@ #include <linux/platform_device.h> #include <linux/types.h> #include <linux/watchdog.h> -#include <asm/bitops.h> #include <asm/io.h> #include <asm/uaccess.h> #include <asm/arch/regs-timer.h> diff --git a/drivers/watchdog/omap_wdt.c b/drivers/watchdog/omap_wdt.c index 719b066f73c4..635ca454f56b 100644 --- a/drivers/watchdog/omap_wdt.c +++ b/drivers/watchdog/omap_wdt.c @@ -39,11 +39,11 @@ #include <linux/platform_device.h> #include <linux/moduleparam.h> #include <linux/clk.h> +#include <linux/bitops.h> #include <asm/io.h> #include <asm/uaccess.h> #include <asm/hardware.h> -#include <asm/bitops.h> #include <asm/arch/prcm.h> diff --git a/drivers/watchdog/sa1100_wdt.c b/drivers/watchdog/sa1100_wdt.c index 3475f47aaa45..34a2b3b81800 100644 --- a/drivers/watchdog/sa1100_wdt.c +++ b/drivers/watchdog/sa1100_wdt.c @@ -25,13 +25,13 @@ #include <linux/miscdevice.h> #include <linux/watchdog.h> #include <linux/init.h> +#include <linux/bitops.h> #ifdef CONFIG_ARCH_PXA #include <asm/arch/pxa-regs.h> #endif #include <asm/hardware.h> -#include <asm/bitops.h> #include <asm/uaccess.h> #define OSCR_FREQ CLOCK_TICK_RATE diff --git a/fs/Kconfig b/fs/Kconfig index e31f3691b151..cc28a69246a7 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -220,7 +220,7 @@ config JBD config JBD_DEBUG bool "JBD (ext3) debugging support" - depends on JBD + depends on JBD && DEBUG_FS help If you are using the ext3 journaled file system (or potentially any other file system/device using JBD), this option allows you to @@ -229,10 +229,10 @@ config JBD_DEBUG debugging output will be turned off. If you select Y here, then you will be able to turn on debugging - with "echo N > /proc/sys/fs/jbd-debug", where N is a number between - 1 and 5, the higher the number, the more debugging output is - generated. To turn debugging off again, do - "echo 0 > /proc/sys/fs/jbd-debug". + with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a + number between 1 and 5, the higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/kernel/debug/jbd/jbd-debug". config JBD2 tristate diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index e7204d71acc9..45f5992a0957 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -80,7 +80,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, *uid = current->uid; *gid = current->gid; - *pgrp = process_group(current); + *pgrp = task_pgrp_nr(current); *minproto = *maxproto = AUTOFS_PROTO_VERSION; diff --git a/fs/autofs/root.c b/fs/autofs/root.c index c1489533277a..5efff3c0d886 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -214,8 +214,8 @@ static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentr oz_mode = autofs_oz_mode(sbi); DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, " - "oz_mode = %d\n", pid_nr(task_pid(current)), - process_group(current), sbi->catatonic, + "oz_mode = %d\n", task_pid_nr(current), + task_pgrp_nr(current), sbi->catatonic, oz_mode)); /* @@ -536,7 +536,7 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp, struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); void __user *argp = (void __user *)arg; - DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,process_group(current))); + DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,task_pgrp_nr(current))); if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index d85f42fa9206..2d4ae40718d9 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -131,7 +131,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) filesystem without "magic".) */ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || process_group(current) == sbi->oz_pgrp; + return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp; } /* Does a dentry have some pending activity? */ diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index cd81f0836671..7f05d6ccdb13 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -226,7 +226,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, *uid = current->uid; *gid = current->gid; - *pgrp = process_group(current); + *pgrp = task_pgrp_nr(current); *minproto = AUTOFS_MIN_PROTO_VERSION; *maxproto = AUTOFS_MAX_PROTO_VERSION; @@ -323,7 +323,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->pipe = NULL; sbi->catatonic = 1; sbi->exp_timeout = 0; - sbi->oz_pgrp = process_group(current); + sbi->oz_pgrp = task_pgrp_nr(current); sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 45ff3d63b758..2bbcc8151dc3 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -582,7 +582,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s oz_mode = autofs4_oz_mode(sbi); DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", - current->pid, process_group(current), sbi->catatonic, oz_mode); + current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); unhashed = autofs4_lookup_unhashed(sbi, dentry->d_parent, &dentry->d_name); if (!unhashed) { @@ -976,7 +976,7 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp, void __user *p = (void __user *)arg; DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u", - cmd,arg,sbi,process_group(current)); + cmd,arg,sbi,task_pgrp_nr(current)); if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6e2f3b8dde7f..ba8de7ca260b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1383,10 +1383,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; prstatus->pr_sigpend = p->pending.signal.sig[0]; prstatus->pr_sighold = p->blocked.sig[0]; - prstatus->pr_pid = p->pid; - prstatus->pr_ppid = p->parent->pid; - prstatus->pr_pgrp = process_group(p); - prstatus->pr_sid = process_session(p); + prstatus->pr_pid = task_pid_vnr(p); + prstatus->pr_ppid = task_pid_vnr(p->parent); + prstatus->pr_pgrp = task_pgrp_vnr(p); + prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { /* * This is the record for the group leader. Add in the @@ -1429,10 +1429,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_psargs[i] = ' '; psinfo->pr_psargs[len] = 0; - psinfo->pr_pid = p->pid; - psinfo->pr_ppid = p->parent->pid; - psinfo->pr_pgrp = process_group(p); - psinfo->pr_sid = process_session(p); + psinfo->pr_pid = task_pid_vnr(p); + psinfo->pr_ppid = task_pid_vnr(p->parent); + psinfo->pr_pgrp = task_pgrp_vnr(p); + psinfo->pr_sid = task_session_vnr(p); i = p->state ? ffz(~p->state) + 1 : 0; psinfo->pr_state = i; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 033861c6b8f1..32649f2a1654 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1342,10 +1342,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; prstatus->pr_sigpend = p->pending.signal.sig[0]; prstatus->pr_sighold = p->blocked.sig[0]; - prstatus->pr_pid = p->pid; - prstatus->pr_ppid = p->parent->pid; - prstatus->pr_pgrp = process_group(p); - prstatus->pr_sid = process_session(p); + prstatus->pr_pid = task_pid_vnr(p); + prstatus->pr_ppid = task_pid_vnr(p->parent); + prstatus->pr_pgrp = task_pgrp_vnr(p); + prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { /* * This is the record for the group leader. Add in the @@ -1391,10 +1391,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_psargs[i] = ' '; psinfo->pr_psargs[len] = 0; - psinfo->pr_pid = p->pid; - psinfo->pr_ppid = p->parent->pid; - psinfo->pr_pgrp = process_group(p); - psinfo->pr_sid = process_session(p); + psinfo->pr_pid = task_pid_vnr(p); + psinfo->pr_ppid = task_pid_vnr(p->parent); + psinfo->pr_pgrp = task_pgrp_vnr(p); + psinfo->pr_sid = task_session_vnr(p); i = p->state ? ffz(~p->state) + 1 : 0; psinfo->pr_state = i; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4af3588c1a96..370866cb3d48 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -352,7 +352,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) current->flags |= PF_MEMALLOC; server->tsk = current; /* save process info to wake at shutdown */ - cFYI(1, ("Demultiplex PID: %d", current->pid)); + cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current))); write_lock(&GlobalSMBSeslock); atomic_inc(&tcpSesAllocCount); length = tcpSesAllocCount.counter; diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index cdb4c07a7870..359e531094dd 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -51,7 +51,7 @@ static void *alloc_upcall(int opcode, int size) inp->ih.opcode = opcode; inp->ih.pid = current->pid; - inp->ih.pgid = process_group(current); + inp->ih.pgid = task_pgrp_nr(current); #ifdef CONFIG_CODA_FS_OLD_API memset(&inp->ih.cred, 0, sizeof(struct coda_cred)); inp->ih.cred.cr_fsuid = current->fsuid; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 6438941ab1f8..4f741546f4bb 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -456,7 +456,7 @@ static int check_version(struct dlm_write_request *req) printk(KERN_DEBUG "dlm: process %s (%d) version mismatch " "user (%d.%d.%d) kernel (%d.%d.%d)\n", current->comm, - current->pid, + task_pid_nr(current), req->version[0], req->version[1], req->version[2], diff --git a/fs/eventpoll.c b/fs/eventpoll.c index de6189291954..34f68f3a069a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -325,15 +325,14 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) int wake_nests = 0; unsigned long flags; struct task_struct *this_task = current; - struct list_head *lsthead = &psw->wake_task_list, *lnk; + struct list_head *lsthead = &psw->wake_task_list; struct wake_task_node *tncur; struct wake_task_node tnode; spin_lock_irqsave(&psw->lock, flags); /* Try to see if the current task is already inside this wakeup call */ - list_for_each(lnk, lsthead) { - tncur = list_entry(lnk, struct wake_task_node, llink); + list_for_each_entry(tncur, lsthead, llink) { if (tncur->wq == wq || (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) { diff --git a/fs/exec.c b/fs/exec.c index 070ddf13cb71..2c942e2d14ea 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -234,7 +234,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_STACK_FLAGS; - vma->vm_page_prot = protection_map[vma->vm_flags & 0x7]; + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); if (err) { up_write(&mm->mmap_sem); @@ -775,8 +775,8 @@ static int de_thread(struct task_struct *tsk) * Reparenting needs write_lock on tasklist_lock, * so it is safe to do it under read_lock. */ - if (unlikely(tsk->group_leader == child_reaper(tsk))) - tsk->nsproxy->pid_ns->child_reaper = tsk; + if (unlikely(tsk->group_leader == task_child_reaper(tsk))) + task_active_pid_ns(tsk)->child_reaper = tsk; zap_other_threads(tsk); read_unlock(&tasklist_lock); @@ -841,8 +841,8 @@ static int de_thread(struct task_struct *tsk) */ tsk->start_time = leader->start_time; - BUG_ON(leader->tgid != tsk->tgid); - BUG_ON(tsk->pid == tsk->tgid); + BUG_ON(!same_thread_group(leader, tsk)); + BUG_ON(has_group_leader_pid(tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the @@ -857,7 +857,7 @@ static int de_thread(struct task_struct *tsk) */ detach_pid(tsk, PIDTYPE_PID); tsk->pid = leader->pid; - attach_pid(tsk, PIDTYPE_PID, find_pid(tsk->pid)); + attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); @@ -1433,7 +1433,7 @@ static int format_corename(char *corename, const char *pattern, long signr) case 'p': pid_in_pattern = 1; rc = snprintf(out_ptr, out_end - out_ptr, - "%d", current->tgid); + "%d", task_tgid_vnr(current)); if (rc > out_end - out_ptr) goto out; out_ptr += rc; @@ -1513,7 +1513,7 @@ static int format_corename(char *corename, const char *pattern, long signr) if (!ispipe && !pid_in_pattern && (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1)) { rc = snprintf(out_ptr, out_end - out_ptr, - ".%d", current->tgid); + ".%d", task_tgid_vnr(current)); if (rc > out_end - out_ptr) goto out; out_ptr += rc; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3dec003b773e..9b162cd6c16c 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2954,7 +2954,7 @@ int ext3_write_inode(struct inode *inode, int wait) return 0; if (ext3_journal_current_handle()) { - jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); dump_stack(); return -EIO; } diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index f58cbb26323e..408373819e34 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -741,12 +741,11 @@ ext3_xattr_block_set(handle_t *handle, struct inode *inode, } } else { /* Allocate a buffer where we construct the new block. */ - s->base = kmalloc(sb->s_blocksize, GFP_KERNEL); + s->base = kzalloc(sb->s_blocksize, GFP_KERNEL); /* assert(header == s->base) */ error = -ENOMEM; if (s->base == NULL) goto cleanup; - memset(s->base, 0, sb->s_blocksize); header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); header(s->base)->h_blocks = cpu_to_le32(1); header(s->base)->h_refcount = cpu_to_le32(1); diff --git a/fs/fcntl.c b/fs/fcntl.c index c9db73fc5e3d..8685263ccc4a 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -18,6 +18,7 @@ #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/rcupdate.h> +#include <linux/pid_namespace.h> #include <asm/poll.h> #include <asm/siginfo.h> @@ -292,7 +293,7 @@ int f_setown(struct file *filp, unsigned long arg, int force) who = -who; } rcu_read_lock(); - pid = find_pid(who); + pid = find_vpid(who); result = __f_setown(filp, pid, type, force); rcu_read_unlock(); return result; @@ -308,7 +309,7 @@ pid_t f_getown(struct file *filp) { pid_t pid; read_lock(&filp->f_owner.lock); - pid = pid_nr(filp->f_owner.pid); + pid = pid_nr_ns(filp->f_owner.pid, current->nsproxy->pid_ns); if (filp->f_owner.pid_type == PIDTYPE_PGID) pid = -pid; read_unlock(&filp->f_owner.lock); diff --git a/fs/file_table.c b/fs/file_table.c index 3176fefc92e1..664e3f2309b8 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -323,12 +323,11 @@ void file_kill(struct file *file) int fs_may_remount_ro(struct super_block *sb) { - struct list_head *p; + struct file *file; /* Check that no files are currently opened for writing. */ file_list_lock(); - list_for_each(p, &sb->s_files) { - struct file *file = list_entry(p, struct file, f_u.fu_list); + list_for_each_entry(file, &sb->s_files, f_u.fu_list) { struct inode *inode = file->f_path.dentry->d_inode; /* File with pending delete? */ diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 686734ff973d..0fca82021d76 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -89,7 +89,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) printk(KERN_DEBUG "%s(%d): dirtied inode %lu (%s) on %s\n", - current->comm, current->pid, inode->i_ino, + current->comm, task_pid_nr(current), inode->i_ino, name, inode->i_sb->s_id); } diff --git a/fs/ioprio.c b/fs/ioprio.c index 10d2c211d18b..d6ff77e8e7ec 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -25,6 +25,7 @@ #include <linux/capability.h> #include <linux/syscalls.h> #include <linux/security.h> +#include <linux/pid_namespace.h> static int set_task_ioprio(struct task_struct *task, int ioprio) { @@ -93,7 +94,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) if (!who) p = current; else - p = find_task_by_pid(who); + p = find_task_by_vpid(who); if (p) ret = set_task_ioprio(p, ioprio); break; @@ -101,7 +102,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) if (!who) pgrp = task_pgrp(current); else - pgrp = find_pid(who); + pgrp = find_vpid(who); do_each_pid_task(pgrp, PIDTYPE_PGID, p) { ret = set_task_ioprio(p, ioprio); if (ret) @@ -180,7 +181,7 @@ asmlinkage long sys_ioprio_get(int which, int who) if (!who) p = current; else - p = find_task_by_pid(who); + p = find_task_by_vpid(who); if (p) ret = get_task_ioprio(p); break; @@ -188,7 +189,7 @@ asmlinkage long sys_ioprio_get(int which, int who) if (!who) pgrp = task_pgrp(current); else - pgrp = find_pid(who); + pgrp = find_vpid(who); do_each_pid_task(pgrp, PIDTYPE_PGID, p) { tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index a263d82761df..8f1f2aa5fb39 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -466,7 +466,7 @@ void journal_commit_transaction(journal_t *journal) spin_unlock(&journal->j_list_lock); if (err) - __journal_abort_hard(journal); + journal_abort(journal, err); journal_write_revoke_records(journal, commit_transaction); @@ -524,7 +524,7 @@ void journal_commit_transaction(journal_t *journal) descriptor = journal_get_descriptor_buffer(journal); if (!descriptor) { - __journal_abort_hard(journal); + journal_abort(journal, -EIO); continue; } @@ -557,7 +557,7 @@ void journal_commit_transaction(journal_t *journal) and repeat this loop: we'll fall into the refile-on-abort condition above. */ if (err) { - __journal_abort_hard(journal); + journal_abort(journal, err); continue; } @@ -748,7 +748,7 @@ wait_for_iobuf: err = -EIO; if (err) - __journal_abort_hard(journal); + journal_abort(journal, err); /* End of a transaction! Finally, we can do checkpoint processing: any buffers committed as a result of this diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 5d9fec0b7ebd..5d14243499d4 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -35,6 +35,7 @@ #include <linux/kthread.h> #include <linux/poison.h> #include <linux/proc_fs.h> +#include <linux/debugfs.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -654,10 +655,9 @@ static journal_t * journal_init_common (void) journal_t *journal; int err; - journal = kmalloc(sizeof(*journal), GFP_KERNEL); + journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) goto fail; - memset(journal, 0, sizeof(*journal)); init_waitqueue_head(&journal->j_wait_transaction_locked); init_waitqueue_head(&journal->j_wait_logspace); @@ -1852,64 +1852,41 @@ void journal_put_journal_head(struct journal_head *jh) } /* - * /proc tunables + * debugfs tunables */ -#if defined(CONFIG_JBD_DEBUG) -int journal_enable_debug; -EXPORT_SYMBOL(journal_enable_debug); -#endif +#ifdef CONFIG_JBD_DEBUG -#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS) +u8 journal_enable_debug __read_mostly; +EXPORT_SYMBOL(journal_enable_debug); -static struct proc_dir_entry *proc_jbd_debug; +static struct dentry *jbd_debugfs_dir; +static struct dentry *jbd_debug; -static int read_jbd_debug(char *page, char **start, off_t off, - int count, int *eof, void *data) +static void __init jbd_create_debugfs_entry(void) { - int ret; - - ret = sprintf(page + off, "%d\n", journal_enable_debug); - *eof = 1; - return ret; + jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); + if (jbd_debugfs_dir) + jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO, + jbd_debugfs_dir, + &journal_enable_debug); } -static int write_jbd_debug(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static void __exit jbd_remove_debugfs_entry(void) { - char buf[32]; - - if (count > ARRAY_SIZE(buf) - 1) - count = ARRAY_SIZE(buf) - 1; - if (copy_from_user(buf, buffer, count)) - return -EFAULT; - buf[ARRAY_SIZE(buf) - 1] = '\0'; - journal_enable_debug = simple_strtoul(buf, NULL, 10); - return count; + debugfs_remove(jbd_debug); + debugfs_remove(jbd_debugfs_dir); } -#define JBD_PROC_NAME "sys/fs/jbd-debug" +#else -static void __init create_jbd_proc_entry(void) +static inline void jbd_create_debugfs_entry(void) { - proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL); - if (proc_jbd_debug) { - /* Why is this so hard? */ - proc_jbd_debug->read_proc = read_jbd_debug; - proc_jbd_debug->write_proc = write_jbd_debug; - } } -static void __exit remove_jbd_proc_entry(void) +static inline void jbd_remove_debugfs_entry(void) { - if (proc_jbd_debug) - remove_proc_entry(JBD_PROC_NAME, NULL); } -#else - -#define create_jbd_proc_entry() do {} while (0) -#define remove_jbd_proc_entry() do {} while (0) - #endif struct kmem_cache *jbd_handle_cache; @@ -1966,7 +1943,7 @@ static int __init journal_init(void) ret = journal_init_caches(); if (ret != 0) journal_destroy_caches(); - create_jbd_proc_entry(); + jbd_create_debugfs_entry(); return ret; } @@ -1977,7 +1954,7 @@ static void __exit journal_exit(void) if (n) printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); #endif - remove_jbd_proc_entry(); + jbd_remove_debugfs_entry(); journal_destroy_caches(); } diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 2a5f4b833e35..c5d9694b6a2f 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -250,10 +250,10 @@ int journal_recover(journal_t *journal) if (!err) err = do_one_pass(journal, &info, PASS_REPLAY); - jbd_debug(0, "JBD: recovery, exit status %d, " + jbd_debug(1, "JBD: recovery, exit status %d, " "recovered transactions %u to %u\n", err, info.start_transaction, info.end_transaction); - jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", + jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", info.nr_replays, info.nr_revoke_hits, info.nr_revokes); /* Restart the log at the next transaction ID, thus invalidating @@ -297,7 +297,7 @@ int journal_skip_recovery(journal_t *journal) #ifdef CONFIG_JBD_DEBUG int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); #endif - jbd_debug(0, + jbd_debug(1, "JBD: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); journal->j_transaction_sequence = ++info.end_transaction; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 9841b1e5af03..08ff6c7028cc 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -96,13 +96,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle) alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kmalloc(sizeof(*new_transaction), + new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS|__GFP_NOFAIL); if (!new_transaction) { ret = -ENOMEM; goto out; } - memset(new_transaction, 0, sizeof(*new_transaction)); } jbd_debug(3, "New handle %p going live.\n", handle); diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h index 2a49f2c51a9f..4130adabd76e 100644 --- a/fs/jffs2/debug.h +++ b/fs/jffs2/debug.h @@ -80,28 +80,28 @@ #define JFFS2_ERROR(fmt, ...) \ do { \ printk(JFFS2_ERR_MSG_PREFIX \ - " (%d) %s: " fmt, current->pid, \ + " (%d) %s: " fmt, task_pid_nr(current), \ __FUNCTION__ , ##__VA_ARGS__); \ } while(0) #define JFFS2_WARNING(fmt, ...) \ do { \ printk(JFFS2_WARN_MSG_PREFIX \ - " (%d) %s: " fmt, current->pid, \ + " (%d) %s: " fmt, task_pid_nr(current), \ __FUNCTION__ , ##__VA_ARGS__); \ } while(0) #define JFFS2_NOTICE(fmt, ...) \ do { \ printk(JFFS2_NOTICE_MSG_PREFIX \ - " (%d) %s: " fmt, current->pid, \ + " (%d) %s: " fmt, task_pid_nr(current), \ __FUNCTION__ , ##__VA_ARGS__); \ } while(0) #define JFFS2_DEBUG(fmt, ...) \ do { \ printk(JFFS2_DBG_MSG_PREFIX \ - " (%d) %s: " fmt, current->pid, \ + " (%d) %s: " fmt, task_pid_nr(current), \ __FUNCTION__ , ##__VA_ARGS__); \ } while(0) diff --git a/fs/namespace.c b/fs/namespace.c index 07daa7972591..860752998fb3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1411,7 +1411,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_RELATIME; flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | - MS_NOATIME | MS_NODIRATIME | MS_RELATIME); + MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); /* ... and get the mountpoint */ retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 46934c97f8f7..d0199189924c 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1029,13 +1029,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (EX_WGATHER(exp)) { if (atomic_read(&inode->i_writecount) > 1 || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { - dprintk("nfsd: write defer %d\n", current->pid); + dprintk("nfsd: write defer %d\n", task_pid_nr(current)); msleep(10); - dprintk("nfsd: write resume %d\n", current->pid); + dprintk("nfsd: write resume %d\n", task_pid_nr(current)); } if (inode->i_state & I_DIRTY) { - dprintk("nfsd: write sync %d\n", current->pid); + dprintk("nfsd: write sync %d\n", task_pid_nr(current)); host_err=nfsd_sync(file); } #if 0 diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f14b541fab95..9cc7c0418b70 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1372,7 +1372,7 @@ static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, spin_lock(&o2hb_live_lock); if (reg->hr_task) - pid = reg->hr_task->pid; + pid = task_pid_nr(reg->hr_task); spin_unlock(&o2hb_live_lock); if (!pid) diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 75cd877f6d42..cd046060114e 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -192,7 +192,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; * previous token if args expands to nothing. */ #define __mlog_printk(level, fmt, args...) \ - printk(level "(%u,%lu):%s:%d " fmt, current->pid, \ + printk(level "(%u,%lu):%s:%d " fmt, task_pid_nr(current), \ __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \ ##args) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index a2c33160bfd6..2fde7bf91434 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -259,7 +259,7 @@ static void dlm_print_reco_node_status(struct dlm_ctxt *dlm) struct dlm_lock_resource *res; mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n", - dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive", dlm->reco.dead_node, dlm->reco.new_master); @@ -420,7 +420,7 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm) if (dlm_in_recovery(dlm)) { mlog(0, "%s: reco thread %d in recovery: " "state=%d, master=%u, dead=%u\n", - dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.state, dlm->reco.new_master, dlm->reco.dead_node); } @@ -483,7 +483,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) return 0; } mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", - dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.dead_node); spin_unlock(&dlm->spinlock); @@ -507,7 +507,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) mlog(0, "another node will master this recovery session.\n"); } mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", - dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master, + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, dlm->node_num, dlm->reco.dead_node); /* it is safe to start everything back up here @@ -520,7 +520,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) master_here: mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n", - dlm->dlm_reco_thread_task->pid, + task_pid_nr(dlm->dlm_reco_thread_task), dlm->name, dlm->reco.dead_node, dlm->node_num); status = dlm_remaster_locks(dlm, dlm->reco.dead_node); diff --git a/fs/proc/array.c b/fs/proc/array.c index 27b59f5f3bd1..7a34571203bc 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -77,6 +77,7 @@ #include <linux/cpuset.h> #include <linux/rcupdate.h> #include <linux/delayacct.h> +#include <linux/pid_namespace.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -145,8 +146,7 @@ static inline const char *get_task_state(struct task_struct *tsk) TASK_UNINTERRUPTIBLE | TASK_STOPPED | TASK_TRACED)) | - (tsk->exit_state & (EXIT_ZOMBIE | - EXIT_DEAD)); + tsk->exit_state; const char **p = &task_state_array[0]; while (state) { @@ -161,8 +161,15 @@ static inline char *task_state(struct task_struct *p, char *buffer) struct group_info *group_info; int g; struct fdtable *fdt = NULL; + struct pid_namespace *ns; + pid_t ppid, tpid; + ns = current->nsproxy->pid_ns; rcu_read_lock(); + ppid = pid_alive(p) ? + task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; + tpid = pid_alive(p) && p->ptrace ? + task_ppid_nr_ns(rcu_dereference(p->parent), ns) : 0; buffer += sprintf(buffer, "State:\t%s\n" "Tgid:\t%d\n" @@ -172,9 +179,9 @@ static inline char *task_state(struct task_struct *p, char *buffer) "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - p->tgid, p->pid, - pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, - pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, + task_tgid_nr_ns(p, ns), + task_pid_nr_ns(p, ns), + ppid, tpid, p->uid, p->euid, p->suid, p->fsuid, p->gid, p->egid, p->sgid, p->fsgid); @@ -394,6 +401,9 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; unsigned long flags; + struct pid_namespace *ns; + + ns = current->nsproxy->pid_ns; state = *get_task_state(task); vsize = eip = esp = 0; @@ -416,7 +426,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) struct signal_struct *sig = task->signal; if (sig->tty) { - tty_pgrp = pid_nr(sig->tty->pgrp); + tty_pgrp = pid_nr_ns(sig->tty->pgrp, ns); tty_nr = new_encode_dev(tty_devnum(sig->tty)); } @@ -449,9 +459,9 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) gtime += cputime_add(gtime, sig->gtime); } - sid = signal_session(sig); - pgid = process_group(task); - ppid = rcu_dereference(task->real_parent)->tgid; + sid = task_session_nr_ns(task, ns); + pgid = task_pgrp_nr_ns(task, ns); + ppid = task_ppid_nr_ns(task, ns); unlock_task_sighand(task, &flags); } @@ -483,7 +493,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", - task->pid, + task_pid_nr_ns(task, ns), tcomm, state, ppid, diff --git a/fs/proc/base.c b/fs/proc/base.c index 4fe74d156416..39a3d7c969c5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -63,16 +63,19 @@ #include <linux/mm.h> #include <linux/rcupdate.h> #include <linux/kallsyms.h> +#include <linux/resource.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/ptrace.h> +#include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> #include <linux/poll.h> #include <linux/nsproxy.h> #include <linux/oom.h> #include <linux/elf.h> +#include <linux/pid_namespace.h> #include "internal.h" /* NOTE: @@ -301,6 +304,78 @@ static int proc_oom_score(struct task_struct *task, char *buffer) return sprintf(buffer, "%lu\n", points); } +struct limit_names { + char *name; + char *unit; +}; + +static const struct limit_names lnames[RLIM_NLIMITS] = { + [RLIMIT_CPU] = {"Max cpu time", "ms"}, + [RLIMIT_FSIZE] = {"Max file size", "bytes"}, + [RLIMIT_DATA] = {"Max data size", "bytes"}, + [RLIMIT_STACK] = {"Max stack size", "bytes"}, + [RLIMIT_CORE] = {"Max core file size", "bytes"}, + [RLIMIT_RSS] = {"Max resident set", "bytes"}, + [RLIMIT_NPROC] = {"Max processes", "processes"}, + [RLIMIT_NOFILE] = {"Max open files", "files"}, + [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"}, + [RLIMIT_AS] = {"Max address space", "bytes"}, + [RLIMIT_LOCKS] = {"Max file locks", "locks"}, + [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"}, + [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"}, + [RLIMIT_NICE] = {"Max nice priority", NULL}, + [RLIMIT_RTPRIO] = {"Max realtime priority", NULL}, +}; + +/* Display limits for a process */ +static int proc_pid_limits(struct task_struct *task, char *buffer) +{ + unsigned int i; + int count = 0; + unsigned long flags; + char *bufptr = buffer; + + struct rlimit rlim[RLIM_NLIMITS]; + + rcu_read_lock(); + if (!lock_task_sighand(task,&flags)) { + rcu_read_unlock(); + return 0; + } + memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); + unlock_task_sighand(task, &flags); + rcu_read_unlock(); + + /* + * print the file header + */ + count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n", + "Limit", "Soft Limit", "Hard Limit", "Units"); + + for (i = 0; i < RLIM_NLIMITS; i++) { + if (rlim[i].rlim_cur == RLIM_INFINITY) + count += sprintf(&bufptr[count], "%-25s %-20s ", + lnames[i].name, "unlimited"); + else + count += sprintf(&bufptr[count], "%-25s %-20lu ", + lnames[i].name, rlim[i].rlim_cur); + + if (rlim[i].rlim_max == RLIM_INFINITY) + count += sprintf(&bufptr[count], "%-20s ", "unlimited"); + else + count += sprintf(&bufptr[count], "%-20lu ", + rlim[i].rlim_max); + + if (lnames[i].unit) + count += sprintf(&bufptr[count], "%-10s\n", + lnames[i].unit); + else + count += sprintf(&bufptr[count], "\n"); + } + + return count; +} + /************************************************************************/ /* Here the fs part begins */ /************************************************************************/ @@ -349,18 +424,21 @@ struct proc_mounts { static int mounts_open(struct inode *inode, struct file *file) { struct task_struct *task = get_proc_task(inode); + struct nsproxy *nsp; struct mnt_namespace *ns = NULL; struct proc_mounts *p; int ret = -EINVAL; if (task) { - task_lock(task); - if (task->nsproxy) { - ns = task->nsproxy->mnt_ns; + rcu_read_lock(); + nsp = task_nsproxy(task); + if (nsp) { + ns = nsp->mnt_ns; if (ns) get_mnt_ns(ns); } - task_unlock(task); + rcu_read_unlock(); + put_task_struct(task); } @@ -423,16 +501,20 @@ static int mountstats_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *m = file->private_data; + struct nsproxy *nsp; struct mnt_namespace *mnt_ns = NULL; struct task_struct *task = get_proc_task(inode); if (task) { - task_lock(task); - if (task->nsproxy) - mnt_ns = task->nsproxy->mnt_ns; - if (mnt_ns) - get_mnt_ns(mnt_ns); - task_unlock(task); + rcu_read_lock(); + nsp = task_nsproxy(task); + if (nsp) { + mnt_ns = nsp->mnt_ns; + if (mnt_ns) + get_mnt_ns(mnt_ns); + } + rcu_read_unlock(); + put_task_struct(task); } @@ -1437,7 +1519,7 @@ static int proc_readfd_common(struct file * filp, void * dirent, struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; struct task_struct *p = get_proc_task(inode); - unsigned int fd, tid, ino; + unsigned int fd, ino; int retval; struct files_struct * files; struct fdtable *fdt; @@ -1446,7 +1528,6 @@ static int proc_readfd_common(struct file * filp, void * dirent, if (!p) goto out_no_task; retval = 0; - tid = p->pid; fd = filp->f_pos; switch (fd) { @@ -1681,7 +1762,6 @@ static int proc_pident_readdir(struct file *filp, const struct pid_entry *ents, unsigned int nents) { int i; - int pid; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; struct task_struct *task = get_proc_task(inode); @@ -1694,7 +1774,6 @@ static int proc_pident_readdir(struct file *filp, goto out_no_task; ret = 0; - pid = task->pid; i = filp->f_pos; switch (i) { case 0: @@ -1928,14 +2007,14 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, int buflen) { char tmp[PROC_NUMBUF]; - sprintf(tmp, "%d", current->tgid); + sprintf(tmp, "%d", task_tgid_vnr(current)); return vfs_readlink(dentry,buffer,buflen,tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { char tmp[PROC_NUMBUF]; - sprintf(tmp, "%d", current->tgid); + sprintf(tmp, "%d", task_tgid_vnr(current)); return ERR_PTR(vfs_follow_link(nd,tmp)); } @@ -2101,6 +2180,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("environ", S_IRUSR, environ), INF("auxv", S_IRUSR, pid_auxv), INF("status", S_IRUGO, pid_status), + INF("limits", S_IRUSR, pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, pid_sched), #endif @@ -2130,9 +2210,12 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, pid_schedstat), #endif -#ifdef CONFIG_CPUSETS +#ifdef CONFIG_PROC_PID_CPUSET REG("cpuset", S_IRUGO, cpuset), #endif +#ifdef CONFIG_CGROUPS + REG("cgroup", S_IRUGO, cgroup), +#endif INF("oom_score", S_IRUGO, oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL @@ -2193,27 +2276,27 @@ static const struct inode_operations proc_tgid_base_inode_operations = { * that no dcache entries will exist at process exit time it * just makes it very unlikely that any will persist. */ -void proc_flush_task(struct task_struct *task) +static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) { struct dentry *dentry, *leader, *dir; char buf[PROC_NUMBUF]; struct qstr name; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", task->pid); - dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name); + name.len = snprintf(buf, sizeof(buf), "%d", pid); + dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { shrink_dcache_parent(dentry); d_drop(dentry); dput(dentry); } - if (thread_group_leader(task)) + if (tgid == 0) goto out; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", task->tgid); - leader = d_hash_and_lookup(proc_mnt->mnt_root, &name); + name.len = snprintf(buf, sizeof(buf), "%d", tgid); + leader = d_hash_and_lookup(mnt->mnt_root, &name); if (!leader) goto out; @@ -2224,7 +2307,7 @@ void proc_flush_task(struct task_struct *task) goto out_put_leader; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", task->pid); + name.len = snprintf(buf, sizeof(buf), "%d", pid); dentry = d_hash_and_lookup(dir, &name); if (dentry) { shrink_dcache_parent(dentry); @@ -2239,6 +2322,36 @@ out: return; } +/* + * when flushing dentries from proc one need to flush them from global + * proc (proc_mnt) and from all the namespaces' procs this task was seen + * in. this call is supposed to make all this job. + */ + +void proc_flush_task(struct task_struct *task) +{ + int i, leader; + struct pid *pid, *tgid; + struct upid *upid; + + leader = thread_group_leader(task); + proc_flush_task_mnt(proc_mnt, task->pid, leader ? task->tgid : 0); + pid = task_pid(task); + if (pid->level == 0) + return; + + tgid = task_tgid(task); + for (i = 1; i <= pid->level; i++) { + upid = &pid->numbers[i]; + proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, + leader ? 0 : tgid->numbers[i].nr); + } + + upid = &pid->numbers[pid->level]; + if (upid->nr == 1) + pid_ns_release_proc(upid->ns); +} + static struct dentry *proc_pid_instantiate(struct inode *dir, struct dentry * dentry, struct task_struct *task, const void *ptr) @@ -2274,6 +2387,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; unsigned tgid; + struct pid_namespace *ns; result = proc_base_lookup(dir, dentry); if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) @@ -2283,8 +2397,9 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct if (tgid == ~0U) goto out; + ns = dentry->d_sb->s_fs_info; rcu_read_lock(); - task = find_task_by_pid(tgid); + task = find_task_by_pid_ns(tgid, ns); if (task) get_task_struct(task); rcu_read_unlock(); @@ -2301,7 +2416,8 @@ out: * Find the first task with tgid >= tgid * */ -static struct task_struct *next_tgid(unsigned int tgid) +static struct task_struct *next_tgid(unsigned int tgid, + struct pid_namespace *ns) { struct task_struct *task; struct pid *pid; @@ -2309,9 +2425,9 @@ static struct task_struct *next_tgid(unsigned int tgid) rcu_read_lock(); retry: task = NULL; - pid = find_ge_pid(tgid); + pid = find_ge_pid(tgid, ns); if (pid) { - tgid = pid->nr + 1; + tgid = pid_nr_ns(pid, ns) + 1; task = pid_task(pid, PIDTYPE_PID); /* What we to know is if the pid we have find is the * pid of a thread_group_leader. Testing for task @@ -2351,6 +2467,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode); struct task_struct *task; int tgid; + struct pid_namespace *ns; if (!reaper) goto out_no_task; @@ -2361,11 +2478,12 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) goto out; } + ns = filp->f_dentry->d_sb->s_fs_info; tgid = filp->f_pos - TGID_OFFSET; - for (task = next_tgid(tgid); + for (task = next_tgid(tgid, ns); task; - put_task_struct(task), task = next_tgid(tgid + 1)) { - tgid = task->pid; + put_task_struct(task), task = next_tgid(tgid + 1, ns)) { + tgid = task_pid_nr_ns(task, ns); filp->f_pos = tgid + TGID_OFFSET; if (proc_pid_fill_cache(filp, dirent, filldir, task, tgid) < 0) { put_task_struct(task); @@ -2388,6 +2506,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("environ", S_IRUSR, environ), INF("auxv", S_IRUSR, pid_auxv), INF("status", S_IRUGO, pid_status), + INF("limits", S_IRUSR, pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, pid_sched), #endif @@ -2416,9 +2535,12 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, pid_schedstat), #endif -#ifdef CONFIG_CPUSETS +#ifdef CONFIG_PROC_PID_CPUSET REG("cpuset", S_IRUGO, cpuset), #endif +#ifdef CONFIG_CGROUPS + REG("cgroup", S_IRUGO, cgroup), +#endif INF("oom_score", S_IRUGO, oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL @@ -2486,6 +2608,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; + struct pid_namespace *ns; if (!leader) goto out_no_task; @@ -2494,14 +2617,15 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry if (tid == ~0U) goto out; + ns = dentry->d_sb->s_fs_info; rcu_read_lock(); - task = find_task_by_pid(tid); + task = find_task_by_pid_ns(tid, ns); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) goto out; - if (leader->tgid != task->tgid) + if (!same_thread_group(leader, task)) goto out_drop_task; result = proc_task_instantiate(dir, dentry, task, NULL); @@ -2526,14 +2650,14 @@ out_no_task: * threads past it. */ static struct task_struct *first_tid(struct task_struct *leader, - int tid, int nr) + int tid, int nr, struct pid_namespace *ns) { struct task_struct *pos; rcu_read_lock(); /* Attempt to start with the pid of a thread */ if (tid && (nr > 0)) { - pos = find_task_by_pid(tid); + pos = find_task_by_pid_ns(tid, ns); if (pos && (pos->group_leader == leader)) goto found; } @@ -2602,6 +2726,7 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi ino_t ino; int tid; unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ + struct pid_namespace *ns; task = get_proc_task(inode); if (!task) @@ -2635,12 +2760,13 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ + ns = filp->f_dentry->d_sb->s_fs_info; tid = (int)filp->f_version; filp->f_version = 0; - for (task = first_tid(leader, tid, pos - 2); + for (task = first_tid(leader, tid, pos - 2, ns); task; task = next_tid(task), pos++) { - tid = task->pid; + tid = task_pid_nr_ns(task, ns); if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { /* returning this tgid failed, save it as the first * pid for the next readir call */ diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 99ca00485fc3..abe6a3f04368 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -448,7 +448,7 @@ out_mod: return NULL; } -int proc_fill_super(struct super_block *s, void *data, int silent) +int proc_fill_super(struct super_block *s) { struct inode * root_inode; diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index d6dc72c78bc1..e0d064e9764e 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -91,7 +91,8 @@ static int loadavg_read_proc(char *page, char **start, off_t off, LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running(), nr_threads, current->nsproxy->pid_ns->last_pid); + nr_running(), nr_threads, + task_active_pid_ns(current)->last_pid); return proc_calc_metrics(page, start, off, count, eof, len); } diff --git a/fs/proc/root.c b/fs/proc/root.c index cf3046638b09..ec9cb3b6c93b 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -18,32 +18,90 @@ #include <linux/bitops.h> #include <linux/smp_lock.h> #include <linux/mount.h> +#include <linux/pid_namespace.h> #include "internal.h" struct proc_dir_entry *proc_bus, *proc_root_fs, *proc_root_driver; +static int proc_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int proc_set_super(struct super_block *sb, void *data) +{ + struct pid_namespace *ns; + + ns = (struct pid_namespace *)data; + sb->s_fs_info = get_pid_ns(ns); + return set_anon_super(sb, NULL); +} + static int proc_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { + int err; + struct super_block *sb; + struct pid_namespace *ns; + struct proc_inode *ei; + if (proc_mnt) { /* Seed the root directory with a pid so it doesn't need * to be special in base.c. I would do this earlier but * the only task alive when /proc is mounted the first time * is the init_task and it doesn't have any pids. */ - struct proc_inode *ei; ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode); if (!ei->pid) ei->pid = find_get_pid(1); } - return get_sb_single(fs_type, flags, data, proc_fill_super, mnt); + + if (flags & MS_KERNMOUNT) + ns = (struct pid_namespace *)data; + else + ns = current->nsproxy->pid_ns; + + sb = sget(fs_type, proc_test_super, proc_set_super, ns); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + sb->s_flags = flags; + err = proc_fill_super(sb); + if (err) { + up_write(&sb->s_umount); + deactivate_super(sb); + return err; + } + + ei = PROC_I(sb->s_root->d_inode); + if (!ei->pid) { + rcu_read_lock(); + ei->pid = get_pid(find_pid_ns(1, ns)); + rcu_read_unlock(); + } + + sb->s_flags |= MS_ACTIVE; + ns->proc_mnt = mnt; + } + + return simple_set_mnt(mnt, sb); +} + +static void proc_kill_sb(struct super_block *sb) +{ + struct pid_namespace *ns; + + ns = (struct pid_namespace *)sb->s_fs_info; + kill_anon_super(sb); + put_pid_ns(ns); } static struct file_system_type proc_fs_type = { .name = "proc", .get_sb = proc_get_sb, - .kill_sb = kill_anon_super, + .kill_sb = proc_kill_sb, }; void __init proc_root_init(void) @@ -54,12 +112,13 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; - proc_mnt = kern_mount(&proc_fs_type); + proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); err = PTR_ERR(proc_mnt); if (IS_ERR(proc_mnt)) { unregister_filesystem(&proc_fs_type); return; } + proc_misc_init(); proc_net_init(); @@ -153,6 +212,22 @@ struct proc_dir_entry proc_root = { .parent = &proc_root, }; +int pid_ns_prepare_proc(struct pid_namespace *ns) +{ + struct vfsmount *mnt; + + mnt = kern_mount_data(&proc_fs_type, ns); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + return 0; +} + +void pid_ns_release_proc(struct pid_namespace *ns) +{ + mntput(ns->proc_mnt); +} + EXPORT_SYMBOL(proc_symlink); EXPORT_SYMBOL(proc_mkdir); EXPORT_SYMBOL(create_proc_entry); diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 2a5dd34649b3..16b331dd9913 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -47,7 +47,9 @@ test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)) static inline void get_bit_address(struct super_block *s, - b_blocknr_t block, int *bmap_nr, int *offset) + b_blocknr_t block, + unsigned int *bmap_nr, + unsigned int *offset) { /* It is in the bitmap block number equal to the block * number divided by the number of bits in a block. */ @@ -56,10 +58,10 @@ static inline void get_bit_address(struct super_block *s, *offset = block & ((s->s_blocksize << 3) - 1); } -#ifdef CONFIG_REISERFS_CHECK int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) { - int bmap, offset; + unsigned int bmap, offset; + unsigned int bmap_count = reiserfs_bmap_count(s); if (block == 0 || block >= SB_BLOCK_COUNT(s)) { reiserfs_warning(s, @@ -75,25 +77,26 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) if (unlikely(test_bit(REISERFS_OLD_FORMAT, &(REISERFS_SB(s)->s_properties)))) { b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1; - if (block >= bmap1 && block <= bmap1 + SB_BMAP_NR(s)) { + if (block >= bmap1 && + block <= bmap1 + bmap_count) { reiserfs_warning(s, "vs: 4019: is_reusable: " "bitmap block %lu(%u) can't be freed or reused", - block, SB_BMAP_NR(s)); + block, bmap_count); return 0; } } else { if (offset == 0) { reiserfs_warning(s, "vs: 4020: is_reusable: " "bitmap block %lu(%u) can't be freed or reused", - block, SB_BMAP_NR(s)); + block, bmap_count); return 0; } } - if (bmap >= SB_BMAP_NR(s)) { + if (bmap >= bmap_count) { reiserfs_warning(s, "vs-4030: is_reusable: there is no so many bitmap blocks: " - "block=%lu, bitmap_nr=%d", block, bmap); + "block=%lu, bitmap_nr=%u", block, bmap); return 0; } @@ -106,12 +109,11 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) return 1; } -#endif /* CONFIG_REISERFS_CHECK */ /* searches in journal structures for a given block number (bmap, off). If block is found in reiserfs journal it suggests next free block candidate to test. */ -static inline int is_block_in_journal(struct super_block *s, int bmap, int - off, int *next) +static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, + int off, int *next) { b_blocknr_t tmp; @@ -132,8 +134,8 @@ static inline int is_block_in_journal(struct super_block *s, int bmap, int /* it searches for a window of zero bits with given minimum and maximum lengths in one bitmap * block; */ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, - int bmap_n, int *beg, int boundary, int min, - int max, int unfm) + unsigned int bmap_n, int *beg, int boundary, + int min, int max, int unfm) { struct super_block *s = th->t_super; struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n]; @@ -143,8 +145,8 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - RFALSE(bmap_n >= SB_BMAP_NR(s), "Bitmap %d is out of range (0..%d)", - bmap_n, SB_BMAP_NR(s) - 1); + RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of " + "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1); PROC_INFO_INC(s, scan_bitmap.bmap); /* this is unclear and lacks comments, explain how journal bitmaps work here for the reader. Convey a sense of the design here. What @@ -249,12 +251,12 @@ static int bmap_hash_id(struct super_block *s, u32 id) } else { hash_in = (char *)(&id); hash = keyed_hash(hash_in, 4); - bm = hash % SB_BMAP_NR(s); + bm = hash % reiserfs_bmap_count(s); if (!bm) bm = 1; } /* this can only be true when SB_BMAP_NR = 1 */ - if (bm >= SB_BMAP_NR(s)) + if (bm >= reiserfs_bmap_count(s)) bm = 0; return bm; } @@ -273,7 +275,7 @@ static inline int block_group_used(struct super_block *s, u32 id) * to make a better decision. This favors long-term performace gain * with a better on-disk layout vs. a short term gain of skipping the * read and potentially having a bad placement. */ - if (info->first_zero_hint == 0) { + if (info->free_count == UINT_MAX) { struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm); brelse(bh); } @@ -309,16 +311,16 @@ __le32 reiserfs_choose_packing(struct inode * dir) * bitmap and place new blocks there. Returns number of allocated blocks. */ static int scan_bitmap(struct reiserfs_transaction_handle *th, b_blocknr_t * start, b_blocknr_t finish, - int min, int max, int unfm, unsigned long file_block) + int min, int max, int unfm, sector_t file_block) { int nr_allocated = 0; struct super_block *s = th->t_super; /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr * - Hans, it is not a block number - Zam. */ - int bm, off; - int end_bm, end_off; - int off_max = s->s_blocksize << 3; + unsigned int bm, off; + unsigned int end_bm, end_off; + unsigned int off_max = s->s_blocksize << 3; BUG_ON(!th->t_trans_id); @@ -328,10 +330,10 @@ static int scan_bitmap(struct reiserfs_transaction_handle *th, get_bit_address(s, *start, &bm, &off); get_bit_address(s, finish, &end_bm, &end_off); - if (bm > SB_BMAP_NR(s)) + if (bm > reiserfs_bmap_count(s)) return 0; - if (end_bm > SB_BMAP_NR(s)) - end_bm = SB_BMAP_NR(s); + if (end_bm > reiserfs_bmap_count(s)) + end_bm = reiserfs_bmap_count(s); /* When the bitmap is more than 10% free, anyone can allocate. * When it's less than 10% free, only files that already use the @@ -385,7 +387,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, struct reiserfs_super_block *rs; struct buffer_head *sbh, *bmbh; struct reiserfs_bitmap_info *apbi; - int nr, offset; + unsigned int nr, offset; BUG_ON(!th->t_trans_id); @@ -397,10 +399,12 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, get_bit_address(s, block, &nr, &offset); - if (nr >= sb_bmap_nr(rs)) { + if (nr >= reiserfs_bmap_count(s)) { reiserfs_warning(s, "vs-4075: reiserfs_free_block: " - "block %lu is out of range on %s", - block, reiserfs_bdevname(s)); + "block %lu is out of range on %s " + "(nr=%u,max=%u)", block, + reiserfs_bdevname(s), nr, + reiserfs_bmap_count(s)); return; } @@ -434,12 +438,19 @@ void reiserfs_free_block(struct reiserfs_transaction_handle *th, int for_unformatted) { struct super_block *s = th->t_super; - BUG_ON(!th->t_trans_id); RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); - RFALSE(is_reusable(s, block, 1) == 0, - "vs-4071: can not free such block"); + if (!is_reusable(s, block, 1)) + return; + + if (block > sb_block_count(REISERFS_SB(s)->s_rs)) { + reiserfs_panic(th->t_super, "bitmap-4072", + "Trying to free block outside file system " + "boundaries (%lu > %lu)", + block, sb_block_count(REISERFS_SB(s)->s_rs)); + return; + } /* mark it before we clear it, just in case */ journal_mark_freed(th, s, block); _reiserfs_free_block(th, inode, block, for_unformatted); @@ -449,11 +460,11 @@ void reiserfs_free_block(struct reiserfs_transaction_handle *th, static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th, struct inode *inode, b_blocknr_t block) { + BUG_ON(!th->t_trans_id); RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device"); - RFALSE(is_reusable(th->t_super, block, 1) == 0, - "vs-4070: can not free such block"); - BUG_ON(!th->t_trans_id); + if (!is_reusable(th->t_super, block, 1)) + return; _reiserfs_free_block(th, inode, block, 1); } @@ -1207,27 +1218,22 @@ void reiserfs_cache_bitmap_metadata(struct super_block *sb, { unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size); - info->first_zero_hint = 1 << (sb->s_blocksize_bits + 3); + /* The first bit must ALWAYS be 1 */ + BUG_ON(!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data)); + + info->free_count = 0; while (--cur >= (unsigned long *)bh->b_data) { - int base = ((char *)cur - bh->b_data) << 3; + int i; /* 0 and ~0 are special, we can optimize for them */ - if (*cur == 0) { - info->first_zero_hint = base; + if (*cur == 0) info->free_count += BITS_PER_LONG; - } else if (*cur != ~0L) { /* A mix, investigate */ - int b; - for (b = BITS_PER_LONG - 1; b >= 0; b--) { - if (!reiserfs_test_le_bit(b, cur)) { - info->first_zero_hint = base + b; + else if (*cur != ~0L) /* A mix, investigate */ + for (i = BITS_PER_LONG - 1; i >= 0; i--) + if (!reiserfs_test_le_bit(i, cur)) info->free_count++; - } - } - } } - /* The first bit must ALWAYS be 1 */ - BUG_ON(info->first_zero_hint == 0); } struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, @@ -1257,7 +1263,7 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, BUG_ON(!buffer_uptodate(bh)); BUG_ON(atomic_read(&bh->b_count) == 0); - if (info->first_zero_hint == 0) + if (info->free_count == UINT_MAX) reiserfs_cache_bitmap_metadata(sb, bh, info); } @@ -1267,12 +1273,13 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, int reiserfs_init_bitmap_cache(struct super_block *sb) { struct reiserfs_bitmap_info *bitmap; + unsigned int bmap_nr = reiserfs_bmap_count(sb); - bitmap = vmalloc(sizeof (*bitmap) * SB_BMAP_NR(sb)); + bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); if (bitmap == NULL) return -ENOMEM; - memset(bitmap, 0, sizeof (*bitmap) * SB_BMAP_NR(sb)); + memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr); SB_AP_BITMAP(sb) = bitmap; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0804289d355d..a991af96f3f0 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -199,7 +199,7 @@ static inline void set_block_dev_mapped(struct buffer_head *bh, // files which were created in the earlier version can not be longer, // than 2 gb // -static int file_capable(struct inode *inode, long block) +static int file_capable(struct inode *inode, sector_t block) { if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is new file. block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb @@ -242,7 +242,7 @@ static int restart_transaction(struct reiserfs_transaction_handle *th, // Please improve the english/clarity in the comment above, as it is // hard to understand. -static int _get_block_create_0(struct inode *inode, long block, +static int _get_block_create_0(struct inode *inode, sector_t block, struct buffer_head *bh_result, int args) { INITIALIZE_PATH(path); @@ -250,7 +250,7 @@ static int _get_block_create_0(struct inode *inode, long block, struct buffer_head *bh; struct item_head *ih, tmp_ih; int fs_gen; - int blocknr; + b_blocknr_t blocknr; char *p = NULL; int chars; int ret; @@ -569,7 +569,7 @@ static int convert_tail_for_hole(struct inode *inode, } static inline int _allocate_block(struct reiserfs_transaction_handle *th, - long block, + sector_t block, struct inode *inode, b_blocknr_t * allocated_block_nr, struct treepath *path, int flags) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4cad9e75ef56..bb05a3e51b93 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -219,11 +219,12 @@ static void allocate_bitmap_nodes(struct super_block *p_s_sb) } } -static int set_bit_in_list_bitmap(struct super_block *p_s_sb, int block, +static int set_bit_in_list_bitmap(struct super_block *p_s_sb, + b_blocknr_t block, struct reiserfs_list_bitmap *jb) { - int bmap_nr = block / (p_s_sb->s_blocksize << 3); - int bit_nr = block % (p_s_sb->s_blocksize << 3); + unsigned int bmap_nr = block / (p_s_sb->s_blocksize << 3); + unsigned int bit_nr = block % (p_s_sb->s_blocksize << 3); if (!jb->bitmaps[bmap_nr]) { jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb); @@ -239,7 +240,7 @@ static void cleanup_bitmap_list(struct super_block *p_s_sb, if (jb->bitmaps == NULL) return; - for (i = 0; i < SB_BMAP_NR(p_s_sb); i++) { + for (i = 0; i < reiserfs_bmap_count(p_s_sb); i++) { if (jb->bitmaps[i]) { free_bitmap_node(p_s_sb, jb->bitmaps[i]); jb->bitmaps[i] = NULL; @@ -289,7 +290,7 @@ static int free_bitmap_nodes(struct super_block *p_s_sb) */ int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, struct reiserfs_list_bitmap *jb_array, - int bmap_nr) + unsigned int bmap_nr) { int i; int failed = 0; @@ -483,7 +484,7 @@ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct ** */ int reiserfs_in_journal(struct super_block *p_s_sb, - int bmap_nr, int bit_nr, int search_all, + unsigned int bmap_nr, int bit_nr, int search_all, b_blocknr_t * next_zero_bit) { struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); @@ -1013,7 +1014,7 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { int i; - int bn; + b_blocknr_t bn; struct buffer_head *tbh = NULL; unsigned long trans_id = jl->j_trans_id; struct reiserfs_journal *journal = SB_JOURNAL(s); @@ -2307,8 +2308,9 @@ static int journal_read_transaction(struct super_block *p_s_sb, Right now it is only used from journal code. But later we might use it from other places. Note: Do not use journal_getblk/sb_getblk functions here! */ -static struct buffer_head *reiserfs_breada(struct block_device *dev, int block, - int bufsize, unsigned int max_block) +static struct buffer_head *reiserfs_breada(struct block_device *dev, + b_blocknr_t block, int bufsize, + b_blocknr_t max_block) { struct buffer_head *bhlist[BUFNR]; unsigned int blocks = BUFNR; @@ -2732,7 +2734,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, journal->j_persistent_trans = 0; if (reiserfs_allocate_list_bitmaps(p_s_sb, journal->j_list_bitmap, - SB_BMAP_NR(p_s_sb))) + reiserfs_bmap_count(p_s_sb))) goto free_and_return; allocate_bitmap_nodes(p_s_sb); @@ -2740,7 +2742,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ? REISERFS_OLD_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize + - SB_BMAP_NR(p_s_sb) + + reiserfs_bmap_count(p_s_sb) + 1 : REISERFS_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize + 2); diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index bc808a91eeaa..5e7388b32d02 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -356,13 +356,11 @@ extern struct tree_balance *cur_tb; void reiserfs_panic(struct super_block *sb, const char *fmt, ...) { do_reiserfs_warning(fmt); - printk(KERN_EMERG "REISERFS: panic (device %s): %s\n", - reiserfs_bdevname(sb), error_buf); - BUG(); - /* this is not actually called, but makes reiserfs_panic() "noreturn" */ - panic("REISERFS: panic (device %s): %s\n", - reiserfs_bdevname(sb), error_buf); + dump_stack(); + + panic(KERN_EMERG "REISERFS: panic (device %s): %s\n", + reiserfs_bdevname(sb), error_buf); } void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 976cc7887a0d..f71c3948edef 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -61,7 +61,8 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) } /* count used bits in last bitmap block */ - block_r = SB_BLOCK_COUNT(s) - (SB_BMAP_NR(s) - 1) * s->s_blocksize * 8; + block_r = SB_BLOCK_COUNT(s) - + (reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8; /* count bitmap blocks in new fs */ bmap_nr_new = block_count_new / (s->s_blocksize * 8); @@ -73,7 +74,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) /* save old values */ block_count = SB_BLOCK_COUNT(s); - bmap_nr = SB_BMAP_NR(s); + bmap_nr = reiserfs_bmap_count(s); /* resizing of reiserfs bitmaps (journal and real), if needed */ if (bmap_nr_new > bmap_nr) { @@ -119,7 +120,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) return -ENOMEM; } memset(bitmap, 0, - sizeof(struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); + sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); for (i = 0; i < bmap_nr; i++) bitmap[i] = old_bitmap[i]; @@ -143,7 +144,6 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) mark_buffer_dirty(bh); sync_dirty_buffer(bh); // update bitmap_info stuff - bitmap[i].first_zero_hint = 1; bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; brelse(bh); } @@ -173,8 +173,6 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) for (i = block_r; i < s->s_blocksize * 8; i++) reiserfs_test_and_clear_le_bit(i, bh->b_data); info->free_count += s->s_blocksize * 8 - block_r; - if (!info->first_zero_hint) - info->first_zero_hint = block_r; journal_mark_dirty(&th, s, bh); brelse(bh); @@ -196,9 +194,6 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) brelse(bh); info->free_count -= s->s_blocksize * 8 - block_r_new; - /* Extreme case where last bitmap is the only valid block in itself. */ - if (!info->free_count) - info->first_zero_hint = 0; /* update super */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); free_blocks = SB_FREE_BLOCKS(s); @@ -206,7 +201,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) free_blocks + (block_count_new - block_count - (bmap_nr_new - bmap_nr))); PUT_SB_BLOCK_COUNT(s, block_count_new); - PUT_SB_BMAP_NR(s, bmap_nr_new); + PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new); s->s_dirt = 1; journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 981027d1187b..ca41567d7890 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -559,7 +559,7 @@ static int is_tree_node(struct buffer_head *bh, int level) /* The function is NOT SCHEDULE-SAFE! */ static void search_by_key_reada(struct super_block *s, struct buffer_head **bh, - unsigned long *b, int num) + b_blocknr_t *b, int num) { int i, j; @@ -611,7 +611,7 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* DISK_LEAF_NODE_LEVEL */ ) { - int n_block_number; + b_blocknr_t n_block_number; int expected_level; struct buffer_head *p_s_bh; struct path_element *p_s_last_element; @@ -619,7 +619,7 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* int right_neighbor_of_leaf_node; int fs_gen; struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; - unsigned long reada_blocks[SEARCH_BY_KEY_READA]; + b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA]; int reada_count = 0; #ifdef CONFIG_REISERFS_CHECK diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b82897ae090b..57adfe90d5ae 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1725,6 +1725,21 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) set_sb_umount_state(rs, REISERFS_ERROR_FS); set_sb_fs_state(rs, 0); + /* Clear out s_bmap_nr if it would wrap. We can handle this + * case, but older revisions can't. This will cause the + * file system to fail mount on those older implementations, + * avoiding corruption. -jeffm */ + if (bmap_would_wrap(reiserfs_bmap_count(s)) && + sb_bmap_nr(rs) != 0) { + reiserfs_warning(s, "super-2030: This file system " + "claims to use %u bitmap blocks in " + "its super block, but requires %u. " + "Clearing to zero.", sb_bmap_nr(rs), + reiserfs_bmap_count(s)); + + set_sb_bmap_nr(rs, 0); + } + if (old_format_only(s)) { /* filesystem of format 3.5 either with standard or non-standard journal */ diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index fab4b9b2664f..1597f6b649e0 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -484,7 +484,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, /* Resize it so we're ok to write there */ newattrs.ia_size = buffer_size; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; - mutex_lock(&xinode->i_mutex); + mutex_lock_nested(&xinode->i_mutex, I_MUTEX_XATTR); err = notify_change(fp->f_path.dentry, &newattrs); if (err) goto out_filp; @@ -1223,7 +1223,8 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (!IS_ERR(dentry)) { if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) { struct inode *inode = dentry->d_parent->d_inode; - mutex_lock(&inode->i_mutex); + mutex_lock_nested(&inode->i_mutex, + I_MUTEX_XATTR); err = inode->i_op->mkdir(inode, dentry, 0700); mutex_unlock(&inode->i_mutex); if (err) { diff --git a/fs/select.c b/fs/select.c index 7dede89658f5..47f47925aea2 100644 --- a/fs/select.c +++ b/fs/select.c @@ -177,11 +177,6 @@ get_max: return max; } -#define BIT(i) (1UL << ((i)&(__NFDBITS-1))) -#define MEM(i,m) ((m)+(unsigned)(i)/__NFDBITS) -#define ISSET(i,m) (((i)&*(m)) != 0) -#define SET(i,m) (*(m) |= (i)) - #define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR) #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) #define POLLEX_SET (POLLPRI) diff --git a/fs/super.c b/fs/super.c index 1bfcca2104be..d28fde7e1cfb 100644 --- a/fs/super.c +++ b/fs/super.c @@ -40,10 +40,6 @@ #include <asm/uaccess.h> -void get_filesystem(struct file_system_type *fs); -void put_filesystem(struct file_system_type *fs); -struct file_system_type *get_fs_type(const char *name); - LIST_HEAD(super_blocks); DEFINE_SPINLOCK(sb_lock); @@ -336,21 +332,21 @@ struct super_block *sget(struct file_system_type *type, void *data) { struct super_block *s = NULL; - struct list_head *p; + struct super_block *old; int err; retry: spin_lock(&sb_lock); - if (test) list_for_each(p, &type->fs_supers) { - struct super_block *old; - old = list_entry(p, struct super_block, s_instances); - if (!test(old, data)) - continue; - if (!grab_super(old)) - goto retry; - if (s) - destroy_super(s); - return old; + if (test) { + list_for_each_entry(old, &type->fs_supers, s_instances) { + if (!test(old, data)) + continue; + if (!grab_super(old)) + goto retry; + if (s) + destroy_super(s); + return old; + } } if (!s) { spin_unlock(&sb_lock); @@ -948,9 +944,9 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data) return mnt; } -struct vfsmount *kern_mount(struct file_system_type *type) +struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) { - return vfs_kern_mount(type, 0, type->name, NULL); + return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); } -EXPORT_SYMBOL(kern_mount); +EXPORT_SYMBOL_GPL(kern_mount_data); diff --git a/include/asm-alpha/bitops.h b/include/asm-alpha/bitops.h index 381b4f5b4d5d..9e19a704d484 100644 --- a/include/asm-alpha/bitops.h +++ b/include/asm-alpha/bitops.h @@ -1,6 +1,10 @@ #ifndef _ALPHA_BITOPS_H #define _ALPHA_BITOPS_H +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <asm/compiler.h> #include <asm/barrier.h> diff --git a/include/asm-alpha/tlbflush.h b/include/asm-alpha/tlbflush.h index 1ca3ed3bd6d3..eefab3fb51ae 100644 --- a/include/asm-alpha/tlbflush.h +++ b/include/asm-alpha/tlbflush.h @@ -92,17 +92,6 @@ flush_tlb_other(struct mm_struct *mm) if (*mmc) *mmc = 0; } -/* Flush a specified range of user mapping page tables from TLB. - Although Alpha uses VPTE caches, this can be a nop, as Alpha does - not have finegrained tlb flushing, so it will flush VPTE stuff - during next flush_tlb_range. */ - -static inline void -flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ -} - #ifndef CONFIG_SMP /* Flush everything (kernel mapping may also have changed due to vmalloc/vfree). */ diff --git a/include/asm-arm/arch-ixp4xx/io.h b/include/asm-arm/arch-ixp4xx/io.h index c72f9d79417c..eeeea90cd5a9 100644 --- a/include/asm-arm/arch-ixp4xx/io.h +++ b/include/asm-arm/arch-ixp4xx/io.h @@ -17,9 +17,6 @@ #define IO_SPACE_LIMIT 0xffff0000 -#define BIT(x) ((1)<<(x)) - - extern int (*ixp4xx_pci_read)(u32 addr, u32 cmd, u32* data); extern int ixp4xx_pci_write(u32 addr, u32 cmd, u32 data); diff --git a/include/asm-arm/bitops.h b/include/asm-arm/bitops.h index 52fe05895deb..47a6b086eee2 100644 --- a/include/asm-arm/bitops.h +++ b/include/asm-arm/bitops.h @@ -19,6 +19,10 @@ #ifdef __KERNEL__ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <linux/compiler.h> #include <asm/system.h> diff --git a/include/asm-arm/tlbflush.h b/include/asm-arm/tlbflush.h index 71be4fded7e2..8c6bc1bb9d1a 100644 --- a/include/asm-arm/tlbflush.h +++ b/include/asm-arm/tlbflush.h @@ -463,11 +463,6 @@ extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); */ extern void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte); -/* - * ARM processors do not cache TLB tables in RAM. - */ -#define flush_tlb_pgtables(mm,start,end) do { } while (0) - #endif #endif /* CONFIG_MMU */ diff --git a/include/asm-avr32/bitops.h b/include/asm-avr32/bitops.h index f3faddfd46a8..1a50b69b1a19 100644 --- a/include/asm-avr32/bitops.h +++ b/include/asm-avr32/bitops.h @@ -8,6 +8,10 @@ #ifndef __ASM_AVR32_BITOPS_H #define __ASM_AVR32_BITOPS_H +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <asm/byteorder.h> #include <asm/system.h> diff --git a/include/asm-avr32/tlbflush.h b/include/asm-avr32/tlbflush.h index 730e268f81f3..5bc7c88a5770 100644 --- a/include/asm-avr32/tlbflush.h +++ b/include/asm-avr32/tlbflush.h @@ -19,7 +19,6 @@ * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables */ extern void flush_tlb(void); extern void flush_tlb_all(void); @@ -29,12 +28,6 @@ extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long page); extern void __flush_tlb_page(unsigned long asid, unsigned long page); -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - /* Nothing to do */ -} - extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); #endif /* __ASM_AVR32_TLBFLUSH_H */ diff --git a/include/asm-blackfin/bitops.h b/include/asm-blackfin/bitops.h index 03ecedc1f2a7..b39a175c79c1 100644 --- a/include/asm-blackfin/bitops.h +++ b/include/asm-blackfin/bitops.h @@ -11,6 +11,10 @@ #ifdef __KERNEL__ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/__ffs.h> #include <asm-generic/bitops/sched.h> diff --git a/include/asm-blackfin/tlbflush.h b/include/asm-blackfin/tlbflush.h index 10a07ba1e011..277b400924b8 100644 --- a/include/asm-blackfin/tlbflush.h +++ b/include/asm-blackfin/tlbflush.h @@ -53,10 +53,4 @@ static inline void flush_tlb_kernel_page(unsigned long addr) BUG(); } -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - BUG(); -} - #endif diff --git a/include/asm-cris/bitops.h b/include/asm-cris/bitops.h index 617151b9b72b..e2f49c27ed29 100644 --- a/include/asm-cris/bitops.h +++ b/include/asm-cris/bitops.h @@ -14,6 +14,10 @@ /* Currently this is unsuitable for consumption outside the kernel. */ #ifdef __KERNEL__ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <asm/arch/bitops.h> #include <asm/system.h> #include <asm/atomic.h> diff --git a/include/asm-cris/posix_types.h b/include/asm-cris/posix_types.h index 7b9ed22ab5dd..92000d0c3f97 100644 --- a/include/asm-cris/posix_types.h +++ b/include/asm-cris/posix_types.h @@ -52,7 +52,7 @@ typedef struct { } __kernel_fsid_t; #ifdef __KERNEL__ -#include <asm/bitops.h> +#include <linux/bitops.h> #undef __FD_SET #define __FD_SET(fd,fdsetp) set_bit(fd, (void *)(fdsetp)) diff --git a/include/asm-cris/tlbflush.h b/include/asm-cris/tlbflush.h index 0569612477e3..20697e7ef4f2 100644 --- a/include/asm-cris/tlbflush.h +++ b/include/asm-cris/tlbflush.h @@ -38,13 +38,6 @@ static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long st flush_tlb_mm(vma->vm_mm); } -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - /* CRIS does not keep any page table caches in TLB */ -} - - static inline void flush_tlb(void) { flush_tlb_mm(current->mm); diff --git a/include/asm-frv/bitops.h b/include/asm-frv/bitops.h index 8dba74b1a254..e29de7131b79 100644 --- a/include/asm-frv/bitops.h +++ b/include/asm-frv/bitops.h @@ -21,6 +21,10 @@ #ifdef __KERNEL__ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <asm-generic/bitops/ffz.h> /* diff --git a/include/asm-frv/tlbflush.h b/include/asm-frv/tlbflush.h index 8370f97e41ee..7ac5eafc5d98 100644 --- a/include/asm-frv/tlbflush.h +++ b/include/asm-frv/tlbflush.h @@ -57,7 +57,6 @@ do { \ #define __flush_tlb_global() flush_tlb_all() #define flush_tlb() flush_tlb_all() #define flush_tlb_kernel_range(start, end) flush_tlb_all() -#define flush_tlb_pgtables(mm,start,end) do { } while(0) #else @@ -66,7 +65,6 @@ do { \ #define flush_tlb_mm(mm) BUG() #define flush_tlb_page(vma,addr) BUG() #define flush_tlb_range(mm,start,end) BUG() -#define flush_tlb_pgtables(mm,start,end) BUG() #define flush_tlb_kernel_range(start, end) BUG() #endif diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h index e022a0f59e6b..15e6f253dda4 100644 --- a/include/asm-generic/bitops.h +++ b/include/asm-generic/bitops.h @@ -19,6 +19,10 @@ #ifdef __KERNEL__ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/hweight.h> diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h index cd8a9641bd66..4657f3e410fc 100644 --- a/include/asm-generic/bitops/atomic.h +++ b/include/asm-generic/bitops/atomic.h @@ -3,9 +3,6 @@ #include <asm/types.h> -#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) - #ifdef CONFIG_SMP #include <asm/spinlock.h> #include <asm/cache.h> /* we use L1_CACHE_BYTES */ @@ -66,8 +63,8 @@ extern raw_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned; */ static inline void set_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long flags; _atomic_spin_lock_irqsave(p, flags); @@ -87,8 +84,8 @@ static inline void set_bit(int nr, volatile unsigned long *addr) */ static inline void clear_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long flags; _atomic_spin_lock_irqsave(p, flags); @@ -108,8 +105,8 @@ static inline void clear_bit(int nr, volatile unsigned long *addr) */ static inline void change_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long flags; _atomic_spin_lock_irqsave(p, flags); @@ -128,8 +125,8 @@ static inline void change_bit(int nr, volatile unsigned long *addr) */ static inline int test_and_set_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old; unsigned long flags; @@ -152,8 +149,8 @@ static inline int test_and_set_bit(int nr, volatile unsigned long *addr) */ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old; unsigned long flags; @@ -175,8 +172,8 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) */ static inline int test_and_change_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old; unsigned long flags; diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h index 46a825cf2ae1..697cc2b7e0f0 100644 --- a/include/asm-generic/bitops/non-atomic.h +++ b/include/asm-generic/bitops/non-atomic.h @@ -3,9 +3,6 @@ #include <asm/types.h> -#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) - /** * __set_bit - Set a bit in memory * @nr: the bit to set @@ -17,16 +14,16 @@ */ static inline void __set_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); *p |= mask; } static inline void __clear_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); *p &= ~mask; } @@ -42,8 +39,8 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) */ static inline void __change_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); *p ^= mask; } @@ -59,8 +56,8 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) */ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old | mask; @@ -78,8 +75,8 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) */ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old & ~mask; @@ -90,8 +87,8 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old ^ mask; @@ -105,7 +102,7 @@ static inline int __test_and_change_bit(int nr, */ static inline int test_bit(int nr, const volatile unsigned long *addr) { - return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); + return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } #endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 5615440027ec..9f584cc5c5fb 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -12,7 +12,11 @@ /* .data section */ #define DATA_DATA \ *(.data) \ - *(.data.init.refok) + *(.data.init.refok) \ + . = ALIGN(8); \ + VMLINUX_SYMBOL(__start___markers) = .; \ + *(__markers) \ + VMLINUX_SYMBOL(__stop___markers) = .; #define RO_DATA(align) \ . = ALIGN((align)); \ @@ -20,6 +24,7 @@ VMLINUX_SYMBOL(__start_rodata) = .; \ *(.rodata) *(.rodata.*) \ *(__vermagic) /* Kernel version magic */ \ + *(__markers_strings) /* Markers: strings */ \ } \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ diff --git a/include/asm-h8300/bitops.h b/include/asm-h8300/bitops.h index e64ad315656d..cb18e3b0aa94 100644 --- a/include/asm-h8300/bitops.h +++ b/include/asm-h8300/bitops.h @@ -10,6 +10,11 @@ #include <asm/system.h> #ifdef __KERNEL__ + +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + /* * Function prototypes to keep gcc -Wall happy */ diff --git a/include/asm-h8300/tlbflush.h b/include/asm-h8300/tlbflush.h index 9a2c5c9fd700..41c148a9208e 100644 --- a/include/asm-h8300/tlbflush.h +++ b/include/asm-h8300/tlbflush.h @@ -52,10 +52,4 @@ static inline void flush_tlb_kernel_page(unsigned long addr) BUG(); } -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - BUG(); -} - #endif /* _H8300_TLBFLUSH_H */ diff --git a/include/asm-ia64/bitops.h b/include/asm-ia64/bitops.h index 2144f1a8ed6f..a977affaebec 100644 --- a/include/asm-ia64/bitops.h +++ b/include/asm-ia64/bitops.h @@ -9,6 +9,10 @@ * O(1) scheduler patch */ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <linux/compiler.h> #include <linux/types.h> #include <asm/intrinsics.h> diff --git a/include/asm-ia64/cacheflush.h b/include/asm-ia64/cacheflush.h index 4906916d715b..afcfbda76e20 100644 --- a/include/asm-ia64/cacheflush.h +++ b/include/asm-ia64/cacheflush.h @@ -7,8 +7,8 @@ */ #include <linux/page-flags.h> +#include <linux/bitops.h> -#include <asm/bitops.h> #include <asm/page.h> /* diff --git a/include/asm-ia64/meminit.h b/include/asm-ia64/meminit.h index 3a62878e84f3..f93308f54b61 100644 --- a/include/asm-ia64/meminit.h +++ b/include/asm-ia64/meminit.h @@ -35,7 +35,7 @@ extern void find_memory (void); extern void reserve_memory (void); extern void find_initrd (void); extern int filter_rsvd_memory (unsigned long start, unsigned long end, void *arg); -extern void efi_memmap_init(unsigned long *, unsigned long *); +extern unsigned long efi_memmap_init(unsigned long *s, unsigned long *e); extern int find_max_min_low_pfn (unsigned long , unsigned long, void *); extern unsigned long vmcore_find_descriptor_size(unsigned long address); diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h index 0971ec90807e..e6204f14f614 100644 --- a/include/asm-ia64/pgtable.h +++ b/include/asm-ia64/pgtable.h @@ -150,7 +150,7 @@ # ifndef __ASSEMBLY__ #include <linux/sched.h> /* for mm_struct */ -#include <asm/bitops.h> +#include <linux/bitops.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <asm/processor.h> diff --git a/include/asm-ia64/smp.h b/include/asm-ia64/smp.h index 1703c9d885bd..471cc2ee9ac4 100644 --- a/include/asm-ia64/smp.h +++ b/include/asm-ia64/smp.h @@ -14,8 +14,8 @@ #include <linux/threads.h> #include <linux/kernel.h> #include <linux/cpumask.h> +#include <linux/bitops.h> -#include <asm/bitops.h> #include <asm/io.h> #include <asm/param.h> #include <asm/processor.h> diff --git a/include/asm-ia64/spinlock.h b/include/asm-ia64/spinlock.h index ff857e31738a..0229fb95fb38 100644 --- a/include/asm-ia64/spinlock.h +++ b/include/asm-ia64/spinlock.h @@ -11,9 +11,9 @@ #include <linux/compiler.h> #include <linux/kernel.h> +#include <linux/bitops.h> #include <asm/atomic.h> -#include <asm/bitops.h> #include <asm/intrinsics.h> #include <asm/system.h> diff --git a/include/asm-ia64/tlbflush.h b/include/asm-ia64/tlbflush.h index e37f9fbf33af..80bcb0a38e8a 100644 --- a/include/asm-ia64/tlbflush.h +++ b/include/asm-ia64/tlbflush.h @@ -84,19 +84,6 @@ flush_tlb_page (struct vm_area_struct *vma, unsigned long addr) } /* - * Flush the TLB entries mapping the virtually mapped linear page - * table corresponding to address range [START-END). - */ -static inline void -flush_tlb_pgtables (struct mm_struct *mm, unsigned long start, unsigned long end) -{ - /* - * Deprecated. The virtual page table is now flushed via the normal gather/flush - * interface (see tlb.h). - */ -} - -/* * Flush the local TLB. Invoked from another cpu using an IPI. */ #ifdef CONFIG_SMP diff --git a/include/asm-m32r/bitops.h b/include/asm-m32r/bitops.h index 313a02c4a889..6dc9b81bf9f3 100644 --- a/include/asm-m32r/bitops.h +++ b/include/asm-m32r/bitops.h @@ -11,6 +11,10 @@ * Copyright (C) 2004 Hirokazu Takata <takata at linux-m32r.org> */ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <linux/compiler.h> #include <asm/assembler.h> #include <asm/system.h> diff --git a/include/asm-m32r/pgtable.h b/include/asm-m32r/pgtable.h index 92d7266783fd..86505387be08 100644 --- a/include/asm-m32r/pgtable.h +++ b/include/asm-m32r/pgtable.h @@ -21,9 +21,9 @@ #ifndef __ASSEMBLY__ #include <linux/threads.h> +#include <linux/bitops.h> #include <asm/processor.h> #include <asm/addrspace.h> -#include <asm/bitops.h> #include <asm/page.h> struct mm_struct; diff --git a/include/asm-m32r/tlbflush.h b/include/asm-m32r/tlbflush.h index 3d37ac002bcc..0ef95307784e 100644 --- a/include/asm-m32r/tlbflush.h +++ b/include/asm-m32r/tlbflush.h @@ -12,7 +12,6 @@ * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables */ extern void local_flush_tlb_all(void); @@ -93,8 +92,6 @@ static __inline__ void __flush_tlb_all(void) ); } -#define flush_tlb_pgtables(mm, start, end) do { } while (0) - extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t); #endif /* _ASM_M32R_TLBFLUSH_H */ diff --git a/include/asm-m68k/bitops.h b/include/asm-m68k/bitops.h index da151f70cdc6..2976b5d68e96 100644 --- a/include/asm-m68k/bitops.h +++ b/include/asm-m68k/bitops.h @@ -8,6 +8,10 @@ * for more details. */ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <linux/compiler.h> /* diff --git a/include/asm-m68k/tlbflush.h b/include/asm-m68k/tlbflush.h index 31678831ee47..17707ec315e2 100644 --- a/include/asm-m68k/tlbflush.h +++ b/include/asm-m68k/tlbflush.h @@ -92,11 +92,6 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end flush_tlb_all(); } -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ -} - #else @@ -219,11 +214,6 @@ static inline void flush_tlb_kernel_page (unsigned long addr) sun3_put_segmap (addr & ~(SUN3_PMEG_SIZE - 1), SUN3_INVALID_PMEG); } -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ -} - #endif #endif /* _M68K_TLBFLUSH_H */ diff --git a/include/asm-m68knommu/bitops.h b/include/asm-m68knommu/bitops.h index b8b2770d6870..f8dfb7ba2e25 100644 --- a/include/asm-m68knommu/bitops.h +++ b/include/asm-m68knommu/bitops.h @@ -10,6 +10,10 @@ #ifdef __KERNEL__ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/__ffs.h> #include <asm-generic/bitops/sched.h> diff --git a/include/asm-m68knommu/tlbflush.h b/include/asm-m68knommu/tlbflush.h index de858db28b00..a470cfb803eb 100644 --- a/include/asm-m68knommu/tlbflush.h +++ b/include/asm-m68knommu/tlbflush.h @@ -52,10 +52,4 @@ static inline void flush_tlb_kernel_page(unsigned long addr) BUG(); } -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - BUG(); -} - #endif /* _M68KNOMMU_TLBFLUSH_H */ diff --git a/include/asm-mips/bitops.h b/include/asm-mips/bitops.h index 77ed0c79830b..ec75ce4cdb8c 100644 --- a/include/asm-mips/bitops.h +++ b/include/asm-mips/bitops.h @@ -9,6 +9,10 @@ #ifndef _ASM_BITOPS_H #define _ASM_BITOPS_H +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <linux/compiler.h> #include <linux/irqflags.h> #include <linux/types.h> diff --git a/include/asm-mips/fpu.h b/include/asm-mips/fpu.h index 483685b1592e..e59d4c039661 100644 --- a/include/asm-mips/fpu.h +++ b/include/asm-mips/fpu.h @@ -12,12 +12,12 @@ #include <linux/sched.h> #include <linux/thread_info.h> +#include <linux/bitops.h> #include <asm/mipsregs.h> #include <asm/cpu.h> #include <asm/cpu-features.h> #include <asm/hazards.h> -#include <asm/bitops.h> #include <asm/processor.h> #include <asm/current.h> diff --git a/include/asm-mips/ip32/crime.h b/include/asm-mips/ip32/crime.h index a13702fafa85..7c36b0e5b1c6 100644 --- a/include/asm-mips/ip32/crime.h +++ b/include/asm-mips/ip32/crime.h @@ -17,9 +17,6 @@ */ #define CRIME_BASE 0x14000000 /* physical */ -#undef BIT -#define BIT(x) (1UL << (x)) - struct sgi_crime { volatile unsigned long id; #define CRIME_ID_MASK 0xff diff --git a/include/asm-mips/ip32/mace.h b/include/asm-mips/ip32/mace.h index 990082c81f39..d08d7c672139 100644 --- a/include/asm-mips/ip32/mace.h +++ b/include/asm-mips/ip32/mace.h @@ -17,9 +17,6 @@ */ #define MACE_BASE 0x1f000000 /* physical */ -#undef BIT -#define BIT(x) (1UL << (x)) - /* * PCI interface */ diff --git a/include/asm-mips/tlbflush.h b/include/asm-mips/tlbflush.h index 730e841fb08a..86b21de12e91 100644 --- a/include/asm-mips/tlbflush.h +++ b/include/asm-mips/tlbflush.h @@ -11,7 +11,6 @@ * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables */ extern void local_flush_tlb_all(void); extern void local_flush_tlb_mm(struct mm_struct *mm); @@ -45,10 +44,4 @@ extern void flush_tlb_one(unsigned long vaddr); #endif /* CONFIG_SMP */ -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - /* Nothing to do on MIPS. */ -} - #endif /* __ASM_TLBFLUSH_H */ diff --git a/include/asm-parisc/bitops.h b/include/asm-parisc/bitops.h index 03ae287baf89..f8eebcbad01f 100644 --- a/include/asm-parisc/bitops.h +++ b/include/asm-parisc/bitops.h @@ -1,6 +1,10 @@ #ifndef _PARISC_BITOPS_H #define _PARISC_BITOPS_H +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <linux/compiler.h> #include <asm/types.h> /* for BITS_PER_LONG/SHIFT_PER_LONG */ #include <asm/byteorder.h> diff --git a/include/asm-parisc/pgtable.h b/include/asm-parisc/pgtable.h index e88cacd63724..9ab79c8e5a41 100644 --- a/include/asm-parisc/pgtable.h +++ b/include/asm-parisc/pgtable.h @@ -11,9 +11,9 @@ */ #include <linux/mm.h> /* for vm_area_struct */ +#include <linux/bitops.h> #include <asm/processor.h> #include <asm/cache.h> -#include <asm/bitops.h> /* * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel diff --git a/include/asm-parisc/tlbflush.h b/include/asm-parisc/tlbflush.h index 270cf309772b..b72ec66db699 100644 --- a/include/asm-parisc/tlbflush.h +++ b/include/asm-parisc/tlbflush.h @@ -57,10 +57,6 @@ static inline void flush_tlb_mm(struct mm_struct *mm) #endif } -extern __inline__ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end) -{ -} - static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { diff --git a/include/asm-powerpc/bitops.h b/include/asm-powerpc/bitops.h index e85c3e078ba2..733b4af7f4f1 100644 --- a/include/asm-powerpc/bitops.h +++ b/include/asm-powerpc/bitops.h @@ -38,6 +38,10 @@ #ifdef __KERNEL__ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <linux/compiler.h> #include <asm/asm-compat.h> #include <asm/synch.h> diff --git a/include/asm-powerpc/iommu.h b/include/asm-powerpc/iommu.h index 870967e47204..4a82fdccee92 100644 --- a/include/asm-powerpc/iommu.h +++ b/include/asm-powerpc/iommu.h @@ -26,9 +26,9 @@ #include <linux/spinlock.h> #include <linux/device.h> #include <linux/dma-mapping.h> +#include <linux/bitops.h> #include <asm/machdep.h> #include <asm/types.h> -#include <asm/bitops.h> #define IOMMU_PAGE_SHIFT 12 #define IOMMU_PAGE_SIZE (ASM_CONST(1) << IOMMU_PAGE_SHIFT) diff --git a/include/asm-powerpc/mmu_context.h b/include/asm-powerpc/mmu_context.h index f863ac21409e..9102b8bf0ead 100644 --- a/include/asm-powerpc/mmu_context.h +++ b/include/asm-powerpc/mmu_context.h @@ -8,7 +8,7 @@ #ifndef CONFIG_PPC64 #include <asm/atomic.h> -#include <asm/bitops.h> +#include <linux/bitops.h> /* * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs diff --git a/include/asm-powerpc/tlbflush.h b/include/asm-powerpc/tlbflush.h index a022f806bb21..b6b036ccee34 100644 --- a/include/asm-powerpc/tlbflush.h +++ b/include/asm-powerpc/tlbflush.h @@ -8,7 +8,6 @@ * - flush_tlb_page_nohash(vma, vmaddr) flushes one page if SW loaded TLB * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -174,15 +173,5 @@ extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, */ extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t); -/* - * This is called in munmap when we have freed up some page-table - * pages. We don't need to do anything here, there's nothing special - * about our page-table pages. -- paulus - */ -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ -} - #endif /*__KERNEL__ */ #endif /* _ASM_POWERPC_TLBFLUSH_H */ diff --git a/include/asm-ppc/mmu_context.h b/include/asm-ppc/mmu_context.h index a6441a063e5d..b2e25d8997bf 100644 --- a/include/asm-ppc/mmu_context.h +++ b/include/asm-ppc/mmu_context.h @@ -2,8 +2,9 @@ #ifndef __PPC_MMU_CONTEXT_H #define __PPC_MMU_CONTEXT_H +#include <linux/bitops.h> + #include <asm/atomic.h> -#include <asm/bitops.h> #include <asm/mmu.h> #include <asm/cputable.h> #include <asm-generic/mm_hooks.h> diff --git a/include/asm-s390/bitops.h b/include/asm-s390/bitops.h index d756b34d25f3..34d9a6357c38 100644 --- a/include/asm-s390/bitops.h +++ b/include/asm-s390/bitops.h @@ -15,6 +15,10 @@ #ifdef __KERNEL__ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <linux/compiler.h> /* diff --git a/include/asm-s390/tlbflush.h b/include/asm-s390/tlbflush.h index 66793f55c8b2..6de2632a3e4f 100644 --- a/include/asm-s390/tlbflush.h +++ b/include/asm-s390/tlbflush.h @@ -14,7 +14,6 @@ * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables */ /* @@ -152,10 +151,4 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, #endif -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - /* S/390 does not keep any page table caches in TLB */ -} - #endif /* _S390_TLBFLUSH_H */ diff --git a/include/asm-sh/bitops.h b/include/asm-sh/bitops.h index 9d7021723a25..df805f20b267 100644 --- a/include/asm-sh/bitops.h +++ b/include/asm-sh/bitops.h @@ -2,6 +2,11 @@ #define __ASM_SH_BITOPS_H #ifdef __KERNEL__ + +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <asm/system.h> /* For __swab32 */ #include <asm/byteorder.h> diff --git a/include/asm-sh/tlbflush.h b/include/asm-sh/tlbflush.h index 455fb8da441e..e0ac97221ae6 100644 --- a/include/asm-sh/tlbflush.h +++ b/include/asm-sh/tlbflush.h @@ -9,7 +9,6 @@ * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables */ extern void local_flush_tlb_all(void); extern void local_flush_tlb_mm(struct mm_struct *mm); @@ -47,9 +46,4 @@ extern void flush_tlb_one(unsigned long asid, unsigned long page); #endif /* CONFIG_SMP */ -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - /* Nothing to do */ -} #endif /* __ASM_SH_TLBFLUSH_H */ diff --git a/include/asm-sh64/bitops.h b/include/asm-sh64/bitops.h index 444d5ea92ce9..600c59efb4c2 100644 --- a/include/asm-sh64/bitops.h +++ b/include/asm-sh64/bitops.h @@ -13,6 +13,11 @@ */ #ifdef __KERNEL__ + +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <linux/compiler.h> #include <asm/system.h> /* For __swab32 */ diff --git a/include/asm-sh64/tlbflush.h b/include/asm-sh64/tlbflush.h index e45beadc29ee..16a164a23754 100644 --- a/include/asm-sh64/tlbflush.h +++ b/include/asm-sh64/tlbflush.h @@ -20,10 +20,6 @@ extern void flush_tlb_mm(struct mm_struct *mm); extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long page); -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ -} extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); diff --git a/include/asm-sparc/bitops.h b/include/asm-sparc/bitops.h index 00bd0a679d70..cb3cefab6e09 100644 --- a/include/asm-sparc/bitops.h +++ b/include/asm-sparc/bitops.h @@ -14,6 +14,10 @@ #ifdef __KERNEL__ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + extern unsigned long ___set_bit(unsigned long *addr, unsigned long mask); extern unsigned long ___clear_bit(unsigned long *addr, unsigned long mask); extern unsigned long ___change_bit(unsigned long *addr, unsigned long mask); diff --git a/include/asm-sparc/tlbflush.h b/include/asm-sparc/tlbflush.h index a619da5cfaa9..b957e29d2ae1 100644 --- a/include/asm-sparc/tlbflush.h +++ b/include/asm-sparc/tlbflush.h @@ -13,7 +13,6 @@ * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables */ #ifdef CONFIG_SMP @@ -42,11 +41,6 @@ BTFIXUPDEF_CALL(void, flush_tlb_mm, struct mm_struct *) BTFIXUPDEF_CALL(void, flush_tlb_range, struct vm_area_struct *, unsigned long, unsigned long) BTFIXUPDEF_CALL(void, flush_tlb_page, struct vm_area_struct *, unsigned long) -// Thanks to Anton Blanchard, our pagetables became uncached in 2.4. Wee! -// extern void flush_tlb_pgtables(struct mm_struct *mm, -// unsigned long start, unsigned long end); -#define flush_tlb_pgtables(mm, start, end) do{ }while(0) - #define flush_tlb_all() BTFIXUP_CALL(flush_tlb_all)() #define flush_tlb_mm(mm) BTFIXUP_CALL(flush_tlb_mm)(mm) #define flush_tlb_range(vma,start,end) BTFIXUP_CALL(flush_tlb_range)(vma,start,end) diff --git a/include/asm-sparc64/bitops.h b/include/asm-sparc64/bitops.h index dd4bfe993b61..982ce8992b91 100644 --- a/include/asm-sparc64/bitops.h +++ b/include/asm-sparc64/bitops.h @@ -7,6 +7,10 @@ #ifndef _SPARC64_BITOPS_H #define _SPARC64_BITOPS_H +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <linux/compiler.h> #include <asm/byteorder.h> diff --git a/include/asm-sparc64/smp.h b/include/asm-sparc64/smp.h index 42c09949526c..1c1c5ea5cea5 100644 --- a/include/asm-sparc64/smp.h +++ b/include/asm-sparc64/smp.h @@ -26,7 +26,7 @@ * Private routines/data */ -#include <asm/bitops.h> +#include <linux/bitops.h> #include <asm/atomic.h> #include <asm/percpu.h> diff --git a/include/asm-sparc64/tlbflush.h b/include/asm-sparc64/tlbflush.h index 3487328570ed..fbb675dbe0c9 100644 --- a/include/asm-sparc64/tlbflush.h +++ b/include/asm-sparc64/tlbflush.h @@ -41,11 +41,4 @@ do { flush_tsb_kernel_range(start,end); \ #endif /* ! CONFIG_SMP */ -static inline void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end) -{ - /* We don't use virtual page tables for TLB miss processing - * any more. Nowadays we use the TSB. - */ -} - #endif /* _SPARC64_TLBFLUSH_H */ diff --git a/include/asm-um/bitops.h b/include/asm-um/bitops.h index 46d781953d3a..e4d38d437b97 100644 --- a/include/asm-um/bitops.h +++ b/include/asm-um/bitops.h @@ -1,6 +1,10 @@ #ifndef __UM_BITOPS_H #define __UM_BITOPS_H +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include "asm/arch/bitops.h" #endif diff --git a/include/asm-um/tlbflush.h b/include/asm-um/tlbflush.h index 9d647c55350b..614f2c091178 100644 --- a/include/asm-um/tlbflush.h +++ b/include/asm-um/tlbflush.h @@ -17,7 +17,6 @@ * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_kernel_vm() flushes the kernel vm area * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables */ extern void flush_tlb_all(void); @@ -29,9 +28,4 @@ extern void flush_tlb_kernel_vm(void); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); extern void __flush_tlb_one(unsigned long addr); -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ -} - #endif diff --git a/include/asm-v850/bitops.h b/include/asm-v850/bitops.h index 8eafdb1c08ba..f82f5b4a56e0 100644 --- a/include/asm-v850/bitops.h +++ b/include/asm-v850/bitops.h @@ -13,6 +13,9 @@ #ifndef __V850_BITOPS_H__ #define __V850_BITOPS_H__ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif #include <linux/compiler.h> /* unlikely */ #include <asm/byteorder.h> /* swab32 */ diff --git a/include/asm-v850/tlbflush.h b/include/asm-v850/tlbflush.h index 5f2f85f636ea..c44aa64449c8 100644 --- a/include/asm-v850/tlbflush.h +++ b/include/asm-v850/tlbflush.h @@ -61,10 +61,4 @@ static inline void flush_tlb_kernel_page(unsigned long addr) BUG (); } -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - BUG (); -} - #endif /* __V850_TLBFLUSH_H__ */ diff --git a/include/asm-x86/bitops_32.h b/include/asm-x86/bitops_32.h index c96641f75022..3268a341cf49 100644 --- a/include/asm-x86/bitops_32.h +++ b/include/asm-x86/bitops_32.h @@ -5,6 +5,10 @@ * Copyright 1992, Linus Torvalds. */ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <linux/compiler.h> #include <asm/alternative.h> diff --git a/include/asm-x86/bitops_64.h b/include/asm-x86/bitops_64.h index 525edf2ce5c2..dacaa5f1febc 100644 --- a/include/asm-x86/bitops_64.h +++ b/include/asm-x86/bitops_64.h @@ -5,6 +5,10 @@ * Copyright 1992, Linus Torvalds. */ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <asm/alternative.h> #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h index acd4b339c49b..ed3e70d8d04b 100644 --- a/include/asm-x86/pgtable_32.h +++ b/include/asm-x86/pgtable_32.h @@ -17,10 +17,7 @@ #include <linux/threads.h> #include <asm/paravirt.h> -#ifndef _I386_BITOPS_H -#include <asm/bitops.h> -#endif - +#include <linux/bitops.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h index a79f5355e3b0..9b0ff477b39e 100644 --- a/include/asm-x86/pgtable_64.h +++ b/include/asm-x86/pgtable_64.h @@ -9,7 +9,7 @@ * the x86-64 page table tree. */ #include <asm/processor.h> -#include <asm/bitops.h> +#include <linux/bitops.h> #include <linux/threads.h> #include <asm/pda.h> diff --git a/include/asm-x86/smp_32.h b/include/asm-x86/smp_32.h index ee46038d126c..1f576a93368f 100644 --- a/include/asm-x86/smp_32.h +++ b/include/asm-x86/smp_32.h @@ -11,7 +11,7 @@ #endif #if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__) -#include <asm/bitops.h> +#include <linux/bitops.h> #include <asm/mpspec.h> #include <asm/apic.h> #ifdef CONFIG_X86_IO_APIC diff --git a/include/asm-x86/tlbflush_32.h b/include/asm-x86/tlbflush_32.h index a50fa6741486..2bd5b95e2048 100644 --- a/include/asm-x86/tlbflush_32.h +++ b/include/asm-x86/tlbflush_32.h @@ -78,7 +78,6 @@ * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables * - flush_tlb_others(cpumask, mm, va) flushes a TLBs on other cpus * * ..but the i386 has somewhat limited tlb flushing capabilities, @@ -166,10 +165,4 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_all(); } -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - /* i386 does not keep any page table caches in TLB */ -} - #endif /* _I386_TLBFLUSH_H */ diff --git a/include/asm-x86/tlbflush_64.h b/include/asm-x86/tlbflush_64.h index 888eb4abdd07..7731fd23d572 100644 --- a/include/asm-x86/tlbflush_64.h +++ b/include/asm-x86/tlbflush_64.h @@ -31,7 +31,6 @@ static inline void __flush_tlb_all(void) * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables * * x86-64 can only flush individual pages or full VMs. For a range flush * we always do the full VM. Might be worth trying if for a small @@ -98,12 +97,4 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_all(); } -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - /* x86_64 does not keep any page table caches in a software TLB. - The CPUs do in their hardware TLBs, but they are handled - by the normal TLB flushing algorithms. */ -} - #endif /* _X8664_TLBFLUSH_H */ diff --git a/include/asm-x86/topology_64.h b/include/asm-x86/topology_64.h index 848c17f92226..c0c93d744673 100644 --- a/include/asm-x86/topology_64.h +++ b/include/asm-x86/topology_64.h @@ -5,7 +5,7 @@ #ifdef CONFIG_NUMA #include <asm/mpspec.h> -#include <asm/bitops.h> +#include <linux/bitops.h> extern cpumask_t cpu_online_map; diff --git a/include/asm-xtensa/bitops.h b/include/asm-xtensa/bitops.h index 78db04cf6e48..23261e8f2e5a 100644 --- a/include/asm-xtensa/bitops.h +++ b/include/asm-xtensa/bitops.h @@ -15,6 +15,10 @@ #ifdef __KERNEL__ +#ifndef _LINUX_BITOPS_H +#error only <linux/bitops.h> can be included directly +#endif + #include <asm/processor.h> #include <asm/byteorder.h> #include <asm/system.h> diff --git a/include/asm-xtensa/tlbflush.h b/include/asm-xtensa/tlbflush.h index 7c637b3c352c..46d240074f74 100644 --- a/include/asm-xtensa/tlbflush.h +++ b/include/asm-xtensa/tlbflush.h @@ -41,17 +41,6 @@ extern void flush_tlb_range(struct vm_area_struct*,unsigned long,unsigned long); #define flush_tlb_kernel_range(start,end) flush_tlb_all() - -/* This is calld in munmap when we have freed up some page-table pages. - * We don't need to do anything here, there's nothing special about our - * page-table pages. - */ - -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ -} - /* TLB operations. */ static inline unsigned long itlb_probe(unsigned long addr) diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 7ac8303c8471..e3ffd14a3f0b 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -47,6 +47,7 @@ header-y += coda_psdev.h header-y += coff.h header-y += comstats.h header-y += const.h +header-y += cgroupstats.h header-y += cycx_cfm.h header-y += dlm_device.h header-y += dlm_netlink.h diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 64b4641904fe..acad1105d942 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -6,6 +6,7 @@ #include <linux/types.h> #include <linux/bitops.h> #include <linux/string.h> +#include <linux/kernel.h> /* * bitmaps provide bit arrays that consume one or more unsigned diff --git a/include/linux/bitops.h b/include/linux/bitops.h index b9fb8ee3308b..69c1edb9fe54 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -2,6 +2,14 @@ #define _LINUX_BITOPS_H #include <asm/types.h> +#ifdef __KERNEL__ +#define BIT(nr) (1UL << (nr)) +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) +#define BITS_PER_BYTE 8 +#endif + /* * Include this here because some architectures need generic_ffs/fls in * scope diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h new file mode 100644 index 000000000000..87479328d46d --- /dev/null +++ b/include/linux/cgroup.h @@ -0,0 +1,327 @@ +#ifndef _LINUX_CGROUP_H +#define _LINUX_CGROUP_H +/* + * cgroup interface + * + * Copyright (C) 2003 BULL SA + * Copyright (C) 2004-2006 Silicon Graphics, Inc. + * + */ + +#include <linux/sched.h> +#include <linux/kref.h> +#include <linux/cpumask.h> +#include <linux/nodemask.h> +#include <linux/rcupdate.h> +#include <linux/cgroupstats.h> + +#ifdef CONFIG_CGROUPS + +struct cgroupfs_root; +struct cgroup_subsys; +struct inode; + +extern int cgroup_init_early(void); +extern int cgroup_init(void); +extern void cgroup_init_smp(void); +extern void cgroup_lock(void); +extern void cgroup_unlock(void); +extern void cgroup_fork(struct task_struct *p); +extern void cgroup_fork_callbacks(struct task_struct *p); +extern void cgroup_post_fork(struct task_struct *p); +extern void cgroup_exit(struct task_struct *p, int run_callbacks); +extern int cgroupstats_build(struct cgroupstats *stats, + struct dentry *dentry); + +extern struct file_operations proc_cgroup_operations; + +/* Define the enumeration of all cgroup subsystems */ +#define SUBSYS(_x) _x ## _subsys_id, +enum cgroup_subsys_id { +#include <linux/cgroup_subsys.h> + CGROUP_SUBSYS_COUNT +}; +#undef SUBSYS + +/* Per-subsystem/per-cgroup state maintained by the system. */ +struct cgroup_subsys_state { + /* The cgroup that this subsystem is attached to. Useful + * for subsystems that want to know about the cgroup + * hierarchy structure */ + struct cgroup *cgroup; + + /* State maintained by the cgroup system to allow + * subsystems to be "busy". Should be accessed via css_get() + * and css_put() */ + + atomic_t refcnt; + + unsigned long flags; +}; + +/* bits in struct cgroup_subsys_state flags field */ +enum { + CSS_ROOT, /* This CSS is the root of the subsystem */ +}; + +/* + * Call css_get() to hold a reference on the cgroup; + * + */ + +static inline void css_get(struct cgroup_subsys_state *css) +{ + /* We don't need to reference count the root state */ + if (!test_bit(CSS_ROOT, &css->flags)) + atomic_inc(&css->refcnt); +} +/* + * css_put() should be called to release a reference taken by + * css_get() + */ + +extern void __css_put(struct cgroup_subsys_state *css); +static inline void css_put(struct cgroup_subsys_state *css) +{ + if (!test_bit(CSS_ROOT, &css->flags)) + __css_put(css); +} + +struct cgroup { + unsigned long flags; /* "unsigned long" so bitops work */ + + /* count users of this cgroup. >0 means busy, but doesn't + * necessarily indicate the number of tasks in the + * cgroup */ + atomic_t count; + + /* + * We link our 'sibling' struct into our parent's 'children'. + * Our children link their 'sibling' into our 'children'. + */ + struct list_head sibling; /* my parent's children */ + struct list_head children; /* my children */ + + struct cgroup *parent; /* my parent */ + struct dentry *dentry; /* cgroup fs entry */ + + /* Private pointers for each registered subsystem */ + struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; + + struct cgroupfs_root *root; + struct cgroup *top_cgroup; + + /* + * List of cg_cgroup_links pointing at css_sets with + * tasks in this cgroup. Protected by css_set_lock + */ + struct list_head css_sets; + + /* + * Linked list running through all cgroups that can + * potentially be reaped by the release agent. Protected by + * release_list_lock + */ + struct list_head release_list; +}; + +/* A css_set is a structure holding pointers to a set of + * cgroup_subsys_state objects. This saves space in the task struct + * object and speeds up fork()/exit(), since a single inc/dec and a + * list_add()/del() can bump the reference count on the entire + * cgroup set for a task. + */ + +struct css_set { + + /* Reference count */ + struct kref ref; + + /* + * List running through all cgroup groups. Protected by + * css_set_lock + */ + struct list_head list; + + /* + * List running through all tasks using this cgroup + * group. Protected by css_set_lock + */ + struct list_head tasks; + + /* + * List of cg_cgroup_link objects on link chains from + * cgroups referenced from this css_set. Protected by + * css_set_lock + */ + struct list_head cg_links; + + /* + * Set of subsystem states, one for each subsystem. This array + * is immutable after creation apart from the init_css_set + * during subsystem registration (at boot time). + */ + struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; + +}; + +/* struct cftype: + * + * The files in the cgroup filesystem mostly have a very simple read/write + * handling, some common function will take care of it. Nevertheless some cases + * (read tasks) are special and therefore I define this structure for every + * kind of file. + * + * + * When reading/writing to a file: + * - the cgroup to use in file->f_dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_dentry->d_fsdata + */ + +#define MAX_CFTYPE_NAME 64 +struct cftype { + /* By convention, the name should begin with the name of the + * subsystem, followed by a period */ + char name[MAX_CFTYPE_NAME]; + int private; + int (*open) (struct inode *inode, struct file *file); + ssize_t (*read) (struct cgroup *cont, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos); + /* + * read_uint() is a shortcut for the common case of returning a + * single integer. Use it in place of read() + */ + u64 (*read_uint) (struct cgroup *cont, struct cftype *cft); + ssize_t (*write) (struct cgroup *cont, struct cftype *cft, + struct file *file, + const char __user *buf, size_t nbytes, loff_t *ppos); + + /* + * write_uint() is a shortcut for the common case of accepting + * a single integer (as parsed by simple_strtoull) from + * userspace. Use in place of write(); return 0 or error. + */ + int (*write_uint) (struct cgroup *cont, struct cftype *cft, u64 val); + + int (*release) (struct inode *inode, struct file *file); +}; + +/* Add a new file to the given cgroup directory. Should only be + * called by subsystems from within a populate() method */ +int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys, + const struct cftype *cft); + +/* Add a set of new files to the given cgroup directory. Should + * only be called by subsystems from within a populate() method */ +int cgroup_add_files(struct cgroup *cont, + struct cgroup_subsys *subsys, + const struct cftype cft[], + int count); + +int cgroup_is_removed(const struct cgroup *cont); + +int cgroup_path(const struct cgroup *cont, char *buf, int buflen); + +int cgroup_task_count(const struct cgroup *cont); + +/* Return true if the cgroup is a descendant of the current cgroup */ +int cgroup_is_descendant(const struct cgroup *cont); + +/* Control Group subsystem type. See Documentation/cgroups.txt for details */ + +struct cgroup_subsys { + struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, + struct cgroup *cont); + void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cont); + int (*can_attach)(struct cgroup_subsys *ss, + struct cgroup *cont, struct task_struct *tsk); + void (*attach)(struct cgroup_subsys *ss, struct cgroup *cont, + struct cgroup *old_cont, struct task_struct *tsk); + void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); + void (*exit)(struct cgroup_subsys *ss, struct task_struct *task); + int (*populate)(struct cgroup_subsys *ss, + struct cgroup *cont); + void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cont); + void (*bind)(struct cgroup_subsys *ss, struct cgroup *root); + int subsys_id; + int active; + int early_init; +#define MAX_CGROUP_TYPE_NAMELEN 32 + const char *name; + + /* Protected by RCU */ + struct cgroupfs_root *root; + + struct list_head sibling; + + void *private; +}; + +#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; +#include <linux/cgroup_subsys.h> +#undef SUBSYS + +static inline struct cgroup_subsys_state *cgroup_subsys_state( + struct cgroup *cont, int subsys_id) +{ + return cont->subsys[subsys_id]; +} + +static inline struct cgroup_subsys_state *task_subsys_state( + struct task_struct *task, int subsys_id) +{ + return rcu_dereference(task->cgroups->subsys[subsys_id]); +} + +static inline struct cgroup* task_cgroup(struct task_struct *task, + int subsys_id) +{ + return task_subsys_state(task, subsys_id)->cgroup; +} + +int cgroup_path(const struct cgroup *cont, char *buf, int buflen); + +int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss); + +/* A cgroup_iter should be treated as an opaque object */ +struct cgroup_iter { + struct list_head *cg_link; + struct list_head *task; +}; + +/* To iterate across the tasks in a cgroup: + * + * 1) call cgroup_iter_start to intialize an iterator + * + * 2) call cgroup_iter_next() to retrieve member tasks until it + * returns NULL or until you want to end the iteration + * + * 3) call cgroup_iter_end() to destroy the iterator. + */ +void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it); +struct task_struct *cgroup_iter_next(struct cgroup *cont, + struct cgroup_iter *it); +void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it); + +#else /* !CONFIG_CGROUPS */ + +static inline int cgroup_init_early(void) { return 0; } +static inline int cgroup_init(void) { return 0; } +static inline void cgroup_init_smp(void) {} +static inline void cgroup_fork(struct task_struct *p) {} +static inline void cgroup_fork_callbacks(struct task_struct *p) {} +static inline void cgroup_post_fork(struct task_struct *p) {} +static inline void cgroup_exit(struct task_struct *p, int callbacks) {} + +static inline void cgroup_lock(void) {} +static inline void cgroup_unlock(void) {} +static inline int cgroupstats_build(struct cgroupstats *stats, + struct dentry *dentry) +{ + return -EINVAL; +} + +#endif /* !CONFIG_CGROUPS */ + +#endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h new file mode 100644 index 000000000000..0b9bfbde8168 --- /dev/null +++ b/include/linux/cgroup_subsys.h @@ -0,0 +1,38 @@ +/* Add subsystem definitions of the form SUBSYS(<name>) in this + * file. Surround each one by a line of comment markers so that + * patches don't collide + */ + +/* */ + +/* */ + +#ifdef CONFIG_CPUSETS +SUBSYS(cpuset) +#endif + +/* */ + +#ifdef CONFIG_CGROUP_CPUACCT +SUBSYS(cpuacct) +#endif + +/* */ + +#ifdef CONFIG_CGROUP_DEBUG +SUBSYS(debug) +#endif + +/* */ + +#ifdef CONFIG_CGROUP_NS +SUBSYS(ns) +#endif + +/* */ + +#ifdef CONFIG_FAIR_CGROUP_SCHED +SUBSYS(cpu_cgroup) +#endif + +/* */ diff --git a/include/linux/cgroupstats.h b/include/linux/cgroupstats.h new file mode 100644 index 000000000000..4f53abf6855d --- /dev/null +++ b/include/linux/cgroupstats.h @@ -0,0 +1,70 @@ +/* cgroupstats.h - exporting per-cgroup statistics + * + * Copyright IBM Corporation, 2007 + * Author Balbir Singh <balbir@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef _LINUX_CGROUPSTATS_H +#define _LINUX_CGROUPSTATS_H + +#include <linux/taskstats.h> + +/* + * Data shared between user space and kernel space on a per cgroup + * basis. This data is shared using taskstats. + * + * Most of these states are derived by looking at the task->state value + * For the nr_io_wait state, a flag in the delay accounting structure + * indicates that the task is waiting on IO + * + * Each member is aligned to a 8 byte boundary. + */ +struct cgroupstats { + __u64 nr_sleeping; /* Number of tasks sleeping */ + __u64 nr_running; /* Number of tasks running */ + __u64 nr_stopped; /* Number of tasks in stopped state */ + __u64 nr_uninterruptible; /* Number of tasks in uninterruptible */ + /* state */ + __u64 nr_io_wait; /* Number of tasks waiting on IO */ +}; + +/* + * Commands sent from userspace + * Not versioned. New commands should only be inserted at the enum's end + * prior to __CGROUPSTATS_CMD_MAX + */ + +enum { + CGROUPSTATS_CMD_UNSPEC = __TASKSTATS_CMD_MAX, /* Reserved */ + CGROUPSTATS_CMD_GET, /* user->kernel request/get-response */ + CGROUPSTATS_CMD_NEW, /* kernel->user event */ + __CGROUPSTATS_CMD_MAX, +}; + +#define CGROUPSTATS_CMD_MAX (__CGROUPSTATS_CMD_MAX - 1) + +enum { + CGROUPSTATS_TYPE_UNSPEC = 0, /* Reserved */ + CGROUPSTATS_TYPE_CGROUP_STATS, /* contains name + stats */ + __CGROUPSTATS_TYPE_MAX, +}; + +#define CGROUPSTATS_TYPE_MAX (__CGROUPSTATS_TYPE_MAX - 1) + +enum { + CGROUPSTATS_CMD_ATTR_UNSPEC = 0, + CGROUPSTATS_CMD_ATTR_FD, + __CGROUPSTATS_CMD_ATTR_MAX, +}; + +#define CGROUPSTATS_CMD_ATTR_MAX (__CGROUPSTATS_CMD_ATTR_MAX - 1) + +#endif /* _LINUX_CGROUPSTATS_H */ diff --git a/include/linux/cpu_acct.h b/include/linux/cpu_acct.h new file mode 100644 index 000000000000..6b5fd8a66c8d --- /dev/null +++ b/include/linux/cpu_acct.h @@ -0,0 +1,14 @@ + +#ifndef _LINUX_CPU_ACCT_H +#define _LINUX_CPU_ACCT_H + +#include <linux/cgroup.h> +#include <asm/cputime.h> + +#ifdef CONFIG_CGROUP_CPUACCT +extern void cpuacct_charge(struct task_struct *, cputime_t cputime); +#else +static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {} +#endif + +#endif diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index ea44d2e768a0..ecae585ec3da 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -11,6 +11,7 @@ #include <linux/sched.h> #include <linux/cpumask.h> #include <linux/nodemask.h> +#include <linux/cgroup.h> #ifdef CONFIG_CPUSETS @@ -19,9 +20,8 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ extern int cpuset_init_early(void); extern int cpuset_init(void); extern void cpuset_init_smp(void); -extern void cpuset_fork(struct task_struct *p); -extern void cpuset_exit(struct task_struct *p); extern cpumask_t cpuset_cpus_allowed(struct task_struct *p); +extern cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -76,18 +76,22 @@ static inline int cpuset_do_slab_mem_spread(void) extern void cpuset_track_online_nodes(void); +extern int current_cpuset_is_being_rebound(void); + #else /* !CONFIG_CPUSETS */ static inline int cpuset_init_early(void) { return 0; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} -static inline void cpuset_fork(struct task_struct *p) {} -static inline void cpuset_exit(struct task_struct *p) {} static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p) { return cpu_possible_map; } +static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p) +{ + return cpu_possible_map; +} static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { @@ -148,6 +152,11 @@ static inline int cpuset_do_slab_mem_spread(void) static inline void cpuset_track_online_nodes(void) {} +static inline int current_cpuset_is_being_rebound(void) +{ + return 0; +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 55d1ca5e60f5..ab94bc083558 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -26,6 +26,7 @@ * Used to set current->delays->flags */ #define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */ +#define DELAYACCT_PF_BLKIO 0x00000002 /* I am waiting on IO */ #ifdef CONFIG_TASK_DELAY_ACCT @@ -39,6 +40,14 @@ extern void __delayacct_blkio_end(void); extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); +static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) +{ + if (p->delays) + return (p->delays->flags & DELAYACCT_PF_BLKIO); + else + return 0; +} + static inline void delayacct_set_flag(int flag) { if (current->delays) @@ -71,6 +80,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk) static inline void delayacct_blkio_start(void) { + delayacct_set_flag(DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); } @@ -79,6 +89,7 @@ static inline void delayacct_blkio_end(void) { if (current->delays) __delayacct_blkio_end(); + delayacct_clear_flag(DELAYACCT_PF_BLKIO); } static inline int delayacct_add_tsk(struct taskstats *d, @@ -116,6 +127,8 @@ static inline int delayacct_add_tsk(struct taskstats *d, { return 0; } static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) { return 0; } +static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) +{ return 0; } #endif /* CONFIG_TASK_DELAY_ACCT */ #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 6a4d170ad9a5..1657e995f72c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -123,6 +123,7 @@ extern int dir_notify_enable; #define MS_SLAVE (1<<19) /* change to slave */ #define MS_SHARED (1<<20) /* change to shared */ #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ +#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) @@ -1459,7 +1460,8 @@ void unnamed_dev_init(void); extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); -extern struct vfsmount *kern_mount(struct file_system_type *); +extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); +#define kern_mount(type) kern_mount_data(type, NULL) extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); extern void umount_tree(struct vfsmount *, int, struct list_head *); @@ -1922,6 +1924,8 @@ extern int vfs_fstat(unsigned int, struct kstat *); extern int vfs_ioctl(struct file *, unsigned int, unsigned int, unsigned long); +extern void get_filesystem(struct file_system_type *fs); +extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern struct super_block *get_super(struct block_device *); extern struct super_block *user_get_super(dev_t); diff --git a/include/linux/hid.h b/include/linux/hid.h index edb8024d744b..6e35b92b1d2c 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -469,8 +469,8 @@ struct hid_device { /* device report descriptor */ /* handler for raw output data, used by hidraw */ int (*hid_output_raw_report) (struct hid_device *, __u8 *, size_t); #ifdef CONFIG_USB_HIDINPUT_POWERBOOK - unsigned long pb_pressed_fn[NBITS(KEY_MAX)]; - unsigned long pb_pressed_numlock[NBITS(KEY_MAX)]; + unsigned long pb_pressed_fn[BITS_TO_LONGS(KEY_CNT)]; + unsigned long pb_pressed_numlock[BITS_TO_LONGS(KEY_CNT)]; #endif }; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index d4b2f1c76e12..cae35b6b9aec 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -67,9 +67,6 @@ .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ - .pgrp = 0, \ - .tty_old_pgrp = NULL, \ - { .__session = 0}, \ } extern struct nsproxy init_nsproxy; @@ -94,15 +91,18 @@ extern struct group_info init_groups; #define INIT_STRUCT_PID { \ .count = ATOMIC_INIT(1), \ - .nr = 0, \ - /* Don't put this struct pid in pid_hash */ \ - .pid_chain = { .next = NULL, .pprev = NULL }, \ .tasks = { \ { .first = &init_task.pids[PIDTYPE_PID].node }, \ { .first = &init_task.pids[PIDTYPE_PGID].node }, \ { .first = &init_task.pids[PIDTYPE_SID].node }, \ }, \ .rcu = RCU_HEAD_INIT, \ + .level = 0, \ + .numbers = { { \ + .nr = 0, \ + .ns = &init_pid_ns, \ + .pid_chain = { .next = NULL, .pprev = NULL }, \ + }, } \ } #define INIT_PID_LINK(type) \ diff --git a/include/linux/input.h b/include/linux/input.h index f30da6fc08e3..62268929856c 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -98,6 +98,7 @@ struct input_absinfo { #define EV_PWR 0x16 #define EV_FF_STATUS 0x17 #define EV_MAX 0x1f +#define EV_CNT (EV_MAX+1) /* * Synchronization events. @@ -567,6 +568,7 @@ struct input_absinfo { /* We avoid low common keys in module aliases so they don't get huge. */ #define KEY_MIN_INTERESTING KEY_MUTE #define KEY_MAX 0x1ff +#define KEY_CNT (KEY_MAX+1) /* * Relative axes @@ -583,6 +585,7 @@ struct input_absinfo { #define REL_WHEEL 0x08 #define REL_MISC 0x09 #define REL_MAX 0x0f +#define REL_CNT (REL_MAX+1) /* * Absolute axes @@ -615,6 +618,7 @@ struct input_absinfo { #define ABS_VOLUME 0x20 #define ABS_MISC 0x28 #define ABS_MAX 0x3f +#define ABS_CNT (ABS_MAX+1) /* * Switch events @@ -625,6 +629,7 @@ struct input_absinfo { #define SW_HEADPHONE_INSERT 0x02 /* set = inserted */ #define SW_RADIO 0x03 /* set = radio enabled */ #define SW_MAX 0x0f +#define SW_CNT (SW_MAX+1) /* * Misc events @@ -636,6 +641,7 @@ struct input_absinfo { #define MSC_RAW 0x03 #define MSC_SCAN 0x04 #define MSC_MAX 0x07 +#define MSC_CNT (MSC_MAX+1) /* * LEDs @@ -653,6 +659,7 @@ struct input_absinfo { #define LED_MAIL 0x09 #define LED_CHARGING 0x0a #define LED_MAX 0x0f +#define LED_CNT (LED_MAX+1) /* * Autorepeat values @@ -670,6 +677,7 @@ struct input_absinfo { #define SND_BELL 0x01 #define SND_TONE 0x02 #define SND_MAX 0x07 +#define SND_CNT (SND_MAX+1) /* * IDs. @@ -920,6 +928,7 @@ struct ff_effect { #define FF_AUTOCENTER 0x61 #define FF_MAX 0x7f +#define FF_CNT (FF_MAX+1) #ifdef __KERNEL__ @@ -932,10 +941,6 @@ struct ff_effect { #include <linux/timer.h> #include <linux/mod_devicetable.h> -#define NBITS(x) (((x)/BITS_PER_LONG)+1) -#define BIT(x) (1UL<<((x)%BITS_PER_LONG)) -#define LONG(x) ((x)/BITS_PER_LONG) - /** * struct input_dev - represents an input device * @name: name of the device @@ -1005,28 +1010,30 @@ struct ff_effect { * @going_away: marks devices that are in a middle of unregistering and * causes input_open_device*() fail with -ENODEV. * @dev: driver model's view of this device + * @cdev: union for struct device pointer * @h_list: list of input handles associated with the device. When * accessing the list dev->mutex must be held * @node: used to place the device onto input_dev_list */ struct input_dev { - + /* private: */ void *private; /* do not use */ + /* public: */ const char *name; const char *phys; const char *uniq; struct input_id id; - unsigned long evbit[NBITS(EV_MAX)]; - unsigned long keybit[NBITS(KEY_MAX)]; - unsigned long relbit[NBITS(REL_MAX)]; - unsigned long absbit[NBITS(ABS_MAX)]; - unsigned long mscbit[NBITS(MSC_MAX)]; - unsigned long ledbit[NBITS(LED_MAX)]; - unsigned long sndbit[NBITS(SND_MAX)]; - unsigned long ffbit[NBITS(FF_MAX)]; - unsigned long swbit[NBITS(SW_MAX)]; + unsigned long evbit[BITS_TO_LONGS(EV_CNT)]; + unsigned long keybit[BITS_TO_LONGS(KEY_CNT)]; + unsigned long relbit[BITS_TO_LONGS(REL_CNT)]; + unsigned long absbit[BITS_TO_LONGS(ABS_CNT)]; + unsigned long mscbit[BITS_TO_LONGS(MSC_CNT)]; + unsigned long ledbit[BITS_TO_LONGS(LED_CNT)]; + unsigned long sndbit[BITS_TO_LONGS(SND_CNT)]; + unsigned long ffbit[BITS_TO_LONGS(FF_CNT)]; + unsigned long swbit[BITS_TO_LONGS(SW_CNT)]; unsigned int keycodemax; unsigned int keycodesize; @@ -1044,10 +1051,10 @@ struct input_dev { int abs[ABS_MAX + 1]; int rep[REP_MAX + 1]; - unsigned long key[NBITS(KEY_MAX)]; - unsigned long led[NBITS(LED_MAX)]; - unsigned long snd[NBITS(SND_MAX)]; - unsigned long sw[NBITS(SW_MAX)]; + unsigned long key[BITS_TO_LONGS(KEY_CNT)]; + unsigned long led[BITS_TO_LONGS(LED_CNT)]; + unsigned long snd[BITS_TO_LONGS(SND_CNT)]; + unsigned long sw[BITS_TO_LONGS(SW_CNT)]; int absmax[ABS_MAX + 1]; int absmin[ABS_MAX + 1]; @@ -1291,7 +1298,7 @@ static inline void input_set_abs_params(struct input_dev *dev, int axis, int min dev->absfuzz[axis] = fuzz; dev->absflat[axis] = flat; - dev->absbit[LONG(axis)] |= BIT(axis); + dev->absbit[BIT_WORD(axis)] |= BIT_MASK(axis); } extern struct class input_class; @@ -1332,7 +1339,7 @@ struct ff_device { void *private; - unsigned long ffbit[NBITS(FF_MAX)]; + unsigned long ffbit[BITS_TO_LONGS(FF_CNT)]; struct mutex mutex; diff --git a/include/linux/ipc.h b/include/linux/ipc.h index ee111834091c..408696ea5189 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -89,6 +89,7 @@ struct kern_ipc_perm { spinlock_t lock; int deleted; + int id; key_t key; uid_t uid; gid_t gid; @@ -110,6 +111,8 @@ struct ipc_namespace { int msg_ctlmax; int msg_ctlmnb; int msg_ctlmni; + atomic_t msg_bytes; + atomic_t msg_hdrs; size_t shm_ctlmax; size_t shm_ctlall; diff --git a/include/linux/jbd.h b/include/linux/jbd.h index a3abf51e488f..16e7ed855a18 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -58,7 +58,7 @@ * CONFIG_JBD_DEBUG is on. */ #define JBD_EXPENSIVE_CHECKING -extern int journal_enable_debug; +extern u8 journal_enable_debug; #define jbd_debug(n, f, a...) \ do { \ @@ -248,17 +248,7 @@ typedef struct journal_superblock_s #include <linux/fs.h> #include <linux/sched.h> -#define JBD_ASSERTIONS -#ifdef JBD_ASSERTIONS -#define J_ASSERT(assert) \ -do { \ - if (!(assert)) { \ - printk (KERN_EMERG \ - "Assertion failure in %s() at %s:%d: \"%s\"\n", \ - __FUNCTION__, __FILE__, __LINE__, # assert); \ - BUG(); \ - } \ -} while (0) +#define J_ASSERT(assert) BUG_ON(!(assert)) #if defined(CONFIG_BUFFER_DEBUG) void buffer_assertion_failure(struct buffer_head *bh); @@ -274,10 +264,6 @@ void buffer_assertion_failure(struct buffer_head *bh); #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) #endif -#else -#define J_ASSERT(assert) do { } while (0) -#endif /* JBD_ASSERTIONS */ - #if defined(JBD_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) #define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index ad4b82ce84af..2d9c448d8c52 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -187,6 +187,8 @@ extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; extern size_t vmcoreinfo_size; extern size_t vmcoreinfo_max_size; +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); #else /* !CONFIG_KEXEC */ struct pt_regs; diff --git a/include/linux/keyboard.h b/include/linux/keyboard.h index 33b5c2e325b9..65c2d70853e9 100644 --- a/include/linux/keyboard.h +++ b/include/linux/keyboard.h @@ -23,10 +23,21 @@ #define MAX_NR_OF_USER_KEYMAPS 256 /* should be at least 7 */ #ifdef __KERNEL__ +struct notifier_block; extern const int NR_TYPES; extern const int max_vals[]; extern unsigned short *key_maps[MAX_NR_KEYMAPS]; extern unsigned short plain_map[NR_KEYS]; + +struct keyboard_notifier_param { + struct vc_data *vc; /* VC on which the keyboard press was done */ + int down; /* Pressure of the key? */ + int shift; /* Current shift mask */ + unsigned int value; /* keycode, unicode value or keysym */ +}; + +extern int register_keyboard_notifier(struct notifier_block *nb); +extern int unregister_keyboard_notifier(struct notifier_block *nb); #endif #define MAX_NR_FUNC 256 /* max nr of strings assigned to keys */ diff --git a/include/linux/list.h b/include/linux/list.h index b0cf0135fe3e..75ce2cb4ff6e 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -478,8 +478,7 @@ static inline void list_splice_init_rcu(struct list_head *list, pos = n, n = pos->next) /** - * list_for_each_prev_safe - iterate over a list backwards safe against removal - of list entry + * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index f6279f68a827..4c4d236ded18 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -276,6 +276,14 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name, (lock)->dep_map.key, sub) /* + * To initialize a lockdep_map statically use this macro. + * Note that _name must not be NULL. + */ +#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ + { .name = (_name), .key = (void *)(_key), } + + +/* * Acquire a lock. * * Values for "read": diff --git a/include/linux/magic.h b/include/linux/magic.h index 722d4755060f..1fa0c2ce4dec 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -37,6 +37,7 @@ #define SMB_SUPER_MAGIC 0x517B #define USBDEVICE_SUPER_MAGIC 0x9fa2 +#define CGROUP_SUPER_MAGIC 0x27e0eb #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA #define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA diff --git a/include/linux/marker.h b/include/linux/marker.h new file mode 100644 index 000000000000..5f36cf946bcb --- /dev/null +++ b/include/linux/marker.h @@ -0,0 +1,129 @@ +#ifndef _LINUX_MARKER_H +#define _LINUX_MARKER_H + +/* + * Code markup for dynamic and static tracing. + * + * See Documentation/marker.txt. + * + * (C) Copyright 2006 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#include <linux/types.h> + +struct module; +struct marker; + +/** + * marker_probe_func - Type of a marker probe function + * @mdata: pointer of type struct marker + * @private_data: caller site private data + * @fmt: format string + * @...: variable argument list + * + * Type of marker probe functions. They receive the mdata and need to parse the + * format string to recover the variable argument list. + */ +typedef void marker_probe_func(const struct marker *mdata, + void *private_data, const char *fmt, ...); + +struct marker { + const char *name; /* Marker name */ + const char *format; /* Marker format string, describing the + * variable argument list. + */ + char state; /* Marker state. */ + marker_probe_func *call;/* Probe handler function pointer */ + void *private; /* Private probe data */ +} __attribute__((aligned(8))); + +#ifdef CONFIG_MARKERS + +/* + * Note : the empty asm volatile with read constraint is used here instead of a + * "used" attribute to fix a gcc 4.1.x bug. + * Make sure the alignment of the structure in the __markers section will + * not add unwanted padding between the beginning of the section and the + * structure. Force alignment to the same alignment as the section start. + */ +#define __trace_mark(name, call_data, format, args...) \ + do { \ + static const char __mstrtab_name_##name[] \ + __attribute__((section("__markers_strings"))) \ + = #name; \ + static const char __mstrtab_format_##name[] \ + __attribute__((section("__markers_strings"))) \ + = format; \ + static struct marker __mark_##name \ + __attribute__((section("__markers"), aligned(8))) = \ + { __mstrtab_name_##name, __mstrtab_format_##name, \ + 0, __mark_empty_function, NULL }; \ + __mark_check_format(format, ## args); \ + if (unlikely(__mark_##name.state)) { \ + preempt_disable(); \ + (*__mark_##name.call) \ + (&__mark_##name, call_data, \ + format, ## args); \ + preempt_enable(); \ + } \ + } while (0) + +extern void marker_update_probe_range(struct marker *begin, + struct marker *end, struct module *probe_module, int *refcount); +#else /* !CONFIG_MARKERS */ +#define __trace_mark(name, call_data, format, args...) \ + __mark_check_format(format, ## args) +static inline void marker_update_probe_range(struct marker *begin, + struct marker *end, struct module *probe_module, int *refcount) +{ } +#endif /* CONFIG_MARKERS */ + +/** + * trace_mark - Marker + * @name: marker name, not quoted. + * @format: format string + * @args...: variable argument list + * + * Places a marker. + */ +#define trace_mark(name, format, args...) \ + __trace_mark(name, NULL, format, ## args) + +#define MARK_MAX_FORMAT_LEN 1024 + +/** + * MARK_NOARGS - Format string for a marker with no argument. + */ +#define MARK_NOARGS " " + +/* To be used for string format validity checking with gcc */ +static inline void __printf(1, 2) __mark_check_format(const char *fmt, ...) +{ +} + +extern marker_probe_func __mark_empty_function; + +/* + * Connect a probe to a marker. + * private data pointer must be a valid allocated memory address, or NULL. + */ +extern int marker_probe_register(const char *name, const char *format, + marker_probe_func *probe, void *private); + +/* + * Returns the private data given to marker_probe_register. + */ +extern void *marker_probe_unregister(const char *name); +/* + * Unregister a marker by providing the registered private data. + */ +extern void *marker_probe_unregister_private_data(void *private); + +extern int marker_arm(const char *name); +extern int marker_disarm(const char *name); +extern void *marker_get_private_data(const char *name); + +#endif diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 38c04d61ee06..59c4865bc85f 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -148,14 +148,6 @@ extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern void mpol_fix_fork_child_flag(struct task_struct *p); -#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x)) - -#ifdef CONFIG_CPUSETS -#define current_cpuset_is_being_rebound() \ - (cpuset_being_rebound == current->cpuset) -#else -#define current_cpuset_is_being_rebound() 0 -#endif extern struct mempolicy default_policy; extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, @@ -173,8 +165,6 @@ static inline void check_highest_zone(enum zone_type k) int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags); -extern void *cpuset_being_rebound; /* Trigger mpol_copy vma rebind */ - #else struct mempolicy {}; @@ -248,8 +238,6 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p) { } -#define set_cpuset_being_rebound(x) do {} while (0) - static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol) { diff --git a/include/linux/module.h b/include/linux/module.h index 642f325e4917..2cbc0b87e329 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -15,6 +15,7 @@ #include <linux/stringify.h> #include <linux/kobject.h> #include <linux/moduleparam.h> +#include <linux/marker.h> #include <asm/local.h> #include <asm/module.h> @@ -354,6 +355,10 @@ struct module /* The command line arguments (may be mangled). People like keeping pointers to this stuff */ char *args; +#ifdef CONFIG_MARKERS + struct marker *markers; + unsigned int num_markers; +#endif }; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} @@ -457,6 +462,8 @@ int unregister_module_notifier(struct notifier_block * nb); extern void print_modules(void); +extern void module_update_markers(struct module *probe_module, int *refcount); + #else /* !CONFIG_MODULES... */ #define EXPORT_SYMBOL(sym) #define EXPORT_SYMBOL_GPL(sym) @@ -556,6 +563,11 @@ static inline void print_modules(void) { } +static inline void module_update_markers(struct module *probe_module, + int *refcount) +{ +} + #endif /* CONFIG_MODULES */ struct device_driver; diff --git a/include/linux/msg.h b/include/linux/msg.h index f1b60740d641..10a3d5a1abff 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -77,7 +77,6 @@ struct msg_msg { /* one msq_queue structure for each present queue on the system */ struct msg_queue { struct kern_ipc_perm q_perm; - int q_id; time_t q_stime; /* last msgsnd time */ time_t q_rtime; /* last msgrcv time */ time_t q_ctime; /* last change time */ diff --git a/include/linux/notifier.h b/include/linux/notifier.h index fad7ff17e468..0c40cc0b4a36 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -231,5 +231,22 @@ static inline int notifier_to_errno(int ret) #define PM_SUSPEND_PREPARE 0x0003 /* Going to suspend the system */ #define PM_POST_SUSPEND 0x0004 /* Suspend finished */ +/* Console keyboard events. + * Note: KBD_KEYCODE is always sent before KBD_UNBOUND_KEYCODE, KBD_UNICODE and + * KBD_KEYSYM. */ +#define KBD_KEYCODE 0x0001 /* Keyboard keycode, called before any other */ +#define KBD_UNBOUND_KEYCODE 0x0002 /* Keyboard keycode which is not bound to any other */ +#define KBD_UNICODE 0x0003 /* Keyboard unicode */ +#define KBD_KEYSYM 0x0004 /* Keyboard keysym */ +#define KBD_POST_KEYSYM 0x0005 /* Called after keyboard keysym interpretation */ + +extern struct blocking_notifier_head reboot_notifier_list; + +/* Virtual Terminal events. */ +#define VT_ALLOCATE 0x0001 /* Console got allocated */ +#define VT_DEALLOCATE 0x0002 /* Console will be deallocated */ +#define VT_WRITE 0x0003 /* A char got output */ +#define VT_UPDATE 0x0004 /* A bigger update occurred */ + #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 033a648709b6..0e66b57631fc 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -32,8 +32,39 @@ struct nsproxy { }; extern struct nsproxy init_nsproxy; +/* + * the namespaces access rules are: + * + * 1. only current task is allowed to change tsk->nsproxy pointer or + * any pointer on the nsproxy itself + * + * 2. when accessing (i.e. reading) current task's namespaces - no + * precautions should be taken - just dereference the pointers + * + * 3. the access to other task namespaces is performed like this + * rcu_read_lock(); + * nsproxy = task_nsproxy(tsk); + * if (nsproxy != NULL) { + * / * + * * work with the namespaces here + * * e.g. get the reference on one of them + * * / + * } / * + * * NULL task_nsproxy() means that this task is + * * almost dead (zombie) + * * / + * rcu_read_unlock(); + * + */ + +static inline struct nsproxy *task_nsproxy(struct task_struct *tsk) +{ + return rcu_dereference(tsk->nsproxy); +} + int copy_namespaces(unsigned long flags, struct task_struct *tsk); -void get_task_namespaces(struct task_struct *tsk); +void exit_task_namespaces(struct task_struct *tsk); +void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct fs_struct *); @@ -45,14 +76,15 @@ static inline void put_nsproxy(struct nsproxy *ns) } } -static inline void exit_task_namespaces(struct task_struct *p) +static inline void get_nsproxy(struct nsproxy *ns) { - struct nsproxy *ns = p->nsproxy; - if (ns) { - task_lock(p); - p->nsproxy = NULL; - task_unlock(p); - put_nsproxy(ns); - } + atomic_inc(&ns->count); } + +#ifdef CONFIG_CGROUP_NS +int ns_cgroup_clone(struct task_struct *tsk); +#else +static inline int ns_cgroup_clone(struct task_struct *tsk) { return 0; } +#endif + #endif diff --git a/include/linux/of.h b/include/linux/of.h index 6df80e985914..5c39b9270ff7 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -16,8 +16,8 @@ * 2 of the License, or (at your option) any later version. */ #include <linux/types.h> +#include <linux/bitops.h> -#include <asm/bitops.h> #include <asm/prom.h> /* flag descriptions */ diff --git a/include/linux/phantom.h b/include/linux/phantom.h index d3ebbfae6903..96f4048a6cc3 100644 --- a/include/linux/phantom.h +++ b/include/linux/phantom.h @@ -30,7 +30,11 @@ struct phm_regs { #define PHN_SET_REG _IOW (PH_IOC_MAGIC, 1, struct phm_reg *) #define PHN_GET_REGS _IOWR(PH_IOC_MAGIC, 2, struct phm_regs *) #define PHN_SET_REGS _IOW (PH_IOC_MAGIC, 3, struct phm_regs *) -#define PH_IOC_MAXNR 3 +/* this ioctl tells the driver, that the caller is not OpenHaptics and might + * use improved registers update (no more phantom switchoffs when using + * libphantom) */ +#define PHN_NOT_OH _IO (PH_IOC_MAGIC, 4) +#define PH_IOC_MAXNR 4 #define PHN_CONTROL 0x6 /* control byte in iaddr space */ #define PHN_CTL_AMP 0x1 /* switch after torques change */ diff --git a/include/linux/pid.h b/include/linux/pid.h index 1e0e4e3423a6..e29a900a8499 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -40,15 +40,28 @@ enum pid_type * processes. */ -struct pid -{ - atomic_t count; + +/* + * struct upid is used to get the id of the struct pid, as it is + * seen in particular namespace. Later the struct pid is found with + * find_pid_ns() using the int nr and struct pid_namespace *ns. + */ + +struct upid { /* Try to keep pid_chain in the same cacheline as nr for find_pid */ int nr; + struct pid_namespace *ns; struct hlist_node pid_chain; +}; + +struct pid +{ + atomic_t count; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct rcu_head rcu; + int level; + struct upid numbers[1]; }; extern struct pid init_struct_pid; @@ -83,26 +96,60 @@ extern void FASTCALL(detach_pid(struct task_struct *task, enum pid_type)); extern void FASTCALL(transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type)); +struct pid_namespace; +extern struct pid_namespace init_pid_ns; + /* * look up a PID in the hash table. Must be called with the tasklist_lock * or rcu_read_lock() held. + * + * find_pid_ns() finds the pid in the namespace specified + * find_pid() find the pid by its global id, i.e. in the init namespace + * find_vpid() finr the pid by its virtual id, i.e. in the current namespace + * + * see also find_task_by_pid() set in include/linux/sched.h */ -extern struct pid *FASTCALL(find_pid(int nr)); +extern struct pid *FASTCALL(find_pid_ns(int nr, struct pid_namespace *ns)); +extern struct pid *find_vpid(int nr); +extern struct pid *find_pid(int nr); /* * Lookup a PID in the hash table, and return with it's count elevated. */ extern struct pid *find_get_pid(int nr); -extern struct pid *find_ge_pid(int nr); +extern struct pid *find_ge_pid(int nr, struct pid_namespace *); -extern struct pid *alloc_pid(void); +extern struct pid *alloc_pid(struct pid_namespace *ns); extern void FASTCALL(free_pid(struct pid *pid)); +extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); + +/* + * the helpers to get the pid's id seen from different namespaces + * + * pid_nr() : global id, i.e. the id seen from the init namespace; + * pid_vnr() : virtual id, i.e. the id seen from the namespace this pid + * belongs to. this only makes sence when called in the + * context of the task that belongs to the same namespace; + * pid_nr_ns() : id seen from the ns specified. + * + * see also task_xid_nr() etc in include/linux/sched.h + */ static inline pid_t pid_nr(struct pid *pid) { pid_t nr = 0; if (pid) - nr = pid->nr; + nr = pid->numbers[0].nr; + return nr; +} + +pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns); + +static inline pid_t pid_vnr(struct pid *pid) +{ + pid_t nr = 0; + if (pid) + nr = pid->numbers[pid->level].nr; return nr; } diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index b9a17e08ff0f..0135c76c76c6 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -4,7 +4,6 @@ #include <linux/sched.h> #include <linux/mm.h> #include <linux/threads.h> -#include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/kref.h> @@ -20,13 +19,21 @@ struct pid_namespace { struct pidmap pidmap[PIDMAP_ENTRIES]; int last_pid; struct task_struct *child_reaper; + struct kmem_cache *pid_cachep; + int level; + struct pid_namespace *parent; +#ifdef CONFIG_PROC_FS + struct vfsmount *proc_mnt; +#endif }; extern struct pid_namespace init_pid_ns; -static inline void get_pid_ns(struct pid_namespace *ns) +static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { - kref_get(&ns->kref); + if (ns != &init_pid_ns) + kref_get(&ns->kref); + return ns; } extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns); @@ -34,12 +41,19 @@ extern void free_pid_ns(struct kref *kref); static inline void put_pid_ns(struct pid_namespace *ns) { - kref_put(&ns->kref, free_pid_ns); + if (ns != &init_pid_ns) + kref_put(&ns->kref, free_pid_ns); } -static inline struct task_struct *child_reaper(struct task_struct *tsk) +static inline struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { - return init_pid_ns.child_reaper; + return tsk->nsproxy->pid_ns; +} + +static inline struct task_struct *task_child_reaper(struct task_struct *tsk) +{ + BUG_ON(tsk != current); + return tsk->nsproxy->pid_ns->child_reaper; } #endif /* _LINUX_PID_NS_H */ diff --git a/include/linux/prio_heap.h b/include/linux/prio_heap.h new file mode 100644 index 000000000000..08094350f26a --- /dev/null +++ b/include/linux/prio_heap.h @@ -0,0 +1,58 @@ +#ifndef _LINUX_PRIO_HEAP_H +#define _LINUX_PRIO_HEAP_H + +/* + * Simple insertion-only static-sized priority heap containing + * pointers, based on CLR, chapter 7 + */ + +#include <linux/gfp.h> + +/** + * struct ptr_heap - simple static-sized priority heap + * @ptrs - pointer to data area + * @max - max number of elements that can be stored in @ptrs + * @size - current number of valid elements in @ptrs (in the range 0..@size-1 + * @gt: comparison operator, which should implement "greater than" + */ +struct ptr_heap { + void **ptrs; + int max; + int size; + int (*gt)(void *, void *); +}; + +/** + * heap_init - initialize an empty heap with a given memory size + * @heap: the heap structure to be initialized + * @size: amount of memory to use in bytes + * @gfp_mask: mask to pass to kmalloc() + * @gt: comparison operator, which should implement "greater than" + */ +extern int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask, + int (*gt)(void *, void *)); + +/** + * heap_free - release a heap's storage + * @heap: the heap structure whose data should be released + */ +void heap_free(struct ptr_heap *heap); + +/** + * heap_insert - insert a value into the heap and return any overflowed value + * @heap: the heap to be operated on + * @p: the pointer to be inserted + * + * Attempts to insert the given value into the priority heap. If the + * heap is full prior to the insertion, then the resulting heap will + * consist of the smallest @max elements of the original heap and the + * new element; the greatest element will be removed from the heap and + * returned. Note that the returned element will be the new element + * (i.e. no change to the heap) if the new element is greater than all + * elements currently in the heap. + */ +extern void *heap_insert(struct ptr_heap *heap, void *p); + + + +#endif /* _LINUX_PRIO_HEAP_H */ diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 20741f668f7b..1ff461672060 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -125,7 +125,8 @@ extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent); extern struct vfsmount *proc_mnt; -extern int proc_fill_super(struct super_block *,void *,int); +struct pid_namespace; +extern int proc_fill_super(struct super_block *); extern struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); /* @@ -142,6 +143,9 @@ extern const struct file_operations proc_kcore_operations; extern const struct file_operations proc_kmsg_operations; extern const struct file_operations ppc_htab_operations; +extern int pid_ns_prepare_proc(struct pid_namespace *ns); +extern void pid_ns_release_proc(struct pid_namespace *ns); + /* * proc_tty.c */ @@ -207,7 +211,9 @@ extern void proc_net_remove(struct net *net, const char *name); #define proc_net_create(net, name, mode, info) ({ (void)(mode), NULL; }) static inline void proc_net_remove(struct net *net, const char *name) {} -static inline void proc_flush_task(struct task_struct *task) { } +static inline void proc_flush_task(struct task_struct *task) +{ +} static inline struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent) { return NULL; } @@ -232,6 +238,15 @@ static inline void proc_tty_unregister_driver(struct tty_driver *driver) {}; extern struct proc_dir_entry proc_root; +static inline int pid_ns_prepare_proc(struct pid_namespace *ns) +{ + return 0; +} + +static inline void pid_ns_release_proc(struct pid_namespace *ns) +{ +} + #endif /* CONFIG_PROC_FS */ #if !defined(CONFIG_PROC_KCORE) diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 8dcf237d3386..72bfccd3da22 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -85,7 +85,7 @@ void reiserfs_warning(struct super_block *s, const char *fmt, ...); if( !( cond ) ) \ reiserfs_panic( NULL, "reiserfs[%i]: assertion " scond " failed at " \ __FILE__ ":%i:%s: " format "\n", \ - in_interrupt() ? -1 : current -> pid, __LINE__ , __FUNCTION__ , ##args ) + in_interrupt() ? -1 : task_pid_nr(current), __LINE__ , __FUNCTION__ , ##args ) #define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args) @@ -283,6 +283,18 @@ static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb) return sb->s_fs_info; } +/* Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16 + * which overflows on large file systems. */ +static inline u32 reiserfs_bmap_count(struct super_block *sb) +{ + return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1; +} + +static inline int bmap_would_wrap(unsigned bmap_nr) +{ + return bmap_nr > ((1LL << 16) - 1); +} + /** this says about version of key of all items (but stat data) the object consists of */ #define get_inode_item_key_version( inode ) \ @@ -1734,8 +1746,8 @@ int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, b_blocknr_t blocknr); int journal_transaction_should_end(struct reiserfs_transaction_handle *, int); -int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, - int searchall, b_blocknr_t * next); +int reiserfs_in_journal(struct super_block *p_s_sb, unsigned int bmap_nr, + int bit_nr, int searchall, b_blocknr_t *next); int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long); int journal_join_abort(struct reiserfs_transaction_handle *, @@ -1743,7 +1755,7 @@ int journal_join_abort(struct reiserfs_transaction_handle *, void reiserfs_journal_abort(struct super_block *sb, int errno); void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...); int reiserfs_allocate_list_bitmaps(struct super_block *s, - struct reiserfs_list_bitmap *, int); + struct reiserfs_list_bitmap *, unsigned int); void add_save_link(struct reiserfs_transaction_handle *th, struct inode *inode, int truncate); @@ -2041,7 +2053,7 @@ struct buffer_head *get_FEB(struct tree_balance *); * arguments, such as node, search path, transaction_handle, etc. */ struct __reiserfs_blocknr_hint { struct inode *inode; /* inode passed to allocator, if we allocate unf. nodes */ - long block; /* file offset, in blocks */ + sector_t block; /* file offset, in blocks */ struct in_core_key key; struct treepath *path; /* search path, used by allocator to deternine search_start by * various ways */ @@ -2099,7 +2111,8 @@ static inline int reiserfs_new_form_blocknrs(struct tree_balance *tb, static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle *th, struct inode *inode, b_blocknr_t * new_blocknrs, - struct treepath *path, long block) + struct treepath *path, + sector_t block) { reiserfs_blocknr_hint_t hint = { .th = th, @@ -2116,7 +2129,8 @@ static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle static inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle *th, struct inode *inode, b_blocknr_t * new_blocknrs, - struct treepath *path, long block) + struct treepath *path, + sector_t block) { reiserfs_blocknr_hint_t hint = { .th = th, diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index ff9e9234f8ba..10fa0c832018 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -265,9 +265,7 @@ enum journal_state_bits { typedef __u32(*hashf_t) (const signed char *, int); struct reiserfs_bitmap_info { - // FIXME: Won't work with block sizes > 8K - __u16 first_zero_hint; - __u16 free_count; + __u32 free_count; }; struct proc_dir_entry; diff --git a/include/linux/sched.h b/include/linux/sched.h index 10a83d8d5775..13df99fb2769 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -25,6 +25,7 @@ #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ #define CLONE_NEWIPC 0x08000000 /* New ipcs */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#define CLONE_NEWPID 0x20000000 /* New pid namespace */ #define CLONE_NEWNET 0x40000000 /* New network namespace */ /* @@ -428,7 +429,17 @@ struct signal_struct { cputime_t it_prof_incr, it_virt_incr; /* job control IDs */ - pid_t pgrp; + + /* + * pgrp and session fields are deprecated. + * use the task_session_Xnr and task_pgrp_Xnr routines below + */ + + union { + pid_t pgrp __deprecated; + pid_t __pgrp; + }; + struct pid *tty_old_pgrp; union { @@ -736,6 +747,8 @@ struct sched_domain { #endif }; +extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new); + #endif /* CONFIG_SMP */ /* @@ -756,8 +769,6 @@ static inline int above_background_load(void) } struct io_context; /* See blkdev.h */ -struct cpuset; - #define NGROUPS_SMALL 32 #define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) struct group_info { @@ -1125,11 +1136,16 @@ struct task_struct { short il_next; #endif #ifdef CONFIG_CPUSETS - struct cpuset *cpuset; nodemask_t mems_allowed; int cpuset_mems_generation; int cpuset_mem_spread_rotor; #endif +#ifdef CONFIG_CGROUPS + /* Control Group info protected by css_set_lock */ + struct css_set *cgroups; + /* cg_list protected by css_set_lock and tsk->alloc_lock */ + struct list_head cg_list; +#endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT @@ -1185,24 +1201,14 @@ static inline int rt_task(struct task_struct *p) return rt_prio(p->prio); } -static inline pid_t process_group(struct task_struct *tsk) +static inline void set_task_session(struct task_struct *tsk, pid_t session) { - return tsk->signal->pgrp; + tsk->signal->__session = session; } -static inline pid_t signal_session(struct signal_struct *sig) +static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp) { - return sig->__session; -} - -static inline pid_t process_session(struct task_struct *tsk) -{ - return signal_session(tsk->signal); -} - -static inline void set_signal_session(struct signal_struct *sig, pid_t session) -{ - sig->__session = session; + tsk->signal->__pgrp = pgrp; } static inline struct pid *task_pid(struct task_struct *task) @@ -1225,6 +1231,88 @@ static inline struct pid *task_session(struct task_struct *task) return task->group_leader->pids[PIDTYPE_SID].pid; } +struct pid_namespace; + +/* + * the helpers to get the task's different pids as they are seen + * from various namespaces + * + * task_xid_nr() : global id, i.e. the id seen from the init namespace; + * task_xid_vnr() : virtual id, i.e. the id seen from the namespace the task + * belongs to. this only makes sence when called in the + * context of the task that belongs to the same namespace; + * task_xid_nr_ns() : id seen from the ns specified; + * + * set_task_vxid() : assigns a virtual id to a task; + * + * task_ppid_nr_ns() : the parent's id as seen from the namespace specified. + * the result depends on the namespace and whether the + * task in question is the namespace's init. e.g. for the + * namespace's init this will return 0 when called from + * the namespace of this init, or appropriate id otherwise. + * + * + * see also pid_nr() etc in include/linux/pid.h + */ + +static inline pid_t task_pid_nr(struct task_struct *tsk) +{ + return tsk->pid; +} + +pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); + +static inline pid_t task_pid_vnr(struct task_struct *tsk) +{ + return pid_vnr(task_pid(tsk)); +} + + +static inline pid_t task_tgid_nr(struct task_struct *tsk) +{ + return tsk->tgid; +} + +pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); + +static inline pid_t task_tgid_vnr(struct task_struct *tsk) +{ + return pid_vnr(task_tgid(tsk)); +} + + +static inline pid_t task_pgrp_nr(struct task_struct *tsk) +{ + return tsk->signal->__pgrp; +} + +pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); + +static inline pid_t task_pgrp_vnr(struct task_struct *tsk) +{ + return pid_vnr(task_pgrp(tsk)); +} + + +static inline pid_t task_session_nr(struct task_struct *tsk) +{ + return tsk->signal->__session; +} + +pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); + +static inline pid_t task_session_vnr(struct task_struct *tsk) +{ + return pid_vnr(task_session(tsk)); +} + + +static inline pid_t task_ppid_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return pid_nr_ns(task_pid(rcu_dereference(tsk->real_parent)), ns); +} + /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. @@ -1239,16 +1327,22 @@ static inline int pid_alive(struct task_struct *p) } /** - * is_init - check if a task structure is init + * is_global_init - check if a task structure is init * @tsk: Task structure to be checked. * * Check if a task structure is the first user space task the kernel created. */ -static inline int is_init(struct task_struct *tsk) +static inline int is_global_init(struct task_struct *tsk) { return tsk->pid == 1; } +/* + * is_container_init: + * check whether in the task is init in its own pid namespace. + */ +extern int is_container_init(struct task_struct *tsk); + extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); @@ -1420,8 +1514,32 @@ extern struct task_struct init_task; extern struct mm_struct init_mm; -#define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr) -extern struct task_struct *find_task_by_pid_type(int type, int pid); +extern struct pid_namespace init_pid_ns; + +/* + * find a task by one of its numerical ids + * + * find_task_by_pid_type_ns(): + * it is the most generic call - it finds a task by all id, + * type and namespace specified + * find_task_by_pid_ns(): + * finds a task by its pid in the specified namespace + * find_task_by_vpid(): + * finds a task by its virtual pid + * find_task_by_pid(): + * finds a task by its global pid + * + * see also find_pid() etc in include/linux/pid.h + */ + +extern struct task_struct *find_task_by_pid_type_ns(int type, int pid, + struct pid_namespace *ns); + +extern struct task_struct *find_task_by_pid(pid_t nr); +extern struct task_struct *find_task_by_vpid(pid_t nr); +extern struct task_struct *find_task_by_pid_ns(pid_t nr, + struct pid_namespace *ns); + extern void __set_special_pids(pid_t session, pid_t pgrp); /* per-UID process charging. */ @@ -1608,6 +1726,12 @@ static inline int has_group_leader_pid(struct task_struct *p) return p->pid == p->tgid; } +static inline +int same_thread_group(struct task_struct *p1, struct task_struct *p2) +{ + return p1->tgid == p2->tgid; +} + static inline struct task_struct *next_thread(const struct task_struct *p) { return list_entry(rcu_dereference(p->thread_group.next), @@ -1625,7 +1749,8 @@ static inline int thread_group_empty(struct task_struct *p) /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also - * pins the final release of task.io_context. Also protects ->cpuset. + * pins the final release of task.io_context. Also protects ->cpuset and + * ->cgroup.subsys[]. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), diff --git a/include/linux/sem.h b/include/linux/sem.h index 9aaffb0b1d81..c8eaad9e4b72 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -90,7 +90,6 @@ struct sem { /* One sem_array data structure for each set of semaphores in the system. */ struct sem_array { struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */ - int sem_id; time_t sem_otime; /* last semop time */ time_t sem_ctime; /* last change time */ struct sem *sem_base; /* ptr to first semaphore in array */ diff --git a/include/linux/shm.h b/include/linux/shm.h index bea65d9c93ef..eeaed921a1dc 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -79,7 +79,6 @@ struct shmid_kernel /* private to the kernel */ { struct kern_ipc_perm shm_perm; struct file * shm_file; - int id; unsigned long shm_nattch; unsigned long shm_segsz; time_t shm_atim; diff --git a/include/linux/types.h b/include/linux/types.h index 0351bf2fac85..4f0dad21c917 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -3,12 +3,9 @@ #ifdef __KERNEL__ -#define BITS_TO_LONGS(bits) \ - (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] -#define BITS_PER_BYTE 8 #endif #include <linux/posix_types.h> diff --git a/include/linux/uinput.h b/include/linux/uinput.h index a6c1e8eed226..15ddd4483b09 100644 --- a/include/linux/uinput.h +++ b/include/linux/uinput.h @@ -162,10 +162,6 @@ struct uinput_ff_erase { #define UI_FF_UPLOAD 1 #define UI_FF_ERASE 2 -#ifndef NBITS -#define NBITS(x) ((((x)-1)/(sizeof(long)*8))+1) -#endif /* NBITS */ - #define UINPUT_MAX_NAME_SIZE 80 struct uinput_user_dev { char name[UINPUT_MAX_NAME_SIZE]; diff --git a/include/linux/vt.h b/include/linux/vt.h index ba806e8711be..02c1c0288770 100644 --- a/include/linux/vt.h +++ b/include/linux/vt.h @@ -1,6 +1,18 @@ #ifndef _LINUX_VT_H #define _LINUX_VT_H +#ifdef __KERNEL__ +struct notifier_block; + +struct vt_notifier_param { + struct vc_data *vc; /* VC on which the update happened */ + unsigned int c; /* Printed char */ +}; + +extern int register_vt_notifier(struct notifier_block *nb); +extern int unregister_vt_notifier(struct notifier_block *nb); +#endif + /* * These constants are also useful for user-level apps (e.g., VC * resizing). diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index ce6badc98f6d..7daafdc2514b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -8,6 +8,7 @@ #include <linux/timer.h> #include <linux/linkage.h> #include <linux/bitops.h> +#include <linux/lockdep.h> #include <asm/atomic.h> struct workqueue_struct; @@ -28,6 +29,9 @@ struct work_struct { #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) struct list_head entry; work_func_t func; +#ifdef CONFIG_LOCKDEP + struct lockdep_map lockdep_map; +#endif }; #define WORK_DATA_INIT() ATOMIC_LONG_INIT(0) @@ -41,10 +45,23 @@ struct execute_work { struct work_struct work; }; +#ifdef CONFIG_LOCKDEP +/* + * NB: because we have to copy the lockdep_map, setting _key + * here is required, otherwise it could get initialised to the + * copy of the lockdep_map! + */ +#define __WORK_INIT_LOCKDEP_MAP(n, k) \ + .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k), +#else +#define __WORK_INIT_LOCKDEP_MAP(n, k) +#endif + #define __WORK_INITIALIZER(n, f) { \ .data = WORK_DATA_INIT(), \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ + __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ } #define __DELAYED_WORK_INITIALIZER(n, f) { \ @@ -76,12 +93,24 @@ struct execute_work { * assignment of the work data initializer allows the compiler * to generate better code. */ +#ifdef CONFIG_LOCKDEP +#define INIT_WORK(_work, _func) \ + do { \ + static struct lock_class_key __key; \ + \ + (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ + lockdep_init_map(&(_work)->lockdep_map, #_work, &__key, 0);\ + INIT_LIST_HEAD(&(_work)->entry); \ + PREPARE_WORK((_work), (_func)); \ + } while (0) +#else #define INIT_WORK(_work, _func) \ do { \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ INIT_LIST_HEAD(&(_work)->entry); \ PREPARE_WORK((_work), (_func)); \ } while (0) +#endif #define INIT_DELAYED_WORK(_work, _func) \ do { \ @@ -118,9 +147,23 @@ struct execute_work { clear_bit(WORK_STRUCT_PENDING, work_data_bits(work)) -extern struct workqueue_struct *__create_workqueue(const char *name, - int singlethread, - int freezeable); +extern struct workqueue_struct * +__create_workqueue_key(const char *name, int singlethread, + int freezeable, struct lock_class_key *key); + +#ifdef CONFIG_LOCKDEP +#define __create_workqueue(name, singlethread, freezeable) \ +({ \ + static struct lock_class_key __key; \ + \ + __create_workqueue_key((name), (singlethread), \ + (freezeable), &__key); \ +}) +#else +#define __create_workqueue(name, singlethread, freezeable) \ + __create_workqueue_key((name), (singlethread), (freezeable), NULL) +#endif + #define create_workqueue(name) __create_workqueue((name), 0, 0) #define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1) #define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0) diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h index 686425a97b0f..625346c47ee2 100644 --- a/include/net/9p/9p.h +++ b/include/net/9p/9p.h @@ -44,7 +44,7 @@ extern unsigned int p9_debug_level; do { \ if ((p9_debug_level & level) == level) \ printk(KERN_NOTICE "-- %s (%d): " \ - format , __FUNCTION__, current->pid , ## arg); \ + format , __FUNCTION__, task_pid_nr(current) , ## arg); \ } while (0) #define PRINT_FCALL_ERROR(s, fcall) P9_DPRINTK(P9_DEBUG_ERROR, \ @@ -59,7 +59,7 @@ do { \ #define P9_EPRINTK(level, format, arg...) \ do { \ printk(level "9p: %s (%d): " \ - format , __FUNCTION__, current->pid , ## arg); \ + format , __FUNCTION__, task_pid_nr(current), ## arg); \ } while (0) diff --git a/include/net/scm.h b/include/net/scm.h index 423cb1d5ac25..06df126103ca 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -4,6 +4,8 @@ #include <linux/limits.h> #include <linux/net.h> #include <linux/security.h> +#include <linux/pid.h> +#include <linux/nsproxy.h> /* Well, we should have at least one descriptor open * to accept passed FDs 8) @@ -54,7 +56,7 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, struct task_struct *p = current; scm->creds.uid = p->uid; scm->creds.gid = p->gid; - scm->creds.pid = p->tgid; + scm->creds.pid = task_tgid_vnr(p); scm->fp = NULL; scm->seq = 0; unix_get_peersec_dgram(sock, scm); diff --git a/include/video/sstfb.h b/include/video/sstfb.h index baa163f770ab..b52f07381243 100644 --- a/include/video/sstfb.h +++ b/include/video/sstfb.h @@ -68,7 +68,6 @@ # define print_var(X,Y...) #endif -#define BIT(x) (1ul<<(x)) #define POW2(x) (1ul<<(x)) /* diff --git a/include/video/tdfx.h b/include/video/tdfx.h index 05b63c2a5abc..7431d9681e57 100644 --- a/include/video/tdfx.h +++ b/include/video/tdfx.h @@ -79,8 +79,6 @@ /* register bitfields (not all, only as needed) */ -#define BIT(x) (1UL << (x)) - /* COMMAND_2D reg. values */ #define TDFX_ROP_COPY 0xcc /* src */ #define TDFX_ROP_INVERT 0x55 /* NOT dst */ diff --git a/init/Kconfig b/init/Kconfig index a29a688c47d3..541382d539ad 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -270,9 +270,43 @@ config LOG_BUF_SHIFT 13 => 8 KB 12 => 4 KB +config CGROUPS + bool "Control Group support" + help + This option will let you use process cgroup subsystems + such as Cpusets + + Say N if unsure. + +config CGROUP_DEBUG + bool "Example debug cgroup subsystem" + depends on CGROUPS + help + This option enables a simple cgroup subsystem that + exports useful debugging information about the cgroups + framework + + Say N if unsure + +config CGROUP_NS + bool "Namespace cgroup subsystem" + depends on CGROUPS + help + Provides a simple namespace cgroup subsystem to + provide hierarchical naming of sets of namespaces, + for instance virtual servers and checkpoint/restart + jobs. + +config CGROUP_CPUACCT + bool "Simple CPU accounting cgroup subsystem" + depends on CGROUPS + help + Provides a simple Resource Controller for monitoring the + total CPU consumed by the tasks in a cgroup + config CPUSETS bool "Cpuset support" - depends on SMP + depends on SMP && CGROUPS help This option will let you create and manage CPUSETs which allow dynamically partitioning a system into sets of CPUs and @@ -300,6 +334,16 @@ config FAIR_USER_SCHED This option will choose userid as the basis for grouping tasks, thus providing equal CPU bandwidth to each user. +config FAIR_CGROUP_SCHED + bool "Control groups" + depends on CGROUPS + help + This option allows you to create arbitrary task groups + using the "cgroup" pseudo filesystem and control + the cpu bandwidth allocated to each such task group. + Refer to Documentation/cgroups.txt for more information + on "cgroup" pseudo filesystem. + endchoice config SYSFS_DEPRECATED @@ -322,6 +366,11 @@ config SYSFS_DEPRECATED If you are using a distro that was released in 2006 or later, it should be safe to say N here. +config PROC_PID_CPUSET + bool "Include legacy /proc/<pid>/cpuset file" + depends on CPUSETS + default y + config RELAY bool "Kernel->user space relay support (formerly relayfs)" help diff --git a/init/main.c b/init/main.c index 9def935ab13a..0dd0e7a1f632 100644 --- a/init/main.c +++ b/init/main.c @@ -39,6 +39,7 @@ #include <linux/writeback.h> #include <linux/cpu.h> #include <linux/cpuset.h> +#include <linux/cgroup.h> #include <linux/efi.h> #include <linux/tick.h> #include <linux/interrupt.h> @@ -523,6 +524,7 @@ asmlinkage void __init start_kernel(void) */ unwind_init(); lockdep_init(); + cgroup_init_early(); local_irq_disable(); early_boot_irqs_off(); @@ -640,6 +642,7 @@ asmlinkage void __init start_kernel(void) #ifdef CONFIG_PROC_FS proc_root_init(); #endif + cgroup_init(); cpuset_init(); taskstats_init_early(); delayacct_init(); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 20f1fed8fa48..c0b26dc4617b 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -29,6 +29,8 @@ #include <linux/audit.h> #include <linux/signal.h> #include <linux/mutex.h> +#include <linux/nsproxy.h> +#include <linux/pid.h> #include <net/sock.h> #include "util.h" @@ -330,7 +332,8 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, (info->notify_owner && info->notify.sigev_notify == SIGEV_SIGNAL) ? info->notify.sigev_signo : 0, - pid_nr(info->notify_owner)); + pid_nr_ns(info->notify_owner, + current->nsproxy->pid_ns)); spin_unlock(&info->lock); buffer[sizeof(buffer)-1] = '\0'; slen = strlen(buffer)+1; @@ -507,7 +510,7 @@ static void __do_notify(struct mqueue_inode_info *info) sig_i.si_errno = 0; sig_i.si_code = SI_MESGQ; sig_i.si_value = info->notify.sigev_value; - sig_i.si_pid = current->tgid; + sig_i.si_pid = task_pid_vnr(current); sig_i.si_uid = current->uid; kill_pid_info(info->notify.sigev_signo, diff --git a/ipc/msg.c b/ipc/msg.c index a03fcb522fff..fdf3db5731ce 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -34,7 +34,7 @@ #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/seq_file.h> -#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/nsproxy.h> #include <asm/current.h> @@ -66,23 +66,15 @@ struct msg_sender { #define SEARCH_NOTEQUAL 3 #define SEARCH_LESSEQUAL 4 -static atomic_t msg_bytes = ATOMIC_INIT(0); -static atomic_t msg_hdrs = ATOMIC_INIT(0); - static struct ipc_ids init_msg_ids; #define msg_ids(ns) (*((ns)->ids[IPC_MSG_IDS])) -#define msg_lock(ns, id) ((struct msg_queue*)ipc_lock(&msg_ids(ns), id)) #define msg_unlock(msq) ipc_unlock(&(msq)->q_perm) -#define msg_rmid(ns, id) ((struct msg_queue*)ipc_rmid(&msg_ids(ns), id)) -#define msg_checkid(ns, msq, msgid) \ - ipc_checkid(&msg_ids(ns), &msq->q_perm, msgid) -#define msg_buildid(ns, id, seq) \ - ipc_buildid(&msg_ids(ns), id, seq) - -static void freeque (struct ipc_namespace *ns, struct msg_queue *msq, int id); -static int newque (struct ipc_namespace *ns, key_t key, int msgflg); +#define msg_buildid(id, seq) ipc_buildid(id, seq) + +static void freeque(struct ipc_namespace *, struct msg_queue *); +static int newque(struct ipc_namespace *, struct ipc_params *); #ifdef CONFIG_PROC_FS static int sysvipc_msg_proc_show(struct seq_file *s, void *it); #endif @@ -93,7 +85,9 @@ static void __msg_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids) ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; ns->msg_ctlmni = MSGMNI; - ipc_init_ids(ids, ns->msg_ctlmni); + atomic_set(&ns->msg_bytes, 0); + atomic_set(&ns->msg_hdrs, 0); + ipc_init_ids(ids); } int msg_init_ns(struct ipc_namespace *ns) @@ -110,20 +104,25 @@ int msg_init_ns(struct ipc_namespace *ns) void msg_exit_ns(struct ipc_namespace *ns) { - int i; struct msg_queue *msq; + int next_id; + int total, in_use; + + down_write(&msg_ids(ns).rw_mutex); + + in_use = msg_ids(ns).in_use; - mutex_lock(&msg_ids(ns).mutex); - for (i = 0; i <= msg_ids(ns).max_id; i++) { - msq = msg_lock(ns, i); + for (total = 0, next_id = 0; total < in_use; next_id++) { + msq = idr_find(&msg_ids(ns).ipcs_idr, next_id); if (msq == NULL) continue; - - freeque(ns, msq, i); + ipc_lock_by_ptr(&msq->q_perm); + freeque(ns, msq); + total++; } - mutex_unlock(&msg_ids(ns).mutex); - ipc_fini_ids(ns->ids[IPC_MSG_IDS]); + up_write(&msg_ids(ns).rw_mutex); + kfree(ns->ids[IPC_MSG_IDS]); ns->ids[IPC_MSG_IDS] = NULL; } @@ -136,10 +135,55 @@ void __init msg_init(void) IPC_MSG_IDS, sysvipc_msg_proc_show); } -static int newque (struct ipc_namespace *ns, key_t key, int msgflg) +/* + * This routine is called in the paths where the rw_mutex is held to protect + * access to the idr tree. + */ +static inline struct msg_queue *msg_lock_check_down(struct ipc_namespace *ns, + int id) +{ + struct kern_ipc_perm *ipcp = ipc_lock_check_down(&msg_ids(ns), id); + + return container_of(ipcp, struct msg_queue, q_perm); +} + +/* + * msg_lock_(check_) routines are called in the paths where the rw_mutex + * is not held. + */ +static inline struct msg_queue *msg_lock(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_lock(&msg_ids(ns), id); + + return container_of(ipcp, struct msg_queue, q_perm); +} + +static inline struct msg_queue *msg_lock_check(struct ipc_namespace *ns, + int id) +{ + struct kern_ipc_perm *ipcp = ipc_lock_check(&msg_ids(ns), id); + + return container_of(ipcp, struct msg_queue, q_perm); +} + +static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) +{ + ipc_rmid(&msg_ids(ns), &s->q_perm); +} + +/** + * newque - Create a new msg queue + * @ns: namespace + * @params: ptr to the structure that contains the key and msgflg + * + * Called with msg_ids.rw_mutex held (writer) + */ +static int newque(struct ipc_namespace *ns, struct ipc_params *params) { struct msg_queue *msq; int id, retval; + key_t key = params->key; + int msgflg = params->flg; msq = ipc_rcu_alloc(sizeof(*msq)); if (!msq) @@ -155,14 +199,17 @@ static int newque (struct ipc_namespace *ns, key_t key, int msgflg) return retval; } + /* + * ipc_addid() locks msq + */ id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); - if (id == -1) { + if (id < 0) { security_msg_queue_free(msq); ipc_rcu_putref(msq); - return -ENOSPC; + return id; } - msq->q_id = msg_buildid(ns, id, msq->q_perm.seq); + msq->q_perm.id = msg_buildid(id, msq->q_perm.seq); msq->q_stime = msq->q_rtime = 0; msq->q_ctime = get_seconds(); msq->q_cbytes = msq->q_qnum = 0; @@ -171,9 +218,10 @@ static int newque (struct ipc_namespace *ns, key_t key, int msgflg) INIT_LIST_HEAD(&msq->q_messages); INIT_LIST_HEAD(&msq->q_receivers); INIT_LIST_HEAD(&msq->q_senders); + msg_unlock(msq); - return msq->q_id; + return msq->q_perm.id; } static inline void ss_add(struct msg_queue *msq, struct msg_sender *mss) @@ -224,19 +272,19 @@ static void expunge_all(struct msg_queue *msq, int res) /* * freeque() wakes up waiters on the sender and receiver waiting queue, - * removes the message queue from message queue ID - * array, and cleans up all the messages associated with this queue. + * removes the message queue from message queue ID IDR, and cleans up all the + * messages associated with this queue. * - * msg_ids.mutex and the spinlock for this message queue is hold - * before freeque() is called. msg_ids.mutex remains locked on exit. + * msg_ids.rw_mutex (writer) and the spinlock for this message queue are held + * before freeque() is called. msg_ids.rw_mutex remains locked on exit. */ -static void freeque(struct ipc_namespace *ns, struct msg_queue *msq, int id) +static void freeque(struct ipc_namespace *ns, struct msg_queue *msq) { struct list_head *tmp; expunge_all(msq, -EIDRM); ss_wakeup(&msq->q_senders, 1); - msq = msg_rmid(ns, id); + msg_rmid(ns, msq); msg_unlock(msq); tmp = msq->q_messages.next; @@ -244,49 +292,40 @@ static void freeque(struct ipc_namespace *ns, struct msg_queue *msq, int id) struct msg_msg *msg = list_entry(tmp, struct msg_msg, m_list); tmp = tmp->next; - atomic_dec(&msg_hdrs); + atomic_dec(&ns->msg_hdrs); free_msg(msg); } - atomic_sub(msq->q_cbytes, &msg_bytes); + atomic_sub(msq->q_cbytes, &ns->msg_bytes); security_msg_queue_free(msq); ipc_rcu_putref(msq); } +/* + * Called with msg_ids.rw_mutex and ipcp locked. + */ +static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg) +{ + struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); + + return security_msg_queue_associate(msq, msgflg); +} + asmlinkage long sys_msgget(key_t key, int msgflg) { - struct msg_queue *msq; - int id, ret = -EPERM; struct ipc_namespace *ns; + struct ipc_ops msg_ops; + struct ipc_params msg_params; ns = current->nsproxy->ipc_ns; - - mutex_lock(&msg_ids(ns).mutex); - if (key == IPC_PRIVATE) - ret = newque(ns, key, msgflg); - else if ((id = ipc_findkey(&msg_ids(ns), key)) == -1) { /* key not used */ - if (!(msgflg & IPC_CREAT)) - ret = -ENOENT; - else - ret = newque(ns, key, msgflg); - } else if (msgflg & IPC_CREAT && msgflg & IPC_EXCL) { - ret = -EEXIST; - } else { - msq = msg_lock(ns, id); - BUG_ON(msq == NULL); - if (ipcperms(&msq->q_perm, msgflg)) - ret = -EACCES; - else { - int qid = msg_buildid(ns, id, msq->q_perm.seq); - - ret = security_msg_queue_associate(msq, msgflg); - if (!ret) - ret = qid; - } - msg_unlock(msq); - } - mutex_unlock(&msg_ids(ns).mutex); - return ret; + msg_ops.getnew = newque; + msg_ops.associate = msg_security; + msg_ops.more_checks = NULL; + + msg_params.key = key; + msg_params.flg = msgflg; + + return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); } static inline unsigned long @@ -420,23 +459,23 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) msginfo.msgmnb = ns->msg_ctlmnb; msginfo.msgssz = MSGSSZ; msginfo.msgseg = MSGSEG; - mutex_lock(&msg_ids(ns).mutex); + down_read(&msg_ids(ns).rw_mutex); if (cmd == MSG_INFO) { msginfo.msgpool = msg_ids(ns).in_use; - msginfo.msgmap = atomic_read(&msg_hdrs); - msginfo.msgtql = atomic_read(&msg_bytes); + msginfo.msgmap = atomic_read(&ns->msg_hdrs); + msginfo.msgtql = atomic_read(&ns->msg_bytes); } else { msginfo.msgmap = MSGMAP; msginfo.msgpool = MSGPOOL; msginfo.msgtql = MSGTQL; } - max_id = msg_ids(ns).max_id; - mutex_unlock(&msg_ids(ns).mutex); + max_id = ipc_get_maxid(&msg_ids(ns)); + up_read(&msg_ids(ns).rw_mutex); if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) return -EFAULT; return (max_id < 0) ? 0 : max_id; } - case MSG_STAT: + case MSG_STAT: /* msqid is an index rather than a msg queue id */ case IPC_STAT: { struct msqid64_ds tbuf; @@ -444,21 +483,16 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) if (!buf) return -EFAULT; - if (cmd == MSG_STAT && msqid >= msg_ids(ns).entries->size) - return -EINVAL; - - memset(&tbuf, 0, sizeof(tbuf)); - - msq = msg_lock(ns, msqid); - if (msq == NULL) - return -EINVAL; if (cmd == MSG_STAT) { - success_return = msg_buildid(ns, msqid, msq->q_perm.seq); + msq = msg_lock(ns, msqid); + if (IS_ERR(msq)) + return PTR_ERR(msq); + success_return = msq->q_perm.id; } else { - err = -EIDRM; - if (msg_checkid(ns, msq, msqid)) - goto out_unlock; + msq = msg_lock_check(ns, msqid); + if (IS_ERR(msq)) + return PTR_ERR(msq); success_return = 0; } err = -EACCES; @@ -469,6 +503,8 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) if (err) goto out_unlock; + memset(&tbuf, 0, sizeof(tbuf)); + kernel_to_ipc64_perm(&msq->q_perm, &tbuf.msg_perm); tbuf.msg_stime = msq->q_stime; tbuf.msg_rtime = msq->q_rtime; @@ -495,15 +531,13 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) return -EINVAL; } - mutex_lock(&msg_ids(ns).mutex); - msq = msg_lock(ns, msqid); - err = -EINVAL; - if (msq == NULL) + down_write(&msg_ids(ns).rw_mutex); + msq = msg_lock_check_down(ns, msqid); + if (IS_ERR(msq)) { + err = PTR_ERR(msq); goto out_up; + } - err = -EIDRM; - if (msg_checkid(ns, msq, msqid)) - goto out_unlock_up; ipcp = &msq->q_perm; err = audit_ipc_obj(ipcp); @@ -552,12 +586,12 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) break; } case IPC_RMID: - freeque(ns, msq, msqid); + freeque(ns, msq); break; } err = 0; out_up: - mutex_unlock(&msg_ids(ns).mutex); + up_write(&msg_ids(ns).rw_mutex); return err; out_unlock_up: msg_unlock(msq); @@ -611,7 +645,7 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) msr->r_msg = ERR_PTR(-E2BIG); } else { msr->r_msg = NULL; - msq->q_lrpid = msr->r_tsk->pid; + msq->q_lrpid = task_pid_vnr(msr->r_tsk); msq->q_rtime = get_seconds(); wake_up_process(msr->r_tsk); smp_mb(); @@ -646,14 +680,11 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, msg->m_type = mtype; msg->m_ts = msgsz; - msq = msg_lock(ns, msqid); - err = -EINVAL; - if (msq == NULL) + msq = msg_lock_check(ns, msqid); + if (IS_ERR(msq)) { + err = PTR_ERR(msq); goto out_free; - - err= -EIDRM; - if (msg_checkid(ns, msq, msqid)) - goto out_unlock_free; + } for (;;) { struct msg_sender s; @@ -695,7 +726,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, } } - msq->q_lspid = current->tgid; + msq->q_lspid = task_tgid_vnr(current); msq->q_stime = get_seconds(); if (!pipelined_send(msq, msg)) { @@ -703,8 +734,8 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, list_add_tail(&msg->m_list, &msq->q_messages); msq->q_cbytes += msgsz; msq->q_qnum++; - atomic_add(msgsz, &msg_bytes); - atomic_inc(&msg_hdrs); + atomic_add(msgsz, &ns->msg_bytes); + atomic_inc(&ns->msg_hdrs); } err = 0; @@ -760,13 +791,9 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext, mode = convert_mode(&msgtyp, msgflg); ns = current->nsproxy->ipc_ns; - msq = msg_lock(ns, msqid); - if (msq == NULL) - return -EINVAL; - - msg = ERR_PTR(-EIDRM); - if (msg_checkid(ns, msq, msqid)) - goto out_unlock; + msq = msg_lock_check(ns, msqid); + if (IS_ERR(msq)) + return PTR_ERR(msq); for (;;) { struct msg_receiver msr_d; @@ -810,10 +837,10 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext, list_del(&msg->m_list); msq->q_qnum--; msq->q_rtime = get_seconds(); - msq->q_lrpid = current->tgid; + msq->q_lrpid = task_tgid_vnr(current); msq->q_cbytes -= msg->m_ts; - atomic_sub(msg->m_ts, &msg_bytes); - atomic_dec(&msg_hdrs); + atomic_sub(msg->m_ts, &ns->msg_bytes); + atomic_dec(&ns->msg_hdrs); ss_wakeup(&msq->q_senders, 0); msg_unlock(msq); break; @@ -926,7 +953,7 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it) return seq_printf(s, "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n", msq->q_perm.key, - msq->q_id, + msq->q_perm.id, msq->q_perm.mode, msq->q_cbytes, msq->q_qnum, diff --git a/ipc/sem.c b/ipc/sem.c index b676fef6d208..35952c0bae46 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -80,7 +80,7 @@ #include <linux/audit.h> #include <linux/capability.h> #include <linux/seq_file.h> -#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/nsproxy.h> #include <asm/uaccess.h> @@ -88,18 +88,14 @@ #define sem_ids(ns) (*((ns)->ids[IPC_SEM_IDS])) -#define sem_lock(ns, id) ((struct sem_array*)ipc_lock(&sem_ids(ns), id)) #define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) -#define sem_rmid(ns, id) ((struct sem_array*)ipc_rmid(&sem_ids(ns), id)) -#define sem_checkid(ns, sma, semid) \ - ipc_checkid(&sem_ids(ns),&sma->sem_perm,semid) -#define sem_buildid(ns, id, seq) \ - ipc_buildid(&sem_ids(ns), id, seq) +#define sem_checkid(sma, semid) ipc_checkid(&sma->sem_perm, semid) +#define sem_buildid(id, seq) ipc_buildid(id, seq) static struct ipc_ids init_sem_ids; -static int newary(struct ipc_namespace *, key_t, int, int); -static void freeary(struct ipc_namespace *ns, struct sem_array *sma, int id); +static int newary(struct ipc_namespace *, struct ipc_params *); +static void freeary(struct ipc_namespace *, struct sem_array *); #ifdef CONFIG_PROC_FS static int sysvipc_sem_proc_show(struct seq_file *s, void *it); #endif @@ -129,7 +125,7 @@ static void __sem_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids) ns->sc_semopm = SEMOPM; ns->sc_semmni = SEMMNI; ns->used_sems = 0; - ipc_init_ids(ids, ns->sc_semmni); + ipc_init_ids(ids); } int sem_init_ns(struct ipc_namespace *ns) @@ -146,20 +142,24 @@ int sem_init_ns(struct ipc_namespace *ns) void sem_exit_ns(struct ipc_namespace *ns) { - int i; struct sem_array *sma; + int next_id; + int total, in_use; - mutex_lock(&sem_ids(ns).mutex); - for (i = 0; i <= sem_ids(ns).max_id; i++) { - sma = sem_lock(ns, i); + down_write(&sem_ids(ns).rw_mutex); + + in_use = sem_ids(ns).in_use; + + for (total = 0, next_id = 0; total < in_use; next_id++) { + sma = idr_find(&sem_ids(ns).ipcs_idr, next_id); if (sma == NULL) continue; - - freeary(ns, sma, i); + ipc_lock_by_ptr(&sma->sem_perm); + freeary(ns, sma); + total++; } - mutex_unlock(&sem_ids(ns).mutex); + up_write(&sem_ids(ns).rw_mutex); - ipc_fini_ids(ns->ids[IPC_SEM_IDS]); kfree(ns->ids[IPC_SEM_IDS]); ns->ids[IPC_SEM_IDS] = NULL; } @@ -173,6 +173,42 @@ void __init sem_init (void) } /* + * This routine is called in the paths where the rw_mutex is held to protect + * access to the idr tree. + */ +static inline struct sem_array *sem_lock_check_down(struct ipc_namespace *ns, + int id) +{ + struct kern_ipc_perm *ipcp = ipc_lock_check_down(&sem_ids(ns), id); + + return container_of(ipcp, struct sem_array, sem_perm); +} + +/* + * sem_lock_(check_) routines are called in the paths where the rw_mutex + * is not held. + */ +static inline struct sem_array *sem_lock(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_lock(&sem_ids(ns), id); + + return container_of(ipcp, struct sem_array, sem_perm); +} + +static inline struct sem_array *sem_lock_check(struct ipc_namespace *ns, + int id) +{ + struct kern_ipc_perm *ipcp = ipc_lock_check(&sem_ids(ns), id); + + return container_of(ipcp, struct sem_array, sem_perm); +} + +static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) +{ + ipc_rmid(&sem_ids(ns), &s->sem_perm); +} + +/* * Lockless wakeup algorithm: * Without the check/retry algorithm a lockless wakeup is possible: * - queue.status is initialized to -EINTR before blocking. @@ -206,12 +242,23 @@ void __init sem_init (void) */ #define IN_WAKEUP 1 -static int newary (struct ipc_namespace *ns, key_t key, int nsems, int semflg) +/** + * newary - Create a new semaphore set + * @ns: namespace + * @params: ptr to the structure that contains key, semflg and nsems + * + * Called with sem_ids.rw_mutex held (as a writer) + */ + +static int newary(struct ipc_namespace *ns, struct ipc_params *params) { int id; int retval; struct sem_array *sma; int size; + key_t key = params->key; + int nsems = params->u.nsems; + int semflg = params->flg; if (!nsems) return -EINVAL; @@ -236,14 +283,14 @@ static int newary (struct ipc_namespace *ns, key_t key, int nsems, int semflg) } id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); - if(id == -1) { + if (id < 0) { security_sem_free(sma); ipc_rcu_putref(sma); - return -ENOSPC; + return id; } ns->used_sems += nsems; - sma->sem_id = sem_buildid(ns, id, sma->sem_perm.seq); + sma->sem_perm.id = sem_buildid(id, sma->sem_perm.seq); sma->sem_base = (struct sem *) &sma[1]; /* sma->sem_pending = NULL; */ sma->sem_pending_last = &sma->sem_pending; @@ -252,48 +299,56 @@ static int newary (struct ipc_namespace *ns, key_t key, int nsems, int semflg) sma->sem_ctime = get_seconds(); sem_unlock(sma); - return sma->sem_id; + return sma->sem_perm.id; +} + + +/* + * Called with sem_ids.rw_mutex and ipcp locked. + */ +static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg) +{ + struct sem_array *sma; + + sma = container_of(ipcp, struct sem_array, sem_perm); + return security_sem_associate(sma, semflg); } -asmlinkage long sys_semget (key_t key, int nsems, int semflg) +/* + * Called with sem_ids.rw_mutex and ipcp locked. + */ +static inline int sem_more_checks(struct kern_ipc_perm *ipcp, + struct ipc_params *params) { - int id, err = -EINVAL; struct sem_array *sma; + + sma = container_of(ipcp, struct sem_array, sem_perm); + if (params->u.nsems > sma->sem_nsems) + return -EINVAL; + + return 0; +} + +asmlinkage long sys_semget(key_t key, int nsems, int semflg) +{ struct ipc_namespace *ns; + struct ipc_ops sem_ops; + struct ipc_params sem_params; ns = current->nsproxy->ipc_ns; if (nsems < 0 || nsems > ns->sc_semmsl) return -EINVAL; - mutex_lock(&sem_ids(ns).mutex); - - if (key == IPC_PRIVATE) { - err = newary(ns, key, nsems, semflg); - } else if ((id = ipc_findkey(&sem_ids(ns), key)) == -1) { /* key not used */ - if (!(semflg & IPC_CREAT)) - err = -ENOENT; - else - err = newary(ns, key, nsems, semflg); - } else if (semflg & IPC_CREAT && semflg & IPC_EXCL) { - err = -EEXIST; - } else { - sma = sem_lock(ns, id); - BUG_ON(sma==NULL); - if (nsems > sma->sem_nsems) - err = -EINVAL; - else if (ipcperms(&sma->sem_perm, semflg)) - err = -EACCES; - else { - int semid = sem_buildid(ns, id, sma->sem_perm.seq); - err = security_sem_associate(sma, semflg); - if (!err) - err = semid; - } - sem_unlock(sma); - } - mutex_unlock(&sem_ids(ns).mutex); - return err; + sem_ops.getnew = newary; + sem_ops.associate = sem_security; + sem_ops.more_checks = sem_more_checks; + + sem_params.key = key; + sem_params.flg = semflg; + sem_params.u.nsems = nsems; + + return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); } /* Manage the doubly linked list sma->sem_pending as a FIFO: @@ -487,15 +542,14 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) return semzcnt; } -/* Free a semaphore set. freeary() is called with sem_ids.mutex locked and - * the spinlock for this semaphore set hold. sem_ids.mutex remains locked - * on exit. +/* Free a semaphore set. freeary() is called with sem_ids.rw_mutex locked + * as a writer and the spinlock for this semaphore set hold. sem_ids.rw_mutex + * remains locked on exit. */ -static void freeary (struct ipc_namespace *ns, struct sem_array *sma, int id) +static void freeary(struct ipc_namespace *ns, struct sem_array *sma) { struct sem_undo *un; struct sem_queue *q; - int size; /* Invalidate the existing undo structures for this semaphore set. * (They will be freed without any further action in exit_sem() @@ -518,12 +572,11 @@ static void freeary (struct ipc_namespace *ns, struct sem_array *sma, int id) q = n; } - /* Remove the semaphore set from the ID array*/ - sma = sem_rmid(ns, id); + /* Remove the semaphore set from the IDR */ + sem_rmid(ns, sma); sem_unlock(sma); ns->used_sems -= sma->sem_nsems; - size = sizeof (*sma) + sma->sem_nsems * sizeof (struct sem); security_sem_free(sma); ipc_rcu_putref(sma); } @@ -576,7 +629,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, int semnum, seminfo.semmnu = SEMMNU; seminfo.semmap = SEMMAP; seminfo.semume = SEMUME; - mutex_lock(&sem_ids(ns).mutex); + down_read(&sem_ids(ns).rw_mutex); if (cmd == SEM_INFO) { seminfo.semusz = sem_ids(ns).in_use; seminfo.semaem = ns->used_sems; @@ -584,8 +637,8 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, int semnum, seminfo.semusz = SEMUSZ; seminfo.semaem = SEMAEM; } - max_id = sem_ids(ns).max_id; - mutex_unlock(&sem_ids(ns).mutex); + max_id = ipc_get_maxid(&sem_ids(ns)); + up_read(&sem_ids(ns).rw_mutex); if (copy_to_user (arg.__buf, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_id < 0) ? 0: max_id; @@ -595,14 +648,9 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, int semnum, struct semid64_ds tbuf; int id; - if(semid >= sem_ids(ns).entries->size) - return -EINVAL; - - memset(&tbuf,0,sizeof(tbuf)); - sma = sem_lock(ns, semid); - if(sma == NULL) - return -EINVAL; + if (IS_ERR(sma)) + return PTR_ERR(sma); err = -EACCES; if (ipcperms (&sma->sem_perm, S_IRUGO)) @@ -612,7 +660,9 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, int semnum, if (err) goto out_unlock; - id = sem_buildid(ns, semid, sma->sem_perm.seq); + id = sma->sem_perm.id; + + memset(&tbuf, 0, sizeof(tbuf)); kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm); tbuf.sem_otime = sma->sem_otime; @@ -642,16 +692,12 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, ushort* sem_io = fast_sem_io; int nsems; - sma = sem_lock(ns, semid); - if(sma==NULL) - return -EINVAL; + sma = sem_lock_check(ns, semid); + if (IS_ERR(sma)) + return PTR_ERR(sma); nsems = sma->sem_nsems; - err=-EIDRM; - if (sem_checkid(ns,sma,semid)) - goto out_unlock; - err = -EACCES; if (ipcperms (&sma->sem_perm, (cmd==SETVAL||cmd==SETALL)?S_IWUGO:S_IRUGO)) goto out_unlock; @@ -795,7 +841,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, for (un = sma->undo; un; un = un->id_next) un->semadj[semnum] = 0; curr->semval = val; - curr->sempid = current->tgid; + curr->sempid = task_tgid_vnr(current); sma->sem_ctime = get_seconds(); /* maybe some queued-up processes were waiting for this */ update_queue(sma); @@ -863,14 +909,10 @@ static int semctl_down(struct ipc_namespace *ns, int semid, int semnum, if(copy_semid_from_user (&setbuf, arg.buf, version)) return -EFAULT; } - sma = sem_lock(ns, semid); - if(sma==NULL) - return -EINVAL; + sma = sem_lock_check_down(ns, semid); + if (IS_ERR(sma)) + return PTR_ERR(sma); - if (sem_checkid(ns,sma,semid)) { - err=-EIDRM; - goto out_unlock; - } ipcp = &sma->sem_perm; err = audit_ipc_obj(ipcp); @@ -894,7 +936,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid, int semnum, switch(cmd){ case IPC_RMID: - freeary(ns, sma, semid); + freeary(ns, sma); err = 0; break; case IPC_SET: @@ -948,45 +990,15 @@ asmlinkage long sys_semctl (int semid, int semnum, int cmd, union semun arg) return err; case IPC_RMID: case IPC_SET: - mutex_lock(&sem_ids(ns).mutex); + down_write(&sem_ids(ns).rw_mutex); err = semctl_down(ns,semid,semnum,cmd,version,arg); - mutex_unlock(&sem_ids(ns).mutex); + up_write(&sem_ids(ns).rw_mutex); return err; default: return -EINVAL; } } -static inline void lock_semundo(void) -{ - struct sem_undo_list *undo_list; - - undo_list = current->sysvsem.undo_list; - if (undo_list) - spin_lock(&undo_list->lock); -} - -/* This code has an interaction with copy_semundo(). - * Consider; two tasks are sharing the undo_list. task1 - * acquires the undo_list lock in lock_semundo(). If task2 now - * exits before task1 releases the lock (by calling - * unlock_semundo()), then task1 will never call spin_unlock(). - * This leave the sem_undo_list in a locked state. If task1 now creats task3 - * and once again shares the sem_undo_list, the sem_undo_list will still be - * locked, and future SEM_UNDO operations will deadlock. This case is - * dealt with in copy_semundo() by having it reinitialize the spin lock when - * the refcnt goes from 1 to 2. - */ -static inline void unlock_semundo(void) -{ - struct sem_undo_list *undo_list; - - undo_list = current->sysvsem.undo_list; - if (undo_list) - spin_unlock(&undo_list->lock); -} - - /* If the task doesn't already have a undo_list, then allocate one * here. We guarantee there is only one thread using this undo list, * and current is THE ONE @@ -1047,22 +1059,17 @@ static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid) if (error) return ERR_PTR(error); - lock_semundo(); + spin_lock(&ulp->lock); un = lookup_undo(ulp, semid); - unlock_semundo(); + spin_unlock(&ulp->lock); if (likely(un!=NULL)) goto out; /* no undo structure around - allocate one. */ - sma = sem_lock(ns, semid); - un = ERR_PTR(-EINVAL); - if(sma==NULL) - goto out; - un = ERR_PTR(-EIDRM); - if (sem_checkid(ns,sma,semid)) { - sem_unlock(sma); - goto out; - } + sma = sem_lock_check(ns, semid); + if (IS_ERR(sma)) + return ERR_PTR(PTR_ERR(sma)); + nsems = sma->sem_nsems; ipc_rcu_getref(sma); sem_unlock(sma); @@ -1077,10 +1084,10 @@ static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid) new->semadj = (short *) &new[1]; new->semid = semid; - lock_semundo(); + spin_lock(&ulp->lock); un = lookup_undo(ulp, semid); if (un) { - unlock_semundo(); + spin_unlock(&ulp->lock); kfree(new); ipc_lock_by_ptr(&sma->sem_perm); ipc_rcu_putref(sma); @@ -1091,7 +1098,7 @@ static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid) ipc_rcu_putref(sma); if (sma->sem_perm.deleted) { sem_unlock(sma); - unlock_semundo(); + spin_unlock(&ulp->lock); kfree(new); un = ERR_PTR(-EIDRM); goto out; @@ -1102,7 +1109,7 @@ static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid) sma->undo = new; sem_unlock(sma); un = new; - unlock_semundo(); + spin_unlock(&ulp->lock); out: return un; } @@ -1168,15 +1175,14 @@ retry_undos: } else un = NULL; - sma = sem_lock(ns, semid); - error=-EINVAL; - if(sma==NULL) + sma = sem_lock_check(ns, semid); + if (IS_ERR(sma)) { + error = PTR_ERR(sma); goto out_free; - error = -EIDRM; - if (sem_checkid(ns,sma,semid)) - goto out_unlock_free; + } + /* - * semid identifies are not unique - find_undo may have + * semid identifiers are not unique - find_undo may have * allocated an undo structure, it was invalidated by an RMID * and now a new array with received the same id. Check and retry. */ @@ -1196,7 +1202,7 @@ retry_undos: if (error) goto out_unlock_free; - error = try_atomic_semop (sma, sops, nsops, un, current->tgid); + error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current)); if (error <= 0) { if (alter && error == 0) update_queue (sma); @@ -1211,7 +1217,7 @@ retry_undos: queue.sops = sops; queue.nsops = nsops; queue.undo = un; - queue.pid = current->tgid; + queue.pid = task_tgid_vnr(current); queue.id = semid; queue.alter = alter; if (alter) @@ -1242,7 +1248,7 @@ retry_undos: } sma = sem_lock(ns, semid); - if(sma==NULL) { + if (IS_ERR(sma)) { BUG_ON(queue.prev != NULL); error = -EIDRM; goto out_free; @@ -1279,10 +1285,6 @@ asmlinkage long sys_semop (int semid, struct sembuf __user *tsops, unsigned nsop /* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between * parent and child tasks. - * - * See the notes above unlock_semundo() regarding the spin_lock_init() - * in this code. Initialize the undo_list->lock here instead of get_undo_list() - * because of the reasoning in the comment above unlock_semundo. */ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) @@ -1342,13 +1344,13 @@ void exit_sem(struct task_struct *tsk) if(semid == -1) continue; sma = sem_lock(ns, semid); - if (sma == NULL) + if (IS_ERR(sma)) continue; if (u->semid == -1) goto next_entry; - BUG_ON(sem_checkid(ns,sma,u->semid)); + BUG_ON(sem_checkid(sma, u->semid)); /* remove u from the sma->undo list */ for (unp = &sma->undo; (un = *unp); unp = &un->id_next) { @@ -1382,7 +1384,7 @@ found: semaphore->semval = 0; if (semaphore->semval > SEMVMX) semaphore->semval = SEMVMX; - semaphore->sempid = current->tgid; + semaphore->sempid = task_tgid_vnr(current); } } sma->sem_otime = get_seconds(); @@ -1402,7 +1404,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) return seq_printf(s, "%10d %10d %4o %10lu %5u %5u %5u %5u %10lu %10lu\n", sma->sem_perm.key, - sma->sem_id, + sma->sem_perm.id, sma->sem_perm.mode, sma->sem_nsems, sma->sem_perm.uid, diff --git a/ipc/shm.c b/ipc/shm.c index 5fc5cf50cf1b..3818fae625c5 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -35,7 +35,7 @@ #include <linux/capability.h> #include <linux/ptrace.h> #include <linux/seq_file.h> -#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/mount.h> @@ -59,17 +59,11 @@ static struct ipc_ids init_shm_ids; #define shm_ids(ns) (*((ns)->ids[IPC_SHM_IDS])) -#define shm_lock(ns, id) \ - ((struct shmid_kernel*)ipc_lock(&shm_ids(ns),id)) #define shm_unlock(shp) \ ipc_unlock(&(shp)->shm_perm) -#define shm_get(ns, id) \ - ((struct shmid_kernel*)ipc_get(&shm_ids(ns),id)) -#define shm_buildid(ns, id, seq) \ - ipc_buildid(&shm_ids(ns), id, seq) +#define shm_buildid(id, seq) ipc_buildid(id, seq) -static int newseg (struct ipc_namespace *ns, key_t key, - int shmflg, size_t size); +static int newseg(struct ipc_namespace *, struct ipc_params *); static void shm_open(struct vm_area_struct *vma); static void shm_close(struct vm_area_struct *vma); static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp); @@ -84,9 +78,13 @@ static void __shm_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids) ns->shm_ctlall = SHMALL; ns->shm_ctlmni = SHMMNI; ns->shm_tot = 0; - ipc_init_ids(ids, 1); + ipc_init_ids(ids); } +/* + * Called with shm_ids.rw_mutex (writer) and the shp structure locked. + * Only shm_ids.rw_mutex remains locked on exit. + */ static void do_shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *shp) { if (shp->shm_nattch){ @@ -112,20 +110,24 @@ int shm_init_ns(struct ipc_namespace *ns) void shm_exit_ns(struct ipc_namespace *ns) { - int i; struct shmid_kernel *shp; + int next_id; + int total, in_use; + + down_write(&shm_ids(ns).rw_mutex); - mutex_lock(&shm_ids(ns).mutex); - for (i = 0; i <= shm_ids(ns).max_id; i++) { - shp = shm_lock(ns, i); + in_use = shm_ids(ns).in_use; + + for (total = 0, next_id = 0; total < in_use; next_id++) { + shp = idr_find(&shm_ids(ns).ipcs_idr, next_id); if (shp == NULL) continue; - + ipc_lock_by_ptr(&shp->shm_perm); do_shm_rmid(ns, shp); + total++; } - mutex_unlock(&shm_ids(ns).mutex); + up_write(&shm_ids(ns).rw_mutex); - ipc_fini_ids(ns->ids[IPC_SHM_IDS]); kfree(ns->ids[IPC_SHM_IDS]); ns->ids[IPC_SHM_IDS] = NULL; } @@ -138,17 +140,49 @@ void __init shm_init (void) IPC_SHM_IDS, sysvipc_shm_proc_show); } -static inline int shm_checkid(struct ipc_namespace *ns, - struct shmid_kernel *s, int id) +/* + * shm_lock_(check_)down routines are called in the paths where the rw_mutex + * is held to protect access to the idr tree. + */ +static inline struct shmid_kernel *shm_lock_down(struct ipc_namespace *ns, + int id) { - if (ipc_checkid(&shm_ids(ns), &s->shm_perm, id)) - return -EIDRM; - return 0; + struct kern_ipc_perm *ipcp = ipc_lock_down(&shm_ids(ns), id); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + +static inline struct shmid_kernel *shm_lock_check_down( + struct ipc_namespace *ns, + int id) +{ + struct kern_ipc_perm *ipcp = ipc_lock_check_down(&shm_ids(ns), id); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + +/* + * shm_lock_(check_) routines are called in the paths where the rw_mutex + * is not held. + */ +static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + +static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, + int id) +{ + struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); + + return container_of(ipcp, struct shmid_kernel, shm_perm); } -static inline struct shmid_kernel *shm_rmid(struct ipc_namespace *ns, int id) +static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) { - return (struct shmid_kernel *)ipc_rmid(&shm_ids(ns), id); + ipc_rmid(&shm_ids(ns), &s->shm_perm); } static inline int shm_addid(struct ipc_namespace *ns, struct shmid_kernel *shp) @@ -166,9 +200,9 @@ static void shm_open(struct vm_area_struct *vma) struct shmid_kernel *shp; shp = shm_lock(sfd->ns, sfd->id); - BUG_ON(!shp); + BUG_ON(IS_ERR(shp)); shp->shm_atim = get_seconds(); - shp->shm_lprid = current->tgid; + shp->shm_lprid = task_tgid_vnr(current); shp->shm_nattch++; shm_unlock(shp); } @@ -176,15 +210,16 @@ static void shm_open(struct vm_area_struct *vma) /* * shm_destroy - free the struct shmid_kernel * + * @ns: namespace * @shp: struct to free * - * It has to be called with shp and shm_ids.mutex locked, + * It has to be called with shp and shm_ids.rw_mutex (writer) locked, * but returns with shp unlocked and freed. */ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) { ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; - shm_rmid(ns, shp->id); + shm_rmid(ns, shp); shm_unlock(shp); if (!is_file_hugepages(shp->shm_file)) shmem_lock(shp->shm_file, 0, shp->mlock_user); @@ -209,11 +244,11 @@ static void shm_close(struct vm_area_struct *vma) struct shmid_kernel *shp; struct ipc_namespace *ns = sfd->ns; - mutex_lock(&shm_ids(ns).mutex); + down_write(&shm_ids(ns).rw_mutex); /* remove from the list of attaches of the shm segment */ - shp = shm_lock(ns, sfd->id); - BUG_ON(!shp); - shp->shm_lprid = current->tgid; + shp = shm_lock_down(ns, sfd->id); + BUG_ON(IS_ERR(shp)); + shp->shm_lprid = task_tgid_vnr(current); shp->shm_dtim = get_seconds(); shp->shm_nattch--; if(shp->shm_nattch == 0 && @@ -221,7 +256,7 @@ static void shm_close(struct vm_area_struct *vma) shm_destroy(ns, shp); else shm_unlock(shp); - mutex_unlock(&shm_ids(ns).mutex); + up_write(&shm_ids(ns).rw_mutex); } static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -337,8 +372,19 @@ static struct vm_operations_struct shm_vm_ops = { #endif }; -static int newseg (struct ipc_namespace *ns, key_t key, int shmflg, size_t size) +/** + * newseg - Create a new shared memory segment + * @ns: namespace + * @params: ptr to the structure that contains key, size and shmflg + * + * Called with shm_ids.rw_mutex held as a writer. + */ + +static int newseg(struct ipc_namespace *ns, struct ipc_params *params) { + key_t key = params->key; + int shmflg = params->flg; + size_t size = params->u.size; int error; struct shmid_kernel *shp; int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; @@ -387,28 +433,30 @@ static int newseg (struct ipc_namespace *ns, key_t key, int shmflg, size_t size) if (IS_ERR(file)) goto no_file; - error = -ENOSPC; id = shm_addid(ns, shp); - if(id == -1) + if (id < 0) { + error = id; goto no_id; + } - shp->shm_cprid = current->tgid; + shp->shm_cprid = task_tgid_vnr(current); shp->shm_lprid = 0; shp->shm_atim = shp->shm_dtim = 0; shp->shm_ctim = get_seconds(); shp->shm_segsz = size; shp->shm_nattch = 0; - shp->id = shm_buildid(ns, id, shp->shm_perm.seq); + shp->shm_perm.id = shm_buildid(id, shp->shm_perm.seq); shp->shm_file = file; /* * shmid gets reported as "inode#" in /proc/pid/maps. * proc-ps tools use this. Changing this will break them. */ - file->f_dentry->d_inode->i_ino = shp->id; + file->f_dentry->d_inode->i_ino = shp->shm_perm.id; ns->shm_tot += numpages; + error = shp->shm_perm.id; shm_unlock(shp); - return shp->id; + return error; no_id: fput(file); @@ -418,42 +466,49 @@ no_file: return error; } -asmlinkage long sys_shmget (key_t key, size_t size, int shmflg) +/* + * Called with shm_ids.rw_mutex and ipcp locked. + */ +static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) { struct shmid_kernel *shp; - int err, id = 0; + + shp = container_of(ipcp, struct shmid_kernel, shm_perm); + return security_shm_associate(shp, shmflg); +} + +/* + * Called with shm_ids.rw_mutex and ipcp locked. + */ +static inline int shm_more_checks(struct kern_ipc_perm *ipcp, + struct ipc_params *params) +{ + struct shmid_kernel *shp; + + shp = container_of(ipcp, struct shmid_kernel, shm_perm); + if (shp->shm_segsz < params->u.size) + return -EINVAL; + + return 0; +} + +asmlinkage long sys_shmget (key_t key, size_t size, int shmflg) +{ struct ipc_namespace *ns; + struct ipc_ops shm_ops; + struct ipc_params shm_params; ns = current->nsproxy->ipc_ns; - mutex_lock(&shm_ids(ns).mutex); - if (key == IPC_PRIVATE) { - err = newseg(ns, key, shmflg, size); - } else if ((id = ipc_findkey(&shm_ids(ns), key)) == -1) { - if (!(shmflg & IPC_CREAT)) - err = -ENOENT; - else - err = newseg(ns, key, shmflg, size); - } else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) { - err = -EEXIST; - } else { - shp = shm_lock(ns, id); - BUG_ON(shp==NULL); - if (shp->shm_segsz < size) - err = -EINVAL; - else if (ipcperms(&shp->shm_perm, shmflg)) - err = -EACCES; - else { - int shmid = shm_buildid(ns, id, shp->shm_perm.seq); - err = security_shm_associate(shp, shmflg); - if (!err) - err = shmid; - } - shm_unlock(shp); - } - mutex_unlock(&shm_ids(ns).mutex); + shm_ops.getnew = newseg; + shm_ops.associate = shm_security; + shm_ops.more_checks = shm_more_checks; - return err; + shm_params.key = key; + shm_params.flg = shmflg; + shm_params.u.size = size; + + return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); } static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) @@ -547,20 +602,26 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf } } +/* + * Called with shm_ids.rw_mutex held as a reader + */ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, unsigned long *swp) { - int i; + int next_id; + int total, in_use; *rss = 0; *swp = 0; - for (i = 0; i <= shm_ids(ns).max_id; i++) { + in_use = shm_ids(ns).in_use; + + for (total = 0, next_id = 0; total < in_use; next_id++) { struct shmid_kernel *shp; struct inode *inode; - shp = shm_get(ns, i); - if(!shp) + shp = idr_find(&shm_ids(ns).ipcs_idr, next_id); + if (shp == NULL) continue; inode = shp->shm_file->f_path.dentry->d_inode; @@ -575,6 +636,8 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, *swp += info->swapped; spin_unlock(&info->lock); } + + total++; } } @@ -610,8 +673,11 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) shminfo.shmmin = SHMMIN; if(copy_shminfo_to_user (buf, &shminfo, version)) return -EFAULT; - /* reading a integer is always atomic */ - err= shm_ids(ns).max_id; + + down_read(&shm_ids(ns).rw_mutex); + err = ipc_get_maxid(&shm_ids(ns)); + up_read(&shm_ids(ns).rw_mutex); + if(err<0) err = 0; goto out; @@ -625,14 +691,14 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) return err; memset(&shm_info,0,sizeof(shm_info)); - mutex_lock(&shm_ids(ns).mutex); + down_read(&shm_ids(ns).rw_mutex); shm_info.used_ids = shm_ids(ns).in_use; shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); shm_info.shm_tot = ns->shm_tot; shm_info.swap_attempts = 0; shm_info.swap_successes = 0; - err = shm_ids(ns).max_id; - mutex_unlock(&shm_ids(ns).mutex); + err = ipc_get_maxid(&shm_ids(ns)); + up_read(&shm_ids(ns).rw_mutex); if(copy_to_user (buf, &shm_info, sizeof(shm_info))) { err = -EFAULT; goto out; @@ -646,20 +712,25 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) { struct shmid64_ds tbuf; int result; - memset(&tbuf, 0, sizeof(tbuf)); - shp = shm_lock(ns, shmid); - if(shp==NULL) { - err = -EINVAL; + + if (!buf) { + err = -EFAULT; goto out; - } else if(cmd==SHM_STAT) { - err = -EINVAL; - if (shmid > shm_ids(ns).max_id) - goto out_unlock; - result = shm_buildid(ns, shmid, shp->shm_perm.seq); + } + + if (cmd == SHM_STAT) { + shp = shm_lock(ns, shmid); + if (IS_ERR(shp)) { + err = PTR_ERR(shp); + goto out; + } + result = shp->shm_perm.id; } else { - err = shm_checkid(ns, shp,shmid); - if(err) - goto out_unlock; + shp = shm_lock_check(ns, shmid); + if (IS_ERR(shp)) { + err = PTR_ERR(shp); + goto out; + } result = 0; } err=-EACCES; @@ -668,6 +739,7 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) err = security_shm_shmctl(shp, cmd); if (err) goto out_unlock; + memset(&tbuf, 0, sizeof(tbuf)); kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm); tbuf.shm_segsz = shp->shm_segsz; tbuf.shm_atime = shp->shm_atim; @@ -686,14 +758,11 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) case SHM_LOCK: case SHM_UNLOCK: { - shp = shm_lock(ns, shmid); - if(shp==NULL) { - err = -EINVAL; + shp = shm_lock_check(ns, shmid); + if (IS_ERR(shp)) { + err = PTR_ERR(shp); goto out; } - err = shm_checkid(ns, shp,shmid); - if(err) - goto out_unlock; err = audit_ipc_obj(&(shp->shm_perm)); if (err) @@ -742,14 +811,12 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) * Instead we set a destroyed flag, and then blow * the name away when the usage hits zero. */ - mutex_lock(&shm_ids(ns).mutex); - shp = shm_lock(ns, shmid); - err = -EINVAL; - if (shp == NULL) + down_write(&shm_ids(ns).rw_mutex); + shp = shm_lock_check_down(ns, shmid); + if (IS_ERR(shp)) { + err = PTR_ERR(shp); goto out_up; - err = shm_checkid(ns, shp, shmid); - if(err) - goto out_unlock_up; + } err = audit_ipc_obj(&(shp->shm_perm)); if (err) @@ -767,24 +834,27 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) goto out_unlock_up; do_shm_rmid(ns, shp); - mutex_unlock(&shm_ids(ns).mutex); + up_write(&shm_ids(ns).rw_mutex); goto out; } case IPC_SET: { + if (!buf) { + err = -EFAULT; + goto out; + } + if (copy_shmid_from_user (&setbuf, buf, version)) { err = -EFAULT; goto out; } - mutex_lock(&shm_ids(ns).mutex); - shp = shm_lock(ns, shmid); - err=-EINVAL; - if(shp==NULL) + down_write(&shm_ids(ns).rw_mutex); + shp = shm_lock_check_down(ns, shmid); + if (IS_ERR(shp)) { + err = PTR_ERR(shp); goto out_up; - err = shm_checkid(ns, shp,shmid); - if(err) - goto out_unlock_up; + } err = audit_ipc_obj(&(shp->shm_perm)); if (err) goto out_unlock_up; @@ -819,7 +889,7 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) out_unlock_up: shm_unlock(shp); out_up: - mutex_unlock(&shm_ids(ns).mutex); + up_write(&shm_ids(ns).rw_mutex); goto out; out_unlock: shm_unlock(shp); @@ -890,13 +960,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) * additional creator id... */ ns = current->nsproxy->ipc_ns; - shp = shm_lock(ns, shmid); - if(shp == NULL) + shp = shm_lock_check(ns, shmid); + if (IS_ERR(shp)) { + err = PTR_ERR(shp); goto out; - - err = shm_checkid(ns, shp,shmid); - if (err) - goto out_unlock; + } err = -EACCES; if (ipcperms(&shp->shm_perm, acc_mode)) @@ -925,7 +993,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) file->private_data = sfd; file->f_mapping = shp->shm_file->f_mapping; - sfd->id = shp->id; + sfd->id = shp->shm_perm.id; sfd->ns = get_ipc_ns(ns); sfd->file = shp->shm_file; sfd->vm_ops = NULL; @@ -955,16 +1023,16 @@ invalid: fput(file); out_nattch: - mutex_lock(&shm_ids(ns).mutex); - shp = shm_lock(ns, shmid); - BUG_ON(!shp); + down_write(&shm_ids(ns).rw_mutex); + shp = shm_lock_down(ns, shmid); + BUG_ON(IS_ERR(shp)); shp->shm_nattch--; if(shp->shm_nattch == 0 && shp->shm_perm.mode & SHM_DEST) shm_destroy(ns, shp); else shm_unlock(shp); - mutex_unlock(&shm_ids(ns).mutex); + up_write(&shm_ids(ns).rw_mutex); out: return err; @@ -1094,7 +1162,7 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it) format = BIG_STRING; return seq_printf(s, format, shp->shm_perm.key, - shp->id, + shp->shm_perm.id, shp->shm_perm.mode, shp->shm_segsz, shp->shm_cprid, diff --git a/ipc/util.c b/ipc/util.c index 44e5135aee47..1aa0ebf71bac 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -32,6 +32,7 @@ #include <linux/proc_fs.h> #include <linux/audit.h> #include <linux/nsproxy.h> +#include <linux/rwsem.h> #include <asm/unistd.h> @@ -129,23 +130,16 @@ __initcall(ipc_init); /** * ipc_init_ids - initialise IPC identifiers * @ids: Identifier set - * @size: Number of identifiers * - * Given a size for the ipc identifier range (limited below IPCMNI) - * set up the sequence range to use then allocate and initialise the - * array itself. + * Set up the sequence range to use for the ipc identifier range (limited + * below IPCMNI) then initialise the ids idr. */ -void ipc_init_ids(struct ipc_ids* ids, int size) +void ipc_init_ids(struct ipc_ids *ids) { - int i; + init_rwsem(&ids->rw_mutex); - mutex_init(&ids->mutex); - - if(size > IPCMNI) - size = IPCMNI; ids->in_use = 0; - ids->max_id = -1; ids->seq = 0; { int seq_limit = INT_MAX/SEQ_MULTIPLIER; @@ -155,17 +149,7 @@ void ipc_init_ids(struct ipc_ids* ids, int size) ids->seq_max = seq_limit; } - ids->entries = ipc_rcu_alloc(sizeof(struct kern_ipc_perm *)*size + - sizeof(struct ipc_id_ary)); - - if(ids->entries == NULL) { - printk(KERN_ERR "ipc_init_ids() failed, ipc service disabled.\n"); - size = 0; - ids->entries = &ids->nullentry; - } - ids->entries->size = size; - for(i=0;i<size;i++) - ids->entries->p[i] = NULL; + idr_init(&ids->ipcs_idr); } #ifdef CONFIG_PROC_FS @@ -208,99 +192,96 @@ void __init ipc_init_proc_interface(const char *path, const char *header, * @ids: Identifier set * @key: The key to find * - * Requires ipc_ids.mutex locked. - * Returns the identifier if found or -1 if not. + * Requires ipc_ids.rw_mutex locked. + * Returns the LOCKED pointer to the ipc structure if found or NULL + * if not. + * If key is found ipc points to the owning ipc structure */ -int ipc_findkey(struct ipc_ids* ids, key_t key) +static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) { - int id; - struct kern_ipc_perm* p; - int max_id = ids->max_id; + struct kern_ipc_perm *ipc; + int next_id; + int total; - /* - * rcu_dereference() is not needed here - * since ipc_ids.mutex is held - */ - for (id = 0; id <= max_id; id++) { - p = ids->entries->p[id]; - if(p==NULL) + for (total = 0, next_id = 0; total < ids->in_use; next_id++) { + ipc = idr_find(&ids->ipcs_idr, next_id); + + if (ipc == NULL) + continue; + + if (ipc->key != key) { + total++; continue; - if (key == p->key) - return id; + } + + ipc_lock_by_ptr(ipc); + return ipc; } - return -1; + + return NULL; } -/* - * Requires ipc_ids.mutex locked +/** + * ipc_get_maxid - get the last assigned id + * @ids: IPC identifier set + * + * Called with ipc_ids.rw_mutex held. */ -static int grow_ary(struct ipc_ids* ids, int newsize) -{ - struct ipc_id_ary* new; - struct ipc_id_ary* old; - int i; - int size = ids->entries->size; - - if(newsize > IPCMNI) - newsize = IPCMNI; - if(newsize <= size) - return newsize; - - new = ipc_rcu_alloc(sizeof(struct kern_ipc_perm *)*newsize + - sizeof(struct ipc_id_ary)); - if(new == NULL) - return size; - new->size = newsize; - memcpy(new->p, ids->entries->p, sizeof(struct kern_ipc_perm *)*size); - for(i=size;i<newsize;i++) { - new->p[i] = NULL; - } - old = ids->entries; - /* - * Use rcu_assign_pointer() to make sure the memcpyed contents - * of the new array are visible before the new array becomes visible. - */ - rcu_assign_pointer(ids->entries, new); +int ipc_get_maxid(struct ipc_ids *ids) +{ + struct kern_ipc_perm *ipc; + int max_id = -1; + int total, id; + + if (ids->in_use == 0) + return -1; - __ipc_fini_ids(ids, old); - return newsize; + if (ids->in_use == IPCMNI) + return IPCMNI - 1; + + /* Look for the last assigned id */ + total = 0; + for (id = 0; id < IPCMNI && total < ids->in_use; id++) { + ipc = idr_find(&ids->ipcs_idr, id); + if (ipc != NULL) { + max_id = id; + total++; + } + } + return max_id; } /** * ipc_addid - add an IPC identifier * @ids: IPC identifier set * @new: new IPC permission set - * @size: new size limit for the id array + * @size: limit for the number of used ids * - * Add an entry 'new' to the IPC arrays. The permissions object is + * Add an entry 'new' to the IPC ids idr. The permissions object is * initialised and the first free entry is set up and the id assigned - * is returned. The list is returned in a locked state on success. - * On failure the list is not locked and -1 is returned. + * is returned. The 'new' entry is returned in a locked state on success. + * On failure the entry is not locked and a negative err-code is returned. * - * Called with ipc_ids.mutex held. + * Called with ipc_ids.rw_mutex held as a writer. */ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) { - int id; + int id, err; - size = grow_ary(ids,size); + if (size > IPCMNI) + size = IPCMNI; + + if (ids->in_use >= size) + return -ENOSPC; + + err = idr_get_new(&ids->ipcs_idr, new, &id); + if (err) + return err; - /* - * rcu_dereference()() is not needed here since - * ipc_ids.mutex is held - */ - for (id = 0; id < size; id++) { - if(ids->entries->p[id] == NULL) - goto found; - } - return -1; -found: ids->in_use++; - if (id > ids->max_id) - ids->max_id = id; new->cuid = new->uid = current->euid; new->gid = new->cgid = current->egid; @@ -313,48 +294,153 @@ found: new->deleted = 0; rcu_read_lock(); spin_lock(&new->lock); - ids->entries->p[id] = new; return id; } /** + * ipcget_new - create a new ipc object + * @ns: namespace + * @ids: IPC identifer set + * @ops: the actual creation routine to call + * @params: its parameters + * + * This routine is called by sys_msgget, sys_semget() and sys_shmget() + * when the key is IPC_PRIVATE. + */ +int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, + struct ipc_ops *ops, struct ipc_params *params) +{ + int err; +retry: + err = idr_pre_get(&ids->ipcs_idr, GFP_KERNEL); + + if (!err) + return -ENOMEM; + + down_write(&ids->rw_mutex); + err = ops->getnew(ns, params); + up_write(&ids->rw_mutex); + + if (err == -EAGAIN) + goto retry; + + return err; +} + +/** + * ipc_check_perms - check security and permissions for an IPC + * @ipcp: ipc permission set + * @ops: the actual security routine to call + * @params: its parameters + * + * This routine is called by sys_msgget(), sys_semget() and sys_shmget() + * when the key is not IPC_PRIVATE and that key already exists in the + * ids IDR. + * + * On success, the IPC id is returned. + * + * It is called with ipc_ids.rw_mutex and ipcp->lock held. + */ +static int ipc_check_perms(struct kern_ipc_perm *ipcp, struct ipc_ops *ops, + struct ipc_params *params) +{ + int err; + + if (ipcperms(ipcp, params->flg)) + err = -EACCES; + else { + err = ops->associate(ipcp, params->flg); + if (!err) + err = ipcp->id; + } + + return err; +} + +/** + * ipcget_public - get an ipc object or create a new one + * @ns: namespace + * @ids: IPC identifer set + * @ops: the actual creation routine to call + * @params: its parameters + * + * This routine is called by sys_msgget, sys_semget() and sys_shmget() + * when the key is not IPC_PRIVATE. + * It adds a new entry if the key is not found and does some permission + * / security checkings if the key is found. + * + * On success, the ipc id is returned. + */ +int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, + struct ipc_ops *ops, struct ipc_params *params) +{ + struct kern_ipc_perm *ipcp; + int flg = params->flg; + int err; +retry: + err = idr_pre_get(&ids->ipcs_idr, GFP_KERNEL); + + /* + * Take the lock as a writer since we are potentially going to add + * a new entry + read locks are not "upgradable" + */ + down_write(&ids->rw_mutex); + ipcp = ipc_findkey(ids, params->key); + if (ipcp == NULL) { + /* key not used */ + if (!(flg & IPC_CREAT)) + err = -ENOENT; + else if (!err) + err = -ENOMEM; + else + err = ops->getnew(ns, params); + } else { + /* ipc object has been locked by ipc_findkey() */ + + if (flg & IPC_CREAT && flg & IPC_EXCL) + err = -EEXIST; + else { + err = 0; + if (ops->more_checks) + err = ops->more_checks(ipcp, params); + if (!err) + /* + * ipc_check_perms returns the IPC id on + * success + */ + err = ipc_check_perms(ipcp, ops, params); + } + ipc_unlock(ipcp); + } + up_write(&ids->rw_mutex); + + if (err == -EAGAIN) + goto retry; + + return err; +} + + +/** * ipc_rmid - remove an IPC identifier - * @ids: identifier set - * @id: Identifier to remove + * @ids: IPC identifier set + * @ipcp: ipc perm structure containing the identifier to remove * - * The identifier must be valid, and in use. The kernel will panic if - * fed an invalid identifier. The entry is removed and internal - * variables recomputed. The object associated with the identifier - * is returned. - * ipc_ids.mutex and the spinlock for this ID is hold before this function - * is called, and remain locked on the exit. + * ipc_ids.rw_mutex (as a writer) and the spinlock for this ID are held + * before this function is called, and remain locked on the exit. */ -struct kern_ipc_perm* ipc_rmid(struct ipc_ids* ids, int id) +void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { - struct kern_ipc_perm* p; - int lid = id % SEQ_MULTIPLIER; - BUG_ON(lid >= ids->entries->size); + int lid = ipcid_to_idx(ipcp->id); + + idr_remove(&ids->ipcs_idr, lid); - /* - * do not need a rcu_dereference()() here to force ordering - * on Alpha, since the ipc_ids.mutex is held. - */ - p = ids->entries->p[lid]; - ids->entries->p[lid] = NULL; - BUG_ON(p==NULL); ids->in_use--; - if (lid == ids->max_id) { - do { - lid--; - if(lid == -1) - break; - } while (ids->entries->p[lid] == NULL); - ids->max_id = lid; - } - p->deleted = 1; - return p; + ipcp->deleted = 1; + + return; } /** @@ -491,10 +577,12 @@ static void ipc_do_vfree(struct work_struct *work) */ static void ipc_schedule_free(struct rcu_head *head) { - struct ipc_rcu_grace *grace = - container_of(head, struct ipc_rcu_grace, rcu); - struct ipc_rcu_sched *sched = - container_of(&(grace->data[0]), struct ipc_rcu_sched, data[0]); + struct ipc_rcu_grace *grace; + struct ipc_rcu_sched *sched; + + grace = container_of(head, struct ipc_rcu_grace, rcu); + sched = container_of(&(grace->data[0]), struct ipc_rcu_sched, + data[0]); INIT_WORK(&sched->work, ipc_do_vfree); schedule_work(&sched->work); @@ -583,7 +671,7 @@ void kernel_to_ipc64_perm (struct kern_ipc_perm *in, struct ipc64_perm *out) } /** - * ipc64_perm_to_ipc_perm - convert old ipc permissions to new + * ipc64_perm_to_ipc_perm - convert new ipc permissions to old * @in: new style IPC permissions * @out: old style IPC permissions * @@ -602,44 +690,37 @@ void ipc64_perm_to_ipc_perm (struct ipc64_perm *in, struct ipc_perm *out) out->seq = in->seq; } -/* - * So far only shm_get_stat() calls ipc_get() via shm_get(), so ipc_get() - * is called with shm_ids.mutex locked. Since grow_ary() is also called with - * shm_ids.mutex down(for Shared Memory), there is no need to add read - * barriers here to gurantee the writes in grow_ary() are seen in order - * here (for Alpha). +/** + * ipc_lock - Lock an ipc structure without rw_mutex held + * @ids: IPC identifier set + * @id: ipc id to look for + * + * Look for an id in the ipc ids idr and lock the associated ipc object. * - * However ipc_get() itself does not necessary require ipc_ids.mutex down. So - * if in the future ipc_get() is used by other places without ipc_ids.mutex - * down, then ipc_get() needs read memery barriers as ipc_lock() does. + * The ipc object is locked on exit. + * + * This is the routine that should be called when the rw_mutex is not already + * held, i.e. idr tree not protected: it protects the idr tree in read mode + * during the idr_find(). */ -struct kern_ipc_perm* ipc_get(struct ipc_ids* ids, int id) -{ - struct kern_ipc_perm* out; - int lid = id % SEQ_MULTIPLIER; - if(lid >= ids->entries->size) - return NULL; - out = ids->entries->p[lid]; - return out; -} -struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id) +struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id) { - struct kern_ipc_perm* out; - int lid = id % SEQ_MULTIPLIER; - struct ipc_id_ary* entries; + struct kern_ipc_perm *out; + int lid = ipcid_to_idx(id); + + down_read(&ids->rw_mutex); rcu_read_lock(); - entries = rcu_dereference(ids->entries); - if(lid >= entries->size) { - rcu_read_unlock(); - return NULL; - } - out = entries->p[lid]; - if(out == NULL) { + out = idr_find(&ids->ipcs_idr, lid); + if (out == NULL) { rcu_read_unlock(); - return NULL; + up_read(&ids->rw_mutex); + return ERR_PTR(-EINVAL); } + + up_read(&ids->rw_mutex); + spin_lock(&out->lock); /* ipc_rmid() may have already freed the ID while ipc_lock @@ -648,33 +729,44 @@ struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id) if (out->deleted) { spin_unlock(&out->lock); rcu_read_unlock(); - return NULL; + return ERR_PTR(-EINVAL); } + return out; } -void ipc_lock_by_ptr(struct kern_ipc_perm *perm) -{ - rcu_read_lock(); - spin_lock(&perm->lock); -} +/** + * ipc_lock_down - Lock an ipc structure with rw_sem held + * @ids: IPC identifier set + * @id: ipc id to look for + * + * Look for an id in the ipc ids idr and lock the associated ipc object. + * + * The ipc object is locked on exit. + * + * This is the routine that should be called when the rw_mutex is already + * held, i.e. idr tree protected. + */ -void ipc_unlock(struct kern_ipc_perm* perm) +struct kern_ipc_perm *ipc_lock_down(struct ipc_ids *ids, int id) { - spin_unlock(&perm->lock); - rcu_read_unlock(); -} + struct kern_ipc_perm *out; + int lid = ipcid_to_idx(id); -int ipc_buildid(struct ipc_ids* ids, int id, int seq) -{ - return SEQ_MULTIPLIER*seq + id; -} + rcu_read_lock(); + out = idr_find(&ids->ipcs_idr, lid); + if (out == NULL) { + rcu_read_unlock(); + return ERR_PTR(-EINVAL); + } -int ipc_checkid(struct ipc_ids* ids, struct kern_ipc_perm* ipcp, int uid) -{ - if(uid/SEQ_MULTIPLIER != ipcp->seq) - return 1; - return 0; + spin_lock(&out->lock); + + /* + * No need to verify that the structure is still valid since the + * rw_mutex is held. + */ + return out; } #ifdef __ARCH_WANT_IPC_PARSE_VERSION @@ -707,27 +799,30 @@ struct ipc_proc_iter { struct ipc_proc_iface *iface; }; -static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) +/* + * This routine locks the ipc structure found at least at position pos. + */ +struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, + loff_t *new_pos) { - struct ipc_proc_iter *iter = s->private; - struct ipc_proc_iface *iface = iter->iface; - struct kern_ipc_perm *ipc = it; - loff_t p; - struct ipc_ids *ids; + struct kern_ipc_perm *ipc; + int total, id; - ids = iter->ns->ids[iface->ids]; + total = 0; + for (id = 0; id < pos && total < ids->in_use; id++) { + ipc = idr_find(&ids->ipcs_idr, id); + if (ipc != NULL) + total++; + } - /* If we had an ipc id locked before, unlock it */ - if (ipc && ipc != SEQ_START_TOKEN) - ipc_unlock(ipc); + if (total >= ids->in_use) + return NULL; - /* - * p = *pos - 1 (because id 0 starts at position 1) - * + 1 (because we increment the position by one) - */ - for (p = *pos; p <= ids->max_id; p++) { - if ((ipc = ipc_lock(ids, p)) != NULL) { - *pos = p + 1; + for ( ; pos < IPCMNI; pos++) { + ipc = idr_find(&ids->ipcs_idr, pos); + if (ipc != NULL) { + *new_pos = pos + 1; + ipc_lock_by_ptr(ipc); return ipc; } } @@ -736,16 +831,27 @@ static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) return NULL; } +static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) +{ + struct ipc_proc_iter *iter = s->private; + struct ipc_proc_iface *iface = iter->iface; + struct kern_ipc_perm *ipc = it; + + /* If we had an ipc id locked before, unlock it */ + if (ipc && ipc != SEQ_START_TOKEN) + ipc_unlock(ipc); + + return sysvipc_find_ipc(iter->ns->ids[iface->ids], *pos, pos); +} + /* - * File positions: pos 0 -> header, pos n -> ipc id + 1. - * SeqFile iterator: iterator value locked shp or SEQ_TOKEN_START. + * File positions: pos 0 -> header, pos n -> ipc id = n - 1. + * SeqFile iterator: iterator value locked ipc pointer or SEQ_TOKEN_START. */ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; - struct kern_ipc_perm *ipc; - loff_t p; struct ipc_ids *ids; ids = iter->ns->ids[iface->ids]; @@ -754,7 +860,7 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) * Take the lock - this will be released by the corresponding * call to stop(). */ - mutex_lock(&ids->mutex); + down_read(&ids->rw_mutex); /* pos < 0 is invalid */ if (*pos < 0) @@ -765,13 +871,7 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) return SEQ_START_TOKEN; /* Find the (pos-1)th ipc */ - for (p = *pos - 1; p <= ids->max_id; p++) { - if ((ipc = ipc_lock(ids, p)) != NULL) { - *pos = p + 1; - return ipc; - } - } - return NULL; + return sysvipc_find_ipc(ids, *pos - 1, pos); } static void sysvipc_proc_stop(struct seq_file *s, void *it) @@ -781,13 +881,13 @@ static void sysvipc_proc_stop(struct seq_file *s, void *it) struct ipc_proc_iface *iface = iter->iface; struct ipc_ids *ids; - /* If we had a locked segment, release it */ + /* If we had a locked structure, release it */ if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); ids = iter->ns->ids[iface->ids]; /* Release the lock we took in start() */ - mutex_unlock(&ids->mutex); + up_read(&ids->rw_mutex); } static int sysvipc_proc_show(struct seq_file *s, void *it) diff --git a/ipc/util.h b/ipc/util.h index 333e891bcaca..9ffea40457ce 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -10,6 +10,9 @@ #ifndef _IPC_UTIL_H #define _IPC_UTIL_H +#include <linux/idr.h> +#include <linux/err.h> + #define USHRT_MAX 0xffff #define SEQ_MULTIPLIER (IPCMNI) @@ -25,24 +28,46 @@ void sem_exit_ns(struct ipc_namespace *ns); void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); -struct ipc_id_ary { - int size; - struct kern_ipc_perm *p[0]; -}; - struct ipc_ids { int in_use; - int max_id; unsigned short seq; unsigned short seq_max; - struct mutex mutex; - struct ipc_id_ary nullentry; - struct ipc_id_ary* entries; + struct rw_semaphore rw_mutex; + struct idr ipcs_idr; +}; + +/* + * Structure that holds the parameters needed by the ipc operations + * (see after) + */ +struct ipc_params { + key_t key; + int flg; + union { + size_t size; /* for shared memories */ + int nsems; /* for semaphores */ + } u; /* holds the getnew() specific param */ +}; + +/* + * Structure that holds some ipc operations. This structure is used to unify + * the calls to sys_msgget(), sys_semget(), sys_shmget() + * . routine to call to create a new ipc object. Can be one of newque, + * newary, newseg + * . routine to call to check permissions for a new ipc object. + * Can be one of security_msg_associate, security_sem_associate, + * security_shm_associate + * . routine to call for an extra check if needed + */ +struct ipc_ops { + int (*getnew) (struct ipc_namespace *, struct ipc_params *); + int (*associate) (struct kern_ipc_perm *, int); + int (*more_checks) (struct kern_ipc_perm *, struct ipc_params *); }; struct seq_file; -void ipc_init_ids(struct ipc_ids *ids, int size); +void ipc_init_ids(struct ipc_ids *); #ifdef CONFIG_PROC_FS void __init ipc_init_proc_interface(const char *path, const char *header, int ids, int (*show)(struct seq_file *, void *)); @@ -54,14 +79,19 @@ void __init ipc_init_proc_interface(const char *path, const char *header, #define IPC_MSG_IDS 1 #define IPC_SHM_IDS 2 -/* must be called with ids->mutex acquired.*/ -int ipc_findkey(struct ipc_ids* ids, key_t key); -int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size); +#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) + +/* must be called with ids->rw_mutex acquired for writing */ +int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); + +/* must be called with ids->rw_mutex acquired for reading */ +int ipc_get_maxid(struct ipc_ids *); /* must be called with both locks acquired. */ -struct kern_ipc_perm* ipc_rmid(struct ipc_ids* ids, int id); +void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *); -int ipcperms (struct kern_ipc_perm *ipcp, short flg); +/* must be called with ipcp locked */ +int ipcperms(struct kern_ipc_perm *ipcp, short flg); /* for rare, potentially huge allocations. * both function can sleep @@ -79,24 +109,12 @@ void* ipc_rcu_alloc(int size); void ipc_rcu_getref(void *ptr); void ipc_rcu_putref(void *ptr); -static inline void __ipc_fini_ids(struct ipc_ids *ids, - struct ipc_id_ary *entries) -{ - if (entries != &ids->nullentry) - ipc_rcu_putref(entries); -} - -static inline void ipc_fini_ids(struct ipc_ids *ids) -{ - __ipc_fini_ids(ids, ids->entries); -} - -struct kern_ipc_perm* ipc_get(struct ipc_ids* ids, int id); -struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id); -void ipc_lock_by_ptr(struct kern_ipc_perm *ipcp); -void ipc_unlock(struct kern_ipc_perm* perm); -int ipc_buildid(struct ipc_ids* ids, int id, int seq); -int ipc_checkid(struct ipc_ids* ids, struct kern_ipc_perm* ipcp, int uid); +/* + * ipc_lock_down: called with rw_mutex held + * ipc_lock: called without that lock held + */ +struct kern_ipc_perm *ipc_lock_down(struct ipc_ids *, int); +struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out); void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out); @@ -111,5 +129,89 @@ int ipc_parse_version (int *cmd); extern void free_msg(struct msg_msg *msg); extern struct msg_msg *load_msg(const void __user *src, int len); extern int store_msg(void __user *dest, struct msg_msg *msg, int len); +extern int ipcget_new(struct ipc_namespace *, struct ipc_ids *, + struct ipc_ops *, struct ipc_params *); +extern int ipcget_public(struct ipc_namespace *, struct ipc_ids *, + struct ipc_ops *, struct ipc_params *); + +static inline int ipc_buildid(int id, int seq) +{ + return SEQ_MULTIPLIER * seq + id; +} + +/* + * Must be called with ipcp locked + */ +static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid) +{ + if (uid / SEQ_MULTIPLIER != ipcp->seq) + return 1; + return 0; +} + +static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) +{ + rcu_read_lock(); + spin_lock(&perm->lock); +} + +static inline void ipc_unlock(struct kern_ipc_perm *perm) +{ + spin_unlock(&perm->lock); + rcu_read_unlock(); +} + +static inline struct kern_ipc_perm *ipc_lock_check_down(struct ipc_ids *ids, + int id) +{ + struct kern_ipc_perm *out; + + out = ipc_lock_down(ids, id); + if (IS_ERR(out)) + return out; + + if (ipc_checkid(out, id)) { + ipc_unlock(out); + return ERR_PTR(-EIDRM); + } + + return out; +} + +static inline struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, + int id) +{ + struct kern_ipc_perm *out; + + out = ipc_lock(ids, id); + if (IS_ERR(out)) + return out; + + if (ipc_checkid(out, id)) { + ipc_unlock(out); + return ERR_PTR(-EIDRM); + } + + return out; +} + +/** + * ipcget - Common sys_*get() code + * @ns : namsepace + * @ids : IPC identifier set + * @ops : operations to be called on ipc object creation, permission checks + * and further checks + * @params : the parameters needed by the previous operations. + * + * Common routine called by sys_msgget(), sys_semget() and sys_shmget(). + */ +static inline int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, + struct ipc_ops *ops, struct ipc_params *params) +{ + if (params->key == IPC_PRIVATE) + return ipcget_new(ns, ids, ops, params); + else + return ipcget_public(ns, ids, ops, params); +} #endif diff --git a/kernel/Kconfig.instrumentation b/kernel/Kconfig.instrumentation new file mode 100644 index 000000000000..f5f2c769d95e --- /dev/null +++ b/kernel/Kconfig.instrumentation @@ -0,0 +1,49 @@ +menuconfig INSTRUMENTATION + bool "Instrumentation Support" + default y + ---help--- + Say Y here to get to see options related to performance measurement, + system-wide debugging, and testing. This option alone does not add any + kernel code. + + If you say N, all options in this submenu will be skipped and + disabled. If you're trying to debug the kernel itself, go see the + Kernel Hacking menu. + +if INSTRUMENTATION + +config PROFILING + bool "Profiling support (EXPERIMENTAL)" + help + Say Y here to enable the extended profiling support mechanisms used + by profilers such as OProfile. + +config OPROFILE + tristate "OProfile system profiling (EXPERIMENTAL)" + depends on PROFILING + depends on ALPHA || ARM || BLACKFIN || X86_32 || IA64 || M32R || MIPS || PARISC || PPC || S390 || SUPERH || SPARC || X86_64 + help + OProfile is a profiling system capable of profiling the + whole system, include the kernel, kernel modules, libraries, + and applications. + + If unsure, say N. + +config KPROBES + bool "Kprobes" + depends on KALLSYMS && MODULES + depends on X86_32 || IA64 || PPC || S390 || SPARC64 || X86_64 || AVR32 + help + Kprobes allows you to trap at almost any kernel address and + execute a callback function. register_kprobe() establishes + a probepoint and specifies the callback. Kprobes is useful + for kernel debugging, non-intrusive instrumentation and testing. + If in doubt, say "N". + +config MARKERS + bool "Activate markers" + help + Place an empty function call at each marker site. Can be + dynamically changed for a probe function. + +endif # INSTRUMENTATION diff --git a/kernel/Makefile b/kernel/Makefile index d63fbb18798a..05c3e6df8597 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,8 +8,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \ - utsname.o sysctl_check.o + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \ + utsname.o sysctl_check.o notifier.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ @@ -36,7 +36,11 @@ obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_CGROUPS) += cgroup.o +obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CGROUP_CPUACCT) += cpu_acct.o +obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -51,6 +55,7 @@ obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o +obj-$(CONFIG_MARKERS) += marker.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/capability.c b/kernel/capability.c index cbc5fd60c0f3..efbd9cdce132 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/pid_namespace.h> #include <asm/uaccess.h> /* @@ -61,8 +62,8 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) spin_lock(&task_capability_lock); read_lock(&tasklist_lock); - if (pid && pid != current->pid) { - target = find_task_by_pid(pid); + if (pid && pid != task_pid_vnr(current)) { + target = find_task_by_vpid(pid); if (!target) { ret = -ESRCH; goto out; @@ -95,7 +96,7 @@ static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, int found = 0; struct pid *pgrp; - pgrp = find_pid(pgrp_nr); + pgrp = find_vpid(pgrp_nr); do_each_pid_task(pgrp, PIDTYPE_PGID, g) { target = g; while_each_thread(g, target) { @@ -129,7 +130,7 @@ static inline int cap_set_all(kernel_cap_t *effective, int found = 0; do_each_thread(g, target) { - if (target == current || is_init(target)) + if (target == current || is_container_init(target->group_leader)) continue; found = 1; if (security_capset_check(target, effective, inheritable, @@ -184,7 +185,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (get_user(pid, &header->pid)) return -EFAULT; - if (pid && pid != current->pid && !capable(CAP_SETPCAP)) + if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP)) return -EPERM; if (copy_from_user(&effective, &data->effective, sizeof(effective)) || @@ -195,8 +196,8 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) spin_lock(&task_capability_lock); read_lock(&tasklist_lock); - if (pid > 0 && pid != current->pid) { - target = find_task_by_pid(pid); + if (pid > 0 && pid != task_pid_vnr(current)) { + target = find_task_by_vpid(pid); if (!target) { ret = -ESRCH; goto out; diff --git a/kernel/cgroup.c b/kernel/cgroup.c new file mode 100644 index 000000000000..5987dccdb2a0 --- /dev/null +++ b/kernel/cgroup.c @@ -0,0 +1,2805 @@ +/* + * kernel/cgroup.c + * + * Generic process-grouping system. + * + * Based originally on the cpuset system, extracted by Paul Menage + * Copyright (C) 2006 Google, Inc + * + * Copyright notices from the original cpuset code: + * -------------------------------------------------- + * Copyright (C) 2003 BULL SA. + * Copyright (C) 2004-2006 Silicon Graphics, Inc. + * + * Portions derived from Patrick Mochel's sysfs code. + * sysfs is Copyright (c) 2001-3 Patrick Mochel + * + * 2003-10-10 Written by Simon Derr. + * 2003-10-22 Updates by Stephen Hemminger. + * 2004 May-July Rework by Paul Jackson. + * --------------------------------------------------- + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ + +#include <linux/cgroup.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/mount.h> +#include <linux/pagemap.h> +#include <linux/proc_fs.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> +#include <linux/backing-dev.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/magic.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/sort.h> +#include <linux/kmod.h> +#include <linux/delayacct.h> +#include <linux/cgroupstats.h> + +#include <asm/atomic.h> + +static DEFINE_MUTEX(cgroup_mutex); + +/* Generate an array of cgroup subsystem pointers */ +#define SUBSYS(_x) &_x ## _subsys, + +static struct cgroup_subsys *subsys[] = { +#include <linux/cgroup_subsys.h> +}; + +/* + * A cgroupfs_root represents the root of a cgroup hierarchy, + * and may be associated with a superblock to form an active + * hierarchy + */ +struct cgroupfs_root { + struct super_block *sb; + + /* + * The bitmask of subsystems intended to be attached to this + * hierarchy + */ + unsigned long subsys_bits; + + /* The bitmask of subsystems currently attached to this hierarchy */ + unsigned long actual_subsys_bits; + + /* A list running through the attached subsystems */ + struct list_head subsys_list; + + /* The root cgroup for this hierarchy */ + struct cgroup top_cgroup; + + /* Tracks how many cgroups are currently defined in hierarchy.*/ + int number_of_cgroups; + + /* A list running through the mounted hierarchies */ + struct list_head root_list; + + /* Hierarchy-specific flags */ + unsigned long flags; + + /* The path to use for release notifications. No locking + * between setting and use - so if userspace updates this + * while child cgroups exist, you could miss a + * notification. We ensure that it's always a valid + * NUL-terminated string */ + char release_agent_path[PATH_MAX]; +}; + + +/* + * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the + * subsystems that are otherwise unattached - it never has more than a + * single cgroup, and all tasks are part of that cgroup. + */ +static struct cgroupfs_root rootnode; + +/* The list of hierarchy roots */ + +static LIST_HEAD(roots); +static int root_count; + +/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ +#define dummytop (&rootnode.top_cgroup) + +/* This flag indicates whether tasks in the fork and exit paths should + * take callback_mutex and check for fork/exit handlers to call. This + * avoids us having to do extra work in the fork/exit path if none of the + * subsystems need to be called. + */ +static int need_forkexit_callback; + +/* bits in struct cgroup flags field */ +enum { + /* Control Group is dead */ + CGRP_REMOVED, + /* Control Group has previously had a child cgroup or a task, + * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */ + CGRP_RELEASABLE, + /* Control Group requires release notifications to userspace */ + CGRP_NOTIFY_ON_RELEASE, +}; + +/* convenient tests for these bits */ +inline int cgroup_is_removed(const struct cgroup *cgrp) +{ + return test_bit(CGRP_REMOVED, &cgrp->flags); +} + +/* bits in struct cgroupfs_root flags field */ +enum { + ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ +}; + +inline int cgroup_is_releasable(const struct cgroup *cgrp) +{ + const int bits = + (1 << CGRP_RELEASABLE) | + (1 << CGRP_NOTIFY_ON_RELEASE); + return (cgrp->flags & bits) == bits; +} + +inline int notify_on_release(const struct cgroup *cgrp) +{ + return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); +} + +/* + * for_each_subsys() allows you to iterate on each subsystem attached to + * an active hierarchy + */ +#define for_each_subsys(_root, _ss) \ +list_for_each_entry(_ss, &_root->subsys_list, sibling) + +/* for_each_root() allows you to iterate across the active hierarchies */ +#define for_each_root(_root) \ +list_for_each_entry(_root, &roots, root_list) + +/* the list of cgroups eligible for automatic release. Protected by + * release_list_lock */ +static LIST_HEAD(release_list); +static DEFINE_SPINLOCK(release_list_lock); +static void cgroup_release_agent(struct work_struct *work); +static DECLARE_WORK(release_agent_work, cgroup_release_agent); +static void check_for_release(struct cgroup *cgrp); + +/* Link structure for associating css_set objects with cgroups */ +struct cg_cgroup_link { + /* + * List running through cg_cgroup_links associated with a + * cgroup, anchored on cgroup->css_sets + */ + struct list_head cgrp_link_list; + /* + * List running through cg_cgroup_links pointing at a + * single css_set object, anchored on css_set->cg_links + */ + struct list_head cg_link_list; + struct css_set *cg; +}; + +/* The default css_set - used by init and its children prior to any + * hierarchies being mounted. It contains a pointer to the root state + * for each subsystem. Also used to anchor the list of css_sets. Not + * reference-counted, to improve performance when child cgroups + * haven't been created. + */ + +static struct css_set init_css_set; +static struct cg_cgroup_link init_css_set_link; + +/* css_set_lock protects the list of css_set objects, and the + * chain of tasks off each css_set. Nests outside task->alloc_lock + * due to cgroup_iter_start() */ +static DEFINE_RWLOCK(css_set_lock); +static int css_set_count; + +/* We don't maintain the lists running through each css_set to its + * task until after the first call to cgroup_iter_start(). This + * reduces the fork()/exit() overhead for people who have cgroups + * compiled into their kernel but not actually in use */ +static int use_task_css_set_links; + +/* When we create or destroy a css_set, the operation simply + * takes/releases a reference count on all the cgroups referenced + * by subsystems in this css_set. This can end up multiple-counting + * some cgroups, but that's OK - the ref-count is just a + * busy/not-busy indicator; ensuring that we only count each cgroup + * once would require taking a global lock to ensure that no + * subsystems moved between hierarchies while we were doing so. + * + * Possible TODO: decide at boot time based on the number of + * registered subsystems and the number of CPUs or NUMA nodes whether + * it's better for performance to ref-count every subsystem, or to + * take a global lock and only add one ref count to each hierarchy. + */ + +/* + * unlink a css_set from the list and free it + */ +static void unlink_css_set(struct css_set *cg) +{ + write_lock(&css_set_lock); + list_del(&cg->list); + css_set_count--; + while (!list_empty(&cg->cg_links)) { + struct cg_cgroup_link *link; + link = list_entry(cg->cg_links.next, + struct cg_cgroup_link, cg_link_list); + list_del(&link->cg_link_list); + list_del(&link->cgrp_link_list); + kfree(link); + } + write_unlock(&css_set_lock); +} + +static void __release_css_set(struct kref *k, int taskexit) +{ + int i; + struct css_set *cg = container_of(k, struct css_set, ref); + + unlink_css_set(cg); + + rcu_read_lock(); + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup *cgrp = cg->subsys[i]->cgroup; + if (atomic_dec_and_test(&cgrp->count) && + notify_on_release(cgrp)) { + if (taskexit) + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } + } + rcu_read_unlock(); + kfree(cg); +} + +static void release_css_set(struct kref *k) +{ + __release_css_set(k, 0); +} + +static void release_css_set_taskexit(struct kref *k) +{ + __release_css_set(k, 1); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cg) +{ + kref_get(&cg->ref); +} + +static inline void put_css_set(struct css_set *cg) +{ + kref_put(&cg->ref, release_css_set); +} + +static inline void put_css_set_taskexit(struct css_set *cg) +{ + kref_put(&cg->ref, release_css_set_taskexit); +} + +/* + * find_existing_css_set() is a helper for + * find_css_set(), and checks to see whether an existing + * css_set is suitable. This currently walks a linked-list for + * simplicity; a later patch will use a hash table for better + * performance + * + * oldcg: the cgroup group that we're using before the cgroup + * transition + * + * cgrp: the cgroup that we're moving into + * + * template: location in which to build the desired set of subsystem + * state objects for the new cgroup group + */ + +static struct css_set *find_existing_css_set( + struct css_set *oldcg, + struct cgroup *cgrp, + struct cgroup_subsys_state *template[]) +{ + int i; + struct cgroupfs_root *root = cgrp->root; + struct list_head *l = &init_css_set.list; + + /* Built the set of subsystem state objects that we want to + * see in the new css_set */ + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + if (root->subsys_bits & (1ull << i)) { + /* Subsystem is in this hierarchy. So we want + * the subsystem state from the new + * cgroup */ + template[i] = cgrp->subsys[i]; + } else { + /* Subsystem is not in this hierarchy, so we + * don't want to change the subsystem state */ + template[i] = oldcg->subsys[i]; + } + } + + /* Look through existing cgroup groups to find one to reuse */ + do { + struct css_set *cg = + list_entry(l, struct css_set, list); + + if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { + /* All subsystems matched */ + return cg; + } + /* Try the next cgroup group */ + l = l->next; + } while (l != &init_css_set.list); + + /* No existing cgroup group matched */ + return NULL; +} + +/* + * allocate_cg_links() allocates "count" cg_cgroup_link structures + * and chains them on tmp through their cgrp_link_list fields. Returns 0 on + * success or a negative error + */ + +static int allocate_cg_links(int count, struct list_head *tmp) +{ + struct cg_cgroup_link *link; + int i; + INIT_LIST_HEAD(tmp); + for (i = 0; i < count; i++) { + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) { + while (!list_empty(tmp)) { + link = list_entry(tmp->next, + struct cg_cgroup_link, + cgrp_link_list); + list_del(&link->cgrp_link_list); + kfree(link); + } + return -ENOMEM; + } + list_add(&link->cgrp_link_list, tmp); + } + return 0; +} + +static void free_cg_links(struct list_head *tmp) +{ + while (!list_empty(tmp)) { + struct cg_cgroup_link *link; + link = list_entry(tmp->next, + struct cg_cgroup_link, + cgrp_link_list); + list_del(&link->cgrp_link_list); + kfree(link); + } +} + +/* + * find_css_set() takes an existing cgroup group and a + * cgroup object, and returns a css_set object that's + * equivalent to the old group, but with the given cgroup + * substituted into the appropriate hierarchy. Must be called with + * cgroup_mutex held + */ + +static struct css_set *find_css_set( + struct css_set *oldcg, struct cgroup *cgrp) +{ + struct css_set *res; + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + int i; + + struct list_head tmp_cg_links; + struct cg_cgroup_link *link; + + /* First see if we already have a cgroup group that matches + * the desired set */ + write_lock(&css_set_lock); + res = find_existing_css_set(oldcg, cgrp, template); + if (res) + get_css_set(res); + write_unlock(&css_set_lock); + + if (res) + return res; + + res = kmalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return NULL; + + /* Allocate all the cg_cgroup_link objects that we'll need */ + if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { + kfree(res); + return NULL; + } + + kref_init(&res->ref); + INIT_LIST_HEAD(&res->cg_links); + INIT_LIST_HEAD(&res->tasks); + + /* Copy the set of subsystem state objects generated in + * find_existing_css_set() */ + memcpy(res->subsys, template, sizeof(res->subsys)); + + write_lock(&css_set_lock); + /* Add reference counts and links from the new css_set. */ + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup *cgrp = res->subsys[i]->cgroup; + struct cgroup_subsys *ss = subsys[i]; + atomic_inc(&cgrp->count); + /* + * We want to add a link once per cgroup, so we + * only do it for the first subsystem in each + * hierarchy + */ + if (ss->root->subsys_list.next == &ss->sibling) { + BUG_ON(list_empty(&tmp_cg_links)); + link = list_entry(tmp_cg_links.next, + struct cg_cgroup_link, + cgrp_link_list); + list_del(&link->cgrp_link_list); + list_add(&link->cgrp_link_list, &cgrp->css_sets); + link->cg = res; + list_add(&link->cg_link_list, &res->cg_links); + } + } + if (list_empty(&rootnode.subsys_list)) { + link = list_entry(tmp_cg_links.next, + struct cg_cgroup_link, + cgrp_link_list); + list_del(&link->cgrp_link_list); + list_add(&link->cgrp_link_list, &dummytop->css_sets); + link->cg = res; + list_add(&link->cg_link_list, &res->cg_links); + } + + BUG_ON(!list_empty(&tmp_cg_links)); + + /* Link this cgroup group into the list */ + list_add(&res->list, &init_css_set.list); + css_set_count++; + INIT_LIST_HEAD(&res->tasks); + write_unlock(&css_set_lock); + + return res; +} + +/* + * There is one global cgroup mutex. We also require taking + * task_lock() when dereferencing a task's cgroup subsys pointers. + * See "The task_lock() exception", at the end of this comment. + * + * A task must hold cgroup_mutex to modify cgroups. + * + * Any task can increment and decrement the count field without lock. + * So in general, code holding cgroup_mutex can't rely on the count + * field not changing. However, if the count goes to zero, then only + * attach_task() can increment it again. Because a count of zero + * means that no tasks are currently attached, therefore there is no + * way a task attached to that cgroup can fork (the other way to + * increment the count). So code holding cgroup_mutex can safely + * assume that if the count is zero, it will stay zero. Similarly, if + * a task holds cgroup_mutex on a cgroup with zero count, it + * knows that the cgroup won't be removed, as cgroup_rmdir() + * needs that mutex. + * + * The cgroup_common_file_write handler for operations that modify + * the cgroup hierarchy holds cgroup_mutex across the entire operation, + * single threading all such cgroup modifications across the system. + * + * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't + * (usually) take cgroup_mutex. These are the two most performance + * critical pieces of code here. The exception occurs on cgroup_exit(), + * when a task in a notify_on_release cgroup exits. Then cgroup_mutex + * is taken, and if the cgroup count is zero, a usermode call made + * to /sbin/cgroup_release_agent with the name of the cgroup (path + * relative to the root of cgroup file system) as the argument. + * + * A cgroup can only be deleted if both its 'count' of using tasks + * is zero, and its list of 'children' cgroups is empty. Since all + * tasks in the system use _some_ cgroup, and since there is always at + * least one task in the system (init, pid == 1), therefore, top_cgroup + * always has either children cgroups and/or using tasks. So we don't + * need a special hack to ensure that top_cgroup cannot be deleted. + * + * The task_lock() exception + * + * The need for this exception arises from the action of + * attach_task(), which overwrites one tasks cgroup pointer with + * another. It does so using cgroup_mutexe, however there are + * several performance critical places that need to reference + * task->cgroup without the expense of grabbing a system global + * mutex. Therefore except as noted below, when dereferencing or, as + * in attach_task(), modifying a task'ss cgroup pointer we use + * task_lock(), which acts on a spinlock (task->alloc_lock) already in + * the task_struct routinely used for such matters. + * + * P.S. One more locking exception. RCU is used to guard the + * update of a tasks cgroup pointer by attach_task() + */ + +/** + * cgroup_lock - lock out any changes to cgroup structures + * + */ + +void cgroup_lock(void) +{ + mutex_lock(&cgroup_mutex); +} + +/** + * cgroup_unlock - release lock on cgroup changes + * + * Undo the lock taken in a previous cgroup_lock() call. + */ + +void cgroup_unlock(void) +{ + mutex_unlock(&cgroup_mutex); +} + +/* + * A couple of forward declarations required, due to cyclic reference loop: + * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> + * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations + * -> cgroup_mkdir. + */ + +static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); +static int cgroup_populate_dir(struct cgroup *cgrp); +static struct inode_operations cgroup_dir_inode_operations; +static struct file_operations proc_cgroupstats_operations; + +static struct backing_dev_info cgroup_backing_dev_info = { + .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, +}; + +static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + + if (inode) { + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blocks = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; + } + return inode; +} + +static void cgroup_diput(struct dentry *dentry, struct inode *inode) +{ + /* is dentry a directory ? if so, kfree() associated cgroup */ + if (S_ISDIR(inode->i_mode)) { + struct cgroup *cgrp = dentry->d_fsdata; + BUG_ON(!(cgroup_is_removed(cgrp))); + /* It's possible for external users to be holding css + * reference counts on a cgroup; css_put() needs to + * be able to access the cgroup after decrementing + * the reference count in order to know if it needs to + * queue the cgroup to be handled by the release + * agent */ + synchronize_rcu(); + kfree(cgrp); + } + iput(inode); +} + +static void remove_dir(struct dentry *d) +{ + struct dentry *parent = dget(d->d_parent); + + d_delete(d); + simple_rmdir(parent->d_inode, d); + dput(parent); +} + +static void cgroup_clear_directory(struct dentry *dentry) +{ + struct list_head *node; + + BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + spin_lock(&dcache_lock); + node = dentry->d_subdirs.next; + while (node != &dentry->d_subdirs) { + struct dentry *d = list_entry(node, struct dentry, d_u.d_child); + list_del_init(node); + if (d->d_inode) { + /* This should never be called on a cgroup + * directory with child cgroups */ + BUG_ON(d->d_inode->i_mode & S_IFDIR); + d = dget_locked(d); + spin_unlock(&dcache_lock); + d_delete(d); + simple_unlink(dentry->d_inode, d); + dput(d); + spin_lock(&dcache_lock); + } + node = dentry->d_subdirs.next; + } + spin_unlock(&dcache_lock); +} + +/* + * NOTE : the dentry must have been dget()'ed + */ +static void cgroup_d_remove_dir(struct dentry *dentry) +{ + cgroup_clear_directory(dentry); + + spin_lock(&dcache_lock); + list_del_init(&dentry->d_u.d_child); + spin_unlock(&dcache_lock); + remove_dir(dentry); +} + +static int rebind_subsystems(struct cgroupfs_root *root, + unsigned long final_bits) +{ + unsigned long added_bits, removed_bits; + struct cgroup *cgrp = &root->top_cgroup; + int i; + + removed_bits = root->actual_subsys_bits & ~final_bits; + added_bits = final_bits & ~root->actual_subsys_bits; + /* Check that any added subsystems are currently free */ + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + unsigned long long bit = 1ull << i; + struct cgroup_subsys *ss = subsys[i]; + if (!(bit & added_bits)) + continue; + if (ss->root != &rootnode) { + /* Subsystem isn't free */ + return -EBUSY; + } + } + + /* Currently we don't handle adding/removing subsystems when + * any child cgroups exist. This is theoretically supportable + * but involves complex error handling, so it's being left until + * later */ + if (!list_empty(&cgrp->children)) + return -EBUSY; + + /* Process each subsystem */ + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + unsigned long bit = 1UL << i; + if (bit & added_bits) { + /* We're binding this subsystem to this hierarchy */ + BUG_ON(cgrp->subsys[i]); + BUG_ON(!dummytop->subsys[i]); + BUG_ON(dummytop->subsys[i]->cgroup != dummytop); + cgrp->subsys[i] = dummytop->subsys[i]; + cgrp->subsys[i]->cgroup = cgrp; + list_add(&ss->sibling, &root->subsys_list); + rcu_assign_pointer(ss->root, root); + if (ss->bind) + ss->bind(ss, cgrp); + + } else if (bit & removed_bits) { + /* We're removing this subsystem */ + BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); + BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + if (ss->bind) + ss->bind(ss, dummytop); + dummytop->subsys[i]->cgroup = dummytop; + cgrp->subsys[i] = NULL; + rcu_assign_pointer(subsys[i]->root, &rootnode); + list_del(&ss->sibling); + } else if (bit & final_bits) { + /* Subsystem state should already exist */ + BUG_ON(!cgrp->subsys[i]); + } else { + /* Subsystem state shouldn't exist */ + BUG_ON(cgrp->subsys[i]); + } + } + root->subsys_bits = root->actual_subsys_bits = final_bits; + synchronize_rcu(); + + return 0; +} + +static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; + struct cgroup_subsys *ss; + + mutex_lock(&cgroup_mutex); + for_each_subsys(root, ss) + seq_printf(seq, ",%s", ss->name); + if (test_bit(ROOT_NOPREFIX, &root->flags)) + seq_puts(seq, ",noprefix"); + if (strlen(root->release_agent_path)) + seq_printf(seq, ",release_agent=%s", root->release_agent_path); + mutex_unlock(&cgroup_mutex); + return 0; +} + +struct cgroup_sb_opts { + unsigned long subsys_bits; + unsigned long flags; + char *release_agent; +}; + +/* Convert a hierarchy specifier into a bitmask of subsystems and + * flags. */ +static int parse_cgroupfs_options(char *data, + struct cgroup_sb_opts *opts) +{ + char *token, *o = data ?: "all"; + + opts->subsys_bits = 0; + opts->flags = 0; + opts->release_agent = NULL; + + while ((token = strsep(&o, ",")) != NULL) { + if (!*token) + return -EINVAL; + if (!strcmp(token, "all")) { + opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1; + } else if (!strcmp(token, "noprefix")) { + set_bit(ROOT_NOPREFIX, &opts->flags); + } else if (!strncmp(token, "release_agent=", 14)) { + /* Specifying two release agents is forbidden */ + if (opts->release_agent) + return -EINVAL; + opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); + if (!opts->release_agent) + return -ENOMEM; + strncpy(opts->release_agent, token + 14, PATH_MAX - 1); + opts->release_agent[PATH_MAX - 1] = 0; + } else { + struct cgroup_subsys *ss; + int i; + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + ss = subsys[i]; + if (!strcmp(token, ss->name)) { + set_bit(i, &opts->subsys_bits); + break; + } + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; + } + } + + /* We can't have an empty hierarchy */ + if (!opts->subsys_bits) + return -EINVAL; + + return 0; +} + +static int cgroup_remount(struct super_block *sb, int *flags, char *data) +{ + int ret = 0; + struct cgroupfs_root *root = sb->s_fs_info; + struct cgroup *cgrp = &root->top_cgroup; + struct cgroup_sb_opts opts; + + mutex_lock(&cgrp->dentry->d_inode->i_mutex); + mutex_lock(&cgroup_mutex); + + /* See what subsystems are wanted */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + /* Don't allow flags to change at remount */ + if (opts.flags != root->flags) { + ret = -EINVAL; + goto out_unlock; + } + + ret = rebind_subsystems(root, opts.subsys_bits); + + /* (re)populate subsystem files */ + if (!ret) + cgroup_populate_dir(cgrp); + + if (opts.release_agent) + strcpy(root->release_agent_path, opts.release_agent); + out_unlock: + if (opts.release_agent) + kfree(opts.release_agent); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + return ret; +} + +static struct super_operations cgroup_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .show_options = cgroup_show_options, + .remount_fs = cgroup_remount, +}; + +static void init_cgroup_root(struct cgroupfs_root *root) +{ + struct cgroup *cgrp = &root->top_cgroup; + INIT_LIST_HEAD(&root->subsys_list); + INIT_LIST_HEAD(&root->root_list); + root->number_of_cgroups = 1; + cgrp->root = root; + cgrp->top_cgroup = cgrp; + INIT_LIST_HEAD(&cgrp->sibling); + INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->css_sets); + INIT_LIST_HEAD(&cgrp->release_list); +} + +static int cgroup_test_super(struct super_block *sb, void *data) +{ + struct cgroupfs_root *new = data; + struct cgroupfs_root *root = sb->s_fs_info; + + /* First check subsystems */ + if (new->subsys_bits != root->subsys_bits) + return 0; + + /* Next check flags */ + if (new->flags != root->flags) + return 0; + + return 1; +} + +static int cgroup_set_super(struct super_block *sb, void *data) +{ + int ret; + struct cgroupfs_root *root = data; + + ret = set_anon_super(sb, NULL); + if (ret) + return ret; + + sb->s_fs_info = root; + root->sb = sb; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = CGROUP_SUPER_MAGIC; + sb->s_op = &cgroup_ops; + + return 0; +} + +static int cgroup_get_rootdir(struct super_block *sb) +{ + struct inode *inode = + cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); + struct dentry *dentry; + + if (!inode) + return -ENOMEM; + + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inode->i_op = &cgroup_dir_inode_operations; + /* directories start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + dentry = d_alloc_root(inode); + if (!dentry) { + iput(inode); + return -ENOMEM; + } + sb->s_root = dentry; + return 0; +} + +static int cgroup_get_sb(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data, struct vfsmount *mnt) +{ + struct cgroup_sb_opts opts; + int ret = 0; + struct super_block *sb; + struct cgroupfs_root *root; + struct list_head tmp_cg_links, *l; + INIT_LIST_HEAD(&tmp_cg_links); + + /* First find the desired set of subsystems */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) { + if (opts.release_agent) + kfree(opts.release_agent); + return ret; + } + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return -ENOMEM; + + init_cgroup_root(root); + root->subsys_bits = opts.subsys_bits; + root->flags = opts.flags; + if (opts.release_agent) { + strcpy(root->release_agent_path, opts.release_agent); + kfree(opts.release_agent); + } + + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); + + if (IS_ERR(sb)) { + kfree(root); + return PTR_ERR(sb); + } + + if (sb->s_fs_info != root) { + /* Reusing an existing superblock */ + BUG_ON(sb->s_root == NULL); + kfree(root); + root = NULL; + } else { + /* New superblock */ + struct cgroup *cgrp = &root->top_cgroup; + struct inode *inode; + + BUG_ON(sb->s_root != NULL); + + ret = cgroup_get_rootdir(sb); + if (ret) + goto drop_new_super; + inode = sb->s_root->d_inode; + + mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_mutex); + + /* + * We're accessing css_set_count without locking + * css_set_lock here, but that's OK - it can only be + * increased by someone holding cgroup_lock, and + * that's us. The worst that can happen is that we + * have some link structures left over + */ + ret = allocate_cg_links(css_set_count, &tmp_cg_links); + if (ret) { + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + goto drop_new_super; + } + + ret = rebind_subsystems(root, root->subsys_bits); + if (ret == -EBUSY) { + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + goto drop_new_super; + } + + /* EBUSY should be the only error here */ + BUG_ON(ret); + + list_add(&root->root_list, &roots); + root_count++; + + sb->s_root->d_fsdata = &root->top_cgroup; + root->top_cgroup.dentry = sb->s_root; + + /* Link the top cgroup in this hierarchy into all + * the css_set objects */ + write_lock(&css_set_lock); + l = &init_css_set.list; + do { + struct css_set *cg; + struct cg_cgroup_link *link; + cg = list_entry(l, struct css_set, list); + BUG_ON(list_empty(&tmp_cg_links)); + link = list_entry(tmp_cg_links.next, + struct cg_cgroup_link, + cgrp_link_list); + list_del(&link->cgrp_link_list); + link->cg = cg; + list_add(&link->cgrp_link_list, + &root->top_cgroup.css_sets); + list_add(&link->cg_link_list, &cg->cg_links); + l = l->next; + } while (l != &init_css_set.list); + write_unlock(&css_set_lock); + + free_cg_links(&tmp_cg_links); + + BUG_ON(!list_empty(&cgrp->sibling)); + BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(root->number_of_cgroups != 1); + + cgroup_populate_dir(cgrp); + mutex_unlock(&inode->i_mutex); + mutex_unlock(&cgroup_mutex); + } + + return simple_set_mnt(mnt, sb); + + drop_new_super: + up_write(&sb->s_umount); + deactivate_super(sb); + free_cg_links(&tmp_cg_links); + return ret; +} + +static void cgroup_kill_sb(struct super_block *sb) { + struct cgroupfs_root *root = sb->s_fs_info; + struct cgroup *cgrp = &root->top_cgroup; + int ret; + + BUG_ON(!root); + + BUG_ON(root->number_of_cgroups != 1); + BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(!list_empty(&cgrp->sibling)); + + mutex_lock(&cgroup_mutex); + + /* Rebind all subsystems back to the default hierarchy */ + ret = rebind_subsystems(root, 0); + /* Shouldn't be able to fail ... */ + BUG_ON(ret); + + /* + * Release all the links from css_sets to this hierarchy's + * root cgroup + */ + write_lock(&css_set_lock); + while (!list_empty(&cgrp->css_sets)) { + struct cg_cgroup_link *link; + link = list_entry(cgrp->css_sets.next, + struct cg_cgroup_link, cgrp_link_list); + list_del(&link->cg_link_list); + list_del(&link->cgrp_link_list); + kfree(link); + } + write_unlock(&css_set_lock); + + if (!list_empty(&root->root_list)) { + list_del(&root->root_list); + root_count--; + } + mutex_unlock(&cgroup_mutex); + + kfree(root); + kill_litter_super(sb); +} + +static struct file_system_type cgroup_fs_type = { + .name = "cgroup", + .get_sb = cgroup_get_sb, + .kill_sb = cgroup_kill_sb, +}; + +static inline struct cgroup *__d_cgrp(struct dentry *dentry) +{ + return dentry->d_fsdata; +} + +static inline struct cftype *__d_cft(struct dentry *dentry) +{ + return dentry->d_fsdata; +} + +/* + * Called with cgroup_mutex held. Writes path of cgroup into buf. + * Returns 0 on success, -errno on error. + */ +int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) +{ + char *start; + + if (cgrp == dummytop) { + /* + * Inactive subsystems have no dentry for their root + * cgroup + */ + strcpy(buf, "/"); + return 0; + } + + start = buf + buflen; + + *--start = '\0'; + for (;;) { + int len = cgrp->dentry->d_name.len; + if ((start -= len) < buf) + return -ENAMETOOLONG; + memcpy(start, cgrp->dentry->d_name.name, len); + cgrp = cgrp->parent; + if (!cgrp) + break; + if (!cgrp->parent) + continue; + if (--start < buf) + return -ENAMETOOLONG; + *start = '/'; + } + memmove(buf, start, buf + buflen - start); + return 0; +} + +/* + * Return the first subsystem attached to a cgroup's hierarchy, and + * its subsystem id. + */ + +static void get_first_subsys(const struct cgroup *cgrp, + struct cgroup_subsys_state **css, int *subsys_id) +{ + const struct cgroupfs_root *root = cgrp->root; + const struct cgroup_subsys *test_ss; + BUG_ON(list_empty(&root->subsys_list)); + test_ss = list_entry(root->subsys_list.next, + struct cgroup_subsys, sibling); + if (css) { + *css = cgrp->subsys[test_ss->subsys_id]; + BUG_ON(!*css); + } + if (subsys_id) + *subsys_id = test_ss->subsys_id; +} + +/* + * Attach task 'tsk' to cgroup 'cgrp' + * + * Call holding cgroup_mutex. May take task_lock of + * the task 'pid' during call. + */ +static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) +{ + int retval = 0; + struct cgroup_subsys *ss; + struct cgroup *oldcgrp; + struct css_set *cg = tsk->cgroups; + struct css_set *newcg; + struct cgroupfs_root *root = cgrp->root; + int subsys_id; + + get_first_subsys(cgrp, NULL, &subsys_id); + + /* Nothing to do if the task is already in that cgroup */ + oldcgrp = task_cgroup(tsk, subsys_id); + if (cgrp == oldcgrp) + return 0; + + for_each_subsys(root, ss) { + if (ss->can_attach) { + retval = ss->can_attach(ss, cgrp, tsk); + if (retval) { + return retval; + } + } + } + + /* + * Locate or allocate a new css_set for this task, + * based on its final set of cgroups + */ + newcg = find_css_set(cg, cgrp); + if (!newcg) { + return -ENOMEM; + } + + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + task_unlock(tsk); + put_css_set(newcg); + return -ESRCH; + } + rcu_assign_pointer(tsk->cgroups, newcg); + task_unlock(tsk); + + /* Update the css_set linked lists if we're using them */ + write_lock(&css_set_lock); + if (!list_empty(&tsk->cg_list)) { + list_del(&tsk->cg_list); + list_add(&tsk->cg_list, &newcg->tasks); + } + write_unlock(&css_set_lock); + + for_each_subsys(root, ss) { + if (ss->attach) { + ss->attach(ss, cgrp, oldcgrp, tsk); + } + } + set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + synchronize_rcu(); + put_css_set(cg); + return 0; +} + +/* + * Attach task with pid 'pid' to cgroup 'cgrp'. Call with + * cgroup_mutex, may take task_lock of task + */ +static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) +{ + pid_t pid; + struct task_struct *tsk; + int ret; + + if (sscanf(pidbuf, "%d", &pid) != 1) + return -EIO; + + if (pid) { + rcu_read_lock(); + tsk = find_task_by_pid(pid); + if (!tsk || tsk->flags & PF_EXITING) { + rcu_read_unlock(); + return -ESRCH; + } + get_task_struct(tsk); + rcu_read_unlock(); + + if ((current->euid) && (current->euid != tsk->uid) + && (current->euid != tsk->suid)) { + put_task_struct(tsk); + return -EACCES; + } + } else { + tsk = current; + get_task_struct(tsk); + } + + ret = attach_task(cgrp, tsk); + put_task_struct(tsk); + return ret; +} + +/* The various types of files and directories in a cgroup file system */ + +enum cgroup_filetype { + FILE_ROOT, + FILE_DIR, + FILE_TASKLIST, + FILE_NOTIFY_ON_RELEASE, + FILE_RELEASABLE, + FILE_RELEASE_AGENT, +}; + +static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) +{ + char buffer[64]; + int retval = 0; + u64 val; + char *end; + + if (!nbytes) + return -EINVAL; + if (nbytes >= sizeof(buffer)) + return -E2BIG; + if (copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + + /* strip newline if necessary */ + if (nbytes && (buffer[nbytes-1] == '\n')) + buffer[nbytes-1] = 0; + val = simple_strtoull(buffer, &end, 0); + if (*end) + return -EINVAL; + + /* Pass to subsystem */ + retval = cft->write_uint(cgrp, cft, val); + if (!retval) + retval = nbytes; + return retval; +} + +static ssize_t cgroup_common_file_write(struct cgroup *cgrp, + struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) +{ + enum cgroup_filetype type = cft->private; + char *buffer; + int retval = 0; + + if (nbytes >= PATH_MAX) + return -E2BIG; + + /* +1 for nul-terminator */ + buffer = kmalloc(nbytes + 1, GFP_KERNEL); + if (buffer == NULL) + return -ENOMEM; + + if (copy_from_user(buffer, userbuf, nbytes)) { + retval = -EFAULT; + goto out1; + } + buffer[nbytes] = 0; /* nul-terminate */ + + mutex_lock(&cgroup_mutex); + + if (cgroup_is_removed(cgrp)) { + retval = -ENODEV; + goto out2; + } + + switch (type) { + case FILE_TASKLIST: + retval = attach_task_by_pid(cgrp, buffer); + break; + case FILE_NOTIFY_ON_RELEASE: + clear_bit(CGRP_RELEASABLE, &cgrp->flags); + if (simple_strtoul(buffer, NULL, 10) != 0) + set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + else + clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + break; + case FILE_RELEASE_AGENT: + { + struct cgroupfs_root *root = cgrp->root; + /* Strip trailing newline */ + if (nbytes && (buffer[nbytes-1] == '\n')) { + buffer[nbytes-1] = 0; + } + if (nbytes < sizeof(root->release_agent_path)) { + /* We never write anything other than '\0' + * into the last char of release_agent_path, + * so it always remains a NUL-terminated + * string */ + strncpy(root->release_agent_path, buffer, nbytes); + root->release_agent_path[nbytes] = 0; + } else { + retval = -ENOSPC; + } + break; + } + default: + retval = -EINVAL; + goto out2; + } + + if (retval == 0) + retval = nbytes; +out2: + mutex_unlock(&cgroup_mutex); +out1: + kfree(buffer); + return retval; +} + +static ssize_t cgroup_file_write(struct file *file, const char __user *buf, + size_t nbytes, loff_t *ppos) +{ + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + + if (!cft) + return -ENODEV; + if (cft->write) + return cft->write(cgrp, cft, file, buf, nbytes, ppos); + if (cft->write_uint) + return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos); + return -EINVAL; +} + +static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) +{ + char tmp[64]; + u64 val = cft->read_uint(cgrp, cft); + int len = sprintf(tmp, "%llu\n", (unsigned long long) val); + + return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); +} + +static ssize_t cgroup_common_file_read(struct cgroup *cgrp, + struct cftype *cft, + struct file *file, + char __user *buf, + size_t nbytes, loff_t *ppos) +{ + enum cgroup_filetype type = cft->private; + char *page; + ssize_t retval = 0; + char *s; + + if (!(page = (char *)__get_free_page(GFP_KERNEL))) + return -ENOMEM; + + s = page; + + switch (type) { + case FILE_RELEASE_AGENT: + { + struct cgroupfs_root *root; + size_t n; + mutex_lock(&cgroup_mutex); + root = cgrp->root; + n = strnlen(root->release_agent_path, + sizeof(root->release_agent_path)); + n = min(n, (size_t) PAGE_SIZE); + strncpy(s, root->release_agent_path, n); + mutex_unlock(&cgroup_mutex); + s += n; + break; + } + default: + retval = -EINVAL; + goto out; + } + *s++ = '\n'; + + retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); +out: + free_page((unsigned long)page); + return retval; +} + +static ssize_t cgroup_file_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + + if (!cft) + return -ENODEV; + + if (cft->read) + return cft->read(cgrp, cft, file, buf, nbytes, ppos); + if (cft->read_uint) + return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos); + return -EINVAL; +} + +static int cgroup_file_open(struct inode *inode, struct file *file) +{ + int err; + struct cftype *cft; + + err = generic_file_open(inode, file); + if (err) + return err; + + cft = __d_cft(file->f_dentry); + if (!cft) + return -ENODEV; + if (cft->open) + err = cft->open(inode, file); + else + err = 0; + + return err; +} + +static int cgroup_file_release(struct inode *inode, struct file *file) +{ + struct cftype *cft = __d_cft(file->f_dentry); + if (cft->release) + return cft->release(inode, file); + return 0; +} + +/* + * cgroup_rename - Only allow simple rename of directories in place. + */ +static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (!S_ISDIR(old_dentry->d_inode->i_mode)) + return -ENOTDIR; + if (new_dentry->d_inode) + return -EEXIST; + if (old_dir != new_dir) + return -EIO; + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + +static struct file_operations cgroup_file_operations = { + .read = cgroup_file_read, + .write = cgroup_file_write, + .llseek = generic_file_llseek, + .open = cgroup_file_open, + .release = cgroup_file_release, +}; + +static struct inode_operations cgroup_dir_inode_operations = { + .lookup = simple_lookup, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .rename = cgroup_rename, +}; + +static int cgroup_create_file(struct dentry *dentry, int mode, + struct super_block *sb) +{ + static struct dentry_operations cgroup_dops = { + .d_iput = cgroup_diput, + }; + + struct inode *inode; + + if (!dentry) + return -ENOENT; + if (dentry->d_inode) + return -EEXIST; + + inode = cgroup_new_inode(mode, sb); + if (!inode) + return -ENOMEM; + + if (S_ISDIR(mode)) { + inode->i_op = &cgroup_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + + /* start with the directory inode held, so that we can + * populate it without racing with another mkdir */ + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + } else if (S_ISREG(mode)) { + inode->i_size = 0; + inode->i_fop = &cgroup_file_operations; + } + dentry->d_op = &cgroup_dops; + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + return 0; +} + +/* + * cgroup_create_dir - create a directory for an object. + * cgrp: the cgroup we create the directory for. + * It must have a valid ->parent field + * And we are going to fill its ->dentry field. + * dentry: dentry of the new cgroup + * mode: mode to set on new directory. + */ +static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, + int mode) +{ + struct dentry *parent; + int error = 0; + + parent = cgrp->parent->dentry; + error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); + if (!error) { + dentry->d_fsdata = cgrp; + inc_nlink(parent->d_inode); + cgrp->dentry = dentry; + dget(dentry); + } + dput(dentry); + + return error; +} + +int cgroup_add_file(struct cgroup *cgrp, + struct cgroup_subsys *subsys, + const struct cftype *cft) +{ + struct dentry *dir = cgrp->dentry; + struct dentry *dentry; + int error; + + char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; + if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { + strcpy(name, subsys->name); + strcat(name, "."); + } + strcat(name, cft->name); + BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); + dentry = lookup_one_len(name, dir, strlen(name)); + if (!IS_ERR(dentry)) { + error = cgroup_create_file(dentry, 0644 | S_IFREG, + cgrp->root->sb); + if (!error) + dentry->d_fsdata = (void *)cft; + dput(dentry); + } else + error = PTR_ERR(dentry); + return error; +} + +int cgroup_add_files(struct cgroup *cgrp, + struct cgroup_subsys *subsys, + const struct cftype cft[], + int count) +{ + int i, err; + for (i = 0; i < count; i++) { + err = cgroup_add_file(cgrp, subsys, &cft[i]); + if (err) + return err; + } + return 0; +} + +/* Count the number of tasks in a cgroup. */ + +int cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct list_head *l; + + read_lock(&css_set_lock); + l = cgrp->css_sets.next; + while (l != &cgrp->css_sets) { + struct cg_cgroup_link *link = + list_entry(l, struct cg_cgroup_link, cgrp_link_list); + count += atomic_read(&link->cg->ref.refcount); + l = l->next; + } + read_unlock(&css_set_lock); + return count; +} + +/* + * Advance a list_head iterator. The iterator should be positioned at + * the start of a css_set + */ +static void cgroup_advance_iter(struct cgroup *cgrp, + struct cgroup_iter *it) +{ + struct list_head *l = it->cg_link; + struct cg_cgroup_link *link; + struct css_set *cg; + + /* Advance to the next non-empty css_set */ + do { + l = l->next; + if (l == &cgrp->css_sets) { + it->cg_link = NULL; + return; + } + link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); + cg = link->cg; + } while (list_empty(&cg->tasks)); + it->cg_link = l; + it->task = cg->tasks.next; +} + +void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) +{ + /* + * The first time anyone tries to iterate across a cgroup, + * we need to enable the list linking each css_set to its + * tasks, and fix up all existing tasks. + */ + if (!use_task_css_set_links) { + struct task_struct *p, *g; + write_lock(&css_set_lock); + use_task_css_set_links = 1; + do_each_thread(g, p) { + task_lock(p); + if (list_empty(&p->cg_list)) + list_add(&p->cg_list, &p->cgroups->tasks); + task_unlock(p); + } while_each_thread(g, p); + write_unlock(&css_set_lock); + } + read_lock(&css_set_lock); + it->cg_link = &cgrp->css_sets; + cgroup_advance_iter(cgrp, it); +} + +struct task_struct *cgroup_iter_next(struct cgroup *cgrp, + struct cgroup_iter *it) +{ + struct task_struct *res; + struct list_head *l = it->task; + + /* If the iterator cg is NULL, we have no tasks */ + if (!it->cg_link) + return NULL; + res = list_entry(l, struct task_struct, cg_list); + /* Advance iterator to find next entry */ + l = l->next; + if (l == &res->cgroups->tasks) { + /* We reached the end of this task list - move on to + * the next cg_cgroup_link */ + cgroup_advance_iter(cgrp, it); + } else { + it->task = l; + } + return res; +} + +void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) +{ + read_unlock(&css_set_lock); +} + +/* + * Stuff for reading the 'tasks' file. + * + * Reading this file can return large amounts of data if a cgroup has + * *lots* of attached tasks. So it may need several calls to read(), + * but we cannot guarantee that the information we produce is correct + * unless we produce it entirely atomically. + * + * Upon tasks file open(), a struct ctr_struct is allocated, that + * will have a pointer to an array (also allocated here). The struct + * ctr_struct * is stored in file->private_data. Its resources will + * be freed by release() when the file is closed. The array is used + * to sprintf the PIDs and then used by read(). + */ +struct ctr_struct { + char *buf; + int bufsz; +}; + +/* + * Load into 'pidarray' up to 'npids' of the tasks using cgroup + * 'cgrp'. Return actual number of pids loaded. No need to + * task_lock(p) when reading out p->cgroup, since we're in an RCU + * read section, so the css_set can't go away, and is + * immutable after creation. + */ +static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) +{ + int n = 0; + struct cgroup_iter it; + struct task_struct *tsk; + cgroup_iter_start(cgrp, &it); + while ((tsk = cgroup_iter_next(cgrp, &it))) { + if (unlikely(n == npids)) + break; + pidarray[n++] = task_pid_nr(tsk); + } + cgroup_iter_end(cgrp, &it); + return n; +} + +/** + * Build and fill cgroupstats so that taskstats can export it to user + * space. + * + * @stats: cgroupstats to fill information into + * @dentry: A dentry entry belonging to the cgroup for which stats have + * been requested. + */ +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) +{ + int ret = -EINVAL; + struct cgroup *cgrp; + struct cgroup_iter it; + struct task_struct *tsk; + /* + * Validate dentry by checking the superblock operations + */ + if (dentry->d_sb->s_op != &cgroup_ops) + goto err; + + ret = 0; + cgrp = dentry->d_fsdata; + rcu_read_lock(); + + cgroup_iter_start(cgrp, &it); + while ((tsk = cgroup_iter_next(cgrp, &it))) { + switch (tsk->state) { + case TASK_RUNNING: + stats->nr_running++; + break; + case TASK_INTERRUPTIBLE: + stats->nr_sleeping++; + break; + case TASK_UNINTERRUPTIBLE: + stats->nr_uninterruptible++; + break; + case TASK_STOPPED: + stats->nr_stopped++; + break; + default: + if (delayacct_is_task_waiting_on_io(tsk)) + stats->nr_io_wait++; + break; + } + } + cgroup_iter_end(cgrp, &it); + + rcu_read_unlock(); +err: + return ret; +} + +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +/* + * Convert array 'a' of 'npids' pid_t's to a string of newline separated + * decimal pids in 'buf'. Don't write more than 'sz' chars, but return + * count 'cnt' of how many chars would be written if buf were large enough. + */ +static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) +{ + int cnt = 0; + int i; + + for (i = 0; i < npids; i++) + cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); + return cnt; +} + +/* + * Handle an open on 'tasks' file. Prepare a buffer listing the + * process id's of tasks currently attached to the cgroup being opened. + * + * Does not require any specific cgroup mutexes, and does not take any. + */ +static int cgroup_tasks_open(struct inode *unused, struct file *file) +{ + struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct ctr_struct *ctr; + pid_t *pidarray; + int npids; + char c; + + if (!(file->f_mode & FMODE_READ)) + return 0; + + ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); + if (!ctr) + goto err0; + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + npids = cgroup_task_count(cgrp); + if (npids) { + pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); + if (!pidarray) + goto err1; + + npids = pid_array_load(pidarray, npids, cgrp); + sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); + + /* Call pid_array_to_buf() twice, first just to get bufsz */ + ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; + ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); + if (!ctr->buf) + goto err2; + ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); + + kfree(pidarray); + } else { + ctr->buf = 0; + ctr->bufsz = 0; + } + file->private_data = ctr; + return 0; + +err2: + kfree(pidarray); +err1: + kfree(ctr); +err0: + return -ENOMEM; +} + +static ssize_t cgroup_tasks_read(struct cgroup *cgrp, + struct cftype *cft, + struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + struct ctr_struct *ctr = file->private_data; + + return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); +} + +static int cgroup_tasks_release(struct inode *unused_inode, + struct file *file) +{ + struct ctr_struct *ctr; + + if (file->f_mode & FMODE_READ) { + ctr = file->private_data; + kfree(ctr->buf); + kfree(ctr); + } + return 0; +} + +static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, + struct cftype *cft) +{ + return notify_on_release(cgrp); +} + +static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft) +{ + return test_bit(CGRP_RELEASABLE, &cgrp->flags); +} + +/* + * for the common functions, 'private' gives the type of file + */ +static struct cftype files[] = { + { + .name = "tasks", + .open = cgroup_tasks_open, + .read = cgroup_tasks_read, + .write = cgroup_common_file_write, + .release = cgroup_tasks_release, + .private = FILE_TASKLIST, + }, + + { + .name = "notify_on_release", + .read_uint = cgroup_read_notify_on_release, + .write = cgroup_common_file_write, + .private = FILE_NOTIFY_ON_RELEASE, + }, + + { + .name = "releasable", + .read_uint = cgroup_read_releasable, + .private = FILE_RELEASABLE, + } +}; + +static struct cftype cft_release_agent = { + .name = "release_agent", + .read = cgroup_common_file_read, + .write = cgroup_common_file_write, + .private = FILE_RELEASE_AGENT, +}; + +static int cgroup_populate_dir(struct cgroup *cgrp) +{ + int err; + struct cgroup_subsys *ss; + + /* First clear out any existing files */ + cgroup_clear_directory(cgrp->dentry); + + err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); + if (err < 0) + return err; + + if (cgrp == cgrp->top_cgroup) { + if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) + return err; + } + + for_each_subsys(cgrp->root, ss) { + if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) + return err; + } + + return 0; +} + +static void init_cgroup_css(struct cgroup_subsys_state *css, + struct cgroup_subsys *ss, + struct cgroup *cgrp) +{ + css->cgroup = cgrp; + atomic_set(&css->refcnt, 0); + css->flags = 0; + if (cgrp == dummytop) + set_bit(CSS_ROOT, &css->flags); + BUG_ON(cgrp->subsys[ss->subsys_id]); + cgrp->subsys[ss->subsys_id] = css; +} + +/* + * cgroup_create - create a cgroup + * parent: cgroup that will be parent of the new cgroup. + * name: name of the new cgroup. Will be strcpy'ed. + * mode: mode to set on new inode + * + * Must be called with the mutex on the parent inode held + */ + +static long cgroup_create(struct cgroup *parent, struct dentry *dentry, + int mode) +{ + struct cgroup *cgrp; + struct cgroupfs_root *root = parent->root; + int err = 0; + struct cgroup_subsys *ss; + struct super_block *sb = root->sb; + + cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); + if (!cgrp) + return -ENOMEM; + + /* Grab a reference on the superblock so the hierarchy doesn't + * get deleted on unmount if there are child cgroups. This + * can be done outside cgroup_mutex, since the sb can't + * disappear while someone has an open control file on the + * fs */ + atomic_inc(&sb->s_active); + + mutex_lock(&cgroup_mutex); + + cgrp->flags = 0; + INIT_LIST_HEAD(&cgrp->sibling); + INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->css_sets); + INIT_LIST_HEAD(&cgrp->release_list); + + cgrp->parent = parent; + cgrp->root = parent->root; + cgrp->top_cgroup = parent->top_cgroup; + + for_each_subsys(root, ss) { + struct cgroup_subsys_state *css = ss->create(ss, cgrp); + if (IS_ERR(css)) { + err = PTR_ERR(css); + goto err_destroy; + } + init_cgroup_css(css, ss, cgrp); + } + + list_add(&cgrp->sibling, &cgrp->parent->children); + root->number_of_cgroups++; + + err = cgroup_create_dir(cgrp, dentry, mode); + if (err < 0) + goto err_remove; + + /* The cgroup directory was pre-locked for us */ + BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); + + err = cgroup_populate_dir(cgrp); + /* If err < 0, we have a half-filled directory - oh well ;) */ + + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + + return 0; + + err_remove: + + list_del(&cgrp->sibling); + root->number_of_cgroups--; + + err_destroy: + + for_each_subsys(root, ss) { + if (cgrp->subsys[ss->subsys_id]) + ss->destroy(ss, cgrp); + } + + mutex_unlock(&cgroup_mutex); + + /* Release the reference count that we took on the superblock */ + deactivate_super(sb); + + kfree(cgrp); + return err; +} + +static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct cgroup *c_parent = dentry->d_parent->d_fsdata; + + /* the vfs holds inode->i_mutex already */ + return cgroup_create(c_parent, dentry, mode | S_IFDIR); +} + +static inline int cgroup_has_css_refs(struct cgroup *cgrp) +{ + /* Check the reference count on each subsystem. Since we + * already established that there are no tasks in the + * cgroup, if the css refcount is also 0, then there should + * be no outstanding references, so the subsystem is safe to + * destroy. We scan across all subsystems rather than using + * the per-hierarchy linked list of mounted subsystems since + * we can be called via check_for_release() with no + * synchronization other than RCU, and the subsystem linked + * list isn't RCU-safe */ + int i; + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + struct cgroup_subsys_state *css; + /* Skip subsystems not in this hierarchy */ + if (ss->root != cgrp->root) + continue; + css = cgrp->subsys[ss->subsys_id]; + /* When called from check_for_release() it's possible + * that by this point the cgroup has been removed + * and the css deleted. But a false-positive doesn't + * matter, since it can only happen if the cgroup + * has been deleted and hence no longer needs the + * release agent to be called anyway. */ + if (css && atomic_read(&css->refcnt)) { + return 1; + } + } + return 0; +} + +static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +{ + struct cgroup *cgrp = dentry->d_fsdata; + struct dentry *d; + struct cgroup *parent; + struct cgroup_subsys *ss; + struct super_block *sb; + struct cgroupfs_root *root; + + /* the vfs holds both inode->i_mutex already */ + + mutex_lock(&cgroup_mutex); + if (atomic_read(&cgrp->count) != 0) { + mutex_unlock(&cgroup_mutex); + return -EBUSY; + } + if (!list_empty(&cgrp->children)) { + mutex_unlock(&cgroup_mutex); + return -EBUSY; + } + + parent = cgrp->parent; + root = cgrp->root; + sb = root->sb; + + if (cgroup_has_css_refs(cgrp)) { + mutex_unlock(&cgroup_mutex); + return -EBUSY; + } + + for_each_subsys(root, ss) { + if (cgrp->subsys[ss->subsys_id]) + ss->destroy(ss, cgrp); + } + + spin_lock(&release_list_lock); + set_bit(CGRP_REMOVED, &cgrp->flags); + if (!list_empty(&cgrp->release_list)) + list_del(&cgrp->release_list); + spin_unlock(&release_list_lock); + /* delete my sibling from parent->children */ + list_del(&cgrp->sibling); + spin_lock(&cgrp->dentry->d_lock); + d = dget(cgrp->dentry); + cgrp->dentry = NULL; + spin_unlock(&d->d_lock); + + cgroup_d_remove_dir(d); + dput(d); + root->number_of_cgroups--; + + set_bit(CGRP_RELEASABLE, &parent->flags); + check_for_release(parent); + + mutex_unlock(&cgroup_mutex); + /* Drop the active superblock reference that we took when we + * created the cgroup */ + deactivate_super(sb); + return 0; +} + +static void cgroup_init_subsys(struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + struct list_head *l; + printk(KERN_ERR "Initializing cgroup subsys %s\n", ss->name); + + /* Create the top cgroup state for this subsystem */ + ss->root = &rootnode; + css = ss->create(ss, dummytop); + /* We don't handle early failures gracefully */ + BUG_ON(IS_ERR(css)); + init_cgroup_css(css, ss, dummytop); + + /* Update all cgroup groups to contain a subsys + * pointer to this state - since the subsystem is + * newly registered, all tasks and hence all cgroup + * groups are in the subsystem's top cgroup. */ + write_lock(&css_set_lock); + l = &init_css_set.list; + do { + struct css_set *cg = + list_entry(l, struct css_set, list); + cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; + l = l->next; + } while (l != &init_css_set.list); + write_unlock(&css_set_lock); + + /* If this subsystem requested that it be notified with fork + * events, we should send it one now for every process in the + * system */ + if (ss->fork) { + struct task_struct *g, *p; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + ss->fork(ss, p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } + + need_forkexit_callback |= ss->fork || ss->exit; + + ss->active = 1; +} + +/** + * cgroup_init_early - initialize cgroups at system boot, and + * initialize any subsystems that request early init. + */ +int __init cgroup_init_early(void) +{ + int i; + kref_init(&init_css_set.ref); + kref_get(&init_css_set.ref); + INIT_LIST_HEAD(&init_css_set.list); + INIT_LIST_HEAD(&init_css_set.cg_links); + INIT_LIST_HEAD(&init_css_set.tasks); + css_set_count = 1; + init_cgroup_root(&rootnode); + list_add(&rootnode.root_list, &roots); + root_count = 1; + init_task.cgroups = &init_css_set; + + init_css_set_link.cg = &init_css_set; + list_add(&init_css_set_link.cgrp_link_list, + &rootnode.top_cgroup.css_sets); + list_add(&init_css_set_link.cg_link_list, + &init_css_set.cg_links); + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + + BUG_ON(!ss->name); + BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); + BUG_ON(!ss->create); + BUG_ON(!ss->destroy); + if (ss->subsys_id != i) { + printk(KERN_ERR "Subsys %s id == %d\n", + ss->name, ss->subsys_id); + BUG(); + } + + if (ss->early_init) + cgroup_init_subsys(ss); + } + return 0; +} + +/** + * cgroup_init - register cgroup filesystem and /proc file, and + * initialize any subsystems that didn't request early init. + */ +int __init cgroup_init(void) +{ + int err; + int i; + struct proc_dir_entry *entry; + + err = bdi_init(&cgroup_backing_dev_info); + if (err) + return err; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (!ss->early_init) + cgroup_init_subsys(ss); + } + + err = register_filesystem(&cgroup_fs_type); + if (err < 0) + goto out; + + entry = create_proc_entry("cgroups", 0, NULL); + if (entry) + entry->proc_fops = &proc_cgroupstats_operations; + +out: + if (err) + bdi_destroy(&cgroup_backing_dev_info); + + return err; +} + +/* + * proc_cgroup_show() + * - Print task's cgroup paths into seq_file, one line for each hierarchy + * - Used for /proc/<pid>/cgroup. + * - No need to task_lock(tsk) on this tsk->cgroup reference, as it + * doesn't really matter if tsk->cgroup changes after we read it, + * and we take cgroup_mutex, keeping attach_task() from changing it + * anyway. No need to check that tsk->cgroup != NULL, thanks to + * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks + * cgroup to top_cgroup. + */ + +/* TODO: Use a proper seq_file iterator */ +static int proc_cgroup_show(struct seq_file *m, void *v) +{ + struct pid *pid; + struct task_struct *tsk; + char *buf; + int retval; + struct cgroupfs_root *root; + + retval = -ENOMEM; + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto out; + + retval = -ESRCH; + pid = m->private; + tsk = get_pid_task(pid, PIDTYPE_PID); + if (!tsk) + goto out_free; + + retval = 0; + + mutex_lock(&cgroup_mutex); + + for_each_root(root) { + struct cgroup_subsys *ss; + struct cgroup *cgrp; + int subsys_id; + int count = 0; + + /* Skip this hierarchy if it has no active subsystems */ + if (!root->actual_subsys_bits) + continue; + for_each_subsys(root, ss) + seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + seq_putc(m, ':'); + get_first_subsys(&root->top_cgroup, NULL, &subsys_id); + cgrp = task_cgroup(tsk, subsys_id); + retval = cgroup_path(cgrp, buf, PAGE_SIZE); + if (retval < 0) + goto out_unlock; + seq_puts(m, buf); + seq_putc(m, '\n'); + } + +out_unlock: + mutex_unlock(&cgroup_mutex); + put_task_struct(tsk); +out_free: + kfree(buf); +out: + return retval; +} + +static int cgroup_open(struct inode *inode, struct file *file) +{ + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cgroup_show, pid); +} + +struct file_operations proc_cgroup_operations = { + .open = cgroup_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* Display information about each subsystem and each hierarchy */ +static int proc_cgroupstats_show(struct seq_file *m, void *v) +{ + int i; + struct cgroupfs_root *root; + + seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n"); + mutex_lock(&cgroup_mutex); + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + seq_printf(m, "%s\t%lu\t%d\n", + ss->name, ss->root->subsys_bits, + ss->root->number_of_cgroups); + } + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroupstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_cgroupstats_show, 0); +} + +static struct file_operations proc_cgroupstats_operations = { + .open = cgroupstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * cgroup_fork - attach newly forked task to its parents cgroup. + * @tsk: pointer to task_struct of forking parent process. + * + * Description: A task inherits its parent's cgroup at fork(). + * + * A pointer to the shared css_set was automatically copied in + * fork.c by dup_task_struct(). However, we ignore that copy, since + * it was not made under the protection of RCU or cgroup_mutex, so + * might no longer be a valid cgroup pointer. attach_task() might + * have already changed current->cgroups, allowing the previously + * referenced cgroup group to be removed and freed. + * + * At the point that cgroup_fork() is called, 'current' is the parent + * task, and the passed argument 'child' points to the child task. + */ +void cgroup_fork(struct task_struct *child) +{ + task_lock(current); + child->cgroups = current->cgroups; + get_css_set(child->cgroups); + task_unlock(current); + INIT_LIST_HEAD(&child->cg_list); +} + +/** + * cgroup_fork_callbacks - called on a new task very soon before + * adding it to the tasklist. No need to take any locks since no-one + * can be operating on this task + */ +void cgroup_fork_callbacks(struct task_struct *child) +{ + if (need_forkexit_callback) { + int i; + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->fork) + ss->fork(ss, child); + } + } +} + +/** + * cgroup_post_fork - called on a new task after adding it to the + * task list. Adds the task to the list running through its css_set + * if necessary. Has to be after the task is visible on the task list + * in case we race with the first call to cgroup_iter_start() - to + * guarantee that the new task ends up on its list. */ +void cgroup_post_fork(struct task_struct *child) +{ + if (use_task_css_set_links) { + write_lock(&css_set_lock); + if (list_empty(&child->cg_list)) + list_add(&child->cg_list, &child->cgroups->tasks); + write_unlock(&css_set_lock); + } +} +/** + * cgroup_exit - detach cgroup from exiting task + * @tsk: pointer to task_struct of exiting process + * + * Description: Detach cgroup from @tsk and release it. + * + * Note that cgroups marked notify_on_release force every task in + * them to take the global cgroup_mutex mutex when exiting. + * This could impact scaling on very large systems. Be reluctant to + * use notify_on_release cgroups where very high task exit scaling + * is required on large systems. + * + * the_top_cgroup_hack: + * + * Set the exiting tasks cgroup to the root cgroup (top_cgroup). + * + * We call cgroup_exit() while the task is still competent to + * handle notify_on_release(), then leave the task attached to the + * root cgroup in each hierarchy for the remainder of its exit. + * + * To do this properly, we would increment the reference count on + * top_cgroup, and near the very end of the kernel/exit.c do_exit() + * code we would add a second cgroup function call, to drop that + * reference. This would just create an unnecessary hot spot on + * the top_cgroup reference count, to no avail. + * + * Normally, holding a reference to a cgroup without bumping its + * count is unsafe. The cgroup could go away, or someone could + * attach us to a different cgroup, decrementing the count on + * the first cgroup that we never incremented. But in this case, + * top_cgroup isn't going away, and either task has PF_EXITING set, + * which wards off any attach_task() attempts, or task is a failed + * fork, never visible to attach_task. + * + */ +void cgroup_exit(struct task_struct *tsk, int run_callbacks) +{ + int i; + struct css_set *cg; + + if (run_callbacks && need_forkexit_callback) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->exit) + ss->exit(ss, tsk); + } + } + + /* + * Unlink from the css_set task list if necessary. + * Optimistically check cg_list before taking + * css_set_lock + */ + if (!list_empty(&tsk->cg_list)) { + write_lock(&css_set_lock); + if (!list_empty(&tsk->cg_list)) + list_del(&tsk->cg_list); + write_unlock(&css_set_lock); + } + + /* Reassign the task to the init_css_set. */ + task_lock(tsk); + cg = tsk->cgroups; + tsk->cgroups = &init_css_set; + task_unlock(tsk); + if (cg) + put_css_set_taskexit(cg); +} + +/** + * cgroup_clone - duplicate the current cgroup in the hierarchy + * that the given subsystem is attached to, and move this task into + * the new child + */ +int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) +{ + struct dentry *dentry; + int ret = 0; + char nodename[MAX_CGROUP_TYPE_NAMELEN]; + struct cgroup *parent, *child; + struct inode *inode; + struct css_set *cg; + struct cgroupfs_root *root; + struct cgroup_subsys *ss; + + /* We shouldn't be called by an unregistered subsystem */ + BUG_ON(!subsys->active); + + /* First figure out what hierarchy and cgroup we're dealing + * with, and pin them so we can drop cgroup_mutex */ + mutex_lock(&cgroup_mutex); + again: + root = subsys->root; + if (root == &rootnode) { + printk(KERN_INFO + "Not cloning cgroup for unused subsystem %s\n", + subsys->name); + mutex_unlock(&cgroup_mutex); + return 0; + } + cg = tsk->cgroups; + parent = task_cgroup(tsk, subsys->subsys_id); + + snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid); + + /* Pin the hierarchy */ + atomic_inc(&parent->root->sb->s_active); + + /* Keep the cgroup alive */ + get_css_set(cg); + mutex_unlock(&cgroup_mutex); + + /* Now do the VFS work to create a cgroup */ + inode = parent->dentry->d_inode; + + /* Hold the parent directory mutex across this operation to + * stop anyone else deleting the new cgroup */ + mutex_lock(&inode->i_mutex); + dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); + if (IS_ERR(dentry)) { + printk(KERN_INFO + "Couldn't allocate dentry for %s: %ld\n", nodename, + PTR_ERR(dentry)); + ret = PTR_ERR(dentry); + goto out_release; + } + + /* Create the cgroup directory, which also creates the cgroup */ + ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); + child = __d_cgrp(dentry); + dput(dentry); + if (ret) { + printk(KERN_INFO + "Failed to create cgroup %s: %d\n", nodename, + ret); + goto out_release; + } + + if (!child) { + printk(KERN_INFO + "Couldn't find new cgroup %s\n", nodename); + ret = -ENOMEM; + goto out_release; + } + + /* The cgroup now exists. Retake cgroup_mutex and check + * that we're still in the same state that we thought we + * were. */ + mutex_lock(&cgroup_mutex); + if ((root != subsys->root) || + (parent != task_cgroup(tsk, subsys->subsys_id))) { + /* Aargh, we raced ... */ + mutex_unlock(&inode->i_mutex); + put_css_set(cg); + + deactivate_super(parent->root->sb); + /* The cgroup is still accessible in the VFS, but + * we're not going to try to rmdir() it at this + * point. */ + printk(KERN_INFO + "Race in cgroup_clone() - leaking cgroup %s\n", + nodename); + goto again; + } + + /* do any required auto-setup */ + for_each_subsys(root, ss) { + if (ss->post_clone) + ss->post_clone(ss, child); + } + + /* All seems fine. Finish by moving the task into the new cgroup */ + ret = attach_task(child, tsk); + mutex_unlock(&cgroup_mutex); + + out_release: + mutex_unlock(&inode->i_mutex); + + mutex_lock(&cgroup_mutex); + put_css_set(cg); + mutex_unlock(&cgroup_mutex); + deactivate_super(parent->root->sb); + return ret; +} + +/* + * See if "cgrp" is a descendant of the current task's cgroup in + * the appropriate hierarchy + * + * If we are sending in dummytop, then presumably we are creating + * the top cgroup in the subsystem. + * + * Called only by the ns (nsproxy) cgroup. + */ +int cgroup_is_descendant(const struct cgroup *cgrp) +{ + int ret; + struct cgroup *target; + int subsys_id; + + if (cgrp == dummytop) + return 1; + + get_first_subsys(cgrp, NULL, &subsys_id); + target = task_cgroup(current, subsys_id); + while (cgrp != target && cgrp!= cgrp->top_cgroup) + cgrp = cgrp->parent; + ret = (cgrp == target); + return ret; +} + +static void check_for_release(struct cgroup *cgrp) +{ + /* All of these checks rely on RCU to keep the cgroup + * structure alive */ + if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) + && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { + /* Control Group is currently removeable. If it's not + * already queued for a userspace notification, queue + * it now */ + int need_schedule_work = 0; + spin_lock(&release_list_lock); + if (!cgroup_is_removed(cgrp) && + list_empty(&cgrp->release_list)) { + list_add(&cgrp->release_list, &release_list); + need_schedule_work = 1; + } + spin_unlock(&release_list_lock); + if (need_schedule_work) + schedule_work(&release_agent_work); + } +} + +void __css_put(struct cgroup_subsys_state *css) +{ + struct cgroup *cgrp = css->cgroup; + rcu_read_lock(); + if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } + rcu_read_unlock(); +} + +/* + * Notify userspace when a cgroup is released, by running the + * configured release agent with the name of the cgroup (path + * relative to the root of cgroup file system) as the argument. + * + * Most likely, this user command will try to rmdir this cgroup. + * + * This races with the possibility that some other task will be + * attached to this cgroup before it is removed, or that some other + * user task will 'mkdir' a child cgroup of this cgroup. That's ok. + * The presumed 'rmdir' will fail quietly if this cgroup is no longer + * unused, and this cgroup will be reprieved from its death sentence, + * to continue to serve a useful existence. Next time it's released, + * we will get notified again, if it still has 'notify_on_release' set. + * + * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which + * means only wait until the task is successfully execve()'d. The + * separate release agent task is forked by call_usermodehelper(), + * then control in this thread returns here, without waiting for the + * release agent task. We don't bother to wait because the caller of + * this routine has no use for the exit status of the release agent + * task, so no sense holding our caller up for that. + * + */ + +static void cgroup_release_agent(struct work_struct *work) +{ + BUG_ON(work != &release_agent_work); + mutex_lock(&cgroup_mutex); + spin_lock(&release_list_lock); + while (!list_empty(&release_list)) { + char *argv[3], *envp[3]; + int i; + char *pathbuf; + struct cgroup *cgrp = list_entry(release_list.next, + struct cgroup, + release_list); + list_del_init(&cgrp->release_list); + spin_unlock(&release_list_lock); + pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!pathbuf) { + spin_lock(&release_list_lock); + continue; + } + + if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) { + kfree(pathbuf); + spin_lock(&release_list_lock); + continue; + } + + i = 0; + argv[i++] = cgrp->root->release_agent_path; + argv[i++] = (char *)pathbuf; + argv[i] = NULL; + + i = 0; + /* minimal command environment */ + envp[i++] = "HOME=/"; + envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[i] = NULL; + + /* Drop the lock while we invoke the usermode helper, + * since the exec could involve hitting disk and hence + * be a slow process */ + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + kfree(pathbuf); + mutex_lock(&cgroup_mutex); + spin_lock(&release_list_lock); + } + spin_unlock(&release_list_lock); + mutex_unlock(&cgroup_mutex); +} diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c new file mode 100644 index 000000000000..37301e877cb0 --- /dev/null +++ b/kernel/cgroup_debug.c @@ -0,0 +1,97 @@ +/* + * kernel/ccontainer_debug.c - Example cgroup subsystem that + * exposes debug info + * + * Copyright (C) Google Inc, 2007 + * + * Developed by Paul Menage (menage@google.com) + * + */ + +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/rcupdate.h> + +#include <asm/atomic.h> + +static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + kfree(cont->subsys[debug_subsys_id]); +} + +static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) +{ + return atomic_read(&cont->count); +} + +static u64 taskcount_read(struct cgroup *cont, struct cftype *cft) +{ + u64 count; + + cgroup_lock(); + count = cgroup_task_count(cont); + cgroup_unlock(); + return count; +} + +static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +{ + return (u64)(long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup *cont, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(¤t->cgroups->ref.refcount); + rcu_read_unlock(); + return count; +} + +static struct cftype files[] = { + { + .name = "cgroup_refcount", + .read_uint = cgroup_refcount_read, + }, + { + .name = "taskcount", + .read_uint = taskcount_read, + }, + + { + .name = "current_css_set", + .read_uint = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_uint = current_css_set_refcount_read, + }, +}; + +static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); +} + +struct cgroup_subsys debug_subsys = { + .name = "debug", + .create = debug_create, + .destroy = debug_destroy, + .populate = debug_populate, + .subsys_id = debug_subsys_id, +}; diff --git a/kernel/cpu.c b/kernel/cpu.c index a21f71af9d81..6b3a0c15144f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -98,7 +98,8 @@ static inline void check_for_tasks(int cpu) !cputime_eq(p->stime, cputime_zero))) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ (state = %ld, flags = %x) \n", - p->comm, p->pid, cpu, p->state, p->flags); + p->comm, task_pid_nr(p), cpu, + p->state, p->flags); } write_unlock_irq(&tasklist_lock); } @@ -264,6 +265,15 @@ out_notify: int __cpuinit cpu_up(unsigned int cpu) { int err = 0; + if (!cpu_isset(cpu, cpu_possible_map)) { + printk(KERN_ERR "can't online cpu %d because it is not " + "configured as may-hotadd at boot time\n", cpu); +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390) + printk(KERN_ERR "please check additional_cpus= boot " + "parameter\n"); +#endif + return -EINVAL; + } mutex_lock(&cpu_add_remove_lock); if (cpu_hotplug_disabled) diff --git a/kernel/cpu_acct.c b/kernel/cpu_acct.c new file mode 100644 index 000000000000..731e47e7f164 --- /dev/null +++ b/kernel/cpu_acct.c @@ -0,0 +1,186 @@ +/* + * kernel/cpu_acct.c - CPU accounting cgroup subsystem + * + * Copyright (C) Google Inc, 2006 + * + * Developed by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com) + * + */ + +/* + * Example cgroup subsystem for reporting total CPU usage of tasks in a + * cgroup, along with percentage load over a time interval + */ + +#include <linux/module.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/rcupdate.h> + +#include <asm/div64.h> + +struct cpuacct { + struct cgroup_subsys_state css; + spinlock_t lock; + /* total time used by this class */ + cputime64_t time; + + /* time when next load calculation occurs */ + u64 next_interval_check; + + /* time used in current period */ + cputime64_t current_interval_time; + + /* time used in last period */ + cputime64_t last_interval_time; +}; + +struct cgroup_subsys cpuacct_subsys; + +static inline struct cpuacct *cgroup_ca(struct cgroup *cont) +{ + return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *task_ca(struct task_struct *task) +{ + return container_of(task_subsys_state(task, cpuacct_subsys_id), + struct cpuacct, css); +} + +#define INTERVAL (HZ * 10) + +static inline u64 next_interval_boundary(u64 now) +{ + /* calculate the next interval boundary beyond the + * current time */ + do_div(now, INTERVAL); + return (now + 1) * INTERVAL; +} + +static struct cgroup_subsys_state *cpuacct_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + + if (!ca) + return ERR_PTR(-ENOMEM); + spin_lock_init(&ca->lock); + ca->next_interval_check = next_interval_boundary(get_jiffies_64()); + return &ca->css; +} + +static void cpuacct_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + kfree(cgroup_ca(cont)); +} + +/* Lazily update the load calculation if necessary. Called with ca locked */ +static void cpuusage_update(struct cpuacct *ca) +{ + u64 now = get_jiffies_64(); + + /* If we're not due for an update, return */ + if (ca->next_interval_check > now) + return; + + if (ca->next_interval_check <= (now - INTERVAL)) { + /* If it's been more than an interval since the last + * check, then catch up - the last interval must have + * been zero load */ + ca->last_interval_time = 0; + ca->next_interval_check = next_interval_boundary(now); + } else { + /* If a steal takes the last interval time negative, + * then we just ignore it */ + if ((s64)ca->current_interval_time > 0) + ca->last_interval_time = ca->current_interval_time; + else + ca->last_interval_time = 0; + ca->next_interval_check += INTERVAL; + } + ca->current_interval_time = 0; +} + +static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cont); + u64 time; + + spin_lock_irq(&ca->lock); + cpuusage_update(ca); + time = cputime64_to_jiffies64(ca->time); + spin_unlock_irq(&ca->lock); + + /* Convert 64-bit jiffies to seconds */ + time *= 1000; + do_div(time, HZ); + return time; +} + +static u64 load_read(struct cgroup *cont, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cont); + u64 time; + + /* Find the time used in the previous interval */ + spin_lock_irq(&ca->lock); + cpuusage_update(ca); + time = cputime64_to_jiffies64(ca->last_interval_time); + spin_unlock_irq(&ca->lock); + + /* Convert time to a percentage, to give the load in the + * previous period */ + time *= 100; + do_div(time, INTERVAL); + + return time; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_uint = cpuusage_read, + }, + { + .name = "load", + .read_uint = load_read, + } +}; + +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); +} + +void cpuacct_charge(struct task_struct *task, cputime_t cputime) +{ + + struct cpuacct *ca; + unsigned long flags; + + if (!cpuacct_subsys.active) + return; + rcu_read_lock(); + ca = task_ca(task); + if (ca) { + spin_lock_irqsave(&ca->lock, flags); + cpuusage_update(ca); + ca->time = cputime64_add(ca->time, cputime); + ca->current_interval_time = + cputime64_add(ca->current_interval_time, cputime); + spin_unlock_irqrestore(&ca->lock, flags); + } + rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .create = cpuacct_create, + .destroy = cpuacct_destroy, + .populate = cpuacct_populate, + .subsys_id = cpuacct_subsys_id, +}; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64950fa5d321..50f5dc463688 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -4,7 +4,8 @@ * Processor and Memory placement constraints for sets of tasks. * * Copyright (C) 2003 BULL SA. - * Copyright (C) 2004-2006 Silicon Graphics, Inc. + * Copyright (C) 2004-2007 Silicon Graphics, Inc. + * Copyright (C) 2006 Google, Inc * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel @@ -12,6 +13,7 @@ * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. + * 2006 Rework by Paul Menage to use generic cgroups * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux @@ -36,6 +38,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/pagemap.h> +#include <linux/prio_heap.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> @@ -52,8 +55,7 @@ #include <asm/uaccess.h> #include <asm/atomic.h> #include <linux/mutex.h> - -#define CPUSET_SUPER_MAGIC 0x27e0eb +#include <linux/kfifo.h> /* * Tracks how many cpusets are currently defined in system. @@ -62,6 +64,10 @@ */ int number_of_cpusets __read_mostly; +/* Retrieve the cpuset from a cgroup */ +struct cgroup_subsys cpuset_subsys; +struct cpuset; + /* See "Frequency meter" comments, below. */ struct fmeter { @@ -72,24 +78,13 @@ struct fmeter { }; struct cpuset { + struct cgroup_subsys_state css; + unsigned long flags; /* "unsigned long" so bitops work */ cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ - /* - * Count is atomic so can incr (fork) or decr (exit) without a lock. - */ - atomic_t count; /* count tasks using this cpuset */ - - /* - * We link our 'sibling' struct into our parents 'children'. - * Our children link their 'sibling' into our 'children'. - */ - struct list_head sibling; /* my parents children */ - struct list_head children; /* my children */ - struct cpuset *parent; /* my parent */ - struct dentry *dentry; /* cpuset fs entry */ /* * Copy of global cpuset_mems_generation as of the most @@ -98,15 +93,32 @@ struct cpuset { int mems_generation; struct fmeter fmeter; /* memory_pressure filter */ + + /* partition number for rebuild_sched_domains() */ + int pn; }; +/* Retrieve the cpuset for a cgroup */ +static inline struct cpuset *cgroup_cs(struct cgroup *cont) +{ + return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), + struct cpuset, css); +} + +/* Retrieve the cpuset for a task */ +static inline struct cpuset *task_cs(struct task_struct *task) +{ + return container_of(task_subsys_state(task, cpuset_subsys_id), + struct cpuset, css); +} + + /* bits in struct cpuset flags field */ typedef enum { CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEMORY_MIGRATE, - CS_REMOVED, - CS_NOTIFY_ON_RELEASE, + CS_SCHED_LOAD_BALANCE, CS_SPREAD_PAGE, CS_SPREAD_SLAB, } cpuset_flagbits_t; @@ -122,14 +134,9 @@ static inline int is_mem_exclusive(const struct cpuset *cs) return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); } -static inline int is_removed(const struct cpuset *cs) +static inline int is_sched_load_balance(const struct cpuset *cs) { - return test_bit(CS_REMOVED, &cs->flags); -} - -static inline int notify_on_release(const struct cpuset *cs) -{ - return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); + return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } static inline int is_memory_migrate(const struct cpuset *cs) @@ -172,14 +179,8 @@ static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), .cpus_allowed = CPU_MASK_ALL, .mems_allowed = NODE_MASK_ALL, - .count = ATOMIC_INIT(0), - .sibling = LIST_HEAD_INIT(top_cpuset.sibling), - .children = LIST_HEAD_INIT(top_cpuset.children), }; -static struct vfsmount *cpuset_mount; -static struct super_block *cpuset_sb; - /* * We have two global cpuset mutexes below. They can nest. * It is ok to first take manage_mutex, then nest callback_mutex. We also @@ -263,297 +264,33 @@ static struct super_block *cpuset_sb; * the routine cpuset_update_task_memory_state(). */ -static DEFINE_MUTEX(manage_mutex); static DEFINE_MUTEX(callback_mutex); -/* - * A couple of forward declarations required, due to cyclic reference loop: - * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file - * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir. - */ - -static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode); -static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry); - -static struct backing_dev_info cpuset_backing_dev_info = { - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, -}; - -static struct inode *cpuset_new_inode(mode_t mode) -{ - struct inode *inode = new_inode(cpuset_sb); - - if (inode) { - inode->i_mode = mode; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; - inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info; - } - return inode; -} - -static void cpuset_diput(struct dentry *dentry, struct inode *inode) -{ - /* is dentry a directory ? if so, kfree() associated cpuset */ - if (S_ISDIR(inode->i_mode)) { - struct cpuset *cs = dentry->d_fsdata; - BUG_ON(!(is_removed(cs))); - kfree(cs); - } - iput(inode); -} - -static struct dentry_operations cpuset_dops = { - .d_iput = cpuset_diput, -}; - -static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name) -{ - struct dentry *d = lookup_one_len(name, parent, strlen(name)); - if (!IS_ERR(d)) - d->d_op = &cpuset_dops; - return d; -} - -static void remove_dir(struct dentry *d) -{ - struct dentry *parent = dget(d->d_parent); - - d_delete(d); - simple_rmdir(parent->d_inode, d); - dput(parent); -} - -/* - * NOTE : the dentry must have been dget()'ed - */ -static void cpuset_d_remove_dir(struct dentry *dentry) -{ - struct list_head *node; - - spin_lock(&dcache_lock); - node = dentry->d_subdirs.next; - while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_u.d_child); - list_del_init(node); - if (d->d_inode) { - d = dget_locked(d); - spin_unlock(&dcache_lock); - d_delete(d); - simple_unlink(dentry->d_inode, d); - dput(d); - spin_lock(&dcache_lock); - } - node = dentry->d_subdirs.next; - } - list_del_init(&dentry->d_u.d_child); - spin_unlock(&dcache_lock); - remove_dir(dentry); -} - -static struct super_operations cpuset_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, -}; - -static int cpuset_fill_super(struct super_block *sb, void *unused_data, - int unused_silent) -{ - struct inode *inode; - struct dentry *root; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = CPUSET_SUPER_MAGIC; - sb->s_op = &cpuset_ops; - cpuset_sb = sb; - - inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR); - if (inode) { - inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - /* directories start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - } else { - return -ENOMEM; - } - - root = d_alloc_root(inode); - if (!root) { - iput(inode); - return -ENOMEM; - } - sb->s_root = root; - return 0; -} - +/* This is ugly, but preserves the userspace API for existing cpuset + * users. If someone tries to mount the "cpuset" filesystem, we + * silently switch it to mount "cgroup" instead */ static int cpuset_get_sb(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); + struct file_system_type *cgroup_fs = get_fs_type("cgroup"); + int ret = -ENODEV; + if (cgroup_fs) { + char mountopts[] = + "cpuset,noprefix," + "release_agent=/sbin/cpuset_release_agent"; + ret = cgroup_fs->get_sb(cgroup_fs, flags, + unused_dev_name, mountopts, mnt); + put_filesystem(cgroup_fs); + } + return ret; } static struct file_system_type cpuset_fs_type = { .name = "cpuset", .get_sb = cpuset_get_sb, - .kill_sb = kill_litter_super, }; -/* struct cftype: - * - * The files in the cpuset filesystem mostly have a very simple read/write - * handling, some common function will take care of it. Nevertheless some cases - * (read tasks) are special and therefore I define this structure for every - * kind of file. - * - * - * When reading/writing to a file: - * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata - * - the 'cftype' of the file is file->f_path.dentry->d_fsdata - */ - -struct cftype { - char *name; - int private; - int (*open) (struct inode *inode, struct file *file); - ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes, - loff_t *ppos); - int (*write) (struct file *file, const char __user *buf, size_t nbytes, - loff_t *ppos); - int (*release) (struct inode *inode, struct file *file); -}; - -static inline struct cpuset *__d_cs(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -/* - * Call with manage_mutex held. Writes path of cpuset into buf. - * Returns 0 on success, -errno on error. - */ - -static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) -{ - char *start; - - start = buf + buflen; - - *--start = '\0'; - for (;;) { - int len = cs->dentry->d_name.len; - if ((start -= len) < buf) - return -ENAMETOOLONG; - memcpy(start, cs->dentry->d_name.name, len); - cs = cs->parent; - if (!cs) - break; - if (!cs->parent) - continue; - if (--start < buf) - return -ENAMETOOLONG; - *start = '/'; - } - memmove(buf, start, buf + buflen - start); - return 0; -} - -/* - * Notify userspace when a cpuset is released, by running - * /sbin/cpuset_release_agent with the name of the cpuset (path - * relative to the root of cpuset file system) as the argument. - * - * Most likely, this user command will try to rmdir this cpuset. - * - * This races with the possibility that some other task will be - * attached to this cpuset before it is removed, or that some other - * user task will 'mkdir' a child cpuset of this cpuset. That's ok. - * The presumed 'rmdir' will fail quietly if this cpuset is no longer - * unused, and this cpuset will be reprieved from its death sentence, - * to continue to serve a useful existence. Next time it's released, - * we will get notified again, if it still has 'notify_on_release' set. - * - * The final arg to call_usermodehelper() is 0, which means don't - * wait. The separate /sbin/cpuset_release_agent task is forked by - * call_usermodehelper(), then control in this thread returns here, - * without waiting for the release agent task. We don't bother to - * wait because the caller of this routine has no use for the exit - * status of the /sbin/cpuset_release_agent task, so no sense holding - * our caller up for that. - * - * When we had only one cpuset mutex, we had to call this - * without holding it, to avoid deadlock when call_usermodehelper() - * allocated memory. With two locks, we could now call this while - * holding manage_mutex, but we still don't, so as to minimize - * the time manage_mutex is held. - */ - -static void cpuset_release_agent(const char *pathbuf) -{ - char *argv[3], *envp[3]; - int i; - - if (!pathbuf) - return; - - i = 0; - argv[i++] = "/sbin/cpuset_release_agent"; - argv[i++] = (char *)pathbuf; - argv[i] = NULL; - - i = 0; - /* minimal command environment */ - envp[i++] = "HOME=/"; - envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[i] = NULL; - - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - kfree(pathbuf); -} - -/* - * Either cs->count of using tasks transitioned to zero, or the - * cs->children list of child cpusets just became empty. If this - * cs is notify_on_release() and now both the user count is zero and - * the list of children is empty, prepare cpuset path in a kmalloc'd - * buffer, to be returned via ppathbuf, so that the caller can invoke - * cpuset_release_agent() with it later on, once manage_mutex is dropped. - * Call here with manage_mutex held. - * - * This check_for_release() routine is responsible for kmalloc'ing - * pathbuf. The above cpuset_release_agent() is responsible for - * kfree'ing pathbuf. The caller of these routines is responsible - * for providing a pathbuf pointer, initialized to NULL, then - * calling check_for_release() with manage_mutex held and the address - * of the pathbuf pointer, then dropping manage_mutex, then calling - * cpuset_release_agent() with pathbuf, as set by check_for_release(). - */ - -static void check_for_release(struct cpuset *cs, char **ppathbuf) -{ - if (notify_on_release(cs) && atomic_read(&cs->count) == 0 && - list_empty(&cs->children)) { - char *buf; - - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!buf) - return; - if (cpuset_path(cs, buf, PAGE_SIZE) < 0) - kfree(buf); - else - *ppathbuf = buf; - } -} - /* * Return in *pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy @@ -653,20 +390,19 @@ void cpuset_update_task_memory_state(void) struct task_struct *tsk = current; struct cpuset *cs; - if (tsk->cpuset == &top_cpuset) { + if (task_cs(tsk) == &top_cpuset) { /* Don't need rcu for top_cpuset. It's never freed. */ my_cpusets_mem_gen = top_cpuset.mems_generation; } else { rcu_read_lock(); - cs = rcu_dereference(tsk->cpuset); - my_cpusets_mem_gen = cs->mems_generation; + my_cpusets_mem_gen = task_cs(current)->mems_generation; rcu_read_unlock(); } if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { mutex_lock(&callback_mutex); task_lock(tsk); - cs = tsk->cpuset; /* Maybe changed when task not locked */ + cs = task_cs(tsk); /* Maybe changed when task not locked */ guarantee_online_mems(cs, &tsk->mems_allowed); tsk->cpuset_mems_generation = cs->mems_generation; if (is_spread_page(cs)) @@ -721,11 +457,12 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) static int validate_change(const struct cpuset *cur, const struct cpuset *trial) { + struct cgroup *cont; struct cpuset *c, *par; /* Each of our child cpusets must be a subset of us */ - list_for_each_entry(c, &cur->children, sibling) { - if (!is_cpuset_subset(c, trial)) + list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { + if (!is_cpuset_subset(cgroup_cs(cont), trial)) return -EBUSY; } @@ -740,7 +477,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) return -EACCES; /* If either I or some sibling (!= me) is exclusive, we can't overlap */ - list_for_each_entry(c, &par->children, sibling) { + list_for_each_entry(cont, &par->css.cgroup->children, sibling) { + c = cgroup_cs(cont); if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) @@ -751,17 +489,265 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) return -EINVAL; } + /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ + if (cgroup_task_count(cur->css.cgroup)) { + if (cpus_empty(trial->cpus_allowed) || + nodes_empty(trial->mems_allowed)) { + return -ENOSPC; + } + } + return 0; } /* + * Helper routine for rebuild_sched_domains(). + * Do cpusets a, b have overlapping cpus_allowed masks? + */ + +static int cpusets_overlap(struct cpuset *a, struct cpuset *b) +{ + return cpus_intersects(a->cpus_allowed, b->cpus_allowed); +} + +/* + * rebuild_sched_domains() + * + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. + * + * This routine builds a partial partition of the systems CPUs + * (the set of non-overlappping cpumask_t's in the array 'part' + * below), and passes that partial partition to the kernel/sched.c + * partition_sched_domains() routine, which will rebuild the + * schedulers load balancing domains (sched domains) as specified + * by that partial partition. A 'partial partition' is a set of + * non-overlapping subsets whose union is a subset of that set. + * + * See "What is sched_load_balance" in Documentation/cpusets.txt + * for a background explanation of this. + * + * Does not return errors, on the theory that the callers of this + * routine would rather not worry about failures to rebuild sched + * domains when operating in the severe memory shortage situations + * that could cause allocation failures below. + * + * Call with cgroup_mutex held. May take callback_mutex during + * call due to the kfifo_alloc() and kmalloc() calls. May nest + * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. + * Must not be called holding callback_mutex, because we must not + * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere + * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. + * So the reverse nesting would risk an ABBA deadlock. + * + * The three key local variables below are: + * q - a kfifo queue of cpuset pointers, used to implement a + * top-down scan of all cpusets. This scan loads a pointer + * to each cpuset marked is_sched_load_balance into the + * array 'csa'. For our purposes, rebuilding the schedulers + * sched domains, we can ignore !is_sched_load_balance cpusets. + * csa - (for CpuSet Array) Array of pointers to all the cpusets + * that need to be load balanced, for convenient iterative + * access by the subsequent code that finds the best partition, + * i.e the set of domains (subsets) of CPUs such that the + * cpus_allowed of every cpuset marked is_sched_load_balance + * is a subset of one of these domains, while there are as + * many such domains as possible, each as small as possible. + * doms - Conversion of 'csa' to an array of cpumasks, for passing to + * the kernel/sched.c routine partition_sched_domains() in a + * convenient format, that can be easily compared to the prior + * value to determine what partition elements (sched domains) + * were changed (added or removed.) + * + * Finding the best partition (set of domains): + * The triple nested loops below over i, j, k scan over the + * load balanced cpusets (using the array of cpuset pointers in + * csa[]) looking for pairs of cpusets that have overlapping + * cpus_allowed, but which don't have the same 'pn' partition + * number and gives them in the same partition number. It keeps + * looping on the 'restart' label until it can no longer find + * any such pairs. + * + * The union of the cpus_allowed masks from the set of + * all cpusets having the same 'pn' value then form the one + * element of the partition (one sched domain) to be passed to + * partition_sched_domains(). + */ + +static void rebuild_sched_domains(void) +{ + struct kfifo *q; /* queue of cpusets to be scanned */ + struct cpuset *cp; /* scans q */ + struct cpuset **csa; /* array of all cpuset ptrs */ + int csn; /* how many cpuset ptrs in csa so far */ + int i, j, k; /* indices for partition finding loops */ + cpumask_t *doms; /* resulting partition; i.e. sched domains */ + int ndoms; /* number of sched domains in result */ + int nslot; /* next empty doms[] cpumask_t slot */ + + q = NULL; + csa = NULL; + doms = NULL; + + /* Special case for the 99% of systems with one, full, sched domain */ + if (is_sched_load_balance(&top_cpuset)) { + ndoms = 1; + doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!doms) + goto rebuild; + *doms = top_cpuset.cpus_allowed; + goto rebuild; + } + + q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); + if (IS_ERR(q)) + goto done; + csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); + if (!csa) + goto done; + csn = 0; + + cp = &top_cpuset; + __kfifo_put(q, (void *)&cp, sizeof(cp)); + while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { + struct cgroup *cont; + struct cpuset *child; /* scans child cpusets of cp */ + if (is_sched_load_balance(cp)) + csa[csn++] = cp; + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + __kfifo_put(q, (void *)&child, sizeof(cp)); + } + } + + for (i = 0; i < csn; i++) + csa[i]->pn = i; + ndoms = csn; + +restart: + /* Find the best partition (set of sched domains) */ + for (i = 0; i < csn; i++) { + struct cpuset *a = csa[i]; + int apn = a->pn; + + for (j = 0; j < csn; j++) { + struct cpuset *b = csa[j]; + int bpn = b->pn; + + if (apn != bpn && cpusets_overlap(a, b)) { + for (k = 0; k < csn; k++) { + struct cpuset *c = csa[k]; + + if (c->pn == bpn) + c->pn = apn; + } + ndoms--; /* one less element */ + goto restart; + } + } + } + + /* Convert <csn, csa> to <ndoms, doms> */ + doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); + if (!doms) + goto rebuild; + + for (nslot = 0, i = 0; i < csn; i++) { + struct cpuset *a = csa[i]; + int apn = a->pn; + + if (apn >= 0) { + cpumask_t *dp = doms + nslot; + + if (nslot == ndoms) { + static int warnings = 10; + if (warnings) { + printk(KERN_WARNING + "rebuild_sched_domains confused:" + " nslot %d, ndoms %d, csn %d, i %d," + " apn %d\n", + nslot, ndoms, csn, i, apn); + warnings--; + } + continue; + } + + cpus_clear(*dp); + for (j = i; j < csn; j++) { + struct cpuset *b = csa[j]; + + if (apn == b->pn) { + cpus_or(*dp, *dp, b->cpus_allowed); + b->pn = -1; + } + } + nslot++; + } + } + BUG_ON(nslot != ndoms); + +rebuild: + /* Have scheduler rebuild sched domains */ + lock_cpu_hotplug(); + partition_sched_domains(ndoms, doms); + unlock_cpu_hotplug(); + +done: + if (q && !IS_ERR(q)) + kfifo_free(q); + kfree(csa); + /* Don't kfree(doms) -- partition_sched_domains() does that. */ +} + +static inline int started_after_time(struct task_struct *t1, + struct timespec *time, + struct task_struct *t2) +{ + int start_diff = timespec_compare(&t1->start_time, time); + if (start_diff > 0) { + return 1; + } else if (start_diff < 0) { + return 0; + } else { + /* + * Arbitrarily, if two processes started at the same + * time, we'll say that the lower pointer value + * started first. Note that t2 may have exited by now + * so this may not be a valid pointer any longer, but + * that's fine - it still serves to distinguish + * between two tasks started (effectively) + * simultaneously. + */ + return t1 > t2; + } +} + +static inline int started_after(void *p1, void *p2) +{ + struct task_struct *t1 = p1; + struct task_struct *t2 = p2; + return started_after_time(t1, &t2->start_time, t2); +} + +/* * Call with manage_mutex held. May take callback_mutex during call. */ static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; - int retval; + int retval, i; + int is_load_balanced; + struct cgroup_iter it; + struct cgroup *cgrp = cs->css.cgroup; + struct task_struct *p, *dropped; + /* Never dereference latest_task, since it's not refcounted */ + struct task_struct *latest_task = NULL; + struct ptr_heap heap; + struct timespec latest_time = { 0, 0 }; /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ if (cs == &top_cpuset) @@ -770,11 +756,13 @@ static int update_cpumask(struct cpuset *cs, char *buf) trialcs = *cs; /* - * We allow a cpuset's cpus_allowed to be empty; if it has attached - * tasks, we'll catch it later when we validate the change and return - * -ENOSPC. + * An empty cpus_allowed is ok iff there are no tasks in the cpuset. + * Since cpulist_parse() fails on an empty mask, we special case + * that parsing. The validate_change() call ensures that cpusets + * with tasks have cpus. */ - if (!buf[0] || (buf[0] == '\n' && !buf[1])) { + buf = strstrip(buf); + if (!*buf) { cpus_clear(trialcs.cpus_allowed); } else { retval = cpulist_parse(buf, trialcs.cpus_allowed); @@ -782,15 +770,79 @@ static int update_cpumask(struct cpuset *cs, char *buf) return retval; } cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); - /* cpus_allowed cannot be empty for a cpuset with attached tasks. */ - if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed)) - return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval < 0) return retval; + + /* Nothing to do if the cpus didn't change */ + if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) + return 0; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); + if (retval) + return retval; + + is_load_balanced = is_sched_load_balance(&trialcs); + mutex_lock(&callback_mutex); cs->cpus_allowed = trialcs.cpus_allowed; mutex_unlock(&callback_mutex); + + again: + /* + * Scan tasks in the cpuset, and update the cpumasks of any + * that need an update. Since we can't call set_cpus_allowed() + * while holding tasklist_lock, gather tasks to be processed + * in a heap structure. If the statically-sized heap fills up, + * overflow tasks that started later, and in future iterations + * only consider tasks that started after the latest task in + * the previous pass. This guarantees forward progress and + * that we don't miss any tasks + */ + heap.size = 0; + cgroup_iter_start(cgrp, &it); + while ((p = cgroup_iter_next(cgrp, &it))) { + /* Only affect tasks that don't have the right cpus_allowed */ + if (cpus_equal(p->cpus_allowed, cs->cpus_allowed)) + continue; + /* + * Only process tasks that started after the last task + * we processed + */ + if (!started_after_time(p, &latest_time, latest_task)) + continue; + dropped = heap_insert(&heap, p); + if (dropped == NULL) { + get_task_struct(p); + } else if (dropped != p) { + get_task_struct(p); + put_task_struct(dropped); + } + } + cgroup_iter_end(cgrp, &it); + if (heap.size) { + for (i = 0; i < heap.size; i++) { + struct task_struct *p = heap.ptrs[i]; + if (i == 0) { + latest_time = p->start_time; + latest_task = p; + } + set_cpus_allowed(p, cs->cpus_allowed); + put_task_struct(p); + } + /* + * If we had to process any tasks at all, scan again + * in case some of them were in the middle of forking + * children that didn't notice the new cpumask + * restriction. Not the most efficient way to do it, + * but it avoids having to take callback_mutex in the + * fork path + */ + goto again; + } + heap_free(&heap); + if (is_load_balanced) + rebuild_sched_domains(); + return 0; } @@ -839,7 +891,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); mutex_lock(&callback_mutex); - guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed); + guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); mutex_unlock(&callback_mutex); } @@ -857,16 +909,19 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, * their mempolicies to the cpusets new mems_allowed. */ +static void *cpuset_being_rebound; + static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; nodemask_t oldmem; - struct task_struct *g, *p; + struct task_struct *p; struct mm_struct **mmarray; int i, n, ntasks; int migrate; int fudge; int retval; + struct cgroup_iter it; /* * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; @@ -878,29 +933,19 @@ static int update_nodemask(struct cpuset *cs, char *buf) trialcs = *cs; /* - * We allow a cpuset's mems_allowed to be empty; if it has attached - * tasks, we'll catch it later when we validate the change and return - * -ENOSPC. + * An empty mems_allowed is ok iff there are no tasks in the cpuset. + * Since nodelist_parse() fails on an empty mask, we special case + * that parsing. The validate_change() call ensures that cpusets + * with tasks have memory. */ - if (!buf[0] || (buf[0] == '\n' && !buf[1])) { + buf = strstrip(buf); + if (!*buf) { nodes_clear(trialcs.mems_allowed); } else { retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) goto done; - if (!nodes_intersects(trialcs.mems_allowed, - node_states[N_HIGH_MEMORY])) { - /* - * error if only memoryless nodes specified. - */ - retval = -ENOSPC; - goto done; - } } - /* - * Exclude memoryless nodes. We know that trialcs.mems_allowed - * contains at least one node with memory. - */ nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_states[N_HIGH_MEMORY]); oldmem = cs->mems_allowed; @@ -908,11 +953,6 @@ static int update_nodemask(struct cpuset *cs, char *buf) retval = 0; /* Too easy - nothing to do */ goto done; } - /* mems_allowed cannot be empty for a cpuset with attached tasks. */ - if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) { - retval = -ENOSPC; - goto done; - } retval = validate_change(cs, &trialcs); if (retval < 0) goto done; @@ -922,7 +962,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); - set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ + cpuset_being_rebound = cs; /* causes mpol_copy() rebind */ fudge = 10; /* spare mmarray[] slots */ fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ @@ -936,13 +976,13 @@ static int update_nodemask(struct cpuset *cs, char *buf) * enough mmarray[] w/o using GFP_ATOMIC. */ while (1) { - ntasks = atomic_read(&cs->count); /* guess */ + ntasks = cgroup_task_count(cs->css.cgroup); /* guess */ ntasks += fudge; mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); if (!mmarray) goto done; read_lock(&tasklist_lock); /* block fork */ - if (atomic_read(&cs->count) <= ntasks) + if (cgroup_task_count(cs->css.cgroup) <= ntasks) break; /* got enough */ read_unlock(&tasklist_lock); /* try again */ kfree(mmarray); @@ -951,21 +991,21 @@ static int update_nodemask(struct cpuset *cs, char *buf) n = 0; /* Load up mmarray[] with mm reference for each task in cpuset. */ - do_each_thread(g, p) { + cgroup_iter_start(cs->css.cgroup, &it); + while ((p = cgroup_iter_next(cs->css.cgroup, &it))) { struct mm_struct *mm; if (n >= ntasks) { printk(KERN_WARNING "Cpuset mempolicy rebind incomplete.\n"); - continue; + break; } - if (p->cpuset != cs) - continue; mm = get_task_mm(p); if (!mm) continue; mmarray[n++] = mm; - } while_each_thread(g, p); + } + cgroup_iter_end(cs->css.cgroup, &it); read_unlock(&tasklist_lock); /* @@ -993,12 +1033,17 @@ static int update_nodemask(struct cpuset *cs, char *buf) /* We're done rebinding vma's to this cpusets new mems_allowed. */ kfree(mmarray); - set_cpuset_being_rebound(NULL); + cpuset_being_rebound = NULL; retval = 0; done: return retval; } +int current_cpuset_is_being_rebound(void) +{ + return task_cs(current) == cpuset_being_rebound; +} + /* * Call with manage_mutex held. */ @@ -1015,6 +1060,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, + * CS_SCHED_LOAD_BALANCE, * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, * CS_SPREAD_PAGE, CS_SPREAD_SLAB) * cs: the cpuset to update @@ -1028,6 +1074,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) int turning_on; struct cpuset trialcs; int err; + int cpus_nonempty, balance_flag_changed; turning_on = (simple_strtoul(buf, NULL, 10) != 0); @@ -1040,10 +1087,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) err = validate_change(cs, &trialcs); if (err < 0) return err; + + cpus_nonempty = !cpus_empty(trialcs.cpus_allowed); + balance_flag_changed = (is_sched_load_balance(cs) != + is_sched_load_balance(&trialcs)); + mutex_lock(&callback_mutex); cs->flags = trialcs.flags; mutex_unlock(&callback_mutex); + if (cpus_nonempty && balance_flag_changed) + rebuild_sched_domains(); + return 0; } @@ -1145,85 +1200,34 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* - * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly - * writing the path of the old cpuset in 'ppathbuf' if it needs to be - * notified on release. - * - * Call holding manage_mutex. May take callback_mutex and task_lock of - * the task 'pid' during call. - */ - -static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) +static int cpuset_can_attach(struct cgroup_subsys *ss, + struct cgroup *cont, struct task_struct *tsk) { - pid_t pid; - struct task_struct *tsk; - struct cpuset *oldcs; - cpumask_t cpus; - nodemask_t from, to; - struct mm_struct *mm; - int retval; + struct cpuset *cs = cgroup_cs(cont); - if (sscanf(pidbuf, "%d", &pid) != 1) - return -EIO; if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; - if (pid) { - read_lock(&tasklist_lock); - - tsk = find_task_by_pid(pid); - if (!tsk || tsk->flags & PF_EXITING) { - read_unlock(&tasklist_lock); - return -ESRCH; - } - - get_task_struct(tsk); - read_unlock(&tasklist_lock); - - if ((current->euid) && (current->euid != tsk->uid) - && (current->euid != tsk->suid)) { - put_task_struct(tsk); - return -EACCES; - } - } else { - tsk = current; - get_task_struct(tsk); - } + return security_task_setscheduler(tsk, 0, NULL); +} - retval = security_task_setscheduler(tsk, 0, NULL); - if (retval) { - put_task_struct(tsk); - return retval; - } +static void cpuset_attach(struct cgroup_subsys *ss, + struct cgroup *cont, struct cgroup *oldcont, + struct task_struct *tsk) +{ + cpumask_t cpus; + nodemask_t from, to; + struct mm_struct *mm; + struct cpuset *cs = cgroup_cs(cont); + struct cpuset *oldcs = cgroup_cs(oldcont); mutex_lock(&callback_mutex); - - task_lock(tsk); - oldcs = tsk->cpuset; - /* - * After getting 'oldcs' cpuset ptr, be sure still not exiting. - * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack - * then fail this attach_task(), to avoid breaking top_cpuset.count. - */ - if (tsk->flags & PF_EXITING) { - task_unlock(tsk); - mutex_unlock(&callback_mutex); - put_task_struct(tsk); - return -ESRCH; - } - atomic_inc(&cs->count); - rcu_assign_pointer(tsk->cpuset, cs); - task_unlock(tsk); - guarantee_online_cpus(cs, &cpus); set_cpus_allowed(tsk, cpus); + mutex_unlock(&callback_mutex); from = oldcs->mems_allowed; to = cs->mems_allowed; - - mutex_unlock(&callback_mutex); - mm = get_task_mm(tsk); if (mm) { mpol_rebind_mm(mm, &to); @@ -1232,44 +1236,36 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) mmput(mm); } - put_task_struct(tsk); - synchronize_rcu(); - if (atomic_dec_and_test(&oldcs->count)) - check_for_release(oldcs, ppathbuf); - return 0; } /* The various types of files and directories in a cpuset file system */ typedef enum { - FILE_ROOT, - FILE_DIR, FILE_MEMORY_MIGRATE, FILE_CPULIST, FILE_MEMLIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, - FILE_NOTIFY_ON_RELEASE, + FILE_SCHED_LOAD_BALANCE, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, - FILE_TASKLIST, } cpuset_filetype_t; -static ssize_t cpuset_common_file_write(struct file *file, +static ssize_t cpuset_common_file_write(struct cgroup *cont, + struct cftype *cft, + struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { - struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); - struct cftype *cft = __d_cft(file->f_path.dentry); + struct cpuset *cs = cgroup_cs(cont); cpuset_filetype_t type = cft->private; char *buffer; - char *pathbuf = NULL; int retval = 0; /* Crude upper limit on largest legitimate cpulist user might write. */ - if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES)) + if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES)) return -E2BIG; /* +1 for nul-terminator */ @@ -1282,9 +1278,9 @@ static ssize_t cpuset_common_file_write(struct file *file, } buffer[nbytes] = 0; /* nul-terminate */ - mutex_lock(&manage_mutex); + cgroup_lock(); - if (is_removed(cs)) { + if (cgroup_is_removed(cont)) { retval = -ENODEV; goto out2; } @@ -1302,8 +1298,8 @@ static ssize_t cpuset_common_file_write(struct file *file, case FILE_MEM_EXCLUSIVE: retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); break; - case FILE_NOTIFY_ON_RELEASE: - retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); + case FILE_SCHED_LOAD_BALANCE: + retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); break; case FILE_MEMORY_MIGRATE: retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); @@ -1322,9 +1318,6 @@ static ssize_t cpuset_common_file_write(struct file *file, retval = update_flag(CS_SPREAD_SLAB, cs, buffer); cs->mems_generation = cpuset_mems_generation++; break; - case FILE_TASKLIST: - retval = attach_task(cs, buffer, &pathbuf); - break; default: retval = -EINVAL; goto out2; @@ -1333,30 +1326,12 @@ static ssize_t cpuset_common_file_write(struct file *file, if (retval == 0) retval = nbytes; out2: - mutex_unlock(&manage_mutex); - cpuset_release_agent(pathbuf); + cgroup_unlock(); out1: kfree(buffer); return retval; } -static ssize_t cpuset_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) -{ - ssize_t retval = 0; - struct cftype *cft = __d_cft(file->f_path.dentry); - if (!cft) - return -ENODEV; - - /* special function ? */ - if (cft->write) - retval = cft->write(file, buf, nbytes, ppos); - else - retval = cpuset_common_file_write(file, buf, nbytes, ppos); - - return retval; -} - /* * These ascii lists should be read in a single call, by using a user * buffer large enough to hold the entire map. If read in smaller @@ -1391,11 +1366,13 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) return nodelist_scnprintf(page, PAGE_SIZE, mask); } -static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t cpuset_common_file_read(struct cgroup *cont, + struct cftype *cft, + struct file *file, + char __user *buf, + size_t nbytes, loff_t *ppos) { - struct cftype *cft = __d_cft(file->f_path.dentry); - struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); + struct cpuset *cs = cgroup_cs(cont); cpuset_filetype_t type = cft->private; char *page; ssize_t retval = 0; @@ -1419,8 +1396,8 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, case FILE_MEM_EXCLUSIVE: *s++ = is_mem_exclusive(cs) ? '1' : '0'; break; - case FILE_NOTIFY_ON_RELEASE: - *s++ = notify_on_release(cs) ? '1' : '0'; + case FILE_SCHED_LOAD_BALANCE: + *s++ = is_sched_load_balance(cs) ? '1' : '0'; break; case FILE_MEMORY_MIGRATE: *s++ = is_memory_migrate(cs) ? '1' : '0'; @@ -1449,390 +1426,150 @@ out: return retval; } -static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbytes, - loff_t *ppos) -{ - ssize_t retval = 0; - struct cftype *cft = __d_cft(file->f_path.dentry); - if (!cft) - return -ENODEV; - - /* special function ? */ - if (cft->read) - retval = cft->read(file, buf, nbytes, ppos); - else - retval = cpuset_common_file_read(file, buf, nbytes, ppos); - - return retval; -} - -static int cpuset_file_open(struct inode *inode, struct file *file) -{ - int err; - struct cftype *cft; - - err = generic_file_open(inode, file); - if (err) - return err; - - cft = __d_cft(file->f_path.dentry); - if (!cft) - return -ENODEV; - if (cft->open) - err = cft->open(inode, file); - else - err = 0; - - return err; -} - -static int cpuset_file_release(struct inode *inode, struct file *file) -{ - struct cftype *cft = __d_cft(file->f_path.dentry); - if (cft->release) - return cft->release(inode, file); - return 0; -} - -/* - * cpuset_rename - Only allow simple rename of directories in place. - */ -static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - if (!S_ISDIR(old_dentry->d_inode->i_mode)) - return -ENOTDIR; - if (new_dentry->d_inode) - return -EEXIST; - if (old_dir != new_dir) - return -EIO; - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); -} - -static const struct file_operations cpuset_file_operations = { - .read = cpuset_file_read, - .write = cpuset_file_write, - .llseek = generic_file_llseek, - .open = cpuset_file_open, - .release = cpuset_file_release, -}; - -static const struct inode_operations cpuset_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = cpuset_mkdir, - .rmdir = cpuset_rmdir, - .rename = cpuset_rename, -}; - -static int cpuset_create_file(struct dentry *dentry, int mode) -{ - struct inode *inode; - - if (!dentry) - return -ENOENT; - if (dentry->d_inode) - return -EEXIST; - - inode = cpuset_new_inode(mode); - if (!inode) - return -ENOMEM; - - if (S_ISDIR(mode)) { - inode->i_op = &cpuset_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - - /* start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - } else if (S_ISREG(mode)) { - inode->i_size = 0; - inode->i_fop = &cpuset_file_operations; - } - - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - return 0; -} - -/* - * cpuset_create_dir - create a directory for an object. - * cs: the cpuset we create the directory for. - * It must have a valid ->parent field - * And we are going to fill its ->dentry field. - * name: The name to give to the cpuset directory. Will be copied. - * mode: mode to set on new directory. - */ - -static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode) -{ - struct dentry *dentry = NULL; - struct dentry *parent; - int error = 0; - - parent = cs->parent->dentry; - dentry = cpuset_get_dentry(parent, name); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - error = cpuset_create_file(dentry, S_IFDIR | mode); - if (!error) { - dentry->d_fsdata = cs; - inc_nlink(parent->d_inode); - cs->dentry = dentry; - } - dput(dentry); - - return error; -} - -static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) -{ - struct dentry *dentry; - int error; - - mutex_lock(&dir->d_inode->i_mutex); - dentry = cpuset_get_dentry(dir, cft->name); - if (!IS_ERR(dentry)) { - error = cpuset_create_file(dentry, 0644 | S_IFREG); - if (!error) - dentry->d_fsdata = (void *)cft; - dput(dentry); - } else - error = PTR_ERR(dentry); - mutex_unlock(&dir->d_inode->i_mutex); - return error; -} - -/* - * Stuff for reading the 'tasks' file. - * - * Reading this file can return large amounts of data if a cpuset has - * *lots* of attached tasks. So it may need several calls to read(), - * but we cannot guarantee that the information we produce is correct - * unless we produce it entirely atomically. - * - * Upon tasks file open(), a struct ctr_struct is allocated, that - * will have a pointer to an array (also allocated here). The struct - * ctr_struct * is stored in file->private_data. Its resources will - * be freed by release() when the file is closed. The array is used - * to sprintf the PIDs and then used by read(). - */ - -/* cpusets_tasks_read array */ - -struct ctr_struct { - char *buf; - int bufsz; -}; - -/* - * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. - * Return actual number of pids loaded. No need to task_lock(p) - * when reading out p->cpuset, as we don't really care if it changes - * on the next cycle, and we are not going to try to dereference it. - */ -static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) -{ - int n = 0; - struct task_struct *g, *p; - - read_lock(&tasklist_lock); - - do_each_thread(g, p) { - if (p->cpuset == cs) { - if (unlikely(n == npids)) - goto array_full; - pidarray[n++] = p->pid; - } - } while_each_thread(g, p); - -array_full: - read_unlock(&tasklist_lock); - return n; -} - -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} - -/* - * Convert array 'a' of 'npids' pid_t's to a string of newline separated - * decimal pids in 'buf'. Don't write more than 'sz' chars, but return - * count 'cnt' of how many chars would be written if buf were large enough. - */ -static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) -{ - int cnt = 0; - int i; - - for (i = 0; i < npids; i++) - cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); - return cnt; -} - -/* - * Handle an open on 'tasks' file. Prepare a buffer listing the - * process id's of tasks currently attached to the cpuset being opened. - * - * Does not require any specific cpuset mutexes, and does not take any. - */ -static int cpuset_tasks_open(struct inode *unused, struct file *file) -{ - struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); - struct ctr_struct *ctr; - pid_t *pidarray; - int npids; - char c; - - if (!(file->f_mode & FMODE_READ)) - return 0; - - ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); - if (!ctr) - goto err0; - - /* - * If cpuset gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cpuset users didn't - * show up until sometime later on. - */ - npids = atomic_read(&cs->count); - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); - if (!pidarray) - goto err1; - - npids = pid_array_load(pidarray, npids, cs); - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - - /* Call pid_array_to_buf() twice, first just to get bufsz */ - ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; - ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); - if (!ctr->buf) - goto err2; - ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); - - kfree(pidarray); - file->private_data = ctr; - return 0; - -err2: - kfree(pidarray); -err1: - kfree(ctr); -err0: - return -ENOMEM; -} - -static ssize_t cpuset_tasks_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct ctr_struct *ctr = file->private_data; - return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); -} -static int cpuset_tasks_release(struct inode *unused_inode, struct file *file) -{ - struct ctr_struct *ctr; - if (file->f_mode & FMODE_READ) { - ctr = file->private_data; - kfree(ctr->buf); - kfree(ctr); - } - return 0; -} /* * for the common functions, 'private' gives the type of file */ -static struct cftype cft_tasks = { - .name = "tasks", - .open = cpuset_tasks_open, - .read = cpuset_tasks_read, - .release = cpuset_tasks_release, - .private = FILE_TASKLIST, -}; - static struct cftype cft_cpus = { .name = "cpus", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, .private = FILE_CPULIST, }; static struct cftype cft_mems = { .name = "mems", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, .private = FILE_MEMLIST, }; static struct cftype cft_cpu_exclusive = { .name = "cpu_exclusive", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, .private = FILE_CPU_EXCLUSIVE, }; static struct cftype cft_mem_exclusive = { .name = "mem_exclusive", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, .private = FILE_MEM_EXCLUSIVE, }; -static struct cftype cft_notify_on_release = { - .name = "notify_on_release", - .private = FILE_NOTIFY_ON_RELEASE, +static struct cftype cft_sched_load_balance = { + .name = "sched_load_balance", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, + .private = FILE_SCHED_LOAD_BALANCE, }; static struct cftype cft_memory_migrate = { .name = "memory_migrate", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, .private = FILE_MEMORY_MIGRATE, }; static struct cftype cft_memory_pressure_enabled = { .name = "memory_pressure_enabled", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, .private = FILE_MEMORY_PRESSURE_ENABLED, }; static struct cftype cft_memory_pressure = { .name = "memory_pressure", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, .private = FILE_MEMORY_PRESSURE, }; static struct cftype cft_spread_page = { .name = "memory_spread_page", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, .private = FILE_SPREAD_PAGE, }; static struct cftype cft_spread_slab = { .name = "memory_spread_slab", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, .private = FILE_SPREAD_SLAB, }; -static int cpuset_populate_dir(struct dentry *cs_dentry) +static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) { int err; - if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0) + if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0) return err; - if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0) + if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0) return err; - if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0) + if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0) return err; - if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0) + if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0) return err; - if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) + if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0) return err; - if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) + if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) return err; - if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) + if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) return err; - if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0) + if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) return err; - if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0) - return err; - if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) + if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0) return err; + /* memory_pressure_enabled is in root cpuset only */ + if (err == 0 && !cont->parent) + err = cgroup_add_file(cont, ss, + &cft_memory_pressure_enabled); return 0; } /* + * post_clone() is called at the end of cgroup_clone(). + * 'cgroup' was just created automatically as a result of + * a cgroup_clone(), and the current task is about to + * be moved into 'cgroup'. + * + * Currently we refuse to set up the cgroup - thereby + * refusing the task to be entered, and as a result refusing + * the sys_unshare() or clone() which initiated it - if any + * sibling cpusets have exclusive cpus or mem. + * + * If this becomes a problem for some users who wish to + * allow that scenario, then cpuset_post_clone() could be + * changed to grant parent->cpus_allowed-sibling_cpus_exclusive + * (and likewise for mems) to the new cgroup. + */ +static void cpuset_post_clone(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + struct cgroup *parent, *child; + struct cpuset *cs, *parent_cs; + + parent = cgroup->parent; + list_for_each_entry(child, &parent->children, sibling) { + cs = cgroup_cs(child); + if (is_mem_exclusive(cs) || is_cpu_exclusive(cs)) + return; + } + cs = cgroup_cs(cgroup); + parent_cs = cgroup_cs(parent); + + cs->mems_allowed = parent_cs->mems_allowed; + cs->cpus_allowed = parent_cs->cpus_allowed; + return; +} + +/* * cpuset_create - create a cpuset * parent: cpuset that will be parent of the new cpuset. * name: name of the new cpuset. Will be strcpy'ed. @@ -1841,106 +1578,77 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) * Must be called with the mutex on the parent inode held */ -static long cpuset_create(struct cpuset *parent, const char *name, int mode) +static struct cgroup_subsys_state *cpuset_create( + struct cgroup_subsys *ss, + struct cgroup *cont) { struct cpuset *cs; - int err; + struct cpuset *parent; + if (!cont->parent) { + /* This is early initialization for the top cgroup */ + top_cpuset.mems_generation = cpuset_mems_generation++; + return &top_cpuset.css; + } + parent = cgroup_cs(cont->parent); cs = kmalloc(sizeof(*cs), GFP_KERNEL); if (!cs) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - mutex_lock(&manage_mutex); cpuset_update_task_memory_state(); cs->flags = 0; - if (notify_on_release(parent)) - set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); + set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cs->cpus_allowed = CPU_MASK_NONE; cs->mems_allowed = NODE_MASK_NONE; - atomic_set(&cs->count, 0); - INIT_LIST_HEAD(&cs->sibling); - INIT_LIST_HEAD(&cs->children); cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); cs->parent = parent; - - mutex_lock(&callback_mutex); - list_add(&cs->sibling, &cs->parent->children); number_of_cpusets++; - mutex_unlock(&callback_mutex); - - err = cpuset_create_dir(cs, name, mode); - if (err < 0) - goto err; - - /* - * Release manage_mutex before cpuset_populate_dir() because it - * will down() this new directory's i_mutex and if we race with - * another mkdir, we might deadlock. - */ - mutex_unlock(&manage_mutex); - - err = cpuset_populate_dir(cs->dentry); - /* If err < 0, we have a half-filled directory - oh well ;) */ - return 0; -err: - list_del(&cs->sibling); - mutex_unlock(&manage_mutex); - kfree(cs); - return err; + return &cs->css ; } -static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - struct cpuset *c_parent = dentry->d_parent->d_fsdata; - - /* the vfs holds inode->i_mutex already */ - return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); -} +/* + * Locking note on the strange update_flag() call below: + * + * If the cpuset being removed has its flag 'sched_load_balance' + * enabled, then simulate turning sched_load_balance off, which + * will call rebuild_sched_domains(). The lock_cpu_hotplug() + * call in rebuild_sched_domains() must not be made while holding + * callback_mutex. Elsewhere the kernel nests callback_mutex inside + * lock_cpu_hotplug() calls. So the reverse nesting would risk an + * ABBA deadlock. + */ -static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) +static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { - struct cpuset *cs = dentry->d_fsdata; - struct dentry *d; - struct cpuset *parent; - char *pathbuf = NULL; + struct cpuset *cs = cgroup_cs(cont); - /* the vfs holds both inode->i_mutex already */ - - mutex_lock(&manage_mutex); cpuset_update_task_memory_state(); - if (atomic_read(&cs->count) > 0) { - mutex_unlock(&manage_mutex); - return -EBUSY; - } - if (!list_empty(&cs->children)) { - mutex_unlock(&manage_mutex); - return -EBUSY; - } - parent = cs->parent; - mutex_lock(&callback_mutex); - set_bit(CS_REMOVED, &cs->flags); - list_del(&cs->sibling); /* delete my sibling from parent->children */ - spin_lock(&cs->dentry->d_lock); - d = dget(cs->dentry); - cs->dentry = NULL; - spin_unlock(&d->d_lock); - cpuset_d_remove_dir(d); - dput(d); + + if (is_sched_load_balance(cs)) + update_flag(CS_SCHED_LOAD_BALANCE, cs, "0"); + number_of_cpusets--; - mutex_unlock(&callback_mutex); - if (list_empty(&parent->children)) - check_for_release(parent, &pathbuf); - mutex_unlock(&manage_mutex); - cpuset_release_agent(pathbuf); - return 0; + kfree(cs); } +struct cgroup_subsys cpuset_subsys = { + .name = "cpuset", + .create = cpuset_create, + .destroy = cpuset_destroy, + .can_attach = cpuset_can_attach, + .attach = cpuset_attach, + .populate = cpuset_populate, + .post_clone = cpuset_post_clone, + .subsys_id = cpuset_subsys_id, + .early_init = 1, +}; + /* * cpuset_init_early - just enough so that the calls to * cpuset_update_task_memory_state() in early init code @@ -1949,13 +1657,11 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) int __init cpuset_init_early(void) { - struct task_struct *tsk = current; - - tsk->cpuset = &top_cpuset; - tsk->cpuset->mems_generation = cpuset_mems_generation++; + top_cpuset.mems_generation = cpuset_mems_generation++; return 0; } + /** * cpuset_init - initialize cpusets at system boot * @@ -1964,39 +1670,21 @@ int __init cpuset_init_early(void) int __init cpuset_init(void) { - struct dentry *root; - int err; + int err = 0; top_cpuset.cpus_allowed = CPU_MASK_ALL; top_cpuset.mems_allowed = NODE_MASK_ALL; fmeter_init(&top_cpuset.fmeter); top_cpuset.mems_generation = cpuset_mems_generation++; - - init_task.cpuset = &top_cpuset; + set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); err = register_filesystem(&cpuset_fs_type); if (err < 0) - goto out; - cpuset_mount = kern_mount(&cpuset_fs_type); - if (IS_ERR(cpuset_mount)) { - printk(KERN_ERR "cpuset: could not mount!\n"); - err = PTR_ERR(cpuset_mount); - cpuset_mount = NULL; - goto out; - } - root = cpuset_mount->mnt_sb->s_root; - root->d_fsdata = &top_cpuset; - inc_nlink(root->d_inode); - top_cpuset.dentry = root; - root->d_inode->i_op = &cpuset_dir_inode_operations; + return err; + number_of_cpusets = 1; - err = cpuset_populate_dir(root); - /* memory_pressure_enabled is in root cpuset only */ - if (err == 0) - err = cpuset_add_file(root, &cft_memory_pressure_enabled); -out: - return err; + return 0; } /* @@ -2022,10 +1710,12 @@ out: static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) { + struct cgroup *cont; struct cpuset *c; /* Each of our child cpusets mems must be online */ - list_for_each_entry(c, &cur->children, sibling) { + list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { + c = cgroup_cs(cont); guarantee_online_cpus_mems_in_subtree(c); if (!cpus_empty(c->cpus_allowed)) guarantee_online_cpus(c, &c->cpus_allowed); @@ -2053,7 +1743,7 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) static void common_cpu_mem_hotplug_unplug(void) { - mutex_lock(&manage_mutex); + cgroup_lock(); mutex_lock(&callback_mutex); guarantee_online_cpus_mems_in_subtree(&top_cpuset); @@ -2061,7 +1751,7 @@ static void common_cpu_mem_hotplug_unplug(void) top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - mutex_unlock(&manage_mutex); + cgroup_unlock(); } /* @@ -2074,8 +1764,8 @@ static void common_cpu_mem_hotplug_unplug(void) * cpu_online_map on each CPU hotplug (cpuhp) event. */ -static int cpuset_handle_cpuhp(struct notifier_block *nb, - unsigned long phase, void *cpu) +static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, + unsigned long phase, void *unused_cpu) { if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) return NOTIFY_DONE; @@ -2113,109 +1803,7 @@ void __init cpuset_init_smp(void) } /** - * cpuset_fork - attach newly forked task to its parents cpuset. - * @tsk: pointer to task_struct of forking parent process. - * - * Description: A task inherits its parent's cpuset at fork(). - * - * A pointer to the shared cpuset was automatically copied in fork.c - * by dup_task_struct(). However, we ignore that copy, since it was - * not made under the protection of task_lock(), so might no longer be - * a valid cpuset pointer. attach_task() might have already changed - * current->cpuset, allowing the previously referenced cpuset to - * be removed and freed. Instead, we task_lock(current) and copy - * its present value of current->cpuset for our freshly forked child. - * - * At the point that cpuset_fork() is called, 'current' is the parent - * task, and the passed argument 'child' points to the child task. - **/ - -void cpuset_fork(struct task_struct *child) -{ - task_lock(current); - child->cpuset = current->cpuset; - atomic_inc(&child->cpuset->count); - task_unlock(current); -} - -/** - * cpuset_exit - detach cpuset from exiting task - * @tsk: pointer to task_struct of exiting process - * - * Description: Detach cpuset from @tsk and release it. - * - * Note that cpusets marked notify_on_release force every task in - * them to take the global manage_mutex mutex when exiting. - * This could impact scaling on very large systems. Be reluctant to - * use notify_on_release cpusets where very high task exit scaling - * is required on large systems. - * - * Don't even think about derefencing 'cs' after the cpuset use count - * goes to zero, except inside a critical section guarded by manage_mutex - * or callback_mutex. Otherwise a zero cpuset use count is a license to - * any other task to nuke the cpuset immediately, via cpuset_rmdir(). - * - * This routine has to take manage_mutex, not callback_mutex, because - * it is holding that mutex while calling check_for_release(), - * which calls kmalloc(), so can't be called holding callback_mutex(). - * - * the_top_cpuset_hack: - * - * Set the exiting tasks cpuset to the root cpuset (top_cpuset). - * - * Don't leave a task unable to allocate memory, as that is an - * accident waiting to happen should someone add a callout in - * do_exit() after the cpuset_exit() call that might allocate. - * If a task tries to allocate memory with an invalid cpuset, - * it will oops in cpuset_update_task_memory_state(). - * - * We call cpuset_exit() while the task is still competent to - * handle notify_on_release(), then leave the task attached to - * the root cpuset (top_cpuset) for the remainder of its exit. - * - * To do this properly, we would increment the reference count on - * top_cpuset, and near the very end of the kernel/exit.c do_exit() - * code we would add a second cpuset function call, to drop that - * reference. This would just create an unnecessary hot spot on - * the top_cpuset reference count, to no avail. - * - * Normally, holding a reference to a cpuset without bumping its - * count is unsafe. The cpuset could go away, or someone could - * attach us to a different cpuset, decrementing the count on - * the first cpuset that we never incremented. But in this case, - * top_cpuset isn't going away, and either task has PF_EXITING set, - * which wards off any attach_task() attempts, or task is a failed - * fork, never visible to attach_task. - * - * Another way to do this would be to set the cpuset pointer - * to NULL here, and check in cpuset_update_task_memory_state() - * for a NULL pointer. This hack avoids that NULL check, for no - * cost (other than this way too long comment ;). - **/ -void cpuset_exit(struct task_struct *tsk) -{ - struct cpuset *cs; - - task_lock(current); - cs = tsk->cpuset; - tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */ - task_unlock(current); - - if (notify_on_release(cs)) { - char *pathbuf = NULL; - - mutex_lock(&manage_mutex); - if (atomic_dec_and_test(&cs->count)) - check_for_release(cs, &pathbuf); - mutex_unlock(&manage_mutex); - cpuset_release_agent(pathbuf); - } else { - atomic_dec(&cs->count); - } -} - -/** * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. * @@ -2230,10 +1818,23 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) cpumask_t mask; mutex_lock(&callback_mutex); + mask = cpuset_cpus_allowed_locked(tsk); + mutex_unlock(&callback_mutex); + + return mask; +} + +/** + * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. + * Must be called with callback_mutex held. + **/ +cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) +{ + cpumask_t mask; + task_lock(tsk); - guarantee_online_cpus(tsk->cpuset, &mask); + guarantee_online_cpus(task_cs(tsk), &mask); task_unlock(tsk); - mutex_unlock(&callback_mutex); return mask; } @@ -2259,7 +1860,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) mutex_lock(&callback_mutex); task_lock(tsk); - guarantee_online_mems(tsk->cpuset, &mask); + guarantee_online_mems(task_cs(tsk), &mask); task_unlock(tsk); mutex_unlock(&callback_mutex); @@ -2390,7 +1991,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) mutex_lock(&callback_mutex); task_lock(current); - cs = nearest_exclusive_ancestor(current->cpuset); + cs = nearest_exclusive_ancestor(task_cs(current)); task_unlock(current); allowed = node_isset(node, cs->mems_allowed); @@ -2550,14 +2151,12 @@ int cpuset_memory_pressure_enabled __read_mostly; void __cpuset_memory_pressure_bump(void) { - struct cpuset *cs; - task_lock(current); - cs = current->cpuset; - fmeter_markevent(&cs->fmeter); + fmeter_markevent(&task_cs(current)->fmeter); task_unlock(current); } +#ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() * - Print tasks cpuset path into seq_file. @@ -2569,11 +2168,12 @@ void __cpuset_memory_pressure_bump(void) * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks * cpuset to top_cpuset. */ -static int proc_cpuset_show(struct seq_file *m, void *v) +static int proc_cpuset_show(struct seq_file *m, void *unused_v) { struct pid *pid; struct task_struct *tsk; char *buf; + struct cgroup_subsys_state *css; int retval; retval = -ENOMEM; @@ -2588,15 +2188,15 @@ static int proc_cpuset_show(struct seq_file *m, void *v) goto out_free; retval = -EINVAL; - mutex_lock(&manage_mutex); - - retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); + cgroup_lock(); + css = task_subsys_state(tsk, cpuset_subsys_id); + retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); if (retval < 0) goto out_unlock; seq_puts(m, buf); seq_putc(m, '\n'); out_unlock: - mutex_unlock(&manage_mutex); + cgroup_unlock(); put_task_struct(tsk); out_free: kfree(buf); @@ -2616,6 +2216,7 @@ const struct file_operations proc_cpuset_operations = { .llseek = seq_lseek, .release = single_release, }; +#endif /* CONFIG_PROC_PID_CPUSET */ /* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c deleted file mode 100644 index 0d98827887a7..000000000000 --- a/kernel/die_notifier.c +++ /dev/null @@ -1,38 +0,0 @@ - -#include <linux/module.h> -#include <linux/notifier.h> -#include <linux/vmalloc.h> -#include <linux/kdebug.h> - - -static ATOMIC_NOTIFIER_HEAD(die_chain); - -int notify_die(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) -{ - struct die_args args = { - .regs = regs, - .str = str, - .err = err, - .trapnr = trap, - .signr = sig, - - }; - - return atomic_notifier_call_chain(&die_chain, val, &args); -} - -int register_die_notifier(struct notifier_block *nb) -{ - vmalloc_sync_all(); - return atomic_notifier_chain_register(&die_chain, nb); -} -EXPORT_SYMBOL_GPL(register_die_notifier); - -int unregister_die_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&die_chain, nb); -} -EXPORT_SYMBOL_GPL(unregister_die_notifier); - - diff --git a/kernel/exit.c b/kernel/exit.c index 2c704c86edb3..f1aec27f1df0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -31,7 +31,7 @@ #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/freezer.h> -#include <linux/cpuset.h> +#include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> #include <linux/posix-timers.h> @@ -148,6 +148,7 @@ void release_task(struct task_struct * p) int zap_leader; repeat: atomic_dec(&p->user->processes); + proc_flush_task(p); write_lock_irq(&tasklist_lock); ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); @@ -175,7 +176,6 @@ repeat: } write_unlock_irq(&tasklist_lock); - proc_flush_task(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); @@ -221,7 +221,7 @@ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignor do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task || p->exit_state - || is_init(p->real_parent)) + || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) { @@ -299,14 +299,14 @@ void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current->group_leader; - if (process_session(curr) != session) { + if (task_session_nr(curr) != session) { detach_pid(curr, PIDTYPE_SID); - set_signal_session(curr->signal, session); + set_task_session(curr, session); attach_pid(curr, PIDTYPE_SID, find_pid(session)); } - if (process_group(curr) != pgrp) { + if (task_pgrp_nr(curr) != pgrp) { detach_pid(curr, PIDTYPE_PGID); - curr->signal->pgrp = pgrp; + set_task_pgrp(curr, pgrp); attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp)); } } @@ -400,11 +400,12 @@ void daemonize(const char *name, ...) current->fs = fs; atomic_inc(&fs->count); - exit_task_namespaces(current); - current->nsproxy = init_task.nsproxy; - get_task_namespaces(current); + if (current->nsproxy != init_task.nsproxy) { + get_nsproxy(init_task.nsproxy); + switch_task_namespaces(current, init_task.nsproxy); + } - exit_files(current); + exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); @@ -492,7 +493,7 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files) } EXPORT_SYMBOL(reset_files_struct); -static inline void __exit_files(struct task_struct *tsk) +static void __exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; @@ -509,7 +510,7 @@ void exit_files(struct task_struct *tsk) __exit_files(tsk); } -static inline void __put_fs_struct(struct fs_struct *fs) +static void __put_fs_struct(struct fs_struct *fs) { /* No need to hold fs->lock if we are killing it */ if (atomic_dec_and_test(&fs->count)) { @@ -530,7 +531,7 @@ void put_fs_struct(struct fs_struct *fs) __put_fs_struct(fs); } -static inline void __exit_fs(struct task_struct *tsk) +static void __exit_fs(struct task_struct *tsk) { struct fs_struct * fs = tsk->fs; @@ -665,19 +666,22 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * the child reaper process (ie "init") in our pid * space. */ -static void -forget_original_parent(struct task_struct *father, struct list_head *to_release) +static void forget_original_parent(struct task_struct *father) { - struct task_struct *p, *reaper = father; - struct list_head *_p, *_n; + struct task_struct *p, *n, *reaper = father; + struct list_head ptrace_dead; + + INIT_LIST_HEAD(&ptrace_dead); + + write_lock_irq(&tasklist_lock); do { reaper = next_thread(reaper); if (reaper == father) { - reaper = child_reaper(father); + reaper = task_child_reaper(father); break; } - } while (reaper->exit_state); + } while (reaper->flags & PF_EXITING); /* * There are only two places where our children can be: @@ -687,9 +691,8 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release) * * Search them and reparent children. */ - list_for_each_safe(_p, _n, &father->children) { + list_for_each_entry_safe(p, n, &father->children, sibling) { int ptrace; - p = list_entry(_p, struct task_struct, sibling); ptrace = p->ptrace; @@ -715,13 +718,23 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release) * while it was being traced by us, to be able to see it in wait4. */ if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) - list_add(&p->ptrace_list, to_release); + list_add(&p->ptrace_list, &ptrace_dead); } - list_for_each_safe(_p, _n, &father->ptrace_children) { - p = list_entry(_p, struct task_struct, ptrace_list); + + list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) { p->real_parent = reaper; reparent_thread(p, father, 1); } + + write_unlock_irq(&tasklist_lock); + BUG_ON(!list_empty(&father->children)); + BUG_ON(!list_empty(&father->ptrace_children)); + + list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) { + list_del_init(&p->ptrace_list); + release_task(p); + } + } /* @@ -732,7 +745,6 @@ static void exit_notify(struct task_struct *tsk) { int state; struct task_struct *t; - struct list_head ptrace_dead, *_p, *_n; struct pid *pgrp; if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) @@ -753,8 +765,6 @@ static void exit_notify(struct task_struct *tsk) spin_unlock_irq(&tsk->sighand->siglock); } - write_lock_irq(&tasklist_lock); - /* * This does two things: * @@ -763,12 +773,10 @@ static void exit_notify(struct task_struct *tsk) * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ + forget_original_parent(tsk); + exit_task_namespaces(tsk); - INIT_LIST_HEAD(&ptrace_dead); - forget_original_parent(tsk, &ptrace_dead); - BUG_ON(!list_empty(&tsk->children)); - BUG_ON(!list_empty(&tsk->ptrace_children)); - + write_lock_irq(&tasklist_lock); /* * Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped @@ -792,7 +800,7 @@ static void exit_notify(struct task_struct *tsk) /* Let father know we died * * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitary processes. + * that to send signals to arbitary processes. * That stops right now. * * If the parent exec id doesn't match the exec id we saved @@ -833,12 +841,6 @@ static void exit_notify(struct task_struct *tsk) write_unlock_irq(&tasklist_lock); - list_for_each_safe(_p, _n, &ptrace_dead) { - list_del_init(_p); - t = list_entry(_p, struct task_struct, ptrace_list); - release_task(t); - } - /* If the process is dead, release it - nobody will wait for it */ if (state == EXIT_DEAD) release_task(tsk); @@ -874,10 +876,35 @@ static inline void check_stack_usage(void) {} static inline void exit_child_reaper(struct task_struct *tsk) { - if (likely(tsk->group_leader != child_reaper(tsk))) + if (likely(tsk->group_leader != task_child_reaper(tsk))) return; - panic("Attempted to kill init!"); + if (tsk->nsproxy->pid_ns == &init_pid_ns) + panic("Attempted to kill init!"); + + /* + * @tsk is the last thread in the 'cgroup-init' and is exiting. + * Terminate all remaining processes in the namespace and reap them + * before exiting @tsk. + * + * Note that @tsk (last thread of cgroup-init) may not necessarily + * be the child-reaper (i.e main thread of cgroup-init) of the + * namespace i.e the child_reaper may have already exited. + * + * Even after a child_reaper exits, we let it inherit orphaned children, + * because, pid_ns->child_reaper remains valid as long as there is + * at least one living sub-thread in the cgroup init. + + * This living sub-thread of the cgroup-init will be notified when + * a child inherited by the 'child-reaper' exits (do_notify_parent() + * uses __group_send_sig_info()). Further, when reaping child processes, + * do_wait() iterates over children of all living sub threads. + + * i.e even though 'child_reaper' thread is listed as the parent of the + * orphaned children, any living sub-thread in the cgroup-init can + * perform the role of the child_reaper. + */ + zap_pid_ns_processes(tsk->nsproxy->pid_ns); } fastcall NORET_TYPE void do_exit(long code) @@ -932,7 +959,7 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, current->pid, + current->comm, task_pid_nr(current), preempt_count()); acct_update_integrals(tsk); @@ -972,7 +999,7 @@ fastcall NORET_TYPE void do_exit(long code) __exit_fs(tsk); check_stack_usage(); exit_thread(); - cpuset_exit(tsk); + cgroup_exit(tsk, 1); exit_keys(tsk); if (group_dead && tsk->signal->leader) @@ -983,7 +1010,6 @@ fastcall NORET_TYPE void do_exit(long code) module_put(tsk->binfmt->module); proc_exit_connector(tsk); - exit_task_namespaces(tsk); exit_notify(tsk); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); @@ -1086,15 +1112,17 @@ asmlinkage void sys_exit_group(int error_code) static int eligible_child(pid_t pid, int options, struct task_struct *p) { int err; + struct pid_namespace *ns; + ns = current->nsproxy->pid_ns; if (pid > 0) { - if (p->pid != pid) + if (task_pid_nr_ns(p, ns) != pid) return 0; } else if (!pid) { - if (process_group(p) != process_group(current)) + if (task_pgrp_nr_ns(p, ns) != task_pgrp_vnr(current)) return 0; } else if (pid != -1) { - if (process_group(p) != -pid) + if (task_pgrp_nr_ns(p, ns) != -pid) return 0; } @@ -1164,9 +1192,12 @@ static int wait_task_zombie(struct task_struct *p, int noreap, { unsigned long state; int retval, status, traced; + struct pid_namespace *ns; + + ns = current->nsproxy->pid_ns; if (unlikely(noreap)) { - pid_t pid = p->pid; + pid_t pid = task_pid_nr_ns(p, ns); uid_t uid = p->uid; int exit_code = p->exit_code; int why, status; @@ -1285,11 +1316,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap, retval = put_user(status, &infop->si_status); } if (!retval && infop) - retval = put_user(p->pid, &infop->si_pid); + retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid); if (!retval && infop) retval = put_user(p->uid, &infop->si_uid); if (!retval) - retval = p->pid; + retval = task_pid_nr_ns(p, ns); if (traced) { write_lock_irq(&tasklist_lock); @@ -1326,6 +1357,7 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, int __user *stat_addr, struct rusage __user *ru) { int retval, exit_code; + struct pid_namespace *ns; if (!p->exit_code) return 0; @@ -1344,11 +1376,12 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ + ns = current->nsproxy->pid_ns; get_task_struct(p); read_unlock(&tasklist_lock); if (unlikely(noreap)) { - pid_t pid = p->pid; + pid_t pid = task_pid_nr_ns(p, ns); uid_t uid = p->uid; int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; @@ -1419,11 +1452,11 @@ bail_ref: if (!retval && infop) retval = put_user(exit_code, &infop->si_status); if (!retval && infop) - retval = put_user(p->pid, &infop->si_pid); + retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid); if (!retval && infop) retval = put_user(p->uid, &infop->si_uid); if (!retval) - retval = p->pid; + retval = task_pid_nr_ns(p, ns); put_task_struct(p); BUG_ON(!retval); @@ -1443,6 +1476,7 @@ static int wait_task_continued(struct task_struct *p, int noreap, int retval; pid_t pid; uid_t uid; + struct pid_namespace *ns; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; @@ -1457,7 +1491,8 @@ static int wait_task_continued(struct task_struct *p, int noreap, p->signal->flags &= ~SIGNAL_STOP_CONTINUED; spin_unlock_irq(&p->sighand->siglock); - pid = p->pid; + ns = current->nsproxy->pid_ns; + pid = task_pid_nr_ns(p, ns); uid = p->uid; get_task_struct(p); read_unlock(&tasklist_lock); @@ -1468,7 +1503,7 @@ static int wait_task_continued(struct task_struct *p, int noreap, if (!retval && stat_addr) retval = put_user(0xffff, stat_addr); if (!retval) - retval = p->pid; + retval = task_pid_nr_ns(p, ns); } else { retval = wait_noreap_copyout(p, pid, uid, CLD_CONTINUED, SIGCONT, @@ -1517,12 +1552,9 @@ repeat: tsk = current; do { struct task_struct *p; - struct list_head *_p; int ret; - list_for_each(_p,&tsk->children) { - p = list_entry(_p, struct task_struct, sibling); - + list_for_each_entry(p, &tsk->children, sibling) { ret = eligible_child(pid, options, p); if (!ret) continue; @@ -1604,9 +1636,8 @@ check_continued: } } if (!flag) { - list_for_each(_p, &tsk->ptrace_children) { - p = list_entry(_p, struct task_struct, - ptrace_list); + list_for_each_entry(p, &tsk->ptrace_children, + ptrace_list) { if (!eligible_child(pid, options, p)) continue; flag = 1; diff --git a/kernel/fork.c b/kernel/fork.c index 2ce28f165e31..ddafdfac9456 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -29,7 +29,7 @@ #include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> -#include <linux/cpuset.h> +#include <linux/cgroup.h> #include <linux/security.h> #include <linux/swap.h> #include <linux/syscalls.h> @@ -50,6 +50,7 @@ #include <linux/taskstats_kern.h> #include <linux/random.h> #include <linux/tty.h> +#include <linux/proc_fs.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -116,7 +117,7 @@ EXPORT_SYMBOL(free_task); void __put_task_struct(struct task_struct *tsk) { - WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); + WARN_ON(!tsk->exit_state); WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); @@ -205,7 +206,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } #ifdef CONFIG_MMU -static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, **pprev; struct rb_node **rb_link, *rb_parent; @@ -583,7 +584,7 @@ fail_nomem: return retval; } -static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) +static struct fs_struct *__copy_fs_struct(struct fs_struct *old) { struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); /* We don't need to lock fs - think why ;-) */ @@ -615,7 +616,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) EXPORT_SYMBOL_GPL(copy_fs_struct); -static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) +static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) { if (clone_flags & CLONE_FS) { atomic_inc(¤t->fs->count); @@ -818,7 +819,7 @@ int unshare_files(void) EXPORT_SYMBOL(unshare_files); -static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) +static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -841,7 +842,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) kmem_cache_free(sighand_cachep, sighand); } -static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) +static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; int ret; @@ -923,7 +924,7 @@ void __cleanup_signal(struct signal_struct *sig) kmem_cache_free(signal_cachep, sig); } -static inline void cleanup_signal(struct task_struct *tsk) +static void cleanup_signal(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; @@ -933,7 +934,7 @@ static inline void cleanup_signal(struct task_struct *tsk) __cleanup_signal(sig); } -static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) +static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; @@ -949,10 +950,10 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) { current->clear_child_tid = tidptr; - return current->pid; + return task_pid_vnr(current); } -static inline void rt_mutex_init_task(struct task_struct *p) +static void rt_mutex_init_task(struct task_struct *p) { spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES @@ -973,12 +974,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, - int __user *parent_tidptr, int __user *child_tidptr, struct pid *pid) { int retval; - struct task_struct *p = NULL; + struct task_struct *p; + int cgroup_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1042,12 +1043,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); - p->pid = pid_nr(pid); - retval = -EFAULT; - if (clone_flags & CLONE_PARENT_SETTID) - if (put_user(p->pid, parent_tidptr)) - goto bad_fork_cleanup_delays_binfmt; - INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); p->vfork_done = NULL; @@ -1087,13 +1082,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif p->io_context = NULL; p->audit_context = NULL; - cpuset_fork(p); + cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_copy(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup_cpuset; + goto bad_fork_cleanup_cgroup; } mpol_fix_fork_child_flag(p); #endif @@ -1126,10 +1121,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->blocked_on = NULL; /* not blocked yet */ #endif - p->tgid = p->pid; - if (clone_flags & CLONE_THREAD) - p->tgid = current->tgid; - if ((retval = security_task_alloc(p))) goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) @@ -1155,6 +1146,24 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (retval) goto bad_fork_cleanup_namespaces; + if (pid != &init_struct_pid) { + retval = -ENOMEM; + pid = alloc_pid(task_active_pid_ns(p)); + if (!pid) + goto bad_fork_cleanup_namespaces; + + if (clone_flags & CLONE_NEWPID) { + retval = pid_ns_prepare_proc(task_active_pid_ns(p)); + if (retval < 0) + goto bad_fork_free_pid; + } + } + + p->pid = pid_nr(pid); + p->tgid = p->pid; + if (clone_flags & CLONE_THREAD) + p->tgid = current->tgid; + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? @@ -1204,6 +1213,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); + /* Now that the task is set up, run cgroup callbacks if + * necessary. We need to run them before the task is visible + * on the tasklist. */ + cgroup_fork_callbacks(p); + cgroup_callbacks_done = 1; + /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); @@ -1246,7 +1261,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_cleanup_namespaces; + goto bad_fork_free_pid; } if (clone_flags & CLONE_THREAD) { @@ -1275,11 +1290,22 @@ static struct task_struct *copy_process(unsigned long clone_flags, __ptrace_link(p, current->parent); if (thread_group_leader(p)) { - p->signal->tty = current->signal->tty; - p->signal->pgrp = process_group(current); - set_signal_session(p->signal, process_session(current)); - attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); - attach_pid(p, PIDTYPE_SID, task_session(current)); + if (clone_flags & CLONE_NEWPID) { + p->nsproxy->pid_ns->child_reaper = p; + p->signal->tty = NULL; + set_task_pgrp(p, p->pid); + set_task_session(p, p->pid); + attach_pid(p, PIDTYPE_PGID, pid); + attach_pid(p, PIDTYPE_SID, pid); + } else { + p->signal->tty = current->signal->tty; + set_task_pgrp(p, task_pgrp_nr(current)); + set_task_session(p, task_session_nr(current)); + attach_pid(p, PIDTYPE_PGID, + task_pgrp(current)); + attach_pid(p, PIDTYPE_SID, + task_session(current)); + } list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; @@ -1292,8 +1318,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); + cgroup_post_fork(p); return p; +bad_fork_free_pid: + if (pid != &init_struct_pid) + free_pid(pid); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_keys: @@ -1318,10 +1348,9 @@ bad_fork_cleanup_security: bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_free(p->mempolicy); -bad_fork_cleanup_cpuset: +bad_fork_cleanup_cgroup: #endif - cpuset_exit(p); -bad_fork_cleanup_delays_binfmt: + cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); if (p->binfmt) module_put(p->binfmt->module); @@ -1348,7 +1377,7 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct task_struct *task; struct pt_regs regs; - task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, + task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, &init_struct_pid); if (!IS_ERR(task)) init_idle(task, cpu); @@ -1356,7 +1385,7 @@ struct task_struct * __cpuinit fork_idle(int cpu) return task; } -static inline int fork_traceflag (unsigned clone_flags) +static int fork_traceflag(unsigned clone_flags) { if (clone_flags & CLONE_UNTRACED) return 0; @@ -1387,19 +1416,16 @@ long do_fork(unsigned long clone_flags, { struct task_struct *p; int trace = 0; - struct pid *pid = alloc_pid(); long nr; - if (!pid) - return -EAGAIN; - nr = pid->nr; if (unlikely(current->ptrace)) { trace = fork_traceflag (clone_flags); if (trace) clone_flags |= CLONE_PTRACE; } - p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); + p = copy_process(clone_flags, stack_start, regs, stack_size, + child_tidptr, NULL); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1407,6 +1433,17 @@ long do_fork(unsigned long clone_flags, if (!IS_ERR(p)) { struct completion vfork; + /* + * this is enough to call pid_nr_ns here, but this if + * improves optimisation of regular fork() + */ + nr = (clone_flags & CLONE_NEWPID) ? + task_pid_nr_ns(p, current->nsproxy->pid_ns) : + task_pid_vnr(p); + + if (clone_flags & CLONE_PARENT_SETTID) + put_user(nr, parent_tidptr); + if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); @@ -1440,7 +1477,6 @@ long do_fork(unsigned long clone_flags, } } } else { - free_pid(pid); nr = PTR_ERR(p); } return nr; @@ -1485,7 +1521,7 @@ void __init proc_caches_init(void) * Check constraints on flags passed to the unshare system call and * force unsharing of additional process context as appropriate. */ -static inline void check_unshare_flags(unsigned long *flags_ptr) +static void check_unshare_flags(unsigned long *flags_ptr) { /* * If unsharing a thread from a thread group, must also @@ -1617,7 +1653,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; - struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; + struct nsproxy *new_nsproxy = NULL; check_unshare_flags(&unshare_flags); @@ -1647,14 +1683,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { - task_lock(current); - if (new_nsproxy) { - old_nsproxy = current->nsproxy; - current->nsproxy = new_nsproxy; - new_nsproxy = old_nsproxy; + switch_task_namespaces(current, new_nsproxy); + new_nsproxy = NULL; } + task_lock(current); + if (new_fs) { fs = current->fs; current->fs = new_fs; diff --git a/kernel/futex.c b/kernel/futex.c index e45a65e41686..32710451dc20 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -53,6 +53,9 @@ #include <linux/signal.h> #include <linux/module.h> #include <linux/magic.h> +#include <linux/pid.h> +#include <linux/nsproxy.h> + #include <asm/futex.h> #include "rtmutex_common.h" @@ -443,8 +446,7 @@ static struct task_struct * futex_find_get_task(pid_t pid) struct task_struct *p; rcu_read_lock(); - p = find_task_by_pid(pid); - + p = find_task_by_vpid(pid); if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) p = ERR_PTR(-ESRCH); else @@ -653,7 +655,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!(uval & FUTEX_OWNER_DIED)) { int ret = 0; - newval = FUTEX_WAITERS | new_owner->pid; + newval = FUTEX_WAITERS | task_pid_vnr(new_owner); curval = cmpxchg_futex_value_locked(uaddr, uval, newval); @@ -1106,7 +1108,7 @@ static void unqueue_me_pi(struct futex_q *q) static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, struct task_struct *curr) { - u32 newtid = curr->pid | FUTEX_WAITERS; + u32 newtid = task_pid_vnr(curr) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; u32 uval, curval, newval; int ret; @@ -1368,7 +1370,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * (by doing a 0 -> TID atomic cmpxchg), while holding all * the locks. It will most likely not succeed. */ - newval = current->pid; + newval = task_pid_vnr(current); curval = cmpxchg_futex_value_locked(uaddr, 0, newval); @@ -1379,7 +1381,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * Detect deadlocks. In case of REQUEUE_PI this is a valid * situation and we return success to user space. */ - if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { + if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { ret = -EDEADLK; goto out_unlock_release_sem; } @@ -1408,7 +1410,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, */ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | current->pid; + newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); ownerdied = 0; lock_taken = 1; } @@ -1587,7 +1589,7 @@ retry: /* * We release only a lock we actually own: */ - if ((uval & FUTEX_TID_MASK) != current->pid) + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; /* * First take all the futex related locks: @@ -1608,7 +1610,7 @@ retry_unlocked: * anyone else up: */ if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0); + uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); if (unlikely(uval == -EFAULT)) @@ -1617,7 +1619,7 @@ retry_unlocked: * Rare case: we managed to release the lock atomically, * no need to wake anyone else up: */ - if (unlikely(uval == current->pid)) + if (unlikely(uval == task_pid_vnr(current))) goto out_unlock; /* @@ -1854,7 +1856,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, ret = -ESRCH; rcu_read_lock(); - p = find_task_by_pid(pid); + p = find_task_by_vpid(pid); if (!p) goto err_unlock; ret = -EPERM; @@ -1887,7 +1889,7 @@ retry: if (get_user(uval, uaddr)) return -1; - if ((uval & FUTEX_TID_MASK) == curr->pid) { + if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { /* * Ok, this dying thread is truly holding a futex * of interest. Set the OWNER_DIED bit atomically diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 2c2e2954b713..00b572666cc7 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -8,6 +8,7 @@ #include <linux/linkage.h> #include <linux/compat.h> +#include <linux/nsproxy.h> #include <linux/futex.h> #include <asm/uaccess.h> @@ -124,7 +125,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, ret = -ESRCH; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_vpid(pid); if (!p) goto err_unlock; ret = -EPERM; diff --git a/kernel/kexec.c b/kernel/kexec.c index e9f1b4ea504d..aa74a1ef2da8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -51,7 +51,7 @@ struct resource crashk_res = { int kexec_should_crash(struct task_struct *p) { - if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops) + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) return 1; return 0; } @@ -1146,6 +1146,172 @@ static int __init crash_notes_memory_init(void) } module_init(crash_notes_memory_init) + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warning("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warning("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warning("crashkernel: Memory " + "value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warning("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warning("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warning("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + pr_warning("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (system_ram >= start && system_ram <= end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warning("Memory value expected " + "after '@'\n"); + return -EINVAL; + } + } + } + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warning("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + + return 0; +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *p = cmdline, *ck_cmdline = NULL; + char *first_colon, *first_space; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, "crashkernel="); + while (p) { + ck_cmdline = p; + p = strstr(p+1, "crashkernel="); + } + + if (!ck_cmdline) + return -EINVAL; + + ck_cmdline += 12; /* strlen("crashkernel=") */ + + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + else + return parse_crashkernel_simple(ck_cmdline, crash_size, + crash_base); + + return 0; +} + + + void crash_save_vmcoreinfo(void) { u32 *buf; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index a6f1ee9c92d9..55fe0c7cd95f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -511,11 +511,11 @@ static void lockdep_print_held_locks(struct task_struct *curr) int i, depth = curr->lockdep_depth; if (!depth) { - printk("no locks held by %s/%d.\n", curr->comm, curr->pid); + printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); return; } printk("%d lock%s held by %s/%d:\n", - depth, depth > 1 ? "s" : "", curr->comm, curr->pid); + depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr)); for (i = 0; i < depth; i++) { printk(" #%d: ", i); @@ -904,7 +904,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) print_kernel_version(); printk( "-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", - curr->comm, curr->pid); + curr->comm, task_pid_nr(curr)); print_lock(check_source); printk("\nbut task is already holding lock:\n"); print_lock(check_target); @@ -1085,7 +1085,7 @@ print_bad_irq_dependency(struct task_struct *curr, print_kernel_version(); printk( "------------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", - curr->comm, curr->pid, + curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, curr->hardirqs_enabled, @@ -1237,7 +1237,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, print_kernel_version(); printk( "---------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", - curr->comm, curr->pid); + curr->comm, task_pid_nr(curr)); print_lock(next); printk("\nbut task is already holding lock:\n"); print_lock(prev); @@ -1521,7 +1521,7 @@ cache_hit: } static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, - struct held_lock *hlock, int chain_head, u64 chain_key) + struct held_lock *hlock, int chain_head, u64 chain_key) { /* * Trylock needs to maintain the stack of held locks, but it @@ -1641,7 +1641,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, usage_str[prev_bit], usage_str[new_bit]); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", - curr->comm, curr->pid, + curr->comm, task_pid_nr(curr), trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, trace_hardirqs_enabled(curr), @@ -1694,7 +1694,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, print_kernel_version(); printk( "---------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", - curr->comm, curr->pid); + curr->comm, task_pid_nr(curr)); print_lock(this); if (forwards) printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); @@ -2487,7 +2487,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, printk( "[ BUG: bad unlock balance detected! ]\n"); printk( "-------------------------------------\n"); printk("%s/%d is trying to release lock (", - curr->comm, curr->pid); + curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); printk(") at:\n"); print_ip_sym(ip); @@ -2737,7 +2737,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, printk( "[ BUG: bad contention detected! ]\n"); printk( "---------------------------------\n"); printk("%s/%d is trying to contend lock (", - curr->comm, curr->pid); + curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); printk(") at:\n"); print_ip_sym(ip); @@ -3072,7 +3072,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, printk( "[ BUG: held lock freed! ]\n"); printk( "-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", - curr->comm, curr->pid, mem_from, mem_to-1); + curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); lockdep_print_held_locks(curr); @@ -3125,7 +3125,7 @@ static void print_held_locks_bug(struct task_struct *curr) printk( "[ BUG: lock held at task exit time! ]\n"); printk( "-------------------------------------\n"); printk("%s/%d is exiting with locks still held!\n", - curr->comm, curr->pid); + curr->comm, task_pid_nr(curr)); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); diff --git a/kernel/marker.c b/kernel/marker.c new file mode 100644 index 000000000000..ccb48d9a3657 --- /dev/null +++ b/kernel/marker.c @@ -0,0 +1,525 @@ +/* + * Copyright (C) 2007 Mathieu Desnoyers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/jhash.h> +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/marker.h> +#include <linux/err.h> + +extern struct marker __start___markers[]; +extern struct marker __stop___markers[]; + +/* + * module_mutex nests inside markers_mutex. Markers mutex protects the builtin + * and module markers, the hash table and deferred_sync. + */ +static DEFINE_MUTEX(markers_mutex); + +/* + * Marker deferred synchronization. + * Upon marker probe_unregister, we delay call to synchronize_sched() to + * accelerate mass unregistration (only when there is no more reference to a + * given module do we call synchronize_sched()). However, we need to make sure + * every critical region has ended before we re-arm a marker that has been + * unregistered and then registered back with a different probe data. + */ +static int deferred_sync; + +/* + * Marker hash table, containing the active markers. + * Protected by module_mutex. + */ +#define MARKER_HASH_BITS 6 +#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) + +struct marker_entry { + struct hlist_node hlist; + char *format; + marker_probe_func *probe; + void *private; + int refcount; /* Number of times armed. 0 if disarmed. */ + char name[0]; /* Contains name'\0'format'\0' */ +}; + +static struct hlist_head marker_table[MARKER_TABLE_SIZE]; + +/** + * __mark_empty_function - Empty probe callback + * @mdata: pointer of type const struct marker + * @fmt: format string + * @...: variable argument list + * + * Empty callback provided as a probe to the markers. By providing this to a + * disabled marker, we make sure the execution flow is always valid even + * though the function pointer change and the marker enabling are two distinct + * operations that modifies the execution flow of preemptible code. + */ +void __mark_empty_function(const struct marker *mdata, void *private, + const char *fmt, ...) +{ +} +EXPORT_SYMBOL_GPL(__mark_empty_function); + +/* + * Get marker if the marker is present in the marker hash table. + * Must be called with markers_mutex held. + * Returns NULL if not present. + */ +static struct marker_entry *get_marker(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + u32 hash = jhash(name, strlen(name), 0); + + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) + return e; + } + return NULL; +} + +/* + * Add the marker to the marker hash table. Must be called with markers_mutex + * held. + */ +static int add_marker(const char *name, const char *format, + marker_probe_func *probe, void *private) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + size_t name_len = strlen(name) + 1; + size_t format_len = 0; + u32 hash = jhash(name, name_len-1, 0); + + if (format) + format_len = strlen(format) + 1; + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + printk(KERN_NOTICE + "Marker %s busy, probe %p already installed\n", + name, e->probe); + return -EBUSY; /* Already there */ + } + } + /* + * Using kmalloc here to allocate a variable length element. Could + * cause some memory fragmentation if overused. + */ + e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, + GFP_KERNEL); + if (!e) + return -ENOMEM; + memcpy(&e->name[0], name, name_len); + if (format) { + e->format = &e->name[name_len]; + memcpy(e->format, format, format_len); + trace_mark(core_marker_format, "name %s format %s", + e->name, e->format); + } else + e->format = NULL; + e->probe = probe; + e->private = private; + e->refcount = 0; + hlist_add_head(&e->hlist, head); + return 0; +} + +/* + * Remove the marker from the marker hash table. Must be called with mutex_lock + * held. + */ +static void *remove_marker(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + int found = 0; + size_t len = strlen(name) + 1; + void *private = NULL; + u32 hash = jhash(name, len-1, 0); + + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + found = 1; + break; + } + } + if (found) { + private = e->private; + hlist_del(&e->hlist); + kfree(e); + } + return private; +} + +/* + * Set the mark_entry format to the format found in the element. + */ +static int marker_set_format(struct marker_entry **entry, const char *format) +{ + struct marker_entry *e; + size_t name_len = strlen((*entry)->name) + 1; + size_t format_len = strlen(format) + 1; + + e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, + GFP_KERNEL); + if (!e) + return -ENOMEM; + memcpy(&e->name[0], (*entry)->name, name_len); + e->format = &e->name[name_len]; + memcpy(e->format, format, format_len); + e->probe = (*entry)->probe; + e->private = (*entry)->private; + e->refcount = (*entry)->refcount; + hlist_add_before(&e->hlist, &(*entry)->hlist); + hlist_del(&(*entry)->hlist); + kfree(*entry); + *entry = e; + trace_mark(core_marker_format, "name %s format %s", + e->name, e->format); + return 0; +} + +/* + * Sets the probe callback corresponding to one marker. + */ +static int set_marker(struct marker_entry **entry, struct marker *elem) +{ + int ret; + WARN_ON(strcmp((*entry)->name, elem->name) != 0); + + if ((*entry)->format) { + if (strcmp((*entry)->format, elem->format) != 0) { + printk(KERN_NOTICE + "Format mismatch for probe %s " + "(%s), marker (%s)\n", + (*entry)->name, + (*entry)->format, + elem->format); + return -EPERM; + } + } else { + ret = marker_set_format(entry, elem->format); + if (ret) + return ret; + } + elem->call = (*entry)->probe; + elem->private = (*entry)->private; + elem->state = 1; + return 0; +} + +/* + * Disable a marker and its probe callback. + * Note: only after a synchronize_sched() issued after setting elem->call to the + * empty function insures that the original callback is not used anymore. This + * insured by preemption disabling around the call site. + */ +static void disable_marker(struct marker *elem) +{ + elem->state = 0; + elem->call = __mark_empty_function; + /* + * Leave the private data and id there, because removal is racy and + * should be done only after a synchronize_sched(). These are never used + * until the next initialization anyway. + */ +} + +/** + * marker_update_probe_range - Update a probe range + * @begin: beginning of the range + * @end: end of the range + * @probe_module: module address of the probe being updated + * @refcount: number of references left to the given probe_module (out) + * + * Updates the probe callback corresponding to a range of markers. + * Must be called with markers_mutex held. + */ +void marker_update_probe_range(struct marker *begin, + struct marker *end, struct module *probe_module, + int *refcount) +{ + struct marker *iter; + struct marker_entry *mark_entry; + + for (iter = begin; iter < end; iter++) { + mark_entry = get_marker(iter->name); + if (mark_entry && mark_entry->refcount) { + set_marker(&mark_entry, iter); + /* + * ignore error, continue + */ + if (probe_module) + if (probe_module == + __module_text_address((unsigned long)mark_entry->probe)) + (*refcount)++; + } else { + disable_marker(iter); + } + } +} + +/* + * Update probes, removing the faulty probes. + * Issues a synchronize_sched() when no reference to the module passed + * as parameter is found in the probes so the probe module can be + * safely unloaded from now on. + */ +static void marker_update_probes(struct module *probe_module) +{ + int refcount = 0; + + mutex_lock(&markers_mutex); + /* Core kernel markers */ + marker_update_probe_range(__start___markers, + __stop___markers, probe_module, &refcount); + /* Markers in modules. */ + module_update_markers(probe_module, &refcount); + if (probe_module && refcount == 0) { + synchronize_sched(); + deferred_sync = 0; + } + mutex_unlock(&markers_mutex); +} + +/** + * marker_probe_register - Connect a probe to a marker + * @name: marker name + * @format: format string + * @probe: probe handler + * @private: probe private data + * + * private data must be a valid allocated memory address, or NULL. + * Returns 0 if ok, error value on error. + */ +int marker_probe_register(const char *name, const char *format, + marker_probe_func *probe, void *private) +{ + struct marker_entry *entry; + int ret = 0, need_update = 0; + + mutex_lock(&markers_mutex); + entry = get_marker(name); + if (entry && entry->refcount) { + ret = -EBUSY; + goto end; + } + if (deferred_sync) { + synchronize_sched(); + deferred_sync = 0; + } + ret = add_marker(name, format, probe, private); + if (ret) + goto end; + need_update = 1; +end: + mutex_unlock(&markers_mutex); + if (need_update) + marker_update_probes(NULL); + return ret; +} +EXPORT_SYMBOL_GPL(marker_probe_register); + +/** + * marker_probe_unregister - Disconnect a probe from a marker + * @name: marker name + * + * Returns the private data given to marker_probe_register, or an ERR_PTR(). + */ +void *marker_probe_unregister(const char *name) +{ + struct module *probe_module; + struct marker_entry *entry; + void *private; + int need_update = 0; + + mutex_lock(&markers_mutex); + entry = get_marker(name); + if (!entry) { + private = ERR_PTR(-ENOENT); + goto end; + } + entry->refcount = 0; + /* In what module is the probe handler ? */ + probe_module = __module_text_address((unsigned long)entry->probe); + private = remove_marker(name); + deferred_sync = 1; + need_update = 1; +end: + mutex_unlock(&markers_mutex); + if (need_update) + marker_update_probes(probe_module); + return private; +} +EXPORT_SYMBOL_GPL(marker_probe_unregister); + +/** + * marker_probe_unregister_private_data - Disconnect a probe from a marker + * @private: probe private data + * + * Unregister a marker by providing the registered private data. + * Returns the private data given to marker_probe_register, or an ERR_PTR(). + */ +void *marker_probe_unregister_private_data(void *private) +{ + struct module *probe_module; + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *entry; + int found = 0; + unsigned int i; + int need_update = 0; + + mutex_lock(&markers_mutex); + for (i = 0; i < MARKER_TABLE_SIZE; i++) { + head = &marker_table[i]; + hlist_for_each_entry(entry, node, head, hlist) { + if (entry->private == private) { + found = 1; + goto iter_end; + } + } + } +iter_end: + if (!found) { + private = ERR_PTR(-ENOENT); + goto end; + } + entry->refcount = 0; + /* In what module is the probe handler ? */ + probe_module = __module_text_address((unsigned long)entry->probe); + private = remove_marker(entry->name); + deferred_sync = 1; + need_update = 1; +end: + mutex_unlock(&markers_mutex); + if (need_update) + marker_update_probes(probe_module); + return private; +} +EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); + +/** + * marker_arm - Arm a marker + * @name: marker name + * + * Activate a marker. It keeps a reference count of the number of + * arming/disarming done. + * Returns 0 if ok, error value on error. + */ +int marker_arm(const char *name) +{ + struct marker_entry *entry; + int ret = 0, need_update = 0; + + mutex_lock(&markers_mutex); + entry = get_marker(name); + if (!entry) { + ret = -ENOENT; + goto end; + } + /* + * Only need to update probes when refcount passes from 0 to 1. + */ + if (entry->refcount++) + goto end; + need_update = 1; +end: + mutex_unlock(&markers_mutex); + if (need_update) + marker_update_probes(NULL); + return ret; +} +EXPORT_SYMBOL_GPL(marker_arm); + +/** + * marker_disarm - Disarm a marker + * @name: marker name + * + * Disarm a marker. It keeps a reference count of the number of arming/disarming + * done. + * Returns 0 if ok, error value on error. + */ +int marker_disarm(const char *name) +{ + struct marker_entry *entry; + int ret = 0, need_update = 0; + + mutex_lock(&markers_mutex); + entry = get_marker(name); + if (!entry) { + ret = -ENOENT; + goto end; + } + /* + * Only permit decrement refcount if higher than 0. + * Do probe update only on 1 -> 0 transition. + */ + if (entry->refcount) { + if (--entry->refcount) + goto end; + } else { + ret = -EPERM; + goto end; + } + need_update = 1; +end: + mutex_unlock(&markers_mutex); + if (need_update) + marker_update_probes(NULL); + return ret; +} +EXPORT_SYMBOL_GPL(marker_disarm); + +/** + * marker_get_private_data - Get a marker's probe private data + * @name: marker name + * + * Returns the private data pointer, or an ERR_PTR. + * The private data pointer should _only_ be dereferenced if the caller is the + * owner of the data, or its content could vanish. This is mostly used to + * confirm that a caller is the owner of a registered probe. + */ +void *marker_get_private_data(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + size_t name_len = strlen(name) + 1; + u32 hash = jhash(name, name_len-1, 0); + int found = 0; + + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + found = 1; + return e->private; + } + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL_GPL(marker_get_private_data); diff --git a/kernel/module.c b/kernel/module.c index 7734595bd329..3202c9950073 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1673,6 +1673,8 @@ static struct module *load_module(void __user *umod, unsigned int unusedcrcindex; unsigned int unusedgplindex; unsigned int unusedgplcrcindex; + unsigned int markersindex; + unsigned int markersstringsindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -1939,6 +1941,9 @@ static struct module *load_module(void __user *umod, add_taint_module(mod, TAINT_FORCED_MODULE); } #endif + markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); + markersstringsindex = find_sec(hdr, sechdrs, secstrings, + "__markers_strings"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -1961,6 +1966,11 @@ static struct module *load_module(void __user *umod, if (err < 0) goto cleanup; } +#ifdef CONFIG_MARKERS + mod->markers = (void *)sechdrs[markersindex].sh_addr; + mod->num_markers = + sechdrs[markersindex].sh_size / sizeof(*mod->markers); +#endif /* Find duplicate symbols */ err = verify_export_symbols(mod); @@ -1979,6 +1989,11 @@ static struct module *load_module(void __user *umod, add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); +#ifdef CONFIG_MARKERS + if (!mod->taints) + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers, NULL, NULL); +#endif err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2570,3 +2585,18 @@ EXPORT_SYMBOL(module_remove_driver); void struct_module(struct module *mod) { return; } EXPORT_SYMBOL(struct_module); #endif + +#ifdef CONFIG_MARKERS +void module_update_markers(struct module *probe_module, int *refcount) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) + if (!mod->taints) + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers, + probe_module, refcount); + mutex_unlock(&module_mutex); +} +#endif diff --git a/kernel/notifier.c b/kernel/notifier.c new file mode 100644 index 000000000000..4253f472f060 --- /dev/null +++ b/kernel/notifier.c @@ -0,0 +1,539 @@ +#include <linux/kdebug.h> +#include <linux/kprobes.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/rcupdate.h> +#include <linux/vmalloc.h> + +/* + * Notifier list for kernel code which wants to be called + * at shutdown. This is used to stop any idling DMA operations + * and the like. + */ +BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); + +/* + * Notifier chain core routines. The exported routines below + * are layered on top of these, with appropriate locking added. + */ + +static int notifier_chain_register(struct notifier_block **nl, + struct notifier_block *n) +{ + while ((*nl) != NULL) { + if (n->priority > (*nl)->priority) + break; + nl = &((*nl)->next); + } + n->next = *nl; + rcu_assign_pointer(*nl, n); + return 0; +} + +static int notifier_chain_unregister(struct notifier_block **nl, + struct notifier_block *n) +{ + while ((*nl) != NULL) { + if ((*nl) == n) { + rcu_assign_pointer(*nl, n->next); + return 0; + } + nl = &((*nl)->next); + } + return -ENOENT; +} + +/** + * notifier_call_chain - Informs the registered notifiers about an event. + * @nl: Pointer to head of the blocking notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * @nr_to_call: Number of notifier functions to be called. Don't care + * value of this parameter is -1. + * @nr_calls: Records the number of notifications sent. Don't care + * value of this field is NULL. + * @returns: notifier_call_chain returns the value returned by the + * last notifier function called. + */ +static int __kprobes notifier_call_chain(struct notifier_block **nl, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) +{ + int ret = NOTIFY_DONE; + struct notifier_block *nb, *next_nb; + + nb = rcu_dereference(*nl); + + while (nb && nr_to_call) { + next_nb = rcu_dereference(nb->next); + ret = nb->notifier_call(nb, val, v); + + if (nr_calls) + (*nr_calls)++; + + if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) + break; + nb = next_nb; + nr_to_call--; + } + return ret; +} + +/* + * Atomic notifier chain routines. Registration and unregistration + * use a spinlock, and call_chain is synchronized by RCU (no locks). + */ + +/** + * atomic_notifier_chain_register - Add notifier to an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to an atomic notifier chain. + * + * Currently always returns zero. + */ +int atomic_notifier_chain_register(struct atomic_notifier_head *nh, + struct notifier_block *n) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_register(&nh->head, n); + spin_unlock_irqrestore(&nh->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); + +/** + * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from an atomic notifier chain. + * + * Returns zero on success or %-ENOENT on failure. + */ +int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, + struct notifier_block *n) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_unregister(&nh->head, n); + spin_unlock_irqrestore(&nh->lock, flags); + synchronize_rcu(); + return ret; +} +EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); + +/** + * __atomic_notifier_call_chain - Call functions in an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See the comment for notifier_call_chain. + * @nr_calls: See the comment for notifier_call_chain. + * + * Calls each function in a notifier chain in turn. The functions + * run in an atomic context, so they must not block. + * This routine uses RCU to synchronize with changes to the chain. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ +int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) +{ + int ret; + + rcu_read_lock(); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); + +int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) +{ + return __atomic_notifier_call_chain(nh, val, v, -1, NULL); +} +EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); + +/* + * Blocking notifier chain routines. All access to the chain is + * synchronized by an rwsem. + */ + +/** + * blocking_notifier_chain_register - Add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to a blocking notifier chain. + * Must be called in process context. + * + * Currently always returns zero. + */ +int blocking_notifier_chain_register(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call down_write(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_register(&nh->head, n); + + down_write(&nh->rwsem); + ret = notifier_chain_register(&nh->head, n); + up_write(&nh->rwsem); + return ret; +} +EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); + +/** + * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from a blocking notifier chain. + * Must be called from process context. + * + * Returns zero on success or %-ENOENT on failure. + */ +int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call down_write(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_unregister(&nh->head, n); + + down_write(&nh->rwsem); + ret = notifier_chain_unregister(&nh->head, n); + up_write(&nh->rwsem); + return ret; +} +EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); + +/** + * __blocking_notifier_call_chain - Call functions in a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain. + * + * Calls each function in a notifier chain in turn. The functions + * run in a process context, so they are allowed to block. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ +int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) +{ + int ret = NOTIFY_DONE; + + /* + * We check the head outside the lock, but if this access is + * racy then it does not matter what the result of the test + * is, we re-check the list after having taken the lock anyway: + */ + if (rcu_dereference(nh->head)) { + down_read(&nh->rwsem); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, + nr_calls); + up_read(&nh->rwsem); + } + return ret; +} +EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); + +int blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v) +{ + return __blocking_notifier_call_chain(nh, val, v, -1, NULL); +} +EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); + +/* + * Raw notifier chain routines. There is no protection; + * the caller must provide it. Use at your own risk! + */ + +/** + * raw_notifier_chain_register - Add notifier to a raw notifier chain + * @nh: Pointer to head of the raw notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to a raw notifier chain. + * All locking must be provided by the caller. + * + * Currently always returns zero. + */ +int raw_notifier_chain_register(struct raw_notifier_head *nh, + struct notifier_block *n) +{ + return notifier_chain_register(&nh->head, n); +} +EXPORT_SYMBOL_GPL(raw_notifier_chain_register); + +/** + * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain + * @nh: Pointer to head of the raw notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from a raw notifier chain. + * All locking must be provided by the caller. + * + * Returns zero on success or %-ENOENT on failure. + */ +int raw_notifier_chain_unregister(struct raw_notifier_head *nh, + struct notifier_block *n) +{ + return notifier_chain_unregister(&nh->head, n); +} +EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); + +/** + * __raw_notifier_call_chain - Call functions in a raw notifier chain + * @nh: Pointer to head of the raw notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain + * + * Calls each function in a notifier chain in turn. The functions + * run in an undefined context. + * All locking must be provided by the caller. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ +int __raw_notifier_call_chain(struct raw_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) +{ + return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); +} +EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); + +int raw_notifier_call_chain(struct raw_notifier_head *nh, + unsigned long val, void *v) +{ + return __raw_notifier_call_chain(nh, val, v, -1, NULL); +} +EXPORT_SYMBOL_GPL(raw_notifier_call_chain); + +/* + * SRCU notifier chain routines. Registration and unregistration + * use a mutex, and call_chain is synchronized by SRCU (no locks). + */ + +/** + * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to an SRCU notifier chain. + * Must be called in process context. + * + * Currently always returns zero. + */ +int srcu_notifier_chain_register(struct srcu_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call mutex_lock(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_register(&nh->head, n); + + mutex_lock(&nh->mutex); + ret = notifier_chain_register(&nh->head, n); + mutex_unlock(&nh->mutex); + return ret; +} +EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); + +/** + * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from an SRCU notifier chain. + * Must be called from process context. + * + * Returns zero on success or %-ENOENT on failure. + */ +int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call mutex_lock(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_unregister(&nh->head, n); + + mutex_lock(&nh->mutex); + ret = notifier_chain_unregister(&nh->head, n); + mutex_unlock(&nh->mutex); + synchronize_srcu(&nh->srcu); + return ret; +} +EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); + +/** + * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain + * + * Calls each function in a notifier chain in turn. The functions + * run in a process context, so they are allowed to block. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ +int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) +{ + int ret; + int idx; + + idx = srcu_read_lock(&nh->srcu); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); + srcu_read_unlock(&nh->srcu, idx); + return ret; +} +EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); + +int srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v) +{ + return __srcu_notifier_call_chain(nh, val, v, -1, NULL); +} +EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); + +/** + * srcu_init_notifier_head - Initialize an SRCU notifier head + * @nh: Pointer to head of the srcu notifier chain + * + * Unlike other sorts of notifier heads, SRCU notifier heads require + * dynamic initialization. Be sure to call this routine before + * calling any of the other SRCU notifier routines for this head. + * + * If an SRCU notifier head is deallocated, it must first be cleaned + * up by calling srcu_cleanup_notifier_head(). Otherwise the head's + * per-cpu data (used by the SRCU mechanism) will leak. + */ +void srcu_init_notifier_head(struct srcu_notifier_head *nh) +{ + mutex_init(&nh->mutex); + if (init_srcu_struct(&nh->srcu) < 0) + BUG(); + nh->head = NULL; +} +EXPORT_SYMBOL_GPL(srcu_init_notifier_head); + +/** + * register_reboot_notifier - Register function to be called at reboot time + * @nb: Info about notifier function to be called + * + * Registers a function with the list of functions + * to be called at reboot time. + * + * Currently always returns zero, as blocking_notifier_chain_register() + * always returns zero. + */ +int register_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(register_reboot_notifier); + +/** + * unregister_reboot_notifier - Unregister previously registered reboot notifier + * @nb: Hook to be unregistered + * + * Unregisters a previously registered reboot + * notifier function. + * + * Returns zero on success, or %-ENOENT on failure. + */ +int unregister_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(unregister_reboot_notifier); + +static ATOMIC_NOTIFIER_HEAD(die_chain); + +int notify_die(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig, + + }; + return atomic_notifier_call_chain(&die_chain, val, &args); +} + +int register_die_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(&die_chain, nb); +} +EXPORT_SYMBOL_GPL(register_die_notifier); + +int unregister_die_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&die_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_die_notifier); diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c new file mode 100644 index 000000000000..aead4d69f62b --- /dev/null +++ b/kernel/ns_cgroup.c @@ -0,0 +1,100 @@ +/* + * ns_cgroup.c - namespace cgroup subsystem + * + * Copyright 2006, 2007 IBM Corp + */ + +#include <linux/module.h> +#include <linux/cgroup.h> +#include <linux/fs.h> + +struct ns_cgroup { + struct cgroup_subsys_state css; + spinlock_t lock; +}; + +struct cgroup_subsys ns_subsys; + +static inline struct ns_cgroup *cgroup_to_ns( + struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, ns_subsys_id), + struct ns_cgroup, css); +} + +int ns_cgroup_clone(struct task_struct *task) +{ + return cgroup_clone(task, &ns_subsys); +} + +/* + * Rules: + * 1. you can only enter a cgroup which is a child of your current + * cgroup + * 2. you can only place another process into a cgroup if + * a. you have CAP_SYS_ADMIN + * b. your cgroup is an ancestor of task's destination cgroup + * (hence either you are in the same cgroup as task, or in an + * ancestor cgroup thereof) + */ +static int ns_can_attach(struct cgroup_subsys *ss, + struct cgroup *new_cgroup, struct task_struct *task) +{ + struct cgroup *orig; + + if (current != task) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!cgroup_is_descendant(new_cgroup)) + return -EPERM; + } + + if (atomic_read(&new_cgroup->count) != 0) + return -EPERM; + + orig = task_cgroup(task, ns_subsys_id); + if (orig && orig != new_cgroup->parent) + return -EPERM; + + return 0; +} + +/* + * Rules: you can only create a cgroup if + * 1. you are capable(CAP_SYS_ADMIN) + * 2. the target cgroup is a descendant of your own cgroup + */ +static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + struct ns_cgroup *ns_cgroup; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (!cgroup_is_descendant(cgroup)) + return ERR_PTR(-EPERM); + + ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); + if (!ns_cgroup) + return ERR_PTR(-ENOMEM); + spin_lock_init(&ns_cgroup->lock); + return &ns_cgroup->css; +} + +static void ns_destroy(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + struct ns_cgroup *ns_cgroup; + + ns_cgroup = cgroup_to_ns(cgroup); + kfree(ns_cgroup); +} + +struct cgroup_subsys ns_subsys = { + .name = "ns", + .can_attach = ns_can_attach, + .create = ns_create, + .destroy = ns_destroy, + .subsys_id = ns_subsys_id, +}; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 049e7c0ac566..79f871bc0ef4 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -26,19 +26,6 @@ static struct kmem_cache *nsproxy_cachep; struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); -static inline void get_nsproxy(struct nsproxy *ns) -{ - atomic_inc(&ns->count); -} - -void get_task_namespaces(struct task_struct *tsk) -{ - struct nsproxy *ns = tsk->nsproxy; - if (ns) { - get_nsproxy(ns); - } -} - /* * creates a copy of "orig" with refcount 1. */ @@ -87,7 +74,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); + new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; @@ -142,7 +129,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) get_nsproxy(old_ns); - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) { @@ -156,7 +144,14 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } + err = ns_cgroup_clone(tsk); + if (err) { + put_nsproxy(new_ns); + goto out; + } + tsk->nsproxy = new_ns; + out: put_nsproxy(old_ns); return err; @@ -196,11 +191,46 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, *new_nsp = create_new_namespaces(unshare_flags, current, new_fs ? new_fs : current->fs); - if (IS_ERR(*new_nsp)) + if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); + goto out; + } + + err = ns_cgroup_clone(current); + if (err) + put_nsproxy(*new_nsp); + +out: return err; } +void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) +{ + struct nsproxy *ns; + + might_sleep(); + + ns = p->nsproxy; + + rcu_assign_pointer(p->nsproxy, new); + + if (ns && atomic_dec_and_test(&ns->count)) { + /* + * wait for others to get what they want from this nsproxy. + * + * cannot release this nsproxy via the call_rcu() since + * put_mnt_ns() will want to sleep + */ + synchronize_rcu(); + free_nsproxy(ns); + } +} + +void exit_task_namespaces(struct task_struct *p) +{ + switch_task_namespaces(p, NULL); +} + static int __init nsproxy_cache_init(void) { nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); diff --git a/kernel/pid.c b/kernel/pid.c index c6e3f9ffff87..d1db36b94674 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -18,6 +18,12 @@ * allocation scenario when all but one out of 1 million PIDs possible are * allocated already: the scanning of 32 list entries and at most PAGE_SIZE * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). + * + * Pid namespaces: + * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. + * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM + * Many thanks to Oleg Nesterov for comments and help + * */ #include <linux/mm.h> @@ -28,12 +34,14 @@ #include <linux/hash.h> #include <linux/pid_namespace.h> #include <linux/init_task.h> +#include <linux/syscalls.h> -#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) +#define pid_hashfn(nr, ns) \ + hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) static struct hlist_head *pid_hash; static int pidhash_shift; -static struct kmem_cache *pid_cachep; struct pid init_struct_pid = INIT_STRUCT_PID; +static struct kmem_cache *pid_ns_cachep; int pid_max = PID_MAX_DEFAULT; @@ -68,8 +76,25 @@ struct pid_namespace init_pid_ns = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, .last_pid = 0, - .child_reaper = &init_task + .level = 0, + .child_reaper = &init_task, }; +EXPORT_SYMBOL_GPL(init_pid_ns); + +int is_container_init(struct task_struct *tsk) +{ + int ret = 0; + struct pid *pid; + + rcu_read_lock(); + pid = task_pid(tsk); + if (pid != NULL && pid->numbers[pid->level].nr == 1) + ret = 1; + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(is_container_init); /* * Note: disable interrupts while the pidmap_lock is held as an @@ -176,11 +201,17 @@ static int next_pidmap(struct pid_namespace *pid_ns, int last) fastcall void put_pid(struct pid *pid) { + struct pid_namespace *ns; + if (!pid) return; + + ns = pid->numbers[pid->level].ns; if ((atomic_read(&pid->count) == 1) || - atomic_dec_and_test(&pid->count)) - kmem_cache_free(pid_cachep, pid); + atomic_dec_and_test(&pid->count)) { + kmem_cache_free(ns->pid_cachep, pid); + put_pid_ns(ns); + } } EXPORT_SYMBOL_GPL(put_pid); @@ -193,60 +224,94 @@ static void delayed_put_pid(struct rcu_head *rhp) fastcall void free_pid(struct pid *pid) { /* We can be called with write_lock_irq(&tasklist_lock) held */ + int i; unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); - hlist_del_rcu(&pid->pid_chain); + for (i = 0; i <= pid->level; i++) + hlist_del_rcu(&pid->numbers[i].pid_chain); spin_unlock_irqrestore(&pidmap_lock, flags); - free_pidmap(&init_pid_ns, pid->nr); + for (i = 0; i <= pid->level; i++) + free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); + call_rcu(&pid->rcu, delayed_put_pid); } -struct pid *alloc_pid(void) +struct pid *alloc_pid(struct pid_namespace *ns) { struct pid *pid; enum pid_type type; - int nr = -1; + int i, nr; + struct pid_namespace *tmp; + struct upid *upid; - pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL); + pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) goto out; - nr = alloc_pidmap(current->nsproxy->pid_ns); - if (nr < 0) - goto out_free; + tmp = ns; + for (i = ns->level; i >= 0; i--) { + nr = alloc_pidmap(tmp); + if (nr < 0) + goto out_free; + + pid->numbers[i].nr = nr; + pid->numbers[i].ns = tmp; + tmp = tmp->parent; + } + get_pid_ns(ns); + pid->level = ns->level; atomic_set(&pid->count, 1); - pid->nr = nr; for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); spin_lock_irq(&pidmap_lock); - hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]); + for (i = ns->level; i >= 0; i--) { + upid = &pid->numbers[i]; + hlist_add_head_rcu(&upid->pid_chain, + &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + } spin_unlock_irq(&pidmap_lock); out: return pid; out_free: - kmem_cache_free(pid_cachep, pid); + for (i++; i <= ns->level; i++) + free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); + + kmem_cache_free(ns->pid_cachep, pid); pid = NULL; goto out; } -struct pid * fastcall find_pid(int nr) +struct pid * fastcall find_pid_ns(int nr, struct pid_namespace *ns) { struct hlist_node *elem; - struct pid *pid; + struct upid *pnr; + + hlist_for_each_entry_rcu(pnr, elem, + &pid_hash[pid_hashfn(nr, ns)], pid_chain) + if (pnr->nr == nr && pnr->ns == ns) + return container_of(pnr, struct pid, + numbers[ns->level]); - hlist_for_each_entry_rcu(pid, elem, - &pid_hash[pid_hashfn(nr)], pid_chain) { - if (pid->nr == nr) - return pid; - } return NULL; } +EXPORT_SYMBOL_GPL(find_pid_ns); + +struct pid *find_vpid(int nr) +{ + return find_pid_ns(nr, current->nsproxy->pid_ns); +} +EXPORT_SYMBOL_GPL(find_vpid); + +struct pid *find_pid(int nr) +{ + return find_pid_ns(nr, &init_pid_ns); +} EXPORT_SYMBOL_GPL(find_pid); /* @@ -307,12 +372,32 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) /* * Must be called under rcu_read_lock() or with tasklist_lock read-held. */ -struct task_struct *find_task_by_pid_type(int type, int nr) +struct task_struct *find_task_by_pid_type_ns(int type, int nr, + struct pid_namespace *ns) { - return pid_task(find_pid(nr), type); + return pid_task(find_pid_ns(nr, ns), type); } -EXPORT_SYMBOL(find_task_by_pid_type); +EXPORT_SYMBOL(find_task_by_pid_type_ns); + +struct task_struct *find_task_by_pid(pid_t nr) +{ + return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns); +} +EXPORT_SYMBOL(find_task_by_pid); + +struct task_struct *find_task_by_vpid(pid_t vnr) +{ + return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, + current->nsproxy->pid_ns); +} +EXPORT_SYMBOL(find_task_by_vpid); + +struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) +{ + return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns); +} +EXPORT_SYMBOL(find_task_by_pid_ns); struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { @@ -339,45 +424,239 @@ struct pid *find_get_pid(pid_t nr) struct pid *pid; rcu_read_lock(); - pid = get_pid(find_pid(nr)); + pid = get_pid(find_vpid(nr)); rcu_read_unlock(); return pid; } +pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) +{ + struct upid *upid; + pid_t nr = 0; + + if (pid && ns->level <= pid->level) { + upid = &pid->numbers[ns->level]; + if (upid->ns == ns) + nr = upid->nr; + } + return nr; +} + +pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return pid_nr_ns(task_pid(tsk), ns); +} +EXPORT_SYMBOL(task_pid_nr_ns); + +pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return pid_nr_ns(task_tgid(tsk), ns); +} +EXPORT_SYMBOL(task_tgid_nr_ns); + +pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return pid_nr_ns(task_pgrp(tsk), ns); +} +EXPORT_SYMBOL(task_pgrp_nr_ns); + +pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return pid_nr_ns(task_session(tsk), ns); +} +EXPORT_SYMBOL(task_session_nr_ns); + /* * Used by proc to find the first pid that is greater then or equal to nr. * * If there is a pid at nr this function is exactly the same as find_pid. */ -struct pid *find_ge_pid(int nr) +struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { struct pid *pid; do { - pid = find_pid(nr); + pid = find_pid_ns(nr, ns); if (pid) break; - nr = next_pidmap(current->nsproxy->pid_ns, nr); + nr = next_pidmap(ns, nr); } while (nr > 0); return pid; } EXPORT_SYMBOL_GPL(find_get_pid); +struct pid_cache { + int nr_ids; + char name[16]; + struct kmem_cache *cachep; + struct list_head list; +}; + +static LIST_HEAD(pid_caches_lh); +static DEFINE_MUTEX(pid_caches_mutex); + +/* + * creates the kmem cache to allocate pids from. + * @nr_ids: the number of numerical ids this pid will have to carry + */ + +static struct kmem_cache *create_pid_cachep(int nr_ids) +{ + struct pid_cache *pcache; + struct kmem_cache *cachep; + + mutex_lock(&pid_caches_mutex); + list_for_each_entry (pcache, &pid_caches_lh, list) + if (pcache->nr_ids == nr_ids) + goto out; + + pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL); + if (pcache == NULL) + goto err_alloc; + + snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); + cachep = kmem_cache_create(pcache->name, + sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (cachep == NULL) + goto err_cachep; + + pcache->nr_ids = nr_ids; + pcache->cachep = cachep; + list_add(&pcache->list, &pid_caches_lh); +out: + mutex_unlock(&pid_caches_mutex); + return pcache->cachep; + +err_cachep: + kfree(pcache); +err_alloc: + mutex_unlock(&pid_caches_mutex); + return NULL; +} + +static struct pid_namespace *create_pid_namespace(int level) +{ + struct pid_namespace *ns; + int i; + + ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL); + if (ns == NULL) + goto out; + + ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!ns->pidmap[0].page) + goto out_free; + + ns->pid_cachep = create_pid_cachep(level + 1); + if (ns->pid_cachep == NULL) + goto out_free_map; + + kref_init(&ns->kref); + ns->last_pid = 0; + ns->child_reaper = NULL; + ns->level = level; + + set_bit(0, ns->pidmap[0].page); + atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); + + for (i = 1; i < PIDMAP_ENTRIES; i++) { + ns->pidmap[i].page = 0; + atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); + } + + return ns; + +out_free_map: + kfree(ns->pidmap[0].page); +out_free: + kmem_cache_free(pid_ns_cachep, ns); +out: + return ERR_PTR(-ENOMEM); +} + +static void destroy_pid_namespace(struct pid_namespace *ns) +{ + int i; + + for (i = 0; i < PIDMAP_ENTRIES; i++) + kfree(ns->pidmap[i].page); + kmem_cache_free(pid_ns_cachep, ns); +} + struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) { + struct pid_namespace *new_ns; + BUG_ON(!old_ns); - get_pid_ns(old_ns); - return old_ns; + new_ns = get_pid_ns(old_ns); + if (!(flags & CLONE_NEWPID)) + goto out; + + new_ns = ERR_PTR(-EINVAL); + if (flags & CLONE_THREAD) + goto out_put; + + new_ns = create_pid_namespace(old_ns->level + 1); + if (!IS_ERR(new_ns)) + new_ns->parent = get_pid_ns(old_ns); + +out_put: + put_pid_ns(old_ns); +out: + return new_ns; } void free_pid_ns(struct kref *kref) { - struct pid_namespace *ns; + struct pid_namespace *ns, *parent; ns = container_of(kref, struct pid_namespace, kref); - kfree(ns); + + parent = ns->parent; + destroy_pid_namespace(ns); + + if (parent != NULL) + put_pid_ns(parent); +} + +void zap_pid_ns_processes(struct pid_namespace *pid_ns) +{ + int nr; + int rc; + + /* + * The last thread in the cgroup-init thread group is terminating. + * Find remaining pid_ts in the namespace, signal and wait for them + * to exit. + * + * Note: This signals each threads in the namespace - even those that + * belong to the same thread group, To avoid this, we would have + * to walk the entire tasklist looking a processes in this + * namespace, but that could be unnecessarily expensive if the + * pid namespace has just a few processes. Or we need to + * maintain a tasklist for each pid namespace. + * + */ + read_lock(&tasklist_lock); + nr = next_pidmap(pid_ns, 1); + while (nr > 0) { + kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr); + nr = next_pidmap(pid_ns, nr); + } + read_unlock(&tasklist_lock); + + do { + clear_thread_flag(TIF_SIGPENDING); + rc = sys_wait4(-1, NULL, __WALL, NULL); + } while (rc != -ECHILD); + + + /* Child reaper for the pid namespace is going away */ + pid_ns->child_reaper = NULL; + return; } /* @@ -412,5 +691,9 @@ void __init pidmap_init(void) set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); - pid_cachep = KMEM_CACHE(pid, SLAB_PANIC); + init_pid_ns.pid_cachep = create_pid_cachep(1); + if (init_pid_ns.pid_cachep == NULL) + panic("Can't create pid_1 cachep\n"); + + pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index b53c8fcd9d82..68c96376e84a 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -21,8 +21,8 @@ static int check_clock(const clockid_t which_clock) read_lock(&tasklist_lock); p = find_task_by_pid(pid); - if (!p || (CPUCLOCK_PERTHREAD(which_clock) ? - p->tgid != current->tgid : p->tgid != pid)) { + if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? + same_thread_group(p, current) : thread_group_leader(p))) { error = -EINVAL; } read_unlock(&tasklist_lock); @@ -308,13 +308,13 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) p = find_task_by_pid(pid); if (p) { if (CPUCLOCK_PERTHREAD(which_clock)) { - if (p->tgid == current->tgid) { + if (same_thread_group(p, current)) { error = cpu_clock_sample(which_clock, p, &rtn); } } else { read_lock(&tasklist_lock); - if (p->tgid == pid && p->signal) { + if (thread_group_leader(p) && p->signal) { error = cpu_clock_sample_group(which_clock, p, &rtn); @@ -355,7 +355,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) p = current; } else { p = find_task_by_pid(pid); - if (p && p->tgid != current->tgid) + if (p && !same_thread_group(p, current)) p = NULL; } } else { @@ -363,7 +363,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) p = current->group_leader; } else { p = find_task_by_pid(pid); - if (p && p->tgid != pid) + if (p && !thread_group_leader(p)) p = NULL; } } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d11f579d189a..35b4bbfc78ff 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -404,7 +404,7 @@ static struct task_struct * good_sigevent(sigevent_t * event) if ((event->sigev_notify & SIGEV_THREAD_ID ) && (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || - rtn->tgid != current->tgid || + !same_thread_group(rtn, current) || (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) return NULL; @@ -608,7 +608,7 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) spin_lock(&timr->it_lock); if ((timr->it_id != timer_id) || !(timr->it_process) || - timr->it_process->tgid != current->tgid) { + !same_thread_group(timr->it_process, current)) { spin_unlock(&timr->it_lock); spin_unlock_irqrestore(&idr_lock, *flags); timr = NULL; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a73ebd3b9d4c..7c76f2ffaeaa 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -19,6 +19,7 @@ #include <linux/security.h> #include <linux/signal.h> #include <linux/audit.h> +#include <linux/pid_namespace.h> #include <asm/pgtable.h> #include <asm/uaccess.h> @@ -168,7 +169,7 @@ int ptrace_attach(struct task_struct *task) retval = -EPERM; if (task->pid <= 1) goto out; - if (task->tgid == current->tgid) + if (same_thread_group(task, current)) goto out; repeat: @@ -443,7 +444,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid) return ERR_PTR(-EPERM); read_lock(&tasklist_lock); - child = find_task_by_pid(pid); + child = find_task_by_vpid(pid); if (child) get_task_struct(child); diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 6b0703db152d..56d73cb8826d 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -87,7 +87,7 @@ static int rt_trace_on = 1; static void printk_task(struct task_struct *p) { if (p) - printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); + printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio); else printk("<none>"); } @@ -152,22 +152,25 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk( "[ BUG: circular locking deadlock detected! ]\n"); printk( "--------------------------------------------\n"); printk("%s/%d is deadlocking current task %s/%d\n\n", - task->comm, task->pid, current->comm, current->pid); + task->comm, task_pid_nr(task), + current->comm, task_pid_nr(current)); printk("\n1) %s/%d is trying to acquire this lock:\n", - current->comm, current->pid); + current->comm, task_pid_nr(current)); printk_lock(waiter->lock, 1); - printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); + printk("\n2) %s/%d is blocked on this lock:\n", + task->comm, task_pid_nr(task)); printk_lock(waiter->deadlock_lock, 1); debug_show_held_locks(current); debug_show_held_locks(task); - printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); + printk("\n%s/%d's [blocked] stackdump:\n\n", + task->comm, task_pid_nr(task)); show_stack(task, NULL); printk("\n%s/%d's [current] stackdump:\n\n", - current->comm, current->pid); + current->comm, task_pid_nr(current)); dump_stack(); debug_show_all_locks(); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 8cd9bd2cdb34..0deef71ff8d2 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -185,7 +185,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, prev_max = max_lock_depth; printk(KERN_WARNING "Maximum lock depth %d reached " "task: %s (%d)\n", max_lock_depth, - top_task->comm, top_task->pid); + top_task->comm, task_pid_nr(top_task)); } put_task_struct(task); diff --git a/kernel/sched.c b/kernel/sched.c index ed90be46fb31..afe76ec2e7fe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -44,6 +44,7 @@ #include <linux/vmalloc.h> #include <linux/blkdev.h> #include <linux/delay.h> +#include <linux/pid_namespace.h> #include <linux/smp.h> #include <linux/threads.h> #include <linux/timer.h> @@ -51,6 +52,7 @@ #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/percpu.h> +#include <linux/cpu_acct.h> #include <linux/kthread.h> #include <linux/seq_file.h> #include <linux/sysctl.h> @@ -153,10 +155,15 @@ struct rt_prio_array { #ifdef CONFIG_FAIR_GROUP_SCHED +#include <linux/cgroup.h> + struct cfs_rq; /* task group related information */ struct task_group { +#ifdef CONFIG_FAIR_CGROUP_SCHED + struct cgroup_subsys_state css; +#endif /* schedulable entities of this group on each cpu */ struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ @@ -197,6 +204,9 @@ static inline struct task_group *task_group(struct task_struct *p) #ifdef CONFIG_FAIR_USER_SCHED tg = p->user->tg; +#elif defined(CONFIG_FAIR_CGROUP_SCHED) + tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), + struct task_group, css); #else tg = &init_task_group; #endif @@ -1875,7 +1885,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) preempt_enable(); #endif if (current->set_child_tid) - put_user(current->pid, current->set_child_tid); + put_user(task_pid_vnr(current), current->set_child_tid); } /* @@ -3307,9 +3317,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; cputime64_t tmp; + struct rq *rq = this_rq(); p->utime = cputime_add(p->utime, cputime); + if (p != rq->idle) + cpuacct_charge(p, cputime); + /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); if (TASK_NICE(p) > 0) @@ -3374,9 +3388,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (softirq_count()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); - else if (p != rq->idle) + else if (p != rq->idle) { cpustat->system = cputime64_add(cpustat->system, tmp); - else if (atomic_read(&rq->nr_iowait) > 0) + cpuacct_charge(p, cputime); + } else if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else cpustat->idle = cputime64_add(cpustat->idle, tmp); @@ -3412,8 +3427,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else cpustat->idle = cputime64_add(cpustat->idle, tmp); - } else + } else { cpustat->steal = cputime64_add(cpustat->steal, tmp); + cpuacct_charge(p, -tmp); + } } /* @@ -3493,7 +3510,7 @@ EXPORT_SYMBOL(sub_preempt_count); static noinline void __schedule_bug(struct task_struct *prev) { printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", - prev->comm, preempt_count(), prev->pid); + prev->comm, preempt_count(), task_pid_nr(prev)); debug_show_held_locks(prev); if (irqs_disabled()) print_irqtrace_events(prev); @@ -4159,7 +4176,7 @@ struct task_struct *idle_task(int cpu) */ static struct task_struct *find_process_by_pid(pid_t pid) { - return pid ? find_task_by_pid(pid) : current; + return pid ? find_task_by_vpid(pid) : current; } /* Actually do priority change: must hold rq lock. */ @@ -4462,8 +4479,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) cpus_allowed = cpuset_cpus_allowed(p); cpus_and(new_mask, new_mask, cpus_allowed); + again: retval = set_cpus_allowed(p, new_mask); + if (!retval) { + cpus_allowed = cpuset_cpus_allowed(p); + if (!cpus_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + new_mask = cpus_allowed; + goto again; + } + } out_unlock: put_task_struct(p); mutex_unlock(&sched_hotcpu_mutex); @@ -4843,7 +4873,8 @@ static void show_task(struct task_struct *p) free = (unsigned long)n - (unsigned long)end_of_stack(p); } #endif - printk(KERN_CONT "%5lu %5d %6d\n", free, p->pid, p->parent->pid); + printk(KERN_CONT "%5lu %5d %6d\n", free, + task_pid_nr(p), task_pid_nr(p->parent)); if (state != TASK_RUNNING) show_stack(p, NULL); @@ -5137,8 +5168,16 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { + cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); + /* + * Try to stay on the same cpuset, where the + * current cpuset may be a subset of all cpus. + * The cpuset_cpus_allowed_locked() variant of + * cpuset_cpus_allowed() will not block. It must be + * called within calls to cpuset_lock/cpuset_unlock. + */ rq = task_rq_lock(p, &flags); - cpus_setall(p->cpus_allowed); + p->cpus_allowed = cpus_allowed; dest_cpu = any_online_cpu(p->cpus_allowed); task_rq_unlock(rq, &flags); @@ -5150,7 +5189,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) if (p->mm && printk_ratelimit()) printk(KERN_INFO "process %d (%s) no " "longer affine to cpu%d\n", - p->pid, p->comm, dead_cpu); + task_pid_nr(p), p->comm, dead_cpu); } } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); } @@ -5257,7 +5296,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) struct rq *rq = cpu_rq(dead_cpu); /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); + BUG_ON(!p->exit_state); /* Cannot have done final schedule yet: would have vanished. */ BUG_ON(p->state == TASK_DEAD); @@ -5504,6 +5543,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD: case CPU_DEAD_FROZEN: + cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); @@ -5517,6 +5557,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); spin_unlock_irq(&rq->lock); + cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); @@ -6367,26 +6408,31 @@ error: return -ENOMEM; #endif } + +static cpumask_t *doms_cur; /* current sched domains */ +static int ndoms_cur; /* number of sched domains in 'doms_cur' */ + +/* + * Special case: If a kmalloc of a doms_cur partition (array of + * cpumask_t) fails, then fallback to a single sched domain, + * as determined by the single cpumask_t fallback_doms. + */ +static cpumask_t fallback_doms; + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. */ static int arch_init_sched_domains(const cpumask_t *cpu_map) { - cpumask_t cpu_default_map; - int err; - - /* - * Setup mask for cpus without special case scheduling requirements. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. - */ - cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); - - err = build_sched_domains(&cpu_default_map); - + ndoms_cur = 1; + doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!doms_cur) + doms_cur = &fallback_doms; + cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); register_sched_domain_sysctl(); - - return err; + return build_sched_domains(doms_cur); } static void arch_destroy_sched_domains(const cpumask_t *cpu_map) @@ -6410,6 +6456,68 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) arch_destroy_sched_domains(cpu_map); } +/* + * Partition sched domains as specified by the 'ndoms_new' + * cpumasks in the array doms_new[] of cpumasks. This compares + * doms_new[] to the current sched domain partitioning, doms_cur[]. + * It destroys each deleted domain and builds each new domain. + * + * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. + * The masks don't intersect (don't overlap.) We should setup one + * sched domain for each mask. CPUs not in any of the cpumasks will + * not be load balanced. If the same cpumask appears both in the + * current 'doms_cur' domains and in the new 'doms_new', we can leave + * it as it is. + * + * The passed in 'doms_new' should be kmalloc'd. This routine takes + * ownership of it and will kfree it when done with it. If the caller + * failed the kmalloc call, then it can pass in doms_new == NULL, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms'. + * + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) +{ + int i, j; + + if (doms_new == NULL) { + ndoms_new = 1; + doms_new = &fallback_doms; + cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + } + + /* Destroy deleted domains */ + for (i = 0; i < ndoms_cur; i++) { + for (j = 0; j < ndoms_new; j++) { + if (cpus_equal(doms_cur[i], doms_new[j])) + goto match1; + } + /* no match - a current sched domain not in new doms_new[] */ + detach_destroy_domains(doms_cur + i); +match1: + ; + } + + /* Build new domains */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < ndoms_cur; j++) { + if (cpus_equal(doms_new[i], doms_cur[j])) + goto match2; + } + /* no match - add a new doms_new */ + build_sched_domains(doms_new + i); +match2: + ; + } + + /* Remember the new sched domains */ + if (doms_cur != &fallback_doms) + kfree(doms_cur); + doms_cur = doms_new; + ndoms_cur = ndoms_new; +} + #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) static int arch_reinit_sched_domains(void) { @@ -6991,3 +7099,116 @@ unsigned long sched_group_shares(struct task_group *tg) } #endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_FAIR_CGROUP_SCHED + +/* return corresponding task_group object of a cgroup */ +static inline struct task_group *cgroup_tg(struct cgroup *cont) +{ + return container_of(cgroup_subsys_state(cont, cpu_cgroup_subsys_id), + struct task_group, css); +} + +static struct cgroup_subsys_state * +cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct task_group *tg; + + if (!cont->parent) { + /* This is early initialization for the top cgroup */ + init_task_group.css.cgroup = cont; + return &init_task_group.css; + } + + /* we support only 1-level deep hierarchical scheduler atm */ + if (cont->parent->parent) + return ERR_PTR(-EINVAL); + + tg = sched_create_group(); + if (IS_ERR(tg)) + return ERR_PTR(-ENOMEM); + + /* Bind the cgroup to task_group object we just created */ + tg->css.cgroup = cont; + + return &tg->css; +} + +static void cpu_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct task_group *tg = cgroup_tg(cont); + + sched_destroy_group(tg); +} + +static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, + struct cgroup *cont, struct task_struct *tsk) +{ + /* We don't support RT-tasks being in separate groups */ + if (tsk->sched_class != &fair_sched_class) + return -EINVAL; + + return 0; +} + +static void +cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct cgroup *old_cont, struct task_struct *tsk) +{ + sched_move_task(tsk); +} + +static ssize_t cpu_shares_write(struct cgroup *cont, struct cftype *cftype, + struct file *file, const char __user *userbuf, + size_t nbytes, loff_t *ppos) +{ + unsigned long shareval; + struct task_group *tg = cgroup_tg(cont); + char buffer[2*sizeof(unsigned long) + 1]; + int rc; + + if (nbytes > 2*sizeof(unsigned long)) /* safety check */ + return -E2BIG; + + if (copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + shareval = simple_strtoul(buffer, NULL, 10); + + rc = sched_group_set_shares(tg, shareval); + + return (rc < 0 ? rc : nbytes); +} + +static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft) +{ + struct task_group *tg = cgroup_tg(cont); + + return (u64) tg->shares; +} + +static struct cftype cpu_shares = { + .name = "shares", + .read_uint = cpu_shares_read_uint, + .write = cpu_shares_write, +}; + +static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_file(cont, ss, &cpu_shares); +} + +struct cgroup_subsys cpu_cgroup_subsys = { + .name = "cpu", + .create = cpu_cgroup_create, + .destroy = cpu_cgroup_destroy, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, + .populate = cpu_cgroup_populate, + .subsys_id = cpu_cgroup_subsys_id, + .early_init = 1, +}; + +#endif /* CONFIG_FAIR_CGROUP_SCHED */ diff --git a/kernel/signal.c b/kernel/signal.c index e4f059cd9867..12006308c7eb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -256,7 +256,7 @@ flush_signal_handlers(struct task_struct *t, int force_default) int unhandled_signal(struct task_struct *tsk, int sig) { - if (is_init(tsk)) + if (is_global_init(tsk)) return 1; if (tsk->ptrace & PT_PTRACED) return 0; @@ -536,7 +536,7 @@ static int check_kill_permission(int sig, struct siginfo *info, return error; error = -EPERM; if (((sig != SIGCONT) || - (process_session(current) != process_session(t))) + (task_session_nr(current) != task_session_nr(t))) && (current->euid ^ t->suid) && (current->euid ^ t->uid) && (current->uid ^ t->suid) && (current->uid ^ t->uid) && !capable(CAP_KILL)) @@ -694,7 +694,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; - q->info.si_pid = current->pid; + q->info.si_pid = task_pid_vnr(current); q->info.si_uid = current->uid; break; case (unsigned long) SEND_SIG_PRIV: @@ -730,7 +730,7 @@ int print_fatal_signals; static void print_fatal_signal(struct pt_regs *regs, int signr) { printk("%s/%d: potentially unexpected fatal signal %d.\n", - current->comm, current->pid, signr); + current->comm, task_pid_nr(current), signr); #ifdef __i386__ printk("code at %08lx: ", regs->eip); @@ -1089,7 +1089,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; rcu_read_lock(); - error = kill_pid_info(sig, info, find_pid(pid)); + error = kill_pid_info(sig, info, find_vpid(pid)); rcu_read_unlock(); return error; } @@ -1150,7 +1150,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid) read_lock(&tasklist_lock); for_each_process(p) { - if (p->pid > 1 && p->tgid != current->tgid) { + if (p->pid > 1 && !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p); ++count; if (err != -EPERM) @@ -1160,9 +1160,9 @@ static int kill_something_info(int sig, struct siginfo *info, int pid) read_unlock(&tasklist_lock); ret = count ? retval : -ESRCH; } else if (pid < 0) { - ret = kill_pgrp_info(sig, info, find_pid(-pid)); + ret = kill_pgrp_info(sig, info, find_vpid(-pid)); } else { - ret = kill_pid_info(sig, info, find_pid(pid)); + ret = kill_pid_info(sig, info, find_vpid(pid)); } rcu_read_unlock(); return ret; @@ -1266,7 +1266,12 @@ EXPORT_SYMBOL(kill_pid); int kill_proc(pid_t pid, int sig, int priv) { - return kill_proc_info(sig, __si_special(priv), pid); + int ret; + + rcu_read_lock(); + ret = kill_pid_info(sig, __si_special(priv), find_pid(pid)); + rcu_read_unlock(); + return ret; } /* @@ -1443,7 +1448,22 @@ void do_notify_parent(struct task_struct *tsk, int sig) info.si_signo = sig; info.si_errno = 0; - info.si_pid = tsk->pid; + /* + * we are under tasklist_lock here so our parent is tied to + * us and cannot exit and release its namespace. + * + * the only it can is to switch its nsproxy with sys_unshare, + * bu uncharing pid namespaces is not allowed, so we'll always + * see relevant namespace + * + * write_lock() currently calls preempt_disable() which is the + * same as rcu_read_lock(), but according to Oleg, this is not + * correct to rely on this + */ + rcu_read_lock(); + info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); + rcu_read_unlock(); + info.si_uid = tsk->uid; /* FIXME: find out whether or not this is supposed to be c*time. */ @@ -1508,7 +1528,13 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) info.si_signo = SIGCHLD; info.si_errno = 0; - info.si_pid = tsk->pid; + /* + * see comment in do_notify_parent() abot the following 3 lines + */ + rcu_read_lock(); + info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); + rcu_read_unlock(); + info.si_uid = tsk->uid; /* FIXME: find out whether or not this is supposed to be c*time. */ @@ -1634,7 +1660,7 @@ void ptrace_notify(int exit_code) memset(&info, 0, sizeof info); info.si_signo = SIGTRAP; info.si_code = exit_code; - info.si_pid = current->pid; + info.si_pid = task_pid_vnr(current); info.si_uid = current->uid; /* Let the debugger run. */ @@ -1804,7 +1830,7 @@ relock: info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; - info->si_pid = current->parent->pid; + info->si_pid = task_pid_vnr(current->parent); info->si_uid = current->parent->uid; } @@ -1835,11 +1861,9 @@ relock: continue; /* - * Init of a pid space gets no signals it doesn't want from - * within that pid space. It can of course get signals from - * its parent pid space. + * Global init gets no signals it doesn't want. */ - if (current == child_reaper(current)) + if (is_global_init(current)) continue; if (sig_kernel_stop(signr)) { @@ -2193,7 +2217,7 @@ sys_kill(int pid, int sig) info.si_signo = sig; info.si_errno = 0; info.si_code = SI_USER; - info.si_pid = current->tgid; + info.si_pid = task_tgid_vnr(current); info.si_uid = current->uid; return kill_something_info(sig, &info, pid); @@ -2209,12 +2233,12 @@ static int do_tkill(int tgid, int pid, int sig) info.si_signo = sig; info.si_errno = 0; info.si_code = SI_TKILL; - info.si_pid = current->tgid; + info.si_pid = task_tgid_vnr(current); info.si_uid = current->uid; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - if (p && (tgid <= 0 || p->tgid == tgid)) { + p = find_task_by_vpid(pid); + if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { error = check_kill_permission(sig, &info, p); /* * The null signal is a permissions and process existence diff --git a/kernel/softlockup.c b/kernel/softlockup.c index edeeef3a6a32..11df812263c8 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -113,7 +113,7 @@ void softlockup_tick(void) spin_lock(&print_lock); printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", this_cpu, now - touch_timestamp, - current->comm, current->pid); + current->comm, task_pid_nr(current)); if (regs) show_regs(regs); else diff --git a/kernel/sys.c b/kernel/sys.c index bc8879c822a5..304b5410d746 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -106,537 +106,6 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); -/* - * Notifier list for kernel code which wants to be called - * at shutdown. This is used to stop any idling DMA operations - * and the like. - */ - -static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); - -/* - * Notifier chain core routines. The exported routines below - * are layered on top of these, with appropriate locking added. - */ - -static int notifier_chain_register(struct notifier_block **nl, - struct notifier_block *n) -{ - while ((*nl) != NULL) { - if (n->priority > (*nl)->priority) - break; - nl = &((*nl)->next); - } - n->next = *nl; - rcu_assign_pointer(*nl, n); - return 0; -} - -static int notifier_chain_unregister(struct notifier_block **nl, - struct notifier_block *n) -{ - while ((*nl) != NULL) { - if ((*nl) == n) { - rcu_assign_pointer(*nl, n->next); - return 0; - } - nl = &((*nl)->next); - } - return -ENOENT; -} - -/** - * notifier_call_chain - Informs the registered notifiers about an event. - * @nl: Pointer to head of the blocking notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: Number of notifier functions to be called. Don't care - * value of this parameter is -1. - * @nr_calls: Records the number of notifications sent. Don't care - * value of this field is NULL. - * @returns: notifier_call_chain returns the value returned by the - * last notifier function called. - */ - -static int __kprobes notifier_call_chain(struct notifier_block **nl, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - int ret = NOTIFY_DONE; - struct notifier_block *nb, *next_nb; - - nb = rcu_dereference(*nl); - - while (nb && nr_to_call) { - next_nb = rcu_dereference(nb->next); - ret = nb->notifier_call(nb, val, v); - - if (nr_calls) - (*nr_calls)++; - - if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) - break; - nb = next_nb; - nr_to_call--; - } - return ret; -} - -/* - * Atomic notifier chain routines. Registration and unregistration - * use a spinlock, and call_chain is synchronized by RCU (no locks). - */ - -/** - * atomic_notifier_chain_register - Add notifier to an atomic notifier chain - * @nh: Pointer to head of the atomic notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to an atomic notifier chain. - * - * Currently always returns zero. - */ - -int atomic_notifier_chain_register(struct atomic_notifier_head *nh, - struct notifier_block *n) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&nh->lock, flags); - ret = notifier_chain_register(&nh->head, n); - spin_unlock_irqrestore(&nh->lock, flags); - return ret; -} - -EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); - -/** - * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain - * @nh: Pointer to head of the atomic notifier chain - * @n: Entry to remove from notifier chain - * - * Removes a notifier from an atomic notifier chain. - * - * Returns zero on success or %-ENOENT on failure. - */ -int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, - struct notifier_block *n) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&nh->lock, flags); - ret = notifier_chain_unregister(&nh->head, n); - spin_unlock_irqrestore(&nh->lock, flags); - synchronize_rcu(); - return ret; -} - -EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); - -/** - * __atomic_notifier_call_chain - Call functions in an atomic notifier chain - * @nh: Pointer to head of the atomic notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See the comment for notifier_call_chain. - * @nr_calls: See the comment for notifier_call_chain. - * - * Calls each function in a notifier chain in turn. The functions - * run in an atomic context, so they must not block. - * This routine uses RCU to synchronize with changes to the chain. - * - * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() - * will return immediately, with the return value of - * the notifier function which halted execution. - * Otherwise the return value is the return value - * of the last notifier function called. - */ - -int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - int ret; - - rcu_read_lock(); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); - rcu_read_unlock(); - return ret; -} - -EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); - -int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v) -{ - return __atomic_notifier_call_chain(nh, val, v, -1, NULL); -} - -EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); -/* - * Blocking notifier chain routines. All access to the chain is - * synchronized by an rwsem. - */ - -/** - * blocking_notifier_chain_register - Add notifier to a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a blocking notifier chain. - * Must be called in process context. - * - * Currently always returns zero. - */ - -int blocking_notifier_chain_register(struct blocking_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - /* - * This code gets used during boot-up, when task switching is - * not yet working and interrupts must remain disabled. At - * such times we must not call down_write(). - */ - if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_register(&nh->head, n); - - down_write(&nh->rwsem); - ret = notifier_chain_register(&nh->head, n); - up_write(&nh->rwsem); - return ret; -} - -EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); - -/** - * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: Entry to remove from notifier chain - * - * Removes a notifier from a blocking notifier chain. - * Must be called from process context. - * - * Returns zero on success or %-ENOENT on failure. - */ -int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - /* - * This code gets used during boot-up, when task switching is - * not yet working and interrupts must remain disabled. At - * such times we must not call down_write(). - */ - if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_unregister(&nh->head, n); - - down_write(&nh->rwsem); - ret = notifier_chain_unregister(&nh->head, n); - up_write(&nh->rwsem); - return ret; -} - -EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); - -/** - * __blocking_notifier_call_chain - Call functions in a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain. - * - * Calls each function in a notifier chain in turn. The functions - * run in a process context, so they are allowed to block. - * - * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() - * will return immediately, with the return value of - * the notifier function which halted execution. - * Otherwise the return value is the return value - * of the last notifier function called. - */ - -int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - int ret = NOTIFY_DONE; - - /* - * We check the head outside the lock, but if this access is - * racy then it does not matter what the result of the test - * is, we re-check the list after having taken the lock anyway: - */ - if (rcu_dereference(nh->head)) { - down_read(&nh->rwsem); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, - nr_calls); - up_read(&nh->rwsem); - } - return ret; -} -EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); - -int blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v) -{ - return __blocking_notifier_call_chain(nh, val, v, -1, NULL); -} -EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); - -/* - * Raw notifier chain routines. There is no protection; - * the caller must provide it. Use at your own risk! - */ - -/** - * raw_notifier_chain_register - Add notifier to a raw notifier chain - * @nh: Pointer to head of the raw notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a raw notifier chain. - * All locking must be provided by the caller. - * - * Currently always returns zero. - */ - -int raw_notifier_chain_register(struct raw_notifier_head *nh, - struct notifier_block *n) -{ - return notifier_chain_register(&nh->head, n); -} - -EXPORT_SYMBOL_GPL(raw_notifier_chain_register); - -/** - * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain - * @nh: Pointer to head of the raw notifier chain - * @n: Entry to remove from notifier chain - * - * Removes a notifier from a raw notifier chain. - * All locking must be provided by the caller. - * - * Returns zero on success or %-ENOENT on failure. - */ -int raw_notifier_chain_unregister(struct raw_notifier_head *nh, - struct notifier_block *n) -{ - return notifier_chain_unregister(&nh->head, n); -} - -EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); - -/** - * __raw_notifier_call_chain - Call functions in a raw notifier chain - * @nh: Pointer to head of the raw notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain - * - * Calls each function in a notifier chain in turn. The functions - * run in an undefined context. - * All locking must be provided by the caller. - * - * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() - * will return immediately, with the return value of - * the notifier function which halted execution. - * Otherwise the return value is the return value - * of the last notifier function called. - */ - -int __raw_notifier_call_chain(struct raw_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); -} - -EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); - -int raw_notifier_call_chain(struct raw_notifier_head *nh, - unsigned long val, void *v) -{ - return __raw_notifier_call_chain(nh, val, v, -1, NULL); -} - -EXPORT_SYMBOL_GPL(raw_notifier_call_chain); - -/* - * SRCU notifier chain routines. Registration and unregistration - * use a mutex, and call_chain is synchronized by SRCU (no locks). - */ - -/** - * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain - * @nh: Pointer to head of the SRCU notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to an SRCU notifier chain. - * Must be called in process context. - * - * Currently always returns zero. - */ - -int srcu_notifier_chain_register(struct srcu_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - /* - * This code gets used during boot-up, when task switching is - * not yet working and interrupts must remain disabled. At - * such times we must not call mutex_lock(). - */ - if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_register(&nh->head, n); - - mutex_lock(&nh->mutex); - ret = notifier_chain_register(&nh->head, n); - mutex_unlock(&nh->mutex); - return ret; -} - -EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); - -/** - * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain - * @nh: Pointer to head of the SRCU notifier chain - * @n: Entry to remove from notifier chain - * - * Removes a notifier from an SRCU notifier chain. - * Must be called from process context. - * - * Returns zero on success or %-ENOENT on failure. - */ -int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - /* - * This code gets used during boot-up, when task switching is - * not yet working and interrupts must remain disabled. At - * such times we must not call mutex_lock(). - */ - if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_unregister(&nh->head, n); - - mutex_lock(&nh->mutex); - ret = notifier_chain_unregister(&nh->head, n); - mutex_unlock(&nh->mutex); - synchronize_srcu(&nh->srcu); - return ret; -} - -EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); - -/** - * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain - * @nh: Pointer to head of the SRCU notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain - * - * Calls each function in a notifier chain in turn. The functions - * run in a process context, so they are allowed to block. - * - * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() - * will return immediately, with the return value of - * the notifier function which halted execution. - * Otherwise the return value is the return value - * of the last notifier function called. - */ - -int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - int ret; - int idx; - - idx = srcu_read_lock(&nh->srcu); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); - srcu_read_unlock(&nh->srcu, idx); - return ret; -} -EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); - -int srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v) -{ - return __srcu_notifier_call_chain(nh, val, v, -1, NULL); -} -EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); - -/** - * srcu_init_notifier_head - Initialize an SRCU notifier head - * @nh: Pointer to head of the srcu notifier chain - * - * Unlike other sorts of notifier heads, SRCU notifier heads require - * dynamic initialization. Be sure to call this routine before - * calling any of the other SRCU notifier routines for this head. - * - * If an SRCU notifier head is deallocated, it must first be cleaned - * up by calling srcu_cleanup_notifier_head(). Otherwise the head's - * per-cpu data (used by the SRCU mechanism) will leak. - */ - -void srcu_init_notifier_head(struct srcu_notifier_head *nh) -{ - mutex_init(&nh->mutex); - if (init_srcu_struct(&nh->srcu) < 0) - BUG(); - nh->head = NULL; -} - -EXPORT_SYMBOL_GPL(srcu_init_notifier_head); - -/** - * register_reboot_notifier - Register function to be called at reboot time - * @nb: Info about notifier function to be called - * - * Registers a function with the list of functions - * to be called at reboot time. - * - * Currently always returns zero, as blocking_notifier_chain_register() - * always returns zero. - */ - -int register_reboot_notifier(struct notifier_block * nb) -{ - return blocking_notifier_chain_register(&reboot_notifier_list, nb); -} - -EXPORT_SYMBOL(register_reboot_notifier); - -/** - * unregister_reboot_notifier - Unregister previously registered reboot notifier - * @nb: Hook to be unregistered - * - * Unregisters a previously registered reboot - * notifier function. - * - * Returns zero on success, or %-ENOENT on failure. - */ - -int unregister_reboot_notifier(struct notifier_block * nb) -{ - return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); -} - -EXPORT_SYMBOL(unregister_reboot_notifier); - static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; @@ -683,7 +152,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) switch (which) { case PRIO_PROCESS: if (who) - p = find_task_by_pid(who); + p = find_task_by_vpid(who); else p = current; if (p) @@ -691,7 +160,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) break; case PRIO_PGRP: if (who) - pgrp = find_pid(who); + pgrp = find_vpid(who); else pgrp = task_pgrp(current); do_each_pid_task(pgrp, PIDTYPE_PGID, p) { @@ -740,7 +209,7 @@ asmlinkage long sys_getpriority(int which, int who) switch (which) { case PRIO_PROCESS: if (who) - p = find_task_by_pid(who); + p = find_task_by_vpid(who); else p = current; if (p) { @@ -751,7 +220,7 @@ asmlinkage long sys_getpriority(int which, int who) break; case PRIO_PGRP: if (who) - pgrp = find_pid(who); + pgrp = find_vpid(who); else pgrp = task_pgrp(current); do_each_pid_task(pgrp, PIDTYPE_PGID, p) { @@ -1448,9 +917,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) struct task_struct *p; struct task_struct *group_leader = current->group_leader; int err = -EINVAL; + struct pid_namespace *ns; if (!pid) - pid = group_leader->pid; + pid = task_pid_vnr(group_leader); if (!pgid) pgid = pid; if (pgid < 0) @@ -1459,10 +929,12 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM */ + ns = current->nsproxy->pid_ns; + write_lock_irq(&tasklist_lock); err = -ESRCH; - p = find_task_by_pid(pid); + p = find_task_by_pid_ns(pid, ns); if (!p) goto out; @@ -1488,9 +960,9 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) goto out; if (pgid != pid) { - struct task_struct *g = - find_task_by_pid_type(PIDTYPE_PGID, pgid); + struct task_struct *g; + g = find_task_by_pid_type_ns(PIDTYPE_PGID, pgid, ns); if (!g || task_session(g) != task_session(group_leader)) goto out; } @@ -1499,10 +971,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (err) goto out; - if (process_group(p) != pgid) { + if (task_pgrp_nr_ns(p, ns) != pgid) { + struct pid *pid; + detach_pid(p, PIDTYPE_PGID); - p->signal->pgrp = pgid; - attach_pid(p, PIDTYPE_PGID, find_pid(pgid)); + pid = find_vpid(pgid); + attach_pid(p, PIDTYPE_PGID, pid); + set_task_pgrp(p, pid_nr(pid)); } err = 0; @@ -1515,19 +990,21 @@ out: asmlinkage long sys_getpgid(pid_t pid) { if (!pid) - return process_group(current); + return task_pgrp_vnr(current); else { int retval; struct task_struct *p; + struct pid_namespace *ns; - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + ns = current->nsproxy->pid_ns; + read_lock(&tasklist_lock); + p = find_task_by_pid_ns(pid, ns); retval = -ESRCH; if (p) { retval = security_task_getpgid(p); if (!retval) - retval = process_group(p); + retval = task_pgrp_nr_ns(p, ns); } read_unlock(&tasklist_lock); return retval; @@ -1539,7 +1016,7 @@ asmlinkage long sys_getpgid(pid_t pid) asmlinkage long sys_getpgrp(void) { /* SMP - assuming writes are word atomic this is fine */ - return process_group(current); + return task_pgrp_vnr(current); } #endif @@ -1547,19 +1024,21 @@ asmlinkage long sys_getpgrp(void) asmlinkage long sys_getsid(pid_t pid) { if (!pid) - return process_session(current); + return task_session_vnr(current); else { int retval; struct task_struct *p; + struct pid_namespace *ns; - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + ns = current->nsproxy->pid_ns; + read_lock(&tasklist_lock); + p = find_task_by_pid_ns(pid, ns); retval = -ESRCH; if (p) { retval = security_task_getsid(p); if (!retval) - retval = process_session(p); + retval = task_session_nr_ns(p, ns); } read_unlock(&tasklist_lock); return retval; @@ -1586,7 +1065,8 @@ asmlinkage long sys_setsid(void) * session id and so the check will always fail and make it so * init cannot successfully call setsid. */ - if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session)) + if (session > 1 && find_task_by_pid_type_ns(PIDTYPE_PGID, + session, &init_pid_ns)) goto out; group_leader->signal->leader = 1; @@ -1596,7 +1076,7 @@ asmlinkage long sys_setsid(void) group_leader->signal->tty = NULL; spin_unlock(&group_leader->sighand->siglock); - err = process_group(group_leader); + err = task_pgrp_vnr(group_leader); out: write_unlock_irq(&tasklist_lock); return err; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 067554bda8b7..3b4efbe26445 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1888,7 +1888,7 @@ int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp, return -EPERM; } - op = is_init(current) ? OP_SET : OP_AND; + op = is_global_init(current) ? OP_SET : OP_AND; return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, do_proc_dointvec_bset_conv,&op); } @@ -2278,7 +2278,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp pid_t tmp; int r; - tmp = pid_nr(cad_pid); + tmp = pid_nr_ns(cad_pid, current->nsproxy->pid_ns); r = __do_proc_dointvec(&tmp, table, write, filp, buffer, lenp, ppos, NULL, NULL); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 7d4d7f9c1bb2..9f360f68aad6 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -22,6 +22,10 @@ #include <linux/delayacct.h> #include <linux/cpumask.h> #include <linux/percpu.h> +#include <linux/cgroupstats.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/file.h> #include <net/genetlink.h> #include <asm/atomic.h> @@ -49,6 +53,11 @@ __read_mostly = { [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; +static struct nla_policy +cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = { + [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, +}; + struct listener { struct list_head list; pid_t pid; @@ -372,6 +381,51 @@ err: return NULL; } +static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + int rc = 0; + struct sk_buff *rep_skb; + struct cgroupstats *stats; + struct nlattr *na; + size_t size; + u32 fd; + struct file *file; + int fput_needed; + + na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; + if (!na) + return -EINVAL; + + fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); + file = fget_light(fd, &fput_needed); + if (file) { + size = nla_total_size(sizeof(struct cgroupstats)); + + rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, + size); + if (rc < 0) + goto err; + + na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, + sizeof(struct cgroupstats)); + stats = nla_data(na); + memset(stats, 0, sizeof(*stats)); + + rc = cgroupstats_build(stats, file->f_dentry); + if (rc < 0) + goto err; + + fput_light(file, fput_needed); + return send_reply(rep_skb, info->snd_pid); + } + +err: + if (file) + fput_light(file, fput_needed); + nlmsg_free(rep_skb); + return rc; +} + static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { int rc = 0; @@ -522,6 +576,12 @@ static struct genl_ops taskstats_ops = { .policy = taskstats_cmd_get_policy, }; +static struct genl_ops cgroupstats_ops = { + .cmd = CGROUPSTATS_CMD_GET, + .doit = cgroupstats_user_cmd, + .policy = cgroupstats_cmd_get_policy, +}; + /* Needed early in initialization */ void __init taskstats_init_early(void) { @@ -546,8 +606,15 @@ static int __init taskstats_init(void) if (rc < 0) goto err; + rc = genl_register_ops(&family, &cgroupstats_ops); + if (rc < 0) + goto err_cgroup_ops; + family_registered = 1; + printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); return 0; +err_cgroup_ops: + genl_unregister_ops(&family, &taskstats_ops); err: genl_unregister_family(&family); return rc; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 51b6a6a6158c..c8a9d13874df 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -207,15 +207,12 @@ static inline void clocksource_resume_watchdog(void) { } */ void clocksource_resume(void) { - struct list_head *tmp; + struct clocksource *cs; unsigned long flags; spin_lock_irqsave(&clocksource_lock, flags); - list_for_each(tmp, &clocksource_list) { - struct clocksource *cs; - - cs = list_entry(tmp, struct clocksource, list); + list_for_each_entry(cs, &clocksource_list, list) { if (cs->resume) cs->resume(); } @@ -369,7 +366,6 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, const char *buf, size_t count) { struct clocksource *ovr = NULL; - struct list_head *tmp; size_t ret = count; int len; @@ -389,12 +385,11 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, len = strlen(override_name); if (len) { + struct clocksource *cs; + ovr = clocksource_override; /* try to select it: */ - list_for_each(tmp, &clocksource_list) { - struct clocksource *cs; - - cs = list_entry(tmp, struct clocksource, list); + list_for_each_entry(cs, &clocksource_list, list) { if (strlen(cs->name) == len && !strcmp(cs->name, override_name)) ovr = cs; @@ -422,14 +417,11 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, static ssize_t sysfs_show_available_clocksources(struct sys_device *dev, char *buf) { - struct list_head *tmp; + struct clocksource *src; char *curr = buf; spin_lock_irq(&clocksource_lock); - list_for_each(tmp, &clocksource_list) { - struct clocksource *src; - - src = list_entry(tmp, struct clocksource, list); + list_for_each_entry(src, &clocksource_list, list) { curr += sprintf(curr, "%s ", src->name); } spin_unlock_irq(&clocksource_lock); diff --git a/kernel/timer.c b/kernel/timer.c index 8521d10fbb27..fb4e67d5dd60 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -26,6 +26,7 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/swap.h> +#include <linux/pid_namespace.h> #include <linux/notifier.h> #include <linux/thread_info.h> #include <linux/time.h> @@ -956,7 +957,7 @@ asmlinkage unsigned long sys_alarm(unsigned int seconds) */ asmlinkage long sys_getpid(void) { - return current->tgid; + return task_tgid_vnr(current); } /* @@ -970,7 +971,7 @@ asmlinkage long sys_getppid(void) int pid; rcu_read_lock(); - pid = rcu_dereference(current->real_parent)->tgid; + pid = task_ppid_nr_ns(current, current->nsproxy->pid_ns); rcu_read_unlock(); return pid; @@ -1102,7 +1103,7 @@ EXPORT_SYMBOL(schedule_timeout_uninterruptible); /* Thread ID - the internal kernel "pid" */ asmlinkage long sys_gettid(void) { - return current->pid; + return task_pid_vnr(current); } /** diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e080d1d744cc..52d5e7c9a8e6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -32,6 +32,7 @@ #include <linux/freezer.h> #include <linux/kallsyms.h> #include <linux/debug_locks.h> +#include <linux/lockdep.h> /* * The per-CPU workqueue (if single thread, we always use the first @@ -61,6 +62,9 @@ struct workqueue_struct { const char *name; int singlethread; int freezeable; /* Freeze threads during suspend */ +#ifdef CONFIG_LOCKDEP + struct lockdep_map lockdep_map; +#endif }; /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove @@ -250,6 +254,17 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) struct work_struct *work = list_entry(cwq->worklist.next, struct work_struct, entry); work_func_t f = work->func; +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the struct work_struct + * from inside the function that is called from it, + * this we need to take into account for lockdep too. + * To avoid bogus "held lock freed" warnings as well + * as problems when looking into work->lockdep_map, + * make a copy and use that here. + */ + struct lockdep_map lockdep_map = work->lockdep_map; +#endif cwq->current_work = work; list_del_init(cwq->worklist.next); @@ -257,13 +272,17 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) BUG_ON(get_wq_data(work) != cwq); work_clear_pending(work); + lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); + lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_); f(work); + lock_release(&lockdep_map, 1, _THIS_IP_); + lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " "%s/0x%08x/%d\n", current->comm, preempt_count(), - current->pid); + task_pid_nr(current)); printk(KERN_ERR " last function: "); print_symbol("%s\n", (unsigned long)f); debug_show_held_locks(current); @@ -376,6 +395,8 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) int cpu; might_sleep(); + lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); + lock_release(&wq->lockdep_map, 1, _THIS_IP_); for_each_cpu_mask(cpu, *cpu_map) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); } @@ -446,6 +467,9 @@ static void wait_on_work(struct work_struct *work) might_sleep(); + lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_); + lock_release(&work->lockdep_map, 1, _THIS_IP_); + cwq = get_wq_data(work); if (!cwq) return; @@ -695,8 +719,10 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) } } -struct workqueue_struct *__create_workqueue(const char *name, - int singlethread, int freezeable) +struct workqueue_struct *__create_workqueue_key(const char *name, + int singlethread, + int freezeable, + struct lock_class_key *key) { struct workqueue_struct *wq; struct cpu_workqueue_struct *cwq; @@ -713,6 +739,7 @@ struct workqueue_struct *__create_workqueue(const char *name, } wq->name = name; + lockdep_init_map(&wq->lockdep_map, name, key, 0); wq->singlethread = singlethread; wq->freezeable = freezeable; INIT_LIST_HEAD(&wq->list); @@ -741,7 +768,7 @@ struct workqueue_struct *__create_workqueue(const char *name, } return wq; } -EXPORT_SYMBOL_GPL(__create_workqueue); +EXPORT_SYMBOL_GPL(__create_workqueue_key); static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { @@ -752,6 +779,9 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) if (cwq->thread == NULL) return; + lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); + lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); + flush_cpu_workqueue(cwq); /* * If the caller is CPU_DEAD and cwq->worklist was not empty, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7d16e6433302..c567f219191d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -498,3 +498,5 @@ config FAULT_INJECTION_STACKTRACE_FILTER select FRAME_POINTER help Provide stacktrace filter for fault-injection capabilities + +source "samples/Kconfig" diff --git a/lib/Makefile b/lib/Makefile index c5f215d509d3..3a0983b77412 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -6,7 +6,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o \ idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ - proportions.o + proportions.o prio_heap.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/hweight.c b/lib/hweight.c index 360556a7803d..389424ecb129 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -1,6 +1,6 @@ #include <linux/module.h> +#include <linux/bitops.h> #include <asm/types.h> -#include <asm/bitops.h> /** * hweightN - returns the hamming weight of a N-bit word diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 9659eabffc31..393a0e915c23 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -124,12 +124,13 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb, mutex_lock(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; + unsigned long flags; - spin_lock(&fbc->lock); + spin_lock_irqsave(&fbc->lock, flags); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; - spin_unlock(&fbc->lock); + spin_unlock_irqrestore(&fbc->lock, flags); } mutex_unlock(&percpu_counters_lock); return NOTIFY_OK; diff --git a/lib/prio_heap.c b/lib/prio_heap.c new file mode 100644 index 000000000000..471944a54e23 --- /dev/null +++ b/lib/prio_heap.c @@ -0,0 +1,70 @@ +/* + * Simple insertion-only static-sized priority heap containing + * pointers, based on CLR, chapter 7 + */ + +#include <linux/slab.h> +#include <linux/prio_heap.h> + +int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask, + int (*gt)(void *, void *)) +{ + heap->ptrs = kmalloc(size, gfp_mask); + if (!heap->ptrs) + return -ENOMEM; + heap->size = 0; + heap->max = size / sizeof(void *); + heap->gt = gt; + return 0; +} + +void heap_free(struct ptr_heap *heap) +{ + kfree(heap->ptrs); +} + +void *heap_insert(struct ptr_heap *heap, void *p) +{ + void *res; + void **ptrs = heap->ptrs; + int pos; + + if (heap->size < heap->max) { + /* Heap insertion */ + int pos = heap->size++; + while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) { + ptrs[pos] = ptrs[(pos-1)/2]; + pos = (pos-1)/2; + } + ptrs[pos] = p; + return NULL; + } + + /* The heap is full, so something will have to be dropped */ + + /* If the new pointer is greater than the current max, drop it */ + if (heap->gt(p, ptrs[0])) + return p; + + /* Replace the current max and heapify */ + res = ptrs[0]; + ptrs[0] = p; + pos = 0; + + while (1) { + int left = 2 * pos + 1; + int right = 2 * pos + 2; + int largest = pos; + if (left < heap->size && heap->gt(ptrs[left], p)) + largest = left; + if (right < heap->size && heap->gt(ptrs[right], ptrs[largest])) + largest = right; + if (largest == pos) + break; + /* Push p down the heap one level and bump one up */ + ptrs[pos] = ptrs[largest]; + ptrs[largest] = p; + pos = largest; + } + return res; +} diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c index 479fd462eaa9..9c4b0256490b 100644 --- a/lib/spinlock_debug.c +++ b/lib/spinlock_debug.c @@ -60,12 +60,12 @@ static void spin_bug(spinlock_t *lock, const char *msg) owner = lock->owner; printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", msg, raw_smp_processor_id(), - current->comm, current->pid); + current->comm, task_pid_nr(current)); printk(KERN_EMERG " lock: %p, .magic: %08x, .owner: %s/%d, " ".owner_cpu: %d\n", lock, lock->magic, owner ? owner->comm : "<none>", - owner ? owner->pid : -1, + owner ? task_pid_nr(owner) : -1, lock->owner_cpu); dump_stack(); } @@ -116,7 +116,7 @@ static void __spin_lock_debug(spinlock_t *lock) printk(KERN_EMERG "BUG: spinlock lockup on CPU#%d, " "%s/%d, %p\n", raw_smp_processor_id(), current->comm, - current->pid, lock); + task_pid_nr(current), lock); dump_stack(); #ifdef CONFIG_SMP trigger_all_cpu_backtrace(); @@ -161,7 +161,7 @@ static void rwlock_bug(rwlock_t *lock, const char *msg) printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", msg, raw_smp_processor_id(), current->comm, - current->pid, lock); + task_pid_nr(current), lock); dump_stack(); } diff --git a/mm/filemap.c b/mm/filemap.c index 920366399eed..5209e47b7fe3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -841,7 +841,7 @@ static void shrink_readahead_size_eio(struct file *filp, /** * do_generic_mapping_read - generic file read routine * @mapping: address_space to be read - * @_ra: file's readahead state + * @ra: file's readahead state * @filp: the file to read * @ppos: current file position * @desc: read_descriptor diff --git a/mm/memory.c b/mm/memory.c index bd16dcaeefb8..142683df8755 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -259,9 +259,6 @@ void free_pgd_range(struct mmu_gather **tlb, continue; free_pud_range(*tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); - - if (!(*tlb)->fullmm) - flush_tlb_pgtables((*tlb)->mm, start, end); } void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 568152ae6caf..c1592a94582f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -78,6 +78,7 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/module.h> +#include <linux/nsproxy.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> @@ -940,7 +941,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, /* Find the mm_struct */ read_lock(&tasklist_lock); - task = pid ? find_task_by_pid(pid) : current; + task = pid ? find_task_by_vpid(pid) : current; if (!task) { read_unlock(&tasklist_lock); return -ESRCH; @@ -1388,7 +1389,6 @@ EXPORT_SYMBOL(alloc_pages_current); * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). */ -void *cpuset_being_rebound; /* Slow path of a mempolicy copy */ struct mempolicy *__mpol_copy(struct mempolicy *old) @@ -2019,4 +2019,3 @@ out: m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; return 0; } - diff --git a/mm/migrate.c b/mm/migrate.c index 06d0877a66ef..4d6ee03db946 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -19,6 +19,7 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/mm_inline.h> +#include <linux/nsproxy.h> #include <linux/pagevec.h> #include <linux/rmap.h> #include <linux/topology.h> @@ -924,7 +925,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, /* Find the mm_struct */ read_lock(&tasklist_lock); - task = pid ? find_task_by_pid(pid) : current; + task = pid ? find_task_by_vpid(pid) : current; if (!task) { read_unlock(&tasklist_lock); return -ESRCH; diff --git a/mm/mmap.c b/mm/mmap.c index 4275e81e25ba..7a30c4988231 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1048,8 +1048,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) /* The open routine did something to the protections already? */ if (pgprot_val(vma->vm_page_prot) != - pgprot_val(protection_map[vm_flags & - (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)])) + pgprot_val(vm_get_page_prot(vm_flags))) return 0; /* Specialty mapping? */ @@ -1130,8 +1129,7 @@ munmap_back: vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_flags = vm_flags; - vma->vm_page_prot = protection_map[vm_flags & - (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; + vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; if (file) { @@ -2002,8 +2000,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) vma->vm_end = addr + len; vma->vm_pgoff = pgoff; vma->vm_flags = flags; - vma->vm_page_prot = protection_map[flags & - (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; + vma->vm_page_prot = vm_get_page_prot(flags); vma_link(mm, vma, prev, rb_link, rb_parent); out: mm->total_vm += len >> PAGE_SHIFT; @@ -2209,7 +2206,7 @@ int install_special_mapping(struct mm_struct *mm, vma->vm_end = addr + len; vma->vm_flags = vm_flags | mm->def_flags; - vma->vm_page_prot = protection_map[vma->vm_flags & 7]; + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = &special_mapping_vmops; vma->vm_private_data = pages; diff --git a/mm/mprotect.c b/mm/mprotect.c index 1d4d69790e59..55227845abbe 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -192,11 +192,9 @@ success: * held in write mode. */ vma->vm_flags = newflags; - vma->vm_page_prot = protection_map[newflags & - (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; + vma->vm_page_prot = vm_get_page_prot(newflags); if (vma_wants_writenotify(vma)) { - vma->vm_page_prot = protection_map[newflags & - (VM_READ|VM_WRITE|VM_EXEC)]; + vma->vm_page_prot = vm_get_page_prot(newflags); dirty_accountable = 1; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a64decb5b13f..824cade07827 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -212,7 +212,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) if (!p->mm) continue; /* skip the init task */ - if (is_init(p)) + if (is_global_init(p)) continue; /* @@ -265,7 +265,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) */ static void __oom_kill_task(struct task_struct *p, int verbose) { - if (is_init(p)) { + if (is_global_init(p)) { WARN_ON(1); printk(KERN_WARNING "tried to kill init!\n"); return; @@ -278,7 +278,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) } if (verbose) - printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm); + printk(KERN_ERR "Killed process %d (%s)\n", + task_pid_nr(p), p->comm); /* * We give our sacrificial lamb high priority and access to @@ -326,7 +327,7 @@ static int oom_kill_task(struct task_struct *p) * to memory reserves though, otherwise we might deplete all memory. */ do_each_thread(g, q) { - if (q->mm == mm && q->tgid != p->tgid) + if (q->mm == mm && !same_thread_group(q, p)) force_sig(SIGKILL, q); } while_each_thread(g, q); @@ -337,7 +338,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned long points, const char *message) { struct task_struct *c; - struct list_head *tsk; if (printk_ratelimit()) { printk(KERN_WARNING "%s invoked oom-killer: " @@ -357,11 +357,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, } printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", - message, p->pid, p->comm, points); + message, task_pid_nr(p), p->comm, points); /* Try to kill a child first */ - list_for_each(tsk, &p->children) { - c = list_entry(tsk, struct task_struct, sibling); + list_for_each_entry(c, &p->children, sibling) { if (c->mm == p->mm) continue; if (!oom_kill_task(c)) diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index ff5784b440d7..66c736953cfe 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -656,11 +656,13 @@ static inline int hidp_setup_input(struct hidp_session *session, struct hidp_con } if (req->subclass & 0x80) { - input->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); - input->keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | BIT(BTN_RIGHT) | BIT(BTN_MIDDLE); - input->relbit[0] = BIT(REL_X) | BIT(REL_Y); - input->keybit[LONG(BTN_MOUSE)] |= BIT(BTN_SIDE) | BIT(BTN_EXTRA); - input->relbit[0] |= BIT(REL_WHEEL); + input->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + input->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_MIDDLE); + input->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); + input->keybit[BIT_WORD(BTN_MOUSE)] |= BIT_MASK(BTN_SIDE) | + BIT_MASK(BTN_EXTRA); + input->relbit[0] |= BIT_MASK(REL_WHEEL); } input->dev.parent = hidp_get_device(session); diff --git a/net/core/filter.c b/net/core/filter.c index 1f0068eae501..e0a06942c025 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -447,7 +447,8 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) rcu_assign_pointer(sk->sk_filter, fp); rcu_read_unlock_bh(); - sk_filter_delayed_uncharge(sk, old_fp); + if (old_fp) + sk_filter_delayed_uncharge(sk, old_fp); return 0; } diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 590a767b029c..daadbcc4e8dd 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -15,7 +15,7 @@ #include <asm/uaccess.h> #include <asm/system.h> -#include <asm/bitops.h> +#include <linux/bitops.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8cae60c53383..7ac703171ff3 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -161,7 +161,7 @@ #endif #include <asm/byteorder.h> #include <linux/rcupdate.h> -#include <asm/bitops.h> +#include <linux/bitops.h> #include <asm/io.h> #include <asm/dma.h> #include <asm/uaccess.h> @@ -3514,7 +3514,7 @@ static int pktgen_thread_worker(void *arg) init_waitqueue_head(&t->queue); - pr_debug("pktgen: starting pktgen/%d: pid=%d\n", cpu, current->pid); + pr_debug("pktgen: starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current)); set_current_state(TASK_INTERRUPTIBLE); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1072d16696c3..4a2640d38261 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -744,10 +744,10 @@ static struct net *get_net_ns_by_pid(pid_t pid) rcu_read_lock(); tsk = find_task_by_pid(pid); if (tsk) { - task_lock(tsk); - if (tsk->nsproxy) - net = get_net(tsk->nsproxy->net_ns); - task_unlock(tsk); + struct nsproxy *nsproxy; + nsproxy = task_nsproxy(tsk); + if (nsproxy) + net = get_net(nsproxy->net_ns); } rcu_read_unlock(); return net; diff --git a/net/core/scm.c b/net/core/scm.c index 530bee8d9ed9..100ba6d9d478 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -24,6 +24,8 @@ #include <linux/interrupt.h> #include <linux/netdevice.h> #include <linux/security.h> +#include <linux/pid.h> +#include <linux/nsproxy.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -42,7 +44,7 @@ static __inline__ int scm_check_creds(struct ucred *creds) { - if ((creds->pid == current->tgid || capable(CAP_SYS_ADMIN)) && + if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && ((creds->uid == current->uid || creds->uid == current->euid || creds->uid == current->suid) || capable(CAP_SETUID)) && ((creds->gid == current->gid || creds->gid == current->egid || diff --git a/net/core/sock.c b/net/core/sock.c index d292b4113d6e..febbcbcf8022 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -232,7 +232,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) warned++; printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) " "tries to set negative timeout\n", - current->comm, current->pid); + current->comm, task_pid_nr(current)); return 0; } *timeo_p = MAX_SCHEDULE_TIMEOUT; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 81a8285d6d6a..8d8c2915e064 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -54,7 +54,7 @@ #include <asm/uaccess.h> #include <asm/system.h> -#include <asm/bitops.h> +#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3cef12835c4b..8fb6ca23700a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -93,7 +93,7 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, int remaining, rover, low, high; inet_get_local_port_range(&low, &high); - remaining = high - low; + remaining = (high - low) + 1; rover = net_random() % remaining + low; do { diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fac6398e4367..16eecc7046a3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -286,7 +286,7 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, struct inet_timewait_sock *tw = NULL; inet_get_local_port_range(&low, &high); - remaining = high - low; + remaining = (high - low) + 1; local_bh_disable(); for (i = 1; i <= remaining; i++) { diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 1960747f354c..c99f2a33fb9e 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -794,7 +794,7 @@ static int sync_thread(void *startup) add_wait_queue(&sync_wait, &wait); - set_sync_pid(state, current->pid); + set_sync_pid(state, task_pid_nr(current)); complete(tinfo->startup); /* @@ -877,7 +877,7 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) if (!tinfo) return -ENOMEM; - IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); + IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n", sizeof(struct ip_vs_sync_conn)); @@ -917,7 +917,7 @@ int stop_sync_thread(int state) (state == IP_VS_STATE_BACKUP && !sync_backup_pid)) return -ESRCH; - IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); + IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, task_pid_nr(current)); IP_VS_INFO("stopping sync thread %d ...\n", (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index c78acc1a7f11..ffddd2b45352 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -122,7 +122,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); if (write && ret == 0) { - if (range[1] <= range[0]) + if (range[1] < range[0]) ret = -EINVAL; else set_local_port_range(range); @@ -150,7 +150,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen); if (ret == 0 && newval && newlen) { - if (range[1] <= range[0]) + if (range[1] < range[0]) ret = -EINVAL; else set_local_port_range(range); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4f322003835d..2e6ad6dbba6c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1334,7 +1334,7 @@ do_prequeue: if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { if (net_ratelimit()) printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", - current->comm, current->pid); + current->comm, task_pid_nr(current)); peek_seq = tp->copied_seq; } continue; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cb9fc58efb2f..35d2b0e9e10b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -147,13 +147,14 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, write_lock_bh(&udp_hash_lock); if (!snum) { - int i, low, high; + int i, low, high, remaining; unsigned rover, best, best_size_so_far; inet_get_local_port_range(&low, &high); + remaining = (high - low) + 1; best_size_so_far = UINT_MAX; - best = rover = net_random() % (high - low) + low; + best = rover = net_random() % remaining + low; /* 1st pass: look for empty (or shortest) hash chain */ for (i = 0; i < UDP_HTABLE_SIZE; i++) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 1c2c27655435..d6f1026f1943 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -261,7 +261,7 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct inet_timewait_sock *tw = NULL; inet_get_local_port_range(&low, &high); - remaining = high - low; + remaining = (high - low) + 1; local_bh_disable(); for (i = 1; i <= remaining; i++) { diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 49eacba824df..46cf962f7f88 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -762,7 +762,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, if (net_ratelimit()) printk(KERN_DEBUG "LLC(%s:%d): Application " "bug, race in MSG_PEEK.\n", - current->comm, current->pid); + current->comm, task_pid_nr(current)); peek_seq = llc->copied_seq; } continue; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index d34a9deca67a..4b4ed2a5803c 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -37,8 +37,6 @@ struct ieee80211_local; -#define BIT(x) (1 << (x)) - #define IEEE80211_ALIGN32_PAD(a) ((4 - ((a) & 3)) & 3) /* Maximum number of broadcast/multicast frames to buffer when some of the diff --git a/net/mac80211/ieee80211_sta.c b/net/mac80211/ieee80211_sta.c index db81aef6177a..f7ffeec3913f 100644 --- a/net/mac80211/ieee80211_sta.c +++ b/net/mac80211/ieee80211_sta.c @@ -108,14 +108,11 @@ struct ieee802_11_elems { u8 wmm_param_len; }; -enum ParseRes { ParseOK = 0, ParseUnknown = 1, ParseFailed = -1 }; - -static enum ParseRes ieee802_11_parse_elems(u8 *start, size_t len, - struct ieee802_11_elems *elems) +static void ieee802_11_parse_elems(u8 *start, size_t len, + struct ieee802_11_elems *elems) { size_t left = len; u8 *pos = start; - int unknown = 0; memset(elems, 0, sizeof(*elems)); @@ -126,15 +123,8 @@ static enum ParseRes ieee802_11_parse_elems(u8 *start, size_t len, elen = *pos++; left -= 2; - if (elen > left) { -#if 0 - if (net_ratelimit()) - printk(KERN_DEBUG "IEEE 802.11 element parse " - "failed (id=%d elen=%d left=%d)\n", - id, elen, left); -#endif - return ParseFailed; - } + if (elen > left) + return; switch (id) { case WLAN_EID_SSID: @@ -201,28 +191,15 @@ static enum ParseRes ieee802_11_parse_elems(u8 *start, size_t len, elems->ext_supp_rates_len = elen; break; default: -#if 0 - printk(KERN_DEBUG "IEEE 802.11 element parse ignored " - "unknown element (id=%d elen=%d)\n", - id, elen); -#endif - unknown++; break; } left -= elen; pos += elen; } - - /* Do not trigger error if left == 1 as Apple Airport base stations - * send AssocResps that are one spurious byte too long. */ - - return unknown ? ParseUnknown : ParseOK; } - - static int ecw2cw(int ecw) { int cw = 1; @@ -931,12 +908,7 @@ static void ieee80211_auth_challenge(struct net_device *dev, printk(KERN_DEBUG "%s: replying to auth challenge\n", dev->name); pos = mgmt->u.auth.variable; - if (ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems) - == ParseFailed) { - printk(KERN_DEBUG "%s: failed to parse Auth(challenge)\n", - dev->name); - return; - } + ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); if (!elems.challenge) { printk(KERN_DEBUG "%s: no challenge IE in shared key auth " "frame\n", dev->name); @@ -1230,12 +1202,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct net_device *dev, aid &= ~(BIT(15) | BIT(14)); pos = mgmt->u.assoc_resp.variable; - if (ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems) - == ParseFailed) { - printk(KERN_DEBUG "%s: failed to parse AssocResp\n", - dev->name); - return; - } + ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); if (!elems.supp_rates) { printk(KERN_DEBUG "%s: no SuppRates element in AssocResp\n", @@ -1459,7 +1426,7 @@ static void ieee80211_rx_bss_info(struct net_device *dev, struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee802_11_elems elems; size_t baselen; - int channel, invalid = 0, clen; + int channel, clen; struct ieee80211_sta_bss *bss; struct sta_info *sta; struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -1505,9 +1472,7 @@ static void ieee80211_rx_bss_info(struct net_device *dev, #endif /* CONFIG_MAC80211_IBSS_DEBUG */ } - if (ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, - &elems) == ParseFailed) - invalid = 1; + ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems); if (sdata->type == IEEE80211_IF_TYPE_IBSS && elems.supp_rates && memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0 && @@ -1724,9 +1689,7 @@ static void ieee80211_rx_mgmt_beacon(struct net_device *dev, if (baselen > len) return; - if (ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, - &elems) == ParseFailed) - return; + ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems); if (elems.erp_info && elems.erp_info_len >= 1) ieee80211_handle_erp_ie(dev, elems.erp_info[0]); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index af79423bc8e8..9ec50139b9a1 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -2,13 +2,13 @@ * GPL (C) 2002 Martin Devera (devik@cdi.cz). */ #include <linux/module.h> +#include <linux/bitops.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connbytes.h> #include <net/netfilter/nf_conntrack.h> #include <asm/div64.h> -#include <asm/bitops.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e11000a8e950..d0936506b731 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1623,11 +1623,6 @@ static struct vm_operations_struct packet_mmap_ops = { .close =packet_mm_close, }; -static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order) -{ - return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1); -} - static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len) { int i; diff --git a/net/rfkill/rfkill-input.c b/net/rfkill/rfkill-input.c index eaabf087c59b..d1e9d68f8ba0 100644 --- a/net/rfkill/rfkill-input.c +++ b/net/rfkill/rfkill-input.c @@ -146,18 +146,18 @@ static void rfkill_disconnect(struct input_handle *handle) static const struct input_device_id rfkill_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT(EV_KEY) }, - .keybit = { [LONG(KEY_WLAN)] = BIT(KEY_WLAN) }, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_WLAN)] = BIT_MASK(KEY_WLAN) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT(EV_KEY) }, - .keybit = { [LONG(KEY_BLUETOOTH)] = BIT(KEY_BLUETOOTH) }, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_BLUETOOTH)] = BIT_MASK(KEY_BLUETOOTH) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT(EV_KEY) }, - .keybit = { [LONG(KEY_UWB)] = BIT(KEY_UWB) }, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_UWB)] = BIT_MASK(KEY_UWB) }, }, { } }; diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 92435a882fac..9c15c4888d12 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -2,9 +2,7 @@ # Traffic control configuration. # -menu "QoS and/or fair queueing" - -config NET_SCHED +menuconfig NET_SCHED bool "QoS and/or fair queueing" select NET_SCH_FIFO ---help--- @@ -41,9 +39,6 @@ config NET_SCHED The available schedulers are listed in the following questions; you can say Y to as many as you like. If unsure, say N now. -config NET_SCH_FIFO - bool - if NET_SCHED comment "Queueing/Scheduling" @@ -500,4 +495,5 @@ config NET_CLS_IND endif # NET_SCHED -endmenu +config NET_SCH_FIFO + bool diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index e01d57692c9a..fa1a6f45dc41 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -556,6 +556,7 @@ void dev_deactivate(struct net_device *dev) { struct Qdisc *qdisc; struct sk_buff *skb; + int running; spin_lock_bh(&dev->queue_lock); qdisc = dev->qdisc; @@ -571,12 +572,31 @@ void dev_deactivate(struct net_device *dev) dev_watchdog_down(dev); - /* Wait for outstanding dev_queue_xmit calls. */ + /* Wait for outstanding qdisc-less dev_queue_xmit calls. */ synchronize_rcu(); /* Wait for outstanding qdisc_run calls. */ - while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state)) - yield(); + do { + while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state)) + yield(); + + /* + * Double-check inside queue lock to ensure that all effects + * of the queue run are visible when we return. + */ + spin_lock_bh(&dev->queue_lock); + running = test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state); + spin_unlock_bh(&dev->queue_lock); + + /* + * The running flag should never be set at this point because + * we've already set dev->qdisc to noop_qdisc *inside* the same + * pair of spin locks. That is, if any qdisc_run starts after + * our initial test it should see the noop_qdisc and then + * clear the RUNNING bit before dropping the queue lock. So + * if it is set here then we've found a bug. + */ + } while (WARN_ON_ONCE(running)); } void dev_init_scheduler(struct net_device *dev) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 3c773c53e12e..c98873f39aec 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -847,7 +847,7 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, cons task->tk_start = jiffies; dprintk("RPC: new task initialized, procpid %u\n", - current->pid); + task_pid_nr(current)); } static struct rpc_task * diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6996cba5aa96..9163ec526c2a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -483,7 +483,7 @@ static int unix_listen(struct socket *sock, int backlog) sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; /* set credentials so connect can copy them */ - sk->sk_peercred.pid = current->tgid; + sk->sk_peercred.pid = task_tgid_vnr(current); sk->sk_peercred.uid = current->euid; sk->sk_peercred.gid = current->egid; err = 0; @@ -1133,7 +1133,7 @@ restart: unix_peer(newsk) = sk; newsk->sk_state = TCP_ESTABLISHED; newsk->sk_type = sk->sk_type; - newsk->sk_peercred.pid = current->tgid; + newsk->sk_peercred.pid = task_tgid_vnr(current); newsk->sk_peercred.uid = current->euid; newsk->sk_peercred.gid = current->egid; newu = unix_sk(newsk); @@ -1194,7 +1194,7 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) sock_hold(skb); unix_peer(ska)=skb; unix_peer(skb)=ska; - ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid; + ska->sk_peercred.pid = skb->sk_peercred.pid = task_tgid_vnr(current); ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid; ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid; diff --git a/samples/Kconfig b/samples/Kconfig new file mode 100644 index 000000000000..57bb2236952c --- /dev/null +++ b/samples/Kconfig @@ -0,0 +1,16 @@ +# samples/Kconfig + +menuconfig SAMPLES + bool "Sample kernel code" + help + You can build and test sample kernel code here. + +if SAMPLES + +config SAMPLE_MARKERS + tristate "Build markers examples -- loadable modules only" + depends on MARKERS && m + help + This build markers example modules. + +endif # SAMPLES diff --git a/samples/Makefile b/samples/Makefile new file mode 100644 index 000000000000..5a4f0b6bcbed --- /dev/null +++ b/samples/Makefile @@ -0,0 +1,3 @@ +# Makefile for Linux samples code + +obj-$(CONFIG_SAMPLES) += markers/ diff --git a/samples/markers/Makefile b/samples/markers/Makefile new file mode 100644 index 000000000000..6d7231265f0f --- /dev/null +++ b/samples/markers/Makefile @@ -0,0 +1,4 @@ +# builds the kprobes example kernel modules; +# then to use one (as root): insmod <module_name.ko> + +obj-$(CONFIG_SAMPLE_MARKERS) += probe-example.o marker-example.o diff --git a/samples/markers/marker-example.c b/samples/markers/marker-example.c new file mode 100644 index 000000000000..e787c6d16dd7 --- /dev/null +++ b/samples/markers/marker-example.c @@ -0,0 +1,54 @@ +/* marker-example.c + * + * Executes a marker when /proc/marker-example is opened. + * + * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#include <linux/module.h> +#include <linux/marker.h> +#include <linux/sched.h> +#include <linux/proc_fs.h> + +struct proc_dir_entry *pentry_example; + +static int my_open(struct inode *inode, struct file *file) +{ + int i; + + trace_mark(subsystem_event, "%d %s", 123, "example string"); + for (i = 0; i < 10; i++) + trace_mark(subsystem_eventb, MARK_NOARGS); + return -EPERM; +} + +static struct file_operations mark_ops = { + .open = my_open, +}; + +static int example_init(void) +{ + printk(KERN_ALERT "example init\n"); + pentry_example = create_proc_entry("marker-example", 0444, NULL); + if (pentry_example) + pentry_example->proc_fops = &mark_ops; + else + return -EPERM; + return 0; +} + +static void example_exit(void) +{ + printk(KERN_ALERT "example exit\n"); + remove_proc_entry("marker-example", NULL); +} + +module_init(example_init) +module_exit(example_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mathieu Desnoyers"); +MODULE_DESCRIPTION("Marker example"); diff --git a/samples/markers/probe-example.c b/samples/markers/probe-example.c new file mode 100644 index 000000000000..238b2e384fc8 --- /dev/null +++ b/samples/markers/probe-example.c @@ -0,0 +1,98 @@ +/* probe-example.c + * + * Connects two functions to marker call sites. + * + * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/marker.h> +#include <asm/atomic.h> + +struct probe_data { + const char *name; + const char *format; + marker_probe_func *probe_func; +}; + +void probe_subsystem_event(const struct marker *mdata, void *private, + const char *format, ...) +{ + va_list ap; + /* Declare args */ + unsigned int value; + const char *mystr; + + /* Assign args */ + va_start(ap, format); + value = va_arg(ap, typeof(value)); + mystr = va_arg(ap, typeof(mystr)); + + /* Call printk */ + printk(KERN_DEBUG "Value %u, string %s\n", value, mystr); + + /* or count, check rights, serialize data in a buffer */ + + va_end(ap); +} + +atomic_t eventb_count = ATOMIC_INIT(0); + +void probe_subsystem_eventb(const struct marker *mdata, void *private, + const char *format, ...) +{ + /* Increment counter */ + atomic_inc(&eventb_count); +} + +static struct probe_data probe_array[] = +{ + { .name = "subsystem_event", + .format = "%d %s", + .probe_func = probe_subsystem_event }, + { .name = "subsystem_eventb", + .format = MARK_NOARGS, + .probe_func = probe_subsystem_eventb }, +}; + +static int __init probe_init(void) +{ + int result; + int i; + + for (i = 0; i < ARRAY_SIZE(probe_array); i++) { + result = marker_probe_register(probe_array[i].name, + probe_array[i].format, + probe_array[i].probe_func, &probe_array[i]); + if (result) + printk(KERN_INFO "Unable to register probe %s\n", + probe_array[i].name); + result = marker_arm(probe_array[i].name); + if (result) + printk(KERN_INFO "Unable to arm probe %s\n", + probe_array[i].name); + } + return 0; +} + +static void __exit probe_fini(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(probe_array); i++) + marker_probe_unregister(probe_array[i].name); + printk(KERN_INFO "Number of event b : %u\n", + atomic_read(&eventb_count)); +} + +module_init(probe_init); +module_exit(probe_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mathieu Desnoyers"); +MODULE_DESCRIPTION("SUBSYSTEM Probe"); diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index b458e2acb4ac..28e480c8100f 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -15,7 +15,7 @@ # AVR32 port by Haavard Skinnemoen <hskinnemoen@atmel.com> # # Usage: -# objdump -d vmlinux | stackcheck.pl [arch] +# objdump -d vmlinux | scripts/checkstack.pl [arch] # # TODO : Port to all architectures (one regex per arch) diff --git a/security/commoncap.c b/security/commoncap.c index 48ca5b092768..43f902750a1b 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -23,6 +23,7 @@ #include <linux/xattr.h> #include <linux/hugetlb.h> #include <linux/mount.h> +#include <linux/sched.h> #ifdef CONFIG_SECURITY_FILE_CAPABILITIES /* @@ -334,7 +335,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) /* For init, we want to retain the capabilities set * in the init_task struct. Thus we skip the usual * capability rules */ - if (!is_init(current)) { + if (!is_global_init(current)) { current->cap_permitted = new_permitted; current->cap_effective = bprm->cap_effective ? new_permitted : 0; diff --git a/sound/ppc/beep.c b/sound/ppc/beep.c index a1aa89f2faf3..566b5ab9d4e8 100644 --- a/sound/ppc/beep.c +++ b/sound/ppc/beep.c @@ -236,8 +236,8 @@ int __init snd_pmac_attach_beep(struct snd_pmac *chip) input_dev->id.product = 0x0001; input_dev->id.version = 0x0100; - input_dev->evbit[0] = BIT(EV_SND); - input_dev->sndbit[0] = BIT(SND_BELL) | BIT(SND_TONE); + input_dev->evbit[0] = BIT_MASK(EV_SND); + input_dev->sndbit[0] = BIT_MASK(SND_BELL) | BIT_MASK(SND_TONE); input_dev->event = snd_pmac_beep_event; input_dev->dev.parent = &chip->pdev->dev; input_set_drvdata(input_dev, chip); diff --git a/sound/usb/caiaq/caiaq-input.c b/sound/usb/caiaq/caiaq-input.c index a1de0c608957..cd536ca20e56 100644 --- a/sound/usb/caiaq/caiaq-input.c +++ b/sound/usb/caiaq/caiaq-input.c @@ -200,8 +200,9 @@ int snd_usb_caiaq_input_init(struct snd_usb_caiaqdev *dev) switch (dev->chip.usb_id) { case USB_ID(USB_VID_NATIVEINSTRUMENTS, USB_PID_RIGKONTROL2): - input->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input->absbit[0] = BIT(ABS_X) | BIT(ABS_Y) | BIT(ABS_Z); + input->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input->absbit[0] = BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) | + BIT_MASK(ABS_Z); input->keycode = keycode_rk2; input->keycodesize = sizeof(char); input->keycodemax = ARRAY_SIZE(keycode_rk2); @@ -228,8 +229,8 @@ int snd_usb_caiaq_input_init(struct snd_usb_caiaqdev *dev) snd_usb_caiaq_set_auto_msg(dev, 1, 10, 0); break; case USB_ID(USB_VID_NATIVEINSTRUMENTS, USB_PID_AK1): - input->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS); - input->absbit[0] = BIT(ABS_X); + input->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); + input->absbit[0] = BIT_MASK(ABS_X); input->keycode = keycode_ak1; input->keycodesize = sizeof(char); input->keycodemax = ARRAY_SIZE(keycode_ak1); |