diff --git a/Documentation/hwlat_detector.txt b/Documentation/hwlat_detector.txt new file mode 100644 index 0000000..cb61516 --- /dev/null +++ b/Documentation/hwlat_detector.txt @@ -0,0 +1,64 @@ +Introduction: +------------- + +The module hwlat_detector is a special purpose kernel module that is used to +detect large system latencies induced by the behavior of certain underlying +hardware or firmware, independent of Linux itself. The code was developed +originally to detect SMIs (System Management Interrupts) on x86 systems, +however there is nothing x86 specific about this patchset. It was +originally written for use by the "RT" patch since the Real Time +kernel is highly latency sensitive. + +SMIs are usually not serviced by the Linux kernel, which typically does not +even know that they are occurring. SMIs are instead set up by BIOS code +and are serviced by BIOS code, usually for "critical" events such as +management of thermal sensors and fans. Sometimes though, SMIs are used for +other tasks and those tasks can spend an inordinate amount of time in the +handler (sometimes measured in milliseconds). Obviously this is a problem if +you are trying to keep event service latencies down in the microsecond range. + +The hardware latency detector works by hogging all of the cpus for configurable +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter +for some period, then looking for gaps in the TSC data. Any gap indicates a +time when the polling was interrupted and since the machine is stopped and +interrupts turned off the only thing that could do that would be an SMI. + +Note that the SMI detector should *NEVER* be used in a production environment. +It is intended to be run manually to determine if the hardware platform has a +problem with long system firmware service routines. 
+ +Usage: +------ + +Loading the module hwlat_detector passing the parameter "enabled=1" (or by +setting the "enable" entry in "hwlat_detector" debugfs toggled on) is the only +step required to start the hwlat_detector. It is possible to redefine the +threshold in microseconds (us) above which latency spikes will be taken +into account (parameter "threshold="). + +Example: + + # modprobe hwlat_detector enabled=1 threshold=100 + +After the module is loaded, it creates a directory named "hwlat_detector" under +the debugfs mountpoint, "/debug/hwlat_detector" for this text. It is necessary +to have debugfs mounted, which might be on /sys/debug on your system. + +The /debug/hwlat_detector interface contains the following files: + +count - number of latency spikes observed since last reset +enable - a global enable/disable toggle (0/1), resets count +max - maximum hardware latency actually observed (usecs) +sample - a pipe from which to read current raw sample data + in the format <timestamp> <latency observed usecs> + (can be opened O_NONBLOCK for a single sample) +threshold - minimum latency value to be considered (usecs) +width - time period to sample with CPUs held (usecs) + must be less than the total window size (enforced) +window - total period of sampling, width being inside (usecs) + +By default we will set width to 500,000 and window to 1,000,000, meaning that +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we +observe any latencies that exceed the threshold (initially 100 usecs), +then we write to a global sample ring buffer of 8K samples, which is +consumed by reading from the "sample" (pipe) debugfs file interface. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7936b80..8e91863 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2480,6 +2480,11 @@ and is between 256 and 4096 characters. 
It is defined in the file trace_buf_size=nn[KMG] [FTRACE] will set tracing buffer size. + trace_event=[event-list] + [FTRACE] Set and start specified trace events in order + to facilitate early boot debugging. + See also Documentation/trace/events.txt + trix= [HW,OSS] MediaTrix AudioTrix Pro Format: <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq> diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index f157d75..6e5f35e 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -1,7 +1,7 @@ Event Tracing Documentation written by Theodore Ts'o - Updated by Li Zefan + Updated by Li Zefan and Tom Zanussi 1. Introduction =============== @@ -83,8 +83,199 @@ When reading one of these enable files, there are four results: X - there is a mixture of events enabled and disabled ? - this file does not affect any event +2.3 Boot option +--------------- + +In order to facilitate early boot debugging, use boot option: + + trace_event=[event-list] + +The format of this boot option is the same as described in section 2.1. + 3. Defining an event-enabled tracepoint ======================================= See The example provided in samples/trace_events +4. Event formats +================ + +Each trace event has a 'format' file associated with it that contains +a description of each field in a logged event. This information can +be used to parse the binary trace stream, and is also the place to +find the field names that can be used in event filters (see section 5). + +It also displays the format string that will be used to print the +event in text mode, along with the event name and ID used for +profiling. + +Every event has a set of 'common' fields associated with it; these are +the fields prefixed with 'common_'. The other fields vary between +events and correspond to the fields defined in the TRACE_EVENT +definition for that event. 
+ +Each field in the format has the form: + + field:field-type field-name; offset:N; size:N; + +where offset is the offset of the field in the trace record and size +is the size of the data item, in bytes. + +For example, here's the information displayed for the 'sched_wakeup' +event: + +# cat /debug/tracing/events/sched/sched_wakeup/format + +name: sched_wakeup +ID: 60 +format: + field:unsigned short common_type; offset:0; size:2; + field:unsigned char common_flags; offset:2; size:1; + field:unsigned char common_preempt_count; offset:3; size:1; + field:int common_pid; offset:4; size:4; + field:int common_tgid; offset:8; size:4; + + field:char comm[TASK_COMM_LEN]; offset:12; size:16; + field:pid_t pid; offset:28; size:4; + field:int prio; offset:32; size:4; + field:int success; offset:36; size:4; + field:int cpu; offset:40; size:4; + +print fmt: "task %s:%d [%d] success=%d [%03d]", REC->comm, REC->pid, + REC->prio, REC->success, REC->cpu + +This event contains 10 fields, the first 5 common and the remaining 5 +event-specific. All the fields for this event are numeric, except for +'comm' which is a string, a distinction important for event filtering. + +5. Event filtering +================== + +Trace events can be filtered in the kernel by associating boolean +'filter expressions' with them. As soon as an event is logged into +the trace buffer, its fields are checked against the filter expression +associated with that event type. An event with field values that +'match' the filter will appear in the trace output, and an event whose +values don't match will be discarded. An event with no filter +associated with it matches everything, and is the default when no +filter has been set for an event. + +5.1 Expression syntax +--------------------- + +A filter expression consists of one or more 'predicates' that can be +combined using the logical operators '&&' and '||'. 
A predicate is +simply a clause that compares the value of a field contained within a +logged event with a constant value and returns either 0 or 1 depending +on whether the field value matched (1) or didn't match (0): + + field-name relational-operator value + +Parentheses can be used to provide arbitrary logical groupings and +double-quotes can be used to prevent the shell from interpreting +operators as shell metacharacters. + +The field-names available for use in filters can be found in the +'format' files for trace events (see section 4). + +The relational-operators depend on the type of the field being tested: + +The operators available for numeric fields are: + +==, !=, <, <=, >, >= + +And for string fields they are: + +==, != + +Currently, only exact string matches are supported. + +Currently, the maximum number of predicates in a filter is 16. + +5.2 Setting filters +------------------- + +A filter for an individual event is set by writing a filter expression +to the 'filter' file for the given event. + +For example: + +# cd /debug/tracing/events/sched/sched_wakeup +# echo "common_preempt_count > 4" > filter + +A slightly more involved example: + +# cd /debug/tracing/events/sched/sched_signal_send +# echo "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter + +If there is an error in the expression, you'll get an 'Invalid +argument' error when setting it, and the erroneous string along with +an error message can be seen by looking at the filter e.g.: + +# cd /debug/tracing/events/sched/sched_signal_send +# echo "((sig >= 10 && sig < 15) || dsig == 17) && comm != bash" > filter +-bash: echo: write error: Invalid argument +# cat filter +((sig >= 10 && sig < 15) || dsig == 17) && comm != bash +^ +parse_error: Field not found + +Currently the caret ('^') for an error always appears at the beginning of +the filter string; the error message should still be useful though +even without more accurate position info. 
+ +5.3 Clearing filters +-------------------- + +To clear the filter for an event, write a '0' to the event's filter +file. + +To clear the filters for all events in a subsystem, write a '0' to the +subsystem's filter file. + +5.4 Subsystem filters +--------------------- + +For convenience, filters for every event in a subsystem can be set or +cleared as a group by writing a filter expression into the filter file +at the root of the subsystem. Note however, that if a filter for any +event within the subsystem lacks a field specified in the subsystem +filter, or if the filter can't be applied for any other reason, the +filter for that event will retain its previous setting. This can +result in an unintended mixture of filters which could lead to +confusing (to the user who might think different filters are in +effect) trace output. Only filters that reference just the common +fields can be guaranteed to propagate successfully to all events. + +Here are a few subsystem filter examples that also illustrate the +above points: + +Clear the filters on all events in the sched subsystem: + +# cd /sys/kernel/debug/tracing/events/sched +# echo 0 > filter +# cat sched_switch/filter +none +# cat sched_wakeup/filter +none + +Set a filter using only common fields for all events in the sched +subsystem (all events end up with the same filter): + +# cd /sys/kernel/debug/tracing/events/sched +# echo common_pid == 0 > filter +# cat sched_switch/filter +common_pid == 0 +# cat sched_wakeup/filter +common_pid == 0 + +Attempt to set a filter using a non-common field for all events in the +sched subsystem (all events but those that have a prev_pid field retain +their old filters): + +# cd /sys/kernel/debug/tracing/events/sched +# echo prev_pid == 0 > filter +# cat sched_switch/filter +prev_pid == 0 +# cat sched_wakeup/filter +common_pid == 0 diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index a39b3c7..355d0f1 100644 --- a/Documentation/trace/ftrace.txt +++ 
b/Documentation/trace/ftrace.txt @@ -85,26 +85,19 @@ of ftrace. Here is a list of some of the key files: This file holds the output of the trace in a human readable format (described below). - latency_trace: - - This file shows the same trace but the information - is organized more to display possible latencies - in the system (described below). - trace_pipe: The output is the same as the "trace" file but this file is meant to be streamed with live tracing. - Reads from this file will block until new data - is retrieved. Unlike the "trace" and "latency_trace" - files, this file is a consumer. This means reading - from this file causes sequential reads to display - more current data. Once data is read from this - file, it is consumed, and will not be read - again with a sequential read. The "trace" and - "latency_trace" files are static, and if the - tracer is not adding more data, they will display - the same information every time they are read. + Reads from this file will block until new data is + retrieved. Unlike the "trace" file, this file is a + consumer. This means reading from this file causes + sequential reads to display more current data. Once + data is read from this file, it is consumed, and + will not be read again with a sequential read. The + "trace" file is static, and if the tracer is not + adding more data, it will display the same + information every time it is read. trace_options: @@ -117,10 +110,10 @@ of ftrace. Here is a list of some of the key files: Some of the tracers record the max latency. For example, the time interrupts are disabled. This time is saved in this file. The max trace - will also be stored, and displayed by either - "trace" or "latency_trace". A new max trace will - only be recorded if the latency is greater than - the value in this file. (in microseconds) + will also be stored, and displayed by "trace". + A new max trace will only be recorded if the + latency is greater than the value in this + file. 
(in microseconds) buffer_size_kb: @@ -210,7 +203,7 @@ Here is the list of current tracers that may be configured. the trace with the longest max latency. See tracing_max_latency. When a new max is recorded, it replaces the old trace. It is best to view this - trace via the latency_trace file. + trace with the latency-format option enabled. "preemptoff" @@ -307,8 +300,8 @@ the lowest priority thread (pid 0). Latency trace format -------------------- -For traces that display latency times, the latency_trace file -gives somewhat more information to see why a latency happened. +When the latency-format option is enabled, the trace file gives +somewhat more information to see why a latency happened. Here is a typical trace. # tracer: irqsoff @@ -380,9 +373,10 @@ explains which is which. The above is mostly meaningful for kernel developers. - time: This differs from the trace file output. The trace file output - includes an absolute timestamp. The timestamp used by the - latency_trace file is relative to the start of the trace. + time: When the latency-format option is enabled, the trace file + output includes a timestamp relative to the start of the + trace. This differs from the output when latency-format + is disabled, which includes an absolute timestamp. delay: This is just to help catch your eye a bit better. And needs to be fixed to be only relative to the same CPU. @@ -440,7 +434,8 @@ Here are the available options: sym-addr: bash-4000 [01] 1477.606694: simple_strtoul <c0339346> - verbose - This deals with the latency_trace file. + verbose - This deals with the trace file when the + latency-format option is enabled. bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ (+0.000ms): simple_strtoul (strict_strtoul) @@ -472,7 +467,7 @@ Here are the available options: the app is no longer running The lookup is performed when you read - trace,trace_pipe,latency_trace. Example: + trace,trace_pipe. 
Example: a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] @@ -481,6 +476,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] every scheduling event. Will add overhead if there's a lot of tasks running at once. + latency-format - This option changes the trace. When + it is enabled, the trace displays + additional information about the + latencies, as described in "Latency + trace format". sched_switch ------------ @@ -596,12 +596,13 @@ To reset the maximum, echo 0 into tracing_max_latency. Here is an example: # echo irqsoff > current_tracer + # echo latency-format > trace_options # echo 0 > tracing_max_latency # echo 1 > tracing_enabled # ls -ltr [...] # echo 0 > tracing_enabled - # cat latency_trace + # cat trace # tracer: irqsoff # irqsoff latency trace v1.1.5 on 2.6.26 @@ -703,12 +704,13 @@ which preemption was disabled. The control of preemptoff tracer is much like the irqsoff tracer. # echo preemptoff > current_tracer + # echo latency-format > trace_options # echo 0 > tracing_max_latency # echo 1 > tracing_enabled # ls -ltr [...] # echo 0 > tracing_enabled - # cat latency_trace + # cat trace # tracer: preemptoff # preemptoff latency trace v1.1.5 on 2.6.26-rc8 @@ -850,12 +852,13 @@ Again, using this trace is much like the irqsoff and preemptoff tracers. # echo preemptirqsoff > current_tracer + # echo latency-format > trace_options # echo 0 > tracing_max_latency # echo 1 > tracing_enabled # ls -ltr [...] # echo 0 > tracing_enabled - # cat latency_trace + # cat trace # tracer: preemptirqsoff # preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 @@ -1012,11 +1015,12 @@ Instead of performing an 'ls', we will run 'sleep 1' under 'chrt' which changes the priority of the task. 
# echo wakeup > current_tracer + # echo latency-format > trace_options # echo 0 > tracing_max_latency # echo 1 > tracing_enabled # chrt -f 5 sleep 1 # echo 0 > tracing_enabled - # cat latency_trace + # cat trace # tracer: wakeup # wakeup latency trace v1.1.5 on 2.6.26-rc8 diff --git a/Documentation/trace/function-graph-fold.vim b/Documentation/trace/function-graph-fold.vim new file mode 100644 index 0000000..0544b50 --- /dev/null +++ b/Documentation/trace/function-graph-fold.vim @@ -0,0 +1,42 @@ +" Enable folding for ftrace function_graph traces. +" +" To use, :source this file while viewing a function_graph trace, or use vim's +" -S option to load from the command-line together with a trace. You can then +" use the usual vim fold commands, such as "za", to open and close nested +" functions. While closed, a fold will show the total time taken for a call, +" as would normally appear on the line with the closing brace. Folded +" functions will not include finish_task_switch(), so folding should remain +" relatively sane even through a context switch. +" +" Note that this will almost certainly only work well with a +" single-CPU trace (e.g. trace-cmd report --cpu 1). + +function! FunctionGraphFoldExpr(lnum) + let line = getline(a:lnum) + if line[-1:] == '{' + if line =~ 'finish_task_switch() {$' + return '>1' + endif + return 'a1' + elseif line[-1:] == '}' + return 's1' + else + return '=' + endif +endfunction + +function! 
FunctionGraphFoldText() + let s = split(getline(v:foldstart), '|', 1) + if getline(v:foldend+1) =~ 'finish_task_switch() {$' + let s[2] = ' task switch ' + else + let e = split(getline(v:foldend), '|', 1) + let s[2] = e[2] + endif + return join(s, '|') +endfunction + +setlocal foldexpr=FunctionGraphFoldExpr(v:lnum) +setlocal foldtext=FunctionGraphFoldText() +setlocal foldcolumn=12 +setlocal foldmethod=expr diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt new file mode 100644 index 0000000..6645057 --- /dev/null +++ b/Documentation/trace/histograms.txt @@ -0,0 +1,156 @@ + Using the Linux Kernel Latency Histograms + + +This document gives a short explanation how to enable, configure and use +latency histograms. Latency histograms are primarily relevant in the +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT) +and are used in the quality management of the Linux real-time +capabilities. + + +* Purpose of latency histograms + +A latency histogram continuously accumulates the frequencies of latency +data. There are two types of histograms +- potential sources of latencies +- effective latencies + + +* Potential sources of latencies + +Potential sources of latencies are code segments where interrupts, +preemption or both are disabled (aka critical sections). To create +histograms of potential sources of latency, the kernel stores the time +stamp at the start of a critical section, determines the time elapsed +when the end of the section is reached, and increments the frequency +counter of that latency value - irrespective of whether any concurrently +running process is affected by latency or not. +- Configuration items (in the Kernel hacking/Tracers submenu) + CONFIG_INTERRUPT_OFF_LATENCY + CONFIG_PREEMPT_OFF_LATENCY + + +* Effective latencies + +Effective latencies are actually occuring during wakeup of a process. 
To +determine effective latencies, the kernel stores the time stamp when a +process is scheduled to be woken up, and determines the duration of the +wakeup time shortly before control is passed over to this process. Note +that the apparent latency in user space may be considerably longer, +since +i) interrupts may be disabled preventing the scheduler from initiating +the wakeup mechanism, and +ii) the process may be interrupted after control is passed over to it +but before user space execution takes place. +- Configuration item (in the Kernel hacking/Tracers submenu) + CONFIG_WAKEUP_LATENCY + + +* Usage + +The interface to the administration of the latency histograms is located +in the debugfs file system. To mount it, either enter + +mount -t sysfs nodev /sys +mount -t debugfs nodev /sys/kernel/debug + +from shell command line level, or add + +nodev /sys sysfs defaults 0 0 +nodev /sys/kernel/debug debugfs defaults 0 0 + +to the file /etc/fstab. All latency histogram related files are +available in the directory /sys/kernel/debug/tracing/latency_hist. A +particular histogram type is enabled by writing non-zero to the related +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory. +Select "preemptirqsoff" for the histograms of potential sources of +latencies and "wakeup" for histograms of effective latencies. The +histogram data - one per CPU - are available in the files + +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx. + +The histograms are reset by writing non-zero to the file "reset" in a +particular latency directory. 
To reset all latency data, use + +#!/bin/sh + +HISTDIR=/sys/kernel/debug/tracing/latency_hist + +if test -d $HISTDIR +then + cd $HISTDIR + for i in */reset + do + echo 1 >$i + done +fi + + +* Data format + +Latency data are stored with a resolution of one microsecond. The +maximum latency is 10,240 microseconds. The data are only valid, if the +overflow register is empty. Every output line contains the latency in +microseconds in the first column and the number of samples in the second +column. To display only lines with a positive latency count, use, for +example, + +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0 + +#Minimum latency: 0 microseconds. +#Average latency: 0 microseconds. +#Maximum latency: 25 microseconds. +#Total samples: 3104770694 +#There are 0 samples greater or equal than 10240 microseconds +#usecs samples + 0 2984486876 + 1 49843506 + 2 58219047 + 3 5348126 + 4 2187960 + 5 3388262 + 6 959289 + 7 208294 + 8 40420 + 9 4485 + 10 14918 + 11 18340 + 12 25052 + 13 19455 + 14 5602 + 15 969 + 16 47 + 17 18 + 18 14 + 19 1 + 20 3 + 21 2 + 22 5 + 23 2 + 25 1 + + +* Wakeup latency of a selected process + +To only collect wakeup latency data of a particular process, write the +PID of the requested process to + +/sys/kernel/debug/tracing/latency_hist/wakeup/pid. + +PIDs are not considered, if this variable is set to 0. + + +* Details of the process with the highest wakeup latency so far + +Selected data of the process that suffered from the highest wakeup +latency that occurred in a particular CPU are available in the file + +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx. + +The format of the data is +<PID> <Priority> <Latency> <Command> + +These data are also reset when the wakeup histogram is reset. 
diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt new file mode 100644 index 0000000..5b1d23d --- /dev/null +++ b/Documentation/trace/ring-buffer-design.txt @@ -0,0 +1,955 @@ + Lockless Ring Buffer Design + =========================== + +Copyright 2009 Red Hat Inc. + Author: Steven Rostedt <srostedt@redhat.com> + License: The GNU Free Documentation License, Version 1.2 + (dual licensed under the GPL v2) +Reviewers: Mathieu Desnoyers, Huang Ying, Hidetoshi Seto, + and Frederic Weisbecker. + + +Written for: 2.6.31 + +Terminology used in this Document +--------------------------------- + +tail - where new writes happen in the ring buffer. + +head - where new reads happen in the ring buffer. + +producer - the task that writes into the ring buffer (same as writer) + +writer - same as producer + +consumer - the task that reads from the buffer (same as reader) + +reader - same as consumer. + +reader_page - A page outside the ring buffer used solely (for the most part) + by the reader. + +head_page - a pointer to the page that the reader will use next + +tail_page - a pointer to the page that will be written to next + +commit_page - a pointer to the page with the last finished non nested write. + +cmpxchg - hardware assisted atomic transaction that performs the following: + + A = B iff previous A == C + + R = cmpxchg(A, C, B) is saying that we replace A with B if and only if + current A is equal to C, and we put the old (current) A into R + + R gets the previous A regardless if A is updated with B or not. + + To see if the update was successful a compare of R == C may be used. + +The Generic Ring Buffer +----------------------- + +The ring buffer can be used in either an overwrite mode or in +producer/consumer mode. + +Producer/consumer mode is where, if the producer were to fill up the +buffer before the consumer could free up anything, the producer +will stop writing to the buffer. This will lose the most recent events. 
+ +Overwrite mode is where, if the producer were to fill up the buffer +before the consumer could free up anything, the producer will +overwrite the older data. This will lose the oldest events. + +No two writers can write at the same time (on the same per cpu buffer), +but a writer may interrupt another writer, but it must finish writing +before the previous writer may continue. This is very important to the +algorithm. The writers act like a "stack". The way interrupts work +enforces this behavior. + + + writer1 start + <preempted> writer2 start + <preempted> writer3 start + writer3 finishes + writer2 finishes + writer1 finishes + +This is very much like a writer being preempted by an interrupt and +the interrupt doing a write as well. + +Readers can happen at any time. But no two readers may run at the +same time, nor can a reader preempt/interrupt another reader. A reader +can not preempt/interrupt a writer, but it may read/consume from the +buffer at the same time as a writer is writing, but the reader must be +on another processor to do so. A reader may read on its own processor +and can be preempted by a writer. + +A writer can preempt a reader, but a reader can not preempt a writer. +But a reader can read the buffer at the same time (on another processor) +as a writer. + +The ring buffer is made up of a list of pages held together by a linked list. + +At initialization a reader page is allocated for the reader that is not +part of the ring buffer. + +The head_page, tail_page and commit_page are all initialized to point +to the same page. + +The reader page is initialized to have its next pointer pointing to +the head page, and its previous pointer pointing to a page before +the head page. + +The reader has its own page to use. At start up time, this page is +allocated but is not attached to the list. When the reader wants +to read from the buffer, if its page is empty (like it is on start up) +it will swap its page with the head_page. 
The old reader page will +become part of the ring buffer and the head_page will be removed. +The page after the inserted page (old reader_page) will become the +new head page. + +Once the new page is given to the reader, the reader could do what +it wants with it, as long as a writer has left that page. + +A sample of how the reader page is swapped: Note this does not +show the head page in the buffer, it is for demonstrating a swap +only. + + +------+ + |reader| RING BUFFER + |page | + +------+ + +---+ +---+ +---+ + | |-->| |-->| | + | |<--| |<--| | + +---+ +---+ +---+ + ^ | ^ | + | +-------------+ | + +-----------------+ + + + +------+ + |reader| RING BUFFER + |page |-------------------+ + +------+ v + | +---+ +---+ +---+ + | | |-->| |-->| | + | | |<--| |<--| |<-+ + | +---+ +---+ +---+ | + | ^ | ^ | | + | | +-------------+ | | + | +-----------------+ | + +------------------------------------+ + + +------+ + |reader| RING BUFFER + |page |-------------------+ + +------+ <---------------+ v + | ^ +---+ +---+ +---+ + | | | |-->| |-->| | + | | | | | |<--| |<-+ + | | +---+ +---+ +---+ | + | | | ^ | | + | | +-------------+ | | + | +-----------------------------+ | + +------------------------------------+ + + +------+ + |buffer| RING BUFFER + |page |-------------------+ + +------+ <---------------+ v + | ^ +---+ +---+ +---+ + | | | | | |-->| | + | | New | | | |<--| |<-+ + | | Reader +---+ +---+ +---+ | + | | page ----^ | | + | | | | + | +-----------------------------+ | + +------------------------------------+ + + + +It is possible that the page swapped is the commit page and the tail page, +if what is in the ring buffer is less than what is held in a buffer page. 
+ + + reader page commit page tail page + | | | + v | | + +---+ | | + | |<----------+ | + | |<------------------------+ + | |------+ + +---+ | + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +This case is still valid for this algorithm. +When the writer leaves the page, it simply goes into the ring buffer +since the reader page still points to the next location in the ring +buffer. + + +The main pointers: + + reader page - The page used solely by the reader and is not part + of the ring buffer (may be swapped in) + + head page - the next page in the ring buffer that will be swapped + with the reader page. + + tail page - the page where the next write will take place. + + commit page - the page that last finished a write. + +The commit page only is updated by the outer most writer in the +writer stack. A writer that preempts another writer will not move the +commit page. + +When data is written into the ring buffer, a position is reserved +in the ring buffer and passed back to the writer. When the writer +is finished writing data into that position, it commits the write. + +Another write (or a read) may take place at anytime during this +transaction. If another write happens it must finish before continuing +with the previous write. 
+ + + Write reserve: + + Buffer page + +---------+ + |written | + +---------+ <--- given back to writer (current commit) + |reserved | + +---------+ <--- tail pointer + | empty | + +---------+ + + Write commit: + + Buffer page + +---------+ + |written | + +---------+ + |written | + +---------+ <--- next positon for write (current commit) + | empty | + +---------+ + + + If a write happens after the first reserve: + + Buffer page + +---------+ + |written | + +---------+ <-- current commit + |reserved | + +---------+ <--- given back to second writer + |reserved | + +---------+ <--- tail pointer + + After second writer commits: + + + Buffer page + +---------+ + |written | + +---------+ <--(last full commit) + |reserved | + +---------+ + |pending | + |commit | + +---------+ <--- tail pointer + + When the first writer commits: + + Buffer page + +---------+ + |written | + +---------+ + |written | + +---------+ + |written | + +---------+ <--(last full commit and tail pointer) + + +The commit pointer points to the last write location that was +committed without preempting another write. When a write that +preempted another write is committed, it only becomes a pending commit +and will not be a full commit till all writes have been committed. + +The commit page points to the page that has the last full commit. +The tail page points to the page with the last write (before +committing). + +The tail page is always equal to or after the commit page. It may +be several pages ahead. If the tail page catches up to the commit +page then no more writes may take place (regardless of the mode +of the ring buffer: overwrite and produce/consumer). 
+ +The order of pages is: + + head page + commit page + tail page + +Possible scenario: + tail page + head page commit page | + | | | + v v v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +There is a special case that the head page is after the commit page +and possibly the tail page. That is when the commit (and tail) page has been +swapped with the reader page. This is because the head page is always +part of the ring buffer, but the reader page is not. Whenever there +has been less than a full page that has been committed inside the ring buffer, +and a reader swaps out a page, it will be swapping out the commit page. + + + reader page commit page tail page + | | | + v | | + +---+ | | + | |<----------+ | + | |<------------------------+ + | |------+ + +---+ | + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + head page + + +In this case, the head page will not move when the tail and commit +move back into the ring buffer. + +The reader can not swap a page into the ring buffer if the commit page +is still on that page. If the read meets the last commit (real commit +not pending or reserved), then there is nothing more to read. +The buffer is considered empty until another full commit finishes. + +When the tail meets the head page, if the buffer is in overwrite mode, +the head page will be pushed ahead one. If the buffer is in producer/consumer +mode, the write will fail. 
+ +Overwrite mode: + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + head page + + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + head page + + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + head page + +Note, the reader page will still point to the previous head page. +But when a swap takes place, it will use the most recent head page. + + +Making the Ring Buffer Lockless: +-------------------------------- + +The main idea behind the lockless algorithm is to combine the moving +of the head_page pointer with the swapping of pages with the reader. +State flags are placed inside the pointer to the page. To do this, +each page must be aligned in memory by 4 bytes. This will allow the 2 +least significant bits of the address to be used as flags. Since +they will always be zero for the address. To get the address, +simply mask out the flags. + + MASK = ~3 + + address & MASK + +Two flags will be kept by these two bits: + + HEADER - the page being pointed to is a head page + + UPDATE - the page being pointed to is being updated by a writer + and was or is about to be a head page. + + + reader page + | + v + +---+ + | |------+ + +---+ | + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +The above pointer "-H->" would have the HEADER flag set. That is +the next page is the next page to be swapped out by the reader. +This pointer means the next page is the head page. 
+ +When the tail page meets the head pointer, it will use cmpxchg to +change the pointer to the UPDATE state: + + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +"-U->" represents a pointer in the UPDATE state. + +Any access to the reader will need to take some sort of lock to serialize +the readers. But the writers will never take a lock to write to the +ring buffer. This means we only need to worry about a single reader, +and writes only preempt in "stack" formation. + +When the reader tries to swap the page with the ring buffer, it +will also use cmpxchg. If the flag bit in the pointer to the +head page does not have the HEADER flag set, the compare will fail +and the reader will need to look for the new head page and try again. +Note, the flag UPDATE and HEADER are never set at the same time. + +The reader swaps the reader page as follows: + + +------+ + |reader| RING BUFFER + |page | + +------+ + +---+ +---+ +---+ + | |--->| |--->| | + | |<---| |<---| | + +---+ +---+ +---+ + ^ | ^ | + | +---------------+ | + +-----H-------------+ + +The reader sets the reader page next pointer as HEADER to the page after +the head page. + + + +------+ + |reader| RING BUFFER + |page |-------H-----------+ + +------+ v + | +---+ +---+ +---+ + | | |--->| |--->| | + | | |<---| |<---| |<-+ + | +---+ +---+ +---+ | + | ^ | ^ | | + | | +---------------+ | | + | +-----H-------------+ | + +--------------------------------------+ + +It does a cmpxchg with the pointer to the previous head page to make it +point to the reader page. Note that the new pointer does not have the HEADER +flag set. This action atomically moves the head page forward. 
+ + +------+ + |reader| RING BUFFER + |page |-------H-----------+ + +------+ v + | ^ +---+ +---+ +---+ + | | | |-->| |-->| | + | | | |<--| |<--| |<-+ + | | +---+ +---+ +---+ | + | | | ^ | | + | | +-------------+ | | + | +-----------------------------+ | + +------------------------------------+ + +After the new head page is set, the previous pointer of the head page is +updated to the reader page. + + +------+ + |reader| RING BUFFER + |page |-------H-----------+ + +------+ <---------------+ v + | ^ +---+ +---+ +---+ + | | | |-->| |-->| | + | | | | | |<--| |<-+ + | | +---+ +---+ +---+ | + | | | ^ | | + | | +-------------+ | | + | +-----------------------------+ | + +------------------------------------+ + + +------+ + |buffer| RING BUFFER + |page |-------H-----------+ <--- New head page + +------+ <---------------+ v + | ^ +---+ +---+ +---+ + | | | | | |-->| | + | | New | | | |<--| |<-+ + | | Reader +---+ +---+ +---+ | + | | page ----^ | | + | | | | + | +-----------------------------+ | + +------------------------------------+ + +Another important point. The page that the reader page points back to +by its previous pointer (the one that now points to the new head page) +never points back to the reader page. That is because the reader page is +not part of the ring buffer. Traversing the ring buffer via the next pointers +will always stay in the ring buffer. Traversing the ring buffer via the +prev pointers may not. + +Note, the way to determine a reader page is simply by examining the previous +pointer of the page. 
If the next pointer of the previous page does not +point back to the original page, then the original page is a reader page: + + + +--------+ + | reader | next +----+ + | page |-------->| |<====== (buffer page) + +--------+ +----+ + | | ^ + | v | next + prev | +----+ + +------------->| | + +----+ + +The way the head page moves forward: + +When the tail page meets the head page and the buffer is in overwrite mode +and more writes take place, the head page must be moved forward before the +writer may move the tail page. The way this is done is that the writer +performs a cmpxchg to convert the pointer to the head page from the HEADER +flag to have the UPDATE flag set. Once this is done, the reader will +not be able to swap the head page from the buffer, nor will it be able to +move the head page, until the writer is finished with the move. + +This eliminates any races that the reader can have on the writer. The reader +must spin, and this is why the reader can not preempt the writer. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The following page will be made into the new head page. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +After the new head page has been set, we can set the old head page +pointer back to NORMAL. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +After the head page has been moved, the tail page may now move forward. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +The above are the trivial updates. 
Now for the more complex scenarios. + + +As stated before, if enough writes preempt the first write, the +tail page may make it all the way around the buffer and meet the commit +page. At this time, we must start dropping writes (usually with some kind +of warning to the user). But what happens if the commit was still on the +reader page? The commit page is not part of the ring buffer. The tail page +must account for this. + + + reader page commit page + | | + v | + +---+ | + | |<----------+ + | | + | |------+ + +---+ | + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + tail page + +If the tail page were to simply push the head page forward, the commit when +leaving the reader page would not be pointing to the correct page. + +The solution to this is to test if the commit page is on the reader page +before pushing the head page. If it is, then it can be assumed that the +tail page wrapped the buffer, and we must drop new writes. + +This is not a race condition, because the commit page can only be moved +by the outermost writer (the writer that was preempted). +This means that the commit will not move while a writer is moving the +tail page. The reader can not swap the reader page if it is also being +used as the commit page. The reader can simply check that the commit +is off the reader page. Once the commit page leaves the reader page +it will never go back on it unless a reader does another swap with the +buffer page that is also the commit page. + + +Nested writes +------------- + +In the pushing forward of the tail page we must first push forward +the head page if the head page is the next page. If the head page +is not the next page, the tail page is simply updated with a cmpxchg. + +Only writers move the tail page. This must be done atomically to protect +against nested writers. 
+ + temp_page = tail_page + next_page = temp_page->next + cmpxchg(tail_page, temp_page, next_page) + +The above will update the tail page if it is still pointing to the expected +page. If this fails, a nested write pushed it forward, so the current write +does not need to push it. + + + temp page + | + v + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Nested write comes in and moves the tail page forward: + + tail page (moved by nested writer) + temp page | + | | + v v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The above would fail the cmpxchg, but since the tail page has already +been moved forward, the writer will just try again to reserve storage +on the new tail page. + +But the moving of the head page is a bit more complex. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The write converts the head page pointer to UPDATE. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +But if a nested writer preempts here, it will see that the next +page is a head page, but it is also nested. It will detect that +it is nested and will save that information. The detection is the +fact that it sees the UPDATE flag instead of a HEADER or NORMAL +pointer. + +The nested writer will set the new head page pointer. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +But it will not reset the update back to normal. Only the writer +that converted a pointer from HEAD to UPDATE will convert it back +to NORMAL. 
+ + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +After the nested writer finishes, the outer most writer will convert +the UPDATE pointer to NORMAL. + + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +It can be even more complex if several nested writes came in and moved +the tail page ahead several pages: + + +(first writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The write converts the head page pointer to UPDATE. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Next writer comes in, and sees the update and sets up the new +head page. + +(second writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The nested writer moves the tail page forward. But does not set the old +update page to NORMAL because it is not the outer most writer. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Another writer preempts and sees the page after the tail page is a head page. +It changes it from HEAD to UPDATE. 
+ +(third writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-U->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The writer will move the head page forward: + + +(third writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-U->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +But now that the third writer did change the HEAD flag to UPDATE it +will convert it to normal: + + +(third writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +Then it will move the tail page, and return back to the second writer. + + +(second writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +The second writer will fail to move the tail page because it was already +moved, so it will try again and add its data to the new tail page. +It will return to the first writer. + + +(first writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The first writer cannot atomically test if the tail page moved +while it updates the HEAD page. It will then update the head page to +what it thinks is the new head page. + + +(first writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Since the cmpxchg returns the old value of the pointer the first writer +will see it succeeded in updating the pointer from NORMAL to HEAD. +But as we can see, this is not good enough. 
It must also check to see +if the tail page is either where it use to be or on the next page: + + +(first writer) + + A B tail page + | | | + v v v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +If tail page != A and tail page does not equal B, then it must reset the +pointer back to NORMAL. The fact that it only needs to worry about +nested writers, it only needs to check this after setting the HEAD page. + + +(first writer) + + A B tail page + | | | + v v v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Now the writer can update the head page. This is also why the head page must +remain in UPDATE and only reset by the outer most writer. This prevents +the reader from seeing the incorrect head page. + + +(first writer) + + A B tail page + | | | + v v v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + diff --git a/MAINTAINERS b/MAINTAINERS index 8dca9d8..23389b1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2260,6 +2260,15 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-2.6.git S: Maintained F: drivers/media/video/gspca/ +HARDWARE LATENCY DETECTOR +P: Jon Masters +M: jcm@jonmasters.org +W: http://www.kernel.org/pub/linux/kernel/people/jcm/hwlat_detector/ +S: Supported +L: linux-kernel@vger.kernel.org +F: Documentation/hwlat_detector.txt +F: drivers/misc/hwlat_detector.c + HARDWARE MONITORING L: lm-sensors@lm-sensors.org W: http://www.lm-sensors.org/ diff --git a/Makefile b/Makefile index ea8734a..7db5e21 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 31 -EXTRAVERSION = .1 +EXTRAVERSION =.1-rt12 NAME = Man-Eating Seals of Antiquity # *DOCUMENTATION* diff --git a/arch/Kconfig b/arch/Kconfig index 99193b1..1b28306 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -33,6 +33,11 @@ 
config OPROFILE_IBS config HAVE_OPROFILE bool +config PROFILE_NMI + bool + depends on OPROFILE + default y + config KPROBES bool "Kprobes" depends on KALLSYMS && MODULES diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index 1570c0b..55f4f13 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -18,15 +18,18 @@ struct rwsem_waiter; -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore *rwsem_wake(struct rw_anon_semaphore *); +extern struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *sem); /* * the semaphore definition */ -struct rw_semaphore { +struct rw_anon_semaphore { long count; #define RWSEM_UNLOCKED_VALUE 0x0000000000000000L #define RWSEM_ACTIVE_BIAS 0x0000000000000001L @@ -38,6 +41,31 @@ struct rw_semaphore { struct list_head wait_list; }; +#define __RWSEM_ANON_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ + LIST_HEAD_INIT((name).wait_list) } + +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) + +static inline void init_anon_rwsem(struct rw_anon_semaphore *sem) +{ + sem->count = RWSEM_UNLOCKED_VALUE; + spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + +struct rw_semaphore { + long count; + spinlock_t wait_lock; + struct list_head wait_list; +}; + #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, 
\ LIST_HEAD_INIT((name).wait_list) } @@ -47,12 +75,15 @@ struct rw_semaphore { static inline void init_rwsem(struct rw_semaphore *sem) { - sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); + init_anon_rwsem((struct rw_anon_semaphore *)sem); } -static inline void __down_read(struct rw_semaphore *sem) +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->count != 0); +} + +static inline void __down_read(struct rw_anon_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -79,7 +110,7 @@ static inline void __down_read(struct rw_semaphore *sem) /* * trylock for reading -- returns 1 if successful, 0 if contention */ -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct rw_anon_semaphore *sem) { long old, new, res; @@ -94,7 +125,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) return res >= 0 ? 1 : 0; } -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct rw_anon_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -121,7 +152,7 @@ static inline void __down_write(struct rw_semaphore *sem) /* * trylock for writing -- returns 1 if successful, 0 if contention */ -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct rw_anon_semaphore *sem) { long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); @@ -130,7 +161,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) return 0; } -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_anon_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -155,7 +186,7 @@ static inline void __up_read(struct rw_semaphore *sem) rwsem_wake(sem); } -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct rw_anon_semaphore *sem) { long count; #ifndef CONFIG_SMP @@ 
-184,7 +215,7 @@ static inline void __up_write(struct rw_semaphore *sem) /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct rw_anon_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -208,7 +239,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) rwsem_downgrade_wake(sem); } -static inline void rwsem_atomic_add(long val, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(long val, struct rw_anon_semaphore *sem) { #ifndef CONFIG_SMP sem->count += val; @@ -227,7 +258,7 @@ static inline void rwsem_atomic_add(long val, struct rw_semaphore *sem) #endif } -static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem) +static inline long rwsem_atomic_update(long val, struct rw_anon_semaphore *sem) { #ifndef CONFIG_SMP sem->count += val; @@ -250,10 +281,5 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem) #endif } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _ALPHA_RWSEM_H */ diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c index cc78346..2e0b75e 100644 --- a/arch/alpha/kernel/irq.c +++ b/arch/alpha/kernel/irq.c @@ -81,7 +81,7 @@ show_interrupts(struct seq_file *p, void *v) #endif if (irq < ACTUAL_NR_IRQS) { - spin_lock_irqsave(&irq_desc[irq].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[irq].lock, flags); action = irq_desc[irq].action; if (!action) goto unlock; @@ -105,7 +105,7 @@ show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); unlock: - spin_unlock_irqrestore(&irq_desc[irq].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[irq].lock, flags); } else if (irq == ACTUAL_NR_IRQS) { #ifdef CONFIG_SMP seq_puts(p, "IPI: "); diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index b04e2cb..991a967 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ 
-106,7 +106,7 @@ irqreturn_t timer_interrupt(int irq, void *dev) profile_tick(CPU_PROFILING); #endif - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); /* * Calculate how many ticks have passed since the last update, @@ -136,7 +136,7 @@ irqreturn_t timer_interrupt(int irq, void *dev) state.last_rtc_update = xtime.tv_sec - (tmp ? 600 : 0); } - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); #ifndef CONFIG_SMP while (nticks--) @@ -416,14 +416,14 @@ do_gettimeofday(struct timeval *tv) unsigned long delta_cycles, delta_usec, partial_tick; do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); + seq = read_atomic_seqbegin_irqsave(&xtime_lock, flags); delta_cycles = rpcc() - state.last_time; sec = xtime.tv_sec; usec = (xtime.tv_nsec / 1000); partial_tick = state.partial_tick; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); + } while (read_atomic_seqretry_irqrestore(&xtime_lock, seq, flags)); #ifdef CONFIG_SMP /* Until and unless we figure out how to get cpu cycle counters @@ -470,7 +470,7 @@ do_settimeofday(struct timespec *tv) if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irq(&xtime_lock); + write_atomic_seqlock_irq(&xtime_lock); /* The offset that is added into time in do_gettimeofday above must be subtracted out here to keep a coherent view of the @@ -496,7 +496,7 @@ do_settimeofday(struct timespec *tv) ntp_clear(); - write_sequnlock_irq(&xtime_lock); + write_atomic_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; } diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index aef63c8..eac1a92 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -962,18 +962,7 @@ config LOCAL_TIMERS accounting to be spread across the timer interval, preventing a "thundering herd" at every timer tick. 
-config PREEMPT - bool "Preemptible Kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +source kernel/Kconfig.preempt config HZ int diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h index 7edf353..c682ef4 100644 --- a/arch/arm/include/asm/dma.h +++ b/arch/arm/include/asm/dma.h @@ -31,18 +31,18 @@ #define DMA_MODE_CASCADE 0xc0 #define DMA_AUTOINIT 0x10 -extern spinlock_t dma_spin_lock; +extern atomic_spinlock_t dma_spin_lock; static inline unsigned long claim_dma_lock(void) { unsigned long flags; - spin_lock_irqsave(&dma_spin_lock, flags); + atomic_spin_lock_irqsave(&dma_spin_lock, flags); return flags; } static inline void release_dma_lock(unsigned long flags) { - spin_unlock_irqrestore(&dma_spin_lock, flags); + atomic_spin_unlock_irqrestore(&dma_spin_lock, flags); } /* Clear the 'DMA Pointer Flip Flop'. diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index 39c8bc1..d74265c 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -11,4 +11,38 @@ extern void mcount(void); #endif +#ifndef __ASSEMBLY__ + +#if defined(CONFIG_FRAME_POINTER) && !defined(CONFIG_ARM_UNWIND) +/* + * return_address uses walk_stackframe to do it's work. If both + * CONFIG_FRAME_POINTER=y and CONFIG_ARM_UNWIND=y walk_stackframe uses unwind + * information. For this to work in the function tracer many functions would + * have to be marked with __notrace. So for now just depend on + * !CONFIG_ARM_UNWIND. 
+ */ + +void *return_address(unsigned int); + +#else + +extern inline void *return_address(unsigned int level) +{ + return NULL; +} + +#endif + +#define HAVE_ARCH_CALLER_ADDR + +#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +#define CALLER_ADDR1 ((unsigned long)return_address(1)) +#define CALLER_ADDR2 ((unsigned long)return_address(2)) +#define CALLER_ADDR3 ((unsigned long)return_address(3)) +#define CALLER_ADDR4 ((unsigned long)return_address(4)) +#define CALLER_ADDR5 ((unsigned long)return_address(5)) +#define CALLER_ADDR6 ((unsigned long)return_address(6)) + +#endif /* ifndef __ASSEMBLY__ */ + #endif /* _ASM_ARM_FTRACE */ diff --git a/arch/arm/include/asm/mach/irq.h b/arch/arm/include/asm/mach/irq.h index acac530..3981cf2 100644 --- a/arch/arm/include/asm/mach/irq.h +++ b/arch/arm/include/asm/mach/irq.h @@ -26,9 +26,9 @@ extern int show_fiq_list(struct seq_file *, void *); */ #define do_bad_IRQ(irq,desc) \ do { \ - spin_lock(&desc->lock); \ + atomic_spin_lock(&desc->lock); \ handle_bad_irq(irq, desc); \ - spin_unlock(&desc->lock); \ + atomic_spin_unlock(&desc->lock); \ } while(0) #endif diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index d65b2f5..e849ed9 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -60,6 +60,8 @@ #include <linux/linkage.h> #include <linux/irqflags.h> +#include <asm/memory.h> + #define __exception __attribute__((section(".exception.text"))) struct thread_info; diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index f41a6f5..dd667f2 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -40,17 +40,12 @@ struct mmu_gather { unsigned long range_end; }; -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); - -static inline struct mmu_gather * -tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) +static inline void +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned int full_mm_flush) { - struct 
mmu_gather *tlb = &get_cpu_var(mmu_gathers); - tlb->mm = mm; tlb->fullmm = full_mm_flush; - - return tlb; } static inline void @@ -61,8 +56,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) /* keep the page table cache within bounds */ check_pgt_cache(); - - put_cpu_var(mmu_gathers); } /* diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index ff89d0b..3213c93 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -8,10 +8,12 @@ ifdef CONFIG_DYNAMIC_FTRACE CFLAGS_REMOVE_ftrace.o = -pg endif +CFLAGS_REMOVE_return_address.o = -pg + # Object file lists. obj-y := compat.o elf.o entry-armv.o entry-common.o irq.o \ - process.o ptrace.o setup.o signal.o \ + process.o ptrace.o return_address.o setup.o signal.o \ sys_arm.o stacktrace.o time.o traps.o obj-$(CONFIG_ISA_DMA_API) += dma.o diff --git a/arch/arm/kernel/dma.c b/arch/arm/kernel/dma.c index 7d5b9fb..7a6f3d2 100644 --- a/arch/arm/kernel/dma.c +++ b/arch/arm/kernel/dma.c @@ -21,7 +21,7 @@ #include <asm/mach/dma.h> -DEFINE_SPINLOCK(dma_spin_lock); +DEFINE_ATOMIC_SPINLOCK(dma_spin_lock); EXPORT_SYMBOL(dma_spin_lock); static dma_t *dma_chan[MAX_DMA_CHANNELS]; diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 8c3de1a..aa5df73 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -59,7 +59,8 @@ work_pending: b ret_slow_syscall @ Check work again work_resched: - bl schedule + bl __schedule + /* * "slow" syscall return path. "why" tells us if this was a real syscall. 
*/ diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c index b7c3490..6ea2a03 100644 --- a/arch/arm/kernel/irq.c +++ b/arch/arm/kernel/irq.c @@ -69,7 +69,7 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto unlock; @@ -84,7 +84,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); unlock: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { #ifdef CONFIG_ARCH_ACORN show_fiq_list(p, v); @@ -139,7 +139,7 @@ void set_irq_flags(unsigned int irq, unsigned int iflags) } desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; if (iflags & IRQF_VALID) desc->status &= ~IRQ_NOREQUEST; @@ -147,7 +147,7 @@ void set_irq_flags(unsigned int irq, unsigned int iflags) desc->status &= ~IRQ_NOPROBE; if (!(iflags & IRQF_NOAUTOEN)) desc->status &= ~IRQ_NOAUTOEN; - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } void __init init_IRQ(void) @@ -166,9 +166,9 @@ static void route_irq(struct irq_desc *desc, unsigned int irq, unsigned int cpu) { pr_debug("IRQ%u: moving from cpu%u to cpu%u\n", irq, desc->node, cpu); - spin_lock_irq(&desc->lock); + atomic_spin_lock_irq(&desc->lock); desc->chip->set_affinity(irq, cpumask_of(cpu)); - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); } /* diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 39196df..fcecbb2 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -174,9 +174,11 @@ void cpu_idle(void) } leds_event(led_idle_end); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + 
__preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } diff --git a/arch/arm/kernel/return_address.c b/arch/arm/kernel/return_address.c new file mode 100644 index 0000000..df246da --- /dev/null +++ b/arch/arm/kernel/return_address.c @@ -0,0 +1,71 @@ +/* + * arch/arm/kernel/return_address.c -- frame-pointer based return address lookup + * + * Copyright (C) 2009 Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de> + * for Pengutronix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ +#include <linux/module.h> + +#if defined(CONFIG_FRAME_POINTER) && !defined(CONFIG_ARM_UNWIND) +#include <linux/sched.h> + +#include <asm/stacktrace.h> + +struct return_address_data { + unsigned int level; /* frames still to skip before capturing */ + void *addr; /* lr of the requested frame, written once level reaches 0 */ +}; + +static int save_return_addr(struct stackframe *frame, void *d) +{ + struct return_address_data *data = d; + + if (!data->level) { + data->addr = (void *)frame->lr; + + return 1; /* NOTE(review): non-zero presumably stops walk_stackframe() -- confirm */ + } else { + --data->level; + return 0; + } +} + +void *return_address(unsigned int level) +{ + struct return_address_data data; + struct stackframe frame; + register unsigned long current_sp asm ("sp"); + + data.level = level + 1; /* +1: the walk below starts in return_address() itself */ + + frame.fp = (unsigned long)__builtin_frame_address(0); + frame.sp = current_sp; + frame.lr = (unsigned long)__builtin_return_address(0); + frame.pc = (unsigned long)return_address; + + walk_stackframe(&frame, save_return_addr, &data); + + if (!data.level) + return data.addr; + else + return NULL; /* walk ended before the requested level was reached */ +} + +#else /* if defined(CONFIG_FRAME_POINTER) && !defined(CONFIG_ARM_UNWIND) */ + +#if defined(CONFIG_ARM_UNWIND) +#warning "TODO: return_address should use unwind tables" +#endif + +void *return_address(unsigned int level) +{ + return NULL; /* stub: no frame-pointer walk is built in this configuration */ +} + +#endif /* if defined(CONFIG_FRAME_POINTER) && !defined(CONFIG_ARM_UNWIND) / else */ + +EXPORT_SYMBOL_GPL(return_address); diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index f6bc5d4..95eb911 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -627,6 +627,14 @@ static int do_signal(sigset_t *oldset, struct pt_regs *regs, int syscall) siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * We want the common case to go fast, which * is why we may in certain cases get here from diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index de885fd..d825d4e 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -451,17 +451,17 @@ void __cpuinit percpu_timer_setup(void) local_timer_setup(evt); } -static DEFINE_SPINLOCK(stop_lock); +static DEFINE_ATOMIC_SPINLOCK(stop_lock); /* * ipi_cpu_stop - handle IPI from smp_send_stop() */ static void ipi_cpu_stop(unsigned int cpu) { - spin_lock(&stop_lock); + atomic_spin_lock(&stop_lock); printk(KERN_CRIT "CPU%u: stopping\n", cpu); dump_stack(); - spin_unlock(&stop_lock); + atomic_spin_unlock(&stop_lock); set_cpu_online(cpu, false); diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c index 9f444e5..20b7411 100644 --- a/arch/arm/kernel/stacktrace.c +++ b/arch/arm/kernel/stacktrace.c @@ -21,7 +21,7 @@ * Note that with framepointer enabled, even the leaf functions have the same * prologue and epilogue, therefore we can ignore the LR value in this case. 
*/ -int unwind_frame(struct stackframe *frame) +int notrace unwind_frame(struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; @@ -43,7 +43,7 @@ int unwind_frame(struct stackframe *frame) } #endif -void walk_stackframe(struct stackframe *frame, +void notrace walk_stackframe(struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data) { while (1) { diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 4cdc4a0..8a545e2 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -244,11 +244,11 @@ void do_gettimeofday(struct timeval *tv) unsigned long usec, sec; do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); + seq = read_atomic_seqbegin_irqsave(&xtime_lock, flags); usec = system_timer->offset(); sec = xtime.tv_sec; usec += xtime.tv_nsec / 1000; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); + } while (read_atomic_seqretry_irqrestore(&xtime_lock, seq, flags)); /* usec may have gone up a lot: be safe */ while (usec >= 1000000) { @@ -270,7 +270,7 @@ int do_settimeofday(struct timespec *tv) if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irq(&xtime_lock); + write_atomic_seqlock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. 
However, the * value in this location is the value at the most recent update of @@ -286,7 +286,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); ntp_clear(); - write_sequnlock_irq(&xtime_lock); + write_atomic_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; } @@ -336,9 +336,9 @@ void timer_tick(void) profile_tick(CPU_PROFILING); do_leds(); do_set_rtc(); - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); do_timer(1); - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); #endif diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 57eb0f6..0cf0ae3 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -239,7 +239,7 @@ static void __die(const char *str, int err, struct thread_info *thread, struct p } } -DEFINE_SPINLOCK(die_lock); +DEFINE_ATOMIC_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. 
@@ -251,12 +251,12 @@ NORET_TYPE void die(const char *str, struct pt_regs *regs, int err) oops_enter(); console_verbose(); - spin_lock_irq(&die_lock); + atomic_spin_lock_irq(&die_lock); bust_spinlocks(1); __die(str, err, thread, regs); bust_spinlocks(0); add_taint(TAINT_DIE); - spin_unlock_irq(&die_lock); + atomic_spin_unlock_irq(&die_lock); if (in_interrupt()) panic("Fatal exception in interrupt"); @@ -282,24 +282,24 @@ void arm_notify_die(const char *str, struct pt_regs *regs, } static LIST_HEAD(undef_hook); -static DEFINE_SPINLOCK(undef_lock); +static DEFINE_ATOMIC_SPINLOCK(undef_lock); void register_undef_hook(struct undef_hook *hook) { unsigned long flags; - spin_lock_irqsave(&undef_lock, flags); + atomic_spin_lock_irqsave(&undef_lock, flags); list_add(&hook->node, &undef_hook); - spin_unlock_irqrestore(&undef_lock, flags); + atomic_spin_unlock_irqrestore(&undef_lock, flags); } void unregister_undef_hook(struct undef_hook *hook) { unsigned long flags; - spin_lock_irqsave(&undef_lock, flags); + atomic_spin_lock_irqsave(&undef_lock, flags); list_del(&hook->node); - spin_unlock_irqrestore(&undef_lock, flags); + atomic_spin_unlock_irqrestore(&undef_lock, flags); } static int call_undef_hook(struct pt_regs *regs, unsigned int instr) @@ -308,12 +308,12 @@ static int call_undef_hook(struct pt_regs *regs, unsigned int instr) unsigned long flags; int (*fn)(struct pt_regs *regs, unsigned int instr) = NULL; - spin_lock_irqsave(&undef_lock, flags); + atomic_spin_lock_irqsave(&undef_lock, flags); list_for_each_entry(hook, &undef_hook, node) if ((instr & hook->instr_mask) == hook->instr_val && (regs->ARM_cpsr & hook->cpsr_mask) == hook->cpsr_val) fn = hook->fn; - spin_unlock_irqrestore(&undef_lock, flags); + atomic_spin_unlock_irqrestore(&undef_lock, flags); return fn ? 
fn(regs, instr) : 1; } diff --git a/arch/arm/mach-at91/gpio.c b/arch/arm/mach-at91/gpio.c index f2236f0..cf609f8 100644 --- a/arch/arm/mach-at91/gpio.c +++ b/arch/arm/mach-at91/gpio.c @@ -375,12 +375,18 @@ static int gpio_irq_type(unsigned pin, unsigned type) } } +static void gpio_irq_ack_noop(unsigned int irq) +{ + /* Dummy function. */ +} + static struct irq_chip gpio_irqchip = { .name = "GPIO", .mask = gpio_irq_mask, .unmask = gpio_irq_unmask, .set_type = gpio_irq_type, .set_wake = gpio_irq_set_wake, + .ack = gpio_irq_ack_noop, }; static void gpio_irq_handler(unsigned irq, struct irq_desc *desc) @@ -527,7 +533,7 @@ void __init at91_gpio_irq_setup(void) * shorter, and the AIC handles interrupts sanely. */ set_irq_chip(pin, &gpio_irqchip); - set_irq_handler(pin, handle_simple_irq); + set_irq_handler(pin, handle_edge_irq); set_irq_flags(pin, IRQF_VALID); } diff --git a/arch/arm/mach-footbridge/include/mach/hardware.h b/arch/arm/mach-footbridge/include/mach/hardware.h index 51dd902..2276ebf 100644 --- a/arch/arm/mach-footbridge/include/mach/hardware.h +++ b/arch/arm/mach-footbridge/include/mach/hardware.h @@ -86,7 +86,7 @@ #define CPLD_FLASH_WR_ENABLE 1 #ifndef __ASSEMBLY__ -extern spinlock_t nw_gpio_lock; +extern atomic_spinlock_t nw_gpio_lock; extern void nw_gpio_modify_op(unsigned int mask, unsigned int set); extern void nw_gpio_modify_io(unsigned int mask, unsigned int in); extern unsigned int nw_gpio_read(void); diff --git a/arch/arm/mach-footbridge/netwinder-hw.c b/arch/arm/mach-footbridge/netwinder-hw.c index ac7ffa6..574a57e 100644 --- a/arch/arm/mach-footbridge/netwinder-hw.c +++ b/arch/arm/mach-footbridge/netwinder-hw.c @@ -68,7 +68,7 @@ static inline void wb977_ww(int reg, int val) /* * This is a lock for accessing ports GP1_IO_BASE and GP2_IO_BASE */ -DEFINE_SPINLOCK(nw_gpio_lock); +DEFINE_ATOMIC_SPINLOCK(nw_gpio_lock); EXPORT_SYMBOL(nw_gpio_lock); static unsigned int current_gpio_op; @@ -327,9 +327,9 @@ static inline void wb977_init_gpio(void) /* * Set 
Group1/Group2 outputs */ - spin_lock_irqsave(&nw_gpio_lock, flags); + atomic_spin_lock_irqsave(&nw_gpio_lock, flags); nw_gpio_modify_op(-1, GPIO_RED_LED | GPIO_FAN); - spin_unlock_irqrestore(&nw_gpio_lock, flags); + atomic_spin_unlock_irqrestore(&nw_gpio_lock, flags); } /* @@ -390,9 +390,9 @@ static void __init cpld_init(void) { unsigned long flags; - spin_lock_irqsave(&nw_gpio_lock, flags); + atomic_spin_lock_irqsave(&nw_gpio_lock, flags); nw_cpld_modify(-1, CPLD_UNMUTE | CPLD_7111_DISABLE); - spin_unlock_irqrestore(&nw_gpio_lock, flags); + atomic_spin_unlock_irqrestore(&nw_gpio_lock, flags); } static unsigned char rwa_unlock[] __initdata = @@ -616,9 +616,9 @@ static int __init nw_hw_init(void) cpld_init(); rwa010_init(); - spin_lock_irqsave(&nw_gpio_lock, flags); + atomic_spin_lock_irqsave(&nw_gpio_lock, flags); nw_gpio_modify_op(GPIO_RED_LED|GPIO_GREEN_LED, DEFAULT_LEDS); - spin_unlock_irqrestore(&nw_gpio_lock, flags); + atomic_spin_unlock_irqrestore(&nw_gpio_lock, flags); } return 0; } diff --git a/arch/arm/mach-footbridge/netwinder-leds.c b/arch/arm/mach-footbridge/netwinder-leds.c index 00269fe..642a443 100644 --- a/arch/arm/mach-footbridge/netwinder-leds.c +++ b/arch/arm/mach-footbridge/netwinder-leds.c @@ -31,13 +31,13 @@ static char led_state; static char hw_led_state; -static DEFINE_SPINLOCK(leds_lock); +static DEFINE_ATOMIC_SPINLOCK(leds_lock); static void netwinder_leds_event(led_event_t evt) { unsigned long flags; - spin_lock_irqsave(&leds_lock, flags); + atomic_spin_lock_irqsave(&leds_lock, flags); switch (evt) { case led_start: @@ -117,12 +117,12 @@ static void netwinder_leds_event(led_event_t evt) break; } - spin_unlock_irqrestore(&leds_lock, flags); + atomic_spin_unlock_irqrestore(&leds_lock, flags); if (led_state & LED_STATE_ENABLED) { - spin_lock_irqsave(&nw_gpio_lock, flags); + atomic_spin_lock_irqsave(&nw_gpio_lock, flags); nw_gpio_modify_op(GPIO_RED_LED | GPIO_GREEN_LED, hw_led_state); - spin_unlock_irqrestore(&nw_gpio_lock, flags); + 
atomic_spin_unlock_irqrestore(&nw_gpio_lock, flags); } } diff --git a/arch/arm/mach-integrator/core.c b/arch/arm/mach-integrator/core.c index a0f60e5..523b160 100644 --- a/arch/arm/mach-integrator/core.c +++ b/arch/arm/mach-integrator/core.c @@ -199,7 +199,7 @@ static struct amba_pl010_data integrator_uart_data = { #define CM_CTRL IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET -static DEFINE_SPINLOCK(cm_lock); +static DEFINE_ATOMIC_SPINLOCK(cm_lock); /** * cm_control - update the CM_CTRL register. @@ -211,10 +211,10 @@ void cm_control(u32 mask, u32 set) unsigned long flags; u32 val; - spin_lock_irqsave(&cm_lock, flags); + atomic_spin_lock_irqsave(&cm_lock, flags); val = readl(CM_CTRL) & ~mask; writel(val | set, CM_CTRL); - spin_unlock_irqrestore(&cm_lock, flags); + atomic_spin_unlock_irqrestore(&cm_lock, flags); } EXPORT_SYMBOL(cm_control); diff --git a/arch/arm/mach-integrator/pci_v3.c b/arch/arm/mach-integrator/pci_v3.c index f1d72b2..50dcb35 100644 --- a/arch/arm/mach-integrator/pci_v3.c +++ b/arch/arm/mach-integrator/pci_v3.c @@ -162,7 +162,7 @@ * 7:2 register number * */ -static DEFINE_SPINLOCK(v3_lock); +static DEFINE_ATOMIC_SPINLOCK(v3_lock); #define PCI_BUS_NONMEM_START 0x00000000 #define PCI_BUS_NONMEM_SIZE SZ_256M @@ -283,7 +283,7 @@ static int v3_read_config(struct pci_bus *bus, unsigned int devfn, int where, unsigned long flags; u32 v; - spin_lock_irqsave(&v3_lock, flags); + atomic_spin_lock_irqsave(&v3_lock, flags); addr = v3_open_config_window(bus, devfn, where); switch (size) { @@ -301,7 +301,7 @@ static int v3_read_config(struct pci_bus *bus, unsigned int devfn, int where, } v3_close_config_window(); - spin_unlock_irqrestore(&v3_lock, flags); + atomic_spin_unlock_irqrestore(&v3_lock, flags); *val = v; return PCIBIOS_SUCCESSFUL; @@ -313,7 +313,7 @@ static int v3_write_config(struct pci_bus *bus, unsigned int devfn, int where, unsigned long addr; unsigned long flags; - spin_lock_irqsave(&v3_lock, flags); + 
atomic_spin_lock_irqsave(&v3_lock, flags); addr = v3_open_config_window(bus, devfn, where); switch (size) { @@ -334,7 +334,7 @@ static int v3_write_config(struct pci_bus *bus, unsigned int devfn, int where, } v3_close_config_window(); - spin_unlock_irqrestore(&v3_lock, flags); + atomic_spin_unlock_irqrestore(&v3_lock, flags); return PCIBIOS_SUCCESSFUL; } @@ -509,7 +509,7 @@ void __init pci_v3_preinit(void) hook_fault_code(8, v3_pci_fault, SIGBUS, "external abort on non-linefetch"); hook_fault_code(10, v3_pci_fault, SIGBUS, "external abort on non-linefetch"); - spin_lock_irqsave(&v3_lock, flags); + atomic_spin_lock_irqsave(&v3_lock, flags); /* * Unlock V3 registers, but only if they were previously locked. @@ -582,7 +582,7 @@ void __init pci_v3_preinit(void) printk(KERN_ERR "PCI: unable to grab PCI error " "interrupt: %d\n", ret); - spin_unlock_irqrestore(&v3_lock, flags); + atomic_spin_unlock_irqrestore(&v3_lock, flags); } void __init pci_v3_postinit(void) diff --git a/arch/arm/mach-ixp2000/core.c b/arch/arm/mach-ixp2000/core.c index babb225..e24e3d0 100644 --- a/arch/arm/mach-ixp2000/core.c +++ b/arch/arm/mach-ixp2000/core.c @@ -197,7 +197,7 @@ unsigned long ixp2000_gettimeoffset (void) return offset / ticks_per_usec; } -static int ixp2000_timer_interrupt(int irq, void *dev_id) +static irqreturn_t ixp2000_timer_interrupt(int irq, void *dev_id) { /* clear timer 1 */ ixp2000_reg_wrb(IXP2000_T1_CLR, 1); diff --git a/arch/arm/mach-ixp4xx/common-pci.c b/arch/arm/mach-ixp4xx/common-pci.c index 70afcfe..ef50a20 100644 --- a/arch/arm/mach-ixp4xx/common-pci.c +++ b/arch/arm/mach-ixp4xx/common-pci.c @@ -54,7 +54,7 @@ unsigned long ixp4xx_pci_reg_base = 0; * these transactions are atomic or we will end up * with corrupt data on the bus or in a driver. 
*/ -static DEFINE_SPINLOCK(ixp4xx_pci_lock); +static DEFINE_ATOMIC_SPINLOCK(ixp4xx_pci_lock); /* * Read from PCI config space @@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(ixp4xx_pci_lock); static void crp_read(u32 ad_cbe, u32 *data) { unsigned long flags; - spin_lock_irqsave(&ixp4xx_pci_lock, flags); + atomic_spin_lock_irqsave(&ixp4xx_pci_lock, flags); *PCI_CRP_AD_CBE = ad_cbe; *data = *PCI_CRP_RDATA; - spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); + atomic_spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); } /* @@ -74,10 +74,10 @@ static void crp_read(u32 ad_cbe, u32 *data) static void crp_write(u32 ad_cbe, u32 data) { unsigned long flags; - spin_lock_irqsave(&ixp4xx_pci_lock, flags); + atomic_spin_lock_irqsave(&ixp4xx_pci_lock, flags); *PCI_CRP_AD_CBE = CRP_AD_CBE_WRITE | ad_cbe; *PCI_CRP_WDATA = data; - spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); + atomic_spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); } static inline int check_master_abort(void) @@ -101,7 +101,7 @@ int ixp4xx_pci_read_errata(u32 addr, u32 cmd, u32* data) int retval = 0; int i; - spin_lock_irqsave(&ixp4xx_pci_lock, flags); + atomic_spin_lock_irqsave(&ixp4xx_pci_lock, flags); *PCI_NP_AD = addr; @@ -118,7 +118,7 @@ int ixp4xx_pci_read_errata(u32 addr, u32 cmd, u32* data) if(check_master_abort()) retval = 1; - spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); + atomic_spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); return retval; } @@ -127,7 +127,7 @@ int ixp4xx_pci_read_no_errata(u32 addr, u32 cmd, u32* data) unsigned long flags; int retval = 0; - spin_lock_irqsave(&ixp4xx_pci_lock, flags); + atomic_spin_lock_irqsave(&ixp4xx_pci_lock, flags); *PCI_NP_AD = addr; @@ -140,7 +140,7 @@ int ixp4xx_pci_read_no_errata(u32 addr, u32 cmd, u32* data) if(check_master_abort()) retval = 1; - spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); + atomic_spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); return retval; } @@ -149,7 +149,7 @@ int ixp4xx_pci_write(u32 addr, u32 cmd, u32 data) unsigned long flags; int 
retval = 0; - spin_lock_irqsave(&ixp4xx_pci_lock, flags); + atomic_spin_lock_irqsave(&ixp4xx_pci_lock, flags); *PCI_NP_AD = addr; @@ -162,7 +162,7 @@ int ixp4xx_pci_write(u32 addr, u32 cmd, u32 data) if(check_master_abort()) retval = 1; - spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); + atomic_spin_unlock_irqrestore(&ixp4xx_pci_lock, flags); return retval; } diff --git a/arch/arm/mach-msm/proc_comm.c b/arch/arm/mach-msm/proc_comm.c index 915ee70..e825c36 100644 --- a/arch/arm/mach-msm/proc_comm.c +++ b/arch/arm/mach-msm/proc_comm.c @@ -14,6 +14,7 @@ * */ +#include <linux/cache.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/io.h> diff --git a/arch/arm/mach-ns9xxx/irq.c b/arch/arm/mach-ns9xxx/irq.c index feb0e54..6873b7b 100644 --- a/arch/arm/mach-ns9xxx/irq.c +++ b/arch/arm/mach-ns9xxx/irq.c @@ -66,7 +66,7 @@ static void handle_prio_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); BUG_ON(desc->status & IRQ_INPROGRESS); @@ -78,7 +78,7 @@ static void handle_prio_irq(unsigned int irq, struct irq_desc *desc) goto out_mask; desc->status |= IRQ_INPROGRESS; - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); @@ -87,7 +87,7 @@ static void handle_prio_irq(unsigned int irq, struct irq_desc *desc) * Maybe this function should go to kernel/irq/chip.c? 
*/ note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; if (desc->status & IRQ_DISABLED) @@ -97,7 +97,7 @@ out_mask: /* ack unconditionally to unmask lower prio irqs */ desc->chip->ack(irq); - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); } #define handle_irq handle_prio_irq #endif diff --git a/arch/arm/mach-sa1100/badge4.c b/arch/arm/mach-sa1100/badge4.c index ab5883b..0f0d555 100644 --- a/arch/arm/mach-sa1100/badge4.c +++ b/arch/arm/mach-sa1100/badge4.c @@ -240,15 +240,22 @@ void badge4_set_5V(unsigned subsystem, int on) /* detect on->off and off->on transitions */ if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { /* was off, now on */ - printk(KERN_INFO "%s: enabling 5V supply rail\n", __func__); GPSR = BADGE4_GPIO_PCMEN5V; } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { /* was on, now off */ - printk(KERN_INFO "%s: disabling 5V supply rail\n", __func__); GPCR = BADGE4_GPIO_PCMEN5V; } local_irq_restore(flags); + + /* report the transition only now that local_irq_restore() has run */ + if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { + /* was off, now on */ + printk(KERN_INFO "%s: enabling 5V supply rail\n", __func__); + } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { + /* was on, now off */ + printk(KERN_INFO "%s: disabling 5V supply rail\n", __func__); + } } EXPORT_SYMBOL(badge4_set_5V); diff --git a/arch/arm/mach-shark/leds.c b/arch/arm/mach-shark/leds.c index c9e32de..6ae3314 100644 --- a/arch/arm/mach-shark/leds.c +++ b/arch/arm/mach-shark/leds.c @@ -36,7 +36,7 @@ static char led_state; static short hw_led_state; static short saved_state; -static DEFINE_SPINLOCK(leds_lock); +static DEFINE_ATOMIC_SPINLOCK(leds_lock); short sequoia_read(int addr) { outw(addr,0x24); @@ -52,7 +52,7 @@ static void sequoia_leds_event(led_event_t evt) { unsigned long flags; - spin_lock_irqsave(&leds_lock, flags); + atomic_spin_lock_irqsave(&leds_lock, flags); hw_led_state = sequoia_read(0x09); @@ -144,7
+144,7 @@ static void sequoia_leds_event(led_event_t evt) if (led_state & LED_STATE_ENABLED) sequoia_write(hw_led_state,0x09); - spin_unlock_irqrestore(&leds_lock, flags); + atomic_spin_unlock_irqrestore(&leds_lock, flags); } static int __init leds_init(void) diff --git a/arch/arm/mach-w90x900/mfp-w90p910.c b/arch/arm/mach-w90x900/mfp-w90p910.c index a3520fe..92adadf 100644 --- a/arch/arm/mach-w90x900/mfp-w90p910.c +++ b/arch/arm/mach-w90x900/mfp-w90p910.c @@ -34,7 +34,7 @@ #define GPSELEI0 (0x01 << 26) #define GPSELEI1 (0x01 << 27) -static DECLARE_MUTEX(mfp_sem); +static DEFINE_MUTEX(mfp_sem); void mfp_set_groupf(struct device *dev) { @@ -43,7 +43,7 @@ void mfp_set_groupf(struct device *dev) BUG_ON(!dev); - down(&mfp_sem); + mutex_lock(&mfp_sem); dev_id = dev_name(dev); @@ -56,7 +56,7 @@ void mfp_set_groupf(struct device *dev) __raw_writel(mfpen, REG_MFSEL); - up(&mfp_sem); + mutex_unlock(&mfp_sem); } EXPORT_SYMBOL(mfp_set_groupf); @@ -67,7 +67,7 @@ void mfp_set_groupc(struct device *dev) BUG_ON(!dev); - down(&mfp_sem); + mutex_lock(&mfp_sem); dev_id = dev_name(dev); @@ -86,7 +86,7 @@ void mfp_set_groupc(struct device *dev) __raw_writel(mfpen, REG_MFSEL); - up(&mfp_sem); + mutex_unlock(&mfp_sem); } EXPORT_SYMBOL(mfp_set_groupc); @@ -97,7 +97,7 @@ void mfp_set_groupi(struct device *dev, int gpio) BUG_ON(!dev); - down(&mfp_sem); + mutex_lock(&mfp_sem); dev_id = dev_name(dev); @@ -110,7 +110,7 @@ void mfp_set_groupi(struct device *dev, int gpio) __raw_writel(mfpen, REG_MFSEL); - up(&mfp_sem); + mutex_unlock(&mfp_sem); } EXPORT_SYMBOL(mfp_set_groupi); diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c index b480f1d..ade5628 100644 --- a/arch/arm/mm/cache-l2x0.c +++ b/arch/arm/mm/cache-l2x0.c @@ -26,19 +26,19 @@ #define CACHE_LINE_SIZE 32 static void __iomem *l2x0_base; -static DEFINE_SPINLOCK(l2x0_lock); +static DEFINE_ATOMIC_SPINLOCK(l2x0_lock); static inline void sync_writel(unsigned long val, unsigned long reg, unsigned long complete_mask) { unsigned 
long flags; - spin_lock_irqsave(&l2x0_lock, flags); + atomic_spin_lock_irqsave(&l2x0_lock, flags); writel(val, l2x0_base + reg); /* wait for the operation to complete */ while (readl(l2x0_base + reg) & complete_mask) ; - spin_unlock_irqrestore(&l2x0_lock, flags); + atomic_spin_unlock_irqrestore(&l2x0_lock, flags); } static inline void cache_sync(void) diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c index fc84fcc..072b7e9 100644 --- a/arch/arm/mm/context.c +++ b/arch/arm/mm/context.c @@ -14,7 +14,7 @@ #include <asm/mmu_context.h> #include <asm/tlbflush.h> -static DEFINE_SPINLOCK(cpu_asid_lock); +static DEFINE_ATOMIC_SPINLOCK(cpu_asid_lock); unsigned int cpu_last_asid = ASID_FIRST_VERSION; /* @@ -32,7 +32,7 @@ void __new_context(struct mm_struct *mm) { unsigned int asid; - spin_lock(&cpu_asid_lock); + atomic_spin_lock(&cpu_asid_lock); asid = ++cpu_last_asid; if (asid == 0) asid = cpu_last_asid = ASID_FIRST_VERSION; @@ -57,7 +57,7 @@ void __new_context(struct mm_struct *mm) dsb(); } } - spin_unlock(&cpu_asid_lock); + atomic_spin_unlock(&cpu_asid_lock); mm->cpu_vm_mask = cpumask_of_cpu(smp_processor_id()); mm->context.id = asid; diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c index 7370a71..8b77925 100644 --- a/arch/arm/mm/copypage-v4mc.c +++ b/arch/arm/mm/copypage-v4mc.c @@ -30,7 +30,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_MT_MINICACHE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_ATOMIC_SPINLOCK(minicache_lock); /* * ARMv4 mini-dcache optimised copy_user_highpage @@ -76,14 +76,14 @@ void v4_mc_copy_user_highpage(struct page *to, struct page *from, if (test_and_clear_bit(PG_dcache_dirty, &from->flags)) __flush_dcache_page(page_mapping(from), from); - spin_lock(&minicache_lock); + atomic_spin_lock(&minicache_lock); set_pte_ext(TOP_PTE(0xffff8000), pfn_pte(page_to_pfn(from), minicache_pgprot), 0); flush_tlb_kernel_page(0xffff8000); mc_copy_user_page((void *)0xffff8000, kto); - 
spin_unlock(&minicache_lock); + atomic_spin_unlock(&minicache_lock); kunmap_atomic(kto, KM_USER1); } diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c index 4127a7b..7c81541 100644 --- a/arch/arm/mm/copypage-v6.c +++ b/arch/arm/mm/copypage-v6.c @@ -27,7 +27,7 @@ #define from_address (0xffff8000) #define to_address (0xffffc000) -static DEFINE_SPINLOCK(v6_lock); +static DEFINE_ATOMIC_SPINLOCK(v6_lock); /* * Copy the user page. No aliasing to deal with so we can just @@ -88,7 +88,7 @@ static void v6_copy_user_highpage_aliasing(struct page *to, * Now copy the page using the same cache colour as the * pages ultimate destination. */ - spin_lock(&v6_lock); + atomic_spin_lock(&v6_lock); set_pte_ext(TOP_PTE(from_address) + offset, pfn_pte(page_to_pfn(from), PAGE_KERNEL), 0); set_pte_ext(TOP_PTE(to_address) + offset, pfn_pte(page_to_pfn(to), PAGE_KERNEL), 0); @@ -101,7 +101,7 @@ static void v6_copy_user_highpage_aliasing(struct page *to, copy_page((void *)kto, (void *)kfrom); - spin_unlock(&v6_lock); + atomic_spin_unlock(&v6_lock); } /* @@ -121,13 +121,13 @@ static void v6_clear_user_highpage_aliasing(struct page *page, unsigned long vad * Now clear the page using the same cache colour as * the pages ultimate destination. 
*/ - spin_lock(&v6_lock); + atomic_spin_lock(&v6_lock); set_pte_ext(TOP_PTE(to_address) + offset, pfn_pte(page_to_pfn(page), PAGE_KERNEL), 0); flush_tlb_kernel_page(to); clear_page((void *)to); - spin_unlock(&v6_lock); + atomic_spin_unlock(&v6_lock); } struct cpu_user_fns v6_user_fns __initdata = { diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c index 76824d3..4320cbe 100644 --- a/arch/arm/mm/copypage-xscale.c +++ b/arch/arm/mm/copypage-xscale.c @@ -32,7 +32,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_MT_MINICACHE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_ATOMIC_SPINLOCK(minicache_lock); /* * XScale mini-dcache optimised copy_user_highpage @@ -98,14 +98,14 @@ void xscale_mc_copy_user_highpage(struct page *to, struct page *from, if (test_and_clear_bit(PG_dcache_dirty, &from->flags)) __flush_dcache_page(page_mapping(from), from); - spin_lock(&minicache_lock); + atomic_spin_lock(&minicache_lock); set_pte_ext(TOP_PTE(COPYPAGE_MINICACHE), pfn_pte(page_to_pfn(from), minicache_pgprot), 0); flush_tlb_kernel_page(COPYPAGE_MINICACHE); mc_copy_user_page((void *)COPYPAGE_MINICACHE, kto); - spin_unlock(&minicache_lock); + atomic_spin_unlock(&minicache_lock); kunmap_atomic(kto, KM_USER1); } diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 510c179..1576176 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -41,7 +41,7 @@ * These are the page tables (2MB each) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte[NUM_CONSISTENT_PTES]; -static DEFINE_SPINLOCK(consistent_lock); +static DEFINE_ATOMIC_SPINLOCK(consistent_lock); /* * VM region handling support. 
@@ -97,7 +97,7 @@ arm_vm_region_alloc(struct arm_vm_region *head, size_t size, gfp_t gfp) if (!new) goto out; - spin_lock_irqsave(&consistent_lock, flags); + atomic_spin_lock_irqsave(&consistent_lock, flags); list_for_each_entry(c, &head->vm_list, vm_list) { if ((addr + size) < addr) @@ -118,11 +118,11 @@ arm_vm_region_alloc(struct arm_vm_region *head, size_t size, gfp_t gfp) new->vm_end = addr + size; new->vm_active = 1; - spin_unlock_irqrestore(&consistent_lock, flags); + atomic_spin_unlock_irqrestore(&consistent_lock, flags); return new; nospc: - spin_unlock_irqrestore(&consistent_lock, flags); + atomic_spin_unlock_irqrestore(&consistent_lock, flags); kfree(new); out: return NULL; @@ -317,9 +317,9 @@ static int dma_mmap(struct device *dev, struct vm_area_struct *vma, user_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - spin_lock_irqsave(&consistent_lock, flags); + atomic_spin_lock_irqsave(&consistent_lock, flags); c = arm_vm_region_find(&consistent_head, (unsigned long)cpu_addr); - spin_unlock_irqrestore(&consistent_lock, flags); + atomic_spin_unlock_irqrestore(&consistent_lock, flags); if (c) { unsigned long off = vma->vm_pgoff; @@ -378,13 +378,13 @@ void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr size = PAGE_ALIGN(size); - spin_lock_irqsave(&consistent_lock, flags); + atomic_spin_lock_irqsave(&consistent_lock, flags); c = arm_vm_region_find(&consistent_head, (unsigned long)cpu_addr); if (!c) goto no_area; c->vm_active = 0; - spin_unlock_irqrestore(&consistent_lock, flags); + atomic_spin_unlock_irqrestore(&consistent_lock, flags); if ((c->vm_end - c->vm_start) != size) { printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", @@ -431,15 +431,15 @@ void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr flush_tlb_kernel_range(c->vm_start, c->vm_end); - spin_lock_irqsave(&consistent_lock, flags); + atomic_spin_lock_irqsave(&consistent_lock, flags); list_del(&c->vm_list); - 
spin_unlock_irqrestore(&consistent_lock, flags); + atomic_spin_unlock_irqrestore(&consistent_lock, flags); kfree(c); return; no_area: - spin_unlock_irqrestore(&consistent_lock, flags); + atomic_spin_unlock_irqrestore(&consistent_lock, flags); printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n", __func__, cpu_addr); dump_stack(); diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 6fdcbb7..02a07d8 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -258,7 +258,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; /* diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 4426ee6..50b51f2 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -29,8 +29,6 @@ #include "mm.h" -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - /* * empty_zero_page is a special page that is used for * zero-initialized data and COW. 
diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c index 3fcd752..695f8e2 100644 --- a/arch/arm/oprofile/common.c +++ b/arch/arm/oprofile/common.c @@ -48,9 +48,9 @@ static int op_arm_setup(void) { int ret; - spin_lock(&oprofilefs_lock); + atomic_spin_lock(&oprofilefs_lock); ret = op_arm_model->setup_ctrs(); - spin_unlock(&oprofilefs_lock); + atomic_spin_unlock(&oprofilefs_lock); return ret; } diff --git a/arch/arm/oprofile/op_model_mpcore.c b/arch/arm/oprofile/op_model_mpcore.c index 4ce0f98..55764a0 100644 --- a/arch/arm/oprofile/op_model_mpcore.c +++ b/arch/arm/oprofile/op_model_mpcore.c @@ -263,10 +263,10 @@ static void em_route_irq(int irq, unsigned int cpu) struct irq_desc *desc = irq_desc + irq; const struct cpumask *mask = cpumask_of(cpu); - spin_lock_irq(&desc->lock); + atomic_spin_lock_irq(&desc->lock); cpumask_copy(desc->affinity, mask); desc->chip->set_affinity(irq, mask); - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); } static int em_setup(void) diff --git a/arch/arm/oprofile/op_model_xscale.c b/arch/arm/oprofile/op_model_xscale.c index 724ab9c..cbe91ee 100644 --- a/arch/arm/oprofile/op_model_xscale.c +++ b/arch/arm/oprofile/op_model_xscale.c @@ -381,8 +381,9 @@ static int xscale_pmu_start(void) { int ret; u32 pmnc = read_pmnc(); + unsigned long irq_flags = IRQF_DISABLED | IRQF_NODELAY; - ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, IRQF_DISABLED, + ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, irq_flags, "XScale PMU", (void *)results); if (ret < 0) { diff --git a/arch/arm/plat-omap/clock.c b/arch/arm/plat-omap/clock.c index e8c327a..a4447ce 100644 --- a/arch/arm/plat-omap/clock.c +++ b/arch/arm/plat-omap/clock.c @@ -108,15 +108,12 @@ EXPORT_SYMBOL(clk_disable); unsigned long clk_get_rate(struct clk *clk) { - unsigned long flags; unsigned long ret = 0; if (clk == NULL || IS_ERR(clk)) return 0; - spin_lock_irqsave(&clockfw_lock, flags); ret = clk->rate; - spin_unlock_irqrestore(&clockfw_lock, 
flags); return ret; } diff --git a/arch/avr32/kernel/irq.c b/arch/avr32/kernel/irq.c index 9f57222..778f6ef 100644 --- a/arch/avr32/kernel/irq.c +++ b/arch/avr32/kernel/irq.c @@ -51,7 +51,7 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto unlock; @@ -66,7 +66,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); unlock: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } return 0; diff --git a/arch/blackfin/kernel/irqchip.c b/arch/blackfin/kernel/irqchip.c index 4b5fd36..d82bcb0 100644 --- a/arch/blackfin/kernel/irqchip.c +++ b/arch/blackfin/kernel/irqchip.c @@ -46,7 +46,7 @@ void ack_bad_irq(unsigned int irq) static struct irq_desc bad_irq_desc = { .handle_irq = handle_bad_irq, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), + .lock = __ATOMIC_SPIN_LOCK_UNLOCKED(irq_desc->lock), }; #ifdef CONFIG_CPUMASK_OFFSTACK @@ -62,7 +62,7 @@ int show_interrupts(struct seq_file *p, void *v) unsigned long flags; if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto skip; @@ -76,7 +76,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { seq_printf(p, "NMI: "); for_each_online_cpu(j) diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c index adb54aa..96b676a 100644 --- a/arch/blackfin/kernel/time.c +++ b/arch/blackfin/kernel/time.c @@ -128,7 +128,7 @@ irqreturn_t timer_interrupt(int irq, void *dummy) /* last time the cmos clock got updated */ static long last_rtc_update; - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); 
do_timer(1); /* @@ -148,7 +148,7 @@ irqreturn_t timer_interrupt(int irq, void *dummy) /* Do it again in 60s. */ last_rtc_update = xtime.tv_sec - 600; } - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); #ifdef CONFIG_IPIPE update_root_process_times(get_irq_regs()); @@ -192,12 +192,12 @@ void do_gettimeofday(struct timeval *tv) unsigned long usec, sec; do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); + seq = read_atomic_seqbegin_irqsave(&xtime_lock, flags); usec = gettimeoffset(); sec = xtime.tv_sec; usec += (xtime.tv_nsec / NSEC_PER_USEC); } - while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); + while (read_atomic_seqretry_irqrestore(&xtime_lock, seq, flags)); while (usec >= USEC_PER_SEC) { usec -= USEC_PER_SEC; @@ -217,7 +217,7 @@ int do_settimeofday(struct timespec *tv) if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irq(&xtime_lock); + write_atomic_seqlock_irq(&xtime_lock); /* * This is revolting. We need to set the xtime.tv_usec * correctly. 
However, the value in this location is @@ -235,7 +235,7 @@ int do_settimeofday(struct timespec *tv) ntp_clear(); - write_sequnlock_irq(&xtime_lock); + write_atomic_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; diff --git a/arch/cris/kernel/irq.c b/arch/cris/kernel/irq.c index 7f642fc..45a270e 100644 --- a/arch/cris/kernel/irq.c +++ b/arch/cris/kernel/irq.c @@ -57,7 +57,7 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto skip; @@ -76,7 +76,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } return 0; } diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c index 074fe7d..72408ed 100644 --- a/arch/cris/kernel/time.c +++ b/arch/cris/kernel/time.c @@ -87,7 +87,7 @@ int do_settimeofday(struct timespec *tv) if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irq(&xtime_lock); + write_atomic_seqlock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. 
However, the * value in this location is the value at the most recent update of @@ -103,7 +103,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); ntp_clear(); - write_sequnlock_irq(&xtime_lock); + write_atomic_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; } diff --git a/arch/frv/include/asm/highmem.h b/arch/frv/include/asm/highmem.h index 68e4677..5cb8dff 100644 --- a/arch/frv/include/asm/highmem.h +++ b/arch/frv/include/asm/highmem.h @@ -116,6 +116,7 @@ static inline void *kmap_atomic(struct page *page, enum km_type type) { unsigned long paddr; + preempt_disable(); pagefault_disable(); debug_kmap_atomic(type); paddr = page_to_phys(page); @@ -173,6 +174,7 @@ static inline void kunmap_atomic(void *kvaddr, enum km_type type) BUG(); } pagefault_enable(); + preempt_enable(); } #endif /* !__ASSEMBLY__ */ diff --git a/arch/frv/kernel/irq.c b/arch/frv/kernel/irq.c index af3e824..7b29f55 100644 --- a/arch/frv/kernel/irq.c +++ b/arch/frv/kernel/irq.c @@ -69,7 +69,7 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (action) { seq_printf(p, "%3d: ", i); @@ -85,7 +85,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { seq_printf(p, "Err: %10u\n", atomic_read(&irq_err_count)); } diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c index fb0ce75..fced1e3 100644 --- a/arch/frv/kernel/time.c +++ b/arch/frv/kernel/time.c @@ -70,7 +70,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy) * the irq version of write_lock because as just said we have irq * locally disabled. 
-arca */ - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); do_timer(1); @@ -96,7 +96,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy) __set_LEDS(n); #endif /* CONFIG_HEARTBEAT */ - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); update_process_times(user_mode(get_irq_regs())); diff --git a/arch/h8300/kernel/irq.c b/arch/h8300/kernel/irq.c index 74f8dd7..7dde350 100644 --- a/arch/h8300/kernel/irq.c +++ b/arch/h8300/kernel/irq.c @@ -191,7 +191,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_puts(p, " CPU0"); if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto unlock; @@ -205,7 +205,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, ", %s", action->name); seq_putc(p, '\n'); unlock: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } return 0; } diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c index 7f2d6cf..cb110a4 100644 --- a/arch/h8300/kernel/time.c +++ b/arch/h8300/kernel/time.c @@ -35,9 +35,9 @@ void h8300_timer_tick(void) { if (current->pid) profile_tick(CPU_PROFILING); - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); do_timer(1); - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); update_process_times(user_mode(get_irq_regs())); } diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index fbee74b..55d006d 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -33,7 +33,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct rw_anon_semaphore { signed long count; spinlock_t wait_lock; struct list_head wait_list; @@ -51,26 +51,47 @@ struct rw_semaphore { LIST_HEAD_INIT((name).wait_list) } #define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -extern struct rw_semaphore 
*rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - -static inline void -init_rwsem (struct rw_semaphore *sem) + struct rw_anon_semaphore name = __RWSEM_INITIALIZER(name) + +extern struct rw_anon_semaphore * +rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_wake(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *sem); + +static inline void init_anon_rwsem (struct rw_anon_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } +struct rw_anon_semaphore { + signed long count; + spinlock_t wait_lock; + struct list_head wait_list; +}; + +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ + LIST_HEAD_INIT((name).wait_list) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void init_rwsem(struct rw_semaphore *sem) +{ + init_anon_rwsem((struct rw_anon_semaphore *)sem); +} + /* * lock for reading */ static inline void -__down_read (struct rw_semaphore *sem) +__down_read (struct rw_anon_semaphore *sem) { long result = ia64_fetchadd8_acq((unsigned long *)&sem->count, 1); @@ -82,7 +103,7 @@ __down_read (struct rw_semaphore *sem) * lock for writing */ static inline void -__down_write (struct rw_semaphore *sem) +__down_write (struct rw_anon_semaphore *sem) { long old, new; @@ -99,7 +120,7 @@ __down_write (struct rw_semaphore *sem) * unlock after reading */ static inline void -__up_read (struct rw_semaphore *sem) +__up_read (struct rw_anon_semaphore *sem) { long result = ia64_fetchadd8_rel((unsigned long 
*)&sem->count, -1); @@ -111,7 +132,7 @@ __up_read (struct rw_semaphore *sem) * unlock after writing */ static inline void -__up_write (struct rw_semaphore *sem) +__up_write (struct rw_anon_semaphore *sem) { long old, new; @@ -128,7 +149,7 @@ __up_write (struct rw_semaphore *sem) * trylock for reading -- returns 1 if successful, 0 if contention */ static inline int -__down_read_trylock (struct rw_semaphore *sem) +__down_read_trylock (struct rw_anon_semaphore *sem) { long tmp; while ((tmp = sem->count) >= 0) { @@ -143,7 +164,7 @@ __down_read_trylock (struct rw_semaphore *sem) * trylock for writing -- returns 1 if successful, 0 if contention */ static inline int -__down_write_trylock (struct rw_semaphore *sem) +__down_write_trylock (struct rw_anon_semaphore *sem) { long tmp = cmpxchg_acq(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); @@ -154,7 +175,7 @@ __down_write_trylock (struct rw_semaphore *sem) * downgrade write lock to read lock */ static inline void -__downgrade_write (struct rw_semaphore *sem) +__downgrade_write (struct rw_anon_semaphore *sem) { long old, new; @@ -174,6 +195,11 @@ __downgrade_write (struct rw_semaphore *sem) #define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count)) #define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count)) +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c index 7d89512..26f5123 100644 --- a/arch/ia64/kernel/irq.c +++ b/arch/ia64/kernel/irq.c @@ -71,7 +71,7 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto skip; @@ -91,7 +91,7 @@ int show_interrupts(struct seq_file *p, void 
*v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); return 0; diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index dd9d7b5..70763a0 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -345,7 +345,7 @@ static irqreturn_t smp_irq_move_cleanup_interrupt(int irq, void *dev_id) desc = irq_desc + irq; cfg = irq_cfg + irq; - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); if (!cfg->move_cleanup_count) goto unlock; @@ -358,7 +358,7 @@ static irqreturn_t smp_irq_move_cleanup_interrupt(int irq, void *dev_id) spin_unlock_irqrestore(&vector_lock, flags); cfg->move_cleanup_count--; unlock: - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); } return IRQ_HANDLED; } diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index e6676fc..414a22e 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -643,7 +643,7 @@ salinfo_init(void) for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) { data = salinfo_data + i; data->type = i; - init_MUTEX(&data->mutex); + semaphore_init(&data->mutex); dir = proc_mkdir(salinfo_log_name[i], salinfo_dir); if (!dir) continue; diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 4990495..c6e8a37 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -197,10 +197,10 @@ timer_interrupt (int irq, void *dev_id) * another CPU. We need to avoid to SMP race by acquiring the * xtime_lock. 
*/ - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); do_timer(1); local_cpu_data->itm_next = new_itm; - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); } else local_cpu_data->itm_next = new_itm; @@ -477,7 +477,7 @@ void update_vsyscall(struct timespec *wall, struct clocksource *c) { unsigned long flags; - write_seqlock_irqsave(&fsyscall_gtod_data.lock, flags); + write_atomic_seqlock_irqsave(&fsyscall_gtod_data.lock, flags); /* copy fsyscall clock data */ fsyscall_gtod_data.clk_mask = c->mask; @@ -500,6 +500,6 @@ void update_vsyscall(struct timespec *wall, struct clocksource *c) fsyscall_gtod_data.monotonic_time.tv_sec++; } - write_sequnlock_irqrestore(&fsyscall_gtod_data.lock, flags); + write_atomic_sequnlock_irqrestore(&fsyscall_gtod_data.lock, flags); } diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c index fb83326..7ec3f56 100644 --- a/arch/ia64/xen/time.c +++ b/arch/ia64/xen/time.c @@ -141,10 +141,10 @@ consider_steal_time(unsigned long new_itm) delta_itm += local_cpu_data->itm_delta * (stolen + blocked); if (cpu == time_keeper_id) { - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); do_timer(stolen + blocked); local_cpu_data->itm_next = delta_itm + new_itm; - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); } else { local_cpu_data->itm_next = delta_itm + new_itm; } diff --git a/arch/m32r/kernel/irq.c b/arch/m32r/kernel/irq.c index 8dfd31e..351e82d 100644 --- a/arch/m32r/kernel/irq.c +++ b/arch/m32r/kernel/irq.c @@ -40,7 +40,7 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto skip; @@ -59,7 +59,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } return 0; } diff --git 
a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c index cada3ba..3e00242 100644 --- a/arch/m32r/kernel/time.c +++ b/arch/m32r/kernel/time.c @@ -106,7 +106,7 @@ void do_gettimeofday(struct timeval *tv) unsigned long max_ntp_tick = tick_usec - tickadj; do { - seq = read_seqbegin(&xtime_lock); + seq = read_atomic_seqbegin(&xtime_lock); usec = do_gettimeoffset(); @@ -120,7 +120,7 @@ void do_gettimeofday(struct timeval *tv) sec = xtime.tv_sec; usec += (xtime.tv_nsec / 1000); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_atomic_seqretry(&xtime_lock, seq)); while (usec >= 1000000) { usec -= 1000000; @@ -141,7 +141,7 @@ int do_settimeofday(struct timespec *tv) if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irq(&xtime_lock); + write_atomic_seqlock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. However, the * value in this location is the value at the most recent update of @@ -157,7 +157,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); ntp_clear(); - write_sequnlock_irq(&xtime_lock); + write_atomic_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; @@ -202,7 +202,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. */ - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - ((unsigned)TICK_SIZE) / 2 @@ -213,7 +213,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) else /* do it again in 60 s */ last_rtc_update = xtime.tv_sec - 600; } - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); /* As we return to user mode fire off the other CPU schedulers.. this is basically because we don't yet share IRQ's around. 
This message is rigged to be safe on the 386 - basically it's diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c index 54d9807..612259c 100644 --- a/arch/m68k/kernel/time.c +++ b/arch/m68k/kernel/time.c @@ -102,7 +102,7 @@ void do_gettimeofday(struct timeval *tv) unsigned long max_ntp_tick = tick_usec - tickadj; do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); + seq = read_atomic_seqbegin_irqsave(&xtime_lock, flags); usec = mach_gettimeoffset(); @@ -116,7 +116,7 @@ void do_gettimeofday(struct timeval *tv) sec = xtime.tv_sec; usec += xtime.tv_nsec/1000; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); + } while (read_atomic_seqretry_irqrestore(&xtime_lock, seq, flags)); while (usec >= 1000000) { @@ -138,7 +138,7 @@ int do_settimeofday(struct timespec *tv) if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irq(&xtime_lock); + write_atomic_seqlock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_nsec * correctly. However, the value in this location is * is value at the last tick. 
@@ -154,7 +154,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); ntp_clear(); - write_sequnlock_irq(&xtime_lock); + write_atomic_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; } diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c index d182b2f..d3c646d 100644 --- a/arch/m68knommu/kernel/time.c +++ b/arch/m68knommu/kernel/time.c @@ -44,11 +44,11 @@ irqreturn_t arch_timer_interrupt(int irq, void *dummy) if (current->pid) profile_tick(CPU_PROFILING); - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); do_timer(1); - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); diff --git a/arch/microblaze/kernel/irq.c b/arch/microblaze/kernel/irq.c index 7d5ddd6..cd7cdf5 100644 --- a/arch/microblaze/kernel/irq.c +++ b/arch/microblaze/kernel/irq.c @@ -68,7 +68,7 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < nr_irq) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto skip; @@ -89,7 +89,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } return 0; } diff --git a/arch/mips/include/asm/i8253.h b/arch/mips/include/asm/i8253.h index 032ca73..ed7899f 100644 --- a/arch/mips/include/asm/i8253.h +++ b/arch/mips/include/asm/i8253.h @@ -12,7 +12,7 @@ #define PIT_CH0 0x40 #define PIT_CH2 0x42 -extern spinlock_t i8253_lock; +extern atomic_spinlock_t i8253_lock; extern void setup_pit_timer(void); diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c index f7d8d5d..4ac943d 100644 --- a/arch/mips/kernel/i8253.c +++ b/arch/mips/kernel/i8253.c @@ -15,7 +15,7 @@ #include <asm/io.h> #include <asm/time.h> -DEFINE_SPINLOCK(i8253_lock); 
+DEFINE_ATOMIC_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); /* @@ -26,7 +26,7 @@ EXPORT_SYMBOL(i8253_lock); static void init_pit_timer(enum clock_event_mode mode, struct clock_event_device *evt) { - spin_lock(&i8253_lock); + atomic_spin_lock(&i8253_lock); switch(mode) { case CLOCK_EVT_MODE_PERIODIC: @@ -55,7 +55,7 @@ static void init_pit_timer(enum clock_event_mode mode, /* Nothing to do here */ break; } - spin_unlock(&i8253_lock); + atomic_spin_unlock(&i8253_lock); } /* @@ -65,10 +65,10 @@ static void init_pit_timer(enum clock_event_mode mode, */ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { - spin_lock(&i8253_lock); + atomic_spin_lock(&i8253_lock); outb_p(delta & 0xff , PIT_CH0); /* LSB */ outb(delta >> 8 , PIT_CH0); /* MSB */ - spin_unlock(&i8253_lock); + atomic_spin_unlock(&i8253_lock); return 0; } @@ -137,7 +137,7 @@ static cycle_t pit_read(struct clocksource *cs) static int old_count; static u32 old_jifs; - spin_lock_irqsave(&i8253_lock, flags); + atomic_spin_lock_irqsave(&i8253_lock, flags); /* * Although our caller may have the read side of xtime_lock, * this is now a seqlock, and we are cheating in this routine @@ -183,7 +183,7 @@ static cycle_t pit_read(struct clocksource *cs) old_count = count; old_jifs = jifs; - spin_unlock_irqrestore(&i8253_lock, flags); + atomic_spin_unlock_irqrestore(&i8253_lock, flags); count = (LATCH - 1) - count; diff --git a/arch/mips/kernel/irq.c b/arch/mips/kernel/irq.c index 7b845ba..50a7451 100644 --- a/arch/mips/kernel/irq.c +++ b/arch/mips/kernel/irq.c @@ -99,7 +99,7 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto skip; @@ -118,7 +118,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, 
flags); } else if (i == NR_IRQS) { seq_putc(p, '\n'); seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index f956ecb..87714f7 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -69,7 +69,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index e274fda..c0c038b 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -45,7 +45,7 @@ void *__kmap_atomic(struct page *page, enum km_type type) enum fixed_addresses idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -71,6 +71,7 @@ void __kunmap_atomic(void *kvaddr, enum km_type type) if (vaddr < FIXADDR_START) { // FIXME pagefault_enable(); + preempt_enable(); return; } @@ -85,6 +86,7 @@ void __kunmap_atomic(void *kvaddr, enum km_type type) #endif pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); @@ -97,6 +99,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); pagefault_disable(); debug_kmap_atomic(type); diff --git a/arch/mips/vr41xx/common/icu.c b/arch/mips/vr41xx/common/icu.c index 6d39e22..3da2ed2 100644 --- a/arch/mips/vr41xx/common/icu.c +++ b/arch/mips/vr41xx/common/icu.c @@ -159,9 +159,9 @@ void vr41xx_enable_piuint(uint16_t mask) if (current_cpu_type() == CPU_VR4111 || current_cpu_type() == CPU_VR4121) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu1_set(MPIUINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + 
atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -174,9 +174,9 @@ void vr41xx_disable_piuint(uint16_t mask) if (current_cpu_type() == CPU_VR4111 || current_cpu_type() == CPU_VR4121) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu1_clear(MPIUINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -189,9 +189,9 @@ void vr41xx_enable_aiuint(uint16_t mask) if (current_cpu_type() == CPU_VR4111 || current_cpu_type() == CPU_VR4121) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu1_set(MAIUINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -204,9 +204,9 @@ void vr41xx_disable_aiuint(uint16_t mask) if (current_cpu_type() == CPU_VR4111 || current_cpu_type() == CPU_VR4121) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu1_clear(MAIUINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -219,9 +219,9 @@ void vr41xx_enable_kiuint(uint16_t mask) if (current_cpu_type() == CPU_VR4111 || current_cpu_type() == CPU_VR4121) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu1_set(MKIUINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -234,9 +234,9 @@ void vr41xx_disable_kiuint(uint16_t mask) if (current_cpu_type() == CPU_VR4111 || current_cpu_type() == CPU_VR4121) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu1_clear(MKIUINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -247,9 +247,9 @@ void vr41xx_enable_macint(uint16_t mask) struct irq_desc *desc = irq_desc + ETHERNET_IRQ; unsigned long flags; - 
spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu1_set(MMACINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL(vr41xx_enable_macint); @@ -259,9 +259,9 @@ void vr41xx_disable_macint(uint16_t mask) struct irq_desc *desc = irq_desc + ETHERNET_IRQ; unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu1_clear(MMACINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL(vr41xx_disable_macint); @@ -271,9 +271,9 @@ void vr41xx_enable_dsiuint(uint16_t mask) struct irq_desc *desc = irq_desc + DSIU_IRQ; unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu1_set(MDSIUINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL(vr41xx_enable_dsiuint); @@ -283,9 +283,9 @@ void vr41xx_disable_dsiuint(uint16_t mask) struct irq_desc *desc = irq_desc + DSIU_IRQ; unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu1_clear(MDSIUINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL(vr41xx_disable_dsiuint); @@ -295,9 +295,9 @@ void vr41xx_enable_firint(uint16_t mask) struct irq_desc *desc = irq_desc + FIR_IRQ; unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu2_set(MFIRINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL(vr41xx_enable_firint); @@ -307,9 +307,9 @@ void vr41xx_disable_firint(uint16_t mask) struct irq_desc *desc = irq_desc + FIR_IRQ; unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + 
atomic_spin_lock_irqsave(&desc->lock, flags); icu2_clear(MFIRINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL(vr41xx_disable_firint); @@ -322,9 +322,9 @@ void vr41xx_enable_pciint(void) if (current_cpu_type() == CPU_VR4122 || current_cpu_type() == CPU_VR4131 || current_cpu_type() == CPU_VR4133) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu2_write(MPCIINTREG, PCIINT0); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -338,9 +338,9 @@ void vr41xx_disable_pciint(void) if (current_cpu_type() == CPU_VR4122 || current_cpu_type() == CPU_VR4131 || current_cpu_type() == CPU_VR4133) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu2_write(MPCIINTREG, 0); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -354,9 +354,9 @@ void vr41xx_enable_scuint(void) if (current_cpu_type() == CPU_VR4122 || current_cpu_type() == CPU_VR4131 || current_cpu_type() == CPU_VR4133) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu2_write(MSCUINTREG, SCUINT0); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -370,9 +370,9 @@ void vr41xx_disable_scuint(void) if (current_cpu_type() == CPU_VR4122 || current_cpu_type() == CPU_VR4131 || current_cpu_type() == CPU_VR4133) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu2_write(MSCUINTREG, 0); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -386,9 +386,9 @@ void vr41xx_enable_csiint(uint16_t mask) if (current_cpu_type() == CPU_VR4122 || current_cpu_type() == CPU_VR4131 || current_cpu_type() == CPU_VR4133) { - spin_lock_irqsave(&desc->lock, flags); + 
atomic_spin_lock_irqsave(&desc->lock, flags); icu2_set(MCSIINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -402,9 +402,9 @@ void vr41xx_disable_csiint(uint16_t mask) if (current_cpu_type() == CPU_VR4122 || current_cpu_type() == CPU_VR4131 || current_cpu_type() == CPU_VR4133) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu2_clear(MCSIINTREG, mask); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -418,9 +418,9 @@ void vr41xx_enable_bcuint(void) if (current_cpu_type() == CPU_VR4122 || current_cpu_type() == CPU_VR4131 || current_cpu_type() == CPU_VR4133) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu2_write(MBCUINTREG, BCUINTR); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -434,9 +434,9 @@ void vr41xx_disable_bcuint(void) if (current_cpu_type() == CPU_VR4122 || current_cpu_type() == CPU_VR4131 || current_cpu_type() == CPU_VR4133) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); icu2_write(MBCUINTREG, 0); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -486,7 +486,7 @@ static inline int set_sysint1_assign(unsigned int irq, unsigned char assign) pin = SYSINT1_IRQ_TO_PIN(irq); - spin_lock_irq(&desc->lock); + atomic_spin_lock_irq(&desc->lock); intassign0 = icu1_read(INTASSIGN0); intassign1 = icu1_read(INTASSIGN1); @@ -525,7 +525,7 @@ static inline int set_sysint1_assign(unsigned int irq, unsigned char assign) intassign1 |= (uint16_t)assign << 9; break; default: - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); return -EINVAL; } @@ -533,7 +533,7 @@ static inline int set_sysint1_assign(unsigned int irq, unsigned char assign) icu1_write(INTASSIGN0, intassign0); 
icu1_write(INTASSIGN1, intassign1); - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); return 0; } @@ -546,7 +546,7 @@ static inline int set_sysint2_assign(unsigned int irq, unsigned char assign) pin = SYSINT2_IRQ_TO_PIN(irq); - spin_lock_irq(&desc->lock); + atomic_spin_lock_irq(&desc->lock); intassign2 = icu1_read(INTASSIGN2); intassign3 = icu1_read(INTASSIGN3); @@ -593,7 +593,7 @@ static inline int set_sysint2_assign(unsigned int irq, unsigned char assign) intassign3 |= (uint16_t)assign << 12; break; default: - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); return -EINVAL; } @@ -601,7 +601,7 @@ static inline int set_sysint2_assign(unsigned int irq, unsigned char assign) icu1_write(INTASSIGN2, intassign2); icu1_write(INTASSIGN3, intassign3); - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); return 0; } diff --git a/arch/mn10300/kernel/irq.c b/arch/mn10300/kernel/irq.c index 4c3c58e..c076cff 100644 --- a/arch/mn10300/kernel/irq.c +++ b/arch/mn10300/kernel/irq.c @@ -215,7 +215,7 @@ int show_interrupts(struct seq_file *p, void *v) /* display information rows, one per active CPU */ case 1 ... 
NR_IRQS - 1: - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (action) { @@ -235,7 +235,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); break; /* polish off with NMI and error counters */ diff --git a/arch/mn10300/kernel/time.c b/arch/mn10300/kernel/time.c index 395caf0..b25588c 100644 --- a/arch/mn10300/kernel/time.c +++ b/arch/mn10300/kernel/time.c @@ -99,7 +99,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) { unsigned tsc, elapse; - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); while (tsc = get_cycles(), elapse = mn10300_last_tsc - tsc, /* time elapsed since last @@ -114,7 +114,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) check_rtc_time(); } - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); update_process_times(user_mode(get_irq_regs())); diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 330f536..5bbc62b 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -180,7 +180,7 @@ int show_interrupts(struct seq_file *p, void *v) if (i < NR_IRQS) { struct irqaction *action; - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto skip; @@ -224,7 +224,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } return 0; diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index a79c6f9..908cfde 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -163,9 +163,9 @@ irqreturn_t __irq_entry timer_interrupt(int irq, void *dev_id) } if (cpu == 0) { - write_seqlock(&xtime_lock); + 
write_atomic_seqlock(&xtime_lock); do_timer(ticks_elapsed); - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); } return IRQ_HANDLED; @@ -268,12 +268,12 @@ void __init time_init(void) if (pdc_tod_read(&tod_data) == 0) { unsigned long flags; - write_seqlock_irqsave(&xtime_lock, flags); + write_atomic_seqlock_irqsave(&xtime_lock, flags); xtime.tv_sec = tod_data.tod_sec; xtime.tv_nsec = tod_data.tod_usec * 1000; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_atomic_sequnlock_irqrestore(&xtime_lock, flags); } else { printk(KERN_ERR "Error reading tod clock\n"); xtime.tv_sec = 0; diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d00131c..0b46b68 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -68,13 +68,6 @@ config LOCKDEP_SUPPORT bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config GENERIC_LOCKBREAK bool default y @@ -252,6 +245,14 @@ config HIGHMEM source kernel/time/Kconfig source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "fs/Kconfig.binfmt" config HUGETLB_PAGE_SIZE_VARIABLE diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h index a002682..582e47d 100644 --- a/arch/powerpc/include/asm/mpic.h +++ b/arch/powerpc/include/asm/mpic.h @@ -289,7 +289,7 @@ struct mpic #ifdef CONFIG_MPIC_U3_HT_IRQS /* The fixup table */ struct mpic_irq_fixup *fixups; - spinlock_t fixup_lock; + atomic_spinlock_t fixup_lock; #endif /* Register access method */ diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 8cd083c..a1ddb07 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -202,8 +202,15 @@ static inline unsigned long pte_update(struct mm_struct *mm, 
assert_pte_locked(mm, addr); #ifdef CONFIG_PPC_STD_MMU_64 - if (old & _PAGE_HASHPTE) + if (old & _PAGE_HASHPTE) { +#ifdef CONFIG_PREEMPT_RT + preempt_disable(); +#endif hpte_need_flush(mm, addr, ptep, old, huge); +#ifdef CONFIG_PREEMPT_RT + preempt_enable(); +#endif + } #endif return old; diff --git a/arch/powerpc/include/asm/pmac_feature.h b/arch/powerpc/include/asm/pmac_feature.h index 877c35a..ba11723 100644 --- a/arch/powerpc/include/asm/pmac_feature.h +++ b/arch/powerpc/include/asm/pmac_feature.h @@ -378,7 +378,7 @@ extern struct macio_chip* macio_find(struct device_node* child, int type); * Those are exported by pmac feature for internal use by arch code * only like the platform function callbacks, do not use directly in drivers */ -extern spinlock_t feature_lock; +extern atomic_spinlock_t feature_lock; extern struct device_node *uninorth_node; extern u32 __iomem *uninorth_base; diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h index 24cd928..c2494d4 100644 --- a/arch/powerpc/include/asm/rwsem.h +++ b/arch/powerpc/include/asm/rwsem.h @@ -21,7 +21,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct rw_anon_semaphore { /* XXX this should be able to be an atomic_t -- paulus */ signed int count; #define RWSEM_UNLOCKED_VALUE 0x00000000 @@ -38,43 +38,47 @@ struct rw_semaphore { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } #else -# define __RWSEM_DEP_MAP_INIT(lockname) +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ +#define __RWSEM_ANON_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } + LIST_HEAD_INIT((name).wait_list) __RWSEM_ANON_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = 
__RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_wake(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *sem); -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, struct lock_class_key *key); -#define init_rwsem(sem) \ +#define init_anon_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __init_anon_rwsem((sem), #sem, &__key); \ } while (0) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct rw_anon_semaphore *sem) { if (unlikely(atomic_inc_return((atomic_t *)(&sem->count)) <= 0)) rwsem_down_read_failed(sem); } -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct rw_anon_semaphore *sem) { int tmp; @@ -90,7 +94,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void __down_write_nested(struct rw_anon_semaphore *sem, int subclass) { int tmp; @@ -100,12 +104,12 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_down_write_failed(sem); } -static inline void 
__down_write(struct rw_semaphore *sem) +static inline void __down_write(struct rw_anon_semaphore *sem) { __down_write_nested(sem, 0); } -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct rw_anon_semaphore *sem) { int tmp; @@ -117,7 +121,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_anon_semaphore *sem) { int tmp; @@ -129,7 +133,7 @@ static inline void __up_read(struct rw_semaphore *sem) /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct rw_anon_semaphore *sem) { if (unlikely(atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, (atomic_t *)(&sem->count)) < 0)) @@ -139,7 +143,7 @@ static inline void __up_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct rw_anon_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } @@ -147,7 +151,7 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct rw_anon_semaphore *sem) { int tmp; @@ -159,15 +163,59 @@ static inline void __downgrade_write(struct rw_semaphore *sem) /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct rw_anon_semaphore *sem) { return atomic_add_return(delta, (atomic_t *)(&sem->count)); } +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + +#ifndef CONFIG_PREEMPT_RT + +struct rw_semaphore { + /* XXX this should be able to be an 
atomic_t -- paulus */ + signed int count; + spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ + LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} + +#define init_rwsem(sem) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ + } while (0) + static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); } +#endif #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_RWSEM_H */ diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index e20ff75..3ddc8f6 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -30,26 +30,38 @@ struct mmu_gather; #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) +#define HAVE_ARCH_MMU_GATHER 1 + +struct pte_freelist_batch; + +struct arch_mmu_gather { + struct pte_freelist_batch *batch; +}; + +#define ARCH_MMU_GATHER_INIT (struct arch_mmu_gather){ .batch = NULL, } + #if !defined(CONFIG_PPC_STD_MMU) #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) #elif defined(__powerpc64__) -extern void pte_free_finish(void); +extern void pte_free_finish(struct mmu_gather *tlb); static inline void tlb_flush(struct mmu_gather *tlb) { - struct ppc64_tlb_batch *tlbbatch = &__get_cpu_var(ppc64_tlb_batch); + struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); /* If there's a 
TLB batch pending, then we must flush it because the * pages are going to be freed and we really don't want to have a CPU * access a freed page because it has a stale TLB */ - if (tlbbatch->index) + if (tlbbatch->index) { __flush_tlb_pending(tlbbatch); + } - pte_free_finish(); + put_cpu_var(ppc64_tlb_batch); + pte_free_finish(tlb); } #else diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index abbe341..3f67596 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -101,18 +101,25 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, static inline void arch_enter_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); batch->active = 1; + + put_cpu_var(ppc64_tlb_batch); } static inline void arch_leave_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); + + if (batch->active) { + if (batch->index) { + __flush_tlb_pending(batch); + } + batch->active = 0; + } - if (batch->index) - __flush_tlb_pending(batch); - batch->active = 0; + put_cpu_var(ppc64_tlb_batch); } #define arch_flush_lazy_mmu_mode() do {} while (0) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 43e0734..4bb9ce4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -625,44 +625,52 @@ do_work: bne restore /* here we are preempting the current task */ 1: + /* + * preempt_schedule_irq() expects interrupts disabled and returns + * with interrupts disabled. No need to check preemption again, + * preempt_schedule_irq just did that for us. 
+ */ + bl .preempt_schedule_irq #ifdef CONFIG_TRACE_IRQFLAGS bl .trace_hardirqs_on +#endif /* CONFIG_TRACE_IRQFLAGS */ + /* Note: we just clobbered r10 which used to contain the previous * MSR before the hard-disabling done by the caller of do_work. * We don't have that value anymore, but it doesn't matter as * we will hard-enable unconditionally, we can just reload the * current MSR into r10 */ + bl .preempt_schedule_irq mfmsr r10 -#endif /* CONFIG_TRACE_IRQFLAGS */ - li r0,1 - stb r0,PACASOFTIRQEN(r13) - stb r0,PACAHARDIRQEN(r13) - ori r10,r10,MSR_EE - mtmsrd r10,1 /* reenable interrupts */ - bl .preempt_schedule - mfmsr r10 - clrrdi r9,r1,THREAD_SHIFT - rldicl r10,r10,48,1 /* disable interrupts again */ - rotldi r10,r10,16 - mtmsrd r10,1 - ld r4,TI_FLAGS(r9) - andi. r0,r4,_TIF_NEED_RESCHED - bne 1b + clrrdi r9,r1,THREAD_SHIFT + rldicl r10,r10,48,1 /* disable interrupts again */ + rotldi r10,r10,16 + mtmsrd r10,1 + ld r4,TI_FLAGS(r9) + andi. r0,r4,(_TIF_NEED_RESCHED) + bne 1b b restore user_work: #endif - /* Enable interrupts */ - ori r10,r10,MSR_EE - mtmsrd r10,1 - andi. r0,r4,_TIF_NEED_RESCHED beq 1f - bl .schedule + + /* preempt_schedule_irq() expects interrupts disabled. 
*/ + bl .preempt_schedule_irq b .ret_from_except_lite -1: bl .save_nvgprs + /* here we are preempting the current task */ +1: li r0,1 + stb r0,PACASOFTIRQEN(r13) + stb r0,PACAHARDIRQEN(r13) + + /* Enable interrupts */ + ori r10,r10,MSR_EE + mtmsrd r10,1 + + bl .save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl .do_signal b .ret_from_except diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 88d9c1d..1a82d48 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -96,9 +96,11 @@ void cpu_idle(void) tick_nohz_restart_sched_tick(); if (cpu_should_die()) cpu_die(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index f7f376e..9f56521 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -191,7 +191,7 @@ int show_interrupts(struct seq_file *p, void *v) if (i < NR_IRQS) { desc = get_irq_desc(i); - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); action = desc->action; if (!action || !action->handler) goto skip; @@ -212,7 +212,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, ", %s", action->name); seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } else if (i == NR_IRQS) { #if defined(CONFIG_PPC32) && defined(CONFIG_TAU_INT) if (tau_initialized){ @@ -453,7 +453,7 @@ void do_softirq(void) */ static LIST_HEAD(irq_hosts); -static DEFINE_SPINLOCK(irq_big_lock); +static DEFINE_ATOMIC_SPINLOCK(irq_big_lock); static unsigned int revmap_trees_allocated; static DEFINE_MUTEX(revmap_trees_mutex); struct irq_map_entry irq_map[NR_IRQS]; @@ -499,14 +499,14 @@ struct irq_host *irq_alloc_host(struct device_node *of_node, if (host->ops->match == NULL) host->ops->match = default_irq_host_match; - 
spin_lock_irqsave(&irq_big_lock, flags); + atomic_spin_lock_irqsave(&irq_big_lock, flags); /* If it's a legacy controller, check for duplicates and * mark it as allocated (we use irq 0 host pointer for that */ if (revmap_type == IRQ_HOST_MAP_LEGACY) { if (irq_map[0].host != NULL) { - spin_unlock_irqrestore(&irq_big_lock, flags); + atomic_spin_unlock_irqrestore(&irq_big_lock, flags); /* If we are early boot, we can't free the structure, * too bad... * this will be fixed once slab is made available early @@ -520,7 +520,7 @@ struct irq_host *irq_alloc_host(struct device_node *of_node, } list_add(&host->link, &irq_hosts); - spin_unlock_irqrestore(&irq_big_lock, flags); + atomic_spin_unlock_irqrestore(&irq_big_lock, flags); /* Additional setups per revmap type */ switch(revmap_type) { @@ -571,13 +571,13 @@ struct irq_host *irq_find_host(struct device_node *node) * the absence of a device node. This isn't a problem so far * yet though... */ - spin_lock_irqsave(&irq_big_lock, flags); + atomic_spin_lock_irqsave(&irq_big_lock, flags); list_for_each_entry(h, &irq_hosts, link) if (h->ops->match(h, node)) { found = h; break; } - spin_unlock_irqrestore(&irq_big_lock, flags); + atomic_spin_unlock_irqrestore(&irq_big_lock, flags); return found; } EXPORT_SYMBOL_GPL(irq_find_host); @@ -935,7 +935,7 @@ unsigned int irq_alloc_virt(struct irq_host *host, if (count == 0 || count > (irq_virq_count - NUM_ISA_INTERRUPTS)) return NO_IRQ; - spin_lock_irqsave(&irq_big_lock, flags); + atomic_spin_lock_irqsave(&irq_big_lock, flags); /* Use hint for 1 interrupt if any */ if (count == 1 && hint >= NUM_ISA_INTERRUPTS && @@ -959,7 +959,7 @@ unsigned int irq_alloc_virt(struct irq_host *host, } } if (found == NO_IRQ) { - spin_unlock_irqrestore(&irq_big_lock, flags); + atomic_spin_unlock_irqrestore(&irq_big_lock, flags); return NO_IRQ; } hint_found: @@ -968,7 +968,7 @@ unsigned int irq_alloc_virt(struct irq_host *host, smp_wmb(); irq_map[i].host = host; } - spin_unlock_irqrestore(&irq_big_lock, 
flags); + atomic_spin_unlock_irqrestore(&irq_big_lock, flags); return found; } @@ -980,7 +980,7 @@ void irq_free_virt(unsigned int virq, unsigned int count) WARN_ON (virq < NUM_ISA_INTERRUPTS); WARN_ON (count == 0 || (virq + count) > irq_virq_count); - spin_lock_irqsave(&irq_big_lock, flags); + atomic_spin_lock_irqsave(&irq_big_lock, flags); for (i = virq; i < (virq + count); i++) { struct irq_host *host; @@ -993,7 +993,7 @@ void irq_free_virt(unsigned int virq, unsigned int count) smp_wmb(); irq_map[i].host = NULL; } - spin_unlock_irqrestore(&irq_big_lock, flags); + atomic_spin_unlock_irqrestore(&irq_big_lock, flags); } void irq_early_init(void) @@ -1065,7 +1065,7 @@ static int virq_debug_show(struct seq_file *m, void *private) for (i = 1; i < NR_IRQS; i++) { desc = get_irq_desc(i); - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); if (desc->action && desc->action->handler) { seq_printf(m, "%5d ", i); @@ -1084,7 +1084,7 @@ static int virq_debug_show(struct seq_file *m, void *private) seq_printf(m, "%s\n", p); } - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } return 0; diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index c932978..dcbf960 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -263,7 +263,7 @@ ss_probe: kcb->kprobe_status = KPROBE_HIT_SSDONE; reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); return 1; } else if (ret < 0) { /* @@ -282,7 +282,7 @@ ss_probe: return 1; no_kprobe: - preempt_enable_no_resched(); + preempt_enable(); return ret; } @@ -412,7 +412,7 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, msr diff --git a/arch/powerpc/kernel/pmc.c b/arch/powerpc/kernel/pmc.c index 0516e2d..e38729a 100644 --- 
a/arch/powerpc/kernel/pmc.c +++ b/arch/powerpc/kernel/pmc.c @@ -37,7 +37,7 @@ static void dummy_perf(struct pt_regs *regs) } -static DEFINE_SPINLOCK(pmc_owner_lock); +static DEFINE_ATOMIC_SPINLOCK(pmc_owner_lock); static void *pmc_owner_caller; /* mostly for debugging */ perf_irq_t perf_irq = dummy_perf; @@ -45,7 +45,7 @@ int reserve_pmc_hardware(perf_irq_t new_perf_irq) { int err = 0; - spin_lock(&pmc_owner_lock); + atomic_spin_lock(&pmc_owner_lock); if (pmc_owner_caller) { printk(KERN_WARNING "reserve_pmc_hardware: " @@ -59,21 +59,21 @@ int reserve_pmc_hardware(perf_irq_t new_perf_irq) perf_irq = new_perf_irq ? new_perf_irq : dummy_perf; out: - spin_unlock(&pmc_owner_lock); + atomic_spin_unlock(&pmc_owner_lock); return err; } EXPORT_SYMBOL_GPL(reserve_pmc_hardware); void release_pmc_hardware(void) { - spin_lock(&pmc_owner_lock); + atomic_spin_lock(&pmc_owner_lock); WARN_ON(! pmc_owner_caller); pmc_owner_caller = NULL; perf_irq = dummy_perf; - spin_unlock(&pmc_owner_lock); + atomic_spin_unlock(&pmc_owner_lock); } EXPORT_SYMBOL_GPL(release_pmc_hardware); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 892a9f2..ef9d506 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -305,6 +305,10 @@ struct task_struct *__switch_to(struct task_struct *prev, struct thread_struct *new_thread, *old_thread; unsigned long flags; struct task_struct *last; +#if defined(CONFIG_PPC64) && defined (CONFIG_PREEMPT_RT) + struct ppc64_tlb_batch *batch; + int hadbatch = 0; /* stays 0 unless batch->active was set at switch-out; read after the switch to re-arm the batch */ +#endif #ifdef CONFIG_SMP /* avoid complexity of lazy save/restore of fpu @@ -396,6 +400,17 @@ struct task_struct *__switch_to(struct task_struct *prev, old_thread->accum_tb += (current_tb - start_tb); new_thread->start_tb = current_tb; } + +#ifdef CONFIG_PREEMPT_RT + batch = &__get_cpu_var(ppc64_tlb_batch); + if (batch->active) { + hadbatch = 1; + if (batch->index) { + __flush_tlb_pending(batch); + } + batch->active = 0; + } +#endif /* #ifdef CONFIG_PREEMPT_RT */ 
#endif local_irq_save(flags); @@ -414,6 +429,13 @@ struct task_struct *__switch_to(struct task_struct *prev, local_irq_restore(flags); +#if defined(CONFIG_PPC64) && defined(CONFIG_PREEMPT_RT) + if (hadbatch) { + batch = &__get_cpu_var(ppc64_tlb_batch); + batch->active = 1; + } +#endif + return last; } diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index d4405b9..de2295b 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -81,7 +81,7 @@ struct boot_param_header *initial_boot_params; extern struct device_node *allnodes; /* temporary while merging */ -extern rwlock_t devtree_lock; /* temporary while merging */ +extern atomic_spinlock_t devtree_lock; /* temporary while merging */ /* export that to outside world */ struct device_node *of_chosen; @@ -1275,12 +1275,12 @@ struct device_node *of_find_node_by_phandle(phandle handle) { struct device_node *np; - read_lock(&devtree_lock); + atomic_spin_lock(&devtree_lock); for (np = allnodes; np != 0; np = np->allnext) if (np->linux_phandle == handle) break; of_node_get(np); - read_unlock(&devtree_lock); + atomic_spin_unlock(&devtree_lock); return np; } EXPORT_SYMBOL(of_find_node_by_phandle); @@ -1328,13 +1328,13 @@ struct device_node *of_find_all_nodes(struct device_node *prev) { struct device_node *np; - read_lock(&devtree_lock); + atomic_spin_lock(&devtree_lock); np = prev ? 
prev->allnext : allnodes; for (; np != 0; np = np->allnext) if (of_node_get(np)) break; of_node_put(prev); - read_unlock(&devtree_lock); + atomic_spin_unlock(&devtree_lock); return np; } EXPORT_SYMBOL(of_find_all_nodes); @@ -1419,12 +1419,12 @@ void of_attach_node(struct device_node *np) { unsigned long flags; - write_lock_irqsave(&devtree_lock, flags); + atomic_spin_lock_irqsave(&devtree_lock, flags); np->sibling = np->parent->child; np->allnext = allnodes; np->parent->child = np; allnodes = np; - write_unlock_irqrestore(&devtree_lock, flags); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); } /* @@ -1437,7 +1437,7 @@ void of_detach_node(struct device_node *np) struct device_node *parent; unsigned long flags; - write_lock_irqsave(&devtree_lock, flags); + atomic_spin_lock_irqsave(&devtree_lock, flags); parent = np->parent; if (!parent) @@ -1468,7 +1468,7 @@ void of_detach_node(struct device_node *np) of_node_set_flag(np, OF_DETACHED); out_unlock: - write_unlock_irqrestore(&devtree_lock, flags); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); } #ifdef CONFIG_PPC_PSERIES @@ -1552,18 +1552,18 @@ int prom_add_property(struct device_node* np, struct property* prop) unsigned long flags; prop->next = NULL; - write_lock_irqsave(&devtree_lock, flags); + atomic_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (strcmp(prop->name, (*next)->name) == 0) { /* duplicate ! 
don't insert it */ - write_unlock_irqrestore(&devtree_lock, flags); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); return -1; } next = &(*next)->next; } *next = prop; - write_unlock_irqrestore(&devtree_lock, flags); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); #ifdef CONFIG_PROC_DEVICETREE /* try to add to proc as well if it was initialized */ @@ -1586,7 +1586,7 @@ int prom_remove_property(struct device_node *np, struct property *prop) unsigned long flags; int found = 0; - write_lock_irqsave(&devtree_lock, flags); + atomic_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (*next == prop) { @@ -1599,7 +1599,7 @@ int prom_remove_property(struct device_node *np, struct property *prop) } next = &(*next)->next; } - write_unlock_irqrestore(&devtree_lock, flags); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); if (!found) return -ENODEV; @@ -1628,7 +1628,7 @@ int prom_update_property(struct device_node *np, unsigned long flags; int found = 0; - write_lock_irqsave(&devtree_lock, flags); + atomic_spin_lock_irqsave(&devtree_lock, flags); next = &np->properties; while (*next) { if (*next == oldprop) { @@ -1642,7 +1642,7 @@ int prom_update_property(struct device_node *np, } next = &(*next)->next; } - write_unlock_irqrestore(&devtree_lock, flags); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); if (!found) return -ENODEV; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index eae4511..15d4291 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -1000,7 +1000,7 @@ void __init time_init(void) /* Save the current timebase to pretty up CONFIG_PRINTK_TIME */ boot_tb = get_tb_or_rtc(); - write_seqlock_irqsave(&xtime_lock, flags); + write_atomic_seqlock_irqsave(&xtime_lock, flags); /* If platform provided a timezone (pmac), we correct the time */ if (timezone_offset) { @@ -1014,7 +1014,7 @@ void __init time_init(void) vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; 
vdso_data->tb_to_xs = tb_to_xs; - write_sequnlock_irqrestore(&xtime_lock, flags); + write_atomic_sequnlock_irqrestore(&xtime_lock, flags); /* Register the clocksource, if we're not running on iSeries */ if (!firmware_has_feature(FW_FEATURE_ISERIES)) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 6f0ae1a..451a756 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -102,11 +102,11 @@ static inline void pmac_backlight_unblank(void) { } int die(const char *str, struct pt_regs *regs, long err) { static struct { - spinlock_t lock; + atomic_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = __SPIN_LOCK_UNLOCKED(die.lock), + .lock = __ATOMIC_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -120,7 +120,7 @@ int die(const char *str, struct pt_regs *regs, long err) if (die.lock_owner != raw_smp_processor_id()) { console_verbose(); - spin_lock_irqsave(&die.lock, flags); + atomic_spin_lock_irqsave(&die.lock, flags); die.lock_owner = smp_processor_id(); die.lock_owner_depth = 0; bust_spinlocks(1); @@ -155,7 +155,7 @@ int die(const char *str, struct pt_regs *regs, long err) bust_spinlocks(0); die.lock_owner = -1; add_taint(TAINT_DIE); - spin_unlock_irqrestore(&die.lock, flags); + atomic_spin_unlock_irqrestore(&die.lock, flags); if (kexec_should_crash(current) || kexec_sr_activated(smp_processor_id())) @@ -193,6 +193,11 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) addr, regs->nip, regs->link, code); } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c index 79d0fa3..106d6a5 100644 --- a/arch/powerpc/lib/locks.c +++ b/arch/powerpc/lib/locks.c @@ -86,8 +86,10 @@ void __raw_spin_unlock_wait(raw_spinlock_t *lock) { while (lock->slock) { HMT_low(); + preempt_disable(); if 
(SHARED_PROCESSOR) __spin_yield(lock); + preempt_enable(); } HMT_medium(); } diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 830bef0..03c4343 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -159,7 +159,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, } #endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/ - if (in_atomic() || mm == NULL) { + if (in_atomic() || mm == NULL || current->pagefault_disabled) { if (!user_mode(regs)) return SIGSEGV; /* in_atomic() in user mode is really bad, diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 056d23a..a99d114 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -37,7 +37,7 @@ #define HPTE_LOCK_BIT 3 -static DEFINE_SPINLOCK(native_tlbie_lock); +static DEFINE_ATOMIC_SPINLOCK(native_tlbie_lock); static inline void __tlbie(unsigned long va, int psize, int ssize) { @@ -104,7 +104,7 @@ static inline void tlbie(unsigned long va, int psize, int ssize, int local) if (use_local) use_local = mmu_psize_defs[psize].tlbiel; if (lock_tlbie && !use_local) - spin_lock(&native_tlbie_lock); + atomic_spin_lock(&native_tlbie_lock); asm volatile("ptesync": : :"memory"); if (use_local) { __tlbiel(va, psize, ssize); @@ -114,7 +114,7 @@ static inline void tlbie(unsigned long va, int psize, int ssize, int local) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } if (lock_tlbie && !use_local) - spin_unlock(&native_tlbie_lock); + atomic_spin_unlock(&native_tlbie_lock); } static inline void native_lock_hpte(struct hash_pte *hptep) @@ -434,7 +434,7 @@ static void native_hpte_clear(void) /* we take the tlbie lock and hold it. Some hardware will * deadlock if we try to tlbie from two processors at once. 
*/ - spin_lock(&native_tlbie_lock); + atomic_spin_lock(&native_tlbie_lock); slots = pteg_count * HPTES_PER_GROUP; @@ -458,7 +458,7 @@ static void native_hpte_clear(void) } asm volatile("eieio; tlbsync; ptesync":::"memory"); - spin_unlock(&native_tlbie_lock); + atomic_spin_unlock(&native_tlbie_lock); local_irq_restore(flags); } @@ -521,7 +521,7 @@ static void native_flush_hash_range(unsigned long number, int local) int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); if (lock_tlbie) - spin_lock(&native_tlbie_lock); + atomic_spin_lock(&native_tlbie_lock); asm volatile("ptesync":::"memory"); for (i = 0; i < number; i++) { @@ -536,7 +536,7 @@ static void native_flush_hash_range(unsigned long number, int local) asm volatile("eieio; tlbsync; ptesync":::"memory"); if (lock_tlbie) - spin_unlock(&native_tlbie_lock); + atomic_spin_unlock(&native_tlbie_lock); } local_irq_restore(flags); diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index c2186c7..81310e2 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -35,6 +35,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) unsigned long vaddr; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -73,5 +74,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type) local_flush_tlb_page(NULL, vaddr); #endif pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(kunmap_atomic); diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 3de6a0d..3ef5084 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -54,8 +54,6 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - phys_addr_t total_memory; phys_addr_t total_lowmem; diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index b1a727d..64fb358 100644 --- 
a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -46,7 +46,7 @@ static unsigned int next_context, nr_free_contexts; static unsigned long *context_map; static unsigned long *stale_map[NR_CPUS]; static struct mm_struct **context_mm; -static DEFINE_SPINLOCK(context_lock); +static DEFINE_ATOMIC_SPINLOCK(context_lock); #define CTX_MAP_SIZE \ (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1)) @@ -104,9 +104,9 @@ static unsigned int steal_context_smp(unsigned int id) /* This will happen if you have more CPUs than available contexts, * all we can do here is wait a bit and try again */ - spin_unlock(&context_lock); + atomic_spin_unlock(&context_lock); cpu_relax(); - spin_lock(&context_lock); + atomic_spin_lock(&context_lock); /* This will cause the caller to try again */ return MMU_NO_CONTEXT; @@ -177,7 +177,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) unsigned long *map; /* No lockless fast path .. yet */ - spin_lock(&context_lock); + atomic_spin_lock(&context_lock); #ifndef DEBUG_STEAL_ONLY pr_devel("[%d] activating context for mm @%p, active=%d, id=%d\n", @@ -258,7 +258,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) /* Flick the MMU and release lock */ set_context(id, next->pgd); - spin_unlock(&context_lock); + atomic_spin_unlock(&context_lock); } /* @@ -285,7 +285,7 @@ void destroy_context(struct mm_struct *mm) WARN_ON(mm->context.active != 0); - spin_lock_irqsave(&context_lock, flags); + atomic_spin_lock_irqsave(&context_lock, flags); id = mm->context.id; if (id != MMU_NO_CONTEXT) { __clear_bit(id, context_map); @@ -296,7 +296,7 @@ void destroy_context(struct mm_struct *mm) context_mm[id] = NULL; nr_free_contexts++; } - spin_unlock_irqrestore(&context_lock, flags); + atomic_spin_unlock_irqrestore(&context_lock, flags); } #ifdef CONFIG_SMP diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 627767d..c8a2904 100644 --- 
a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -30,7 +30,6 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> -static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); static unsigned long pte_freelist_forced_free; struct pte_freelist_batch @@ -81,11 +80,11 @@ static void pte_free_submit(struct pte_freelist_batch *batch) void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) { - /* This is safe since tlb_gather_mmu has disabled preemption */ - struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); + struct pte_freelist_batch **batchp; - if (atomic_read(&tlb->mm->mm_users) < 2 || - cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){ + batchp = &tlb->arch.batch; + + if (atomic_read(&tlb->mm->mm_users) < 2) { pgtable_free(pgf); return; } @@ -105,15 +104,16 @@ void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) } } -void pte_free_finish(void) +void pte_free_finish(struct mmu_gather *tlb) { - /* This is safe since tlb_gather_mmu has disabled preemption */ - struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); + struct pte_freelist_batch **batchp; - if (*batchp == NULL) - return; - pte_free_submit(*batchp); - *batchp = NULL; + batchp = &tlb->arch.batch; + + if (*batchp) { + pte_free_submit(*batchp); + *batchp = NULL; + } } /* diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 937eb90..33c458f 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -30,14 +30,10 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/bug.h> +#include <asm/machdep.h> DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); -/* This is declared as we are using the more or less generic - * arch/powerpc/include/asm/tlb.h file -- tgall - */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - /* * A linux PTE was changed and the corresponding hash table entry * neesd to be flushed. 
This function will either perform the flush @@ -49,7 +45,7 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); void hpte_need_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge) { - struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); unsigned long vsid, vaddr; unsigned int psize; int ssize; @@ -100,6 +96,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, */ if (!batch->active) { flush_hash_page(vaddr, rpte, psize, ssize, 0); + put_cpu_var(ppc64_tlb_batch); return; } @@ -126,8 +123,22 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, batch->pte[i] = rpte; batch->vaddr[i] = vaddr; batch->index = ++i; + +#ifdef CONFIG_PREEMPT_RT + /* + * Since flushing tlb needs expensive hypervisor call(s) on celleb, + * always flush it on RT to reduce scheduling latency. + */ + if (machine_is(celleb)) { + __flush_tlb_pending(batch); + put_cpu_var(ppc64_tlb_batch); + return; + } +#endif /* CONFIG_PREEMPT_RT */ + if (i >= PPC64_TLB_BATCH_NR) __flush_tlb_pending(batch); + put_cpu_var(ppc64_tlb_batch); } /* diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index ad2eb4d..f627eb6 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL(local_flush_tlb_page); */ #ifdef CONFIG_SMP -static DEFINE_SPINLOCK(tlbivax_lock); +static DEFINE_ATOMIC_SPINLOCK(tlbivax_lock); struct tlb_flush_param { unsigned long addr; @@ -158,10 +158,10 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); if (lock) - spin_lock(&tlbivax_lock); + atomic_spin_lock(&tlbivax_lock); _tlbivax_bcast(vmaddr, pid); if (lock) - spin_unlock(&tlbivax_lock); + atomic_spin_unlock(&tlbivax_lock); goto bail; } else { struct tlb_flush_param p = { .pid = pid, .addr = vmaddr }; @@ -189,7 +189,9 
@@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) _tlbil_pid(0); preempt_enable(); #else + preempt_disable(); _tlbil_pid(0); + preempt_enable(); #endif } EXPORT_SYMBOL(flush_tlb_kernel_range); diff --git a/arch/powerpc/platforms/52xx/media5200.c b/arch/powerpc/platforms/52xx/media5200.c index 68e4f16..afffc6d 100644 --- a/arch/powerpc/platforms/52xx/media5200.c +++ b/arch/powerpc/platforms/52xx/media5200.c @@ -86,9 +86,9 @@ void media5200_irq_cascade(unsigned int virq, struct irq_desc *desc) u32 status, enable; /* Mask off the cascaded IRQ */ - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); desc->chip->mask(virq); - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); /* Ask the FPGA for IRQ status. If 'val' is 0, then no irqs * are pending. 'ffs()' is 1 based */ @@ -104,11 +104,11 @@ void media5200_irq_cascade(unsigned int virq, struct irq_desc *desc) } /* Processing done; can reenable the cascade now */ - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); desc->chip->ack(virq); if (!(desc->status & IRQ_DISABLED)) desc->chip->unmask(virq); - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); } static int media5200_irq_map(struct irq_host *h, unsigned int virq, diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c index 35b1ec4..e1afbdc 100644 --- a/arch/powerpc/platforms/cell/beat_htab.c +++ b/arch/powerpc/platforms/cell/beat_htab.c @@ -40,7 +40,7 @@ #define DBG_LOW(fmt...) 
do { } while (0) #endif -static DEFINE_SPINLOCK(beat_htab_lock); +static DEFINE_ATOMIC_SPINLOCK(beat_htab_lock); static inline unsigned int beat_read_mask(unsigned hpte_group) { @@ -114,18 +114,18 @@ static long beat_lpar_hpte_insert(unsigned long hpte_group, if (rflags & _PAGE_NO_CACHE) hpte_r &= ~_PAGE_COHERENT; - spin_lock(&beat_htab_lock); + atomic_spin_lock(&beat_htab_lock); lpar_rc = beat_read_mask(hpte_group); if (lpar_rc == 0) { if (!(vflags & HPTE_V_BOLTED)) DBG_LOW(" full\n"); - spin_unlock(&beat_htab_lock); + atomic_spin_unlock(&beat_htab_lock); return -1; } lpar_rc = beat_insert_htab_entry(0, hpte_group, lpar_rc << 48, hpte_v, hpte_r, &slot); - spin_unlock(&beat_htab_lock); + atomic_spin_unlock(&beat_htab_lock); /* * Since we try and ioremap PHBs we don't own, the pte insert @@ -198,17 +198,17 @@ static long beat_lpar_hpte_updatepp(unsigned long slot, "avpnv=%016lx, slot=%016lx, psize: %d, newpp %016lx ... ", want_v & HPTE_V_AVPN, slot, psize, newpp); - spin_lock(&beat_htab_lock); + atomic_spin_lock(&beat_htab_lock); dummy0 = beat_lpar_hpte_getword0(slot); if ((dummy0 & ~0x7FUL) != (want_v & ~0x7FUL)) { DBG_LOW("not found !\n"); - spin_unlock(&beat_htab_lock); + atomic_spin_unlock(&beat_htab_lock); return -1; } lpar_rc = beat_write_htab_entry(0, slot, 0, newpp, 0, 7, &dummy0, &dummy1); - spin_unlock(&beat_htab_lock); + atomic_spin_unlock(&beat_htab_lock); if (lpar_rc != 0 || dummy0 == 0) { DBG_LOW("not found !\n"); return -1; @@ -262,13 +262,13 @@ static void beat_lpar_hpte_updateboltedpp(unsigned long newpp, vsid = get_kernel_vsid(ea, MMU_SEGSIZE_256M); va = (vsid << 28) | (ea & 0x0fffffff); - spin_lock(&beat_htab_lock); + atomic_spin_lock(&beat_htab_lock); slot = beat_lpar_hpte_find(va, psize); BUG_ON(slot == -1); lpar_rc = beat_write_htab_entry(0, slot, 0, newpp, 0, 7, &dummy0, &dummy1); - spin_unlock(&beat_htab_lock); + atomic_spin_unlock(&beat_htab_lock); BUG_ON(lpar_rc != 0); } @@ -285,18 +285,18 @@ static void beat_lpar_hpte_invalidate(unsigned 
long slot, unsigned long va, slot, va, psize, local); want_v = hpte_encode_v(va, psize, MMU_SEGSIZE_256M); - spin_lock_irqsave(&beat_htab_lock, flags); + atomic_spin_lock_irqsave(&beat_htab_lock, flags); dummy1 = beat_lpar_hpte_getword0(slot); if ((dummy1 & ~0x7FUL) != (want_v & ~0x7FUL)) { DBG_LOW("not found !\n"); - spin_unlock_irqrestore(&beat_htab_lock, flags); + atomic_spin_unlock_irqrestore(&beat_htab_lock, flags); return; } lpar_rc = beat_write_htab_entry(0, slot, 0, 0, HPTE_V_VALID, 0, &dummy1, &dummy2); - spin_unlock_irqrestore(&beat_htab_lock, flags); + atomic_spin_unlock_irqrestore(&beat_htab_lock, flags); BUG_ON(lpar_rc != 0); } diff --git a/arch/powerpc/platforms/cell/beat_interrupt.c b/arch/powerpc/platforms/cell/beat_interrupt.c index 7225484..e99c998 100644 --- a/arch/powerpc/platforms/cell/beat_interrupt.c +++ b/arch/powerpc/platforms/cell/beat_interrupt.c @@ -30,7 +30,7 @@ #include "beat_wrapper.h" #define MAX_IRQS NR_IRQS -static DEFINE_SPINLOCK(beatic_irq_mask_lock); +static DEFINE_ATOMIC_SPINLOCK(beatic_irq_mask_lock); static uint64_t beatic_irq_mask_enable[(MAX_IRQS+255)/64]; static uint64_t beatic_irq_mask_ack[(MAX_IRQS+255)/64]; @@ -65,30 +65,30 @@ static void beatic_mask_irq(unsigned int irq_plug) { unsigned long flags; - spin_lock_irqsave(&beatic_irq_mask_lock, flags); + atomic_spin_lock_irqsave(&beatic_irq_mask_lock, flags); beatic_irq_mask_enable[irq_plug/64] &= ~(1UL << (63 - (irq_plug%64))); beatic_update_irq_mask(irq_plug); - spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); + atomic_spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); } static void beatic_unmask_irq(unsigned int irq_plug) { unsigned long flags; - spin_lock_irqsave(&beatic_irq_mask_lock, flags); + atomic_spin_lock_irqsave(&beatic_irq_mask_lock, flags); beatic_irq_mask_enable[irq_plug/64] |= 1UL << (63 - (irq_plug%64)); beatic_update_irq_mask(irq_plug); - spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); + atomic_spin_unlock_irqrestore(&beatic_irq_mask_lock, 
flags); } static void beatic_ack_irq(unsigned int irq_plug) { unsigned long flags; - spin_lock_irqsave(&beatic_irq_mask_lock, flags); + atomic_spin_lock_irqsave(&beatic_irq_mask_lock, flags); beatic_irq_mask_ack[irq_plug/64] &= ~(1UL << (63 - (irq_plug%64))); beatic_update_irq_mask(irq_plug); - spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); + atomic_spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); } static void beatic_end_irq(unsigned int irq_plug) @@ -103,10 +103,10 @@ static void beatic_end_irq(unsigned int irq_plug) printk(KERN_ERR "IRQ over-downcounted, plug %d\n", irq_plug); } - spin_lock_irqsave(&beatic_irq_mask_lock, flags); + atomic_spin_lock_irqsave(&beatic_irq_mask_lock, flags); beatic_irq_mask_ack[irq_plug/64] |= 1UL << (63 - (irq_plug%64)); beatic_update_irq_mask(irq_plug); - spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); + atomic_spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); } static struct irq_chip beatic_pic = { diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c index 882e470..0d3022e 100644 --- a/arch/powerpc/platforms/cell/interrupt.c +++ b/arch/powerpc/platforms/cell/interrupt.c @@ -237,7 +237,7 @@ extern int noirqdebug; static void handle_iic_irq(unsigned int irq, struct irq_desc *desc) { - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); @@ -265,18 +265,18 @@ static void handle_iic_irq(unsigned int irq, struct irq_desc *desc) goto out_eoi; desc->status &= ~IRQ_PENDING; - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); desc->status &= ~IRQ_INPROGRESS; out_eoi: desc->chip->eoi(irq); - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); } static int iic_host_map(struct irq_host 
*h, unsigned int virq, diff --git a/arch/powerpc/platforms/chrp/time.c b/arch/powerpc/platforms/chrp/time.c index 054dfe5..8f1d8cd 100644 --- a/arch/powerpc/platforms/chrp/time.c +++ b/arch/powerpc/platforms/chrp/time.c @@ -83,7 +83,12 @@ int chrp_set_rtc_time(struct rtc_time *tmarg) unsigned char save_control, save_freq_select; struct rtc_time tm = *tmarg; +#ifdef CONFIG_PREEMPT_RT + if (!spin_trylock(&rtc_lock)) + return -1; +#else spin_lock(&rtc_lock); +#endif save_control = chrp_cmos_clock_read(RTC_CONTROL); /* tell the clock it's being set */ diff --git a/arch/powerpc/platforms/iseries/irq.c b/arch/powerpc/platforms/iseries/irq.c index 94f4447..af7cce0 100644 --- a/arch/powerpc/platforms/iseries/irq.c +++ b/arch/powerpc/platforms/iseries/irq.c @@ -217,9 +217,9 @@ void __init iSeries_activate_IRQs() struct irq_desc *desc = get_irq_desc(irq); if (desc && desc->chip && desc->chip->startup) { - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); desc->chip->startup(irq); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } } diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c index e6c0040..276c335 100644 --- a/arch/powerpc/platforms/powermac/feature.c +++ b/arch/powerpc/platforms/powermac/feature.c @@ -59,10 +59,10 @@ extern struct device_node *k2_skiplist[2]; * We use a single global lock to protect accesses. 
Each driver has * to take care of its own locking */ -DEFINE_SPINLOCK(feature_lock); +DEFINE_ATOMIC_SPINLOCK(feature_lock); -#define LOCK(flags) spin_lock_irqsave(&feature_lock, flags); -#define UNLOCK(flags) spin_unlock_irqrestore(&feature_lock, flags); +#define LOCK(flags) atomic_spin_lock_irqsave(&feature_lock, flags); +#define UNLOCK(flags) atomic_spin_unlock_irqrestore(&feature_lock, flags); /* diff --git a/arch/powerpc/platforms/powermac/nvram.c b/arch/powerpc/platforms/powermac/nvram.c index c6f0f9e..7d57748 100644 --- a/arch/powerpc/platforms/powermac/nvram.c +++ b/arch/powerpc/platforms/powermac/nvram.c @@ -80,7 +80,7 @@ static int is_core_99; static int core99_bank = 0; static int nvram_partitions[3]; // XXX Turn that into a sem -static DEFINE_SPINLOCK(nv_lock); +static DEFINE_ATOMIC_SPINLOCK(nv_lock); static int (*core99_write_bank)(int bank, u8* datas); static int (*core99_erase_bank)(int bank); @@ -165,10 +165,10 @@ static unsigned char indirect_nvram_read_byte(int addr) unsigned char val; unsigned long flags; - spin_lock_irqsave(&nv_lock, flags); + atomic_spin_lock_irqsave(&nv_lock, flags); out_8(nvram_addr, addr >> 5); val = in_8(&nvram_data[(addr & 0x1f) << 4]); - spin_unlock_irqrestore(&nv_lock, flags); + atomic_spin_unlock_irqrestore(&nv_lock, flags); return val; } @@ -177,10 +177,10 @@ static void indirect_nvram_write_byte(int addr, unsigned char val) { unsigned long flags; - spin_lock_irqsave(&nv_lock, flags); + atomic_spin_lock_irqsave(&nv_lock, flags); out_8(nvram_addr, addr >> 5); out_8(&nvram_data[(addr & 0x1f) << 4], val); - spin_unlock_irqrestore(&nv_lock, flags); + atomic_spin_unlock_irqrestore(&nv_lock, flags); } @@ -481,7 +481,7 @@ static void core99_nvram_sync(void) if (!is_core_99 || !nvram_data || !nvram_image) return; - spin_lock_irqsave(&nv_lock, flags); + atomic_spin_lock_irqsave(&nv_lock, flags); if (!memcmp(nvram_image, (u8*)nvram_data + core99_bank*NVRAM_SIZE, NVRAM_SIZE)) goto bail; @@ -503,7 +503,7 @@ static void 
core99_nvram_sync(void) if (core99_write_bank(core99_bank, nvram_image)) printk("nvram: Error writing bank %d\n", core99_bank); bail: - spin_unlock_irqrestore(&nv_lock, flags); + atomic_spin_unlock_irqrestore(&nv_lock, flags); #ifdef DEBUG mdelay(2000); diff --git a/arch/powerpc/platforms/powermac/pfunc_base.c b/arch/powerpc/platforms/powermac/pfunc_base.c index db20de5..3ab56ec 100644 --- a/arch/powerpc/platforms/powermac/pfunc_base.c +++ b/arch/powerpc/platforms/powermac/pfunc_base.c @@ -50,13 +50,13 @@ static int macio_do_gpio_write(PMF_STD_ARGS, u8 value, u8 mask) value = ~value; /* Toggle the GPIO */ - spin_lock_irqsave(&feature_lock, flags); + atomic_spin_lock_irqsave(&feature_lock, flags); tmp = readb(addr); tmp = (tmp & ~mask) | (value & mask); DBG("Do write 0x%02x to GPIO %s (%p)\n", tmp, func->node->full_name, addr); writeb(tmp, addr); - spin_unlock_irqrestore(&feature_lock, flags); + atomic_spin_unlock_irqrestore(&feature_lock, flags); return 0; } @@ -145,9 +145,9 @@ static int macio_do_write_reg32(PMF_STD_ARGS, u32 offset, u32 value, u32 mask) struct macio_chip *macio = func->driver_data; unsigned long flags; - spin_lock_irqsave(&feature_lock, flags); + atomic_spin_lock_irqsave(&feature_lock, flags); MACIO_OUT32(offset, (MACIO_IN32(offset) & ~mask) | (value & mask)); - spin_unlock_irqrestore(&feature_lock, flags); + atomic_spin_unlock_irqrestore(&feature_lock, flags); return 0; } @@ -168,9 +168,9 @@ static int macio_do_write_reg8(PMF_STD_ARGS, u32 offset, u8 value, u8 mask) struct macio_chip *macio = func->driver_data; unsigned long flags; - spin_lock_irqsave(&feature_lock, flags); + atomic_spin_lock_irqsave(&feature_lock, flags); MACIO_OUT8(offset, (MACIO_IN8(offset) & ~mask) | (value & mask)); - spin_unlock_irqrestore(&feature_lock, flags); + atomic_spin_unlock_irqrestore(&feature_lock, flags); return 0; } @@ -223,12 +223,12 @@ static int macio_do_write_reg32_slm(PMF_STD_ARGS, u32 offset, u32 shift, if (args == NULL || args->count == 0) return 
-EINVAL; - spin_lock_irqsave(&feature_lock, flags); + atomic_spin_lock_irqsave(&feature_lock, flags); tmp = MACIO_IN32(offset); val = args->u[0].v << shift; tmp = (tmp & ~mask) | (val & mask); MACIO_OUT32(offset, tmp); - spin_unlock_irqrestore(&feature_lock, flags); + atomic_spin_unlock_irqrestore(&feature_lock, flags); return 0; } @@ -243,12 +243,12 @@ static int macio_do_write_reg8_slm(PMF_STD_ARGS, u32 offset, u32 shift, if (args == NULL || args->count == 0) return -EINVAL; - spin_lock_irqsave(&feature_lock, flags); + atomic_spin_lock_irqsave(&feature_lock, flags); tmp = MACIO_IN8(offset); val = args->u[0].v << shift; tmp = (tmp & ~mask) | (val & mask); MACIO_OUT8(offset, tmp); - spin_unlock_irqrestore(&feature_lock, flags); + atomic_spin_unlock_irqrestore(&feature_lock, flags); return 0; } @@ -278,12 +278,12 @@ static int unin_do_write_reg32(PMF_STD_ARGS, u32 offset, u32 value, u32 mask) { unsigned long flags; - spin_lock_irqsave(&feature_lock, flags); + atomic_spin_lock_irqsave(&feature_lock, flags); /* This is fairly bogus in darwin, but it should work for our needs * implemeted that way: */ UN_OUT(offset, (UN_IN(offset) & ~mask) | (value & mask)); - spin_unlock_irqrestore(&feature_lock, flags); + atomic_spin_unlock_irqrestore(&feature_lock, flags); return 0; } diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index d212006..9814414 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -57,7 +57,7 @@ static int max_irqs; static int max_real_irqs; static u32 level_mask[4]; -static DEFINE_SPINLOCK(pmac_pic_lock); +static DEFINE_ATOMIC_SPINLOCK(pmac_pic_lock); #define NR_MASK_WORDS ((NR_IRQS + 31) / 32) static unsigned long ppc_lost_interrupts[NR_MASK_WORDS]; @@ -85,7 +85,7 @@ static void pmac_mask_and_ack_irq(unsigned int virq) int i = src >> 5; unsigned long flags; - spin_lock_irqsave(&pmac_pic_lock, flags); + atomic_spin_lock_irqsave(&pmac_pic_lock, flags); __clear_bit(src, 
ppc_cached_irq_mask); if (__test_and_clear_bit(src, ppc_lost_interrupts)) atomic_dec(&ppc_n_lost_interrupts); @@ -97,7 +97,7 @@ static void pmac_mask_and_ack_irq(unsigned int virq) mb(); } while((in_le32(&pmac_irq_hw[i]->enable) & bit) != (ppc_cached_irq_mask[i] & bit)); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + atomic_spin_unlock_irqrestore(&pmac_pic_lock, flags); } static void pmac_ack_irq(unsigned int virq) @@ -107,12 +107,12 @@ static void pmac_ack_irq(unsigned int virq) int i = src >> 5; unsigned long flags; - spin_lock_irqsave(&pmac_pic_lock, flags); + atomic_spin_lock_irqsave(&pmac_pic_lock, flags); if (__test_and_clear_bit(src, ppc_lost_interrupts)) atomic_dec(&ppc_n_lost_interrupts); out_le32(&pmac_irq_hw[i]->ack, bit); (void)in_le32(&pmac_irq_hw[i]->ack); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + atomic_spin_unlock_irqrestore(&pmac_pic_lock, flags); } static void __pmac_set_irq_mask(unsigned int irq_nr, int nokicklost) @@ -152,12 +152,12 @@ static unsigned int pmac_startup_irq(unsigned int virq) unsigned long bit = 1UL << (src & 0x1f); int i = src >> 5; - spin_lock_irqsave(&pmac_pic_lock, flags); + atomic_spin_lock_irqsave(&pmac_pic_lock, flags); if ((irq_desc[virq].status & IRQ_LEVEL) == 0) out_le32(&pmac_irq_hw[i]->ack, bit); __set_bit(src, ppc_cached_irq_mask); __pmac_set_irq_mask(src, 0); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + atomic_spin_unlock_irqrestore(&pmac_pic_lock, flags); return 0; } @@ -167,10 +167,10 @@ static void pmac_mask_irq(unsigned int virq) unsigned long flags; unsigned int src = irq_map[virq].hwirq; - spin_lock_irqsave(&pmac_pic_lock, flags); + atomic_spin_lock_irqsave(&pmac_pic_lock, flags); __clear_bit(src, ppc_cached_irq_mask); __pmac_set_irq_mask(src, 1); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + atomic_spin_unlock_irqrestore(&pmac_pic_lock, flags); } static void pmac_unmask_irq(unsigned int virq) @@ -178,19 +178,19 @@ static void pmac_unmask_irq(unsigned int virq) unsigned long flags; 
unsigned int src = irq_map[virq].hwirq; - spin_lock_irqsave(&pmac_pic_lock, flags); + atomic_spin_lock_irqsave(&pmac_pic_lock, flags); __set_bit(src, ppc_cached_irq_mask); __pmac_set_irq_mask(src, 0); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + atomic_spin_unlock_irqrestore(&pmac_pic_lock, flags); } static int pmac_retrigger(unsigned int virq) { unsigned long flags; - spin_lock_irqsave(&pmac_pic_lock, flags); + atomic_spin_lock_irqsave(&pmac_pic_lock, flags); __pmac_retrigger(irq_map[virq].hwirq); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + atomic_spin_unlock_irqrestore(&pmac_pic_lock, flags); return 1; } @@ -210,7 +210,7 @@ static irqreturn_t gatwick_action(int cpl, void *dev_id) int irq, bits; int rc = IRQ_NONE; - spin_lock_irqsave(&pmac_pic_lock, flags); + atomic_spin_lock_irqsave(&pmac_pic_lock, flags); for (irq = max_irqs; (irq -= 32) >= max_real_irqs; ) { int i = irq >> 5; bits = in_le32(&pmac_irq_hw[i]->event) | ppc_lost_interrupts[i]; @@ -220,12 +220,12 @@ static irqreturn_t gatwick_action(int cpl, void *dev_id) if (bits == 0) continue; irq += __ilog2(bits); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + atomic_spin_unlock_irqrestore(&pmac_pic_lock, flags); generic_handle_irq(irq); - spin_lock_irqsave(&pmac_pic_lock, flags); + atomic_spin_lock_irqsave(&pmac_pic_lock, flags); rc = IRQ_HANDLED; } - spin_unlock_irqrestore(&pmac_pic_lock, flags); + atomic_spin_unlock_irqrestore(&pmac_pic_lock, flags); return rc; } @@ -244,7 +244,7 @@ static unsigned int pmac_pic_get_irq(void) return NO_IRQ_IGNORE; /* ignore, already handled */ } #endif /* CONFIG_SMP */ - spin_lock_irqsave(&pmac_pic_lock, flags); + atomic_spin_lock_irqsave(&pmac_pic_lock, flags); for (irq = max_real_irqs; (irq -= 32) >= 0; ) { int i = irq >> 5; bits = in_le32(&pmac_irq_hw[i]->event) | ppc_lost_interrupts[i]; @@ -256,7 +256,7 @@ static unsigned int pmac_pic_get_irq(void) irq += __ilog2(bits); break; } - spin_unlock_irqrestore(&pmac_pic_lock, flags); + 
atomic_spin_unlock_irqrestore(&pmac_pic_lock, flags); if (unlikely(irq < 0)) return NO_IRQ; return irq_linear_revmap(pmac_pic_host, irq); diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index 989d646..2db0689 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -100,7 +100,7 @@ int eeh_subsystem_enabled; EXPORT_SYMBOL(eeh_subsystem_enabled); /* Lock to avoid races due to multiple reports of an error */ -static DEFINE_SPINLOCK(confirm_error_lock); +static DEFINE_ATOMIC_SPINLOCK(confirm_error_lock); /* Buffer for reporting slot-error-detail rtas calls. Its here * in BSS, and not dynamically alloced, so that it ends up in @@ -436,7 +436,7 @@ static void __eeh_clear_slot(struct device_node *parent, int mode_flag) void eeh_clear_slot (struct device_node *dn, int mode_flag) { unsigned long flags; - spin_lock_irqsave(&confirm_error_lock, flags); + atomic_spin_lock_irqsave(&confirm_error_lock, flags); dn = find_device_pe (dn); @@ -447,7 +447,7 @@ void eeh_clear_slot (struct device_node *dn, int mode_flag) PCI_DN(dn)->eeh_mode &= ~mode_flag; PCI_DN(dn)->eeh_check_count = 0; __eeh_clear_slot(dn, mode_flag); - spin_unlock_irqrestore(&confirm_error_lock, flags); + atomic_spin_unlock_irqrestore(&confirm_error_lock, flags); } /** @@ -506,7 +506,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) * in one slot might report errors simultaneously, and we * only want one error recovery routine running. */ - spin_lock_irqsave(&confirm_error_lock, flags); + atomic_spin_lock_irqsave(&confirm_error_lock, flags); rc = 1; if (pdn->eeh_mode & EEH_MODE_ISOLATED) { pdn->eeh_check_count ++; @@ -575,7 +575,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) * with other functions on this device, and functions under * bridges. 
*/ eeh_mark_slot (dn, EEH_MODE_ISOLATED); - spin_unlock_irqrestore(&confirm_error_lock, flags); + atomic_spin_unlock_irqrestore(&confirm_error_lock, flags); eeh_send_failure_event (dn, dev); @@ -586,7 +586,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) return 1; dn_unlock: - spin_unlock_irqrestore(&confirm_error_lock, flags); + atomic_spin_unlock_irqrestore(&confirm_error_lock, flags); return rc; } @@ -1056,7 +1056,7 @@ void __init eeh_init(void) struct device_node *phb, *np; struct eeh_early_enable_info info; - spin_lock_init(&confirm_error_lock); + atomic_spin_lock_init(&confirm_error_lock); spin_lock_init(&slot_errbuf_lock); np = of_find_node_by_path("/rtas"); diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c index 0e8db67..82d4513 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/platforms/pseries/eeh_driver.c @@ -70,12 +70,12 @@ static int irq_in_use(unsigned int irq) { int rc = 0; unsigned long flags; - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); if (desc->action) rc = 1; - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return rc; } diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 661c8e0..bcdce0a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -140,7 +140,7 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } -static DEFINE_PER_CPU(u64 *, tce_page) = NULL; +static DEFINE_PER_CPU_LOCKED(u64 *, tce_page) = NULL; static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, @@ -154,13 +154,14 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long l, limit; long tcenum_start = 
tcenum, npages_start = npages; int ret = 0; + int cpu; if (npages == 1) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } - tcep = __get_cpu_var(tce_page); + tcep = get_cpu_var_locked(tce_page, &cpu); /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() @@ -169,10 +170,11 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcep = (u64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { + put_cpu_var_locked(tce_page, cpu); return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } - __get_cpu_var(tce_page) = tcep; + per_cpu_var_locked(tce_page, cpu) = tcep; } rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; @@ -216,6 +218,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, printk("\ttce[0] val = 0x%llx\n", tcep[0]); show_stack(current, (unsigned long *)__get_SP()); } + put_cpu_var_locked(tce_page, cpu); return ret; } diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c index b3cbac8..493d8de 100644 --- a/arch/powerpc/platforms/pseries/rtasd.c +++ b/arch/powerpc/platforms/pseries/rtasd.c @@ -208,7 +208,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) break; case ERR_TYPE_KERNEL_PANIC: default: - WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ + WARN_ON_ONCE_NONRT(!irqs_disabled()); /* @@@ DEBUG @@@ */ spin_unlock_irqrestore(&rtasd_log_lock, s); return; } @@ -228,7 +228,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) /* Check to see if we need to or have stopped logging */ if (fatal || !logging_enabled) { logging_enabled = 0; - WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ + WARN_ON_ONCE_NONRT(!irqs_disabled()); /* @@@ DEBUG @@@ */ spin_unlock_irqrestore(&rtasd_log_lock, s); return; } @@ -251,13 +251,13 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) else 
rtas_log_start += 1; - WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ + WARN_ON_ONCE_NONRT(!irqs_disabled()); /* @@@ DEBUG @@@ */ spin_unlock_irqrestore(&rtasd_log_lock, s); wake_up_interruptible(&rtas_log_wait); break; case ERR_TYPE_KERNEL_PANIC: default: - WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ + WARN_ON_ONCE_NONRT(!irqs_disabled()); /* @@@ DEBUG @@@ */ spin_unlock_irqrestore(&rtasd_log_lock, s); return; } diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c index 419f8a6..d4fbbdc 100644 --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@ -851,7 +851,7 @@ void xics_migrate_irqs_away(void) || desc->chip->set_affinity == NULL) continue; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq); if (status) { @@ -875,7 +875,7 @@ void xics_migrate_irqs_away(void) cpumask_setall(irq_desc[virq].affinity); desc->chip->set_affinity(virq, cpu_all_mask); unlock: - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } #endif diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index da38a1f..d5702c4 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -173,7 +173,7 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc) u32 intr_index; u32 have_shift = 0; - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); if ((msi_data->feature & FSL_PIC_IP_MASK) == FSL_PIC_IP_IPIC) { if (desc->chip->mask_ack) desc->chip->mask_ack(irq); @@ -225,7 +225,7 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc) break; } unlock: - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); } static int __devinit fsl_of_msi_probe(struct of_device *dev, diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c index a96584a..60c4e42 100644 --- 
a/arch/powerpc/sysdev/i8259.c +++ b/arch/powerpc/sysdev/i8259.c @@ -23,7 +23,7 @@ static unsigned char cached_8259[2] = { 0xff, 0xff }; #define cached_A1 (cached_8259[0]) #define cached_21 (cached_8259[1]) -static DEFINE_SPINLOCK(i8259_lock); +static DEFINE_ATOMIC_SPINLOCK(i8259_lock); static struct irq_host *i8259_host; @@ -42,7 +42,7 @@ unsigned int i8259_irq(void) if (pci_intack) irq = readb(pci_intack); else { - spin_lock(&i8259_lock); + atomic_spin_lock(&i8259_lock); lock = 1; /* Perform an interrupt acknowledge cycle on controller 1. */ @@ -74,7 +74,7 @@ unsigned int i8259_irq(void) irq = NO_IRQ; if (lock) - spin_unlock(&i8259_lock); + atomic_spin_unlock(&i8259_lock); return irq; } @@ -82,7 +82,7 @@ static void i8259_mask_and_ack_irq(unsigned int irq_nr) { unsigned long flags; - spin_lock_irqsave(&i8259_lock, flags); + atomic_spin_lock_irqsave(&i8259_lock, flags); if (irq_nr > 7) { cached_A1 |= 1 << (irq_nr-8); inb(0xA1); /* DUMMY */ @@ -95,7 +95,7 @@ static void i8259_mask_and_ack_irq(unsigned int irq_nr) outb(cached_21, 0x21); outb(0x20, 0x20); /* Non-specific EOI */ } - spin_unlock_irqrestore(&i8259_lock, flags); + atomic_spin_unlock_irqrestore(&i8259_lock, flags); } static void i8259_set_irq_mask(int irq_nr) @@ -110,13 +110,13 @@ static void i8259_mask_irq(unsigned int irq_nr) pr_debug("i8259_mask_irq(%d)\n", irq_nr); - spin_lock_irqsave(&i8259_lock, flags); + atomic_spin_lock_irqsave(&i8259_lock, flags); if (irq_nr < 8) cached_21 |= 1 << irq_nr; else cached_A1 |= 1 << (irq_nr-8); i8259_set_irq_mask(irq_nr); - spin_unlock_irqrestore(&i8259_lock, flags); + atomic_spin_unlock_irqrestore(&i8259_lock, flags); } static void i8259_unmask_irq(unsigned int irq_nr) @@ -125,13 +125,13 @@ static void i8259_unmask_irq(unsigned int irq_nr) pr_debug("i8259_unmask_irq(%d)\n", irq_nr); - spin_lock_irqsave(&i8259_lock, flags); + atomic_spin_lock_irqsave(&i8259_lock, flags); if (irq_nr < 8) cached_21 &= ~(1 << irq_nr); else cached_A1 &= ~(1 << (irq_nr-8)); 
i8259_set_irq_mask(irq_nr); - spin_unlock_irqrestore(&i8259_lock, flags); + atomic_spin_unlock_irqrestore(&i8259_lock, flags); } static struct irq_chip i8259_pic = { @@ -241,7 +241,7 @@ void i8259_init(struct device_node *node, unsigned long intack_addr) unsigned long flags; /* initialize the controller */ - spin_lock_irqsave(&i8259_lock, flags); + atomic_spin_lock_irqsave(&i8259_lock, flags); /* Mask all first */ outb(0xff, 0xA1); @@ -273,7 +273,7 @@ void i8259_init(struct device_node *node, unsigned long intack_addr) outb(cached_A1, 0xA1); outb(cached_21, 0x21); - spin_unlock_irqrestore(&i8259_lock, flags); + atomic_spin_unlock_irqrestore(&i8259_lock, flags); /* create a legacy host */ i8259_host = irq_alloc_host(node, IRQ_HOST_MAP_LEGACY, diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 69e2630..7c84d27 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -32,7 +32,7 @@ static struct ipic * primary_ipic; static struct irq_chip ipic_level_irq_chip, ipic_edge_irq_chip; -static DEFINE_SPINLOCK(ipic_lock); +static DEFINE_ATOMIC_SPINLOCK(ipic_lock); static struct ipic_info ipic_info[] = { [1] = { @@ -530,13 +530,13 @@ static void ipic_unmask_irq(unsigned int virq) unsigned long flags; u32 temp; - spin_lock_irqsave(&ipic_lock, flags); + atomic_spin_lock_irqsave(&ipic_lock, flags); temp = ipic_read(ipic->regs, ipic_info[src].mask); temp |= (1 << (31 - ipic_info[src].bit)); ipic_write(ipic->regs, ipic_info[src].mask, temp); - spin_unlock_irqrestore(&ipic_lock, flags); + atomic_spin_unlock_irqrestore(&ipic_lock, flags); } static void ipic_mask_irq(unsigned int virq) @@ -546,7 +546,7 @@ static void ipic_mask_irq(unsigned int virq) unsigned long flags; u32 temp; - spin_lock_irqsave(&ipic_lock, flags); + atomic_spin_lock_irqsave(&ipic_lock, flags); temp = ipic_read(ipic->regs, ipic_info[src].mask); temp &= ~(1 << (31 - ipic_info[src].bit)); @@ -556,7 +556,7 @@ static void ipic_mask_irq(unsigned int virq) * for nearly all 
cases. */ mb(); - spin_unlock_irqrestore(&ipic_lock, flags); + atomic_spin_unlock_irqrestore(&ipic_lock, flags); } static void ipic_ack_irq(unsigned int virq) @@ -566,7 +566,7 @@ static void ipic_ack_irq(unsigned int virq) unsigned long flags; u32 temp; - spin_lock_irqsave(&ipic_lock, flags); + atomic_spin_lock_irqsave(&ipic_lock, flags); temp = 1 << (31 - ipic_info[src].bit); ipic_write(ipic->regs, ipic_info[src].ack, temp); @@ -575,7 +575,7 @@ static void ipic_ack_irq(unsigned int virq) * for nearly all cases. */ mb(); - spin_unlock_irqrestore(&ipic_lock, flags); + atomic_spin_unlock_irqrestore(&ipic_lock, flags); } static void ipic_mask_irq_and_ack(unsigned int virq) @@ -585,7 +585,7 @@ static void ipic_mask_irq_and_ack(unsigned int virq) unsigned long flags; u32 temp; - spin_lock_irqsave(&ipic_lock, flags); + atomic_spin_lock_irqsave(&ipic_lock, flags); temp = ipic_read(ipic->regs, ipic_info[src].mask); temp &= ~(1 << (31 - ipic_info[src].bit)); @@ -598,7 +598,7 @@ static void ipic_mask_irq_and_ack(unsigned int virq) * for nearly all cases. 
*/ mb(); - spin_unlock_irqrestore(&ipic_lock, flags); + atomic_spin_unlock_irqrestore(&ipic_lock, flags); } static int ipic_set_irq_type(unsigned int virq, unsigned int flow_type) diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 3981ae4..b0f66f9 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -46,7 +46,7 @@ static struct mpic *mpics; static struct mpic *mpic_primary; -static DEFINE_SPINLOCK(mpic_lock); +static DEFINE_ATOMIC_SPINLOCK(mpic_lock); #ifdef CONFIG_PPC32 /* XXX for now */ #ifdef CONFIG_IRQ_ALL_CPUS @@ -344,10 +344,10 @@ static inline void mpic_ht_end_irq(struct mpic *mpic, unsigned int source) unsigned int mask = 1U << (fixup->index & 0x1f); writel(mask, fixup->applebase + soff); } else { - spin_lock(&mpic->fixup_lock); + atomic_spin_lock(&mpic->fixup_lock); writeb(0x11 + 2 * fixup->index, fixup->base + 2); writel(fixup->data, fixup->base + 4); - spin_unlock(&mpic->fixup_lock); + atomic_spin_unlock(&mpic->fixup_lock); } } @@ -363,7 +363,7 @@ static void mpic_startup_ht_interrupt(struct mpic *mpic, unsigned int source, DBG("startup_ht_interrupt(0x%x, 0x%x) index: %d\n", source, irqflags, fixup->index); - spin_lock_irqsave(&mpic->fixup_lock, flags); + atomic_spin_lock_irqsave(&mpic->fixup_lock, flags); /* Enable and configure */ writeb(0x10 + 2 * fixup->index, fixup->base + 2); tmp = readl(fixup->base + 4); @@ -371,7 +371,7 @@ static void mpic_startup_ht_interrupt(struct mpic *mpic, unsigned int source, if (irqflags & IRQ_LEVEL) tmp |= 0x22; writel(tmp, fixup->base + 4); - spin_unlock_irqrestore(&mpic->fixup_lock, flags); + atomic_spin_unlock_irqrestore(&mpic->fixup_lock, flags); #ifdef CONFIG_PM /* use the lowest bit inverted to the actual HW, @@ -393,12 +393,12 @@ static void mpic_shutdown_ht_interrupt(struct mpic *mpic, unsigned int source, DBG("shutdown_ht_interrupt(0x%x, 0x%x)\n", source, irqflags); /* Disable */ - spin_lock_irqsave(&mpic->fixup_lock, flags); + 
atomic_spin_lock_irqsave(&mpic->fixup_lock, flags); writeb(0x10 + 2 * fixup->index, fixup->base + 2); tmp = readl(fixup->base + 4); tmp |= 1; writel(tmp, fixup->base + 4); - spin_unlock_irqrestore(&mpic->fixup_lock, flags); + atomic_spin_unlock_irqrestore(&mpic->fixup_lock, flags); #ifdef CONFIG_PM /* use the lowest bit inverted to the actual HW, @@ -512,7 +512,7 @@ static void __init mpic_scan_ht_pics(struct mpic *mpic) BUG_ON(mpic->fixups == NULL); /* Init spinlock */ - spin_lock_init(&mpic->fixup_lock); + atomic_spin_lock_init(&mpic->fixup_lock); /* Map U3 config space. We assume all IO-APICs are on the primary bus * so we only need to map 64kB. @@ -572,12 +572,12 @@ static int irq_choose_cpu(unsigned int virt_irq) cpumask_copy(&mask, irq_desc[virt_irq].affinity); if (cpus_equal(mask, CPU_MASK_ALL)) { static int irq_rover; - static DEFINE_SPINLOCK(irq_rover_lock); + static DEFINE_ATOMIC_SPINLOCK(irq_rover_lock); unsigned long flags; /* Round-robin distribution... */ do_round_robin: - spin_lock_irqsave(&irq_rover_lock, flags); + atomic_spin_lock_irqsave(&irq_rover_lock, flags); while (!cpu_online(irq_rover)) { if (++irq_rover >= NR_CPUS) @@ -589,7 +589,7 @@ static int irq_choose_cpu(unsigned int virt_irq) irq_rover = 0; } while (!cpu_online(irq_rover)); - spin_unlock_irqrestore(&irq_rover_lock, flags); + atomic_spin_unlock_irqrestore(&irq_rover_lock, flags); } else { cpumask_t tmp; @@ -1372,14 +1372,14 @@ void __init mpic_set_serial_int(struct mpic *mpic, int enable) unsigned long flags; u32 v; - spin_lock_irqsave(&mpic_lock, flags); + atomic_spin_lock_irqsave(&mpic_lock, flags); v = mpic_read(mpic->gregs, MPIC_GREG_GLOBAL_CONF_1); if (enable) v |= MPIC_GREG_GLOBAL_CONF_1_SIE; else v &= ~MPIC_GREG_GLOBAL_CONF_1_SIE; mpic_write(mpic->gregs, MPIC_GREG_GLOBAL_CONF_1, v); - spin_unlock_irqrestore(&mpic_lock, flags); + atomic_spin_unlock_irqrestore(&mpic_lock, flags); } void mpic_irq_set_priority(unsigned int irq, unsigned int pri) @@ -1392,7 +1392,7 @@ void 
mpic_irq_set_priority(unsigned int irq, unsigned int pri) if (!mpic) return; - spin_lock_irqsave(&mpic_lock, flags); + atomic_spin_lock_irqsave(&mpic_lock, flags); if (mpic_is_ipi(mpic, irq)) { reg = mpic_ipi_read(src - mpic->ipi_vecs[0]) & ~MPIC_VECPRI_PRIORITY_MASK; @@ -1404,7 +1404,7 @@ void mpic_irq_set_priority(unsigned int irq, unsigned int pri) mpic_irq_write(src, MPIC_INFO(IRQ_VECTOR_PRI), reg | (pri << MPIC_VECPRI_PRIORITY_SHIFT)); } - spin_unlock_irqrestore(&mpic_lock, flags); + atomic_spin_unlock_irqrestore(&mpic_lock, flags); } void mpic_setup_this_cpu(void) @@ -1419,7 +1419,7 @@ void mpic_setup_this_cpu(void) DBG("%s: setup_this_cpu(%d)\n", mpic->name, hard_smp_processor_id()); - spin_lock_irqsave(&mpic_lock, flags); + atomic_spin_lock_irqsave(&mpic_lock, flags); /* let the mpic know we want intrs. default affinity is 0xffffffff * until changed via /proc. That's how it's done on x86. If we want @@ -1435,7 +1435,7 @@ void mpic_setup_this_cpu(void) /* Set current processor priority to 0 */ mpic_cpu_write(MPIC_INFO(CPU_CURRENT_TASK_PRI), 0); - spin_unlock_irqrestore(&mpic_lock, flags); + atomic_spin_unlock_irqrestore(&mpic_lock, flags); #endif /* CONFIG_SMP */ } @@ -1464,7 +1464,7 @@ void mpic_teardown_this_cpu(int secondary) BUG_ON(mpic == NULL); DBG("%s: teardown_this_cpu(%d)\n", mpic->name, hard_smp_processor_id()); - spin_lock_irqsave(&mpic_lock, flags); + atomic_spin_lock_irqsave(&mpic_lock, flags); /* let the mpic know we don't want intrs. 
*/ for (i = 0; i < mpic->num_sources ; i++) @@ -1478,7 +1478,7 @@ void mpic_teardown_this_cpu(int secondary) */ mpic_eoi(mpic); - spin_unlock_irqrestore(&mpic_lock, flags); + atomic_spin_unlock_irqrestore(&mpic_lock, flags); } diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/sysdev/uic.c index 466ce9a..2492a4a 100644 --- a/arch/powerpc/sysdev/uic.c +++ b/arch/powerpc/sysdev/uic.c @@ -225,12 +225,12 @@ void uic_irq_cascade(unsigned int virq, struct irq_desc *desc) int src; int subvirq; - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); if (desc->status & IRQ_LEVEL) desc->chip->mask(virq); else desc->chip->mask_ack(virq); - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); msr = mfdcr(uic->dcrbase + UIC_MSR); if (!msr) /* spurious interrupt */ @@ -242,12 +242,12 @@ void uic_irq_cascade(unsigned int virq, struct irq_desc *desc) generic_handle_irq(subvirq); uic_irq_ret: - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); if (desc->status & IRQ_LEVEL) desc->chip->ack(virq); if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) desc->chip->unmask(virq); - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); } static struct uic * __init uic_init_one(struct device_node *node) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index e1f33a8..db7cf0d 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -348,6 +348,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi) unsigned long timeout; #endif + preempt_disable(); local_irq_save(flags); bp = in_breakpoint_table(regs->nip, &offset); @@ -524,6 +525,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi) insert_cpu_bpts(); local_irq_restore(flags); + preempt_enable(); return cmd != 'X' && cmd != EOF; } diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2ae5d72..7238ef4 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -84,7 +84,7 @@ config S390 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST select 
HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FTRACE_SYSCALLS + select HAVE_SYSCALL_TRACEPOINTS select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_GRAPH_TRACER select HAVE_DEFAULT_NO_SPIN_MUTEXES diff --git a/arch/s390/defconfig b/arch/s390/defconfig index fcba206..4e91a25 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -900,7 +900,7 @@ CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y -CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_TRACING_SUPPORT=y CONFIG_FTRACE=y # CONFIG_FUNCTION_TRACER is not set diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index 9d2a179..e70d6dd 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -48,16 +48,21 @@ struct rwsem_waiter; -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *); +extern struct rw_anon_semaphore * +rwsem_down_read_failed(struct rw_anon_semaphore *); +extern struct rw_anon_semaphore * +rwsem_down_write_failed(struct rw_anon_semaphore *); +extern struct rw_anon_semaphore * +rwsem_wake(struct rw_anon_semaphore *); +extern struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *); +extern struct rw_anon_semaphore * +rwsem_downgrade_write(struct rw_anon_semaphore *); /* * the semaphore definition */ -struct rw_semaphore { +struct rw_anon_semaphore { signed long count; spinlock_t wait_lock; struct list_head wait_list; @@ -85,40 +90,40 @@ struct rw_semaphore { */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) , .dep_map 
= { .name = #lockname } #else -# define __RWSEM_DEP_MAP_INIT(lockname) +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ +#define __RWSEM_ANON_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait.lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } + LIST_HEAD_INIT((name).wait_list) __RWSEM_ANON_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) -static inline void init_rwsem(struct rw_semaphore *sem) +static inline void init_anon_rwsem(struct rw_anon_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key); -#define init_rwsem(sem) \ +#define init_anon_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __init_anon_rwsem((sem), #sem, &__key); \ } while (0) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct rw_anon_semaphore *sem) { signed long old, new; @@ -146,7 +151,7 @@ static inline void __down_read(struct rw_semaphore *sem) /* * trylock for reading -- returns 1 if successful, 0 if contention */ -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct rw_anon_semaphore *sem) { signed long old, new; @@ -177,7 +182,8 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void +__down_write_nested(struct rw_anon_semaphore *sem, int subclass) { signed long 
old, new, tmp; @@ -203,7 +209,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_down_write_failed(sem); } -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct rw_anon_semaphore *sem) { __down_write_nested(sem, 0); } @@ -211,7 +217,7 @@ static inline void __down_write(struct rw_semaphore *sem) /* * trylock for writing -- returns 1 if successful, 0 if contention */ -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct rw_anon_semaphore *sem) { signed long old; @@ -239,7 +245,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_anon_semaphore *sem) { signed long old, new; @@ -269,7 +275,7 @@ static inline void __up_read(struct rw_semaphore *sem) /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct rw_anon_semaphore *sem) { signed long old, new, tmp; @@ -299,7 +305,7 @@ static inline void __up_write(struct rw_semaphore *sem) /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct rw_anon_semaphore *sem) { signed long old, new, tmp; @@ -328,7 +334,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(long delta, struct rw_anon_semaphore *sem) { signed long old, new; @@ -354,7 +360,8 @@ static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) /* * implement exchange and add functionality */ -static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) +static inline long +rwsem_atomic_update(long delta, struct rw_anon_semaphore 
*sem) { signed long old, new; @@ -378,10 +385,52 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) return new; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int rwsem_is_locked(struct rw_anon_semaphore *sem) { return (sem->count != 0); } +struct rw_semaphore { /* __init_rwsem() casts this to rw_anon_semaphore: keep layouts in sync */ + signed long count; + spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait.lock), \ + LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void init_rwsem(struct rw_semaphore *sem) +{ + sem->count = RWSEM_UNLOCKED_VALUE; + spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + #endif /* __KERNEL__ */ #endif /* _S390_RWSEM_H */ diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index ba1cab9..07eb61b 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -92,7 +92,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */ #define TIF_SECCOMP 10 /* secure computing */ -#define TIF_SYSCALL_FTRACE 11 /* ftrace syscall instrumentation */ +#define TIF_SYSCALL_TRACEPOINT 11 /* syscall 
tracepoint instrumentation */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ @@ -111,7 +111,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1<<TIF_SECCOMP) -#define _TIF_SYSCALL_FTRACE (1<<TIF_SYSCALL_FTRACE) +#define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT) #define _TIF_USEDFPU (1<<TIF_USEDFPU) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_31BIT (1<<TIF_31BIT) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index c4c80a2..5d40fce 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -54,7 +54,7 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_MCCK_PENDING) _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \ - _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8) + _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8) STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER STACK_SIZE = 1 << STACK_SHIFT diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index f6618e9..3ceb53c 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -57,7 +57,7 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_MCCK_PENDING) _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \ - _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8) + _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8) #define BASED(name) name-system_call(%r13) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 3e298e6..57bdcb1 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -220,6 +220,29 @@ struct syscall_metadata 
*syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } +int syscall_name_to_nr(char *name) +{ + int i; + + if (!syscalls_metadata) + return -1; + for (i = 0; i < NR_syscalls; i++) + if (syscalls_metadata[i]) + if (!strcmp(syscalls_metadata[i]->name, name)) + return i; + return -1; +} + +void set_syscall_enter_id(int num, int id) +{ + syscalls_metadata[num]->enter_id = id; +} + +void set_syscall_exit_id(int num, int id) +{ + syscalls_metadata[num]->exit_id = id; +} + static struct syscall_metadata *find_syscall_meta(unsigned long syscall) { struct syscall_metadata *start; @@ -237,24 +260,19 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall) return NULL; } -void arch_init_ftrace_syscalls(void) +static int __init arch_init_ftrace_syscalls(void) { struct syscall_metadata *meta; int i; - static atomic_t refs; - - if (atomic_inc_return(&refs) != 1) - goto out; syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * NR_syscalls, GFP_KERNEL); if (!syscalls_metadata) - goto out; + return -ENOMEM; for (i = 0; i < NR_syscalls; i++) { meta = find_syscall_meta((unsigned long)sys_call_table[i]); syscalls_metadata[i] = meta; } - return; -out: - atomic_dec(&refs); + return 0; } +arch_initcall(arch_init_ftrace_syscalls); #endif diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 43acd73..f3ddd7a 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -51,6 +51,9 @@ #include "compat_ptrace.h" #endif +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + enum s390_regset { REGSET_GENERAL, REGSET_FP, @@ -661,8 +664,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) ret = -1; } - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_enter(regs); + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->gprs[2]); if (unlikely(current->audit_context)) audit_syscall_entry(is_compat_task() ? 
@@ -679,8 +682,8 @@ asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), regs->gprs[2]); - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_exit(regs); + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->gprs[2]); if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index d4c8e9c..5582ff1 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -272,14 +272,14 @@ void __init time_init(void) * small for /proc/uptime to be accurate. * Reset xtime and wall_to_monotonic to sane values. */ - write_seqlock_irqsave(&xtime_lock, flags); + write_atomic_seqlock_irqsave(&xtime_lock, flags); now = get_clock(); tod_to_timeval(now - TOD_UNIX_EPOCH, &xtime); clocksource_tod.cycle_last = now; clocksource_tod.raw_time = xtime; tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts); set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_atomic_sequnlock_irqrestore(&xtime_lock, flags); /* Enable TOD clock interrupts on the boot cpu. 
*/ init_cpu_timer(); diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h index 1987f3e..ed8c771 100644 --- a/arch/sh/include/asm/rwsem.h +++ b/arch/sh/include/asm/rwsem.h @@ -19,7 +19,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct rw_anon_semaphore { long count; #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 @@ -35,35 +35,38 @@ struct rw_semaphore { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } #else -# define __RWSEM_DEP_MAP_INIT(lockname) +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ +#define __RWSEM_ANON_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) } + __RWSEM_ANON_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore *rwsem_wake(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *sem); -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key); 
-#define init_rwsem(sem) \ +#define init_anon_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __init_anon_rwsem((sem), #sem, &__key); \ } while (0) -static inline void init_rwsem(struct rw_semaphore *sem) +static inline void init_anon_rwsem(struct rw_anon_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); @@ -73,7 +76,7 @@ static inline void init_rwsem(struct rw_semaphore *sem) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct rw_anon_semaphore *sem) { if (atomic_inc_return((atomic_t *)(&sem->count)) > 0) smp_wmb(); @@ -81,7 +84,7 @@ static inline void __down_read(struct rw_semaphore *sem) rwsem_down_read_failed(sem); } -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct rw_anon_semaphore *sem) { int tmp; @@ -98,7 +101,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct rw_anon_semaphore *sem) { int tmp; @@ -110,7 +113,7 @@ static inline void __down_write(struct rw_semaphore *sem) rwsem_down_write_failed(sem); } -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct rw_anon_semaphore *sem) { int tmp; @@ -123,7 +126,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_anon_semaphore *sem) { int tmp; @@ -136,7 +139,7 @@ static inline void __up_read(struct rw_semaphore *sem) /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct rw_anon_semaphore *sem) { smp_wmb(); if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, @@ -147,7 +150,7 @@ static inline void 
__up_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct rw_anon_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } @@ -155,7 +158,7 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct rw_anon_semaphore *sem) { int tmp; @@ -165,7 +168,8 @@ static inline void __downgrade_write(struct rw_semaphore *sem) rwsem_downgrade_wake(sem); } -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void + __down_write_nested(struct rw_anon_semaphore *sem, int subclass) { __down_write(sem); } @@ -173,12 +177,60 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct rw_anon_semaphore *sem) { smp_mb(); return atomic_add_return(delta, (atomic_t *)(&sem->count)); } +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + +struct rw_semaphore { + long count; + spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ + LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void __init_rwsem(struct rw_semaphore *sem, const char 
*name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + +static inline void init_rwsem(struct rw_semaphore *sem) +{ + sem->count = RWSEM_UNLOCKED_VALUE; + spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 3d09062..b17f303 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -1,4 +1,4 @@ -/* +/* * linux/arch/sh/kernel/irq.c * * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar @@ -67,7 +67,7 @@ int show_interrupts(struct seq_file *p, void *v) if (!desc) return 0; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); action = desc->action; @@ -88,7 +88,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); out: - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return 0; } #endif diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h index 1dc129a..0825088 100644 --- a/arch/sparc/include/asm/rwsem.h +++ b/arch/sparc/include/asm/rwsem.h @@ -19,7 +19,7 @@ struct rwsem_waiter; -struct rw_semaphore { +struct rw_anon_semaphore { signed int count; spinlock_t wait_lock; struct list_head wait_list; @@ -29,51 +29,92 @@ struct rw_semaphore { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } #else -# define __RWSEM_DEP_MAP_INIT(lockname) +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ +#define __RWSEM_ANON_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE,
SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) } + __RWSEM_ANON_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key); -#define init_rwsem(sem) \ +#define init_anon_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __init_anon_rwsem((sem), #sem, &__key); \ } while (0) -extern void __down_read(struct rw_semaphore *sem); -extern int __down_read_trylock(struct rw_semaphore *sem); -extern void __down_write(struct rw_semaphore *sem); -extern int __down_write_trylock(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); -extern void __up_write(struct rw_semaphore *sem); -extern void __downgrade_write(struct rw_semaphore *sem); +extern void __down_read(struct rw_anon_semaphore *sem); +extern int __down_read_trylock(struct rw_anon_semaphore *sem); +extern void __down_write(struct rw_anon_semaphore *sem); +extern int __down_write_trylock(struct rw_anon_semaphore *sem); +extern void __up_read(struct rw_anon_semaphore *sem); +extern void __up_write(struct rw_anon_semaphore *sem); +extern void __downgrade_write(struct rw_anon_semaphore *sem); -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void +__down_write_nested(struct rw_anon_semaphore *sem, int subclass) { __down_write(sem); } -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct rw_anon_semaphore *sem) { return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline void rwsem_atomic_add(int delta, struct 
rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct rw_anon_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + +struct rw_semaphore { + signed int count; + spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_INITIALIZER(name) \ +{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 8daab33..c72b1d5 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -176,7 +176,7 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto skip; @@ -195,7 +195,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { seq_printf(p, "NMI: "); for_each_online_cpu(j) @@ -785,14 +785,14 @@ void fixup_irqs(void) for (irq = 0; irq < NR_IRQS; irq++) {
unsigned long flags; - spin_lock_irqsave(&irq_desc[irq].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[irq].lock, flags); if (irq_desc[irq].action && !(irq_desc[irq].status & IRQ_PER_CPU)) { if (irq_desc[irq].chip->set_affinity) irq_desc[irq].chip->set_affinity(irq, irq_desc[irq].affinity); } - spin_unlock_irqrestore(&irq_desc[irq].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[irq].lock, flags); } tick_ops->disable_irq(); diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c index 85e7037..bea30b2 100644 --- a/arch/sparc/kernel/pcic.c +++ b/arch/sparc/kernel/pcic.c @@ -703,10 +703,10 @@ static void pcic_clear_clock_irq(void) static irqreturn_t pcic_timer_handler (int irq, void *h) { - write_seqlock(&xtime_lock); /* Dummy, to show that we remember */ + write_atomic_seqlock(&xtime_lock); /* Dummy, to show that we remember */ pcic_clear_clock_irq(); do_timer(1); - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); #endif @@ -766,7 +766,7 @@ static void pci_do_gettimeofday(struct timeval *tv) unsigned long max_ntp_tick = tick_usec - tickadj; do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); + seq = read_atomic_seqbegin_irqsave(&xtime_lock, flags); usec = do_gettimeoffset(); /* @@ -779,7 +779,7 @@ static void pci_do_gettimeofday(struct timeval *tv) sec = xtime.tv_sec; usec += (xtime.tv_nsec / 1000); - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); + } while (read_atomic_seqretry_irqrestore(&xtime_lock, seq, flags)); while (usec >= 1000000) { usec -= 1000000; diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index 614ac7b..6530942 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -93,7 +93,7 @@ static irqreturn_t timer_interrupt(int dummy, void *dev_id) #endif /* Protect counter clear so that do_gettimeoffset works */ - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); 
clear_clock_irq(); @@ -109,7 +109,7 @@ static irqreturn_t timer_interrupt(int dummy, void *dev_id) else last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */ } - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); @@ -251,7 +251,7 @@ void do_gettimeofday(struct timeval *tv) unsigned long max_ntp_tick = tick_usec - tickadj; do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); + seq = read_atomic_seqbegin_irqsave(&xtime_lock, flags); usec = do_gettimeoffset(); /* @@ -264,7 +264,7 @@ void do_gettimeofday(struct timeval *tv) sec = xtime.tv_sec; usec += (xtime.tv_nsec / 1000); - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); + } while (read_atomic_seqretry_irqrestore(&xtime_lock, seq, flags)); while (usec >= 1000000) { usec -= 1000000; @@ -281,9 +281,9 @@ int do_settimeofday(struct timespec *tv) { int ret; - write_seqlock_irq(&xtime_lock); + write_atomic_seqlock_irq(&xtime_lock); ret = bus_do_settimeofday(tv); - write_sequnlock_irq(&xtime_lock); + write_atomic_sequnlock_irq(&xtime_lock); clock_was_set(); return ret; } diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 7916feb..a9f414c 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -34,7 +34,7 @@ void *kmap_atomic(struct page *page, enum km_type type) unsigned long idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -73,6 +73,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) if (vaddr < FIXADDR_START) { // FIXME pagefault_enable(); + preempt_enable(); return; } @@ -99,6 +100,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) #endif pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(kunmap_atomic); diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 454cdb4..048ca4a 100644 --- a/arch/um/kernel/irq.c +++ 
b/arch/um/kernel/irq.c @@ -33,7 +33,7 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto skip; @@ -52,7 +52,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) seq_putc(p, '\n'); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 13ffa5d..bc24095 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -38,7 +38,7 @@ config X86 select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE - select HAVE_FTRACE_SYSCALLS + select HAVE_SYSCALL_TRACEPOINTS select HAVE_KVM select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK @@ -123,10 +123,18 @@ config ARCH_MAY_HAVE_PC_FDC def_bool y config RWSEM_GENERIC_SPINLOCK - def_bool !X86_XADD + bool + depends on !X86_XADD || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool + default y config RWSEM_XCHGADD_ALGORITHM - def_bool X86_XADD + bool + depends on X86_XADD && !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + default y config ARCH_HAS_CPU_IDLE_WAIT def_bool y @@ -273,6 +281,7 @@ config X86_X2APIC config SPARSE_IRQ bool "Support sparse irq numbering" depends on PCI_MSI || HT_IRQ + depends on !PREEMPT_RT ---help--- This enables support for sparse irqs. 
This is useful for distro kernels that want to define a high CONFIG_NR_CPUS value but still @@ -672,7 +681,7 @@ config IOMMU_API config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" - depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL + depends on 0 && X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL select CPUMASK_OFFSTACK default n ---help--- @@ -1894,7 +1903,7 @@ config PCI_MMCONFIG config DMAR bool "Support for DMA Remapping Devices (EXPERIMENTAL)" - depends on PCI_MSI && ACPI && EXPERIMENTAL + depends on PCI_MSI && ACPI && EXPERIMENTAL && !PREEMPT_RT help DMA remapping (DMAR) devices support enables independent address translations for Direct Memory Access (DMA) from devices. @@ -1937,6 +1946,7 @@ config DMAR_FLOPPY_WA config INTR_REMAP bool "Support for Interrupt Remapping (EXPERIMENTAL)" depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL + depends on !PREEMPT_RT ---help--- Supports Interrupt remapping for IO-APIC and MSI devices. To use x2apic mode in the CPU's which support x2APIC enhancements or diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d105f29..3347029 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -76,6 +76,7 @@ config DEBUG_PER_CPU_MAPS bool "Debug access to per_cpu maps" depends on DEBUG_KERNEL depends on SMP + depends on !PREEMPT_RT default n ---help--- Say Y to verify that the per_cpu map being accessed has @@ -126,6 +127,7 @@ config DEBUG_NX_TEST config 4KSTACKS bool "Use 4Kb for kernel stacks instead of 8Kb" depends on X86_32 + default y ---help--- If you say Y here the kernel will use a 4Kb stacksize for the kernel stack attached to each process/thread. 
This facilitates diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index edb992e..d28fad1 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -2355,7 +2355,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y -CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y CONFIG_TRACING_SUPPORT=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index cee1dd2..6c86acd 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -2329,7 +2329,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y -CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y CONFIG_TRACING_SUPPORT=y diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 20d1465..b87dff9 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -50,8 +50,8 @@ #define ACPI_ASM_MACROS #define BREAKPOINT3 -#define ACPI_DISABLE_IRQS() local_irq_disable() -#define ACPI_ENABLE_IRQS() local_irq_enable() +#define ACPI_DISABLE_IRQS() local_irq_disable_nort() +#define ACPI_ENABLE_IRQS() local_irq_enable_nort() #define ACPI_FLUSH_CPU_CACHE() wbinvd() int __acpi_acquire_global_lock(unsigned int *lock); diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index dc5a667..166704f 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -186,10 +186,10 @@ static inline int atomic_add_return(int i, atomic_t *v) #ifdef CONFIG_M386 no_xadd: /* Legacy 386 processor */ - local_irq_save(flags); + raw_local_irq_save(flags); __i = atomic_read(v); atomic_set(v, i + __i); - local_irq_restore(flags); + raw_local_irq_restore(flags); return i + __i; #endif 
} diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index bd2c651..db24c22 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -28,13 +28,6 @@ #endif -/* FIXME: I don't want to stay hardcoded */ -#ifdef CONFIG_X86_64 -# define FTRACE_SYSCALL_MAX 296 -#else -# define FTRACE_SYSCALL_MAX 333 -#endif - #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 014c2b8..3a53e85 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,6 +58,17 @@ extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); void *kmap(struct page *page); +extern void kunmap_virt(void *ptr); +extern struct page *kmap_to_page(void *ptr); +void kunmap(struct page *page); + +void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); +void *__kmap_atomic(struct page *page, enum km_type type); +void *__kmap_atomic_direct(struct page *page, enum km_type type); +void __kunmap_atomic(void *kvaddr, enum km_type type); +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type); +struct page *__kmap_atomic_to_page(void *ptr); + void kunmap(struct page *page); void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); void *kmap_atomic(struct page *page, enum km_type type); @@ -67,7 +78,8 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); #ifndef CONFIG_PARAVIRT -#define kmap_atomic_pte(page, type) kmap_atomic(page, type) +#define kmap_atomic_pte(page, type) kmap_atomic(page, type) +#define kmap_atomic_pte_direct(page, type) kmap_atomic_direct(page, type) #endif #define flush_cache_kmaps() do { } while (0) @@ -75,6 +87,27 @@ struct page *kmap_atomic_to_page(void *ptr); extern void add_highpages_with_active_regions(int nid, unsigned 
long start_pfn, unsigned long end_pfn); +/* + * on PREEMPT_RT kmap_atomic() is a wrapper that uses kmap(): + */ +#ifdef CONFIG_PREEMPT_RT +# define kmap_atomic_prot(page, type, prot) ({ pagefault_disable(); kmap(page); }) +# define kmap_atomic(page, type) ({ pagefault_disable(); kmap(page); }) +# define kmap_atomic_pfn(pfn, type) kmap(pfn_to_page(pfn)) +# define kunmap_atomic(kvaddr, type) do { pagefault_enable(); kunmap_virt(kvaddr); } while(0) +# define kmap_atomic_to_page(kvaddr) kmap_to_page(kvaddr) +# define kmap_atomic_direct(page, type) __kmap_atomic_direct(page, type) +# define kunmap_atomic_direct(kvaddr, type) __kunmap_atomic(kvaddr, type) +#else +# define kmap_atomic_prot(page, type, prot) __kmap_atomic_prot(page, type, prot) +# define kmap_atomic(page, type) __kmap_atomic(page, type) +# define kmap_atomic_pfn(pfn, type) __kmap_atomic_pfn(pfn, type) +# define kunmap_atomic(kvaddr, type) __kunmap_atomic(kvaddr, type) +# define kmap_atomic_to_page(kvaddr) __kmap_atomic_to_page(kvaddr) +# define kmap_atomic_direct(page, type) __kmap_atomic(page, type) +# define kunmap_atomic_direct(kvaddr, type) __kunmap_atomic(kvaddr, type) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_HIGHMEM_H */ diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h index 1edbf89..a2bd5b5 100644 --- a/arch/x86/include/asm/i8253.h +++ b/arch/x86/include/asm/i8253.h @@ -6,7 +6,7 @@ #define PIT_CH0 0x40 #define PIT_CH2 0x42 -extern spinlock_t i8253_lock; +extern atomic_spinlock_t i8253_lock; extern struct clock_event_device *global_clock_event; diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 58d7091..4720126 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -24,7 +24,7 @@ extern unsigned int cached_irq_mask; #define SLAVE_ICW4_DEFAULT 0x01 #define PIC_ICW4_AEOI 2 -extern spinlock_t i8259A_lock; +extern atomic_spinlock_t i8259A_lock; extern void init_8259A(int auto_eoi); extern void 
enable_8259A_irq(unsigned int irq); diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index ad2668e..6d8723a 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -65,6 +65,8 @@ 6: osp nopl 0x00(%eax,%eax,1) 7: nopl 0x00000000(%eax) 8: nopl 0x00000000(%eax,%eax,1) + Note: All the above are assumed to be a single instruction. + There is kernel code that depends on this. */ #define P6_NOP1 GENERIC_NOP1 #define P6_NOP2 ".byte 0x66,0x90\n" diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 7639dbf..0ec050a 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,12 +14,21 @@ #define IRQ_STACK_ORDER 2 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#ifdef CONFIG_PREEMPT_RT +# define STACKFAULT_STACK 0 +# define DOUBLEFAULT_STACK 1 +# define NMI_STACK 2 +# define DEBUG_STACK 0 +# define MCE_STACK 3 +# define N_EXCEPTION_STACKS 3 /* hw limit: 7 */ +#else +# define STACKFAULT_STACK 1 +# define DOUBLEFAULT_STACK 2 +# define NMI_STACK 3 +# define DEBUG_STACK 4 +# define MCE_STACK 5 +# define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#endif #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4fb37c8..513b09e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -340,6 +340,7 @@ struct pv_mmu_ops { #ifdef CONFIG_HIGHPTE void *(*kmap_atomic_pte)(struct page *page, enum km_type type); + void *(*kmap_atomic_pte_direct)(struct page *page, enum km_type type); #endif struct pv_lazy_ops lazy_mode; @@ -1136,6 +1137,14 @@ static inline void *kmap_atomic_pte(struct page *page, enum km_type type) ret = PVOP_CALL2(unsigned long, 
pv_mmu_ops.kmap_atomic_pte, page, type); return (void *)ret; } + +static inline void *kmap_atomic_pte_direct(struct page *page, enum km_type type) +{ + unsigned long ret; + ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte_direct, + page, type); + return (void *)ret; +} #endif static inline void pte_update(struct mm_struct *mm, unsigned long addr, diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b399988..a2baf26 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -83,7 +83,7 @@ struct irq_routing_table { extern unsigned int pcibios_irq_mask; extern int pcibios_scanned; -extern spinlock_t pci_config_lock; +extern atomic_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 177b016..0e989a1 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -71,6 +71,7 @@ static inline void pud_clear(pud_t *pudp) { unsigned long pgd; + preempt_disable(); set_pud(pudp, __pud(0)); /* @@ -86,6 +87,7 @@ static inline void pud_clear(pud_t *pudp) if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) write_cr3(pgd); + preempt_enable(); } #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 01fd946..b323c3d 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -59,14 +59,20 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); #define pte_offset_map_nested(dir, address) \ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ pte_index((address))) +#define pte_offset_map_direct(dir, address) \ + ((pte_t *)kmap_atomic_pte_direct(pmd_page(*(dir)), __KM_PTE) + \ + pte_index((address))) #define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) #define pte_unmap_nested(pte) 
kunmap_atomic((pte), KM_PTE1) +#define pte_unmap_direct(pte) kunmap_atomic_direct((pte), __KM_PTE) #else #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) #define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) +#define pte_offset_map_direct(dir, address) pte_offset_map((dir), (address)) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) +#define pte_unmap_direct(pte) do { } while (0) #endif /* Clear a kernel PTE and flush it from the TLB */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index c57a301..efc01ae 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -126,8 +126,10 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* x86-64 always has all page tables mapped. */ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) -#define pte_unmap(pte) /* NOP */ -#define pte_unmap_nested(pte) /* NOP */ +#define pte_offset_map_direct(dir, address) pte_offset_kernel((dir), (address)) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) +#define pte_unmap_direct(pte) do { } while (0) #define update_mmu_cache(vma, address, pte) do { } while (0) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index ca7517d..92d67a6 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -44,14 +44,14 @@ struct rwsem_waiter; -extern asmregparm struct rw_semaphore * - rwsem_down_read_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore * - rwsem_down_write_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore * - rwsem_wake(struct rw_semaphore *); -extern asmregparm struct rw_semaphore * - rwsem_downgrade_wake(struct rw_semaphore *sem); +extern asmregparm struct rw_anon_semaphore * + 
rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern asmregparm struct rw_anon_semaphore * + rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern asmregparm struct rw_anon_semaphore * + rwsem_wake(struct rw_anon_semaphore *); +extern asmregparm struct rw_anon_semaphore * + rwsem_downgrade_wake(struct rw_anon_semaphore *sem); /* * the semaphore definition @@ -64,7 +64,7 @@ extern asmregparm struct rw_semaphore * #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -struct rw_semaphore { +struct rw_anon_semaphore { signed long count; spinlock_t wait_lock; struct list_head wait_list; @@ -74,35 +74,34 @@ struct rw_semaphore { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } #else -# define __RWSEM_DEP_MAP_INIT(lockname) +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) #endif - -#define __RWSEM_INITIALIZER(name) \ +#define __RWSEM_ANON_INITIALIZER(name) \ { \ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \ } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key); -#define init_rwsem(sem) \ +#define init_anon_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __init_anon_rwsem((sem), #sem, &__key); \ } while (0) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct rw_anon_semaphore *sem) { asm volatile("# beginning 
down_read\n\t" LOCK_PREFIX " incl (%%eax)\n\t" @@ -119,7 +118,7 @@ static inline void __down_read(struct rw_semaphore *sem) /* * trylock for reading -- returns 1 if successful, 0 if contention */ -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct rw_anon_semaphore *sem) { __s32 result, tmp; asm volatile("# beginning __down_read_trylock\n\t" @@ -141,7 +140,8 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void +__down_write_nested(struct rw_anon_semaphore *sem, int subclass) { int tmp; @@ -160,7 +160,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) : "memory", "cc"); } -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct rw_anon_semaphore *sem) { __down_write_nested(sem, 0); } @@ -168,7 +168,7 @@ static inline void __down_write(struct rw_semaphore *sem) /* * trylock for writing -- returns 1 if successful, 0 if contention */ -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct rw_anon_semaphore *sem) { signed long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, @@ -181,7 +181,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_anon_semaphore *sem) { __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; asm volatile("# beginning __up_read\n\t" @@ -199,7 +199,7 @@ static inline void __up_read(struct rw_semaphore *sem) /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct rw_anon_semaphore *sem) { asm volatile("# beginning __up_write\n\t" " movl %2,%%edx\n\t" @@ -218,7 +218,7 @@ static inline void __up_write(struct rw_semaphore *sem) /* * 
downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct rw_anon_semaphore *sem) { asm volatile("# beginning __downgrade_write\n\t" LOCK_PREFIX " addl %2,(%%eax)\n\t" @@ -235,7 +235,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct rw_anon_semaphore *sem) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (sem->count) @@ -245,7 +245,7 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct rw_anon_semaphore *sem) { int tmp = delta; @@ -256,10 +256,54 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) return tmp + delta; } +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + +#ifndef CONFIG_PREEMPT_RT + +struct rw_semaphore { + signed long count; + spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_INITIALIZER(name) \ +{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ 
+ \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + + static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); } +#endif #endif /* __KERNEL__ */ #endif /* _ASM_X86_RWSEM_H */ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 4e77853..135097c 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -298,9 +298,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) #define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) #define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) -#define _raw_spin_relax(lock) cpu_relax() -#define _raw_read_relax(lock) cpu_relax() -#define _raw_write_relax(lock) cpu_relax() +#define __raw_spin_relax(lock) cpu_relax() +#define __raw_read_relax(lock) cpu_relax() +#define __raw_write_relax(lock) cpu_relax() /* The {read|write|spin}_lock() on x86 are full memory barriers. */ static inline void smp_mb__after_lock(void) { } diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index fad7d40..6f7786a 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -95,7 +95,7 @@ struct thread_info { #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */ +#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -118,17 +118,17 @@ struct thread_info { #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) +#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) /* work to do 
in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \ - _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ + _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_FTRACE) + _TIF_SYSCALL_TRACEPOINT) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ @@ -137,7 +137,8 @@ struct thread_info { _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) /* work to do on any return to user space */ -#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE) +#define _TIF_ALLWORK_MASK \ + ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT) /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 20ca9c4..1c4277a 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -62,9 +62,9 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) unsigned long long ns; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); ns = __cycles_2_ns(cyc); - local_irq_restore(flags); + raw_local_irq_restore(flags); return ns; } diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 7f3eba0..1c77e81 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -7,6 +7,21 @@ #include <asm/processor.h> #include <asm/system.h> +/* + * TLB-flush needs to be nonpreemptible on PREEMPT_RT due to the + * following complex race scenario: + * + * if the current task is lazy-TLB and does a TLB flush and + * gets preempted after the movl %%cr3, %0 but before the + * movl %0, %%cr3 then its ->active_mm might change and it will + * install the wrong cr3 when it switches back.
This is not a + * problem for the lazy-TLB task itself, but if the next task it + * switches to has an ->mm that is also the lazy-TLB task's + * new ->active_mm, then the scheduler will assume that cr3 is + * the new one, while we overwrote it with the old one. The result + * is the wrong cr3 in the new (non-lazy-TLB) task, which typically + * causes an infinite pagefault upon the next userspace access. + */ #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -17,7 +32,9 @@ static inline void __native_flush_tlb(void) { + preempt_disable(); native_write_cr3(native_read_cr3()); + preempt_enable(); } static inline void __native_flush_tlb_global(void) @@ -95,6 +112,13 @@ static inline void __flush_tlb_one(unsigned long addr) static inline void flush_tlb_mm(struct mm_struct *mm) { + /* + * This is safe on PREEMPT_RT because if we preempt + * right after the check but before the __flush_tlb(), + * and if ->active_mm changes, then we might miss a + * TLB flush, but that TLB flush happened already when + * ->active_mm was changed: + */ if (mm == current->active_mm) __flush_tlb(); } diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 732a307..8deaada 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -345,6 +345,8 @@ #ifdef __KERNEL__ +#define NR_syscalls 337 + #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_OLD_STAT diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 900e161..b9f3c60 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -688,6 +688,12 @@ __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) #endif /* __NO_STUBS */ #ifdef __KERNEL__ + +#ifndef COMPILE_OFFSETS +#include <asm/asm-offsets.h> +#define NR_syscalls (__NR_syscall_max + 1) +#endif + /* * "Conditional" syscalls * diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index dc27a69..c536000 
100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -5,7 +5,7 @@ #include <linux/clocksource.h> struct vsyscall_gtod_data { - seqlock_t lock; + atomic_seqlock_t lock; /* open coded 'struct timespec' */ time_t wall_time_sec; diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index 133b40a..7a6aa68 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -865,7 +865,21 @@ static struct xor_block_template xor_block_pIII_sse = { #include <asm-generic/xor.h> #undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ +/* + * MMX/SSE ops disable preemption for long periods of time, + * so on PREEMPT_RT use the register-based ops only: + */ +#ifdef CONFIG_PREEMPT_RT +# define XOR_TRY_TEMPLATES \ + do { \ + xor_speed(&xor_block_8regs); \ + xor_speed(&xor_block_8regs_p); \ + xor_speed(&xor_block_32regs); \ + xor_speed(&xor_block_32regs_p); \ + } while (0) +# define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST) +#else +# define XOR_TRY_TEMPLATES \ do { \ xor_speed(&xor_block_8regs); \ xor_speed(&xor_block_8regs_p); \ @@ -882,7 +896,8 @@ do { \ /* We force the use of the SSE xor block because it can write around L2. We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. */ -#define XOR_SELECT_TEMPLATE(FASTEST) \ +# define XOR_SELECT_TEMPLATE(FASTEST) \ (cpu_has_xmm ? 
&xor_block_pIII_sse : FASTEST) +#endif /* CONFIG_PREEMPT_RT */ #endif /* _ASM_X86_XOR_32_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d2ed6c5..a3bf1db 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -73,8 +73,8 @@ */ int sis_apic_bug = -1; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_ATOMIC_SPINLOCK(ioapic_lock); +static DEFINE_ATOMIC_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -413,7 +413,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) struct irq_pin_list *entry; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); entry = cfg->irq_2_pin; for (;;) { unsigned int reg; @@ -425,14 +425,14 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ if (reg & IO_APIC_REDIR_REMOTE_IRR) { - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); return true; } if (!entry->next) break; entry = entry->next; } - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); return false; } @@ -446,10 +446,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { union entry_union eu; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); return eu.entry; } @@ -472,9 +472,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); 
__ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -487,10 +487,10 @@ static void ioapic_mask_entry(int apic, int pin) unsigned long flags; union entry_union eu = { .entry.mask = 1 }; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -622,9 +622,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) BUG_ON(!cfg); - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); __mask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); } static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) @@ -632,9 +632,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) struct irq_cfg *cfg = desc->chip_data; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); __unmask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); } static void mask_IO_APIC_irq(unsigned int irq) @@ -1158,12 +1158,12 @@ void lock_vector_lock(void) /* Used to the online set of cpus does not change * during assign_irq_vector. 
*/ - spin_lock(&vector_lock); + atomic_spin_lock(&vector_lock); } void unlock_vector_lock(void) { - spin_unlock(&vector_lock); + atomic_spin_unlock(&vector_lock); } static int @@ -1251,9 +1251,9 @@ assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int err; unsigned long flags; - spin_lock_irqsave(&vector_lock, flags); + atomic_spin_lock_irqsave(&vector_lock, flags); err = __assign_irq_vector(irq, cfg, mask); - spin_unlock_irqrestore(&vector_lock, flags); + atomic_spin_unlock_irqrestore(&vector_lock, flags); return err; } @@ -1623,14 +1623,14 @@ __apicdebuginit(void) print_IO_APIC(void) for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); if (reg_01.bits.version >= 0x20) reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); @@ -1856,7 +1856,7 @@ __apicdebuginit(void) print_PIC(void) printk(KERN_DEBUG "\nprinting PIC contents\n"); - spin_lock_irqsave(&i8259A_lock, flags); + atomic_spin_lock_irqsave(&i8259A_lock, flags); v = inb(0xa1) << 8 | inb(0x21); printk(KERN_DEBUG "... PIC IMR: %04x\n", v); @@ -1870,7 +1870,7 @@ __apicdebuginit(void) print_PIC(void) outb(0x0a,0xa0); outb(0x0a,0x20); - spin_unlock_irqrestore(&i8259A_lock, flags); + atomic_spin_unlock_irqrestore(&i8259A_lock, flags); printk(KERN_DEBUG "... 
PIC ISR: %04x\n", v); @@ -1909,9 +1909,9 @@ void __init enable_IO_APIC(void) * The number of IO-APIC IRQ registers (== #pins): */ for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } for(apic = 0; apic < nr_ioapics; apic++) { @@ -2045,9 +2045,9 @@ static void __init setup_ioapic_ids_from_mpc(void) for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); old_id = mp_ioapics[apic_id].apicid; @@ -2106,16 +2106,16 @@ static void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic_id].apicid); reg_00.bits.ID = mp_ioapics[apic_id].apicid; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic_id, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check */ - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) printk("could not set ID!\n"); else @@ -2164,8 +2164,10 @@ static int __init timer_irq_works(void) */ /* jiffies wrap? 
*/ - if (time_after(jiffies, t1 + 4)) + if (time_after(jiffies, t1 + 4) && + time_before(jiffies, t1 + 16)) return 1; + return 0; } @@ -2198,7 +2200,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) unsigned long flags; struct irq_cfg *cfg; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); if (irq < NR_IRQS_LEGACY) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) @@ -2206,7 +2208,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) } cfg = irq_cfg(irq); __unmask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; } @@ -2218,9 +2220,9 @@ static int ioapic_retrigger_irq(unsigned int irq) struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; - spin_lock_irqsave(&vector_lock, flags); + atomic_spin_lock_irqsave(&vector_lock, flags); apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); - spin_unlock_irqrestore(&vector_lock, flags); + atomic_spin_unlock_irqrestore(&vector_lock, flags); return 1; } @@ -2333,7 +2335,7 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) irq = desc->irq; cfg = desc->chip_data; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); dest = set_desc_affinity(desc, mask); if (dest != BAD_APICID) { /* Only the high 8 bits are valid. 
*/ @@ -2341,7 +2343,7 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) __target_IO_APIC_irq(irq, dest, cfg); ret = 0; } - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; } @@ -2454,7 +2456,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) continue; cfg = irq_cfg(irq); - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); if (!cfg->move_cleanup_count) goto unlock; @@ -2476,7 +2478,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) __get_cpu_var(vector_irq)[vector] = -1; cfg->move_cleanup_count--; unlock: - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); } irq_exit(); @@ -2526,7 +2528,8 @@ static void ack_apic_level(unsigned int irq) irq_complete_move(&desc); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(desc->status & IRQ_MOVE_PENDING)) { + if (unlikely(desc->status & IRQ_MOVE_PENDING) && + !(desc->status & IRQ_INPROGRESS)) { do_unmask_irq = 1; mask_IO_APIC_irq_desc(desc); } @@ -2597,14 +2600,23 @@ static void ack_apic_level(unsigned int irq) move_masked_irq(irq); unmask_IO_APIC_irq_desc(desc); } +#if (defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)) && \ + defined(CONFIG_PREEMPT_HARDIRQS) + /* + * With threaded interrupts, we always have IRQ_INPROGRESS + * when acking. 
+ */ + else if (unlikely(desc->status & IRQ_MOVE_PENDING)) + move_masked_irq(irq); +#endif #ifdef CONFIG_X86_32 if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); + atomic_spin_lock(&ioapic_lock); __mask_and_edge_IO_APIC_irq(cfg); __unmask_and_level_IO_APIC_irq(cfg); - spin_unlock(&ioapic_lock); + atomic_spin_unlock(&ioapic_lock); } #endif } @@ -2638,9 +2650,9 @@ eoi_ioapic_irq(struct irq_desc *desc) irq = desc->irq; cfg = desc->chip_data; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); } static void ir_ack_apic_edge(unsigned int irq) @@ -3116,13 +3128,13 @@ static int ioapic_resume(struct sys_device *dev) data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(dev->id, 0); if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { reg_00.bits.ID = mp_ioapics[dev->id].apicid; io_apic_write(dev->id, 0, reg_00.raw); } - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); for (i = 0; i < nr_ioapic_registers[dev->id]; i++) ioapic_write_entry(dev->id, i, entry[i]); @@ -3186,7 +3198,6 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) if (irq_want < nr_irqs_gsi) irq_want = nr_irqs_gsi; - spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { @@ -3198,13 +3209,14 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) if (cfg_new->vector != 0) continue; + atomic_spin_lock_irqsave(&vector_lock, flags); desc_new = move_irq_desc(desc_new, node); if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; + atomic_spin_unlock_irqrestore(&vector_lock, flags); break; } - 
spin_unlock_irqrestore(&vector_lock, flags); if (irq > 0) { dynamic_irq_init(irq); @@ -3245,9 +3257,9 @@ void destroy_irq(unsigned int irq) desc->chip_data = cfg; free_irte(irq); - spin_lock_irqsave(&vector_lock, flags); + atomic_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); - spin_unlock_irqrestore(&vector_lock, flags); + atomic_spin_unlock_irqrestore(&vector_lock, flags); } /* @@ -3775,10 +3787,10 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, if (err != 0) return err; - spin_lock_irqsave(&vector_lock, flags); + atomic_spin_lock_irqsave(&vector_lock, flags); set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, irq_name); - spin_unlock_irqrestore(&vector_lock, flags); + atomic_spin_unlock_irqrestore(&vector_lock, flags); mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; @@ -3825,9 +3837,9 @@ int __init io_apic_get_redir_entries (int ioapic) union IO_APIC_reg_01 reg_01; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.entries; } @@ -3968,9 +3980,9 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) if (physids_empty(apic_id_map)) apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); if (apic_id >= get_physical_broadcast()) { printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " @@ -4004,10 +4016,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); 
io_apic_write(ioapic, 0, reg_00.raw); reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); /* Sanity check */ if (reg_00.bits.ID != apic_id) { @@ -4028,9 +4040,9 @@ int __init io_apic_get_version(int ioapic) union IO_APIC_reg_01 reg_01; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + atomic_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + atomic_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.version; } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 6ef00ba..c83acda 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -106,7 +106,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long flags; - if (WARN_ONCE(!mask, "empty IPI mask")) + if (!mask) return; local_irq_save(flags); diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b4..f6755e5 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -90,7 +90,9 @@ static inline unsigned int get_timer_irqs(int cpu) */ static __init void nmi_cpu_busy(void *data) { +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* * Intentionally don't use cpu_relax here. This is * to make sure that the performance counter really ticks, @@ -416,12 +418,12 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) /* We can be called before check_nmi_watchdog, hence NULL check. 
*/ if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + static DEFINE_ATOMIC_SPINLOCK(lock); /* Serialise the printks */ - spin_lock(&lock); + atomic_spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); dump_stack(); - spin_unlock(&lock); + atomic_spin_unlock(&lock); cpumask_clear_cpu(cpu, backtrace_mask); } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 442b550..ac6d430 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1216,7 +1216,7 @@ static void reinit_timer(void) #ifdef INIT_TIMER_AFTER_SUSPEND unsigned long flags; - spin_lock_irqsave(&i8253_lock, flags); + atomic_spin_lock_irqsave(&i8253_lock, flags); /* set the clock to HZ */ outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ udelay(10); @@ -1224,7 +1224,7 @@ static void reinit_timer(void) udelay(10); outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ udelay(10); - spin_unlock_irqrestore(&i8253_lock, flags); + atomic_spin_unlock_irqrestore(&i8253_lock, flags); #endif } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 898ecc4..4a6aeed 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -3,6 +3,7 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS #include <linux/crypto.h> #include <linux/sched.h> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e338b5c..8ac5975 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1004,7 +1004,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) = -1; */ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, +#if DEBUG_STACK > 0 [DEBUG_STACK - 1] = DEBUG_STKSZ +#endif }; static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0543f69..d486197 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -549,7 +549,7 @@ static unsigned long set_mtrr_state(void) static unsigned long cr4 = 0; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_ATOMIC_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts - they @@ -566,7 +566,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) for this CPU while the MTRRs are changed, but changing this requires more invasive changes to the way the kernel boots */ - spin_lock(&set_atomicity_lock); + atomic_spin_lock(&set_atomicity_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; @@ -603,7 +603,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if ( cpu_has_pge ) write_cr4(cr4); - spin_unlock(&set_atomicity_lock); + atomic_spin_unlock(&set_atomicity_lock); } static void generic_set_all(void) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index bca5fba..ffb9886 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -99,6 +99,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, } +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_EVENT_TRACE) +extern unsigned long worst_stack_left; +#else +# define worst_stack_left -1L +#endif + void show_registers(struct pt_regs *regs) { int i; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 54b0a32..5a11d2a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -21,10 +21,14 @@ static char x86_stack_ids[][8] = { +#if DEBUG_STACK > 0 
[DEBUG_STACK - 1] = "#DB", +#endif [NMI_STACK - 1] = "NMI", [DOUBLEFAULT_STACK - 1] = "#DF", +#if STACKFAULT_STACK > 0 [STACKFAULT_STACK - 1] = "#SS", +#endif [MCE_STACK - 1] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ [N_EXCEPTION_STACKS ... diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 335f049..8b60080 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -59,7 +59,7 @@ static void early_vga_write(struct console *con, const char *str, unsigned n) static struct console early_vga_console = { .name = "earlyvga", .write = early_vga_write, - .flags = CON_PRINTBUFFER, + .flags = CON_PRINTBUFFER | CON_ATOMIC, .index = -1, }; @@ -156,7 +156,7 @@ static __init void early_serial_init(char *s) static struct console early_serial_console = { .name = "earlyser", .write = early_serial_write, - .flags = CON_PRINTBUFFER, + .flags = CON_PRINTBUFFER | CON_ATOMIC, .index = -1, }; @@ -881,7 +881,7 @@ static int __initdata early_console_initialized; asmlinkage void early_printk(const char *fmt, ...) { - char buf[512]; + static char buf[512]; int n; va_list ap; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c097e7d..f86fc3b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -371,13 +371,13 @@ END(ret_from_exception) ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? - jnz restore_all + jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? 
- jz restore_all + jz restore_nocheck call preempt_schedule_irq jmp need_resched END(resume_kernel) @@ -627,12 +627,9 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: - call schedule + call __schedule LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c251be7..d59fe32 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller) END(ftrace_graph_caller) GLOBAL(return_to_handler) - subq $80, %rsp + subq $24, %rsp /* Save the return values */ movq %rax, (%rsp) @@ -155,10 +155,10 @@ GLOBAL(return_to_handler) call ftrace_return_to_handler - movq %rax, 72(%rsp) + movq %rax, 16(%rsp) movq 8(%rsp), %rdx movq (%rsp), %rax - addq $72, %rsp + addq $16, %rsp retq #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d94e1ea..9dbb527 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, unsigned long return_hooker = (unsigned long) &return_to_handler; - /* Nmi's are currently unsupported */ - if (unlikely(in_nmi())) - return; - if (unlikely(atomic_read(&current->tracing_graph_pause))) return; @@ -498,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) struct syscall_metadata *syscall_nr_to_meta(int nr) { - if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) return NULL; return syscalls_metadata[nr]; } -void arch_init_ftrace_syscalls(void) +int syscall_name_to_nr(char *name) +{ + int i; + + if (!syscalls_metadata) + return -1; + + for (i = 0; i < 
NR_syscalls; i++) { + if (syscalls_metadata[i]) { + if (!strcmp(syscalls_metadata[i]->name, name)) + return i; + } + } + return -1; +} + +void set_syscall_enter_id(int num, int id) +{ + syscalls_metadata[num]->enter_id = id; +} + +void set_syscall_exit_id(int num, int id) +{ + syscalls_metadata[num]->exit_id = id; +} + +static int __init arch_init_ftrace_syscalls(void) { int i; struct syscall_metadata *meta; unsigned long **psys_syscall_table = &sys_call_table; - static atomic_t refs; - - if (atomic_inc_return(&refs) != 1) - goto end; syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - FTRACE_SYSCALL_MAX, GFP_KERNEL); + NR_syscalls, GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); - return; + return -ENOMEM; } - for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { + for (i = 0; i < NR_syscalls; i++) { meta = find_syscall_meta(psys_syscall_table[i]); syscalls_metadata[i] = meta; } - return; - - /* Paranoid: avoid overflow */ -end: - atomic_dec(&refs); + return 0; } +arch_initcall(arch_init_ftrace_syscalls); #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 70eaa85..4d7255c 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -30,7 +30,11 @@ static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); pgd_clear(pgd); - __flush_tlb_all(); + /* + * preempt_disable/enable does not work this early in the + * bootup yet: + */ + write_cr3(read_cr3()); } /* Don't add a printk in there. 
printk relies on the PDA which is not initialized diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 7ffec6b..6d98b18 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -591,6 +591,7 @@ ignore_int: call dump_stack addl $(5*4),%esp + call dump_stack popl %ds popl %es popl %edx diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 5cf36c0..84d4433 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -16,7 +16,7 @@ #include <asm/hpet.h> #include <asm/smp.h> -DEFINE_SPINLOCK(i8253_lock); +DEFINE_ATOMIC_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); #ifdef CONFIG_X86_32 @@ -39,7 +39,7 @@ struct clock_event_device *global_clock_event; static void init_pit_timer(enum clock_event_mode mode, struct clock_event_device *evt) { - spin_lock(&i8253_lock); + atomic_spin_lock(&i8253_lock); switch (mode) { case CLOCK_EVT_MODE_PERIODIC: @@ -70,7 +70,7 @@ static void init_pit_timer(enum clock_event_mode mode, /* Nothing to do here */ break; } - spin_unlock(&i8253_lock); + atomic_spin_unlock(&i8253_lock); } /* @@ -80,10 +80,10 @@ static void init_pit_timer(enum clock_event_mode mode, */ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { - spin_lock(&i8253_lock); + atomic_spin_lock(&i8253_lock); outb_pit(delta & 0xff , PIT_CH0); /* LSB */ outb_pit(delta >> 8 , PIT_CH0); /* MSB */ - spin_unlock(&i8253_lock); + atomic_spin_unlock(&i8253_lock); return 0; } @@ -138,7 +138,7 @@ static cycle_t pit_read(struct clocksource *cs) int count; u32 jifs; - spin_lock_irqsave(&i8253_lock, flags); + atomic_spin_lock_irqsave(&i8253_lock, flags); /* * Although our caller may have the read side of xtime_lock, * this is now a seqlock, and we are cheating in this routine @@ -184,7 +184,7 @@ static cycle_t pit_read(struct clocksource *cs) old_count = count; old_jifs = jifs; - spin_unlock_irqrestore(&i8253_lock, flags); + atomic_spin_unlock_irqrestore(&i8253_lock, flags); count = (LATCH - 1) - count; 
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index df89102..6fbe669 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -32,7 +32,7 @@ */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_ATOMIC_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); struct irq_chip i8259A_chip = { @@ -68,13 +68,13 @@ void disable_8259A_irq(unsigned int irq) unsigned int mask = 1 << irq; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + atomic_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); + atomic_spin_unlock_irqrestore(&i8259A_lock, flags); } void enable_8259A_irq(unsigned int irq) @@ -82,13 +82,13 @@ void enable_8259A_irq(unsigned int irq) unsigned int mask = ~(1 << irq); unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + atomic_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); + atomic_spin_unlock_irqrestore(&i8259A_lock, flags); } int i8259A_irq_pending(unsigned int irq) @@ -97,12 +97,12 @@ int i8259A_irq_pending(unsigned int irq) unsigned long flags; int ret; - spin_lock_irqsave(&i8259A_lock, flags); + atomic_spin_lock_irqsave(&i8259A_lock, flags); if (irq < 8) ret = inb(PIC_MASTER_CMD) & mask; else ret = inb(PIC_SLAVE_CMD) & (mask >> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); + atomic_spin_unlock_irqrestore(&i8259A_lock, flags); return ret; } @@ -150,7 +150,7 @@ static void mask_and_ack_8259A(unsigned int irq) unsigned int irqmask = 1 << irq; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + atomic_spin_lock_irqsave(&i8259A_lock, flags); /* * Lightweight spurious IRQ detection. 
We do not want * to overdo spurious IRQ handling - it's usually a sign @@ -168,6 +168,8 @@ static void mask_and_ack_8259A(unsigned int irq) */ if (cached_irq_mask & irqmask) goto spurious_8259A_irq; + if (irq & 8) + outb(0x60+(irq&7), PIC_SLAVE_CMD); /* 'Specific EOI' to slave */ cached_irq_mask |= irqmask; handle_real_irq: @@ -183,7 +185,7 @@ handle_real_irq: outb(cached_master_mask, PIC_MASTER_IMR); outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ } - spin_unlock_irqrestore(&i8259A_lock, flags); + atomic_spin_unlock_irqrestore(&i8259A_lock, flags); return; spurious_8259A_irq: @@ -285,24 +287,24 @@ void mask_8259A(void) { unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + atomic_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - spin_unlock_irqrestore(&i8259A_lock, flags); + atomic_spin_unlock_irqrestore(&i8259A_lock, flags); } void unmask_8259A(void) { unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + atomic_spin_lock_irqsave(&i8259A_lock, flags); outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - spin_unlock_irqrestore(&i8259A_lock, flags); + atomic_spin_unlock_irqrestore(&i8259A_lock, flags); } void init_8259A(int auto_eoi) @@ -311,7 +313,7 @@ void init_8259A(int auto_eoi) i8259A_auto_eoi = auto_eoi; - spin_lock_irqsave(&i8259A_lock, flags); + atomic_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ @@ -328,10 +330,10 @@ void init_8259A(int auto_eoi) /* 8259A-1 (the master) has a slave on IR2 */ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + 
if (!auto_eoi) /* master expects normal EOI */ + outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + else /* master does Auto EOI */ + outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ @@ -356,5 +358,5 @@ void init_8259A(int auto_eoi) outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - spin_unlock_irqrestore(&i8259A_lock, flags); + atomic_spin_unlock_irqrestore(&i8259A_lock, flags); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b0cdde6..c51d8e6 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -149,7 +149,7 @@ int show_interrupts(struct seq_file *p, void *v) if (!desc) return 0; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); action = desc->action; @@ -170,7 +170,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); out: - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return 0; } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 977d8b4..1aa5228 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -81,12 +81,12 @@ void fixup_irqs(void) continue; /* interrupt's are disabled at this point */ - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); affinity = desc->affinity; if (!irq_has_action(irq) || cpumask_equal(affinity, cpu_online_mask)) { - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); continue; } @@ -106,7 +106,7 @@ void fixup_irqs(void) if (desc->chip->unmask) desc->chip->unmask(irq); - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); if (break_affinity && set_affinity) printk("Broke affinity for irq %i\n", irq); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 92b7703..49dcf16 100644 --- 
a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -72,6 +72,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) static struct irqaction fpu_irq = { .handler = math_error_irq, .name = "fpu", + .flags = IRQF_NODELAY, }; #endif @@ -81,6 +82,7 @@ static struct irqaction fpu_irq = { static struct irqaction irq2 = { .handler = no_action, .name = "cascade", + .flags = IRQF_NODELAY, }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d..d23c755 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -454,7 +454,7 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); regs->ip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); + preempt_enable(); return; } #endif @@ -480,7 +480,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, arch_disarm_kprobe(p); regs->ip = (unsigned long)p->addr; reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; #endif case KPROBE_HIT_ACTIVE: @@ -576,7 +576,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) } } /* else: not a kprobe fault; let the kernel handle it */ - preempt_enable_no_resched(); + preempt_enable(); return 0; } @@ -876,7 +876,7 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, flags @@ -910,7 +910,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: @@ -1051,7 +1051,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) memcpy((kprobe_opcode_t 
*)(kcb->jprobe_saved_sp), kcb->jprobes_stack, MIN_STACK_SIZE(kcb->jprobe_saved_sp)); - preempt_enable_no_resched(); + preempt_enable(); return 1; } return 0; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 70ec9b9..5611ed6 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -422,6 +422,20 @@ struct pv_apic_ops pv_apic_ops = { #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) #endif +#ifdef CONFIG_HIGHPTE +/* + * kmap_atomic() might be an inline or a macro: + */ +static void *kmap_atomic_func(struct page *page, enum km_type idx) +{ + return kmap_atomic(page, idx); +} +static void *kmap_atomic_direct_func(struct page *page, enum km_type idx) +{ + return kmap_atomic_direct(page, idx); +} +#endif + struct pv_mmu_ops pv_mmu_ops = { #ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, @@ -462,7 +476,8 @@ struct pv_mmu_ops pv_mmu_ops = { .ptep_modify_prot_commit = __ptep_modify_prot_commit, #ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = kmap_atomic, + .kmap_atomic_pte = kmap_atomic_func, + .kmap_atomic_pte_direct = kmap_atomic_direct_func, #endif #if PAGETABLE_LEVELS >= 3 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 59f4524..b9e7a3f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -105,7 +105,6 @@ void cpu_idle(void) tick_nohz_stop_sched_tick(1); while (!need_resched()) { - check_pgt_cache(); rmb(); if (cpu_is_offline(cpu)) @@ -117,10 +116,12 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } + local_irq_disable(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -162,8 +163,10 @@ void __show_regs(struct pt_regs *regs, int all) regs->ax, regs->bx, regs->cx, regs->dx); printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); - printk(" DS: %04x ES: %04x FS: %04x 
GS: %04x SS: %04x\n", - (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); + printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x" + " preempt:%08x\n", + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, + preempt_count()); if (!all) return; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ebefb54..c8d0ece 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -152,9 +152,11 @@ void cpu_idle(void) } tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde..8d7d5c9 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -35,10 +35,11 @@ #include <asm/proto.h> #include <asm/ds.h> -#include <trace/syscall.h> - #include "tls.h" +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + enum x86_regset { REGSET_GENERAL, REGSET_FP, @@ -1497,8 +1498,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) tracehook_report_syscall_entry(regs)) ret = -1L; - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_enter(regs); + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->orig_ax); if (unlikely(current->audit_context)) { if (IS_IA32) @@ -1523,8 +1524,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_exit(regs); + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->ax); if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c57875..5777895 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -782,6 
+782,13 @@ static void do_signal(struct pt_regs *regs) int signr; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index ec1de97..a83e38d 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -120,6 +120,16 @@ static void native_smp_send_reschedule(int cpu) apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + apic->send_IPI_allbutself(RESCHEDULE_VECTOR); +} + void native_send_call_func_single_ipi(int cpu) { apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 6bc211a..45e00eb 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -18,9 +18,9 @@ #include <asm/ia32.h> #include <asm/syscalls.h> -asmlinkage long sys_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off) +SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, off) { long error; struct file *file; @@ -226,7 +226,7 @@ bottomup: } -asmlinkage long sys_uname(struct new_utsname __user *name) +SYSCALL_DEFINE1(uname, struct new_utsname __user *, name) { int err; down_read(&uts_sem); diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 5c5d87f..9e2cb0b 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -84,11 +84,11 @@ irqreturn_t timer_interrupt(int 
irq, void *dev_id) * manually to deassert NMI lines for the watchdog if run * on an 82489DX-based system. */ - spin_lock(&i8259A_lock); + atomic_spin_lock(&i8259A_lock); outb(0x0c, PIC_MASTER_OCW3); /* Ack the IRQ; AEOI will end it automatically. */ inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); + atomic_spin_unlock(&i8259A_lock); } #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5204332..20ee0b1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -91,9 +91,10 @@ static inline void conditional_sti(struct pt_regs *regs) local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) +static inline void preempt_conditional_sti(struct pt_regs *regs, int stack) { - inc_preempt_count(); + if (stack) + inc_preempt_count(); if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -104,11 +105,12 @@ static inline void conditional_cli(struct pt_regs *regs) local_irq_disable(); } -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void preempt_conditional_cli(struct pt_regs *regs, int stack) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - dec_preempt_count(); + if (stack) + dec_preempt_count(); } #ifdef CONFIG_X86_32 @@ -235,9 +237,9 @@ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 12, SIGBUS) == NOTIFY_STOP) return; - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, STACKFAULT_STACK); do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, STACKFAULT_STACK); } dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) @@ -473,9 +475,9 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) return; #endif - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, DEBUG_STACK); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - 
preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); } #ifdef CONFIG_X86_64 @@ -552,7 +554,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) return; /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, DEBUG_STACK); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { @@ -587,7 +589,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) */ clear_dr7: set_debugreg(0, 7); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); return; #ifdef CONFIG_X86_32 @@ -602,7 +604,7 @@ debug_vm86: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); return; } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 027b5b4..76315a4 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -104,6 +104,7 @@ static __cpuinit void check_tsc_warp(void) */ void __cpuinit check_tsc_sync_source(int cpu) { + unsigned long flags; int cpus = 2; /* @@ -129,8 +130,11 @@ void __cpuinit check_tsc_sync_source(int cpu) /* * Wait for the target to arrive: */ + local_save_flags(flags); + local_irq_enable(); while (atomic_read(&start_count) != cpus-1) cpu_relax(); + local_irq_restore(flags); /* * Trigger the target to continue into the measurement too: */ diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 31ffc24..24e6d2b 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -581,7 +581,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) struct irq_desc *desc; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + atomic_spin_lock_irqsave(&i8259A_lock, flags); /* Find out what's interrupting in the PIIX4 master 8259 */ outb(0x0c, 
0x20); /* OCW3 Poll command */ @@ -618,7 +618,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) outb(0x60 + realirq, 0x20); } - spin_unlock_irqrestore(&i8259A_lock, flags); + atomic_spin_unlock_irqrestore(&i8259A_lock, flags); desc = irq_to_desc(realirq); @@ -636,18 +636,20 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) return IRQ_HANDLED; out_unlock: - spin_unlock_irqrestore(&i8259A_lock, flags); + atomic_spin_unlock_irqrestore(&i8259A_lock, flags); return IRQ_NONE; } static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NODELAY, }; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 9c4e625..cdaeef2 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -137,6 +137,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) local_irq_enable(); if (!current->thread.vm86_info) { + local_irq_disable(); printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 25ee06a..a6c5525 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -59,7 +59,7 @@ int __vgetcpu_mode __section_vgetcpu_mode; struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = { - .lock = SEQLOCK_UNLOCKED, + .lock = __ATOMIC_SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), .sysctl_enabled = 1, }; @@ -67,27 +67,54 @@ void update_vsyscall_tz(void) { unsigned long flags; - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + write_atomic_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); /* sys_tz has changed */ vsyscall_gtod_data.sys_tz = sys_tz; - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); + write_atomic_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } void update_vsyscall(struct timespec *wall_time, struct 
clocksource *clock) { unsigned long flags; - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + write_atomic_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + + if (likely(vsyscall_gtod_data.sysctl_enabled == 2)) { + struct timespec tmp = *(wall_time); + cycle_t (*vread)(void); + cycle_t now; + + vread = vsyscall_gtod_data.clock.vread; + if (likely(vread)) + now = vread(); + else + now = clock->read(clock); + + /* calculate interval: */ + now = (now - clock->cycle_last) & clock->mask; + /* convert to nsecs: */ + tmp.tv_nsec += ( now * clock->mult) >> clock->shift; + + while (tmp.tv_nsec >= NSEC_PER_SEC) { + tmp.tv_sec += 1; + tmp.tv_nsec -= NSEC_PER_SEC; + } + + vsyscall_gtod_data.wall_time_sec = tmp.tv_sec; + vsyscall_gtod_data.wall_time_nsec = tmp.tv_nsec; + } else { + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + } + /* copy vsyscall data */ vsyscall_gtod_data.clock.vread = clock->vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = clock->mult; vsyscall_gtod_data.clock.shift = clock->shift; - vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; - vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); + write_atomic_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } /* RED-PEN may want to readd seq locking, but then the variable should be @@ -123,8 +150,28 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) unsigned seq; unsigned long mult, shift, nsec; cycle_t (*vread)(void); + + if (likely(__vsyscall_gtod_data.sysctl_enabled == 2)) { + struct timeval tmp; + + do { + barrier(); + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; + tv->tv_usec = __vsyscall_gtod_data.wall_time_nsec; + barrier(); + tmp.tv_sec = __vsyscall_gtod_data.wall_time_sec; + tmp.tv_usec 
= __vsyscall_gtod_data.wall_time_nsec; + + } while (tmp.tv_usec != tv->tv_usec || + tmp.tv_sec != tv->tv_sec); + + tv->tv_usec /= NSEC_PER_MSEC; + tv->tv_usec *= USEC_PER_MSEC; + return; + } + do { - seq = read_seqbegin(&__vsyscall_gtod_data.lock); + seq = read_atomic_seqbegin(&__vsyscall_gtod_data.lock); vread = __vsyscall_gtod_data.clock.vread; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { @@ -133,6 +180,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) } now = vread(); + base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; @@ -140,7 +188,9 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; nsec = __vsyscall_gtod_data.wall_time_nsec; - } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + } while (read_atomic_seqretry(&__vsyscall_gtod_data.lock, seq)); + + now = vread(); /* calculate interval: */ cycle_delta = (now - base) & mask; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8600a09..2d89b40 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -65,7 +65,7 @@ config KVM_AMD config KVM_TRACE bool "KVM trace support" - depends on KVM && SYSFS + depends on KVM && SYSFS && !PREEMPTRT select MARKERS select RELAY select DEBUG_FS diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 21f68e0..463f15a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -240,11 +240,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - spin_lock(&ps->inject_lock); + atomic_spin_lock(&ps->inject_lock); if (atomic_dec_return(&ps->pit_timer.pending) < 0) atomic_inc(&ps->pit_timer.pending); ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); + atomic_spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -580,7 +580,7 @@ struct 
kvm_pit *kvm_create_pit(struct kvm *kvm) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); + atomic_spin_lock_init(&pit->pit_state.inject_lock); /* Initialize PIO device */ pit->dev.read = pit_ioport_read; @@ -672,12 +672,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) /* Try to inject pending interrupts when * last one has been acked. */ - spin_lock(&ps->inject_lock); + atomic_spin_lock(&ps->inject_lock); if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { ps->irq_ack = 0; inject = 1; } - spin_unlock(&ps->inject_lock); + atomic_spin_unlock(&ps->inject_lock); if (inject) __inject_pit_timer_intr(kvm); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index bbd863f..33e30e3 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -26,7 +26,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - spinlock_t inject_lock; + atomic_spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 1ccb50c..91fcca1 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -34,7 +34,7 @@ static void pic_lock(struct kvm_pic *s) __acquires(&s->lock) { - spin_lock(&s->lock); + atomic_spin_lock(&s->lock); } static void pic_unlock(struct kvm_pic *s) @@ -48,7 +48,7 @@ static void pic_unlock(struct kvm_pic *s) s->pending_acks = 0; s->wakeup_needed = false; - spin_unlock(&s->lock); + atomic_spin_unlock(&s->lock); while (acks) { kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)), @@ -522,7 +522,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - spin_lock_init(&s->lock); + atomic_spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9f59318..fdae2ef 100644 --- a/arch/x86/kvm/irq.h 
+++ b/arch/x86/kvm/irq.h @@ -62,7 +62,7 @@ struct kvm_kpic_state { }; struct kvm_pic { - spinlock_t lock; + atomic_spinlock_t lock; bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index bfae139..07df264 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -561,6 +561,7 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) nr = (address - idt_descr.address) >> 3; if (nr == 6) { + zap_rt_locks(); do_invalid_op(regs, 0); return 1; } @@ -1032,7 +1033,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm)) { + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 71da1bc..71871c8 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -77,13 +77,13 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, if (write) mask |= _PAGE_RW; - ptep = pte_offset_map(&pmd, addr); + ptep = pte_offset_map_direct(&pmd, addr); do { pte_t pte = gup_get_pte(ptep); struct page *page; if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { - pte_unmap(ptep); + pte_unmap_direct(ptep); return 0; } VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -93,7 +93,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); - pte_unmap(ptep - 1); + pte_unmap_direct(ptep - 1); return 1; } diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 2112ed5..e33ec9f 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,9 +4,9 @@ void *kmap(struct page *page) { - might_sleep(); if (!PageHighMem(page)) return page_address(page); + might_sleep(); return kmap_high(page); } @@ -19,6 +19,27 @@ void kunmap(struct page 
*page) kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} +EXPORT_SYMBOL_GPL(kmap_to_page); /* PREEMPT_RT converts some modules to use this */ + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -27,12 +48,13 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) @@ -42,18 +64,23 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte-idx))); + WARN_ON(!pte_none(*(kmap_pte-idx))); set_pte(kmap_pte-idx, mk_pte(page, prot)); return (void *)vaddr; } -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic_direct(struct page *page, enum km_type type) +{ + return __kmap_atomic_prot(page, type, kmap_prot); +} + +void *__kmap_atomic(struct page *page, enum km_type type) { return kmap_atomic_prot(page, type, kmap_prot); } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & 
PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); @@ -74,19 +101,21 @@ void kunmap_atomic(void *kvaddr, enum km_type type) } pagefault_enable(); + preempt_enable(); } /* * This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { + preempt_disable(); return kmap_atomic_prot_pfn(pfn, type, kmap_prot); } -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ +EXPORT_SYMBOL_GPL(__kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; @@ -101,9 +130,10 @@ struct page *kmap_atomic_to_page(void *ptr) EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kmap_atomic_prot); void __init set_highmem_pages_init(void) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 0607119..eab22a7 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -14,8 +14,6 @@ #include <asm/tlb.h> #include <asm/proto.h> -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - unsigned long __initdata e820_table_start; unsigned long __meminitdata e820_table_end; unsigned long __meminitdata e820_table_top; diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index fe6f84c..bd045c2 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -37,6 +37,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); pagefault_disable(); debug_kmap_atomic(type); @@ -83,5 +84,6 @@ iounmap_atomic(void *kvaddr, enum 
km_type type) kpte_clear_flush(kmap_pte-idx, vaddr); pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL_GPL(iounmap_atomic); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e245775..fcea436 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -861,8 +861,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, baddr = *addr; } +#if 0 /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); +#endif vm_unmap_aliases(); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ed34f5e..c2ea747 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -132,6 +132,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) reserved at the pmd (PDPT) level. */ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + preempt_disable(); /* * According to Intel App note "TLBs, Paging-Structure Caches, * and Their Invalidation", April 2007, document 317080-001, @@ -140,6 +141,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) */ if (mm == current->active_mm) write_cr3(read_cr3()); + preempt_enable(); } #else /* !CONFIG_X86_PAE */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c814e14..fe5d800 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -40,7 +40,7 @@ union smp_flush_state { struct { struct mm_struct *flush_mm; unsigned long flush_va; - spinlock_t tlbstate_lock; + atomic_spinlock_t tlbstate_lock; DECLARE_BITMAP(flush_cpumask, NR_CPUS); }; char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; @@ -179,7 +179,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is * probably not worth checking this for a cache-hot lock. 
*/ - spin_lock(&f->tlbstate_lock); + atomic_spin_lock(&f->tlbstate_lock); f->flush_mm = mm; f->flush_va = va; @@ -197,7 +197,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, f->flush_mm = NULL; f->flush_va = 0; - spin_unlock(&f->tlbstate_lock); + atomic_spin_unlock(&f->tlbstate_lock); } void native_flush_tlb_others(const struct cpumask *cpumask, @@ -221,7 +221,7 @@ static int __cpuinit init_smp_flush(void) int i; for (i = 0; i < ARRAY_SIZE(flush_state); i++) - spin_lock_init(&flush_state[i].tlbstate_lock); + atomic_spin_lock_init(&flush_state[i].tlbstate_lock); return 0; } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 89b9a5c..dea0100 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -125,9 +125,9 @@ static void nmi_cpu_setup(void *dummy) { int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - spin_lock(&oprofilefs_lock); + atomic_spin_lock(&oprofilefs_lock); model->setup_ctrs(msrs); - spin_unlock(&oprofilefs_lock); + atomic_spin_unlock(&oprofilefs_lock); per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); } diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 2202b62..fc2697c 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -81,7 +81,7 @@ int pcibios_scanned; * This interrupt-safe spinlock protects all accesses to PCI * configuration space. 
*/ -DEFINE_SPINLOCK(pci_config_lock); +DEFINE_ATOMIC_SPINLOCK(pci_config_lock); static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d) { diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index bd13c3e..e76cff3 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -27,7 +27,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus, return -EINVAL; } - spin_lock_irqsave(&pci_config_lock, flags); + atomic_spin_lock_irqsave(&pci_config_lock, flags); outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); @@ -43,7 +43,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -56,7 +56,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus, if ((bus > 255) || (devfn > 255) || (reg > 4095)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + atomic_spin_lock_irqsave(&pci_config_lock, flags); outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); @@ -72,7 +72,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -108,7 +108,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus, if (dev & 0x10) return PCIBIOS_DEVICE_NOT_FOUND; - spin_lock_irqsave(&pci_config_lock, flags); + atomic_spin_lock_irqsave(&pci_config_lock, flags); outb((u8)(0xF0 | (fn << 1)), 0xCF8); outb((u8)bus, 0xCFA); @@ -127,7 +127,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus, outb(0, 0xCF8); - spin_unlock_irqrestore(&pci_config_lock, flags); + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -147,7 +147,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, if (dev & 0x10) return PCIBIOS_DEVICE_NOT_FOUND; - spin_lock_irqsave(&pci_config_lock, flags); + atomic_spin_lock_irqsave(&pci_config_lock, 
flags); outb((u8)(0xF0 | (fn << 1)), 0xCF8); outb((u8)bus, 0xCFA); @@ -166,7 +166,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, outb(0, 0xCF8); - spin_unlock_irqrestore(&pci_config_lock, flags); + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -223,16 +223,23 @@ static int __init pci_check_type1(void) unsigned int tmp; int works = 0; - local_irq_save(flags); + atomic_spin_lock_irqsave(&pci_config_lock, flags); outb(0x01, 0xCFB); tmp = inl(0xCF8); outl(0x80000000, 0xCF8); - if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { - works = 1; + + if (inl(0xCF8) == 0x80000000) { + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf1)) + works = 1; + + atomic_spin_lock_irqsave(&pci_config_lock, flags); } outl(tmp, 0xCF8); - local_irq_restore(flags); + + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); return works; } @@ -242,17 +249,19 @@ static int __init pci_check_type2(void) unsigned long flags; int works = 0; - local_irq_save(flags); + atomic_spin_lock_irqsave(&pci_config_lock, flags); outb(0x00, 0xCFB); outb(0x00, 0xCF8); outb(0x00, 0xCFA); - if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && - pci_sanity_check(&pci_direct_conf2)) { - works = 1; - } - local_irq_restore(flags); + if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) { + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf2)) + works = 1; + } else + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); return works; } diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 8b2d561..7ca333f 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -72,7 +72,7 @@ err: *value = -1; if (!base) goto err; - spin_lock_irqsave(&pci_config_lock, flags); + atomic_spin_lock_irqsave(&pci_config_lock, flags); pci_exp_set_dev_base(base, bus, devfn); @@ -87,7 +87,7 @@ err: *value = -1; *value = mmio_config_readl(mmcfg_virt_addr + 
reg); break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -105,7 +105,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, if (!base) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + atomic_spin_lock_irqsave(&pci_config_lock, flags); pci_exp_set_dev_base(base, bus, devfn); @@ -120,7 +120,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, mmio_config_writel(mmcfg_virt_addr + reg, value); break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 8eb295e..f2a1f1f 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -41,7 +41,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + atomic_spin_lock_irqsave(&pci_config_lock, flags); write_cf8(bus, devfn, reg); @@ -66,7 +66,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -80,7 +80,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + atomic_spin_lock_irqsave(&pci_config_lock, flags); write_cf8(bus, devfn, reg); @@ -105,7 +105,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 1c975cc..ffebd80 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -161,7 +161,7 @@ static int 
pci_bios_read(unsigned int seg, unsigned int bus, if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + atomic_spin_lock_irqsave(&pci_config_lock, flags); switch (len) { case 1: @@ -212,7 +212,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); return (int)((result & 0xff00) >> 8); } @@ -227,7 +227,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, if ((bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + atomic_spin_lock_irqsave(&pci_config_lock, flags); switch (len) { case 1: @@ -268,7 +268,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + atomic_spin_unlock_irqrestore(&pci_config_lock, flags); return (int)((result & 0xff00) >> 8); } diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 6a40b78..cd36532 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -47,11 +47,11 @@ notrace static noinline int do_realtime(struct timespec *ts) { unsigned long seq, ns; do { - seq = read_seqbegin(&gtod->lock); + seq = read_atomic_seqbegin(&gtod->lock); ts->tv_sec = gtod->wall_time_sec; ts->tv_nsec = gtod->wall_time_nsec; ns = vgetns(); - } while (unlikely(read_seqretry(&gtod->lock, seq))); + } while (unlikely(read_atomic_seqretry(&gtod->lock, seq))); timespec_add_ns(ts, ns); return 0; } @@ -76,12 +76,12 @@ notrace static noinline int do_monotonic(struct timespec *ts) { unsigned long seq, ns, secs; do { - seq = read_seqbegin(&gtod->lock); + seq = read_atomic_seqbegin(&gtod->lock); secs = gtod->wall_time_sec; ns = gtod->wall_time_nsec + vgetns(); secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; - } while (unlikely(read_seqretry(&gtod->lock, seq))); + } while
(unlikely(read_atomic_seqretry(&gtod->lock, seq))); vset_normalized_timespec(ts, secs, ns); return 0; } diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h index e39edf5..32c5e28 100644 --- a/arch/xtensa/include/asm/rwsem.h +++ b/arch/xtensa/include/asm/rwsem.h @@ -25,7 +25,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct rw_anon_semaphore { signed long count; #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 @@ -37,29 +37,37 @@ struct rw_semaphore { struct list_head wait_list; }; -#define __RWSEM_INITIALIZER(name) \ +#define __RWSEM_ANON_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ LIST_HEAD_INIT((name).wait_list) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore *rwsem_wake(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *sem); -static inline void init_rwsem(struct rw_semaphore *sem) +static inline void init_anon_rwsem(struct rw_anon_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->count != 0); +} + /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void
__down_read(struct rw_anon_semaphore *sem) { if (atomic_add_return(1,(atomic_t *)(&sem->count)) > 0) smp_wmb(); @@ -67,7 +75,7 @@ static inline void __down_read(struct rw_semaphore *sem) rwsem_down_read_failed(sem); } -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct rw_anon_semaphore *sem) { int tmp; @@ -84,7 +92,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct rw_anon_semaphore *sem) { int tmp; @@ -96,7 +104,7 @@ static inline void __down_write(struct rw_semaphore *sem) rwsem_down_write_failed(sem); } -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct rw_anon_semaphore *sem) { int tmp; @@ -109,7 +117,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_anon_semaphore *sem) { int tmp; @@ -122,7 +130,7 @@ static inline void __up_read(struct rw_semaphore *sem) /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct rw_anon_semaphore *sem) { smp_wmb(); if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, @@ -133,7 +141,7 @@ static inline void __up_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct rw_anon_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } @@ -141,7 +149,7 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct rw_anon_semaphore *sem) { int tmp; @@ -154,12 
+162,32 @@ static inline void __downgrade_write(struct rw_semaphore *sem) /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct rw_anon_semaphore *sem) { smp_mb(); return atomic_add_return(delta, (atomic_t *)(&sem->count)); } +struct rw_semaphore { + signed long count; + spinlock_t wait_lock; + struct list_head wait_list; +}; + +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ + LIST_HEAD_INIT((name).wait_list) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void init_rwsem(struct rw_semaphore *sem) +{ + sem->count = RWSEM_UNLOCKED_VALUE; + spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c index a1badb3..bb599c3 100644 --- a/arch/xtensa/kernel/irq.c +++ b/arch/xtensa/kernel/irq.c @@ -90,7 +90,7 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); + atomic_spin_lock_irqsave(&irq_desc[i].lock, flags); action = irq_desc[i].action; if (!action) goto skip; @@ -109,7 +109,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + atomic_spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { seq_printf(p, "NMI: "); for_each_online_cpu(j) diff --git a/block/blk-core.c b/block/blk-core.c index e3299a7..620579e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -201,7 +201,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags); */ void blk_plug_device(struct request_queue *q) { - WARN_ON(!irqs_disabled()); +
WARN_ON_NONRT(!irqs_disabled()); /* * don't plug a stopped queue, it must be paired with blk_start_queue() @@ -241,7 +241,7 @@ EXPORT_SYMBOL(blk_plug_device_unlocked); */ int blk_remove_plug(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) return 0; @@ -333,7 +333,7 @@ EXPORT_SYMBOL(blk_unplug); **/ void blk_start_queue(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h index 3d87362..feee017 100644 --- a/drivers/acpi/acpica/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -197,7 +197,12 @@ ACPI_EXTERN u8 acpi_gbl_global_lock_present; * interrupt level */ ACPI_EXTERN spinlock_t _acpi_gbl_gpe_lock; /* For GPE data structs and registers */ -ACPI_EXTERN spinlock_t _acpi_gbl_hardware_lock; /* For ACPI H/W except GPE registers */ + +/* + * Need to be raw because it might be used in acpi_processor_idle(): + */ +ACPI_EXTERN atomic_spinlock_t _acpi_gbl_hardware_lock; /* For ACPI H/W except GPE registers */ + #define acpi_gbl_gpe_lock &_acpi_gbl_gpe_lock #define acpi_gbl_hardware_lock &_acpi_gbl_hardware_lock diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c index 23d5505..756fe9e 100644 --- a/drivers/acpi/acpica/hwregs.c +++ b/drivers/acpi/acpica/hwregs.c @@ -85,7 +85,7 @@ acpi_status acpi_hw_clear_acpi_status(void) ACPI_BITMASK_ALL_FIXED_STATUS, ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address))); - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + atomic_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); /* Clear the fixed events in PM1 A/B */ @@ -100,7 +100,7 @@ acpi_status acpi_hw_clear_acpi_status(void) status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block, NULL); unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + 
atomic_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c index 9829979..3898b18 100644 --- a/drivers/acpi/acpica/hwxface.c +++ b/drivers/acpi/acpica/hwxface.c @@ -341,7 +341,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value) return_ACPI_STATUS(AE_BAD_PARAMETER); } - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + atomic_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); /* * At this point, we know that the parent register is one of the @@ -402,7 +402,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value) unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + atomic_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c index 80bb651..477de01 100644 --- a/drivers/acpi/acpica/utmutex.c +++ b/drivers/acpi/acpica/utmutex.c @@ -84,7 +84,7 @@ acpi_status acpi_ut_mutex_initialize(void) /* Create the spinlocks for use at interrupt level */ spin_lock_init(acpi_gbl_gpe_lock); - spin_lock_init(acpi_gbl_hardware_lock); + atomic_spin_lock_init(acpi_gbl_hardware_lock); /* Create the reader/writer lock for namespace access */ @@ -117,11 +117,6 @@ void acpi_ut_mutex_terminate(void) (void)acpi_ut_delete_mutex(i); } - /* Delete the spinlocks */ - - acpi_os_delete_lock(acpi_gbl_gpe_lock); - acpi_os_delete_lock(acpi_gbl_hardware_lock); - /* Delete the reader/writer lock */ acpi_ut_delete_rw_lock(&acpi_gbl_namespace_rw_lock); diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 391f331..fe086ca 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -573,8 +573,22 @@ static u32 acpi_ec_gpe_handler(void *data) if (test_bit(EC_FLAGS_GPE_MODE, &ec->flags)) { gpe_transaction(ec, status); if (ec_transaction_done(ec) && - (status & ACPI_EC_FLAG_IBF) == 0) + (status & ACPI_EC_FLAG_IBF) == 0) { 
+ +#ifndef CONFIG_PREEMPT_RT wake_up(&ec->wait); +#else + // hack ... + if (waitqueue_active(&ec->wait)) { + struct task_struct *task; + + task = list_entry(ec->wait.task_list.next, + wait_queue_t, task_list)->private; + if (task) + wake_up_process(task); + } +#endif + } } ec_check_sci(ec, status); diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 5691f16..64bdc1b 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -833,14 +833,14 @@ void acpi_os_delete_lock(acpi_spinlock handle) acpi_status acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle) { - struct semaphore *sem = NULL; + struct anon_semaphore *sem = NULL; - sem = acpi_os_allocate(sizeof(struct semaphore)); + sem = acpi_os_allocate(sizeof(struct anon_semaphore)); if (!sem) return AE_NO_MEMORY; - memset(sem, 0, sizeof(struct semaphore)); + memset(sem, 0, sizeof(struct anon_semaphore)); - sema_init(sem, initial_units); + anon_sema_init(sem, initial_units); *handle = (acpi_handle *) sem; @@ -859,7 +859,7 @@ acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle) acpi_status acpi_os_delete_semaphore(acpi_handle handle) { - struct semaphore *sem = (struct semaphore *)handle; + struct anon_semaphore *sem = (struct anon_semaphore *)handle; if (!sem) return AE_BAD_PARAMETER; @@ -879,7 +879,7 @@ acpi_status acpi_os_delete_semaphore(acpi_handle handle) acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) { acpi_status status = AE_OK; - struct semaphore *sem = (struct semaphore *)handle; + struct anon_semaphore *sem = (struct anon_semaphore *)handle; long jiffies; int ret = 0; @@ -897,7 +897,7 @@ acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) else jiffies = msecs_to_jiffies(timeout); - ret = down_timeout(sem, jiffies); + ret = anon_down_timeout(sem, jiffies); if (ret) status = AE_TIME; @@ -920,7 +920,7 @@ acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) */ 
acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units) { - struct semaphore *sem = (struct semaphore *)handle; + struct anon_semaphore *sem = (struct anon_semaphore *)handle; if (!sem || (units < 1)) return AE_BAD_PARAMETER; @@ -931,7 +931,7 @@ acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units) ACPI_DEBUG_PRINT((ACPI_DB_MUTEX, "Signaling semaphore[%p|%d]\n", handle, units)); - up(sem); + anon_up(sem); return AE_OK; } diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 66393d5..716709b 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -921,7 +921,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, } static int c3_cpu_count; -static DEFINE_SPINLOCK(c3_lock); +static DEFINE_ATOMIC_SPINLOCK(c3_lock); /** * acpi_idle_enter_bm - enters C3 with proper BM handling @@ -996,12 +996,12 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, * without doing anything. */ if (pr->flags.bm_check && pr->flags.bm_control) { - spin_lock(&c3_lock); + atomic_spin_lock(&c3_lock); c3_cpu_count++; /* Disable bus master arbitration when all CPUs are in C3 */ if (c3_cpu_count == num_online_cpus()) acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1); - spin_unlock(&c3_lock); + atomic_spin_unlock(&c3_lock); } else if (!pr->flags.bm_check) { ACPI_FLUSH_CPU_CACHE(); } @@ -1010,10 +1010,10 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, /* Re-enable bus master arbitration */ if (pr->flags.bm_check && pr->flags.bm_control) { - spin_lock(&c3_lock); + atomic_spin_lock(&c3_lock); acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0); c3_cpu_count--; - spin_unlock(&c3_lock); + atomic_spin_unlock(&c3_lock); } kt2 = ktime_get_real(); idle_time = ktime_to_us(ktime_sub(kt2, kt1)); diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index bbbb1fa..b79d110 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -837,9 +837,9 @@ unsigned int 
ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf, unsigned long flags; unsigned int consumed; - local_irq_save(flags); + local_irq_save_nort(flags); consumed = ata_sff_data_xfer(dev, buf, buflen, rw); - local_irq_restore(flags); + local_irq_restore_nort(flags); return consumed; } @@ -878,7 +878,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) unsigned long flags; /* FIXME: use a bounce buffer */ - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page, KM_IRQ0); /* do the actual data transfer */ @@ -886,7 +886,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) do_write); kunmap_atomic(buf, KM_IRQ0); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { buf = page_address(page); ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size, @@ -1016,7 +1016,7 @@ next_sg: unsigned long flags; /* FIXME: use bounce buffer */ - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page, KM_IRQ0); /* do the actual data transfer */ @@ -1024,7 +1024,7 @@ next_sg: count, rw); kunmap_atomic(buf, KM_IRQ0); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { buf = page_address(page); consumed = ap->ops->sff_data_xfer(dev, buf + offset, diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 4b04a15..77ecb26 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -173,10 +173,10 @@ static ssize_t driver_unbind(struct device_driver *drv, dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == drv) { if (dev->parent) /* Needed for USB */ - down(&dev->parent->sem); + mutex_lock(&dev->parent->mutex); device_release_driver(dev); if (dev->parent) - up(&dev->parent->sem); + mutex_unlock(&dev->parent->mutex); err = count; } put_device(dev); @@ -200,12 +200,12 @@ static ssize_t driver_bind(struct device_driver *drv, dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == NULL && driver_match_device(drv, dev)) { if (dev->parent) /* Needed for 
USB */ - down(&dev->parent->sem); - down(&dev->sem); + mutex_lock(&dev->parent->mutex); + mutex_lock(&dev->mutex); err = driver_probe_device(drv, dev); - up(&dev->sem); + mutex_unlock(&dev->mutex); if (dev->parent) - up(&dev->parent->sem); + mutex_unlock(&dev->parent->mutex); if (err > 0) { /* success */ @@ -742,10 +742,10 @@ static int __must_check bus_rescan_devices_helper(struct device *dev, if (!dev->driver) { if (dev->parent) /* Needed for USB */ - down(&dev->parent->sem); + mutex_lock(&dev->parent->mutex); ret = device_attach(dev); if (dev->parent) - up(&dev->parent->sem); + mutex_unlock(&dev->parent->mutex); } return ret < 0 ? ret : 0; } @@ -777,10 +777,10 @@ int device_reprobe(struct device *dev) { if (dev->driver) { if (dev->parent) /* Needed for USB */ - down(&dev->parent->sem); + mutex_lock(&dev->parent->mutex); device_release_driver(dev); if (dev->parent) - up(&dev->parent->sem); + mutex_unlock(&dev->parent->mutex); } return bus_rescan_devices_helper(dev, NULL); } diff --git a/drivers/base/core.c b/drivers/base/core.c index 7ecb193..9cfc4a5 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -20,7 +20,6 @@ #include <linux/notifier.h> #include <linux/genhd.h> #include <linux/kallsyms.h> -#include <linux/semaphore.h> #include <linux/mutex.h> #include <linux/async.h> @@ -550,7 +549,7 @@ void device_initialize(struct device *dev) dev->kobj.kset = devices_kset; kobject_init(&dev->kobj, &device_ktype); INIT_LIST_HEAD(&dev->dma_pools); - init_MUTEX(&dev->sem); + mutex_init(&dev->mutex); spin_lock_init(&dev->devres_lock); INIT_LIST_HEAD(&dev->devres_head); device_init_wakeup(dev, 0); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index f010687..c90f82b 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -84,7 +84,7 @@ static void driver_sysfs_remove(struct device *dev) * for before calling this. (It is ok to call with no other effort * from a driver's probe() method.) * - * This function must be called with @dev->sem held. 
+ * This function must be called with @dev->mutex held. */ int device_bind_driver(struct device *dev) { @@ -189,8 +189,8 @@ EXPORT_SYMBOL_GPL(wait_for_device_probe); * This function returns -ENODEV if the device is not registered, * 1 if the device is bound sucessfully and 0 otherwise. * - * This function must be called with @dev->sem held. When called for a - * USB interface, @dev->parent->sem must be held as well. + * This function must be called with @dev->mutex held. When called for a + * USB interface, @dev->parent->mutex must be held as well. */ int driver_probe_device(struct device_driver *drv, struct device *dev) { @@ -229,13 +229,13 @@ static int __device_attach(struct device_driver *drv, void *data) * 0 if no matching driver was found; * -ENODEV if the device is not registered. * - * When called for a USB interface, @dev->parent->sem must be held. + * When called for a USB interface, @dev->parent->mutex must be held. */ int device_attach(struct device *dev) { int ret = 0; - down(&dev->sem); + mutex_lock(&dev->mutex); if (dev->driver) { ret = device_bind_driver(dev); if (ret == 0) @@ -247,7 +247,7 @@ int device_attach(struct device *dev) } else { ret = bus_for_each_drv(dev->bus, NULL, dev, __device_attach); } - up(&dev->sem); + mutex_unlock(&dev->mutex); return ret; } EXPORT_SYMBOL_GPL(device_attach); @@ -270,13 +270,13 @@ static int __driver_attach(struct device *dev, void *data) return 0; if (dev->parent) /* Needed for USB */ - down(&dev->parent->sem); - down(&dev->sem); + mutex_lock(&dev->parent->mutex); + mutex_lock(&dev->mutex); if (!dev->driver) driver_probe_device(drv, dev); - up(&dev->sem); + mutex_unlock(&dev->mutex); if (dev->parent) - up(&dev->parent->sem); + mutex_unlock(&dev->parent->mutex); return 0; } @@ -297,8 +297,8 @@ int driver_attach(struct device_driver *drv) EXPORT_SYMBOL_GPL(driver_attach); /* - * __device_release_driver() must be called with @dev->sem held. - * When called for a USB interface, @dev->parent->sem must be held as well. 
+ * __device_release_driver() must be called with @dev->mutex held. + * When called for a USB interface, @dev->parent->mutex must be held as well. */ static void __device_release_driver(struct device *dev) { @@ -332,7 +332,7 @@ static void __device_release_driver(struct device *dev) * @dev: device. * * Manually detach device from driver. - * When called for a USB interface, @dev->parent->sem must be held. + * When called for a USB interface, @dev->parent->mutex must be held. */ void device_release_driver(struct device *dev) { @@ -341,9 +341,9 @@ void device_release_driver(struct device *dev) * within their ->remove callback for the same device, they * will deadlock right here. */ - down(&dev->sem); + mutex_lock(&dev->mutex); __device_release_driver(dev); - up(&dev->sem); + mutex_unlock(&dev->mutex); } EXPORT_SYMBOL_GPL(device_release_driver); @@ -370,13 +370,13 @@ void driver_detach(struct device_driver *drv) spin_unlock(&drv->p->klist_devices.k_lock); if (dev->parent) /* Needed for USB */ - down(&dev->parent->sem); - down(&dev->sem); + mutex_lock(&dev->parent->mutex); + mutex_lock(&dev->mutex); if (dev->driver == drv) __device_release_driver(dev); - up(&dev->sem); + mutex_unlock(&dev->mutex); if (dev->parent) - up(&dev->parent->sem); + mutex_unlock(&dev->parent->mutex); put_device(dev); } } diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 58a3e57..01d026d 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -33,8 +33,8 @@ * because children are guaranteed to be discovered after parents, and * are inserted at the back of the list on discovery. * - * Since device_pm_add() may be called with a device semaphore held, - * we must never try to acquire a device semaphore while holding + * Since device_pm_add() may be called with a device mutex held, + * we must never try to acquire a device mutex while holding * dpm_list_mutex. 
*/ @@ -381,7 +381,7 @@ static int device_resume(struct device *dev, pm_message_t state) TRACE_DEVICE(dev); TRACE_RESUME(0); - down(&dev->sem); + mutex_lock(&dev->mutex); if (dev->bus) { if (dev->bus->pm) { @@ -414,7 +414,7 @@ static int device_resume(struct device *dev, pm_message_t state) } } End: - up(&dev->sem); + mutex_unlock(&dev->mutex); TRACE_RESUME(error); return error; @@ -468,7 +468,7 @@ static void dpm_resume(pm_message_t state) */ static void device_complete(struct device *dev, pm_message_t state) { - down(&dev->sem); + mutex_lock(&dev->mutex); if (dev->class && dev->class->pm && dev->class->pm->complete) { pm_dev_dbg(dev, state, "completing class "); @@ -485,7 +485,7 @@ static void device_complete(struct device *dev, pm_message_t state) dev->bus->pm->complete(dev); } - up(&dev->sem); + mutex_unlock(&dev->mutex); } /** @@ -619,7 +619,7 @@ static int device_suspend(struct device *dev, pm_message_t state) { int error = 0; - down(&dev->sem); + mutex_lock(&dev->mutex); if (dev->class) { if (dev->class->pm) { @@ -654,7 +654,7 @@ static int device_suspend(struct device *dev, pm_message_t state) } } End: - up(&dev->sem); + mutex_unlock(&dev->mutex); return error; } @@ -705,7 +705,7 @@ static int device_prepare(struct device *dev, pm_message_t state) { int error = 0; - down(&dev->sem); + mutex_lock(&dev->mutex); if (dev->bus && dev->bus->pm && dev->bus->pm->prepare) { pm_dev_dbg(dev, state, "preparing "); @@ -729,7 +729,7 @@ static int device_prepare(struct device *dev, pm_message_t state) suspend_report_result(dev->class->pm->prepare, error); } End: - up(&dev->sem); + mutex_unlock(&dev->mutex); return error; } diff --git a/drivers/block/hd.c b/drivers/block/hd.c index f9d0160..cc71770 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -165,12 +165,12 @@ unsigned long read_timer(void) unsigned long t, flags; int i; - spin_lock_irqsave(&i8253_lock, flags); + atomic_spin_lock_irqsave(&i8253_lock, flags); t = jiffies * 11932; outb_p(0, 0x43); i = 
inb_p(0x40); i |= inb(0x40) << 8; - spin_unlock_irqrestore(&i8253_lock, flags); + atomic_spin_unlock_irqrestore(&i8253_lock, flags); return(t - i); } #endif diff --git a/drivers/block/paride/pseudo.h b/drivers/block/paride/pseudo.h index bc37032..0fbc78c 100644 --- a/drivers/block/paride/pseudo.h +++ b/drivers/block/paride/pseudo.h @@ -43,7 +43,7 @@ static unsigned long ps_timeout; static int ps_tq_active = 0; static int ps_nice = 0; -static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused))); +static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock); static DECLARE_DELAYED_WORK(ps_tq, ps_tq_int); diff --git a/drivers/char/random.c b/drivers/char/random.c index 8c74448..91cc9c6 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -623,8 +623,11 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -666,8 +669,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) credit_entropy_bits(&input_pool, min_t(int, fls(delta>>1), 11)); } -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c index e0d0f8b..d809c4d 100644 --- a/drivers/char/rtc.c +++ b/drivers/char/rtc.c @@ -1197,10 +1197,12 @@ static void rtc_dropped_irq(unsigned long data) spin_unlock_irq(&rtc_lock); +#ifndef CONFIG_PREEMPT_RT if (printk_ratelimit()) { printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); } +#endif /* Now we have new data */ wake_up_interruptible(&rtc_wait); diff --git a/drivers/char/tty_buffer.c b/drivers/char/tty_buffer.c index 3108991..7100206 100644 --- 
a/drivers/char/tty_buffer.c +++ b/drivers/char/tty_buffer.c @@ -495,10 +495,14 @@ void tty_flip_buffer_push(struct tty_struct *tty) tty->buf.tail->commit = tty->buf.tail->used; spin_unlock_irqrestore(&tty->buf.lock, flags); +#ifndef CONFIG_PREEMPT_RT if (tty->low_latency) flush_to_ldisc(&tty->buf.work.work); else schedule_delayed_work(&tty->buf.work, 1); +#else + flush_to_ldisc(&tty->buf.work.work); +#endif } EXPORT_SYMBOL(tty_flip_buffer_push); diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c index e48af9f..c8f7b70 100644 --- a/drivers/char/tty_ldisc.c +++ b/drivers/char/tty_ldisc.c @@ -69,7 +69,7 @@ static void put_ldisc(struct tty_ldisc *ld) * We really want an "atomic_dec_and_lock_irqsave()", * but we don't have it, so this does it by hand. */ - local_irq_save(flags); + local_irq_save_nort(flags); if (atomic_dec_and_lock(&ld->users, &tty_ldisc_lock)) { struct tty_ldisc_ops *ldo = ld->ops; @@ -80,7 +80,7 @@ static void put_ldisc(struct tty_ldisc *ld) kfree(ld); return; } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /** diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 404f4c1..dee3f64 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -2537,7 +2537,7 @@ static struct console vt_console_driver = { .write = vt_console_print, .device = vt_console_device, .unblank = unblank_screen, - .flags = CON_PRINTBUFFER, + .flags = CON_PRINTBUFFER | CON_ATOMIC, .index = -1, }; #endif diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 97e656a..16c0e4a 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -762,9 +762,9 @@ static int update_unit(struct device *dev, void *data) struct fw_driver *driver = (struct fw_driver *)dev->driver; if (is_fw_unit(dev) && driver != NULL && driver->update != NULL) { - down(&dev->sem); + mutex_lock(&dev->mutex); driver->update(unit); - up(&dev->sem); + mutex_unlock(&dev->mutex); } return 0; diff --git a/drivers/ide/alim15x3.c 
b/drivers/ide/alim15x3.c index e59b6de..b35ede8 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -90,7 +90,7 @@ static void ali_set_pio_mode(ide_drive_t *drive, const u8 pio) if (r_clc >= 16) r_clc = 0; } - local_irq_save(flags); + local_irq_save_nort(flags); /* * PIO mode => ATA FIFO on, ATAPI FIFO off @@ -112,7 +112,7 @@ static void ali_set_pio_mode(ide_drive_t *drive, const u8 pio) pci_write_config_byte(dev, port, s_clc); pci_write_config_byte(dev, port + unit + 2, (a_clc << 4) | r_clc); - local_irq_restore(flags); + local_irq_restore_nort(flags); } /** @@ -223,7 +223,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev) isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL); - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision < 0xC2) { /* @@ -314,7 +314,7 @@ out: } pci_dev_put(north); pci_dev_put(isa_dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -376,7 +376,7 @@ static u8 ali_cable_detect(ide_hwif_t *hwif) unsigned long flags; u8 cbl = ATA_CBL_PATA40, tmpbyte; - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision >= 0xC2) { /* @@ -397,7 +397,7 @@ static u8 ali_cable_detect(ide_hwif_t *hwif) } } - local_irq_restore(flags); + local_irq_restore_nort(flags); return cbl; } diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index 7ce68ef..26c3a83 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -1302,7 +1302,7 @@ static int __devinit init_dma_hpt366(ide_hwif_t *hwif, dma_old = inb(base + 2); - local_irq_save(flags); + local_irq_save_nort(flags); dma_new = dma_old; pci_read_config_byte(dev, hwif->channel ? 
0x4b : 0x43, &masterdma); @@ -1313,7 +1313,7 @@ static int __devinit init_dma_hpt366(ide_hwif_t *hwif, if (dma_new != dma_old) outb(dma_new, base + 2); - local_irq_restore(flags); + local_irq_restore_nort(flags); printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n", hwif->name, base, base + 7); diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 46721c4..b6f114a 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -174,7 +174,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned long uninitialized_var(flags); if ((io_32bit & 2) && !mmio) { - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(io_ports->nsect_addr); } @@ -185,7 +185,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, insl(data_addr, buf, words); if ((io_32bit & 2) && !mmio) - local_irq_restore(flags); + local_irq_restore_nort(flags); if (((len + 1) & 3) < 2) return; @@ -218,7 +218,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned long uninitialized_var(flags); if ((io_32bit & 2) && !mmio) { - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(io_ports->nsect_addr); } @@ -229,7 +229,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, outsl(data_addr, buf, words); if ((io_32bit & 2) && !mmio) - local_irq_restore(flags); + local_irq_restore_nort(flags); if (((len + 1) & 3) < 2) return; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index db96138..b8b3ab6 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -667,7 +667,7 @@ void ide_timer_expiry (unsigned long data) /* disable_irq_nosync ?? 
*/ disable_irq(hwif->irq); /* local CPU only, as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwif->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 2892b24..b1765dd 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -129,12 +129,12 @@ static int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad, if ((stat & ATA_BUSY) == 0) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *rstat = stat; return -EBUSY; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it again. diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 1bb106f..596f186 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id) int bswap = 1; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* read 512 bytes of id info */ hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE); - local_irq_restore(flags); + local_irq_restore_nort(flags); drive->dev_flags |= IDE_DFLAG_ID_READ; #ifdef DEBUG diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 75b85a8..bf341a0 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -248,7 +248,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, offset %= PAGE_SIZE; if (PageHighMem(page)) - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; @@ -269,7 +269,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, kunmap_atomic(buf, KM_BIO_SRC_IRQ); if (PageHighMem(page)) - local_irq_restore(flags); + local_irq_restore_nort(flags); len -= nr_bytes; } @@ -406,7 +406,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, } if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0) - 
local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE); diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c index 5122b5a..bcc3d32 100644 --- a/drivers/ieee1394/nodemgr.c +++ b/drivers/ieee1394/nodemgr.c @@ -1397,9 +1397,9 @@ static int update_pdrv(struct device *dev, void *data) pdrv = container_of(drv, struct hpsb_protocol_driver, driver); if (pdrv->update) { - down(&ud->device.sem); + mutex_lock(&ud->device.mutex); error = pdrv->update(ud); - up(&ud->device.sem); + mutex_unlock(&ud->device.mutex); } if (error) device_release_driver(&ud->device); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 8c46f22..727776a 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -1003,7 +1003,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, port->ib_dev = device; port->port_num = port_num; - init_MUTEX(&port->sm_sem); + semaphore_init(&port->sm_sem); mutex_init(&port->file_mutex); INIT_LIST_HEAD(&port->file_list); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index a0e9753..1dc6446 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -773,7 +773,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) ipoib_mcast_stop_thread(dev, 0); - local_irq_save(flags); + local_irq_save_nort(flags); netif_addr_lock(dev); spin_lock(&priv->lock); @@ -852,7 +852,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) spin_unlock(&priv->lock); netif_addr_unlock(dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* We have to cancel outside of the spinlock */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index ac11be0..29577a8 100644 --- a/drivers/input/gameport/gameport.c +++ 
b/drivers/input/gameport/gameport.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/delay.h> #include <linux/kthread.h> +#include <linux/interrupt.h> #include <linux/sched.h> /* HZ */ #include <linux/mutex.h> #include <linux/freezer.h> @@ -57,11 +58,11 @@ static unsigned int get_time_pit(void) unsigned long flags; unsigned int count; - spin_lock_irqsave(&i8253_lock, flags); + atomic_spin_lock_irqsave(&i8253_lock, flags); outb_p(0x00, 0x43); count = inb_p(0x40); count |= inb_p(0x40) << 8; - spin_unlock_irqrestore(&i8253_lock, flags); + atomic_spin_unlock_irqrestore(&i8253_lock, flags); return count; } @@ -87,12 +88,12 @@ static int gameport_measure_speed(struct gameport *gameport) tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); GET_TIME(t1); for (t = 0; t < 50; t++) gameport_read(gameport); GET_TIME(t2); GET_TIME(t3); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; } @@ -111,11 +112,11 @@ static int gameport_measure_speed(struct gameport *gameport) tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); rdtscl(t1); for (t = 0; t < 50; t++) gameport_read(gameport); rdtscl(t2); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; } diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index 1c0b529..2c52c68 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -146,11 +146,11 @@ static unsigned int get_time_pit(void) unsigned long flags; unsigned int count; - spin_lock_irqsave(&i8253_lock, flags); + atomic_spin_lock_irqsave(&i8253_lock, flags); outb_p(0x00, 0x43); count = inb_p(0x40); count |= inb_p(0x40) << 8; - spin_unlock_irqrestore(&i8253_lock, flags); + atomic_spin_unlock_irqrestore(&i8253_lock, flags); return count; } diff --git a/drivers/input/keyboard/hil_kbd.c 
b/drivers/input/keyboard/hil_kbd.c index 6f35670..53c2547 100644 --- a/drivers/input/keyboard/hil_kbd.c +++ b/drivers/input/keyboard/hil_kbd.c @@ -277,7 +277,7 @@ static int hil_kbd_connect(struct serio *serio, struct serio_driver *drv) serio_set_drvdata(serio, kbd); kbd->serio = serio; - init_MUTEX_LOCKED(&kbd->sem); + semaphore_init_locked(&kbd->sem); /* Get device info. MLC driver supplies devid/status/etc. */ serio->write(serio, 0); diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c index 216a559..a59c141 100644 --- a/drivers/input/misc/hp_sdc_rtc.c +++ b/drivers/input/misc/hp_sdc_rtc.c @@ -104,7 +104,7 @@ static int hp_sdc_rtc_do_read_bbrtc (struct rtc_time *rtctm) t.endidx = 91; t.seq = tseq; t.act.semaphore = &tsem; - init_MUTEX_LOCKED(&tsem); + semaphore_init_locked(&tsem); if (hp_sdc_enqueue_transaction(&t)) return -1; @@ -686,7 +686,7 @@ static int __init hp_sdc_rtc_init(void) return -ENODEV; #endif - init_MUTEX(&i8042tregs); + semaphore_init(&i8042tregs); if ((ret = hp_sdc_request_timer_irq(&hp_sdc_rtc_isr))) return ret; diff --git a/drivers/input/misc/pcspkr.c b/drivers/input/misc/pcspkr.c index 21cb755..5dc0ccd 100644 --- a/drivers/input/misc/pcspkr.c +++ b/drivers/input/misc/pcspkr.c @@ -30,7 +30,7 @@ MODULE_ALIAS("platform:pcspkr"); #include <asm/i8253.h> #else #include <asm/8253pit.h> -static DEFINE_SPINLOCK(i8253_lock); +static DEFINE_ATOMIC_SPINLOCK(i8253_lock); #endif static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) @@ -50,7 +50,7 @@ static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int c if (value > 20 && value < 32767) count = PIT_TICK_RATE / value; - spin_lock_irqsave(&i8253_lock, flags); + atomic_spin_lock_irqsave(&i8253_lock, flags); if (count) { /* set command for counter 2, 2 byte write */ @@ -65,7 +65,7 @@ static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int c outb(inb_p(0x61) & 0xFC, 0x61); } - 
spin_unlock_irqrestore(&i8253_lock, flags); + atomic_spin_unlock_irqrestore(&i8253_lock, flags); return 0; } diff --git a/drivers/input/mouse/hil_ptr.c b/drivers/input/mouse/hil_ptr.c index 3263ce0..4fa00a0 100644 --- a/drivers/input/mouse/hil_ptr.c +++ b/drivers/input/mouse/hil_ptr.c @@ -270,7 +270,7 @@ static int hil_ptr_connect(struct serio *serio, struct serio_driver *driver) serio_set_drvdata(serio, ptr); ptr->serio = serio; - init_MUTEX_LOCKED(&ptr->sem); + semaphore_init_locked(&ptr->sem); /* Get device info. MLC driver supplies devid/status/etc. */ serio->write(serio, 0); diff --git a/drivers/input/serio/hil_mlc.c b/drivers/input/serio/hil_mlc.c index 7ba9f2b..17f3641 100644 --- a/drivers/input/serio/hil_mlc.c +++ b/drivers/input/serio/hil_mlc.c @@ -914,15 +914,15 @@ int hil_mlc_register(hil_mlc *mlc) mlc->ostarted = 0; rwlock_init(&mlc->lock); - init_MUTEX(&mlc->osem); + semaphore_init(&mlc->osem); - init_MUTEX(&mlc->isem); + semaphore_init(&mlc->isem); mlc->icount = -1; mlc->imatch = 0; mlc->opercnt = 0; - init_MUTEX_LOCKED(&(mlc->csem)); + semaphore_init_locked(&(mlc->csem)); hil_mlc_clear_di_scratch(mlc); hil_mlc_clear_di_map(mlc, 0); diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c index 1c9410d..14b7ccb 100644 --- a/drivers/input/serio/hp_sdc.c +++ b/drivers/input/serio/hp_sdc.c @@ -1039,7 +1039,7 @@ static int __init hp_sdc_register(void) return hp_sdc.dev_err; } - init_MUTEX_LOCKED(&tq_init_sem); + semaphore_init_locked(&tq_init_sem); tq_init.actidx = 0; tq_init.idx = 1; diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index 23741ce..4d9b203 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -83,7 +83,7 @@ static struct adb_driver *adb_controller; BLOCKING_NOTIFIER_HEAD(adb_client_list); static int adb_got_sleep; static int adb_inited; -static DECLARE_MUTEX(adb_probe_mutex); +static DEFINE_SEMAPHORE(adb_probe_mutex); static int sleepy_trackpad; static int autopoll_devs; int __adb_probe_sync; diff --git
a/drivers/media/dvb/dvb-core/dvb_frontend.c b/drivers/media/dvb/dvb-core/dvb_frontend.c index f50ca72..28f0e3f 100644 --- a/drivers/media/dvb/dvb-core/dvb_frontend.c +++ b/drivers/media/dvb/dvb-core/dvb_frontend.c @@ -101,7 +101,7 @@ struct dvb_frontend_private { struct dvb_device *dvbdev; struct dvb_frontend_parameters parameters; struct dvb_fe_events events; - struct semaphore sem; + struct anon_semaphore sem; struct list_head list_head; wait_queue_head_t wait_queue; struct task_struct *thread; @@ -189,12 +189,12 @@ static int dvb_frontend_get_event(struct dvb_frontend *fe, if (flags & O_NONBLOCK) return -EWOULDBLOCK; - up(&fepriv->sem); + anon_up(&fepriv->sem); ret = wait_event_interruptible (events->wait_queue, events->eventw != events->eventr); - if (down_interruptible (&fepriv->sem)) + if (anon_down_interruptible (&fepriv->sem)) return -ERESTARTSYS; if (ret < 0) @@ -534,7 +534,7 @@ static int dvb_frontend_thread(void *data) set_freezable(); while (1) { - up(&fepriv->sem); /* is locked when we enter the thread... */ + anon_up(&fepriv->sem); /* is locked when we enter the thread... 
*/ restart: timeout = wait_event_interruptible_timeout(fepriv->wait_queue, dvb_frontend_should_wakeup(fe) || kthread_should_stop() @@ -550,7 +550,7 @@ restart: if (try_to_freeze()) goto restart; - if (down_interruptible(&fepriv->sem)) + if (anon_down_interruptible(&fepriv->sem)) break; if (fepriv->reinitialise) { @@ -678,7 +678,7 @@ static void dvb_frontend_stop(struct dvb_frontend *fe) kthread_stop(fepriv->thread); - init_MUTEX (&fepriv->sem); + anon_semaphore_init(&fepriv->sem); fepriv->state = FESTATE_IDLE; /* paranoia check in case a signal arrived */ @@ -747,7 +747,7 @@ static int dvb_frontend_start(struct dvb_frontend *fe) if (signal_pending(current)) return -EINTR; - if (down_interruptible (&fepriv->sem)) + if (anon_down_interruptible (&fepriv->sem)) return -EINTR; fepriv->state = FESTATE_IDLE; @@ -760,7 +760,7 @@ static int dvb_frontend_start(struct dvb_frontend *fe) if (IS_ERR(fe_thread)) { ret = PTR_ERR(fe_thread); printk("dvb_frontend_start: failed to start kthread (%d)\n", ret); - up(&fepriv->sem); + anon_up(&fepriv->sem); return ret; } fepriv->thread = fe_thread; @@ -1372,7 +1372,7 @@ static int dvb_frontend_ioctl(struct inode *inode, struct file *file, cmd == FE_DISEQC_RECV_SLAVE_REPLY)) return -EPERM; - if (down_interruptible (&fepriv->sem)) + if (anon_down_interruptible (&fepriv->sem)) return -ERESTARTSYS; if ((cmd == FE_SET_PROPERTY) || (cmd == FE_GET_PROPERTY)) @@ -1382,7 +1382,7 @@ static int dvb_frontend_ioctl(struct inode *inode, struct file *file, err = dvb_frontend_ioctl_legacy(inode, file, cmd, parg); } - up(&fepriv->sem); + anon_up(&fepriv->sem); return err; } @@ -1909,7 +1909,7 @@ int dvb_register_frontend(struct dvb_adapter* dvb, } fepriv = fe->frontend_priv; - init_MUTEX (&fepriv->sem); + anon_semaphore_init(&fepriv->sem); init_waitqueue_head (&fepriv->wait_queue); init_waitqueue_head (&fepriv->events.wait_queue); mutex_init(&fepriv->events.mtx); diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c index 7d43083..c1bc157 
100644 --- a/drivers/mfd/twl4030-irq.c +++ b/drivers/mfd/twl4030-irq.c @@ -458,12 +458,12 @@ static void twl4030_sih_do_edge(struct work_struct *work) bytes[byte] &= ~(0x03 << off); - spin_lock_irq(&d->lock); + atomic_spin_lock_irq(&d->lock); if (d->status & IRQ_TYPE_EDGE_RISING) bytes[byte] |= BIT(off + 1); if (d->status & IRQ_TYPE_EDGE_FALLING) bytes[byte] |= BIT(off + 0); - spin_unlock_irq(&d->lock); + atomic_spin_unlock_irq(&d->lock); edge_change &= ~BIT(i); } diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c index fea9085..3c59c26 100644 --- a/drivers/mfd/ucb1x00-core.c +++ b/drivers/mfd/ucb1x00-core.c @@ -24,6 +24,7 @@ #include <linux/interrupt.h> #include <linux/device.h> #include <linux/mutex.h> +#include <linux/semaphore.h> #include <mach/dma.h> #include <mach/hardware.h> diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 68ab39d..f51ba7b 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -76,6 +76,35 @@ config IBM_ASM information on the specific driver level and support statement for your IBM server. +config HWLAT_DETECTOR + tristate "Testing module to detect hardware-induced latencies" + depends on DEBUG_FS + select RING_BUFFER + default m + ---help--- + A simple hardware latency detector. Use this module to detect + large latencies introduced by the behavior of the underlying + system firmware external to Linux. We do this using periodic + use of stop_machine to grab all available CPUs and measure + for unexplainable gaps in the CPU timestamp counter(s). By + default, the module is not enabled until the "enable" file + within the "hwlat_detector" debugfs directory is toggled. + + This module is often used to detect SMI (System Management + Interrupts) on x86 systems, though is not x86 specific. To + this end, we default to using a sample window of 1 second, + during which we will sample for 0.5 seconds. 
If an SMI or + similar event occurs during that time, it is recorded + into an 8K samples global ring buffer until retrieved. + + WARNING: This software should never be enabled (it can be built + but should not be turned on after it is loaded) in a production + environment where high latencies are a concern since the + sampling mechanism actually introduces latencies for + regular tasks while the CPU(s) are being held. + + If unsure, say N + config PHANTOM tristate "Sensable PHANToM (PCI)" depends on PCI diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 36f733c..854d9ff 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -22,3 +22,4 @@ obj-$(CONFIG_ISL29003) += isl29003.o obj-$(CONFIG_C2PORT) += c2port/ obj-y += eeprom/ obj-y += cb710/ +obj-$(CONFIG_HWLAT_DETECTOR) += hwlat_detector.o diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c new file mode 100644 index 0000000..be6553f --- /dev/null +++ b/drivers/misc/hwlat_detector.c @@ -0,0 +1,1208 @@ +/* + * hwlat_detector.c - A simple Hardware Latency detector. + * + * Use this module to detect large system latencies induced by the behavior of + * certain underlying system hardware or firmware, independent of Linux itself. + * The code was developed originally to detect the presence of SMIs on Intel + * and AMD systems, although there is no dependency upon x86 herein. + * + * The classical example usage of this module is in detecting the presence of + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a + * somewhat special form of hardware interrupt spawned from earlier CPU debug + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge + * LPC (or other device) to generate a special interrupt under certain + * circumstances, for example, upon expiration of a special SMI timer device, + * due to certain external thermal readings, on certain I/O address accesses, + * and other situations.
An SMI hits a special CPU pin, triggers a special + * SMI mode (complete with special memory map), and the OS is unaware. + * + * Although certain hardware-inducing latencies are necessary (for example, + * a modern system often requires an SMI handler for correct thermal control + * and remote management) they can wreak havoc upon any OS-level performance + * guarantees toward low-latency, especially when the OS is not even made + * aware of the presence of these interrupts. For this reason, we need a + * somewhat brute force mechanism to detect these interrupts. In this case, + * we do it by hogging all of the CPU(s) for configurable timer intervals, + * sampling the built-in CPU timer, looking for discontiguous readings. + * + * WARNING: This implementation necessarily introduces latencies. Therefore, + * you should NEVER use this module in a production environment + * requiring any kind of low-latency performance guarantee(s). + * + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com> + * + * Includes useful feedback from Clark Williams <clark@redhat.com> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. 
+ */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/ring_buffer.h> +#include <linux/stop_machine.h> +#include <linux/time.h> +#include <linux/hrtimer.h> +#include <linux/kthread.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include <linux/version.h> +#include <linux/delay.h> + +#define BUF_SIZE_DEFAULT 262144UL /* 8K*(sizeof(entry)) */ +#define BUF_FLAGS (RB_FL_OVERWRITE) /* no block on full */ +#define U64STR_SIZE 22 /* 20 digits max */ + +#define VERSION "1.0.0" +#define BANNER "hwlat_detector: " +#define DRVNAME "hwlat_detector" +#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ +#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ +#define DEFAULT_LAT_THRESHOLD 10 /* 10us */ + +/* Module metadata */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jon Masters <jcm@redhat.com>"); +MODULE_DESCRIPTION("A simple hardware latency detector"); +MODULE_VERSION(VERSION); + +/* Module parameters */ + +static int debug; +static int enabled; +static int threshold; + +module_param(debug, int, 0); /* enable debug */ +module_param(enabled, int, 0); /* enable detector */ +module_param(threshold, int, 0); /* latency threshold */ + +/* Buffering and sampling */ + +static struct ring_buffer *ring_buffer; /* sample buffer */ +static DEFINE_MUTEX(ring_buffer_mutex); /* lock changes */ +static unsigned long buf_size = BUF_SIZE_DEFAULT; +static struct task_struct *kthread; /* sampling thread */ + +/* DebugFS filesystem entries */ + +static struct dentry *debug_dir; /* debugfs directory */ +static struct dentry *debug_max; /* maximum TSC delta */ +static struct dentry *debug_count; /* total detect count */ +static struct dentry *debug_sample_width; /* sample width us */ +static struct dentry *debug_sample_window; /* sample window us */ +static struct dentry *debug_sample; /* raw samples us */ +static struct dentry *debug_threshold; /* threshold us */ +static struct dentry *debug_enable; /* enable/disable */ + +/* Individual samples 
and global state */ + +struct sample; /* latency sample */ +struct data; /* Global state */ + +/* Sampling functions */ +static int __buffer_add_sample(struct sample *sample); +static struct sample *buffer_get_sample(struct sample *sample); +static int get_sample(void *unused); + +/* Threading and state */ +static int kthread_fn(void *unused); +static int start_kthread(void); +static int stop_kthread(void); +static void __reset_stats(void); +static int init_stats(void); + +/* Debugfs interface */ +static ssize_t simple_data_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos, const u64 *entry); +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, u64 *entry); +static int debug_sample_fopen(struct inode *inode, struct file *filp); +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); +static int debug_sample_release(struct inode *inode, struct file *filp); +static int debug_enable_fopen(struct inode *inode, struct file *filp); +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); +static ssize_t debug_enable_fwrite(struct file *file, + const char __user *user_buffer, + size_t user_size, loff_t *offset); + +/* Initialization functions */ +static int init_debugfs(void); +static void free_debugfs(void); +static int detector_init(void); +static void detector_exit(void); + +/* Individual latency samples are stored here when detected and packed into + * the ring_buffer circular buffer, where they are overwritten when + * more than buf_size/sizeof(sample) samples are received. */ +struct sample { + u64 seqnum; /* unique sequence */ + u64 duration; /* ktime delta */ + struct timespec timestamp; /* wall time */ +}; + +/* keep the global state somewhere. Mostly used under stop_machine. 
*/ +static struct data { + + struct mutex lock; /* protect changes */ + + u64 count; /* total since reset */ + u64 max_sample; /* max hardware latency */ + u64 threshold; /* sample threshold level */ + + u64 sample_window; /* total sampling window (on+off) */ + u64 sample_width; /* active sampling portion of window */ + + atomic_t sample_open; /* whether the sample file is open */ + + wait_queue_head_t wq; /* waitqeue for new sample values */ + +} data; + +/** + * __buffer_add_sample - add a new latency sample recording to the ring buffer + * @sample: The new latency sample value + * + * This receives a new latency sample and records it in a global ring buffer. + * No additional locking is used in this case - suited for stop_machine use. + */ +static int __buffer_add_sample(struct sample *sample) +{ + return ring_buffer_write(ring_buffer, + sizeof(struct sample), sample); +} + +/** + * buffer_get_sample - remove a hardware latency sample from the ring buffer + * @sample: Pre-allocated storage for the sample + * + * This retrieves a hardware latency sample from the global circular buffer + */ +static struct sample *buffer_get_sample(struct sample *sample) +{ + struct ring_buffer_event *e = NULL; + struct sample *s = NULL; + unsigned int cpu = 0; + + if (!sample) + return NULL; + + /* ring_buffers are per-cpu but we just want any value */ + /* so we'll start with this cpu and try others if not */ + /* Steven is planning to add a generic mechanism */ + mutex_lock(&ring_buffer_mutex); + e = ring_buffer_consume(ring_buffer, smp_processor_id(), NULL); + if (!e) { + for_each_online_cpu(cpu) { + e = ring_buffer_consume(ring_buffer, cpu, NULL); + if (e) + break; + } + } + + if (e) { + s = ring_buffer_event_data(e); + memcpy(sample, s, sizeof(struct sample)); + } else + sample = NULL; + mutex_unlock(&ring_buffer_mutex); + + return sample; +} + +/** + * get_sample - sample the CPU TSC and look for likely hardware latencies + * @unused: This is not used but is a part of the 
stop_machine API + * + * Used to repeatedly capture the CPU TSC (or similar), looking for potential + * hardware-induced latency. Called under stop_machine, with data.lock held. + */ +static int get_sample(void *unused) +{ + ktime_t start, t1, t2; + s64 diff, total = 0; + u64 sample = 0; + int ret = 1; + + start = ktime_get(); /* start timestamp */ + + do { + + t1 = ktime_get(); /* we'll look for a discontinuity */ + t2 = ktime_get(); + + total = ktime_to_us(ktime_sub(t2, start)); /* sample width */ + diff = ktime_to_us(ktime_sub(t2, t1)); /* current diff */ + + /* This shouldn't happen */ + if (diff < 0) { + printk(KERN_ERR BANNER "time running backwards\n"); + goto out; + } + + if (diff > sample) + sample = diff; /* only want highest value */ + + } while (total <= data.sample_width); + + /* If we exceed the threshold value, we have found a hardware latency */ + if (sample > data.threshold) { + struct sample s; + + data.count++; + s.seqnum = data.count; + s.duration = sample; + s.timestamp = CURRENT_TIME; + __buffer_add_sample(&s); + + /* Keep a running maximum ever recorded hardware latency */ + if (sample > data.max_sample) + data.max_sample = sample; + } + + ret = 0; +out: + return ret; +} + +/* + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread + * @unused: A required part of the kthread API. + * + * Used to periodically sample the CPU TSC via a call to get_sample. We + * use stop_machine, which does (intentionally) introduce latency since we + * need to ensure nothing else might be running (and thus pre-empting). + * Obviously this should never be used in production environments. + * + * stop_machine will schedule us typically only on CPU0 which is fine for + * almost every real-world hardware latency situation - but we might later + * generalize this if we find there are any actual systems with alternate + * SMI delivery or other non CPU0 hardware latencies.
+ */ +static int kthread_fn(void *unused) +{ + int err = 0; + u64 interval = 0; + + while (!kthread_should_stop()) { + + mutex_lock(&data.lock); + + err = stop_machine(get_sample, unused, 0); + if (err) { + /* Houston, we have a problem */ + mutex_unlock(&data.lock); + goto err_out; + } + + wake_up(&data.wq); /* wake up reader(s) */ + + interval = data.sample_window - data.sample_width; + do_div(interval, USEC_PER_MSEC); /* modifies interval value */ + + mutex_unlock(&data.lock); + + if (msleep_interruptible(interval)) + goto out; + } + goto out; +err_out: + printk(KERN_ERR BANNER "could not call stop_machine, disabling\n"); + enabled = 0; +out: + return err; + +} + +/** + * start_kthread - Kick off the hardware latency sampling/detector kthread + * + * This starts a kernel thread that will sit and sample the CPU timestamp + * counter (TSC or similar) and look for potential hardware latencies. + */ +static int start_kthread(void) +{ + kthread = kthread_run(kthread_fn, NULL, + DRVNAME); + if (IS_ERR(kthread)) { + printk(KERN_ERR BANNER "could not start sampling thread\n"); + enabled = 0; + return PTR_ERR(kthread); /* report the real kthread_run() error */ + } + + return 0; +} + +/** + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop + * + * This kicks the running hardware latency sampling/detector kernel thread and + * tells it to stop sampling now. Use this on unload and at system shutdown. + */ +static int stop_kthread(void) +{ + int ret; + + ret = kthread_stop(kthread); + + return ret; +} + +/** + * __reset_stats - Reset statistics for the hardware latency detector + * + * We use data to store various statistics and global state. We call this + * function in order to reset those when "enable" is toggled on or off, and + * also at initialization. Should be called with data.lock held.
+ */ +static void __reset_stats(void) +{ + data.count = 0; + data.max_sample = 0; + ring_buffer_reset(ring_buffer); /* flush out old sample entries */ +} + +/** + * init_stats - Setup global state statistics for the hardware latency detector + * + * We use data to store various statistics and global state. We also use + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware + * induced system latencies. This function initializes these structures and + * allocates the ring buffer; a positive "threshold" module param is honored. + */ +static int init_stats(void) +{ + int ret = -ENOMEM; + + mutex_init(&data.lock); + init_waitqueue_head(&data.wq); + atomic_set(&data.sample_open, 0); + + ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS); + + if (WARN(!ring_buffer, KERN_ERR BANNER + "failed to allocate ring buffer!\n")) + goto out; + + __reset_stats(); + data.threshold = (threshold > 0) ? threshold : DEFAULT_LAT_THRESHOLD; + data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */ + data.sample_width = DEFAULT_SAMPLE_WIDTH; /* width us */ + + ret = 0; + +out: + return ret; + +} + +/* + * simple_data_read - Wrapper read function for global state debugfs entries + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * @entry: The entry to read from + * + * This function provides a generic read implementation for the global state + * "data" structure debugfs filesystem entries. It would be nice to use + * simple_attr_read directly, but we need to make sure that the data.lock + * mutex is held during the actual read (even though we likely won't ever + * actually race here as the updater runs under a stop_machine context).
+ */ +static ssize_t simple_data_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos, const u64 *entry) +{ + char buf[U64STR_SIZE]; + u64 val = 0; + int len = 0; + + memset(buf, 0, sizeof(buf)); + + if (!entry) + return -EFAULT; + + mutex_lock(&data.lock); + val = *entry; + mutex_unlock(&data.lock); + + len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); + +} + +/* + * simple_data_write - Wrapper write function for global state debugfs entries + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to write value from + * @cnt: The maximum number of bytes to write + * @ppos: The current "file" position + * @entry: The entry to write to + * + * This function provides a generic write implementation for the global state + * "data" structure debugfs filesystem entries. It would be nice to use + * simple_attr_write directly, but we need to make sure that the data.lock + * spinlock is held during the actual write (even though we likely won't ever + * actually race here as the updater runs under a stop_machine context). 
+ */ +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, u64 *entry) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = strict_strtoull(buf, 10, &val); + if (err) + return -EINVAL; + + mutex_lock(&data.lock); + *entry = val; + mutex_unlock(&data.lock); + + return csize; +} + +/** + * debug_count_fopen - Open function for "count" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "count" debugfs + * interface to the hardware latency detector. + */ +static int debug_count_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_count_fread - Read function for "count" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "count" debugfs + * interface to the hardware latency detector. Can be used to read the + * number of latency readings exceeding the configured threshold since + * the detector was last reset (e.g. by writing a zero into "count"). 
+ */ +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.count); +} + +/** + * debug_count_fwrite - Write function for "count" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "count" debugfs + * interface to the hardware latency detector. Can be used to write a + * desired value, especially to zero the total count. + */ +static ssize_t debug_count_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + return simple_data_write(filp, ubuf, cnt, ppos, &data.count); +} + +/** + * debug_enable_fopen - Dummy open function for "enable" debugfs interface + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "enable" debugfs + * interface to the hardware latency detector. + */ +static int debug_enable_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_enable_fread - Read function for "enable" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "enable" debugfs + * interface to the hardware latency detector. Can be used to determine + * whether the detector is currently enabled ("0\n" or "1\n" returned). 
+ */ +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[4]; + + if ((cnt < sizeof(buf)) || (*ppos)) + return 0; + + buf[0] = enabled ? '1' : '0'; + buf[1] = '\n'; + buf[2] = '\0'; + if (copy_to_user(ubuf, buf, strlen(buf))) + return -EFAULT; + return *ppos = strlen(buf); +} + +/** + * debug_enable_fwrite - Write function for "enable" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "enable" debugfs + * interface to the hardware latency detector. Can be used to enable or + * disable the detector, which will have the side-effect of possibly + * also resetting the global stats and kicking off the measuring + * kthread (on an enable) or the converse (upon a disable). 
+ */ +static ssize_t debug_enable_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[4]; + int csize = min(cnt, sizeof(buf)); + unsigned long val = 0; /* strict_strtoul() stores through &val */ + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[sizeof(buf)-1] = '\0'; /* just in case */ + err = strict_strtoul(buf, 10, &val); + if (0 != err) + return -EINVAL; + + if (val) { + if (enabled) + goto unlock; + enabled = 1; + __reset_stats(); + if (start_kthread()) + return -EFAULT; + } else { + if (!enabled) + goto unlock; + enabled = 0; + stop_kthread(); + wake_up(&data.wq); /* reader(s) should return */ + } +unlock: + return csize; +} + +/** + * debug_max_fopen - Open function for "max" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "max" debugfs + * interface to the hardware latency detector. + */ +static int debug_max_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_max_fread - Read function for "max" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "max" debugfs + * interface to the hardware latency detector. Can be used to determine + * the maximum latency value observed since it was last reset.
+ */ +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample); +} + +/** + * debug_max_fwrite - Write function for "max" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "max" debugfs + * interface to the hardware latency detector. Can be used to reset the + * maximum or set it to some other desired value - if, then, subsequent + * measurements exceed this value, the maximum will be updated. + */ +static ssize_t debug_max_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample); +} + + +/** + * debug_sample_fopen - An open function for "sample" debugfs interface + * @inode: The in-kernel inode representation of this debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function handles opening the "sample" file within the hardware + * latency detector debugfs directory interface. This file is used to read + * raw samples from the global ring_buffer and allows the user to see a + * running latency history. Can be opened blocking or non-blocking, + * affecting whether it behaves as a buffer read pipe, or does not. + * Implements simple locking to prevent multiple simultaneous use. 
+ */ +static int debug_sample_fopen(struct inode *inode, struct file *filp) +{ + if (!atomic_add_unless(&data.sample_open, 1, 1)) + return -EBUSY; + else + return 0; +} + +/** + * debug_sample_fread - A read function for "sample" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that will contain the samples read + * @cnt: The maximum bytes to read from the debugfs "file" + * @ppos: The current position in the debugfs "file" + * + * This function handles reading from the "sample" file within the hardware + * latency detector debugfs directory interface. This file is used to read + * raw samples from the global ring_buffer and allows the user to see a + * running latency history. By default this will block pending a new + * value written into the sample buffer, unless there are already a + * number of value(s) waiting in the buffer, or the sample file was + * previously opened in a non-blocking mode of operation. + */ +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int len = 0; + char buf[64]; + struct sample *sample = NULL; + + if (!enabled) + return 0; + + sample = kzalloc(sizeof(struct sample), GFP_KERNEL); + if (!sample) + return -ENOMEM; + + while (!buffer_get_sample(sample)) { + + DEFINE_WAIT(wait); + + if (filp->f_flags & O_NONBLOCK) { + len = -EAGAIN; + goto out; + } + + prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&data.wq, &wait); + + if (signal_pending(current)) { + len = -EINTR; + goto out; + } + + if (!enabled) { /* enable was toggled */ + len = 0; + goto out; + } + } + + len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\n", + sample->timestamp.tv_sec, + sample->timestamp.tv_nsec, + (unsigned long long)sample->duration); + + + /* no partial reads: a too-small user buffer is an error */ + if (len > cnt) { + len = -EINVAL; + goto out; + } + if (copy_to_user(ubuf, buf, len)) + len = -EFAULT; +out: + kfree(sample); + return len; +} + +/**
+ * debug_sample_release - Release function for "sample" debugfs interface + * @inode: The in-kernel inode represenation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function completes the close of the debugfs interface "sample" file. + * Frees the sample_open "lock" so that other users may open the interface. + */ +static int debug_sample_release(struct inode *inode, struct file *filp) +{ + atomic_dec(&data.sample_open); + + return 0; +} + +/** + * debug_threshold_fopen - Open function for "threshold" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "threshold" debugfs + * interface to the hardware latency detector. + */ +static int debug_threshold_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_threshold_fread - Read function for "threshold" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "threshold" debugfs + * interface to the hardware latency detector. It can be used to determine + * the current threshold level at which a latency will be recorded in the + * global ring buffer, typically on the order of 10us. 
+ */ +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold); +} + +/** + * debug_threshold_fwrite - Write function for "threshold" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "threshold" debugfs + * interface to the hardware latency detector. It can be used to configure + * the threshold level at which any subsequently detected latencies will + * be recorded into the global ring buffer. + */ +static ssize_t debug_threshold_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + int ret; + + ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold); + + if (enabled) + wake_up_process(kthread); + + return ret; +} + +/** + * debug_width_fopen - Open function for "width" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "width" debugfs + * interface to the hardware latency detector. + */ +static int debug_width_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_width_fread - Read function for "width" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "width" debugfs + * interface to the hardware latency detector. 
It can be used to determine + * for how many us of the total window us we will actively sample for any + * hardware-induced latency periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. + */ +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width); +} + +/** + * debug_width_fwrite - Write function for "width" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "width" debugfs + * interface to the hardware latency detector. It can be used to configure + * for how many us of the total window us we will actively sample for any + * hardware-induced latency periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. It + * is enforced that width is less than the total window size.
+ */ +static ssize_t debug_width_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = strict_strtoull(buf, 10, &val); + if (0 != err) + return -EINVAL; + + mutex_lock(&data.lock); + if (val < data.sample_window) + data.sample_width = val; + else { + mutex_unlock(&data.lock); + return -EINVAL; + } + mutex_unlock(&data.lock); + + if (enabled) + wake_up_process(kthread); + + return csize; +} + +/** + * debug_window_fopen - Open function for "window" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. + */ +static int debug_window_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_window_fread - Read function for "window" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to read the total window size. 
+ */ +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window); +} + +/** + * debug_window_fwrite - Write function for "window" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to write a new total window size. It + * is enforced that any value written must be greater than the sample width + * size, or an error results. + */ +static ssize_t debug_window_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = strict_strtoull(buf, 10, &val); + if (0 != err) + return -EINVAL; + + mutex_lock(&data.lock); + if (data.sample_width < val) + data.sample_window = val; + else { + mutex_unlock(&data.lock); + return -EINVAL; + } + mutex_unlock(&data.lock); + + return csize; +} + +/* + * Function pointers for the "count" debugfs file operations + */ +static const struct file_operations count_fops = { + .open = debug_count_fopen, + .read = debug_count_fread, + .write = debug_count_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "enable" debugfs file operations + */ +static const struct file_operations enable_fops = { + .open =
debug_enable_fopen, + .read = debug_enable_fread, + .write = debug_enable_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "max" debugfs file operations + */ +static const struct file_operations max_fops = { + .open = debug_max_fopen, + .read = debug_max_fread, + .write = debug_max_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "sample" debugfs file operations + */ +static const struct file_operations sample_fops = { + .open = debug_sample_fopen, + .read = debug_sample_fread, + .release = debug_sample_release, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "threshold" debugfs file operations + */ +static const struct file_operations threshold_fops = { + .open = debug_threshold_fopen, + .read = debug_threshold_fread, + .write = debug_threshold_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "width" debugfs file operations + */ +static const struct file_operations width_fops = { + .open = debug_width_fopen, + .read = debug_width_fread, + .write = debug_width_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "window" debugfs file operations + */ +static const struct file_operations window_fops = { + .open = debug_window_fopen, + .read = debug_window_fread, + .write = debug_window_fwrite, + .owner = THIS_MODULE, +}; + +/** + * init_debugfs - A function to initialize the debugfs interface files + * + * This function creates entries in debugfs for "hwlat_detector", including + * files to read values from the detector, current samples, and the + * maximum sample that has been captured since the hardware latency + * dectector was started. 
+ */ +static int init_debugfs(void) +{ + int ret = -ENOMEM; + + debug_dir = debugfs_create_dir(DRVNAME, NULL); + if (!debug_dir) + goto err_debug_dir; + + debug_sample = debugfs_create_file("sample", 0444, + debug_dir, NULL, + &sample_fops); + if (!debug_sample) + goto err_sample; + + debug_count = debugfs_create_file("count", 0444, + debug_dir, NULL, + &count_fops); + if (!debug_count) + goto err_count; + + debug_max = debugfs_create_file("max", 0444, + debug_dir, NULL, + &max_fops); + if (!debug_max) + goto err_max; + + debug_sample_window = debugfs_create_file("window", 0644, + debug_dir, NULL, + &window_fops); + if (!debug_sample_window) + goto err_window; + + debug_sample_width = debugfs_create_file("width", 0644, + debug_dir, NULL, + &width_fops); + if (!debug_sample_width) + goto err_width; + + debug_threshold = debugfs_create_file("threshold", 0644, + debug_dir, NULL, + &threshold_fops); + if (!debug_threshold) + goto err_threshold; + + debug_enable = debugfs_create_file("enable", 0644, + debug_dir, &enabled, + &enable_fops); + if (!debug_enable) + goto err_enable; + + else { + ret = 0; + goto out; + } + +err_enable: + debugfs_remove(debug_threshold); +err_threshold: + debugfs_remove(debug_sample_width); +err_width: + debugfs_remove(debug_sample_window); +err_window: + debugfs_remove(debug_max); +err_max: + debugfs_remove(debug_count); +err_count: + debugfs_remove(debug_sample); +err_sample: + debugfs_remove(debug_dir); +err_debug_dir: +out: + return ret; +} + +/** + * free_debugfs - A function to cleanup the debugfs file interface + */ +static void free_debugfs(void) +{ + /* could also use a debugfs_remove_recursive */ + debugfs_remove(debug_enable); + debugfs_remove(debug_threshold); + debugfs_remove(debug_sample_width); + debugfs_remove(debug_sample_window); + debugfs_remove(debug_max); + debugfs_remove(debug_count); + debugfs_remove(debug_sample); + debugfs_remove(debug_dir); +} + +/** + * detector_init - Standard module initialization code + */ +static 
int detector_init(void) +{ + int ret = -ENOMEM; + + printk(KERN_INFO BANNER "version %s\n", VERSION); + + ret = init_stats(); + if (0 != ret) + goto out; + + ret = init_debugfs(); + if (0 != ret) + goto err_stats; + + if (enabled) + ret = start_kthread(); + if (0 == ret) + goto out; + /* kthread failed: undo init_debugfs(), then free the ring buffer */ + free_debugfs(); +err_stats: + ring_buffer_free(ring_buffer); +out: + return ret; +} + +/** + * detector_exit - Standard module cleanup code + */ +static void detector_exit(void) +{ + if (enabled) { + enabled = 0; + stop_kthread(); + } + + free_debugfs(); + ring_buffer_free(ring_buffer); /* free up the ring buffer */ + +} + +module_init(detector_init); +module_exit(detector_exit); diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index 49e5823..8df561b 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -194,7 +194,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock sg_init_table(mq->sg, host->max_phys_segs); } - init_MUTEX(&mq->thread_sem); + semaphore_init(&mq->thread_sem); mq->thread = kthread_run(mmc_queue_thread, mq, "mmcqd"); if (IS_ERR(mq->thread)) { diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c index aaa8a9f..1b28ef5 100644 --- a/drivers/net/3c527.c +++ b/drivers/net/3c527.c @@ -179,7 +179,7 @@ struct mc32_local u16 rx_ring_tail; /* index to rx de-queue end */ - struct semaphore cmd_mutex; /* Serialises issuing of execute commands */ + struct anon_semaphore cmd_mutex; /* Serialises issuing of execute commands */ struct completion execution_cmd; /* Card has completed an execute command */ struct completion xceiver_cmd; /* Card has completed a tx or rx command */ }; @@ -521,7 +521,7 @@ static int __init mc32_probe1(struct net_device *dev, int slot) lp->tx_len = lp->exec_box->data[9]; /* Transmit list count */ lp->rx_len = lp->exec_box->data[11]; /* Receive list count */ - init_MUTEX_LOCKED(&lp->cmd_mutex); + anon_semaphore_init_locked(&lp->cmd_mutex); init_completion(&lp->execution_cmd); init_completion(&lp->xceiver_cmd); @@ 
-580,7 +580,7 @@ static int mc32_command_nowait(struct net_device *dev, u16 cmd, void *data, int int ioaddr = dev->base_addr; int ret = -1; - if (down_trylock(&lp->cmd_mutex) == 0) + if (anon_down_trylock(&lp->cmd_mutex) == 0) { lp->cmd_nonblocking=1; lp->exec_box->mbox=0; @@ -626,7 +626,7 @@ static int mc32_command(struct net_device *dev, u16 cmd, void *data, int len) int ioaddr = dev->base_addr; int ret = 0; - down(&lp->cmd_mutex); + anon_down(&lp->cmd_mutex); /* * My Turn @@ -646,7 +646,7 @@ static int mc32_command(struct net_device *dev, u16 cmd, void *data, int len) if(lp->exec_box->mbox&(1<<13)) ret = -1; - up(&lp->cmd_mutex); + anon_up(&lp->cmd_mutex); /* * A multicast set got blocked - try it now @@ -916,7 +916,7 @@ static int mc32_open(struct net_device *dev) * Allow ourselves to issue commands */ - up(&lp->cmd_mutex); + anon_up(&lp->cmd_mutex); /* @@ -1384,7 +1384,7 @@ static irqreturn_t mc32_interrupt(int irq, void *dev_id) */ if (lp->cmd_nonblocking) { - up(&lp->cmd_mutex); + anon_up(&lp->cmd_mutex); if (lp->mc_reload_wait) mc32_reset_multicast_list(dev); } @@ -1461,7 +1461,7 @@ static int mc32_close(struct net_device *dev) /* Ensure we issue no more commands beyond this point */ - down(&lp->cmd_mutex); + anon_down(&lp->cmd_mutex); /* Ok the card is now stopping */ diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c index 4567588..38e920f 100644 --- a/drivers/net/3c59x.c +++ b/drivers/net/3c59x.c @@ -795,9 +795,9 @@ static void poll_vortex(struct net_device *dev) { struct vortex_private *vp = netdev_priv(dev); unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); (vp->full_bus_master_rx ? 
boomerang_interrupt:vortex_interrupt)(dev->irq,dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -1766,6 +1766,7 @@ vortex_timer(unsigned long data) int next_tick = 60*HZ; int ok = 0; int media_status, old_window; + unsigned long flags; if (vortex_debug > 2) { pr_debug("%s: Media selection timer tick happened, %s.\n", @@ -1773,7 +1774,7 @@ vortex_timer(unsigned long data) pr_debug("dev->watchdog_timeo=%d\n", dev->watchdog_timeo); } - disable_irq_lockdep(dev->irq); + spin_lock_irqsave(&vp->lock, flags); old_window = ioread16(ioaddr + EL3_CMD) >> 13; EL3WINDOW(4); media_status = ioread16(ioaddr + Wn4_Media); @@ -1796,10 +1797,7 @@ vortex_timer(unsigned long data) case XCVR_MII: case XCVR_NWAY: { ok = 1; - /* Interrupts are already disabled */ - spin_lock(&vp->lock); vortex_check_media(dev, 0); - spin_unlock(&vp->lock); } break; default: /* Other media types handled by Tx timeouts. */ @@ -1853,7 +1851,7 @@ leave_media_alone: dev->name, media_tbl[dev->if_port].name); EL3WINDOW(old_window); - enable_irq_lockdep(dev->irq); + spin_unlock_irqrestore(&vp->lock, flags); mod_timer(&vp->timer, RUN_AT(next_tick)); if (vp->deferred) iowrite16(FakeIntr, ioaddr + EL3_CMD); @@ -1887,12 +1885,12 @@ static void vortex_tx_timeout(struct net_device *dev) * Block interrupts because vortex_interrupt does a bare spin_lock() */ unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev); else vortex_interrupt(dev->irq, dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } } diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c index 0e2ba21..560c233 100644 --- a/drivers/net/8139too.c +++ b/drivers/net/8139too.c @@ -2195,7 +2195,11 @@ static irqreturn_t rtl8139_interrupt (int irq, void *dev_instance) */ static void rtl8139_poll_controller(struct net_device *dev) { - disable_irq(dev->irq); + /* + * use _nosync() variant - might be used by netconsole + * from atomic 
contexts: + */ + disable_irq_nosync(dev->irq); rtl8139_interrupt(dev->irq, dev); enable_irq(dev->irq); } diff --git a/drivers/net/atl1c/atl1c_main.c b/drivers/net/atl1c/atl1c_main.c index a383122..74cf8f5 100644 --- a/drivers/net/atl1c/atl1c_main.c +++ b/drivers/net/atl1c/atl1c_main.c @@ -2069,11 +2069,8 @@ static int atl1c_xmit_frame(struct sk_buff *skb, struct net_device *netdev) } tpd_req = atl1c_cal_tpd_req(skb); - if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) { - if (netif_msg_pktdata(adapter)) - dev_info(&adapter->pdev->dev, "tx locked\n"); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&adapter->tx_lock, flags); + if (skb->mark == 0x01) type = atl1c_trans_high; else diff --git a/drivers/net/atl1e/atl1e_main.c b/drivers/net/atl1e/atl1e_main.c index 9fc6d6d..4f6df51 100644 --- a/drivers/net/atl1e/atl1e_main.c +++ b/drivers/net/atl1e/atl1e_main.c @@ -1856,8 +1856,7 @@ static int atl1e_xmit_frame(struct sk_buff *skb, struct net_device *netdev) return NETDEV_TX_OK; } tpd_req = atl1e_cal_tdp_req(skb); - if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) - return NETDEV_TX_LOCKED; + spin_lock_irqsave(&adapter->tx_lock, flags); if (atl1e_tpd_avail(adapter) < tpd_req) { /* no enough descriptor, just stop queue */ diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 06b9011..ae014d1 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -2838,7 +2838,7 @@ bnx2_tx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget) if (unlikely(netif_tx_queue_stopped(txq)) && (bnx2_tx_avail(bp, txr) > bp->tx_wake_thresh)) { - __netif_tx_lock(txq, smp_processor_id()); + __netif_tx_lock(txq); if ((netif_tx_queue_stopped(txq)) && (bnx2_tx_avail(bp, txr) > bp->tx_wake_thresh)) netif_tx_wake_queue(txq); diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c index c36a5f3..2922e00 100644 --- a/drivers/net/bnx2x_main.c +++ b/drivers/net/bnx2x_main.c @@ -926,7 +926,7 @@ static void bnx2x_tx_int(struct bnx2x_fastpath *fp) /* TBD need a thresh? 
*/ if (unlikely(netif_tx_queue_stopped(txq))) { - __netif_tx_lock(txq, smp_processor_id()); + __netif_tx_lock(txq); /* Need to make the tx_bd_cons update visible to start_xmit() * before checking for netif_tx_queue_stopped(). Without the diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c index 3711d64..e6e5abd 100644 --- a/drivers/net/chelsio/sge.c +++ b/drivers/net/chelsio/sge.c @@ -1671,8 +1671,7 @@ static int t1_sge_tx(struct sk_buff *skb, struct adapter *adapter, struct cmdQ *q = &sge->cmdQ[qid]; unsigned int credits, pidx, genbit, count, use_sched_skb = 0; - if (!spin_trylock(&q->lock)) - return NETDEV_TX_LOCKED; + spin_lock(&q->lock); reclaim_completed_tx(sge, q); diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 981ab53..fcc67e0 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -120,7 +120,7 @@ struct sixpack { struct timer_list tx_t; struct timer_list resync_t; atomic_t refcnt; - struct semaphore dead_sem; + struct anon_semaphore dead_sem; spinlock_t lock; }; @@ -412,7 +412,7 @@ static struct sixpack *sp_get(struct tty_struct *tty) static void sp_put(struct sixpack *sp) { if (atomic_dec_and_test(&sp->refcnt)) - up(&sp->dead_sem); + anon_up(&sp->dead_sem); } /* @@ -606,7 +606,7 @@ static int sixpack_open(struct tty_struct *tty) spin_lock_init(&sp->lock); atomic_set(&sp->refcnt, 1); - init_MUTEX_LOCKED(&sp->dead_sem); + anon_semaphore_init_locked(&sp->dead_sem); /* !!! length of the buffers. MTU is IP MTU, not PACLEN! */ @@ -702,7 +702,7 @@ static void sixpack_close(struct tty_struct *tty) * we have to wait for all existing users to finish. 
*/ if (!atomic_dec_and_test(&sp->refcnt)) - down(&sp->dead_sem); + anon_down(&sp->dead_sem); unregister_netdev(sp->dev); diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index fda2fc8..dd897b0 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -82,7 +82,7 @@ struct mkiss { #define CRC_MODE_SMACK_TEST 4 atomic_t refcnt; - struct semaphore dead_sem; + struct anon_semaphore dead_sem; }; /*---------------------------------------------------------------------------*/ @@ -718,7 +718,7 @@ static struct mkiss *mkiss_get(struct tty_struct *tty) static void mkiss_put(struct mkiss *ax) { if (atomic_dec_and_test(&ax->refcnt)) - up(&ax->dead_sem); + anon_up(&ax->dead_sem); } static int crc_force = 0; /* Can be overridden with insmod */ @@ -745,7 +745,7 @@ static int mkiss_open(struct tty_struct *tty) spin_lock_init(&ax->buflock); atomic_set(&ax->refcnt, 1); - init_MUTEX_LOCKED(&ax->dead_sem); + anon_semaphore_init_locked(&ax->dead_sem); ax->tty = tty; tty->disc_data = ax; @@ -824,7 +824,7 @@ static void mkiss_close(struct tty_struct *tty) * we have to wait for all existing users to finish. 
*/ if (!atomic_dec_and_test(&ax->refcnt)) - down(&ax->dead_sem); + anon_down(&ax->dead_sem); unregister_netdev(ax->dev); diff --git a/drivers/net/irda/sir_dev.c b/drivers/net/irda/sir_dev.c index fd0796c..aa04bbd 100644 --- a/drivers/net/irda/sir_dev.c +++ b/drivers/net/irda/sir_dev.c @@ -908,7 +908,7 @@ struct sir_dev * sirdev_get_instance(const struct sir_driver *drv, const char *n dev->tx_skb = NULL; spin_lock_init(&dev->tx_lock); - init_MUTEX(&dev->fsm.sem); + semaphore_init(&dev->fsm.sem); dev->drv = drv; dev->netdev = ndev; diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index da472c6..15c6599 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -65,6 +65,14 @@ struct pcpu_lstats { unsigned long drops; }; +#ifdef CONFIG_PREEMPT_RT +# define xmit_get_cpu() get_cpu() +# define xmit_put_cpu() put_cpu() +#else +# define xmit_get_cpu() smp_processor_id() +# define xmit_put_cpu() do { } while (0) +#endif + /* * The higher levels take care of making this non-reentrant (it's * called with bh's disabled). 
@@ -72,22 +80,23 @@ struct pcpu_lstats { static int loopback_xmit(struct sk_buff *skb, struct net_device *dev) { struct pcpu_lstats *pcpu_lstats, *lb_stats; - int len; + int len, res; skb_orphan(skb); skb->protocol = eth_type_trans(skb, dev); + len = skb->len; + res = netif_rx_ni(skb); - /* it's OK to use per_cpu_ptr() because BHs are off */ pcpu_lstats = dev->ml_priv; - lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id()); + lb_stats = per_cpu_ptr(pcpu_lstats, xmit_get_cpu()); - len = skb->len; - if (likely(netif_rx(skb) == NET_RX_SUCCESS)) { + if (likely(res == NET_RX_SUCCESS)) { lb_stats->bytes += len; lb_stats->packets++; } else lb_stats->drops++; + xmit_put_cpu(); return 0; } diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index bc72d6e..13343e8 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -40,6 +40,7 @@ #include <linux/mutex.h> #include <linux/radix-tree.h> #include <linux/timer.h> +#include <linux/semaphore.h> #include <linux/workqueue.h> #include <linux/mlx4/device.h> diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index 0f32db3..ee2c56d 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -508,7 +508,7 @@ static void txq_maybe_wake(struct tx_queue *txq) struct netdev_queue *nq = netdev_get_tx_queue(mp->dev, txq->index); if (netif_tx_queue_stopped(nq)) { - __netif_tx_lock(nq, smp_processor_id()); + __netif_tx_lock(nq); if (txq->tx_ring_size - txq->tx_desc_count >= MAX_SKB_FRAGS + 1) netif_tx_wake_queue(nq); __netif_tx_unlock(nq); @@ -899,7 +899,7 @@ static void txq_kick(struct tx_queue *txq) u32 hw_desc_ptr; u32 expected_ptr; - __netif_tx_lock(nq, smp_processor_id()); + __netif_tx_lock(nq); if (rdlp(mp, TXQ_COMMAND) & (1 << txq->index)) goto out; @@ -923,7 +923,7 @@ static int txq_reclaim(struct tx_queue *txq, int budget, int force) struct netdev_queue *nq = netdev_get_tx_queue(mp->dev, txq->index); int reclaimed; - __netif_tx_lock(nq, smp_processor_id()); + 
__netif_tx_lock(nq); reclaimed = 0; while (reclaimed < budget && txq->tx_desc_count > 0) { diff --git a/drivers/net/netxen/netxen_nic_init.c b/drivers/net/netxen/netxen_nic_init.c index 5d3343e..3fb47ee 100644 --- a/drivers/net/netxen/netxen_nic_init.c +++ b/drivers/net/netxen/netxen_nic_init.c @@ -1401,7 +1401,7 @@ int netxen_process_cmd_ring(struct netxen_adapter *adapter) smp_mb(); if (netif_queue_stopped(netdev) && netif_carrier_ok(netdev)) { - __netif_tx_lock(tx_ring->txq, smp_processor_id()); + __netif_tx_lock(tx_ring->txq); if (netxen_tx_avail(tx_ring) > TX_STOP_THRESH) netif_wake_queue(netdev); __netif_tx_unlock(tx_ring->txq); diff --git a/drivers/net/niu.c b/drivers/net/niu.c index d2146d4..4b6d8ce 100644 --- a/drivers/net/niu.c +++ b/drivers/net/niu.c @@ -3681,7 +3681,7 @@ static void niu_tx_work(struct niu *np, struct tx_ring_info *rp) out: if (unlikely(netif_tx_queue_stopped(txq) && (niu_tx_avail(rp) > NIU_TX_WAKEUP_THRESH(rp)))) { - __netif_tx_lock(txq, smp_processor_id()); + __netif_tx_lock(txq); if (netif_tx_queue_stopped(txq) && (niu_tx_avail(rp) > NIU_TX_WAKEUP_THRESH(rp))) netif_tx_wake_queue(txq); diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c index 6de8399..1a29a1c 100644 --- a/drivers/net/ppp_async.c +++ b/drivers/net/ppp_async.c @@ -67,7 +67,7 @@ struct asyncppp { struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct anon_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ unsigned char obuf[OBUFSIZE]; }; @@ -145,7 +145,7 @@ static struct asyncppp *ap_get(struct tty_struct *tty) static void ap_put(struct asyncppp *ap) { if (atomic_dec_and_test(&ap->refcnt)) - up(&ap->dead_sem); + anon_up(&ap->dead_sem); } /* @@ -183,7 +183,7 @@ ppp_asynctty_open(struct tty_struct *tty) tasklet_init(&ap->tsk, ppp_async_process, (unsigned long) ap); atomic_set(&ap->refcnt, 1); - init_MUTEX_LOCKED(&ap->dead_sem); + anon_semaphore_init_locked(&ap->dead_sem); ap->chan.private = ap; 
ap->chan.ops = &async_ops; @@ -232,7 +232,7 @@ ppp_asynctty_close(struct tty_struct *tty) * by the time it returns. */ if (!atomic_dec_and_test(&ap->refcnt)) - down(&ap->dead_sem); + anon_down(&ap->dead_sem); tasklet_kill(&ap->tsk); ppp_unregister_channel(&ap->chan); diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index 8702e7a..f4703ac 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -180,11 +180,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev) u16 destid; unsigned long flags; - local_irq_save(flags); - if (!spin_trylock(&rnet->tx_lock)) { - local_irq_restore(flags); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&rnet->tx_lock, flags); if ((rnet->tx_cnt + 1) > RIONET_TX_RING_SIZE) { netif_stop_queue(ndev); diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c index 458daa0..60913bf 100644 --- a/drivers/net/s2io.c +++ b/drivers/net/s2io.c @@ -4161,12 +4161,7 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) [skb->priority & (MAX_TX_FIFOS - 1)]; fifo = &mac_control->fifos[queue]; - if (do_spin_lock) - spin_lock_irqsave(&fifo->tx_lock, flags); - else { - if (unlikely(!spin_trylock_irqsave(&fifo->tx_lock, flags))) - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&fifo->tx_lock, flags); if (sp->config.multiq) { if (__netif_subqueue_stopped(dev, fifo->fifo_no)) { diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c index d2dfe0a..d47ce6c 100644 --- a/drivers/net/sungem.c +++ b/drivers/net/sungem.c @@ -1032,12 +1032,8 @@ static int gem_start_xmit(struct sk_buff *skb, struct net_device *dev) (csum_stuff_off << 21)); } - local_irq_save(flags); - if (!spin_trylock(&gp->tx_lock)) { - /* Tell upper layer to requeue */ - local_irq_restore(flags); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&gp->tx_lock, flags); + /* We raced with gem_do_stop() */ if (!gp->running) { spin_unlock_irqrestore(&gp->tx_lock, flags); diff --git a/drivers/net/tehuti.c b/drivers/net/tehuti.c index 
3c2679c..ce65846 100644 --- a/drivers/net/tehuti.c +++ b/drivers/net/tehuti.c @@ -1638,13 +1638,8 @@ static int bdx_tx_transmit(struct sk_buff *skb, struct net_device *ndev) unsigned long flags; ENTER; - local_irq_save(flags); - if (!spin_trylock(&priv->tx_lock)) { - local_irq_restore(flags); - DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n", - BDX_DRV_NAME, ndev->name); - return NETDEV_TX_LOCKED; - } + + spin_lock_irqsave(&priv->tx_lock, flags); /* build tx descriptor */ BDX_ASSERT(f->m.wptr >= f->m.memsz); /* started with valid wptr */ diff --git a/drivers/net/tulip/tulip_core.c b/drivers/net/tulip/tulip_core.c index 4cf9a65..c5cf990 100644 --- a/drivers/net/tulip/tulip_core.c +++ b/drivers/net/tulip/tulip_core.c @@ -1817,6 +1817,7 @@ static void __devexit tulip_remove_one (struct pci_dev *pdev) pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index 61581ee..40d64ce 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -574,7 +574,7 @@ static int cosa_probe(int base, int irq, int dma) /* Initialize the chardev data structures */ mutex_init(&chan->rlock); - init_MUTEX(&chan->wsem); + semaphore_init(&chan->wsem); /* Register the network interface */ if (!(chan->netdev = alloc_hdlcdev(chan))) { diff --git a/drivers/of/base.c b/drivers/of/base.c index 69f85c0..fc0c206 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -25,7 +25,7 @@ struct device_node *allnodes; /* use when traversing tree through the allnext, child, sibling, * or parent members of struct device_node. 
*/ -DEFINE_RWLOCK(devtree_lock); +DEFINE_ATOMIC_SPINLOCK(devtree_lock); int of_n_addr_cells(struct device_node *np) { @@ -59,16 +59,14 @@ int of_n_size_cells(struct device_node *np) } EXPORT_SYMBOL(of_n_size_cells); -struct property *of_find_property(const struct device_node *np, - const char *name, - int *lenp) +static struct property *__of_find_property(const struct device_node *np, + const char *name, int *lenp) { struct property *pp; if (!np) return NULL; - read_lock(&devtree_lock); for (pp = np->properties; pp != 0; pp = pp->next) { if (of_prop_cmp(pp->name, name) == 0) { if (lenp != 0) @@ -76,7 +74,20 @@ struct property *of_find_property(const struct device_node *np, break; } } - read_unlock(&devtree_lock); + + return pp; +} + +struct property *of_find_property(const struct device_node *np, + const char *name, + int *lenp) +{ + struct property *pp; + unsigned long flags; + + atomic_spin_lock_irqsave(&devtree_lock, flags); + pp = __of_find_property(np, name, lenp); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); return pp; } @@ -86,8 +97,20 @@ EXPORT_SYMBOL(of_find_property); * Find a property with a given name for a given node * and return the value. */ +static const void *__of_get_property(const struct device_node *np, + const char *name, int *lenp) +{ + struct property *pp = __of_find_property(np, name, lenp); + + return pp ? pp->value : NULL; +} + +/* + * Find a property with a given name for a given node + * and return the value. 
+ */ const void *of_get_property(const struct device_node *np, const char *name, - int *lenp) + int *lenp) { struct property *pp = of_find_property(np, name, lenp); @@ -98,13 +121,13 @@ EXPORT_SYMBOL(of_get_property); /** Checks if the given "compat" string matches one of the strings in * the device's "compatible" property */ -int of_device_is_compatible(const struct device_node *device, - const char *compat) +static int __of_device_is_compatible(const struct device_node *device, + const char *compat) { const char* cp; - int cplen, l; + int uninitialized_var(cplen), l; - cp = of_get_property(device, "compatible", &cplen); + cp = __of_get_property(device, "compatible", &cplen); if (cp == NULL) return 0; while (cplen > 0) { @@ -117,6 +140,21 @@ int of_device_is_compatible(const struct device_node *device, return 0; } + +/** Checks if the given "compat" string matches one of the strings in + * the device's "compatible" property + */ +int of_device_is_compatible(const struct device_node *device, + const char *compat) +{ + unsigned long flags; + int res; + + atomic_spin_lock_irqsave(&devtree_lock, flags); + res = __of_device_is_compatible(device, compat); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); + return res; +} EXPORT_SYMBOL(of_device_is_compatible); /** @@ -155,13 +193,14 @@ EXPORT_SYMBOL(of_device_is_available); struct device_node *of_get_parent(const struct device_node *node) { struct device_node *np; + unsigned long flags; if (!node) return NULL; - read_lock(&devtree_lock); + atomic_spin_lock_irqsave(&devtree_lock, flags); np = of_node_get(node->parent); - read_unlock(&devtree_lock); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_get_parent); @@ -180,14 +219,15 @@ EXPORT_SYMBOL(of_get_parent); struct device_node *of_get_next_parent(struct device_node *node) { struct device_node *parent; + unsigned long flags; if (!node) return NULL; - read_lock(&devtree_lock); + atomic_spin_lock_irqsave(&devtree_lock, flags); 
parent = of_node_get(node->parent); of_node_put(node); - read_unlock(&devtree_lock); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); return parent; } @@ -203,14 +243,15 @@ struct device_node *of_get_next_child(const struct device_node *node, struct device_node *prev) { struct device_node *next; + unsigned long flags; - read_lock(&devtree_lock); + atomic_spin_lock_irqsave(&devtree_lock, flags); next = prev ? prev->sibling : node->child; for (; next; next = next->sibling) if (of_node_get(next)) break; of_node_put(prev); - read_unlock(&devtree_lock); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_child); @@ -225,14 +266,15 @@ EXPORT_SYMBOL(of_get_next_child); struct device_node *of_find_node_by_path(const char *path) { struct device_node *np = allnodes; + unsigned long flags; - read_lock(&devtree_lock); + atomic_spin_lock_irqsave(&devtree_lock, flags); for (; np; np = np->allnext) { if (np->full_name && (of_node_cmp(np->full_name, path) == 0) && of_node_get(np)) break; } - read_unlock(&devtree_lock); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_path); @@ -252,15 +294,16 @@ struct device_node *of_find_node_by_name(struct device_node *from, const char *name) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + atomic_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : allnodes; for (; np; np = np->allnext) if (np->name && (of_node_cmp(np->name, name) == 0) && of_node_get(np)) break; of_node_put(from); - read_unlock(&devtree_lock); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_name); @@ -281,15 +324,16 @@ struct device_node *of_find_node_by_type(struct device_node *from, const char *type) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + atomic_spin_lock_irqsave(&devtree_lock, flags); np = from ? 
from->allnext : allnodes; for (; np; np = np->allnext) if (np->type && (of_node_cmp(np->type, type) == 0) && of_node_get(np)) break; of_node_put(from); - read_unlock(&devtree_lock); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_type); @@ -312,18 +356,20 @@ struct device_node *of_find_compatible_node(struct device_node *from, const char *type, const char *compatible) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + atomic_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : allnodes; for (; np; np = np->allnext) { if (type && !(np->type && (of_node_cmp(np->type, type) == 0))) continue; - if (of_device_is_compatible(np, compatible) && of_node_get(np)) + if (__of_device_is_compatible(np, compatible) && + of_node_get(np)) break; } of_node_put(from); - read_unlock(&devtree_lock); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_compatible_node); @@ -345,8 +391,9 @@ struct device_node *of_find_node_with_property(struct device_node *from, { struct device_node *np; struct property *pp; + unsigned long flags; - read_lock(&devtree_lock); + atomic_spin_lock_irqsave(&devtree_lock, flags); np = from ? from->allnext : allnodes; for (; np; np = np->allnext) { for (pp = np->properties; pp != 0; pp = pp->next) { @@ -358,20 +405,14 @@ struct device_node *of_find_node_with_property(struct device_node *from, } out: of_node_put(from); - read_unlock(&devtree_lock); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_with_property); -/** - * of_match_node - Tell if an device_node has a matching of_match structure - * @matches: array of of device match structures to search in - * @node: the of device structure to match against - * - * Low level utility function used by device matching. 
- */ -const struct of_device_id *of_match_node(const struct of_device_id *matches, - const struct device_node *node) +static const struct of_device_id * +__of_match_node(const struct of_device_id *matches, + const struct device_node *node) { while (matches->name[0] || matches->type[0] || matches->compatible[0]) { int match = 1; @@ -382,14 +423,33 @@ const struct of_device_id *of_match_node(const struct of_device_id *matches, match &= node->type && !strcmp(matches->type, node->type); if (matches->compatible[0]) - match &= of_device_is_compatible(node, - matches->compatible); + match &= __of_device_is_compatible(node, + matches->compatible); if (match) return matches; matches++; } return NULL; } + +/** + * of_match_node - Tell if an device_node has a matching of_match structure + * @matches: array of of device match structures to search in + * @node: the of device structure to match against + * + * Low level utility function used by device matching. + */ +const struct of_device_id *of_match_node(const struct of_device_id *matches, + const struct device_node *node) +{ + const struct of_device_id *match; + unsigned long flags; + + atomic_spin_lock_irqsave(&devtree_lock, flags); + match = __of_match_node(matches, node); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); + return match; +} EXPORT_SYMBOL(of_match_node); /** @@ -408,15 +468,16 @@ struct device_node *of_find_matching_node(struct device_node *from, const struct of_device_id *matches) { struct device_node *np; + unsigned long flags; - read_lock(&devtree_lock); + atomic_spin_lock_irqsave(&devtree_lock, flags); np = from ? 
from->allnext : allnodes; for (; np; np = np->allnext) { - if (of_match_node(matches, np) && of_node_get(np)) + if (__of_match_node(matches, np) && of_node_get(np)) break; } of_node_put(from); - read_unlock(&devtree_lock); + atomic_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_matching_node); diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c index 2b7ae36..284814b 100644 --- a/drivers/oprofile/event_buffer.c +++ b/drivers/oprofile/event_buffer.c @@ -72,10 +72,10 @@ int alloc_event_buffer(void) int err = -ENOMEM; unsigned long flags; - spin_lock_irqsave(&oprofilefs_lock, flags); + atomic_spin_lock_irqsave(&oprofilefs_lock, flags); buffer_size = oprofile_buffer_size; buffer_watershed = oprofile_buffer_watershed; - spin_unlock_irqrestore(&oprofilefs_lock, flags); + atomic_spin_unlock_irqrestore(&oprofilefs_lock, flags); if (buffer_watershed >= buffer_size) return -EINVAL; diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index b7e4cee..7cbf76d 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -21,7 +21,7 @@ #define OPROFILEFS_MAGIC 0x6f70726f -DEFINE_SPINLOCK(oprofilefs_lock); +DEFINE_ATOMIC_SPINLOCK(oprofilefs_lock); static struct inode *oprofilefs_get_inode(struct super_block *sb, int mode) { @@ -75,9 +75,9 @@ int oprofilefs_ulong_from_user(unsigned long *val, char const __user *buf, size_ if (copy_from_user(tmpbuf, buf, count)) return -EFAULT; - spin_lock_irqsave(&oprofilefs_lock, flags); + atomic_spin_lock_irqsave(&oprofilefs_lock, flags); *val = simple_strtoul(tmpbuf, NULL, 0); - spin_unlock_irqrestore(&oprofilefs_lock, flags); + atomic_spin_unlock_irqrestore(&oprofilefs_lock, flags); return 0; } diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c index 8901ecf..6e2f206 100644 --- a/drivers/parport/ieee1284.c +++ b/drivers/parport/ieee1284.c @@ -41,7 +41,7 @@ * It will be useful to call this from an interrupt handler. 
*/ static void parport_ieee1284_wakeup (struct parport *port) { - up (&port->physport->ieee1284.irq); + anon_up (&port->physport->ieee1284.irq); } static struct parport *port_from_cookie[PARPORT_MAX]; @@ -83,7 +83,7 @@ int parport_wait_event (struct parport *port, signed long timeout) timer.data = port->number; add_timer (&timer); - ret = down_interruptible (&port->physport->ieee1284.irq); + ret = anon_down_interruptible (&port->physport->ieee1284.irq); if (!del_timer_sync(&timer) && !ret) /* Timed out. */ ret = 1; diff --git a/drivers/parport/share.c b/drivers/parport/share.c index dffa5d4..228942f 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -306,7 +306,7 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma, spin_lock_init(&tmp->pardevice_lock); tmp->ieee1284.mode = IEEE1284_MODE_COMPAT; tmp->ieee1284.phase = IEEE1284_PH_FWD_IDLE; - init_MUTEX_LOCKED (&tmp->ieee1284.irq); /* actually a semaphore at 0 */ + anon_semaphore_init_locked(&tmp->ieee1284.irq); tmp->spintime = parport_default_spintime; atomic_set (&tmp->ref_count, 1); INIT_LIST_HEAD(&tmp->full_list); diff --git a/drivers/pci/access.c b/drivers/pci/access.c index db23200..fddeb63 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -12,7 +12,7 @@ * configuration space. */ -static DEFINE_SPINLOCK(pci_lock); +static DEFINE_ATOMIC_SPINLOCK(pci_lock); /* * Wrappers for all PCI configuration access functions. 
They just check @@ -32,10 +32,10 @@ int pci_bus_read_config_##size \ unsigned long flags; \ u32 data = 0; \ if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \ - spin_lock_irqsave(&pci_lock, flags); \ + atomic_spin_lock_irqsave(&pci_lock, flags); \ res = bus->ops->read(bus, devfn, pos, len, &data); \ *value = (type)data; \ - spin_unlock_irqrestore(&pci_lock, flags); \ + atomic_spin_unlock_irqrestore(&pci_lock, flags); \ return res; \ } @@ -46,9 +46,9 @@ int pci_bus_write_config_##size \ int res; \ unsigned long flags; \ if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \ - spin_lock_irqsave(&pci_lock, flags); \ + atomic_spin_lock_irqsave(&pci_lock, flags); \ res = bus->ops->write(bus, devfn, pos, len, value); \ - spin_unlock_irqrestore(&pci_lock, flags); \ + atomic_spin_unlock_irqrestore(&pci_lock, flags); \ return res; \ } @@ -78,10 +78,10 @@ struct pci_ops *pci_bus_set_ops(struct pci_bus *bus, struct pci_ops *ops) struct pci_ops *old_ops; unsigned long flags; - spin_lock_irqsave(&pci_lock, flags); + atomic_spin_lock_irqsave(&pci_lock, flags); old_ops = bus->ops; bus->ops = ops; - spin_unlock_irqrestore(&pci_lock, flags); + atomic_spin_unlock_irqrestore(&pci_lock, flags); return old_ops; } EXPORT_SYMBOL(pci_bus_set_ops); @@ -135,9 +135,9 @@ static noinline void pci_wait_ucfg(struct pci_dev *dev) __add_wait_queue(&pci_ucfg_wait, &wait); do { set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&pci_lock); + atomic_spin_unlock_irq(&pci_lock); schedule(); - spin_lock_irq(&pci_lock); + atomic_spin_lock_irq(&pci_lock); } while (dev->block_ucfg_access); __remove_wait_queue(&pci_ucfg_wait, &wait); } @@ -149,11 +149,11 @@ int pci_user_read_config_##size \ int ret = 0; \ u32 data = -1; \ if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \ - spin_lock_irq(&pci_lock); \ + atomic_spin_lock_irq(&pci_lock); \ if (unlikely(dev->block_ucfg_access)) pci_wait_ucfg(dev); \ ret = dev->bus->ops->read(dev->bus, dev->devfn, \ pos, sizeof(type), &data); \ - 
spin_unlock_irq(&pci_lock); \ + atomic_spin_unlock_irq(&pci_lock); \ *val = (type)data; \ return ret; \ } @@ -164,11 +164,11 @@ int pci_user_write_config_##size \ { \ int ret = -EIO; \ if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \ - spin_lock_irq(&pci_lock); \ + atomic_spin_lock_irq(&pci_lock); \ if (unlikely(dev->block_ucfg_access)) pci_wait_ucfg(dev); \ ret = dev->bus->ops->write(dev->bus, dev->devfn, \ pos, sizeof(type), val); \ - spin_unlock_irq(&pci_lock); \ + atomic_spin_unlock_irq(&pci_lock); \ return ret; \ } @@ -395,10 +395,10 @@ void pci_block_user_cfg_access(struct pci_dev *dev) unsigned long flags; int was_blocked; - spin_lock_irqsave(&pci_lock, flags); + atomic_spin_lock_irqsave(&pci_lock, flags); was_blocked = dev->block_ucfg_access; dev->block_ucfg_access = 1; - spin_unlock_irqrestore(&pci_lock, flags); + atomic_spin_unlock_irqrestore(&pci_lock, flags); /* If we BUG() inside the pci_lock, we're guaranteed to hose * the machine */ @@ -416,7 +416,7 @@ void pci_unblock_user_cfg_access(struct pci_dev *dev) { unsigned long flags; - spin_lock_irqsave(&pci_lock, flags); + atomic_spin_lock_irqsave(&pci_lock, flags); /* This indicates a problem in the caller, but we don't need * to kill them, unlike a double-block above. 
*/ @@ -424,6 +424,6 @@ void pci_unblock_user_cfg_access(struct pci_dev *dev) dev->block_ucfg_access = 0; wake_up_all(&pci_ucfg_wait); - spin_unlock_irqrestore(&pci_lock, flags); + atomic_spin_unlock_irqrestore(&pci_lock, flags); } EXPORT_SYMBOL_GPL(pci_unblock_user_cfg_access); diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index cef28a7..caa6295 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -240,9 +240,9 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), next = dev->bus_list.next; /* Run device routines with the device locked */ - down(&dev->dev.sem); + mutex_lock(&dev->dev.mutex); retval = cb(dev, userdata); - up(&dev->dev.sem); + mutex_unlock(&dev->dev.mutex); if (retval) break; } diff --git a/drivers/pci/hotplug/ibmphp_hpc.c b/drivers/pci/hotplug/ibmphp_hpc.c index 83f337c..d120da6 100644 --- a/drivers/pci/hotplug/ibmphp_hpc.c +++ b/drivers/pci/hotplug/ibmphp_hpc.c @@ -104,7 +104,7 @@ static int to_debug = 0; static struct mutex sem_hpcaccess; // lock access to HPC static struct semaphore semOperations; // lock all operations and // access to data structures -static struct semaphore sem_exit; // make sure polling thread goes away +static struct anon_semaphore sem_exit; // make sure polling thread goes away static struct task_struct *ibmphp_poll_thread; //---------------------------------------------------------------------------- // local function prototypes @@ -132,8 +132,8 @@ void __init ibmphp_hpc_initvars (void) debug ("%s - Entry\n", __func__); mutex_init(&sem_hpcaccess); - init_MUTEX (&semOperations); - init_MUTEX_LOCKED (&sem_exit); + semaphore_init(&semOperations); + anon_semaphore_init_locked(&sem_exit); to_debug = 0; debug ("%s - Exit\n", __func__); @@ -906,7 +906,7 @@ static int poll_hpc(void *data) /* sleep for a short time just for good measure */ msleep(100); } - up (&sem_exit); + anon_up (&sem_exit); debug ("%s - Exit\n", __func__); return 0; } @@ -1076,7 +1076,7 @@ void __exit 
ibmphp_hpc_stop_poll_thread (void) // wait for poll thread to exit debug ("before sem_exit down \n"); - down (&sem_exit); + anon_down (&sem_exit); debug ("after sem_exit down \n"); // cleanup @@ -1085,7 +1085,7 @@ void __exit ibmphp_hpc_stop_poll_thread (void) debug ("after free_hpc_access \n"); ibmphp_unlock_operations (); debug ("after unlock operations \n"); - up (&sem_exit); + anon_up (&sem_exit); debug ("after sem exit up\n"); debug ("%s - Exit\n", __func__); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 7b70312..662b5ce 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2213,7 +2213,7 @@ static int pci_dev_reset(struct pci_dev *dev, int probe) if (!probe) { pci_block_user_cfg_access(dev); /* block PM suspend, driver probe, etc. */ - down(&dev->dev.sem); + mutex_lock(&dev->dev.mutex); } rc = pcie_flr(dev, probe); @@ -2231,7 +2231,7 @@ static int pci_dev_reset(struct pci_dev *dev, int probe) rc = pci_parent_bus_reset(dev, probe); done: if (!probe) { - up(&dev->dev.sem); + mutex_unlock(&dev->dev.mutex); pci_unblock_user_cfg_access(dev); } diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c index 304ff6d..a5bcb5c 100644 --- a/drivers/pcmcia/ds.c +++ b/drivers/pcmcia/ds.c @@ -1082,9 +1082,9 @@ static int runtime_suspend(struct device *dev) { int rc; - down(&dev->sem); + mutex_lock(&dev->mutex); rc = pcmcia_dev_suspend(dev, PMSG_SUSPEND); - up(&dev->sem); + mutex_unlock(&dev->mutex); return rc; } @@ -1092,9 +1092,9 @@ static void runtime_resume(struct device *dev) { int rc; - down(&dev->sem); + mutex_lock(&dev->mutex); rc = pcmcia_dev_resume(dev); - up(&dev->sem); + mutex_unlock(&dev->mutex); } /************************ per-device sysfs output ***************************/ diff --git a/drivers/s390/cio/crw.c b/drivers/s390/cio/crw.c index d157665..dde3d10 100644 --- a/drivers/s390/cio/crw.c +++ b/drivers/s390/cio/crw.c @@ -137,7 +137,7 @@ void crw_handle_channel_report(void) */ static int __init crw_init_semaphore(void) { - 
init_MUTEX_LOCKED(&crw_semaphore); + semaphore_init_locked(&crw_semaphore); return 0; } pure_initcall(crw_init_semaphore); diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h index cdbdec9..5ccaa8d 100644 --- a/drivers/scsi/aacraid/aacraid.h +++ b/drivers/scsi/aacraid/aacraid.h @@ -719,7 +719,7 @@ struct aac_fib_context { u32 unique; // unique value representing this context ulong jiffies; // used for cleanup - dmb changed to ulong struct list_head next; // used to link context's into a linked list - struct semaphore wait_sem; // this is used to wait for the next fib to arrive. + struct anon_semaphore wait_sem; // this is used to wait for the next fib to arrive. int wait; // Set to true when thread is in WaitForSingleObject unsigned long count; // total number of FIBs on FibList struct list_head fib_list; // this holds fibs and their attachd hw_fibs @@ -789,7 +789,7 @@ struct fib { * This is the event the sendfib routine will wait on if the * caller did not pass one and this is synch io. */ - struct semaphore event_wait; + struct anon_semaphore event_wait; spinlock_t event_lock; u32 done; /* gets set to 1 when fib is complete */ diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c index 0391d75..ab39bfc 100644 --- a/drivers/scsi/aacraid/commctrl.c +++ b/drivers/scsi/aacraid/commctrl.c @@ -190,7 +190,7 @@ static int open_getadapter_fib(struct aac_dev * dev, void __user *arg) /* * Initialize the mutex used to wait for the next AIF. 
*/ - init_MUTEX_LOCKED(&fibctx->wait_sem); + anon_semaphore_init_locked(&fibctx->wait_sem); fibctx->wait = 0; /* * Initialize the fibs and set the count of fibs on @@ -321,7 +321,7 @@ return_fib: ssleep(1); } if (f.wait) { - if(down_interruptible(&fibctx->wait_sem) < 0) { + if(anon_down_interruptible(&fibctx->wait_sem) < 0) { status = -EINTR; } else { /* Lock again and retry */ diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c index 956261f..b5bda58 100644 --- a/drivers/scsi/aacraid/commsup.c +++ b/drivers/scsi/aacraid/commsup.c @@ -124,7 +124,7 @@ int aac_fib_setup(struct aac_dev * dev) fibptr->hw_fib_va = hw_fib; fibptr->data = (void *) fibptr->hw_fib_va->data; fibptr->next = fibptr+1; /* Forward chain the fibs */ - init_MUTEX_LOCKED(&fibptr->event_wait); + anon_semaphore_init_locked(&fibptr->event_wait); spin_lock_init(&fibptr->event_lock); hw_fib->header.XferState = cpu_to_le32(0xffffffff); hw_fib->header.SenderSize = cpu_to_le16(dev->max_fib_size); @@ -490,7 +490,7 @@ int aac_fib_send(u16 command, struct fib *fibptr, unsigned long size, * hardware failure has occurred. 
*/ unsigned long count = 36000000L; /* 3 minutes */ - while (down_trylock(&fibptr->event_wait)) { + while (anon_down_trylock(&fibptr->event_wait)) { int blink; if (--count == 0) { struct aac_queue * q = &dev->queues->queue[AdapNormCmdQueue]; @@ -515,9 +515,9 @@ int aac_fib_send(u16 command, struct fib *fibptr, unsigned long size, } udelay(5); } - } else if (down_interruptible(&fibptr->event_wait)) { + } else if (anon_down_interruptible(&fibptr->event_wait)) { fibptr->done = 2; - up(&fibptr->event_wait); + anon_up(&fibptr->event_wait); } spin_lock_irqsave(&fibptr->event_lock, flags); if ((fibptr->done == 0) || (fibptr->done == 2)) { @@ -1177,7 +1177,7 @@ static int _aac_reset_adapter(struct aac_dev *aac, int forced) (fib->hw_fib_va->header.XferState & cpu_to_le32(ResponseExpected))) { unsigned long flagv; spin_lock_irqsave(&fib->event_lock, flagv); - up(&fib->event_wait); + anon_up(&fib->event_wait); spin_unlock_irqrestore(&fib->event_lock, flagv); schedule(); retval = 0; @@ -1460,7 +1460,7 @@ int aac_check_health(struct aac_dev * aac) * Set the event to wake up the * thread that will waiting. */ - up(&fibctx->wait_sem); + anon_up(&fibctx->wait_sem); } else { printk(KERN_WARNING "aifd: didn't allocate NewFib.\n"); kfree(fib); @@ -1691,7 +1691,7 @@ int aac_command_thread(void *data) * Set the event to wake up the * thread that is waiting. 
*/ - up(&fibctx->wait_sem); + anon_up(&fibctx->wait_sem); } else { printk(KERN_WARNING "aifd: didn't allocate NewFib.\n"); } diff --git a/drivers/scsi/aacraid/dpcsup.c b/drivers/scsi/aacraid/dpcsup.c index abc9ef5..0e29b5f 100644 --- a/drivers/scsi/aacraid/dpcsup.c +++ b/drivers/scsi/aacraid/dpcsup.c @@ -127,7 +127,7 @@ unsigned int aac_response_normal(struct aac_queue * q) spin_lock_irqsave(&fib->event_lock, flagv); if (!fib->done) fib->done = 1; - up(&fib->event_wait); + anon_up(&fib->event_wait); spin_unlock_irqrestore(&fib->event_lock, flagv); FIB_COUNTER_INCREMENT(aac_config.NormalRecved); if (fib->done == 2) { @@ -322,7 +322,7 @@ unsigned int aac_intr_normal(struct aac_dev * dev, u32 index) spin_lock_irqsave(&fib->event_lock, flagv); if (!fib->done) fib->done = 1; - up(&fib->event_wait); + anon_up(&fib->event_wait); spin_unlock_irqrestore(&fib->event_lock, flagv); FIB_COUNTER_INCREMENT(aac_config.NormalRecved); } diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index fb867a9..d406333 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -1595,7 +1595,12 @@ static irqreturn_t serial8250_interrupt(int irq, void *dev_id) l = l->next; - if (l == i->head && pass_counter++ > PASS_LIMIT) { + /* + * On preempt-rt we can be preempted and run in our + * own thread. + */ + if (!preempt_rt() && l == i->head && + pass_counter++ > PASS_LIMIT) { /* If we hit this, we're dead. 
*/ printk(KERN_ERR "serial8250: too much work for " "irq%d\n", irq); @@ -2729,14 +2734,10 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count) touch_nmi_watchdog(); - local_irq_save(flags); - if (up->port.sysrq) { - /* serial8250_handle_port() already took the lock */ - locked = 0; - } else if (oops_in_progress) { - locked = spin_trylock(&up->port.lock); - } else - spin_lock(&up->port.lock); + if (up->port.sysrq || oops_in_progress || preempt_rt()) + locked = spin_trylock_irqsave(&up->port.lock, flags); + else + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @@ -2768,8 +2769,7 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count) check_modem_status(up); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int __init serial8250_console_setup(struct console *co, char *options) diff --git a/drivers/staging/comedi/drivers/dt9812.c b/drivers/staging/comedi/drivers/dt9812.c index cc4c046..aac9a65 100644 --- a/drivers/staging/comedi/drivers/dt9812.c +++ b/drivers/staging/comedi/drivers/dt9812.c @@ -262,7 +262,7 @@ struct dt9812_usb_cmd { #define DT9812_NUM_SLOTS 16 -static DECLARE_MUTEX(dt9812_mutex); +static DEFINE_SEMAPHORE(dt9812_mutex); static struct usb_device_id dt9812_table[] = { {USB_DEVICE(0x0867, 0x9812)}, @@ -1121,7 +1121,7 @@ static int __init usb_dt9812_init(void) /* Initialize all driver slots */ for (i = 0; i < DT9812_NUM_SLOTS; i++) { - init_MUTEX(&dt9812[i].mutex); + semaphore_init(&dt9812[i].mutex); dt9812[i].serial = 0; dt9812[i].usb = NULL; dt9812[i].comedi = NULL; diff --git a/drivers/staging/comedi/drivers/usbdux.c b/drivers/staging/comedi/drivers/usbdux.c index 171a6f2..7c84121 100644 --- a/drivers/staging/comedi/drivers/usbdux.c +++ b/drivers/staging/comedi/drivers/usbdux.c @@ -307,7 +307,7 @@ struct usbduxsub { */ static struct usbduxsub usbduxsub[NUMUSBDUX]; -static 
DECLARE_MUTEX(start_stop_sem); +static DEFINE_SEMAPHORE(start_stop_sem); /* * Stops the data acquision @@ -2349,7 +2349,7 @@ static int usbduxsub_probe(struct usb_interface *uinterf, dev_dbg(dev, "comedi_: usbdux: " "usbduxsub[%d] is ready to connect to comedi.\n", index); - init_MUTEX(&(usbduxsub[index].sem)); + semaphore_init(&(usbduxsub[index].sem)); /* save a pointer to the usb device */ usbduxsub[index].usbdev = udev; diff --git a/drivers/staging/comedi/drivers/usbduxfast.c b/drivers/staging/comedi/drivers/usbduxfast.c index 939b53f..3a465ba 100644 --- a/drivers/staging/comedi/drivers/usbduxfast.c +++ b/drivers/staging/comedi/drivers/usbduxfast.c @@ -201,7 +201,7 @@ struct usbduxfastsub_s { */ static struct usbduxfastsub_s usbduxfastsub[NUMUSBDUXFAST]; -static DECLARE_MUTEX(start_stop_sem); +static DEFINE_SEMAPHORE(start_stop_sem); /* * bulk transfers to usbduxfast @@ -1500,7 +1500,7 @@ static int usbduxfastsub_probe(struct usb_interface *uinterf, "connect to comedi.\n", index); #endif - init_MUTEX(&(usbduxfastsub[index].sem)); + semaphore_init(&(usbduxfastsub[index].sem)); /* save a pointer to the usb device */ usbduxfastsub[index].usbdev = udev; diff --git a/drivers/staging/cpc-usb/cpc-usb_drv.c b/drivers/staging/cpc-usb/cpc-usb_drv.c index 9bf3f98..9a51965 100644 --- a/drivers/staging/cpc-usb/cpc-usb_drv.c +++ b/drivers/staging/cpc-usb/cpc-usb_drv.c @@ -83,7 +83,7 @@ static CPC_USB_T *CPCUSB_Table[CPC_USB_CARD_CNT] = { 0 }; static unsigned int CPCUsbCnt; /* prevent races between open() and disconnect() */ -static DECLARE_MUTEX(disconnect_sem); +static DEFINE_SEMAPHORE(disconnect_sem); /* local function prototypes */ static ssize_t cpcusb_read(struct file *file, char *buffer, size_t count, @@ -903,7 +903,7 @@ static int cpcusb_probe(struct usb_interface *interface, memset(chan, 0, sizeof(CPC_CHAN_T)); ResetBuffer(chan); - init_MUTEX(&card->sem); + semaphore_init(&card->sem); spin_lock_init(&card->slock); card->udev = udev; diff --git 
a/drivers/staging/frontier/alphatrack.c b/drivers/staging/frontier/alphatrack.c index 15aed87..d4d801e 100644 --- a/drivers/staging/frontier/alphatrack.c +++ b/drivers/staging/frontier/alphatrack.c @@ -678,7 +678,7 @@ static int usb_alphatrack_probe(struct usb_interface *intf, dev_err(&intf->dev, "Out of memory\n"); goto exit; } - init_MUTEX(&dev->sem); + semaphore_init(&dev->sem); dev->intf = intf; init_waitqueue_head(&dev->read_wait); init_waitqueue_head(&dev->write_wait); diff --git a/drivers/staging/frontier/tranzport.c b/drivers/staging/frontier/tranzport.c index ef8fcc8..81db34b 100644 --- a/drivers/staging/frontier/tranzport.c +++ b/drivers/staging/frontier/tranzport.c @@ -800,7 +800,7 @@ static int usb_tranzport_probe(struct usb_interface *intf, dev_err(&intf->dev, "Out of memory\n"); goto exit; } - init_MUTEX(&dev->sem); + semaphore_init(&dev->sem); dev->intf = intf; init_waitqueue_head(&dev->read_wait); init_waitqueue_head(&dev->write_wait); diff --git a/drivers/staging/go7007/go7007-driver.c b/drivers/staging/go7007/go7007-driver.c index 77b1e76..6efcd79 100644 --- a/drivers/staging/go7007/go7007-driver.c +++ b/drivers/staging/go7007/go7007-driver.c @@ -604,7 +604,7 @@ struct go7007 *go7007_alloc(struct go7007_board_info *board, struct device *dev) go->tuner_type = -1; go->channel_number = 0; go->name[0] = 0; - init_MUTEX(&go->hw_lock); + semaphore_init(&go->hw_lock); init_waitqueue_head(&go->frame_waitq); spin_lock_init(&go->spinlock); go->video_dev = NULL; diff --git a/drivers/staging/go7007/go7007-i2c.c b/drivers/staging/go7007/go7007-i2c.c index c82867f..f9d9d71 100644 --- a/drivers/staging/go7007/go7007-i2c.c +++ b/drivers/staging/go7007/go7007-i2c.c @@ -48,7 +48,7 @@ /* There is only one I2C port on the TW2804 that feeds all four GO7007 VIPs * on the Adlink PCI-MPG24, so access is shared between all of them. 
*/ -static DECLARE_MUTEX(adlink_mpg24_i2c_lock); +static DEFINE_SEMAPHORE(adlink_mpg24_i2c_lock); static int go7007_i2c_xfer(struct go7007 *go, u16 addr, int read, u16 command, int flags, u8 *data) diff --git a/drivers/staging/go7007/go7007-usb.c b/drivers/staging/go7007/go7007-usb.c index aa4a9e0..d988d05 100644 --- a/drivers/staging/go7007/go7007-usb.c +++ b/drivers/staging/go7007/go7007-usb.c @@ -1065,7 +1065,7 @@ static int go7007_usb_probe(struct usb_interface *intf, if (board->flags & GO7007_USB_EZUSB_I2C) { memcpy(&go->i2c_adapter, &go7007_usb_adap_templ, sizeof(go7007_usb_adap_templ)); - init_MUTEX(&usb->i2c_lock); + semaphore_init(&usb->i2c_lock); go->i2c_adapter.dev.parent = go->dev; i2c_set_adapdata(&go->i2c_adapter, go); if (i2c_add_adapter(&go->i2c_adapter) < 0) { diff --git a/drivers/staging/go7007/go7007-v4l2.c b/drivers/staging/go7007/go7007-v4l2.c index 06cacd3..daf6b73 100644 --- a/drivers/staging/go7007/go7007-v4l2.c +++ b/drivers/staging/go7007/go7007-v4l2.c @@ -101,7 +101,7 @@ static int go7007_open(struct file *file) return -ENOMEM; ++go->ref_count; gofh->go = go; - init_MUTEX(&gofh->lock); + semaphore_init(&gofh->lock); gofh->buf_count = 0; file->private_data = gofh; return 0; diff --git a/drivers/staging/go7007/s2250-loader.c b/drivers/staging/go7007/s2250-loader.c index bb22347..5031bbc 100644 --- a/drivers/staging/go7007/s2250-loader.c +++ b/drivers/staging/go7007/s2250-loader.c @@ -35,7 +35,7 @@ typedef struct device_extension_s { #define MAX_DEVICES 256 static pdevice_extension_t s2250_dev_table[MAX_DEVICES]; -static DECLARE_MUTEX(s2250_dev_table_mutex); +static DEFINE_SEMAPHORE(s2250_dev_table_mutex); #define to_s2250loader_dev_common(d) container_of(d, device_extension_t, kref) static void s2250loader_delete(struct kref *kref) diff --git a/drivers/staging/mimio/mimio.c b/drivers/staging/mimio/mimio.c index 1ba8103..63bf2db 100644 --- a/drivers/staging/mimio/mimio.c +++ b/drivers/staging/mimio/mimio.c @@ -160,7 +160,7 @@ static struct 
usb_driver mimio_driver = { .id_table = mimio_table, }; -static DECLARE_MUTEX(disconnect_sem); +static DEFINE_SEMAPHORE(disconnect_sem); static void mimio_close(struct input_dev *idev) { diff --git a/drivers/staging/octeon/ethernet-mdio.c b/drivers/staging/octeon/ethernet-mdio.c index 93cab0a..6c7dd49 100644 --- a/drivers/staging/octeon/ethernet-mdio.c +++ b/drivers/staging/octeon/ethernet-mdio.c @@ -39,7 +39,7 @@ #include "cvmx-smix-defs.h" -DECLARE_MUTEX(mdio_sem); +DEFINE_SEMAPHORE(mdio_sem); /** * Perform an MII read. Called by the generic MII routines diff --git a/drivers/staging/otus/wwrap.c b/drivers/staging/otus/wwrap.c index 4db8f6e..1de941e 100644 --- a/drivers/staging/otus/wwrap.c +++ b/drivers/staging/otus/wwrap.c @@ -1066,7 +1066,7 @@ u8_t zfLnxCreateThread(zdev_t *dev) /* Create Mutex and keventd */ INIT_WORK(&macp->kevent, kevent); - init_MUTEX(&macp->ioctl_sem); + semaphore_init(&macp->ioctl_sem); return 0; } diff --git a/drivers/staging/p9auth/p9auth.c b/drivers/staging/p9auth/p9auth.c index 9111dcb..b23e201 100644 --- a/drivers/staging/p9auth/p9auth.c +++ b/drivers/staging/p9auth/p9auth.c @@ -388,7 +388,7 @@ static int cap_init_module(void) /* Initialize each device. 
*/ for (i = 0; i < cap_nr_devs; i++) { cap_devices[i].node_size = cap_node_size; - init_MUTEX(&cap_devices[i].sem); + semaphore_init(&cap_devices[i].sem); cap_setup_cdev(&cap_devices[i], i); } diff --git a/drivers/staging/rspiusb/rspiusb.c b/drivers/staging/rspiusb/rspiusb.c index 04e2f92..ca9688f 100644 --- a/drivers/staging/rspiusb/rspiusb.c +++ b/drivers/staging/rspiusb/rspiusb.c @@ -63,7 +63,7 @@ static int debug; #endif /* prevent races between open() and disconnect() */ -static DECLARE_MUTEX(disconnect_sem); +static DEFINE_SEMAPHORE(disconnect_sem); /* Structure to hold all of our device specific stuff */ struct device_extension { diff --git a/drivers/staging/rt2870/common/2870_rtmp_init.c b/drivers/staging/rt2870/common/2870_rtmp_init.c index 80909e9..114bdc7 100644 --- a/drivers/staging/rt2870/common/2870_rtmp_init.c +++ b/drivers/staging/rt2870/common/2870_rtmp_init.c @@ -751,13 +751,13 @@ NDIS_STATUS CreateThreads( //init_MUTEX(&(pAd->usbdev_semaphore)); - init_MUTEX_LOCKED(&(pAd->mlme_semaphore)); + semaphore_init_locked(&(pAd->mlme_semaphore)); init_completion (&pAd->mlmeComplete); - init_MUTEX_LOCKED(&(pAd->RTUSBCmd_semaphore)); + semaphore_init_locked(&(pAd->RTUSBCmd_semaphore)); init_completion (&pAd->CmdQComplete); - init_MUTEX_LOCKED(&(pAd->RTUSBTimer_semaphore)); + semaphore_init_locked(&(pAd->RTUSBTimer_semaphore)); init_completion (&pAd->TimerQComplete); // Creat MLME Thread diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 4247ecc..57ba3b1 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -330,8 +330,9 @@ static void async_completed(struct urb *urb) uid_t euid = 0; u32 secid = 0; int signr; + unsigned long flags; - spin_lock(&ps->lock); + spin_lock_irqsave(&ps->lock, flags); list_move_tail(&as->asynclist, &ps->async_completed); as->status = urb->status; signr = as->signr; @@ -347,7 +348,7 @@ static void async_completed(struct urb *urb) } snoop(&urb->dev->dev, "urb complete\n"); snoop_urb(urb, 
as->userurb); - spin_unlock(&ps->lock); + spin_unlock_irqrestore(&ps->lock, flags); if (signr) kill_pid_info_as_uid(sinfo.si_signo, &sinfo, pid, uid, diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index 69e5773..e0dacaf 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -391,10 +391,10 @@ void usb_driver_release_interface(struct usb_driver *driver, if (device_is_registered(dev)) { device_release_driver(dev); } else { - down(&dev->sem); + mutex_lock(&dev->mutex); usb_unbind_interface(dev); dev->driver = NULL; - up(&dev->sem); + mutex_unlock(&dev->mutex); } } EXPORT_SYMBOL_GPL(usb_driver_release_interface); diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 95ccfa0..167548a 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1880,7 +1880,7 @@ irqreturn_t usb_hcd_irq (int irq, void *__hcd) * when the first handler doesn't use it. So let's just * assume it's never used. */ - local_irq_save(flags); + local_irq_save_nort(flags); if (unlikely(hcd->state == HC_STATE_HALT || !test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))) { @@ -1895,7 +1895,7 @@ irqreturn_t usb_hcd_irq (int irq, void *__hcd) rc = IRQ_HANDLED; } - local_irq_restore(flags); + local_irq_restore_nort(flags); return rc; } diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 9720e69..b529a76 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -269,8 +269,9 @@ static void sg_complete(struct urb *urb) { struct usb_sg_request *io = urb->context; int status = urb->status; + unsigned long flags; - spin_lock(&io->lock); + spin_lock_irqsave (&io->lock, flags); /* In 2.5 we require hcds' endpoint queues not to progress after fault * reports, until the completion callback (this!) returns. That lets @@ -304,7 +305,7 @@ static void sg_complete(struct urb *urb) * unlink pending urbs so they won't rx/tx bad data. * careful: unlink can sometimes be synchronous... 
*/ - spin_unlock(&io->lock); + spin_unlock_irqrestore (&io->lock, flags); for (i = 0, found = 0; i < io->entries; i++) { if (!io->urbs [i] || !io->urbs [i]->dev) continue; @@ -319,7 +320,7 @@ static void sg_complete(struct urb *urb) } else if (urb == io->urbs [i]) found = 1; } - spin_lock(&io->lock); + spin_lock_irqsave (&io->lock, flags); } urb->dev = NULL; @@ -329,7 +330,7 @@ static void sg_complete(struct urb *urb) if (!io->count) complete(&io->complete); - spin_unlock(&io->lock); + spin_unlock_irqrestore (&io->lock, flags); } @@ -643,7 +644,7 @@ void usb_sg_cancel(struct usb_sg_request *io) int i; io->status = -ECONNRESET; - spin_unlock(&io->lock); + spin_unlock_irqrestore(&io->lock, flags); for (i = 0; i < io->entries; i++) { int retval; @@ -654,7 +655,7 @@ void usb_sg_cancel(struct usb_sg_request *io) dev_warn(&io->dev->dev, "%s, unlink --> %d\n", __func__, retval); } - spin_lock(&io->lock); + spin_lock_irqsave(&io->lock, flags); } spin_unlock_irqrestore(&io->lock, flags); } diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index 7d33f50..2e2ae7f 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -193,7 +193,7 @@ enum ep_state { }; struct ep_data { - struct semaphore lock; + struct mutex lock; enum ep_state state; atomic_t count; struct dev_data *dev; @@ -297,10 +297,10 @@ get_ready_ep (unsigned f_flags, struct ep_data *epdata) int val; if (f_flags & O_NONBLOCK) { - if (down_trylock (&epdata->lock) != 0) + if (!mutex_trylock(&epdata->lock)) goto nonblock; if (epdata->state != STATE_EP_ENABLED) { - up (&epdata->lock); + mutex_unlock(&epdata->lock); nonblock: val = -EAGAIN; } else @@ -308,7 +308,8 @@ nonblock: return val; } - if ((val = down_interruptible (&epdata->lock)) < 0) + val = mutex_lock_interruptible(&epdata->lock); + if (val < 0) return val; switch (epdata->state) { @@ -322,7 +323,7 @@ nonblock: // FALLTHROUGH case STATE_EP_UNBOUND: /* clean disconnect */ val = -ENODEV; - up (&epdata->lock); + 
mutex_unlock(&epdata->lock); } return val; } @@ -392,7 +393,7 @@ ep_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) if (likely (data->ep != NULL)) usb_ep_set_halt (data->ep); spin_unlock_irq (&data->dev->lock); - up (&data->lock); + mutex_unlock(&data->lock); return -EBADMSG; } @@ -410,7 +411,7 @@ ep_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) value = -EFAULT; free1: - up (&data->lock); + mutex_unlock(&data->lock); kfree (kbuf); return value; } @@ -435,7 +436,7 @@ ep_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) if (likely (data->ep != NULL)) usb_ep_set_halt (data->ep); spin_unlock_irq (&data->dev->lock); - up (&data->lock); + mutex_unlock(&data->lock); return -EBADMSG; } @@ -454,7 +455,7 @@ ep_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) VDEBUG (data->dev, "%s write %zu IN, status %d\n", data->name, len, (int) value); free1: - up (&data->lock); + mutex_unlock(&data->lock); kfree (kbuf); return value; } @@ -465,7 +466,8 @@ ep_release (struct inode *inode, struct file *fd) struct ep_data *data = fd->private_data; int value; - if ((value = down_interruptible(&data->lock)) < 0) + value = mutex_lock_interruptible(&data->lock); + if (value < 0) return value; /* clean up if this can be reopened */ @@ -475,7 +477,7 @@ ep_release (struct inode *inode, struct file *fd) data->hs_desc.bDescriptorType = 0; usb_ep_disable(data->ep); } - up (&data->lock); + mutex_unlock(&data->lock); put_ep (data); return 0; } @@ -506,7 +508,7 @@ static long ep_ioctl(struct file *fd, unsigned code, unsigned long value) } else status = -ENODEV; spin_unlock_irq (&data->dev->lock); - up (&data->lock); + mutex_unlock(&data->lock); return status; } @@ -672,7 +674,7 @@ fail: value = -ENODEV; spin_unlock_irq(&epdata->dev->lock); - up(&epdata->lock); + mutex_unlock(&epdata->lock); if (unlikely(value)) { kfree(priv); @@ -764,7 +766,8 @@ ep_config (struct file *fd, const char __user *buf, size_t len, loff_t 
*ptr) u32 tag; int value, length = len; - if ((value = down_interruptible (&data->lock)) < 0) + value = mutex_lock_interruptible(&data->lock); + if (value < 0) return value; if (data->state != STATE_EP_READY) { @@ -853,7 +856,7 @@ fail: data->desc.bDescriptorType = 0; data->hs_desc.bDescriptorType = 0; } - up (&data->lock); + mutex_unlock(&data->lock); return value; fail0: value = -EINVAL; @@ -869,7 +872,7 @@ ep_open (struct inode *inode, struct file *fd) struct ep_data *data = inode->i_private; int value = -EBUSY; - if (down_interruptible (&data->lock) != 0) + if (mutex_lock_interruptible(&data->lock) != 0) return -EINTR; spin_lock_irq (&data->dev->lock); if (data->dev->state == STATE_DEV_UNBOUND) @@ -884,7 +887,7 @@ ep_open (struct inode *inode, struct file *fd) DBG (data->dev, "%s state %d\n", data->name, data->state); spin_unlock_irq (&data->dev->lock); - up (&data->lock); + mutex_unlock(&data->lock); return value; } @@ -1630,7 +1633,7 @@ static int activate_ep_files (struct dev_data *dev) if (!data) goto enomem0; data->state = STATE_EP_DISABLED; - init_MUTEX (&data->lock); + mutex_init(&data->lock); init_waitqueue_head (&data->wait); strncpy (data->name, ep->name, sizeof (data->name) - 1); diff --git a/drivers/usb/misc/ftdi-elan.c b/drivers/usb/misc/ftdi-elan.c index 9d0675e..d20cb67 100644 --- a/drivers/usb/misc/ftdi-elan.c +++ b/drivers/usb/misc/ftdi-elan.c @@ -2766,7 +2766,7 @@ static int ftdi_elan_probe(struct usb_interface *interface, ftdi->sequence_num = ++ftdi_instances; mutex_unlock(&ftdi_module_lock); ftdi_elan_init_kref(ftdi); - init_MUTEX(&ftdi->sw_lock); + semaphore_init(&ftdi->sw_lock); ftdi->udev = usb_get_dev(interface_to_usbdev(interface)); ftdi->interface = interface; mutex_init(&ftdi->u132_lock); diff --git a/drivers/uwb/umc-bus.c b/drivers/uwb/umc-bus.c index 5ad3616..125d19e 100644 --- a/drivers/uwb/umc-bus.c +++ b/drivers/uwb/umc-bus.c @@ -62,12 +62,12 @@ int umc_controller_reset(struct umc_dev *umc) struct device *parent = 
umc->dev.parent; int ret = 0; - if(down_trylock(&parent->sem)) + if (!mutex_trylock(&parent->mutex)) return -EAGAIN; ret = device_for_each_child(parent, parent, umc_bus_pre_reset_helper); if (ret >= 0) device_for_each_child(parent, parent, umc_bus_post_reset_helper); - up(&parent->sem); + mutex_unlock(&parent->mutex); return ret; } diff --git a/drivers/uwb/uwb-internal.h b/drivers/uwb/uwb-internal.h index d5bcfc1..17b10b9 100644 --- a/drivers/uwb/uwb-internal.h +++ b/drivers/uwb/uwb-internal.h @@ -366,12 +366,12 @@ struct dentry *uwb_dbg_create_pal_dir(struct uwb_pal *pal); static inline void uwb_dev_lock(struct uwb_dev *uwb_dev) { - down(&uwb_dev->dev.sem); + mutex_lock(&uwb_dev->dev.mutex); } static inline void uwb_dev_unlock(struct uwb_dev *uwb_dev) { - up(&uwb_dev->dev.sem); + mutex_unlock(&uwb_dev->dev.mutex); } #endif /* #ifndef __UWB_INTERNAL_H__ */ diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index 3a44695..da8c9e3 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -1203,7 +1203,6 @@ static void fbcon_clear(struct vc_data *vc, int sy, int sx, int height, { struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; struct fbcon_ops *ops = info->fbcon_par; - struct display *p = &fb_display[vc->vc_num]; u_int y_break; @@ -1235,10 +1234,11 @@ static void fbcon_putcs(struct vc_data *vc, const unsigned short *s, struct display *p = &fb_display[vc->vc_num]; struct fbcon_ops *ops = info->fbcon_par; - if (!fbcon_is_inactive(vc, info)) + if (!fbcon_is_inactive(vc, info)) { ops->putcs(vc, info, s, count, real_y(p, ypos), xpos, get_color(vc, info, scr_readw(s), 1), get_color(vc, info, scr_readw(s), 0)); + } } static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos) @@ -3225,6 +3225,7 @@ static const struct consw fb_con = { .con_screen_pos = fbcon_screen_pos, .con_getxy = fbcon_getxy, .con_resize = fbcon_resize, + .con_preemptible = 1, }; static struct notifier_block fbcon_event_notifier = { 
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c index 59d7d5e..41cc04c 100644 --- a/drivers/video/console/vgacon.c +++ b/drivers/video/console/vgacon.c @@ -51,7 +51,7 @@ #include <video/vga.h> #include <asm/io.h> -static DEFINE_SPINLOCK(vga_lock); +static DEFINE_ATOMIC_SPINLOCK(vga_lock); static int cursor_size_lastfrom; static int cursor_size_lastto; static u32 vgacon_xres; @@ -158,7 +158,7 @@ static inline void write_vga(unsigned char reg, unsigned int val) * ddprintk might set the console position from interrupt * handlers, thus the write has to be IRQ-atomic. */ - spin_lock_irqsave(&vga_lock, flags); + atomic_spin_lock_irqsave(&vga_lock, flags); #ifndef SLOW_VGA v1 = reg + (val & 0xff00); @@ -171,7 +171,7 @@ static inline void write_vga(unsigned char reg, unsigned int val) outb_p(reg + 1, vga_video_port_reg); outb_p(val & 0xff, vga_video_port_val); #endif - spin_unlock_irqrestore(&vga_lock, flags); + atomic_spin_unlock_irqrestore(&vga_lock, flags); } static inline void vga_set_mem_top(struct vc_data *c) @@ -662,7 +662,7 @@ static void vgacon_set_cursor_size(int xpos, int from, int to) cursor_size_lastfrom = from; cursor_size_lastto = to; - spin_lock_irqsave(&vga_lock, flags); + atomic_spin_lock_irqsave(&vga_lock, flags); if (vga_video_type >= VIDEO_TYPE_VGAC) { outb_p(VGA_CRTC_CURSOR_START, vga_video_port_reg); curs = inb_p(vga_video_port_val); @@ -680,7 +680,7 @@ static void vgacon_set_cursor_size(int xpos, int from, int to) outb_p(curs, vga_video_port_val); outb_p(VGA_CRTC_CURSOR_END, vga_video_port_reg); outb_p(cure, vga_video_port_val); - spin_unlock_irqrestore(&vga_lock, flags); + atomic_spin_unlock_irqrestore(&vga_lock, flags); } static void vgacon_cursor(struct vc_data *c, int mode) @@ -755,7 +755,7 @@ static int vgacon_doresize(struct vc_data *c, unsigned int scanlines = height * c->vc_font.height; u8 scanlines_lo = 0, r7 = 0, vsync_end = 0, mode, max_scan; - spin_lock_irqsave(&vga_lock, flags); + 
atomic_spin_lock_irqsave(&vga_lock, flags); vgacon_xres = width * VGA_FONTWIDTH; vgacon_yres = height * c->vc_font.height; @@ -806,7 +806,7 @@ static int vgacon_doresize(struct vc_data *c, outb_p(vsync_end, vga_video_port_val); } - spin_unlock_irqrestore(&vga_lock, flags); + atomic_spin_unlock_irqrestore(&vga_lock, flags); return 0; } @@ -889,11 +889,11 @@ static void vga_vesa_blank(struct vgastate *state, int mode) { /* save original values of VGA controller registers */ if (!vga_vesa_blanked) { - spin_lock_irq(&vga_lock); + atomic_spin_lock_irq(&vga_lock); vga_state.SeqCtrlIndex = vga_r(state->vgabase, VGA_SEQ_I); vga_state.CrtCtrlIndex = inb_p(vga_video_port_reg); vga_state.CrtMiscIO = vga_r(state->vgabase, VGA_MIS_R); - spin_unlock_irq(&vga_lock); + atomic_spin_unlock_irq(&vga_lock); outb_p(0x00, vga_video_port_reg); /* HorizontalTotal */ vga_state.HorizontalTotal = inb_p(vga_video_port_val); @@ -916,7 +916,7 @@ static void vga_vesa_blank(struct vgastate *state, int mode) /* assure that video is enabled */ /* "0x20" is VIDEO_ENABLE_bit in register 01 of sequencer */ - spin_lock_irq(&vga_lock); + atomic_spin_lock_irq(&vga_lock); vga_wseq(state->vgabase, VGA_SEQ_CLOCK_MODE, vga_state.ClockingMode | 0x20); /* test for vertical retrace in process.... 
*/ @@ -952,13 +952,13 @@ static void vga_vesa_blank(struct vgastate *state, int mode) /* restore both index registers */ vga_w(state->vgabase, VGA_SEQ_I, vga_state.SeqCtrlIndex); outb_p(vga_state.CrtCtrlIndex, vga_video_port_reg); - spin_unlock_irq(&vga_lock); + atomic_spin_unlock_irq(&vga_lock); } static void vga_vesa_unblank(struct vgastate *state) { /* restore original values of VGA controller registers */ - spin_lock_irq(&vga_lock); + atomic_spin_lock_irq(&vga_lock); vga_w(state->vgabase, VGA_MIS_W, vga_state.CrtMiscIO); outb_p(0x00, vga_video_port_reg); /* HorizontalTotal */ @@ -983,7 +983,7 @@ static void vga_vesa_unblank(struct vgastate *state) /* restore index/control registers */ vga_w(state->vgabase, VGA_SEQ_I, vga_state.SeqCtrlIndex); outb_p(vga_state.CrtCtrlIndex, vga_video_port_reg); - spin_unlock_irq(&vga_lock); + atomic_spin_unlock_irq(&vga_lock); } static void vga_pal_blank(struct vgastate *state) @@ -1103,7 +1103,7 @@ static int vgacon_do_font_op(struct vgastate *state,char *arg,int set,int ch512) #endif unlock_kernel(); - spin_lock_irq(&vga_lock); + atomic_spin_lock_irq(&vga_lock); /* First, the Sequencer */ vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x1); /* CPU writes only to map 2 */ @@ -1119,7 +1119,7 @@ static int vgacon_do_font_op(struct vgastate *state,char *arg,int set,int ch512) vga_wgfx(state->vgabase, VGA_GFX_MODE, 0x00); /* map start at A000:0000 */ vga_wgfx(state->vgabase, VGA_GFX_MISC, 0x00); - spin_unlock_irq(&vga_lock); + atomic_spin_unlock_irq(&vga_lock); if (arg) { if (set) @@ -1146,7 +1146,7 @@ static int vgacon_do_font_op(struct vgastate *state,char *arg,int set,int ch512) } } - spin_lock_irq(&vga_lock); + atomic_spin_lock_irq(&vga_lock); /* First, the sequencer, Synchronous reset */ vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x01); /* CPU writes to maps 0 and 1 */ @@ -1185,7 +1185,7 @@ static int vgacon_do_font_op(struct vgastate *state,char *arg,int set,int ch512) inb_p(video_port_status); vga_wattr(state->vgabase, 
VGA_AR_ENABLE_DISPLAY, 0); } - spin_unlock_irq(&vga_lock); + atomic_spin_unlock_irq(&vga_lock); lock_kernel(); return 0; } @@ -1211,26 +1211,26 @@ static int vgacon_adjust_height(struct vc_data *vc, unsigned fontheight) registers; they are write-only on EGA, but it appears that they are all don't care bits on EGA, so I guess it doesn't matter. */ - spin_lock_irq(&vga_lock); + atomic_spin_lock_irq(&vga_lock); outb_p(0x07, vga_video_port_reg); /* CRTC overflow register */ ovr = inb_p(vga_video_port_val); outb_p(0x09, vga_video_port_reg); /* Font size register */ fsr = inb_p(vga_video_port_val); - spin_unlock_irq(&vga_lock); + atomic_spin_unlock_irq(&vga_lock); vde = maxscan & 0xff; /* Vertical display end reg */ ovr = (ovr & 0xbd) + /* Overflow register */ ((maxscan & 0x100) >> 7) + ((maxscan & 0x200) >> 3); fsr = (fsr & 0xe0) + (fontheight - 1); /* Font size register */ - spin_lock_irq(&vga_lock); + atomic_spin_lock_irq(&vga_lock); outb_p(0x07, vga_video_port_reg); /* CRTC overflow register */ outb_p(ovr, vga_video_port_val); outb_p(0x09, vga_video_port_reg); /* Font size */ outb_p(fsr, vga_video_port_val); outb_p(0x12, vga_video_port_reg); /* Vertical display limit */ outb_p(vde, vga_video_port_val); - spin_unlock_irq(&vga_lock); + atomic_spin_unlock_irq(&vga_lock); vga_video_font_height = fontheight; for (i = 0; i < MAX_NR_CONSOLES; i++) { diff --git a/fs/affs/super.c b/fs/affs/super.c index 104fdcb..b26e824 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -113,8 +113,8 @@ static void init_once(void *foo) { struct affs_inode_info *ei = (struct affs_inode_info *) foo; - init_MUTEX(&ei->i_link_lock); - init_MUTEX(&ei->i_ext_lock); + semaphore_init(&ei->i_link_lock); + semaphore_init(&ei->i_ext_lock); inode_init_once(&ei->vfs_inode); } diff --git a/fs/aio.c b/fs/aio.c index d065b2c..05e61f7 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -612,9 +612,11 @@ static void use_mm(struct mm_struct *mm) task_lock(tsk); active_mm = tsk->active_mm; atomic_inc(&mm->mm_count); + 
local_irq_disable(); // FIXME + switch_mm(active_mm, mm, tsk); tsk->mm = mm; tsk->active_mm = mm; - switch_mm(active_mm, mm, tsk); + local_irq_enable(); task_unlock(tsk); mmdrop(active_mm); diff --git a/fs/attr.c b/fs/attr.c index 9fe1b1b..5c8f5be 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -164,7 +164,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr) return error; if (ia_valid & ATTR_SIZE) - down_write(&dentry->d_inode->i_alloc_sem); + anon_down_write(&dentry->d_inode->i_alloc_sem); if (inode->i_op && inode->i_op->setattr) { error = inode->i_op->setattr(dentry, attr); @@ -181,7 +181,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr) } if (ia_valid & ATTR_SIZE) - up_write(&dentry->d_inode->i_alloc_sem); + anon_up_write(&dentry->d_inode->i_alloc_sem); if (!error) fsnotify_change(dentry, ia_valid); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 1c36e5c..4b3d18a 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -94,6 +94,7 @@ static int btrfs_spin_on_block(struct extent_buffer *eb) */ int btrfs_try_spin_lock(struct extent_buffer *eb) { +#ifndef CONFIG_PREEMPT_RT int i; if (btrfs_spin_on_block(eb)) { @@ -113,6 +114,7 @@ int btrfs_try_spin_lock(struct extent_buffer *eb) return 1; spin_unlock(&eb->lock); } +#endif return 0; } diff --git a/fs/buffer.c b/fs/buffer.c index 28f320f..deccb03 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -40,7 +40,6 @@ #include <linux/cpu.h> #include <linux/bitops.h> #include <linux/mpage.h> -#include <linux/bit_spinlock.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -324,8 +323,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * decide that the page is now completely done. 
*/ first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -338,8 +336,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); /* * If none of the buffers had errors and they are all @@ -351,8 +348,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } @@ -387,8 +383,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_write(bh); unlock_buffer(bh); @@ -400,14 +395,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); end_page_writeback(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } @@ -3251,6 +3244,8 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); + spin_lock_init(&ret->b_uptodate_lock); + spin_lock_init(&ret->b_state_lock); get_cpu_var(bh_accounting).nr++; recalc_bh_state(); put_cpu_var(bh_accounting); @@ -3262,6 +3257,8 @@ EXPORT_SYMBOL(alloc_buffer_head); void free_buffer_head(struct 
buffer_head *bh) { BUG_ON(!list_empty(&bh->b_assoc_buffers)); + BUG_ON(spin_is_locked(&bh->b_uptodate_lock)); + BUG_ON(spin_is_locked(&bh->b_state_lock)); kmem_cache_free(bh_cachep, bh); get_cpu_var(bh_accounting).nr--; recalc_bh_state(); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 6084d63..0b9dfae 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -203,7 +203,7 @@ struct cifsUidInfo { struct cifsSesInfo { struct list_head smb_ses_list; struct list_head tcon_list; - struct semaphore sesSem; + struct mutex sesSem; #if 0 struct cifsUidInfo *uidInfo; /* pointer to user info */ #endif diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1866bc2..2ed129a 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -154,7 +154,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, nls_codepage = load_nls_default(); /* need to prevent multiple threads trying to simultaneously reconnect the same SMB session */ - down(&tcon->ses->sesSem); + mutex_lock(&tcon->ses->sesSem); if (tcon->ses->need_reconnect) rc = cifs_setup_session(0, tcon->ses, nls_codepage); @@ -162,7 +162,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, mark_open_files_invalid(tcon); rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); - up(&tcon->ses->sesSem); + mutex_unlock(&tcon->ses->sesSem); /* BB FIXME add code to check if wsize needs update due to negotiated smb buffer size shrinking */ @@ -196,7 +196,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, } } } else { - up(&tcon->ses->sesSem); + mutex_unlock(&tcon->ses->sesSem); } unload_nls(nls_codepage); @@ -301,7 +301,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, nls_codepage = load_nls_default(); /* need to prevent multiple threads trying to simultaneously reconnect the same SMB session */ - down(&tcon->ses->sesSem); + mutex_lock(&tcon->ses->sesSem); if (tcon->ses->need_reconnect) rc = cifs_setup_session(0, tcon->ses, nls_codepage); 
@@ -309,7 +309,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, mark_open_files_invalid(tcon); rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); - up(&tcon->ses->sesSem); + mutex_unlock(&tcon->ses->sesSem); /* BB FIXME add code to check if wsize needs update due to negotiated smb buffer size shrinking */ @@ -343,7 +343,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, } } } else { - up(&tcon->ses->sesSem); + mutex_unlock(&tcon->ses->sesSem); } unload_nls(nls_codepage); @@ -765,13 +765,13 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) if (!ses || !ses->server) return -EIO; - down(&ses->sesSem); + mutex_lock(&ses->sesSem); if (ses->need_reconnect) goto session_already_dead; /* no need to send SMBlogoff if uid already closed due to reconnect */ rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB); if (rc) { - up(&ses->sesSem); + mutex_unlock(&ses->sesSem); return rc; } @@ -786,7 +786,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) pSMB->AndXCommand = 0xFF; rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0); session_already_dead: - up(&ses->sesSem); + mutex_unlock(&ses->sesSem); /* if session dead then we do not need to do ulogoff, since server closed smb session, no sense reporting diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1f3345d..6704605 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2349,13 +2349,13 @@ try_mount_again: */ cifs_put_tcp_session(srvTcp); - down(&pSesInfo->sesSem); + mutex_lock(&pSesInfo->sesSem); if (pSesInfo->need_reconnect) { cFYI(1, ("Session needs reconnect")); rc = cifs_setup_session(xid, pSesInfo, cifs_sb->local_nls); } - up(&pSesInfo->sesSem); + mutex_unlock(&pSesInfo->sesSem); } else if (!rc) { cFYI(1, ("Existing smb sess not found")); pSesInfo = sesInfoAlloc(); @@ -2398,12 +2398,12 @@ try_mount_again: } pSesInfo->linux_uid = volume_info->linux_uid; pSesInfo->overrideSecFlg = volume_info->secFlg; - down(&pSesInfo->sesSem); + 
mutex_lock(&pSesInfo->sesSem); /* BB FIXME need to pass vol->secFlgs BB */ rc = cifs_setup_session(xid, pSesInfo, cifs_sb->local_nls); - up(&pSesInfo->sesSem); + mutex_unlock(&pSesInfo->sesSem); } /* search for existing tcon to this server share */ diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index e079a91..79d7c9c 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -80,7 +80,7 @@ sesInfoAlloc(void) ++ret_buf->ses_count; INIT_LIST_HEAD(&ret_buf->smb_ses_list); INIT_LIST_HEAD(&ret_buf->tcon_list); - init_MUTEX(&ret_buf->sesSem); + mutex_init(&ret_buf->sesSem); } return ret_buf; } diff --git a/fs/dcache.c b/fs/dcache.c index 9e5cd3c..ce00455 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -726,8 +726,9 @@ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; - if (down_read_trylock(&sb->s_umount)) - BUG(); +// -rt: this might succeed there ... +// if (down_read_trylock(&sb->s_umount)) +// BUG(); dentry = sb->s_root; sb->s_root = NULL; diff --git a/fs/direct-io.c b/fs/direct-io.c index 8b10b87..ab9ae61 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -242,7 +242,7 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) dio->map_bh.b_private); if (dio->lock_type == DIO_LOCKING) /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); + anon_up_read_non_owner(&dio->inode->i_alloc_sem); if (ret == 0) ret = dio->page_errors; @@ -1192,7 +1192,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (dio_lock_type == DIO_LOCKING) /* lockdep: not the owner will release it */ - down_read_non_owner(&inode->i_alloc_sem); + anon_down_read_non_owner(&inode->i_alloc_sem); } /* diff --git a/fs/exec.c b/fs/exec.c index 172ceb6..e23ccf3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -48,6 +48,7 @@ #include <linux/security.h> #include <linux/ima.h> #include <linux/syscalls.h> +#include <linux/delay.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> @@ -501,7 +502,7 @@ static 
int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; - struct mmu_gather *tlb; + struct mmu_gather tlb; BUG_ON(new_start > new_end); @@ -526,12 +527,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) return -ENOMEM; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); + tlb_gather_mmu(&tlb, mm, 0); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ - free_pgd_range(tlb, new_end, old_end, new_end, + free_pgd_range(&tlb, new_end, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } else { /* @@ -540,10 +541,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ - free_pgd_range(tlb, old_start, old_end, new_end, + free_pgd_range(&tlb, old_start, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } - tlb_finish_mmu(tlb, new_end, old_end); + tlb_finish_mmu(&tlb, new_end, old_end); /* * shrink the vma to just the new range. @@ -719,10 +720,12 @@ static int exec_mmap(struct mm_struct *mm) } } task_lock(tsk); + local_irq_disable(); active_mm = tsk->active_mm; + activate_mm(active_mm, mm); tsk->mm = mm; tsk->active_mm = mm; - activate_mm(active_mm, mm); + local_irq_enable(); task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9c642b..5ad50b2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5265,7 +5265,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * Get i_alloc_sem to stop truncates messing with the inode. We cannot * get i_mutex because we are already holding mmap_sem. 
*/ - down_read(&inode->i_alloc_sem); + anon_down_read(&inode->i_alloc_sem); size = i_size_read(inode); if (page->mapping != mapping || size <= page_offset(page) || !PageUptodate(page)) { @@ -5306,6 +5306,6 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out_unlock: if (ret) ret = VM_FAULT_SIGBUS; - up_read(&inode->i_alloc_sem); + anon_up_read(&inode->i_alloc_sem); return ret; } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 8970d8c..da8dbb1 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -202,9 +202,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ - down_read(&mapping->host->i_alloc_sem); + anon_down_read(&mapping->host->i_alloc_sem); blocknr = generic_block_bmap(mapping, block, fat_get_block); - up_read(&mapping->host->i_alloc_sem); + anon_up_read(&mapping->host->i_alloc_sem); return blocknr; } diff --git a/fs/file.c b/fs/file.c index f313314..710e9b0 100644 --- a/fs/file.c +++ b/fs/file.c @@ -102,14 +102,15 @@ void free_fdtable_rcu(struct rcu_head *rcu) kfree(fdt->open_fds); kfree(fdt); } else { - fddef = &get_cpu_var(fdtable_defer_list); + + fddef = &per_cpu(fdtable_defer_list, raw_smp_processor_id()); + spin_lock(&fddef->lock); fdt->next = fddef->next; fddef->next = fdt; /* vmallocs are handled from the workqueue context */ schedule_work(&fddef->wq); spin_unlock(&fddef->lock); - put_cpu_var(fdtable_defer_list); } } diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index 4129cdb..571abe9 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); - down(&tree->tree_lock); + mutex_lock(&tree->tree_lock); return 0; } @@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd) 
hfs_bnode_put(fd->bnode); kfree(fd->search_key); dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); - up(&fd->tree->tree_lock); + mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 9b9d639..4452b3a 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -26,7 +26,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke if (!tree) return NULL; - init_MUTEX(&tree->tree_lock); + mutex_init(&tree->tree_lock); spin_lock_init(&tree->hash_lock); /* Set the correct compare function */ tree->sb = sb; diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index cc51905..2a1d712 100644 --- a/fs/hfs/btree.h +++ b/fs/hfs/btree.h @@ -33,7 +33,7 @@ struct hfs_btree { unsigned int depth; //unsigned int map1_size, map_size; - struct semaphore tree_lock; + struct mutex tree_lock; unsigned int pages_per_bnode; spinlock_t hash_lock; diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 5007a41..68c7983 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); - down(&tree->tree_lock); + mutex_lock(&tree->tree_lock); return 0; } @@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd) hfs_bnode_put(fd->bnode); kfree(fd->search_key); dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); - up(&fd->tree->tree_lock); + mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index e49fcee..aa5fcb3 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) if (!tree) return NULL; - init_MUTEX(&tree->tree_lock); + mutex_init(&tree->tree_lock); 
spin_lock_init(&tree->hash_lock); tree->sb = sb; tree->cnid = id; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 5c10d80..d15f35e 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -62,7 +62,7 @@ struct hfs_btree { unsigned int depth; //unsigned int map1_size, map_size; - struct semaphore tree_lock; + struct mutex tree_lock; unsigned int pages_per_bnode; spinlock_t hash_lock; diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index b6fca54..8f4141c 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -13,7 +13,7 @@ void hpfs_lock_creation(struct super_block *s) #ifdef DEBUG_LOCKS printk("lock creation\n"); #endif - down(&hpfs_sb(s)->hpfs_creation_de); + mutex_lock(&hpfs_sb(s)->hpfs_creation_de); } void hpfs_unlock_creation(struct super_block *s) @@ -21,7 +21,7 @@ void hpfs_unlock_creation(struct super_block *s) #ifdef DEBUG_LOCKS printk("unlock creation\n"); #endif - up(&hpfs_sb(s)->hpfs_creation_de); + mutex_unlock(&hpfs_sb(s)->hpfs_creation_de); } /* Map a sector into a buffer and return pointers to it and to the buffer. 
*/ diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 701ca54..ed222f5 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -87,7 +87,7 @@ struct hpfs_sb_info { unsigned *sb_bmp_dir; /* main bitmap directory */ unsigned sb_c_bitmap; /* current bitmap */ unsigned sb_max_fwd_alloc; /* max forwad allocation */ - struct semaphore hpfs_creation_de; /* when creating dirents, nobody else + struct mutex hpfs_creation_de; /* when creating dirents, nobody else can alloc blocks */ /*unsigned sb_mounting : 1;*/ int sb_timeshift; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index f2feaa0..fa81510 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -487,7 +487,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) sbi->sb_bmp_dir = NULL; sbi->sb_cp_table = NULL; - init_MUTEX(&sbi->hpfs_creation_de); + mutex_init(&sbi->hpfs_creation_de); uid = current_uid(); gid = current_gid(); diff --git a/fs/inode.c b/fs/inode.c index ae7b67e..a958813 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -163,7 +163,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mutex_init(&inode->i_mutex); lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); - init_rwsem(&inode->i_alloc_sem); + init_anon_rwsem(&inode->i_alloc_sem); lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); mapping->a_ops = &empty_aops; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index c03ac11..e7348c2 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1576,7 +1576,7 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh) transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); transaction = jh->b_transaction; if (transaction) assert_spin_locked(&transaction->t_journal->j_list_lock); @@ -2028,7 +2028,7 @@ void __journal_file_buffer(struct journal_head *jh, int was_dirty = 0; struct buffer_head *bh = 
jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); @@ -2122,7 +2122,7 @@ void __journal_refile_buffer(struct journal_head *jh) int was_dirty; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); if (jh->b_transaction) assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6213ac7..575d8c0 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1451,7 +1451,7 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); transaction = jh->b_transaction; if (transaction) assert_spin_locked(&transaction->t_journal->j_list_lock); @@ -1883,7 +1883,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); @@ -1971,7 +1971,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) int was_dirty; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); if (jh->b_transaction) assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); diff --git a/fs/namespace.c b/fs/namespace.c index 7230787..2a27e33 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -264,8 +264,16 @@ int mnt_want_write(struct vfsmount *mnt) * incremented count after it has set MNT_WRITE_HOLD. 
*/ smp_mb(); - while (mnt->mnt_flags & MNT_WRITE_HOLD) - cpu_relax(); + preempt_enable(); + /* + * HACK ALERT. on RT we can not spin here with cpu_relax() and + * preemption disabled so we block on the vfsmount lock which is + * held by mnt_make_readonly(). Works on !RT as well. + */ + while (mnt->mnt_flags & MNT_WRITE_HOLD) { + spin_lock(&vfsmount_lock); + spin_unlock(&vfsmount_lock); + } /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. So we must not load that until @@ -273,12 +281,11 @@ int mnt_want_write(struct vfsmount *mnt) */ smp_rmb(); if (__mnt_is_readonly(mnt)) { + preempt_disable(); dec_mnt_writers(mnt); + preempt_enable(); ret = -EROFS; - goto out; } -out: - preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 2dfd477..42b6394 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -501,7 +501,7 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, spin_lock_init(&inode->i_lock); mutex_init(&inode->i_mutex); - init_rwsem(&inode->i_alloc_sem); + init_anon_rwsem(&inode->i_alloc_sem); mapping->host = NULL; /* instead of inode */ mapping->flags = 0; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index b38f944..4391e00 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -29,6 +29,7 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/bit_spinlock.h> +#include <linux/interrupt.h> #include "aops.h" #include "attrib.h" @@ -107,8 +108,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) "0x%llx.", (unsigned long long)bh->b_blocknr); } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -123,8 +123,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } 
while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); /* * If none of the buffers had errors then we can set the page uptodate, * but we first have to perform the post read mst fixups, if the @@ -145,13 +144,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) recs = PAGE_CACHE_SIZE / rec_size; /* Should have been verified before we got here... */ BUG_ON(!recs); - local_irq_save(flags); + local_irq_save_nort(flags); kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); for (i = 0; i < recs; i++) post_read_mst_fixup((NTFS_RECORD*)(kaddr + i * rec_size), rec_size); kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); - local_irq_restore(flags); + local_irq_restore_nort(flags); flush_dcache_page(page); if (likely(page_uptodate && !PageError(page))) SetPageUptodate(page); @@ -159,8 +158,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) unlock_page(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 3140a44..9f07b9c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1845,9 +1845,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, * fails again. 
*/ if (unlikely(NInoTruncateFailed(ni))) { - down_write(&vi->i_alloc_sem); + anon_down_write(&vi->i_alloc_sem); err = ntfs_truncate(vi); - up_write(&vi->i_alloc_sem); + anon_up_write(&vi->i_alloc_sem); if (err || NInoTruncateFailed(ni)) { if (!err) err = -EIO; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 8a1e615..788c0c8 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -643,7 +643,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, level = ocfs2_iocb_rw_locked_level(iocb); if (!level) - up_read(&inode->i_alloc_sem); + anon_up_read(&inode->i_alloc_sem); ocfs2_rw_unlock(inode, level); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index aa501d3..988d7b4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1799,7 +1799,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, relock: /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ if (direct_io) { - down_read(&inode->i_alloc_sem); + anon_down_read(&inode->i_alloc_sem); have_alloc_sem = 1; } @@ -1826,7 +1826,7 @@ relock: */ if (direct_io && !can_do_direct) { ocfs2_rw_unlock(inode, rw_level); - up_read(&inode->i_alloc_sem); + anon_up_read(&inode->i_alloc_sem); have_alloc_sem = 0; rw_level = -1; @@ -1915,7 +1915,7 @@ out: out_sems: if (have_alloc_sem) - up_read(&inode->i_alloc_sem); + anon_up_read(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); @@ -2079,7 +2079,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, * need locks to protect pending reads from racing with truncate. 
*/ if (filp->f_flags & O_DIRECT) { - down_read(&inode->i_alloc_sem); + anon_down_read(&inode->i_alloc_sem); have_alloc_sem = 1; ret = ocfs2_rw_lock(inode, 0); @@ -2123,7 +2123,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, bail: if (have_alloc_sem) - up_read(&inode->i_alloc_sem); + anon_up_read(&inode->i_alloc_sem); if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); mlog_exit(ret); diff --git a/fs/pipe.c b/fs/pipe.c index 52c4151..7f30ed2 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -434,8 +434,14 @@ redo: wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? + */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) file_accessed(filp); +#endif return ret; } @@ -607,8 +613,14 @@ out: wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? 
+ */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) file_update_time(filp); +#endif return ret; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 725a650..d470f09 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -134,12 +134,13 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) */ static const char *task_state_array[] = { "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "T (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "M (running-mutex)", /* 1 */ + "S (sleeping)", /* 2 */ + "D (disk sleep)", /* 4 */ + "T (stopped)", /* 8 */ + "T (tracing stop)", /* 16 */ + "Z (zombie)", /* 32 */ + "X (dead)" /* 64 */ }; static inline const char *get_task_state(struct task_struct *tsk) @@ -321,6 +322,19 @@ static inline void task_context_switch_counts(struct seq_file *m, p->nivcsw); } +#define get_blocked_on(t) (-1) + +static inline void show_blocked_on(struct seq_file *m, struct task_struct *p) +{ + pid_t pid = get_blocked_on(p); + + if (pid < 0) + return; + + seq_printf(m, "BlckOn: %d\n", pid); +} + + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -340,6 +354,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_show_regs(m, task); #endif task_context_switch_counts(m, task); + show_blocked_on(m, task); return 0; } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 7cc726c..ded1841 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -27,14 +27,14 @@ static int show_stat(struct seq_file *p, void *v) int i, j; unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; - cputime64_t guest; + cputime64_t guest, user_rt, system_rt; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec boottime; unsigned int per_irq_sum; - user = nice = system = idle = iowait = + user_rt = system_rt = user = nice = system = idle = iowait 
= irq = softirq = steal = cputime64_zero; guest = cputime64_zero; getboottime(&boottime); @@ -50,6 +50,8 @@ static int show_stat(struct seq_file *p, void *v) irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); + user_rt = cputime64_add(user_rt, kstat_cpu(i).cpustat.user_rt); + system_rt = cputime64_add(system_rt, kstat_cpu(i).cpustat.system_rt); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); for_each_irq_nr(j) { sum += kstat_irqs_cpu(j, i); @@ -65,7 +67,10 @@ static int show_stat(struct seq_file *p, void *v) } sum += arch_irq_stat(); - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + user = cputime64_add(user_rt, user); + system = cputime64_add(system_rt, system); + + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), (unsigned long long)cputime64_to_clock_t(system), @@ -74,13 +79,17 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(user_rt), + (unsigned long long)cputime64_to_clock_t(system_rt), (unsigned long long)cputime64_to_clock_t(guest)); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kstat_cpu(i).cpustat.user; + user_rt = kstat_cpu(i).cpustat.user_rt; + system_rt = kstat_cpu(i).cpustat.system_rt; + user = cputime64_add(user_rt, kstat_cpu(i).cpustat.user); nice = kstat_cpu(i).cpustat.nice; - system = kstat_cpu(i).cpustat.system; + system = cputime64_add(system_rt, kstat_cpu(i).cpustat.system); idle = kstat_cpu(i).cpustat.idle; idle = cputime64_add(idle, arch_idle_time(i)); iowait = kstat_cpu(i).cpustat.iowait; @@ -89,7 +98,7 @@ static int 
show_stat(struct seq_file *p, void *v) steal = kstat_cpu(i).cpustat.steal; guest = kstat_cpu(i).cpustat.guest; seq_printf(p, - "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", i, (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), @@ -99,6 +108,8 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(user_rt), + (unsigned long long)cputime64_to_clock_t(system_rt), (unsigned long long)cputime64_to_clock_t(guest)); } seq_printf(p, "intr %llu", (unsigned long long)sum); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9bd8be1..9d5e1b1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -138,8 +138,10 @@ static void *m_start(struct seq_file *m, loff_t *pos) vma = NULL; if ((unsigned long)l < mm->map_count) { vma = mm->mmap; - while (l-- && vma) + while (l-- && vma) { vma = vma->vm_next; + cond_resched(); + } goto out; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 6925b83..45a1c14 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -540,9 +540,9 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); - down_write(&dentry->d_inode->i_alloc_sem); + anon_down_write(&dentry->d_inode->i_alloc_sem); err = reiserfs_setattr(dentry, &newattrs); - up_write(&dentry->d_inode->i_alloc_sem); + anon_up_write(&dentry->d_inode->i_alloc_sem); mutex_unlock(&dentry->d_inode->i_mutex); } else update_ctime(inode); diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 1402d2d..2e5ad84 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -536,7 +536,7 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) 
server->mnt = NULL; server->sock_file = NULL; init_waitqueue_head(&server->conn_wq); - init_MUTEX(&server->sem); + mutex_init(&server->mutex); INIT_LIST_HEAD(&server->entry); INIT_LIST_HEAD(&server->xmitq); INIT_LIST_HEAD(&server->recvq); diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h index ff6a198..77de2c2 100644 --- a/fs/xfs/linux-2.6/mrlock.h +++ b/fs/xfs/linux-2.6/mrlock.h @@ -21,7 +21,7 @@ #include <linux/rwsem.h> typedef struct { - struct rw_semaphore mr_lock; + struct rw_anon_semaphore mr_lock; #ifdef DEBUG int mr_writer; #endif @@ -29,10 +29,10 @@ typedef struct { #ifdef DEBUG #define mrinit(mrp, name) \ - do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) + do { (mrp)->mr_writer = 0; init_anon_rwsem(&(mrp)->mr_lock); } while (0) #else #define mrinit(mrp, name) \ - do { init_rwsem(&(mrp)->mr_lock); } while (0) + do { init_anon_rwsem(&(mrp)->mr_lock); } while (0) #endif #define mrlock_init(mrp, t,n,s) mrinit(mrp, n) @@ -40,12 +40,12 @@ typedef struct { static inline void mraccess_nested(mrlock_t *mrp, int subclass) { - down_read_nested(&mrp->mr_lock, subclass); + anon_down_read_nested(&mrp->mr_lock, subclass); } static inline void mrupdate_nested(mrlock_t *mrp, int subclass) { - down_write_nested(&mrp->mr_lock, subclass); + anon_down_write_nested(&mrp->mr_lock, subclass); #ifdef DEBUG mrp->mr_writer = 1; #endif @@ -53,12 +53,12 @@ static inline void mrupdate_nested(mrlock_t *mrp, int subclass) static inline int mrtryaccess(mrlock_t *mrp) { - return down_read_trylock(&mrp->mr_lock); + return anon_down_read_trylock(&mrp->mr_lock); } static inline int mrtryupdate(mrlock_t *mrp) { - if (!down_write_trylock(&mrp->mr_lock)) + if (!anon_down_write_trylock(&mrp->mr_lock)) return 0; #ifdef DEBUG mrp->mr_writer = 1; @@ -71,12 +71,12 @@ static inline void mrunlock_excl(mrlock_t *mrp) #ifdef DEBUG mrp->mr_writer = 0; #endif - up_write(&mrp->mr_lock); + anon_up_write(&mrp->mr_lock); } static inline void mrunlock_shared(mrlock_t *mrp) 
{ - up_read(&mrp->mr_lock); + anon_up_read(&mrp->mr_lock); } static inline void mrdemote(mrlock_t *mrp) @@ -84,7 +84,7 @@ static inline void mrdemote(mrlock_t *mrp) #ifdef DEBUG mrp->mr_writer = 0; #endif - downgrade_write(&mrp->mr_lock); + anon_downgrade_write(&mrp->mr_lock); } #endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 965df12..c4db8c8 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -263,7 +263,7 @@ _xfs_buf_initialize( init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_list); INIT_LIST_HEAD(&bp->b_hash_list); - init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ + anon_semaphore_init_locked(&bp->b_sema); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; bp->b_file_offset = range_base; @@ -545,7 +545,7 @@ found: * if this does not work then we need to drop the * spinlock and do a hard attempt on the semaphore. */ - if (down_trylock(&bp->b_sema)) { + if (anon_down_trylock(&bp->b_sema)) { if (!(flags & XBF_TRYLOCK)) { /* wait for buffer ownership */ XB_TRACE(bp, "get_lock", 0); @@ -908,7 +908,7 @@ xfs_buf_cond_lock( { int locked; - locked = down_trylock(&bp->b_sema) == 0; + locked = anon_down_trylock(&bp->b_sema) == 0; if (locked) { XB_SET_OWNER(bp); } @@ -938,7 +938,7 @@ xfs_buf_lock( XB_TRACE(bp, "lock", 0); if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); - down(&bp->b_sema); + anon_down(&bp->b_sema); XB_SET_OWNER(bp); XB_TRACE(bp, "locked", 0); } @@ -961,7 +961,7 @@ xfs_buf_unlock( } XB_CLEAR_OWNER(bp); - up(&bp->b_sema); + anon_up(&bp->b_sema); XB_TRACE(bp, "unlock", 0); } diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 9b4d666..8371852 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -145,7 +145,7 @@ typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); #define XB_PAGES 2 typedef struct xfs_buf { - struct semaphore b_sema; /* semaphore for lockables */ + 
struct anon_semaphore b_sema; /* semaphore for lockables */ unsigned long b_queuetime; /* time buffer was queued */ atomic_t b_pin_count; /* pin count */ wait_queue_head_t b_waiters; /* unpin waiters */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 2cf944e..c1f80d0 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2412,7 +2412,7 @@ xfs_alloc_vextent( * These three force us into a single a.g. */ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); args->pag = &mp->m_perag[args->agno]; args->minleft = 0; error = xfs_alloc_fix_freelist(args, 0); @@ -2422,14 +2422,14 @@ xfs_alloc_vextent( goto error0; } if (!args->agbp) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); TRACE_ALLOC("noagbp", args); break; } args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); if ((error = xfs_alloc_ag_vextent(args))) goto error0; - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); break; case XFS_ALLOCTYPE_START_BNO: /* @@ -2481,7 +2481,7 @@ xfs_alloc_vextent( * Loop over allocation groups twice; first time with * trylock set, second time without. 
*/ - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); for (;;) { args->pag = &mp->m_perag[args->agno]; if (no_min) args->minleft = 0; @@ -2541,7 +2541,7 @@ xfs_alloc_vextent( } } } - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { if (args->agno == sagno) mp->m_agfrotor = (mp->m_agfrotor + 1) % @@ -2569,7 +2569,7 @@ xfs_alloc_vextent( } return 0; error0: - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return error; } @@ -2594,7 +2594,7 @@ xfs_free_extent( args.agno = XFS_FSB_TO_AGNO(args.mp, bno); ASSERT(args.agno < args.mp->m_sb.sb_agcount); args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - down_read(&args.mp->m_peraglock); + anon_down_read(&args.mp->m_peraglock); args.pag = &args.mp->m_perag[args.agno]; if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) goto error0; @@ -2605,7 +2605,7 @@ xfs_free_extent( #endif error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); error0: - up_read(&args.mp->m_peraglock); + anon_up_read(&args.mp->m_peraglock); return error; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 8ee5b5a..68dc91b 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2780,13 +2780,13 @@ xfs_bmap_btalloc( if (startag == NULLAGNUMBER) startag = ag = 0; notinit = 0; - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); while (blen < ap->alen) { pag = &mp->m_perag[ag]; if (!pag->pagf_init && (error = xfs_alloc_pagf_init(mp, args.tp, ag, XFS_ALLOC_FLAG_TRYLOCK))) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return error; } /* @@ -2819,7 +2819,7 @@ xfs_bmap_btalloc( error = xfs_filestream_new_ag(ap, &ag); if (error) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return error; } @@ -2833,7 +2833,7 @@ xfs_bmap_btalloc( if (ag == startag) break; } - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); /* * Since the above loop did a BUF_TRYLOCK, it is * possible 
that there is space for this request. diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index edf8bdf..233e214 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -530,7 +530,7 @@ xfs_filestream_associate( mp = pip->i_mount; cache = mp->m_filestream; - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); /* * We have a problem, Houston. @@ -548,7 +548,7 @@ xfs_filestream_associate( * So, if we can't get the iolock without sleeping then just give up */ if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return 1; } @@ -605,7 +605,7 @@ exit_did_pick: exit: xfs_iunlock(pip, XFS_IOLOCK_EXCL); - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return -err; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 2d0b3e1..432fc3f 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -176,16 +176,17 @@ xfs_growfs_data_private( if (!new_perag) return XFS_ERROR(ENOMEM); - down_write(&mp->m_peraglock); + anon_down_write(&mp->m_peraglock); memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount); old_perag = mp->m_perag; mp->m_perag = new_perag; mp->m_flags |= XFS_MOUNT_32BITINODES; nagimax = xfs_initialize_perag(mp, nagcount); - up_write(&mp->m_peraglock); + anon_up_write(&mp->m_peraglock); kmem_free(old_perag); + } tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); tp->t_flags |= XFS_TRANS_RESERVE; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 3120a3a..01ac9b5 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -369,9 +369,9 @@ xfs_ialloc_ag_alloc( be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); agno = be32_to_cpu(agi->agi_seqno); - down_read(&args.mp->m_peraglock); + anon_down_read(&args.mp->m_peraglock); args.mp->m_perag[agno].pagi_freecount += newlen; - up_read(&args.mp->m_peraglock); + anon_up_read(&args.mp->m_peraglock); agi->agi_newino = cpu_to_be32(newino); /* * Insert records describing 
the new inode chunk into the btree. @@ -468,7 +468,7 @@ xfs_ialloc_ag_select( */ agno = pagno; flags = XFS_ALLOC_FLAG_TRYLOCK; - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); for (;;) { pag = &mp->m_perag[agno]; if (!pag->pagi_init) { @@ -509,7 +509,7 @@ xfs_ialloc_ag_select( agbp = NULL; goto nextag; } - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return agbp; } } @@ -522,7 +522,7 @@ nextag: * down. */ if (XFS_FORCED_SHUTDOWN(mp)) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return NULL; } agno++; @@ -530,7 +530,7 @@ nextag: agno = 0; if (agno == pagno) { if (flags == 0) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); return NULL; } flags = 0; @@ -697,13 +697,13 @@ nextag: *inop = NULLFSINO; return noroom ? ENOSPC : 0; } - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); if (mp->m_perag[tagno].pagi_inodeok == 0) { - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); goto nextag; } error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); if (error) goto nextag; agi = XFS_BUF_TO_AGI(agbp); @@ -950,9 +950,9 @@ nextag: goto error0; be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); mp->m_perag[tagno].pagi_freecount--; - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); #ifdef DEBUG if (cur->bc_nlevels == 1) { int freecount = 0; @@ -1046,9 +1046,9 @@ xfs_difree( /* * Get the allocation group header. */ - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); if (error) { cmn_err(CE_WARN, "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. 
Returning error.", @@ -1130,9 +1130,9 @@ xfs_difree( be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); mp->m_perag[agno].pagi_freecount -= ilen - 1; - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); @@ -1159,9 +1159,9 @@ xfs_difree( */ be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); mp->m_perag[agno].pagi_freecount++; - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } @@ -1303,9 +1303,9 @@ xfs_imap( xfs_buf_t *agbp; /* agi buffer */ int i; /* temp state */ - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); if (error) { xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " "xfs_ialloc_read_agi() returned " diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index aeb2d22..2b020fc 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -418,9 +418,9 @@ xfs_bulkstat( while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { cond_resched(); bp = NULL; - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); if (error) { /* * Skip this allocation group and go to the next one. 
@@ -840,9 +840,9 @@ xfs_inumbers( agbp = NULL; while (left > 0 && agno < mp->m_sb.sb_agcount) { if (agbp == NULL) { - down_read(&mp->m_peraglock); + anon_down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - up_read(&mp->m_peraglock); + anon_up_read(&mp->m_peraglock); if (error) { /* * If we can't read the AGI of this ag, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 5c6f092..df54a40 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1150,7 +1150,7 @@ xfs_mountfs( /* * Allocate and initialize the per-ag data. */ - init_rwsem(&mp->m_peraglock); + init_anon_rwsem(&mp->m_peraglock); mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_MAYFAIL); if (!mp->m_perag) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a512238..cc85340 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -193,7 +193,7 @@ typedef struct xfs_mount { uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* max inobt btree levels. 
*/ struct xfs_perag *m_perag; /* per-ag accounting info */ - struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ + struct rw_anon_semaphore m_peraglock; /* lock for m_perag (pointer) */ struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h index ab0b85c..e3094c6 100644 --- a/include/acpi/acpiosxf.h +++ b/include/acpi/acpiosxf.h @@ -61,7 +61,7 @@ typedef enum { OSL_EC_BURST_HANDLER } acpi_execute_type; -#define ACPI_NO_UNIT_LIMIT ((u32) -1) +#define ACPI_NO_UNIT_LIMIT (INT_MAX/2) #define ACPI_MUTEX_SEM 1 /* Functions for acpi_os_signal */ diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 4b67559..495d568 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -3,6 +3,10 @@ #include <linux/compiler.h> +#ifndef __ASSEMBLY__ +extern void __WARN_ON(const char *func, const char *file, const int line); +#endif /* __ASSEMBLY__ */ + #ifdef CONFIG_BUG #ifdef CONFIG_GENERIC_BUG @@ -141,4 +145,20 @@ extern void warn_slowpath_null(const char *file, const int line); # define WARN_ON_SMP(x) do { } while (0) #endif +#ifdef CONFIG_PREEMPT_RT +# define BUG_ON_RT(c) BUG_ON(c) +# define BUG_ON_NONRT(c) do { } while (0) +# define WARN_ON_RT(condition) WARN_ON(condition) +# define WARN_ON_NONRT(condition) do { } while (0) +# define WARN_ON_ONCE_NONRT(condition) do { } while (0) +# define WARN_ONCE_NONRT(cond, fmt...) do { } while (0) +#else +# define BUG_ON_RT(c) do { } while (0) +# define BUG_ON_NONRT(c) BUG_ON(c) +# define WARN_ON_RT(condition) do { } while (0) +# define WARN_ON_NONRT(condition) WARN_ON(condition) +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition) +# define WARN_ONCE_NONRT(cond, fmt...)
WARN_ONCE(cond, fmt) +#endif + #endif diff --git a/include/asm-generic/cmpxchg-local.h b/include/asm-generic/cmpxchg-local.h index b2ba2fc..9793123 100644 --- a/include/asm-generic/cmpxchg-local.h +++ b/include/asm-generic/cmpxchg-local.h @@ -20,7 +20,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, if (size == 8 && sizeof(unsigned long) != 8) wrong_size_cmpxchg(ptr); - local_irq_save(flags); + raw_local_irq_save(flags); switch (size) { case 1: prev = *(u8 *)ptr; if (prev == old) @@ -41,7 +41,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, default: wrong_size_cmpxchg(ptr); } - local_irq_restore(flags); + raw_local_irq_restore(flags); return prev; } @@ -54,11 +54,11 @@ static inline u64 __cmpxchg64_local_generic(volatile void *ptr, u64 prev; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prev = *(u64 *)ptr; if (prev == old) *(u64 *)ptr = new; - local_irq_restore(flags); + raw_local_irq_restore(flags); return prev; } diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index aa00800..f8a80da 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -5,6 +5,9 @@ #include <linux/threads.h> #include <linux/percpu-defs.h> +#define __per_cpu_var_lock(var) per_cpu__lock_##var##_locked +#define __per_cpu_var_lock_var(var) per_cpu__##var##_locked + #ifdef CONFIG_SMP /* @@ -56,6 +59,14 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #define __raw_get_cpu_var(var) \ (*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset)) +#define per_cpu_lock(var, cpu) \ + (*SHIFT_PERCPU_PTR(&__per_cpu_var_lock(var), per_cpu_offset(cpu))) +#define per_cpu_var_locked(var, cpu) \ + (*SHIFT_PERCPU_PTR(&__per_cpu_var_lock_var(var), per_cpu_offset(cpu))) +#define __get_cpu_lock(var, cpu) \ + per_cpu_lock(var, cpu) +#define __get_cpu_var_locked(var, cpu) \ + per_cpu_var_locked(var, cpu) #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA extern void setup_per_cpu_areas(void); @@ -64,9 
+75,11 @@ extern void setup_per_cpu_areas(void); #else /* ! SMP */ #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var))) +#define per_cpu_var_locked(var, cpu) (*((void)(cpu), &__per_cpu_var_lock_var(var))) #define __get_cpu_var(var) per_cpu_var(var) #define __raw_get_cpu_var(var) per_cpu_var(var) - +#define __get_cpu_lock(var, cpu) __per_cpu_var_lock(var) +#define __get_cpu_var_locked(var, cpu) __per_cpu_var_lock_var(var) #endif /* SMP */ #ifndef PER_CPU_BASE_SECTION diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index e43f976..30f998d 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -22,14 +22,8 @@ * and page free order so much.. */ #ifdef CONFIG_SMP - #ifdef ARCH_FREE_PTR_NR - #define FREE_PTR_NR ARCH_FREE_PTR_NR - #else - #define FREE_PTE_NR 506 - #endif #define tlb_fast_mode(tlb) ((tlb)->nr == ~0U) #else - #define FREE_PTE_NR 1 #define tlb_fast_mode(tlb) 1 #endif @@ -39,30 +33,48 @@ struct mmu_gather { struct mm_struct *mm; unsigned int nr; /* set to ~0U means fast mode */ + unsigned int max; /* nr < max */ unsigned int need_flush;/* Really unmapped some ptes? */ unsigned int fullmm; /* non-zero means full mm flush */ - struct page * pages[FREE_PTE_NR]; +#ifdef HAVE_ARCH_MMU_GATHER + struct arch_mmu_gather arch; +#endif + struct page ** pages; + struct page * local[8]; }; -/* Users of the generic TLB shootdown code must declare this storage space. */ -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); +static inline void __tlb_alloc_pages(struct mmu_gather *tlb) +{ + unsigned long addr = __get_free_pages(GFP_ATOMIC, 0); + + if (addr) { + tlb->pages = (void *)addr; + tlb->max = PAGE_SIZE / sizeof(struct page *); + } +} /* tlb_gather_mmu * Return a pointer to an initialized struct mmu_gather. 
*/ -static inline struct mmu_gather * -tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) +static inline void +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); - tlb->mm = mm; - /* Use fast mode if only one CPU is online */ - tlb->nr = num_online_cpus() > 1 ? 0U : ~0U; + tlb->max = ARRAY_SIZE(tlb->local); + tlb->pages = tlb->local; + + if (num_online_cpus() > 1) { + tlb->nr = 0; + __tlb_alloc_pages(tlb); + } else /* Use fast mode if only one CPU is online */ + tlb->nr = ~0U; tlb->fullmm = full_mm_flush; - return tlb; +#ifdef HAVE_ARCH_MMU_GATHER + tlb->arch = ARCH_MMU_GATHER_INIT; +#endif } static inline void @@ -75,6 +87,8 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) if (!tlb_fast_mode(tlb)) { free_pages_and_swap_cache(tlb->pages, tlb->nr); tlb->nr = 0; + if (tlb->pages == tlb->local) + __tlb_alloc_pages(tlb); } } @@ -90,7 +104,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) /* keep the page table cache within bounds */ check_pgt_cache(); - put_cpu_var(mmu_gathers); + if (tlb->pages != tlb->local) + free_pages((unsigned long)tlb->pages, 0); } /* tlb_remove_page @@ -106,7 +121,7 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) return; } tlb->pages[tlb->nr++] = page; - if (tlb->nr >= FREE_PTE_NR) + if (tlb->nr >= tlb->max) tlb_flush_mmu(tlb, 0, 0); } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 6ad76bf..98b37cf 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -91,7 +91,8 @@ #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD -#define MCOUNT_REC() VMLINUX_SYMBOL(__start_mcount_loc) = .; \ +#define MCOUNT_REC() . 
= ALIGN(8); \ + VMLINUX_SYMBOL(__start_mcount_loc) = .; \ *(__mcount_loc) \ VMLINUX_SYMBOL(__stop_mcount_loc) = .; #else @@ -331,7 +332,6 @@ /* __*init sections */ \ __init_rodata : AT(ADDR(__init_rodata) - LOAD_OFFSET) { \ *(.ref.rodata) \ - MCOUNT_REC() \ DEV_KEEP(init.rodata) \ DEV_KEEP(exit.rodata) \ CPU_KEEP(init.rodata) \ @@ -455,6 +455,7 @@ MEM_DISCARD(init.data) \ KERNEL_CTORS() \ *(.init.rodata) \ + MCOUNT_REC() \ DEV_DISCARD(init.rodata) \ CPU_DISCARD(init.rodata) \ MEM_DISCARD(init.rodata) diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 27b1bcf..37dc356 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -1,9 +1,17 @@ #ifndef _LINUX_BH_H #define _LINUX_BH_H +#ifdef CONFIG_PREEMPT_HARDIRQS +# define local_bh_disable() do { } while (0) +# define __local_bh_disable(ip) do { } while (0) +# define _local_bh_enable() do { } while (0) +# define local_bh_enable() do { } while (0) +# define local_bh_enable_ip(ip) do { } while (0) +#else extern void local_bh_disable(void); extern void _local_bh_enable(void); extern void local_bh_enable(void); extern void local_bh_enable_ip(unsigned long ip); +#endif #endif /* _LINUX_BH_H */ diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 16ed028..a7a7491 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -21,10 +21,6 @@ enum bh_state_bits { BH_Dirty, /* Is dirty */ BH_Lock, /* Is locked */ BH_Req, /* Has been submitted for I/O */ - BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise - * IO completion of other buffers in the page - */ - BH_Mapped, /* Has a disk mapping */ BH_New, /* Disk mapping was newly created by get_block */ BH_Async_Read, /* Is under end_buffer_async_read I/O */ @@ -74,6 +70,8 @@ struct buffer_head { struct address_space *b_assoc_map; /* mapping this buffer is associated with */ atomic_t b_count; /* users using this buffer_head */ + spinlock_t b_uptodate_lock; + spinlock_t b_state_lock; 
}; /* diff --git a/include/linux/console.h b/include/linux/console.h index dcca533..81651ad 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -55,6 +55,7 @@ struct consw { void (*con_invert_region)(struct vc_data *, u16 *, int); u16 *(*con_screen_pos)(struct vc_data *, int); unsigned long (*con_getxy)(struct vc_data *, unsigned long, int *, int *); + int con_preemptible; // can it reschedule from within printk? }; extern const struct consw *conswitchp; @@ -92,6 +93,17 @@ void give_up_console(const struct consw *sw); #define CON_BOOT (8) #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ #define CON_BRL (32) /* Used for a braille device */ +#define CON_ATOMIC (64) /* Safe to call in PREEMPT_RT atomic */ + +#ifdef CONFIG_PREEMPT_RT +# define console_atomic_safe(con) \ + (((con)->flags & CON_ATOMIC) || \ + (!in_atomic() && !irqs_disabled()) || \ + (system_state != SYSTEM_RUNNING) || \ + oops_in_progress) +#else +# define console_atomic_safe(con) (1) +#endif struct console { char name[16]; diff --git a/include/linux/device.h b/include/linux/device.h index aebb810..1bfad19 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -21,7 +21,6 @@ #include <linux/types.h> #include <linux/module.h> #include <linux/pm.h> -#include <linux/semaphore.h> #include <asm/atomic.h> #include <asm/device.h> @@ -105,7 +104,7 @@ extern int bus_unregister_notifier(struct bus_type *bus, /* All 4 notifers below get called with the target struct device * * as an argument. Note that those functions are likely to be called - * with the device semaphore held in the core, so be careful. + * with the device mutex held in the core, so be careful. 
*/ #define BUS_NOTIFY_ADD_DEVICE 0x00000001 /* device added */ #define BUS_NOTIFY_DEL_DEVICE 0x00000002 /* device removed */ @@ -373,7 +372,7 @@ struct device { const char *init_name; /* initial name of the device */ struct device_type *type; - struct semaphore sem; /* semaphore to synchronize calls to + struct mutex mutex; /* mutex to synchronize calls to * its driver. */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 73e9b64..00dc2cf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -738,7 +738,7 @@ struct inode { umode_t i_mode; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ struct mutex i_mutex; - struct rw_semaphore i_alloc_sem; + struct rw_anon_semaphore i_alloc_sem; const struct inode_operations *i_op; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct super_block *i_sb; diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index a81170d..bd099ba 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -1,8 +1,8 @@ #ifndef _LINUX_FTRACE_EVENT_H #define _LINUX_FTRACE_EVENT_H -#include <linux/trace_seq.h> #include <linux/ring_buffer.h> +#include <linux/trace_seq.h> #include <linux/percpu.h> struct trace_array; @@ -34,7 +34,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; - int tgid; + int lock_depth; }; #define FTRACE_MAX_EVENT \ @@ -93,16 +93,22 @@ void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc); struct ring_buffer_event * -trace_current_buffer_lock_reserve(int type, unsigned long len, +trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, + int type, unsigned long len, unsigned long flags, int pc); -void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, +void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, unsigned long flags, int pc); -void trace_nowake_buffer_unlock_commit(struct 
ring_buffer_event *event, +void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, unsigned long flags, int pc); -void trace_current_buffer_discard_commit(struct ring_buffer_event *event); +void trace_current_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event); void tracing_record_cmdline(struct task_struct *tsk); +struct event_filter; + struct ftrace_event_call { struct list_head list; char *name; @@ -110,16 +116,18 @@ struct ftrace_event_call { struct dentry *dir; struct trace_event *event; int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); + int (*regfunc)(void *); + void (*unregfunc)(void *); int id; int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); - int (*define_fields)(void); + int (*show_format)(struct ftrace_event_call *call, + struct trace_seq *s); + int (*define_fields)(struct ftrace_event_call *); struct list_head fields; int filter_active; - void *filter; + struct event_filter *filter; void *mod; + void *data; atomic_t profile_count; int (*profile_enable)(struct ftrace_event_call *); @@ -127,17 +135,27 @@ struct ftrace_event_call { }; #define MAX_FILTER_PRED 32 -#define MAX_FILTER_STR_VAL 128 +#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ -extern int init_preds(struct ftrace_event_call *call); extern void destroy_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); -extern int filter_current_check_discard(struct ftrace_event_call *call, +extern int filter_current_check_discard(struct ring_buffer *buffer, + struct ftrace_event_call *call, void *rec, struct ring_buffer_event *event); -extern int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size, int is_signed); +enum { + FILTER_OTHER = 0, + FILTER_STATIC_STRING, + FILTER_DYN_STRING, + FILTER_PTR_STRING, +}; + +extern int trace_define_field(struct ftrace_event_call *call, 
+ const char *type, const char *name, + int offset, int size, int is_signed, + int filter_type); +extern int trace_define_common_fields(struct ftrace_event_call *call); #define is_signed_type(type) (((type)(-1)) < 0) @@ -162,11 +180,4 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) -#define __common_field(type, item, is_signed) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item), is_signed); \ - if (ret) \ - return ret; - #endif /* _LINUX_FTRACE_EVENT_H */ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 8246c69..16966fb 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -77,23 +77,15 @@ * Are we doing bottom half or hardware interrupt processing? * Are we in a softirq context? Interrupt context? */ -#define in_irq() (hardirq_count()) -#define in_softirq() (softirq_count()) -#define in_interrupt() (irq_count()) +#define in_irq() (hardirq_count() || (current->flags & PF_HARDIRQ)) +#define in_softirq() (softirq_count() || (current->flags & PF_SOFTIRQ)) +#define in_interrupt() (irq_count()) /* * Are we in NMI context? */ #define in_nmi() (preempt_count() & NMI_MASK) -#if defined(CONFIG_PREEMPT) -# define PREEMPT_INATOMIC_BASE kernel_locked() -# define PREEMPT_CHECK_OFFSET 1 -#else -# define PREEMPT_INATOMIC_BASE 0 -# define PREEMPT_CHECK_OFFSET 0 -#endif - /* * Are we running in atomic context? WARNING: this macro cannot * always detect atomic context; in particular, it cannot know about @@ -101,14 +93,7 @@ * used in the general case to determine whether sleeping is possible. * Do not use in_atomic() in driver code. 
*/ -#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_INATOMIC_BASE) - -/* - * Check whether we were atomic before we did preempt_disable(): - * (used by the scheduler, *after* releasing the kernel lock) - */ -#define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) +#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) #ifdef CONFIG_PREEMPT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 4759917..6bc3e28 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -109,6 +109,7 @@ struct hrtimer { struct hrtimer_clock_base *base; unsigned long state; struct list_head cb_entry; + int irqsafe; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; @@ -144,6 +145,7 @@ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; clockid_t index; struct rb_root active; + struct list_head expired; struct rb_node *first; ktime_t resolution; ktime_t (*get_time)(void); @@ -170,13 +172,16 @@ struct hrtimer_clock_base { * @nr_events: Total number of timer interrupt events */ struct hrtimer_cpu_base { - spinlock_t lock; + atomic_spinlock_t lock; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; unsigned long nr_events; #endif +#ifdef CONFIG_PREEMPT_SOFTIRQS + wait_queue_head_t wait; +#endif }; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) @@ -364,6 +369,13 @@ static inline int hrtimer_restart(struct hrtimer *timer) return hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* Softirq preemption could deadlock timer removal */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + extern void hrtimer_wait_for_timer(const struct hrtimer *timer); +#else +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0) +#endif + /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern 
int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 7fc01b1..0d2e607 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -27,7 +27,7 @@ extern struct fs_struct init_fs; .cputimer = { \ .cputime = INIT_CPUTIME, \ .running = 0, \ - .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ + .lock = __ATOMIC_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ }, \ } @@ -159,8 +159,9 @@ extern struct cred init_cred; .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ - .pi_lock = __SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ + .pi_lock = __ATOMIC_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ + .posix_timer_list = NULL, \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 35e7df1..f7bbea6 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -50,15 +50,23 @@ * IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is * registered first in an shared interrupt is considered for * performance reasons) + * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished. + * Used by threaded interrupts which need to keep the + * irq line disabled until the threaded handler has been run. 
+ * IRQF_NODELAY - Interrupt is not force threaded */ #define IRQF_DISABLED 0x00000020 #define IRQF_SAMPLE_RANDOM 0x00000040 #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 -#define IRQF_TIMER 0x00000200 +#define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 +#define IRQF_ONESHOT 0x00002000 +#define IRQF_NODELAY 0x00004000 + +#define IRQF_TIMER (__IRQF_TIMER | IRQF_NODELAY) /* * Bits used by threaded handlers: @@ -89,6 +97,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); * @thread_fn: interupt handler function for threaded interrupts * @thread: thread pointer for threaded interrupts * @thread_flags: flags related to @thread + * @thread_mask: bit mask to account for forced threads */ struct irqaction { irq_handler_t handler; @@ -102,6 +111,7 @@ struct irqaction { irq_handler_t thread_fn; struct task_struct *thread; unsigned long thread_flags; + unsigned long thread_mask; }; extern irqreturn_t no_action(int cpl, void *dev_id); @@ -178,7 +188,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); #ifdef CONFIG_LOCKDEP # define local_irq_enable_in_hardirq() do { } while (0) #else -# define local_irq_enable_in_hardirq() local_irq_enable() +# define local_irq_enable_in_hardirq() local_irq_enable_nort() #endif extern void disable_irq_nosync(unsigned int irq); @@ -318,6 +328,7 @@ static inline int disable_irq_wake(unsigned int irq) #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) +// FIXME: PREEMPT_RT: set_bit()? 
#define or_softirq_pending(x) (local_softirq_pending() |= (x)) #endif @@ -348,7 +359,6 @@ enum SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ - NR_SOFTIRQS }; @@ -366,14 +376,23 @@ struct softirq_action void (*action)(struct softirq_action *); }; +#ifdef CONFIG_PREEMPT_HARDIRQS +# define __raise_softirq_irqoff(nr) raise_softirq_irqoff(nr) +# define __do_raise_softirq_irqoff(nr) \ + do { or_softirq_pending(1UL << (nr)); } while (0) +#else +# define __raise_softirq_irqoff(nr) \ + do { or_softirq_pending(1UL << (nr)); } while (0) +# define __do_raise_softirq_irqoff(nr) __raise_softirq_irqoff(nr) +#endif + asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); -#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); -extern void wakeup_softirqd(void); +extern void softirq_check_pending_idle(void); /* This is the worklist that queues up per-cpu softirq work. * @@ -408,8 +427,9 @@ extern void __send_remote_softirq(struct call_single_data *cp, int cpu, to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its excecution is still not started, it will be executed only once. - * If this tasklet is already running on another CPU (or schedule is called - from tasklet itself), it is rescheduled for later. + * If this tasklet is already running on another CPU, it is rescheduled + for later. + * Schedule must not be called from the tasklet itself (a lockup occurs) * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. 
@@ -434,27 +454,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data } enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */ + TASKLET_STATE_PENDING /* Tasklet is pending */ }; -#ifdef CONFIG_SMP +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED) +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN) +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING) + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } +static inline int tasklet_tryunlock(struct tasklet_struct *t) +{ + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN; +} + static inline void tasklet_unlock(struct tasklet_struct *t) { smp_mb__before_clear_bit(); clear_bit(TASKLET_STATE_RUN, &(t)->state); } -static inline void tasklet_unlock_wait(struct tasklet_struct *t) -{ - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } -} +extern void tasklet_unlock_wait(struct tasklet_struct *t); + #else #define tasklet_trylock(t) 1 +#define tasklet_tryunlock(t) 1 #define tasklet_unlock_wait(t) do { } while (0) #define tasklet_unlock(t) do { } while (0) #endif @@ -503,22 +532,14 @@ static inline void tasklet_disable(struct tasklet_struct *t) smp_mb(); } -static inline void tasklet_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} - -static inline void tasklet_hi_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} +extern void tasklet_enable(struct tasklet_struct *t); +extern void tasklet_hi_enable(struct tasklet_struct *t); extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); extern void tasklet_init(struct tasklet_struct *t, void 
(*func)(unsigned long), unsigned long data); +extern void takeover_tasklets(unsigned int cpu); struct tasklet_hrtimer { struct hrtimer timer; @@ -616,4 +637,19 @@ extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); extern int arch_init_chip_data(struct irq_desc *desc, int node); +/* + * local_irq* variants depending on RT/!RT + */ +#ifdef CONFIG_PREEMPT_RT +# define local_irq_disable_nort() do { } while (0) +# define local_irq_enable_nort() do { } while (0) +# define local_irq_save_nort(flags) do { local_save_flags(flags); } while (0) +# define local_irq_restore_nort(flags) do { (void)(flags); } while (0) +#else +# define local_irq_disable_nort() local_irq_disable() +# define local_irq_enable_nort() local_irq_enable() +# define local_irq_save_nort(flags) local_irq_save(flags) +# define local_irq_restore_nort(flags) local_irq_restore(flags) +#endif + #endif diff --git a/include/linux/irq.h b/include/linux/irq.h index cb2e77a..9a0a74f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -69,6 +69,8 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ #define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ #define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */ +#define IRQ_ONESHOT 0x08000000 /* IRQ is not unmasked after hardirq */ +#define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */ #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) @@ -100,6 +102,9 @@ struct msi_desc; * @set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) 
of an IRQ * @set_wake: enable/disable power-management wake-on of an IRQ * + * @bus_lock: function to lock access to slow bus (i2c) chips + * @bus_sync_unlock: function to sync and unlock slow bus (i2c) chips + * * @release: release function solely used by UML * @typename: obsoleted by name, kept as migration helper */ @@ -123,6 +128,9 @@ struct irq_chip { int (*set_type)(unsigned int irq, unsigned int flow_type); int (*set_wake)(unsigned int irq, unsigned int on); + void (*bus_lock)(unsigned int irq); + void (*bus_sync_unlock)(unsigned int irq); + /* Currently used only by UML, might disappear one day.*/ #ifdef CONFIG_IRQ_RELEASE_METHOD void (*release)(unsigned int irq, void *dev_id); @@ -184,7 +192,7 @@ struct irq_desc { unsigned int irq_count; /* For detecting broken IRQs */ unsigned long last_unhandled; /* Aging timer for unhandled count */ unsigned int irqs_unhandled; - spinlock_t lock; + atomic_spinlock_t lock; #ifdef CONFIG_SMP cpumask_var_t affinity; unsigned int node; @@ -193,6 +201,7 @@ struct irq_desc { #endif #endif atomic_t threads_active; + unsigned long forced_threads_active; wait_queue_head_t wait_for_threads; #ifdef CONFIG_PROC_FS struct proc_dir_entry *dir; @@ -379,6 +388,8 @@ set_irq_chained_handler(unsigned int irq, __set_irq_handler(irq, handle, 1, NULL); } +extern void set_irq_nested_thread(unsigned int irq, int nest); + extern void set_irq_noprobe(unsigned int irq); extern void set_irq_probe(unsigned int irq); diff --git a/include/linux/jbd.h b/include/linux/jbd.h index c2049a0..ce07056 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -260,6 +260,15 @@ void buffer_assertion_failure(struct buffer_head *bh); #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) #endif +/* + * For assertions that are only valid on SMP (e.g. 
spin_is_locked()): + */ +#ifdef CONFIG_SMP +# define J_ASSERT_JH_SMP(jh, expr) J_ASSERT_JH(jh, expr) +#else +# define J_ASSERT_JH_SMP(jh, assert) do { } while (0) +#endif + #if defined(JBD_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) #define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) @@ -315,32 +324,32 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) static inline void jbd_lock_bh_state(struct buffer_head *bh) { - bit_spin_lock(BH_State, &bh->b_state); + spin_lock(&bh->b_state_lock); } static inline int jbd_trylock_bh_state(struct buffer_head *bh) { - return bit_spin_trylock(BH_State, &bh->b_state); + return spin_trylock(&bh->b_state_lock); } static inline int jbd_is_locked_bh_state(struct buffer_head *bh) { - return bit_spin_is_locked(BH_State, &bh->b_state); + return spin_is_locked(&bh->b_state_lock); } static inline void jbd_unlock_bh_state(struct buffer_head *bh) { - bit_spin_unlock(BH_State, &bh->b_state); + spin_unlock(&bh->b_state_lock); } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { - bit_spin_lock(BH_JournalHead, &bh->b_state); + spin_lock_irq(&bh->b_uptodate_lock); } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { - bit_spin_unlock(BH_JournalHead, &bh->b_state); + spin_unlock_irq(&bh->b_uptodate_lock); } struct jbd_revoke_table_s; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d97eb65..406d3b5 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -319,6 +319,15 @@ void buffer_assertion_failure(struct buffer_head *bh); #define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why) #endif +/* + * For assertions that are only valid on SMP (e.g. 
spin_is_locked()): + */ +#ifdef CONFIG_SMP +# define J_ASSERT_JH_SMP(jh, expr) J_ASSERT_JH(jh, expr) +#else +# define J_ASSERT_JH_SMP(jh, assert) do { } while (0) +#endif + enum jbd_state_bits { BH_JBD /* Has an attached ext3 journal_head */ = BH_PrivateStart, @@ -355,32 +364,32 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) static inline void jbd_lock_bh_state(struct buffer_head *bh) { - bit_spin_lock(BH_State, &bh->b_state); + spin_lock(&bh->b_state_lock); } static inline int jbd_trylock_bh_state(struct buffer_head *bh) { - return bit_spin_trylock(BH_State, &bh->b_state); + return spin_trylock(&bh->b_state_lock); } static inline int jbd_is_locked_bh_state(struct buffer_head *bh) { - return bit_spin_is_locked(BH_State, &bh->b_state); + return spin_is_locked(&bh->b_state_lock); } static inline void jbd_unlock_bh_state(struct buffer_head *bh) { - bit_spin_unlock(BH_State, &bh->b_state); + spin_unlock(&bh->b_state_lock); } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { - bit_spin_lock(BH_JournalHead, &bh->b_state); + spin_lock(&bh->b_uptodate_lock); } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { - bit_spin_unlock(BH_JournalHead, &bh->b_state); + spin_unlock(&bh->b_uptodate_lock); } /* Flags in jbd_inode->i_flags */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d6320a3..4651e09 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -124,7 +124,7 @@ extern int _cond_resched(void); # define might_resched() do { } while (0) #endif -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) void __might_sleep(char *file, int line); /** * might_sleep - annotation for functions that can sleep @@ -284,6 +284,12 @@ extern void printk_tick(void); extern void asmlinkage __attribute__((format(printf, 1, 2))) early_printk(const char *fmt, ...); +#ifdef CONFIG_PREEMPT_RT +extern void zap_rt_locks(void); +#else +# define 
zap_rt_locks() do { } while (0) +#endif + unsigned long int_sqrt(unsigned long); static inline void console_silent(void) @@ -313,6 +319,7 @@ extern int root_mountflags; /* Values used for system_state */ extern enum system_states { SYSTEM_BOOTING, + SYSTEM_BOOTING_SCHEDULER_OK, SYSTEM_RUNNING, SYSTEM_HALT, SYSTEM_POWER_OFF, diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 348fa88..91958d3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -24,6 +24,8 @@ struct cpu_usage_stat { cputime64_t idle; cputime64_t iowait; cputime64_t steal; + cputime64_t user_rt; + cputime64_t system_rt; cputime64_t guest; }; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index bcd9c07..4dffaec 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -170,7 +170,7 @@ struct kretprobe { int nmissed; size_t data_size; struct hlist_head free_instances; - spinlock_t lock; + atomic_spinlock_t lock; }; struct kretprobe_instance { diff --git a/include/linux/list.h b/include/linux/list.h index 969f6e9..d62a35b 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -345,6 +345,9 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. 
diff --git a/include/linux/mm.h b/include/linux/mm.h index 9a72cc7..b34e158 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -587,23 +587,39 @@ static __always_inline void *lowmem_page_address(struct page *page) #endif #if defined(WANT_PAGE_VIRTUAL) -#define page_address(page) ((page)->virtual) -#define set_page_address(page, address) \ - do { \ - (page)->virtual = (address); \ - } while(0) -#define page_address_init() do { } while(0) +/* + * wrap page->virtual so it is safe to set/read locklessly + */ +#define page_address(page) \ + ({ typeof((page)->virtual) v = (page)->virtual; \ + smp_read_barrier_depends(); \ + v; }) + +static inline int set_page_address(struct page *page, void *address) +{ + if (address) + return cmpxchg(&page->virtual, NULL, address) == NULL; + else { + /* + * cmpxchg is a bit abused because it is not guaranteed + * safe wrt direct assignment on all platforms. + */ + void *virt = page->virtual; + return cmpxchg(&page->virtual, virt, NULL) == virt; + } +} +void page_address_init(void); #endif #if defined(HASHED_PAGE_VIRTUAL) void *page_address(struct page *page); -void set_page_address(struct page *page, void *virtual); +int set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) -#define set_page_address(page, address) do { } while(0) +#define set_page_address(page, address) (0) #define page_address_init() do { } while(0) #endif @@ -744,7 +760,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); -unsigned long unmap_vmas(struct mmu_gather **tlb, +unsigned long unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct 
zap_details *); @@ -923,27 +939,85 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a * overflow into the next struct page (as it might with DEBUG_SPINLOCK). * When freeing, reset page->mapping so free_pages_check won't complain. */ +#ifndef CONFIG_PREEMPT_RT + #define __pte_lockptr(page) &((page)->ptl) -#define pte_lock_init(_page) do { \ - spin_lock_init(__pte_lockptr(_page)); \ -} while (0) + +static inline struct page *pte_lock_init(struct page *page) +{ + spin_lock_init(__pte_lockptr(page)); + return page; +} + #define pte_lock_deinit(page) ((page)->mapping = NULL) + +#else /* PREEMPT_RT */ + +/* + * On PREEMPT_RT the spinlock_t's are too large to embed in the + * page frame, hence it only has a pointer and we need to dynamically + * allocate the lock when we allocate PTE-pages. + * + * This is an overall win, since only a small fraction of the pages + * will be PTE pages under normal circumstances. + */ + +#define __pte_lockptr(page) ((page)->ptl) + +/* + * Heinous hack, relies on the caller doing something like: + * + * pte = alloc_pages(PGALLOC_GFP, 0); + * if (pte) + * pgtable_page_ctor(pte); + * return pte; + * + * This ensures we release the page and return NULL when the + * lock allocation fails. + */ +static inline struct page *pte_lock_init(struct page *page) +{ + page->ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (page->ptl) { + spin_lock_init(__pte_lockptr(page)); + } else { + __free_page(page); + page = NULL; + } + return page; +} + +static inline void pte_lock_deinit(struct page *page) +{ + kfree(page->ptl); + page->mapping = NULL; +} + +#endif /* PREEMPT_RT */ + #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) #else /* !USE_SPLIT_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. 
*/ -#define pte_lock_init(page) do {} while (0) +static inline struct page *pte_lock_init(struct page *page) { return page; } #define pte_lock_deinit(page) do {} while (0) #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) #endif /* USE_SPLIT_PTLOCKS */ -static inline void pgtable_page_ctor(struct page *page) +static inline struct page *__pgtable_page_ctor(struct page *page) { - pte_lock_init(page); - inc_zone_page_state(page, NR_PAGETABLE); + page = pte_lock_init(page); + if (page) + inc_zone_page_state(page, NR_PAGETABLE); + return page; } +#define pgtable_page_ctor(page) \ +do { \ + page = __pgtable_page_ctor(page); \ +} while (0) + static inline void pgtable_page_dtor(struct page *page) { pte_lock_deinit(page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0042090..4e6a701 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -69,7 +69,11 @@ struct page { */ }; #if USE_SPLIT_PTLOCKS +#ifndef CONFIG_PREEMPT_RT spinlock_t ptl; +#else + spinlock_t *ptl; +#endif #endif struct kmem_cache *slab; /* SLUB: Pointer to slab */ struct page *first_page; /* Compound tail pages */ @@ -245,6 +249,9 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; + /* realtime bits */ + struct list_head delayed_drop; + /* Swap token stuff */ /* * Last value of global fault stamp as seen by this process. 
diff --git a/include/linux/module.h b/include/linux/module.h index 098bdb7..f8f92d0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -17,10 +17,12 @@ #include <linux/moduleparam.h> #include <linux/marker.h> #include <linux/tracepoint.h> -#include <asm/local.h> +#include <asm/local.h> #include <asm/module.h> +#include <trace/events/module.h> + /* Not Yet Implemented */ #define MODULE_SUPPORTED_DEVICE(name) @@ -462,7 +464,10 @@ static inline local_t *__module_ref_addr(struct module *mod, int cpu) static inline void __module_get(struct module *module) { if (module) { - local_inc(__module_ref_addr(module, get_cpu())); + unsigned int cpu = get_cpu(); + local_inc(__module_ref_addr(module, cpu)); + trace_module_get(module, _THIS_IP_, + local_read(__module_ref_addr(module, cpu))); put_cpu(); } } @@ -473,8 +478,11 @@ static inline int try_module_get(struct module *module) if (module) { unsigned int cpu = get_cpu(); - if (likely(module_is_live(module))) + if (likely(module_is_live(module))) { local_inc(__module_ref_addr(module, cpu)); + trace_module_get(module, _THIS_IP_, + local_read(__module_ref_addr(module, cpu))); + } else ret = 0; put_cpu(); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 878cab4..f98509b 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -12,11 +12,85 @@ #include <linux/list.h> #include <linux/spinlock_types.h> +#include <linux/rt_lock.h> #include <linux/linkage.h> #include <linux/lockdep.h> #include <asm/atomic.h> +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ + , .dep_map = { .name = #lockname } +#else +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +#endif + +#ifdef CONFIG_PREEMPT_RT + +#include <linux/rtmutex.h> + +struct mutex { + struct rt_mutex lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + + +#define __MUTEX_INITIALIZER(mutexname) \ + { \ + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ + 
__DEP_MAP_MUTEX_INITIALIZER(mutexname) \ + } + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) + +extern void +__mutex_init(struct mutex *lock, char *name, struct lock_class_key *key); + +extern void __lockfunc _mutex_lock(struct mutex *lock); +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); +extern int __lockfunc _mutex_lock_killable(struct mutex *lock); +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); +extern int __lockfunc +_mutex_lock_interruptible_nested(struct mutex *lock, int subclass); +extern int __lockfunc +_mutex_lock_killable_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_trylock(struct mutex *lock); +extern void __lockfunc _mutex_unlock(struct mutex *lock); + +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) +#define mutex_lock(l) _mutex_lock(l) +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) +#define mutex_lock_killable(l) _mutex_lock_killable(l) +#define mutex_trylock(l) _mutex_trylock(l) +#define mutex_unlock(l) _mutex_unlock(l) +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible_nested(l, s) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable_nested(l, s) +#else +# define mutex_lock_nested(l, s) _mutex_lock(l) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible(l) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable(l) +#endif + +# define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + __mutex_init((mutex), #mutex, &__key); \ +} while (0) + +#else /* PREEMPT_RT */ + /* * Simple, straightforward mutexes with strict semantics: * @@ -87,13 +161,6 @@ do { \ # define mutex_destroy(mutex) do { } while (0) #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define 
__DEP_MAP_MUTEX_INITIALIZER(lockname) \ - , .dep_map = { .name = #lockname } -#else -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) -#endif - #define __MUTEX_INITIALIZER(lockname) \ { .count = ATOMIC_INIT(1) \ , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ @@ -150,6 +217,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); +#endif /* !PREEMPT_RT */ + extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d4a4d98..14aa9d9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -477,7 +477,7 @@ struct netdev_queue { * write mostly part */ spinlock_t _xmit_lock ____cacheline_aligned_in_smp; - int xmit_lock_owner; + void *xmit_lock_owner; /* * please use this field instead of dev->trans_start */ @@ -1665,41 +1665,49 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) return (1 << debug_value) - 1; } -static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) +static inline void __netif_tx_lock(struct netdev_queue *txq) { spin_lock(&txq->_xmit_lock); - txq->xmit_lock_owner = cpu; + txq->xmit_lock_owner = (void *)current; +} + +/* + * Do we hold the xmit_lock already? 
+ */ +static inline int netif_tx_lock_recursion(struct netdev_queue *txq) +{ + return txq->xmit_lock_owner == (void *)current; } static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - txq->xmit_lock_owner = smp_processor_id(); + txq->xmit_lock_owner = (void *)current; } static inline int __netif_tx_trylock(struct netdev_queue *txq) { int ok = spin_trylock(&txq->_xmit_lock); if (likely(ok)) - txq->xmit_lock_owner = smp_processor_id(); + txq->xmit_lock_owner = (void *)current; return ok; } static inline void __netif_tx_unlock(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + txq->xmit_lock_owner = (void *)-1; spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + txq->xmit_lock_owner = (void *)-1; spin_unlock_bh(&txq->_xmit_lock); } static inline void txq_trans_update(struct netdev_queue *txq) { - if (txq->xmit_lock_owner != -1) + if (txq->xmit_lock_owner != (void *)-1) txq->trans_start = jiffies; } @@ -1712,10 +1720,8 @@ static inline void txq_trans_update(struct netdev_queue *txq) static inline void netif_tx_lock(struct net_device *dev) { unsigned int i; - int cpu; spin_lock(&dev->tx_global_lock); - cpu = smp_processor_id(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); @@ -1725,7 +1731,7 @@ static inline void netif_tx_lock(struct net_device *dev) * the ->hard_start_xmit() handler and already * checked the frozen bit. 
*/ - __netif_tx_lock(txq, cpu); + __netif_tx_lock(txq); set_bit(__QUEUE_STATE_FROZEN, &txq->state); __netif_tx_unlock(txq); } @@ -1761,9 +1767,9 @@ static inline void netif_tx_unlock_bh(struct net_device *dev) local_bh_enable(); } -#define HARD_TX_LOCK(dev, txq, cpu) { \ +#define HARD_TX_LOCK(dev, txq) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ - __netif_tx_lock(txq, cpu); \ + __netif_tx_lock(txq); \ } \ } @@ -1776,14 +1782,12 @@ static inline void netif_tx_unlock_bh(struct net_device *dev) static inline void netif_tx_disable(struct net_device *dev) { unsigned int i; - int cpu; local_bh_disable(); - cpu = smp_processor_id(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - __netif_tx_lock(txq, cpu); + __netif_tx_lock(txq); netif_tx_stop_queue(txq); __netif_tx_unlock(txq); } diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 1030b75..a3f5427 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -468,22 +468,35 @@ DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks); * _Only_ that special combination of being per-cpu and never getting * re-entered asynchronously means that the count is safe. 
*/ -static inline void xt_info_rdlock_bh(void) +static inline int xt_info_rdlock_bh(void) { struct xt_info_lock *lock; + int cpu; local_bh_disable(); - lock = &__get_cpu_var(xt_info_locks); - if (likely(!lock->readers++)) + preempt_disable_rt(); + cpu = smp_processor_id(); + lock = &per_cpu(xt_info_locks, cpu); + if (likely(!lock->readers++)) { + preempt_enable_rt(); spin_lock(&lock->lock); + } else + preempt_enable_rt(); + return cpu; } -static inline void xt_info_rdunlock_bh(void) +static inline void xt_info_rdunlock_bh(int cpu) { - struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks); + struct xt_info_lock *lock = &per_cpu(xt_info_locks, cpu); - if (likely(!--lock->readers)) + preempt_disable_rt(); + + if (likely(!--lock->readers)) { + preempt_enable_rt(); spin_unlock(&lock->lock); + } else + preempt_enable_rt(); + local_bh_enable(); } diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 2524267..838405c 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -84,7 +84,7 @@ static inline void *netpoll_poll_lock(struct napi_struct *napi) rcu_read_lock(); /* deal with race on ->npinfo */ if (dev && dev->npinfo) { spin_lock(&napi->poll_lock); - napi->poll_owner = smp_processor_id(); + napi->poll_owner = raw_smp_processor_id(); return napi; } return NULL; diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index 1d9518b..aa89457 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -153,7 +153,7 @@ ssize_t oprofilefs_ulong_to_user(unsigned long val, char __user * buf, size_t co int oprofilefs_ulong_from_user(unsigned long * val, char const __user * buf, size_t count); /** lock for read/write safety */ -extern spinlock_t oprofilefs_lock; +extern atomic_spinlock_t oprofilefs_lock; /** * Add the contents of a circular buffer to the event buffer. 
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 13f126c..8c933dc 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -12,6 +12,7 @@ */ struct page_cgroup { unsigned long flags; + spinlock_t lock; struct mem_cgroup *mem_cgroup; struct page *page; struct list_head lru; /* per cgroup LRU list */ @@ -70,17 +71,17 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) static inline void lock_page_cgroup(struct page_cgroup *pc) { - bit_spin_lock(PCG_LOCK, &pc->flags); + spin_lock(&pc->lock); } static inline int trylock_page_cgroup(struct page_cgroup *pc) { - return bit_spin_trylock(PCG_LOCK, &pc->flags); + return spin_trylock(&pc->lock); } static inline void unlock_page_cgroup(struct page_cgroup *pc) { - bit_spin_unlock(PCG_LOCK, &pc->flags); + spin_unlock(&pc->lock); } #else /* CONFIG_CGROUP_MEM_RES_CTLR */ diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index bab82f4..0af5218 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -9,7 +9,7 @@ #define _LINUX_PAGEVEC_H /* 14 pointers + two long's align the pagevec structure to a power of two */ -#define PAGEVEC_SIZE 14 +#define PAGEVEC_SIZE 8 struct page; struct address_space; diff --git a/include/linux/parport.h b/include/linux/parport.h index 38a423e..70957cc 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -264,7 +264,7 @@ enum ieee1284_phase { struct ieee1284_info { int mode; volatile enum ieee1284_phase phase; - struct semaphore irq; + struct anon_semaphore irq; }; /* A parallel port */ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 68438e1..c02051d 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -38,6 +38,22 @@ DEFINE_PER_CPU_SECTION(type, name, "") /* + * next two added for RT patch + * (wonder if we need corresponding DECLARE_*'s?) 
(clrkwllms) + */ +#define DEFINE_PER_CPU_SPINLOCK(name, section) \ + __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ + PER_CPU_ATTRIBUTES __DEFINE_SPINLOCK(per_cpu__lock_##name##_locked); + +#define DECLARE_PER_CPU_LOCKED(type, name) \ + extern PER_CPU_ATTRIBUTES spinlock_t __per_cpu_var_lock(name); \ + extern PER_CPU_ATTRIBUTES __typeof__(type) __per_cpu_var_lock_var(name) + +#define DEFINE_PER_CPU_LOCKED(type, name) \ + DEFINE_PER_CPU_SPINLOCK(name, "") \ + DEFINE_PER_CPU_SECTION(type, name##_locked, "") + +/* + * Declaration/definition used for per-CPU variables that must come first in + * the set of variables. */ @@ -79,7 +95,9 @@ * Intermodule exports for per-CPU variables. */ #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) +#define EXPORT_PER_CPU_LOCKED_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var##_locked) #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) +#define EXPORT_PER_CPU_LOCKED_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var##_locked) #endif /* _LINUX_PERCPU_DEFS_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 26fd9d1..0b45757 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -32,6 +32,51 @@ &__get_cpu_var(var); })) #define put_cpu_var(var) preempt_enable() +/* + * Per-CPU data structures with an additional lock - useful for + * PREEMPT_RT code that wants to reschedule but also wants + * per-CPU data structures. + * + * 'cpu' gets updated with the CPU the task is currently executing on. + * + * NOTE: on normal !PREEMPT_RT kernels these per-CPU variables + * are the same as the normal per-CPU variables, so there is no + * runtime overhead.
+ */ +#ifdef CONFIG_PREEMPT_RT +#define get_cpu_var_locked(var, cpuptr) \ +(*({ \ + spinlock_t *__lock; \ + int __cpu; \ + \ +again: \ + __cpu = raw_smp_processor_id(); \ + __lock = &__get_cpu_lock(var, __cpu); \ + spin_lock(__lock); \ + if (!cpu_online(__cpu)) { \ + spin_unlock(__lock); \ + goto again; \ + } \ + *(cpuptr) = __cpu; \ + &__get_cpu_var_locked(var, __cpu); \ +})) +#else +#define get_cpu_var_locked(var, cpuptr) \ +(*({ \ + int __cpu; \ + \ + preempt_disable(); \ + __cpu = smp_processor_id(); \ + spin_lock(&__get_cpu_lock(var, __cpu)); \ + preempt_enable(); \ + *(cpuptr) = __cpu; \ + &__get_cpu_var_locked(var, __cpu); \ +})) +#endif + +#define put_cpu_var_locked(var, cpu) \ + do { (void)cpu; spin_unlock(&__get_cpu_lock(var, cpu)); } while (0) + #ifdef CONFIG_SMP #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index a7684a5..fafe0a6 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -16,7 +16,7 @@ #ifdef CONFIG_SMP struct percpu_counter { - spinlock_t lock; + atomic_spinlock_t lock; s64 count; #ifdef CONFIG_HOTPLUG_CPU struct list_head list; /* All percpu_counters are on a list */ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index b53f700..683ad4e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -607,6 +607,9 @@ struct perf_counter { int pending_kill; int pending_disable; struct perf_pending_entry pending; +#ifdef CONFIG_PREEMPT_RT + struct perf_pending_entry pending_softirq; +#endif atomic_t event_limit; @@ -628,7 +631,7 @@ struct perf_counter_context { * Protect the states of the counters in the list, * nr_active, and the list: */ - spinlock_t lock; + atomic_spinlock_t lock; /* * Protect the list of counters. 
Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change @@ -698,6 +701,7 @@ extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_free_task(struct task_struct *task); extern void set_perf_counter_pending(void); extern void perf_counter_do_pending(void); +extern void perf_counter_do_pending_softirq(void); extern void perf_counter_print_debug(void); extern void __perf_disable(void); extern bool __perf_enable(void); @@ -761,6 +765,8 @@ extern int sysctl_perf_counter_mlock; extern int sysctl_perf_counter_sample_rate; extern void perf_counter_init(void); +extern void perf_tpcounter_event(int event_id, u64 addr, u64 count, + void *record, int entry_size); #ifndef perf_misc_flags #define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \ @@ -780,6 +786,7 @@ static inline int perf_counter_init_task(struct task_struct *child) { return 0; static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_free_task(struct task_struct *task) { } static inline void perf_counter_do_pending(void) { } +static inline void perf_counter_do_pending_softirq(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_disable(void) { } static inline void perf_enable(void) { } diff --git a/include/linux/plist.h b/include/linux/plist.h index 45926d7..a2d2010 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -75,13 +75,16 @@ #include <linux/kernel.h> #include <linux/list.h> -#include <linux/spinlock_types.h> + +struct spinlock; +struct atomic_spinlock; struct plist_head { struct list_head prio_list; struct list_head node_list; #ifdef CONFIG_DEBUG_PI_LIST - spinlock_t *lock; + struct atomic_spinlock *alock; + struct spinlock *slock; #endif }; @@ -91,9 +94,11 @@ struct plist_node { }; #ifdef CONFIG_DEBUG_PI_LIST -# define PLIST_HEAD_LOCK_INIT(_lock) .lock = _lock +# define PLIST_HEAD_LOCK_INIT(_lock) .slock = _lock +# define 
PLIST_HEAD_LOCK_INIT_ATOMIC(_lock) .alock = _lock #else # define PLIST_HEAD_LOCK_INIT(_lock) +# define PLIST_HEAD_LOCK_INIT_ATOMIC(_lock) #endif #define _PLIST_HEAD_INIT(head) \ @@ -107,11 +112,22 @@ struct plist_node { */ #define PLIST_HEAD_INIT(head, _lock) \ { \ - _PLIST_HEAD_INIT(head), \ + _PLIST_HEAD_INIT(head), \ PLIST_HEAD_LOCK_INIT(&(_lock)) \ } /** + * PLIST_HEAD_INIT_ATOMIC - static struct plist_head initializer + * @head: struct plist_head variable name + * @_lock: lock to initialize for this list + */ +#define PLIST_HEAD_INIT_ATOMIC(head, _lock) \ +{ \ + _PLIST_HEAD_INIT(head), \ + PLIST_HEAD_LOCK_INIT_ATOMIC(&(_lock)) \ +} + +/** * PLIST_NODE_INIT - static struct plist_node initializer * @node: struct plist_node variable name * @__prio: initial node priority @@ -119,7 +135,7 @@ struct plist_node { #define PLIST_NODE_INIT(node, __prio) \ { \ .prio = (__prio), \ - .plist = { _PLIST_HEAD_INIT((node).plist) }, \ + .plist = { _PLIST_HEAD_INIT((node).plist) }, \ } /** @@ -128,12 +144,29 @@ struct plist_node { * @lock: list spinlock, remembered for debugging */ static inline void -plist_head_init(struct plist_head *head, spinlock_t *lock) +plist_head_init(struct plist_head *head, struct spinlock *lock) +{ + INIT_LIST_HEAD(&head->prio_list); + INIT_LIST_HEAD(&head->node_list); +#ifdef CONFIG_DEBUG_PI_LIST + head->slock = lock; + head->alock = NULL; +#endif +} + +/** + * plist_head_init_atomic - dynamic struct plist_head initializer + * @head: &struct plist_head pointer + * @lock: list atomic_spinlock, remembered for debugging + */ +static inline void +plist_head_init_atomic(struct plist_head *head, struct atomic_spinlock *lock) { INIT_LIST_HEAD(&head->prio_list); INIT_LIST_HEAD(&head->node_list); #ifdef CONFIG_DEBUG_PI_LIST - head->lock = lock; + head->alock = lock; + head->slock = NULL; #endif } diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 72b1a10..5cb6d20 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -33,12 
+33,24 @@ do { \ barrier(); \ } while (0) -#define preempt_enable_no_resched() \ +#define __preempt_enable_no_resched() \ do { \ barrier(); \ dec_preempt_count(); \ } while (0) +#ifdef CONFIG_DEBUG_PREEMPT +extern void notrace preempt_enable_no_resched(void); +#else +# define preempt_enable_no_resched() __preempt_enable_no_resched() +#endif + +#define preempt_enable_and_schedule() \ +do { \ + __preempt_enable_no_resched(); \ + schedule(); \ +} while (0) + #define preempt_check_resched() \ do { \ if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ @@ -47,7 +59,7 @@ do { \ #define preempt_enable() \ do { \ - preempt_enable_no_resched(); \ + __preempt_enable_no_resched(); \ barrier(); \ preempt_check_resched(); \ } while (0) @@ -84,6 +96,8 @@ do { \ #define preempt_disable() do { } while (0) #define preempt_enable_no_resched() do { } while (0) +#define __preempt_enable_no_resched() do { } while (0) +#define preempt_enable_and_schedule() schedule() #define preempt_enable() do { } while (0) #define preempt_check_resched() do { } while (0) @@ -93,6 +107,18 @@ do { \ #endif +#ifdef CONFIG_PREEMPT_RT +# define preempt_disable_rt() preempt_disable() +# define preempt_enable_rt() preempt_enable() +# define preempt_disable_nort() do { } while (0) +# define preempt_enable_nort() do { } while (0) +#else +# define preempt_disable_rt() do { } while (0) +# define preempt_enable_rt() do { } while (0) +# define preempt_disable_nort() preempt_disable() +# define preempt_enable_nort() preempt_enable() +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; diff --git a/include/linux/profile.h b/include/linux/profile.h index a0fc322..5b72082 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -8,10 +8,11 @@ #include <asm/errno.h> -#define CPU_PROFILING 1 -#define SCHED_PROFILING 2 -#define SLEEP_PROFILING 3 -#define KVM_PROFILING 4 +#define CPU_PROFILING 1 +#define SCHED_PROFILING 2 +#define SLEEP_PROFILING 3 +#define KVM_PROFILING 4 +#define 
PREEMPT_PROFILING 5 struct proc_dir_entry; struct pt_regs; @@ -36,6 +37,8 @@ enum profile_type { PROFILE_MUNMAP }; +extern int prof_pid; + #ifdef CONFIG_PROFILING extern int prof_on __read_mostly; diff --git a/include/linux/proportions.h b/include/linux/proportions.h index cf793bb..de2e447 100644 --- a/include/linux/proportions.h +++ b/include/linux/proportions.h @@ -58,7 +58,7 @@ struct prop_local_percpu { */ int shift; unsigned long period; - spinlock_t lock; /* protect the snapshot state */ + atomic_spinlock_t lock; /* protect the snapshot state */ }; int prop_local_init_percpu(struct prop_local_percpu *pl); @@ -106,11 +106,11 @@ struct prop_local_single { */ unsigned long period; int shift; - spinlock_t lock; /* protect the snapshot state */ + atomic_spinlock_t lock; /* protect the snapshot state */ }; #define INIT_PROP_LOCAL_SINGLE(name) \ -{ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ +{ .lock = __ATOMIC_SPIN_LOCK_UNLOCKED(name.lock), \ } int prop_local_init_single(struct prop_local_single *pl); diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h index bd46643..1bc3d46 100644 --- a/include/linux/quicklist.h +++ b/include/linux/quicklist.h @@ -18,7 +18,7 @@ struct quicklist { int nr_pages; }; -DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; +DECLARE_PER_CPU_LOCKED(struct quicklist, quicklist)[CONFIG_NR_QUICK]; /* * The two key functions quicklist_alloc and quicklist_free are inline so @@ -30,19 +30,27 @@ DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; * The fast patch in quicklist_alloc touched only a per cpu cacheline and * the first cacheline of the page itself. There is minmal overhead involved. 
*/ -static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *)) +static inline void *__quicklist_alloc(struct quicklist *q) { - struct quicklist *q; - void **p = NULL; + void **p = q->page; - q =&get_cpu_var(quicklist)[nr]; - p = q->page; if (likely(p)) { q->page = p[0]; p[0] = NULL; q->nr_pages--; } - put_cpu_var(quicklist); + return p; +} + +static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *)) +{ + struct quicklist *q; + void **p; + int cpu; + + q = &get_cpu_var_locked(quicklist, &cpu)[nr]; + p = __quicklist_alloc(q); + put_cpu_var_locked(quicklist, cpu); if (likely(p)) return p; @@ -56,12 +64,13 @@ static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p, struct page *page) { struct quicklist *q; + int cpu; - q = &get_cpu_var(quicklist)[nr]; + q = &get_cpu_var_locked(quicklist, &cpu)[nr]; *(void **)p = q->page; q->page = p; q->nr_pages++; - put_cpu_var(quicklist); + put_cpu_var_locked(quicklist, cpu); } static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index c5da749..9eb17f9 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -169,7 +169,18 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); unsigned long radix_tree_prev_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); +/* + * On a mutex based kernel we can freely schedule within the radix code: + */ +#ifdef CONFIG_PREEMPT_RT +static inline int radix_tree_preload(gfp_t gfp_mask) +{ + return 0; +} +#else int radix_tree_preload(gfp_t gfp_mask); +#endif + void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); @@ -189,7 +200,9 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); static inline void radix_tree_preload_end(void) { +#ifndef CONFIG_PREEMPT_RT 
preempt_enable(); +#endif } #endif /* _LINUX_RADIX_TREE_H */ diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 29f8599..5fcc31e 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -75,20 +75,6 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event) } /* - * ring_buffer_event_discard can discard any event in the ring buffer. - * it is up to the caller to protect against a reader from - * consuming it or a writer from wrapping and replacing it. - * - * No external protection is needed if this is called before - * the event is commited. But in that case it would be better to - * use ring_buffer_discard_commit. - * - * Note, if an event that has not been committed is discarded - * with ring_buffer_event_discard, it must still be committed. - */ -void ring_buffer_event_discard(struct ring_buffer_event *event); - -/* * ring_buffer_discard_commit will remove an event that has not * ben committed yet. If this is used, then ring_buffer_unlock_commit * must not be called on the discarded event. 
This function @@ -154,8 +140,17 @@ unsigned long ring_buffer_size(struct ring_buffer *buffer); void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_reset(struct ring_buffer *buffer); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, struct ring_buffer *buffer_b, int cpu); +#else +static inline int +ring_buffer_swap_cpu(struct ring_buffer *buffer_a, + struct ring_buffer *buffer_b, int cpu) +{ + return -ENODEV; +} +#endif int ring_buffer_empty(struct ring_buffer *buffer); int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu); @@ -170,7 +165,6 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu); -unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, diff --git a/include/linux/rt_lock.h b/include/linux/rt_lock.h new file mode 100644 index 0000000..5c74bad --- /dev/null +++ b/include/linux/rt_lock.h @@ -0,0 +1,214 @@ +#ifndef __LINUX_RT_LOCK_H +#define __LINUX_RT_LOCK_H + +/* + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * This file contains the main data structure definitions. 
+ */ +#include <linux/rtmutex.h> +#include <asm/atomic.h> +#include <linux/spinlock_types.h> + +#ifdef CONFIG_PREEMPT_RT + +static inline int preempt_rt(void) { return 1; } + +/* + * spinlocks - an RT mutex plus lock-break field: + */ +typedef struct spinlock { + struct rt_mutex lock; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} spinlock_t; + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define __RT_SPIN_INITIALIZER(name) \ + { \ + .wait_lock = __ATOMIC_SPIN_LOCK_UNLOCKED(name), \ + .save_state = 1, \ + .file = __FILE__, \ + .line = __LINE__ , \ + } +#else +# define __RT_SPIN_INITIALIZER(name) \ + { .wait_lock = __ATOMIC_SPIN_LOCK_UNLOCKED(name) } +#endif + +#define __SPIN_LOCK_UNLOCKED(name) \ + { .lock = __RT_SPIN_INITIALIZER(name), \ + SPIN_DEP_MAP_INIT(name) } + +#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(spin_old_style) + +#define __DEFINE_SPINLOCK(name) \ + spinlock_t name = __SPIN_LOCK_UNLOCKED(name) + +#define DEFINE_SPINLOCK(name) \ + spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name) + +extern void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key); + +#define spin_lock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_spin_lock_init(lock, #lock, &__key); \ +} while (0) + +extern void __lockfunc rt_spin_lock(spinlock_t *lock); +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); +extern void __lockfunc rt_spin_unlock(spinlock_t *lock); +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock); +extern int __lockfunc +rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); +extern int __lockfunc rt_spin_trylock(spinlock_t *lock); +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock); + +/* + * lockdep-less calls, for derived types like rwlock: + * (for trylock they can use rt_mutex_trylock() directly.)
+ */ +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); + +/* + * rwlocks - an RW semaphore plus lock-break field: + */ +typedef struct { + struct rt_mutex lock; + int read_depth; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} rwlock_t; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } +#else +# define RW_DEP_MAP_INIT(lockname) +#endif + +#define __RW_LOCK_UNLOCKED(name) \ + { .lock = __RT_SPIN_INITIALIZER(name), \ + RW_DEP_MAP_INIT(name) } + +#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(rw_old_style) + +#define DEFINE_RWLOCK(name) \ + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) + +extern void __lockfunc rt_write_lock(rwlock_t *rwlock); +extern void __lockfunc rt_read_lock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, + unsigned long *flags); +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock); +extern void +__rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); + +#define rwlock_init(rwl) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_rwlock_init(rwl, #rwl, &__key); \ +} while (0) + +/* + * RW-semaphores are a spinlock plus a reader-depth count. + * + * Note that the semantics are different from the usual + * Linux rw-sems, in PREEMPT_RT mode we do not allow + * multiple readers to hold the lock at once, we only allow + * a read-lock owner to read-lock recursively. 
This is + * better for latency, makes the implementation inherently + * fair and makes it simpler as well: + */ +struct rw_semaphore { + struct rt_mutex lock; + int read_depth; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __RWSEM_INITIALIZER(name) \ + { .lock = __RT_MUTEX_INITIALIZER(name.lock), \ + RW_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(lockname) \ + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) + +extern void __rt_rwsem_init(struct rw_semaphore *rwsem, char *name, + struct lock_class_key *key); + +# define rt_init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_rwsem_init((sem), #sem, &__key); \ +} while (0) + +extern void rt_down_write(struct rw_semaphore *rwsem); +extern void +rt_down_read_nested(struct rw_semaphore *rwsem, int subclass); +extern void +rt_down_write_nested(struct rw_semaphore *rwsem, int subclass); +extern void rt_down_read(struct rw_semaphore *rwsem); +extern int rt_down_write_trylock(struct rw_semaphore *rwsem); +extern int rt_down_read_trylock(struct rw_semaphore *rwsem); +extern void rt_up_read(struct rw_semaphore *rwsem); +extern void rt_up_write(struct rw_semaphore *rwsem); +extern void rt_downgrade_write(struct rw_semaphore *rwsem); + +/* + * Semaphores - a spinlock plus the semaphore count: + */ +struct semaphore { + atomic_t count; + struct rt_mutex lock; +}; + +#define DEFINE_SEMAPHORE(name) \ +struct semaphore name = \ + { .count = { 1 }, .lock = __RT_MUTEX_INITIALIZER(name.lock) } + +extern void +__sema_init(struct semaphore *sem, int val, char *name, char *file, int line); + +#define rt_sema_init(sem, val) \ + __sema_init(sem, val, #sem, __FILE__, __LINE__) + +/* + * No locked initialization for RT semaphores + */ +extern void rt_down(struct semaphore *sem); +extern int rt_down_interruptible(struct semaphore *sem); +extern int rt_down_timeout(struct semaphore *sem, long jiffies); +extern int rt_down_trylock(struct semaphore *sem); +extern void 
rt_up(struct semaphore *sem); + +#define rt_sem_is_locked(s) rt_mutex_is_locked(&(s)->lock) +#define rt_sema_count(s) atomic_read(&(s)->count) + +#else + +static inline int preempt_rt(void) { return 0; } + +#endif /* CONFIG_PREEMPT_RT */ + +#endif + diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index f19b00b..4e06290 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -24,7 +24,7 @@ * @owner: the mutex owner */ struct rt_mutex { - spinlock_t wait_lock; + atomic_spinlock_t wait_lock; struct plist_head wait_list; struct task_struct *owner; #ifdef CONFIG_DEBUG_RT_MUTEXES @@ -63,8 +63,8 @@ struct hrtimer_sleeper; #endif #define __RT_MUTEX_INITIALIZER(mutexname) \ - { .wait_lock = __SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ - , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \ + { .wait_lock = __ATOMIC_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ + , .wait_list = PLIST_HEAD_INIT_ATOMIC(mutexname.wait_list, mutexname.wait_lock) \ , .owner = NULL \ __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} @@ -88,6 +88,8 @@ extern void rt_mutex_destroy(struct rt_mutex *lock); extern void rt_mutex_lock(struct rt_mutex *lock); extern int rt_mutex_lock_interruptible(struct rt_mutex *lock, int detect_deadlock); +extern int rt_mutex_lock_killable(struct rt_mutex *lock, + int detect_deadlock); extern int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, int detect_deadlock); @@ -98,7 +100,7 @@ extern void rt_mutex_unlock(struct rt_mutex *lock); #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ - .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock), \ + .pi_waiters = PLIST_HEAD_INIT_ATOMIC(tsk.pi_waiters, tsk.pi_lock), \ INIT_RT_MUTEX_DEBUG(tsk) #else # define INIT_RT_MUTEXES(tsk) diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h new file mode 100644 index 0000000..a51ec97 --- /dev/null +++ b/include/linux/rwlock.h @@ -0,0 +1,206 @@ +#ifndef __LINUX_RWLOCK_H +#define __LINUX_RWLOCK_H + 
+#ifndef __LINUX_SPINLOCK_H +# error "please don't include this file directly" +#endif + +#ifdef CONFIG_PREEMPT_RT + +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock)) +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock)) + +#define write_trylock_irqsave(lock, flags) \ + __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags)) + +#define write_lock(lock) rt_write_lock(lock) +#define read_lock(lock) rt_read_lock(lock) + +#define read_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = rt_read_lock_irqsave(lock); \ + } while (0) + +#define write_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = rt_write_lock_irqsave(lock); \ + } while (0) + +#define read_lock_irq(lock) rt_read_lock(lock) +#define read_lock_bh(lock) rt_read_lock(lock) + +#define write_lock_irq(lock) rt_write_lock(lock) +#define write_lock_bh(lock) rt_write_lock(lock) + +#define read_unlock(lock) rt_read_unlock(lock) +#define write_unlock(lock) rt_write_unlock(lock) +#define read_unlock_irq(lock) rt_read_unlock(lock) +#define write_unlock_irq(lock) rt_write_unlock(lock) + +#define read_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_read_unlock(lock); \ + } while (0) + +#define read_unlock_bh(lock) rt_read_unlock(lock) + +#define write_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_write_unlock(lock); \ + } while (0) + +#define write_unlock_bh(lock) rt_write_unlock(lock) + +#else + +/* + * rwlock related methods + * + * split out from spinlock.h + * + * portions Copyright 2005, Red Hat, Inc., Ingo Molnar + * Released under the General Public License (GPL). 
+ */ + +#ifdef CONFIG_DEBUG_SPINLOCK + extern void __rwlock_init(rwlock_t *lock, const char *name, + struct lock_class_key *key); +# define rwlock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + __rwlock_init((lock), #lock, &__key); \ +} while (0) +#else +# define rwlock_init(lock) \ + do { *(lock) = RW_LOCK_UNLOCKED; } while (0) +#endif + +#ifdef CONFIG_DEBUG_SPINLOCK + extern void _raw_read_lock(rwlock_t *lock); +#define _raw_read_lock_flags(lock, flags) _raw_read_lock(lock) + extern int _raw_read_trylock(rwlock_t *lock); + extern void _raw_read_unlock(rwlock_t *lock); + extern void _raw_write_lock(rwlock_t *lock); +#define _raw_write_lock_flags(lock, flags) _raw_write_lock(lock) + extern int _raw_write_trylock(rwlock_t *lock); + extern void _raw_write_unlock(rwlock_t *lock); +#else +# define _raw_read_lock(rwlock) __raw_read_lock(&(rwlock)->raw_lock) +# define _raw_read_lock_flags(lock, flags) \ + __raw_read_lock_flags(&(lock)->raw_lock, *(flags)) +# define _raw_read_trylock(rwlock) __raw_read_trylock(&(rwlock)->raw_lock) +# define _raw_read_unlock(rwlock) __raw_read_unlock(&(rwlock)->raw_lock) +# define _raw_write_lock(rwlock) __raw_write_lock(&(rwlock)->raw_lock) +# define _raw_write_lock_flags(lock, flags) \ + __raw_write_lock_flags(&(lock)->raw_lock, *(flags)) +# define _raw_write_trylock(rwlock) __raw_write_trylock(&(rwlock)->raw_lock) +# define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) +#endif + +#define read_can_lock(rwlock) __raw_read_can_lock(&(rwlock)->raw_lock) +#define write_can_lock(rwlock) __raw_write_can_lock(&(rwlock)->raw_lock) + +/* + * Define the various rw_lock methods. Note we define these + * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The + * various methods are defined as nops in the case they are not + * required. 
+ */ +#define read_trylock(lock) __cond_lock(lock, _read_trylock(lock)) +#define write_trylock(lock) __cond_lock(lock, _write_trylock(lock)) + +#define write_lock(lock) _write_lock(lock) +#define read_lock(lock) _read_lock(lock) + +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + +#define read_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = _read_lock_irqsave(lock); \ + } while (0) +#define write_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = _write_lock_irqsave(lock); \ + } while (0) + +#else + +#define read_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + _read_lock_irqsave(lock, flags); \ + } while (0) +#define write_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + _write_lock_irqsave(lock, flags); \ + } while (0) + +#endif + +#define read_lock_irq(lock) _read_lock_irq(lock) +#define read_lock_bh(lock) _read_lock_bh(lock) + +#define write_lock_irq(lock) _write_lock_irq(lock) +#define write_lock_bh(lock) _write_lock_bh(lock) + +/* + * We inline the unlock functions in the nondebug case: + */ +#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \ + !defined(CONFIG_SMP) +# define read_unlock(lock) _read_unlock(lock) +# define write_unlock(lock) _write_unlock(lock) +# define read_unlock_irq(lock) _read_unlock_irq(lock) +# define write_unlock_irq(lock) _write_unlock_irq(lock) +#else +# define read_unlock(lock) \ + do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0) +# define write_unlock(lock) \ + do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0) +# define read_unlock_irq(lock) \ +do { \ + __raw_read_unlock(&(lock)->raw_lock); \ + __release(lock); \ + local_irq_enable(); \ +} while (0) +# define write_unlock_irq(lock) \ +do { \ + __raw_write_unlock(&(lock)->raw_lock); \ + __release(lock); \ + local_irq_enable(); \ +} while (0) +#endif + +#define read_unlock_irqrestore(lock, 
flags) \ + do { \ + typecheck(unsigned long, flags); \ + _read_unlock_irqrestore(lock, flags); \ + } while (0) +#define read_unlock_bh(lock) _read_unlock_bh(lock) + +#define write_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + _write_unlock_irqrestore(lock, flags); \ + } while (0) +#define write_unlock_bh(lock) _write_unlock_bh(lock) + +#define write_trylock_irqsave(lock, flags) \ +({ \ + local_irq_save(flags); \ + write_trylock(lock) ? \ + 1 : ({ local_irq_restore(flags); 0; }); \ +}) +#endif + +#endif /* __LINUX_RWLOCK_H */ diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h new file mode 100644 index 0000000..f8c9352 --- /dev/null +++ b/include/linux/rwlock_types.h @@ -0,0 +1,56 @@ +#ifndef __LINUX_RWLOCK_TYPES_H +#define __LINUX_RWLOCK_TYPES_H + +/* + * include/linux/rwlock_types.h - generic rwlock type definitions + * and initializers + * + * portions Copyright 2005, Red Hat, Inc., Ingo Molnar + * Released under the General Public License (GPL). + */ +typedef struct { + raw_rwlock_t raw_lock; +#ifdef CONFIG_GENERIC_LOCKBREAK + unsigned int break_lock; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + unsigned int magic, owner_cpu; + void *owner; +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} rwlock_t; + +#define RWLOCK_MAGIC 0xdeaf1eed + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } +#else +# define RW_DEP_MAP_INIT(lockname) +#endif + +#ifdef CONFIG_DEBUG_SPINLOCK +#define __RW_LOCK_UNLOCKED(lockname) \ + (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ + .magic = RWLOCK_MAGIC, \ + .owner = SPINLOCK_OWNER_INIT, \ + .owner_cpu = -1, \ + RW_DEP_MAP_INIT(lockname) } +#else +#define __RW_LOCK_UNLOCKED(lockname) \ + (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ + RW_DEP_MAP_INIT(lockname) } +#endif + +/* + * RW_LOCK_UNLOCKED defeats lockdep state tracking and is hence + * deprecated. 
+ * + * Please use DEFINE_RWLOCK() or __RW_LOCK_UNLOCKED() as appropriate. + */ +#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init) + +#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) + +#endif /* __LINUX_RWLOCK_TYPES_H */ diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index 6c3c0f6..d9af794 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -22,6 +22,69 @@ struct rwsem_waiter; /* + * the rw-anon-semaphore definition + * - if activity is 0 then there are no active readers or writers + * - if activity is +ve then that is the number of active readers + * - if activity is -1 then there is one active writer + * - if wait_list is not empty, then there are processes waiting for the semaphore + * + * the anon in the name documents that the semaphore has no full + * restrictions versus ownership. + */ +struct rw_anon_semaphore { + __s32 activity; + spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_ANON_DEP_MAP_INIT(lockname) +#endif + +#define __RWSEM_ANON_INITIALIZER(name) \ +{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_ANON_DEP_MAP_INIT(name) } + +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_ANON_INITIALIZER(name) + +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key); + +#define init_anon_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_anon_rwsem((sem), #sem, &__key); \ +} while (0) + +extern void __down_read(struct rw_anon_semaphore *sem); +extern int __down_read_trylock(struct rw_anon_semaphore *sem); +extern void __down_write(struct rw_anon_semaphore *sem); +extern void __down_write_nested(struct rw_anon_semaphore 
*sem, int subclass); +extern int __down_write_trylock(struct rw_anon_semaphore *sem); +extern void __up_read(struct rw_anon_semaphore *sem); +extern void __up_write(struct rw_anon_semaphore *sem); +extern void __downgrade_write(struct rw_anon_semaphore *sem); + +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) +{ + return (sem->activity != 0); +} + +#ifndef CONFIG_PREEMPT_RT +/* + * Non preempt-rt implementation of rw_semaphore. Same as above, but + * restricted vs. ownership. i.e. ownerless locked state and non owner + * release not allowed. + */ + +/* * the rw-semaphore definition * - if activity is 0 then there are no active readers or writers * - if activity is +ve then that is the number of active readers @@ -50,8 +113,11 @@ struct rw_semaphore { #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} #define init_rwsem(sem) \ do { \ @@ -60,19 +126,11 @@ do { \ __init_rwsem((sem), #sem, &__key); \ } while (0) -extern void __down_read(struct rw_semaphore *sem); -extern int __down_read_trylock(struct rw_semaphore *sem); -extern void __down_write(struct rw_semaphore *sem); -extern void __down_write_nested(struct rw_semaphore *sem, int subclass); -extern int __down_write_trylock(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); -extern void __up_write(struct rw_semaphore *sem); -extern void __downgrade_write(struct rw_semaphore *sem); - static inline int rwsem_is_locked(struct rw_semaphore *sem) { return (sem->activity != 0); } +#endif #endif /* __KERNEL__ */ #endif /* _LINUX_RWSEM_SPINLOCK_H */ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index efd348f..e516c81 100644 --- a/include/linux/rwsem.h +++ 
b/include/linux/rwsem.h @@ -11,9 +11,11 @@ #include <linux/types.h> #include <linux/kernel.h> +#include <linux/rt_lock.h> #include <asm/system.h> #include <asm/atomic.h> +struct rw_anon_semaphore; struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -25,37 +27,37 @@ struct rw_semaphore; /* * lock for reading */ -extern void down_read(struct rw_semaphore *sem); +extern void anon_down_read(struct rw_anon_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -extern int down_read_trylock(struct rw_semaphore *sem); +extern int anon_down_read_trylock(struct rw_anon_semaphore *sem); /* * lock for writing */ -extern void down_write(struct rw_semaphore *sem); +extern void anon_down_write(struct rw_anon_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -extern int down_write_trylock(struct rw_semaphore *sem); +extern int anon_down_write_trylock(struct rw_anon_semaphore *sem); /* * release a read lock */ -extern void up_read(struct rw_semaphore *sem); +extern void anon_up_read(struct rw_anon_semaphore *sem); /* * release a write lock */ -extern void up_write(struct rw_semaphore *sem); +extern void anon_up_write(struct rw_anon_semaphore *sem); /* * downgrade write lock to read lock */ -extern void downgrade_write(struct rw_semaphore *sem); +extern void anon_downgrade_write(struct rw_anon_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -71,21 +73,123 @@ extern void downgrade_write(struct rw_semaphore *sem); * lockdep_set_class() at lock initialization time. * See Documentation/lockdep-design.txt for more details.) */ -extern void down_read_nested(struct rw_semaphore *sem, int subclass); -extern void down_write_nested(struct rw_semaphore *sem, int subclass); +extern void anon_down_read_nested(struct rw_anon_semaphore *sem, int subclass); +extern void anon_down_write_nested(struct rw_anon_semaphore *sem, int subclass); /* * Take/release a lock when not the owner will release it. 
* * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. ] */ -extern void down_read_non_owner(struct rw_semaphore *sem); -extern void up_read_non_owner(struct rw_semaphore *sem); +extern void anon_down_read_non_owner(struct rw_anon_semaphore *sem); +extern void anon_up_read_non_owner(struct rw_anon_semaphore *sem); #else -# define down_read_nested(sem, subclass) down_read(sem) -# define down_write_nested(sem, subclass) down_write(sem) -# define down_read_non_owner(sem) down_read(sem) -# define up_read_non_owner(sem) up_read(sem) +# define anon_down_read_nested(sem, subclass) anon_down_read(sem) +# define anon_down_write_nested(sem, subclass) anon_down_write(sem) +# define anon_down_read_non_owner(sem) anon_down_read(sem) +# define anon_up_read_non_owner(sem) anon_up_read(sem) +#endif + +#ifdef CONFIG_PREEMPT_RT + +#include <linux/rt_lock.h> + +#define init_rwsem(sem) rt_init_rwsem(sem) +#define rwsem_is_locked(s) rt_mutex_is_locked(&(s)->lock) + +static inline void down_read(struct rw_semaphore *sem) +{ + rt_down_read(sem); +} + +static inline int down_read_trylock(struct rw_semaphore *sem) +{ + return rt_down_read_trylock(sem); +} + +static inline void down_write(struct rw_semaphore *sem) +{ + rt_down_write(sem); +} + +static inline int down_write_trylock(struct rw_semaphore *sem) +{ + return rt_down_write_trylock(sem); +} + +static inline void up_read(struct rw_semaphore *sem) +{ + rt_up_read(sem); +} + +static inline void up_write(struct rw_semaphore *sem) +{ + rt_up_write(sem); +} + +static inline void downgrade_write(struct rw_semaphore *sem) +{ + rt_downgrade_write(sem); +} + +static inline void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + rt_down_read_nested(sem, subclass); +} + +static inline void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + rt_down_write_nested(sem, subclass); +} + +#else +/* + * Non preempt-rt implementations + */ +static inline void 
down_read(struct rw_semaphore *sem) +{ + anon_down_read((struct rw_anon_semaphore *)sem); +} + +static inline int down_read_trylock(struct rw_semaphore *sem) +{ + return anon_down_read_trylock((struct rw_anon_semaphore *)sem); +} + +static inline void down_write(struct rw_semaphore *sem) +{ + anon_down_write((struct rw_anon_semaphore *)sem); +} + +static inline int down_write_trylock(struct rw_semaphore *sem) +{ + return anon_down_write_trylock((struct rw_anon_semaphore *)sem); +} + +static inline void up_read(struct rw_semaphore *sem) +{ + anon_up_read((struct rw_anon_semaphore *)sem); +} + +static inline void up_write(struct rw_semaphore *sem) +{ + anon_up_write((struct rw_anon_semaphore *)sem); +} + +static inline void downgrade_write(struct rw_semaphore *sem) +{ + anon_downgrade_write((struct rw_anon_semaphore *)sem); +} + +static inline void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + anon_down_read_nested((struct rw_anon_semaphore *)sem, subclass); +} + +static inline void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + anon_down_write_nested((struct rw_anon_semaphore *)sem, subclass); +} #endif #endif /* _LINUX_RWSEM_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 0f1ea4a..ddffc7a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -100,6 +100,23 @@ struct fs_struct; struct bts_context; struct perf_counter_context; +#ifdef CONFIG_PREEMPT +extern int kernel_preemption; +#else +# define kernel_preemption 0 +#endif +#ifdef CONFIG_PREEMPT_VOLUNTARY +extern int voluntary_preemption; +#else +# define voluntary_preemption 0 +#endif + +#ifdef CONFIG_PREEMPT_SOFTIRQS +extern int softirq_preemption; +#else +# define softirq_preemption 0 +#endif + /* * List of flags we want to share for kernel threads, * if only because they are not used by them anyway. 
@@ -166,6 +183,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif extern unsigned long long time_sync_thresh; +extern struct semaphore kernel_sem; /* * Task state bitmask. NOTE! These bits are also @@ -178,16 +196,17 @@ extern unsigned long long time_sync_thresh; * mistake. */ #define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define __TASK_STOPPED 4 -#define __TASK_TRACED 8 +#define TASK_RUNNING_MUTEX 1 +#define TASK_INTERRUPTIBLE 2 +#define TASK_UNINTERRUPTIBLE 4 +#define __TASK_STOPPED 8 +#define __TASK_TRACED 16 /* in tsk->exit_state */ -#define EXIT_ZOMBIE 16 -#define EXIT_DEAD 32 +#define EXIT_ZOMBIE 32 +#define EXIT_DEAD 64 /* in tsk->state again */ -#define TASK_DEAD 64 -#define TASK_WAKEKILL 128 +#define TASK_DEAD 128 +#define TASK_WAKEKILL 256 /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) @@ -199,7 +218,8 @@ extern unsigned long long time_sync_thresh; #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) /* get_task_state() */ -#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ +#define TASK_REPORT (TASK_RUNNING | TASK_RUNNING_MUTEX | \ + TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED) @@ -216,6 +236,28 @@ extern unsigned long long time_sync_thresh; #define set_task_state(tsk, state_value) \ set_mb((tsk)->state, (state_value)) +// #define PREEMPT_DIRECT + +#ifdef CONFIG_X86_LOCAL_APIC +extern void nmi_show_all_regs(void); +#else +# define nmi_show_all_regs() do { } while (0) +#endif + +#include <linux/smp.h> +#include <linux/sem.h> +#include <linux/signal.h> +#include <linux/securebits.h> +#include <linux/fs_struct.h> +#include <linux/compiler.h> +#include <linux/completion.h> +#include <linux/pid.h> +#include <linux/percpu.h> +#include <linux/topology.h> +#include <linux/seccomp.h> + +struct exec_domain; + /* * set_current_state() includes a barrier so that the write of 
current->state * is correctly serialised wrt the caller's subsequent test of whether to @@ -345,6 +387,11 @@ extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void __schedule(void); asmlinkage void schedule(void); extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); +/* + * This one can be called with interrupts disabled, only + * to be used by lowlevel arch code! + */ +asmlinkage void __sched __schedule(void); struct nsproxy; struct user_namespace; @@ -520,7 +567,7 @@ struct task_cputime { struct thread_group_cputimer { struct task_cputime cputime; int running; - spinlock_t lock; + atomic_spinlock_t lock; }; /* @@ -1173,10 +1220,8 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP -#ifdef __ARCH_WANT_UNLOCKED_CTXSW int oncpu; #endif -#endif int prio, static_prio, normal_prio; unsigned int rt_priority; @@ -1284,6 +1329,8 @@ struct task_struct { struct task_cputime cputime_expires; struct list_head cpu_timers[3]; + struct task_struct* posix_timer_list; + /* process credentials */ const struct cred *real_cred; /* objective and real subjective task * credentials (COW) */ @@ -1318,6 +1365,7 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; + struct sigqueue *sigqueue_cache; sigset_t blocked, real_blocked; sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ @@ -1348,7 +1396,7 @@ struct task_struct { #endif /* Protection of the PI data structures: */ - spinlock_t pi_lock; + atomic_spinlock_t pi_lock; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ @@ -1361,6 +1409,7 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif + int pagefault_disabled; #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; int hardirqs_enabled; @@ -1385,6 +1434,26 @@ struct task_struct { gfp_t lockdep_reclaim_gfp; #endif +/* realtime bits */ + +#define 
MAX_PREEMPT_TRACE 25 +#define MAX_LOCK_STACK MAX_PREEMPT_TRACE +#ifdef CONFIG_DEBUG_PREEMPT + atomic_t lock_count; +# ifdef CONFIG_PREEMPT_RT + struct rt_mutex *owned_lock[MAX_LOCK_STACK]; +# endif +#endif +#ifdef CONFIG_DETECT_SOFTLOCKUP + unsigned long softlockup_count; /* Count to keep track how long the + * thread is in the kernel without + * sleeping. + */ +#endif +#ifdef CONFIG_DEBUG_RT_MUTEXES + void *last_kernel_lock; +#endif + /* journalling filesystem info */ void *journal_info; @@ -1423,6 +1492,7 @@ struct task_struct { #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; + struct task_struct *futex_wakeup; #endif #ifdef CONFIG_PERF_COUNTERS struct perf_counter_context *perf_counter_ctxp; @@ -1480,11 +1550,24 @@ struct task_struct { /* bitmask of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ +#ifdef CONFIG_PREEMPT_RT + /* + * Temporary hack, until we find a solution to + * handle printk in atomic operations. + */ + int in_printk; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ #define tsk_cpumask(tsk) (&(tsk)->cpus_allowed) +#ifdef CONFIG_PREEMPT_RT +# define set_printk_might_sleep(x) do { current->in_printk = x; } while(0) +#else +# define set_printk_might_sleep(x) do { } while(0) +#endif + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH @@ -1653,6 +1736,15 @@ extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#ifdef CONFIG_PREEMPT_RT +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->rcu, __put_task_struct_cb); +} +#else extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) @@ -1660,6 +1752,7 @@ static inline void put_task_struct(struct task_struct *t) if (atomic_dec_and_test(&t->usage)) __put_task_struct(t); } +#endif extern cputime_t task_utime(struct task_struct *p); extern cputime_t task_stime(struct task_struct *p); @@ -1674,7 +1767,9 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_HARDIRQ 0x00000020 /* hardirq thread */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ +#define PF_KMAP 0x00000080 /* this context has a kmap */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ #define PF_SIGNALED 0x00000400 /* killed by a signal */ @@ -1694,6 +1789,7 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ 
+#define PF_SOFTIRQ 0x08000000 /* softirq context */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ @@ -1844,9 +1940,14 @@ int sched_rt_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_compat_yield; +extern void task_setprio(struct task_struct *p, int prio); + #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); -extern void rt_mutex_setprio(struct task_struct *p, int prio); +static inline void rt_mutex_setprio(struct task_struct *p, int prio) +{ + task_setprio(p, prio); +} extern void rt_mutex_adjust_pi(struct task_struct *p); #else static inline int rt_mutex_getprio(struct task_struct *p) @@ -1868,8 +1969,10 @@ extern int sched_setscheduler_nocheck(struct task_struct *, int, extern struct task_struct *idle_task(int cpu); extern struct task_struct *curr_task(int cpu); extern void set_curr_task(int cpu, struct task_struct *p); +extern struct task_struct *rq_curr(struct rq *rq); void yield(void); +void __yield(void); /* * The default (Linux) execution domain. 
@@ -1931,6 +2034,9 @@ extern void do_timer(unsigned long ticks); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); +extern int wake_up_process_mutex(struct task_struct * tsk); +extern int wake_up_process_sync(struct task_struct * tsk); +extern int wake_up_process_mutex_sync(struct task_struct * tsk); extern void wake_up_new_task(struct task_struct *tsk, unsigned long clone_flags); #ifdef CONFIG_SMP @@ -2019,12 +2125,20 @@ extern struct mm_struct * mm_alloc(void); /* mmdrop drops the mm and the page tables */ extern void __mmdrop(struct mm_struct *); +extern void __mmdrop_delayed(struct mm_struct *); + static inline void mmdrop(struct mm_struct * mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +static inline void mmdrop_delayed(struct mm_struct * mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop_delayed(mm); +} + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ @@ -2298,6 +2412,7 @@ static inline int cond_resched_bkl(void) { return _cond_resched(); } +extern int cond_resched_softirq_context(void); /* * Does a critical section need to be broken due to another @@ -2322,7 +2437,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); static inline void thread_group_cputime_init(struct signal_struct *sig) { sig->cputimer.cputime = INIT_CPUTIME; - spin_lock_init(&sig->cputimer.lock); + atomic_spin_lock_init(&sig->cputimer.lock); sig->cputimer.running = 0; } @@ -2330,6 +2445,13 @@ static inline void thread_group_cputime_free(struct signal_struct *sig) { } +static inline int softirq_need_resched(void) +{ + if (softirq_preemption && (current->flags & PF_SOFTIRQ)) + return need_resched(); + return 0; +} + /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. 
@@ -2476,7 +2598,14 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) } #endif /* CONFIG_MM_OWNER */ -#define TASK_STATE_TO_CHAR_STR "RSDTtZX" +#define TASK_STATE_TO_CHAR_STR "RMSDTtZX" + +#ifdef CONFIG_SMP +static inline int task_is_current(struct task_struct *task) +{ + return task->oncpu; +} +#endif #endif /* __KERNEL__ */ diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 7415839..7ba0d2d 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -13,6 +13,94 @@ #include <linux/spinlock.h> /* Please don't access any members of this structure directly */ +struct anon_semaphore { + spinlock_t lock; + unsigned int count; + struct list_head wait_list; +}; + +#define __ANON_SEMAPHORE_INITIALIZER(name, n) \ +{ \ + .lock = __SPIN_LOCK_UNLOCKED((name).lock), \ + .count = n, \ + .wait_list = LIST_HEAD_INIT((name).wait_list), \ +} + +#define DEFINE_ANON_SEMAPHORE(name) \ + struct anon_semaphore name = __ANON_SEMAPHORE_INITIALIZER(name, 1) + +static inline void anon_sema_init(struct anon_semaphore *sem, int val) +{ + static struct lock_class_key __key; + *sem = (struct anon_semaphore) __ANON_SEMAPHORE_INITIALIZER(*sem, val); + lockdep_init_map(&sem->lock.dep_map, "semaphore->lock", &__key, 0); +} + +static inline void anon_semaphore_init(struct anon_semaphore *sem) +{ + anon_sema_init(sem, 1); +} + +/* + * semaphore_init_locked() is mostly a sign for a mutex which is + * abused as completion. 
+ */ +static inline void __deprecated +anon_semaphore_init_locked(struct anon_semaphore *sem) +{ + anon_sema_init(sem, 0); +} + +extern void anon_down(struct anon_semaphore *sem); +extern int __must_check anon_down_interruptible(struct anon_semaphore *sem); +extern int __must_check anon_down_killable(struct anon_semaphore *sem); +extern int __must_check anon_down_trylock(struct anon_semaphore *sem); +extern int __must_check anon_down_timeout(struct anon_semaphore *sem, long jiffies); +extern void anon_up(struct anon_semaphore *sem); + +#ifdef CONFIG_PREEMPT_RT + +static inline void sema_init(struct semaphore *sem, int val) +{ + rt_sema_init(sem, val); +} + +static inline void semaphore_init(struct semaphore *sem) +{ + sema_init(sem, 1); +} + +static inline void down(struct semaphore *sem) +{ + rt_down(sem); +} + +static inline int __must_check down_interruptible(struct semaphore *sem) +{ + return rt_down_interruptible(sem); +} + +static inline int __must_check down_trylock(struct semaphore *sem) +{ + return rt_down_trylock(sem); +} + +static inline int __must_check +down_timeout(struct semaphore *sem, long jiffies) +{ + return rt_down_timeout(sem, jiffies); +} + +static inline void up(struct semaphore *sem) +{ + rt_up(sem); +} + + +#else +/* + * Non preempt-rt maps semaphores to anon semaphores + */ struct semaphore { spinlock_t lock; unsigned int count; @@ -26,24 +114,57 @@ struct semaphore { .wait_list = LIST_HEAD_INIT((name).wait_list), \ } -#define DECLARE_MUTEX(name) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1) +#define DEFINE_SEMAPHORE(name) \ + struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1) static inline void sema_init(struct semaphore *sem, int val) { - static struct lock_class_key __key; - *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val); - lockdep_init_map(&sem->lock.dep_map, "semaphore->lock", &__key, 0); + anon_sema_init((struct anon_semaphore *)sem, val); +} + +static inline void semaphore_init(struct semaphore 
*sem) +{ + anon_sema_init((struct anon_semaphore *)sem, 1); +} + +/* + * semaphore_init_locked() is mostly a sign for a mutex which is + * abused as completion. + */ +static inline void __deprecated semaphore_init_locked(struct semaphore *sem) +{ + anon_sema_init((struct anon_semaphore *)sem, 0); } -#define init_MUTEX(sem) sema_init(sem, 1) -#define init_MUTEX_LOCKED(sem) sema_init(sem, 0) +static inline void down(struct semaphore *sem) +{ + anon_down((struct anon_semaphore *)sem); +} -extern void down(struct semaphore *sem); -extern int __must_check down_interruptible(struct semaphore *sem); -extern int __must_check down_killable(struct semaphore *sem); -extern int __must_check down_trylock(struct semaphore *sem); -extern int __must_check down_timeout(struct semaphore *sem, long jiffies); -extern void up(struct semaphore *sem); +static inline int __must_check down_interruptible(struct semaphore *sem) +{ + return anon_down_interruptible((struct anon_semaphore *)sem); +} +static inline int __must_check down_killable(struct semaphore *sem) +{ + return anon_down_killable((struct anon_semaphore *)sem); +} + +static inline int __must_check down_trylock(struct semaphore *sem) +{ + return anon_down_trylock((struct anon_semaphore *)sem); +} + +static inline int __must_check +down_timeout(struct semaphore *sem, long jiffies) +{ + return anon_down_timeout((struct anon_semaphore *)sem, jiffies); +} + +static inline void up(struct semaphore *sem) +{ + anon_up((struct anon_semaphore *)sem); +} +#endif #endif /* __LINUX_SEMAPHORE_H */ diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 632205c..e4a3f95 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -3,9 +3,11 @@ /* * Reader/writer consistent mechanism without starving writers. This type of * lock for data where the reader wants a consistent set of information - * and is willing to retry if the information changes. 
Readers never - * block but they may have to retry if a writer is in - * progress. Writers do not wait for readers. + * and is willing to retry if the information changes. Readers block + * on write contention (and where applicable, pi-boost the writer). + * Readers without contention on entry acquire the critical section + * without any atomic operations, but they may have to retry if a writer + * enters before the critical section ends. Writers do not wait for readers. * * This is not as cache friendly as brlock. Also, this will not work * for data that contains pointers, because any writer could @@ -24,6 +26,8 @@ * * Based on x86_64 vsyscall gettimeofday * by Keith Owens and Andrea Arcangeli + * + * Priority inheritance and live-lock avoidance by Gregory Haskins */ #include <linux/spinlock.h> @@ -31,49 +35,80 @@ typedef struct { unsigned sequence; - spinlock_t lock; + atomic_spinlock_t lock; +} atomic_seqlock_t; + +typedef struct { + unsigned sequence; + rwlock_t lock; } seqlock_t; /* * These macros triggered gcc-3.x compile-time problems. We think these are * OK now. Be cautious. */ +#define __ATOMIC_SEQLOCK_UNLOCKED(lockname) \ + { 0, __ATOMIC_SPIN_LOCK_UNLOCKED(lockname) } + +#define seqlock_atomic_init(x) \ + do { \ + (x)->sequence = 0; \ + atomic_spin_lock_init(&(x)->lock); \ + } while (0) + +#define DEFINE_ATOMIC_SEQLOCK(x) \ + atomic_seqlock_t x = __ATOMIC_SEQLOCK_UNLOCKED(x) + #define __SEQLOCK_UNLOCKED(lockname) \ - { 0, __SPIN_LOCK_UNLOCKED(lockname) } + { 0, __RW_LOCK_UNLOCKED(lockname) } #define SEQLOCK_UNLOCKED \ - __SEQLOCK_UNLOCKED(old_style_seqlock_init) + __SEQLOCK_UNLOCKED(old_style_seqlock_init) #define seqlock_init(x) \ do { \ (x)->sequence = 0; \ - spin_lock_init(&(x)->lock); \ + rwlock_init(&(x)->lock); \ } while (0) #define DEFINE_SEQLOCK(x) \ - seqlock_t x = __SEQLOCK_UNLOCKED(x) + seqlock_t x = __SEQLOCK_UNLOCKED(x) /* Lock out other writers and update the count. * Acts like a normal spin_lock/unlock. 
* Don't need preempt_disable() because that is in the spin_lock already. */ +static inline void write_atomic_seqlock(atomic_seqlock_t *sl) +{ + atomic_spin_lock(&sl->lock); + ++sl->sequence; + smp_wmb(); +} + static inline void write_seqlock(seqlock_t *sl) { - spin_lock(&sl->lock); + write_lock(&sl->lock); ++sl->sequence; smp_wmb(); } +static inline void write_atomic_sequnlock(atomic_seqlock_t *sl) +{ + smp_wmb(); + sl->sequence++; + atomic_spin_unlock(&sl->lock); +} + static inline void write_sequnlock(seqlock_t *sl) { smp_wmb(); sl->sequence++; - spin_unlock(&sl->lock); + write_unlock(&sl->lock); } static inline int write_tryseqlock(seqlock_t *sl) { - int ret = spin_trylock(&sl->lock); + int ret = write_trylock(&sl->lock); if (ret) { ++sl->sequence; @@ -83,7 +118,7 @@ static inline int write_tryseqlock(seqlock_t *sl) } /* Start of read calculation -- fetch last complete writer token */ -static __always_inline unsigned read_seqbegin(const seqlock_t *sl) +static __always_inline unsigned read_atomic_seqbegin(const atomic_seqlock_t *sl) { unsigned ret; @@ -98,11 +133,42 @@ repeat: return ret; } +static __always_inline unsigned read_seqbegin(seqlock_t *sl) +{ + unsigned ret; + + ret = sl->sequence; + smp_rmb(); + if (unlikely(ret & 1)) { + cpu_relax(); + /* + * Serialize with the writer which will ensure they are + * pi-boosted if necessary and prevent us from starving + * them. + */ + read_lock(&sl->lock); + ret = sl->sequence; + read_unlock(&sl->lock); + } + + BUG_ON(ret & 1); + + return ret; +} + /* * Test if reader processed invalid data. * * If sequence value changed then writer changed data while in section. 
*/ +static __always_inline int +read_atomic_seqretry(const atomic_seqlock_t *sl, unsigned start) +{ + smp_rmb(); + + return (sl->sequence != start); +} + static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start) { smp_rmb(); @@ -170,12 +236,36 @@ static inline void write_seqcount_end(seqcount_t *s) /* * Possible sw/hw IRQ protected versions of the interfaces. */ +#define write_atomic_seqlock_irqsave(lock, flags) \ + do { local_irq_save(flags); write_atomic_seqlock(lock); } while (0) +#define write_atomic_seqlock_irq(lock) \ + do { local_irq_disable(); write_atomic_seqlock(lock); } while (0) +#define write_atomic_seqlock_bh(lock) \ + do { local_bh_disable(); write_atomic_seqlock(lock); } while (0) + +#define write_atomic_sequnlock_irqrestore(lock, flags) \ + do { write_atomic_sequnlock(lock); local_irq_restore(flags); } while(0) +#define write_atomic_sequnlock_irq(lock) \ + do { write_atomic_sequnlock(lock); local_irq_enable(); } while(0) +#define write_atomic_sequnlock_bh(lock) \ + do { write_atomic_sequnlock(lock); local_bh_enable(); } while(0) + +#define read_atomic_seqbegin_irqsave(lock, flags) \ + ({ local_irq_save(flags); read_atomic_seqbegin(lock); }) + +#define read_atomic_seqretry_irqrestore(lock, iv, flags) \ + ({ \ + int ret = read_atomic_seqretry(lock, iv); \ + local_irq_restore(flags); \ + ret; \ + }) + #define write_seqlock_irqsave(lock, flags) \ do { local_irq_save(flags); write_seqlock(lock); } while (0) #define write_seqlock_irq(lock) \ do { local_irq_disable(); write_seqlock(lock); } while (0) #define write_seqlock_bh(lock) \ - do { local_bh_disable(); write_seqlock(lock); } while (0) + do { local_bh_disable(); write_seqlock(lock); } while (0) #define write_sequnlock_irqrestore(lock, flags) \ do { write_sequnlock(lock); local_irq_restore(flags); } while(0) diff --git a/include/linux/signal.h b/include/linux/signal.h index c755283..46b4600 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -225,6 +225,7 @@ static 
inline void init_sigpending(struct sigpending *sig) } extern void flush_sigqueue(struct sigpending *queue); +extern void flush_task_sigqueue(struct task_struct *tsk); /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ static inline int valid_signal(unsigned long sig) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f2c69a2..313f09d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -98,6 +98,9 @@ struct pipe_inode_info; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { atomic_t use; +#ifdef CONFIG_PREEMPT_RT + struct rcu_head rcu; +#endif }; #endif diff --git a/include/linux/smb_fs_sb.h b/include/linux/smb_fs_sb.h index 8a060a7..41c69a4 100644 --- a/include/linux/smb_fs_sb.h +++ b/include/linux/smb_fs_sb.h @@ -57,7 +57,7 @@ struct smb_sb_info { struct smb_conn_opt opt; wait_queue_head_t conn_wq; int conn_complete; - struct semaphore sem; + struct mutex mutex; unsigned char header[SMB_HEADER_LEN + 20*2 + 2]; u32 header_len; @@ -79,19 +79,19 @@ struct smb_sb_info { static inline int smb_lock_server_interruptible(struct smb_sb_info *server) { - return down_interruptible(&(server->sem)); + return mutex_lock_interruptible(&server->mutex); } static inline void smb_lock_server(struct smb_sb_info *server) { - down(&(server->sem)); + mutex_lock(&server->mutex); } static inline void smb_unlock_server(struct smb_sb_info *server) { - up(&(server->sem)); + mutex_unlock(&server->mutex); } #endif diff --git a/include/linux/smp.h b/include/linux/smp.h index 9e3d8af..3780051 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -50,6 +50,11 @@ extern void smp_send_stop(void); */ extern void smp_send_reschedule(int cpu); +/* + * trigger a reschedule on all other CPUs: + */ +extern void smp_send_reschedule_allbutself(void); + /* * Prepare machine for booting other
CPUs. @@ -142,6 +147,7 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) 0; \ }) static inline void smp_send_reschedule(int cpu) { } +static inline void smp_send_reschedule_allbutself(void) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_mask(mask, func, info, wait) \ diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h index 813be59..0cb3cf9 100644 --- a/include/linux/smp_lock.h +++ b/include/linux/smp_lock.h @@ -45,7 +45,7 @@ static inline void cycle_kernel_lock(void) #define unlock_kernel() do { } while(0) #define release_kernel_lock(task) do { } while(0) #define cycle_kernel_lock() do { } while(0) -#define reacquire_kernel_lock(task) 0 +#define reacquire_kernel_lock(task) do { } while(0) #define kernel_locked() 1 #endif /* CONFIG_LOCK_KERNEL */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 4be57ab..b2eb1c9 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -58,23 +58,6 @@ #include <asm/system.h> /* - * Must define these before including other files, inline functions need them - */ -#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME - -#define LOCK_SECTION_START(extra) \ - ".subsection 1\n\t" \ - extra \ - ".ifndef " LOCK_SECTION_NAME "\n\t" \ - LOCK_SECTION_NAME ":\n\t" \ - ".endif\n" - -#define LOCK_SECTION_END \ - ".previous\n\t" - -#define __lockfunc __attribute__((section(".spinlock.text"))) - -/* * Pull the raw_spinlock_t and raw_rwlock_t definitions: */ #include <linux/spinlock_types.h> @@ -91,44 +74,32 @@ extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock); #endif #ifdef CONFIG_DEBUG_SPINLOCK - extern void __spin_lock_init(spinlock_t *lock, const char *name, - struct lock_class_key *key); -# define spin_lock_init(lock) \ -do { \ - static struct lock_class_key __key; \ - \ - __spin_lock_init((lock), #lock, &__key); \ -} while (0) + extern void __atomic_spin_lock_init(atomic_spinlock_t *lock, +
const char *name, + struct lock_class_key *key); -#else -# define spin_lock_init(lock) \ - do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0) -#endif - -#ifdef CONFIG_DEBUG_SPINLOCK - extern void __rwlock_init(rwlock_t *lock, const char *name, - struct lock_class_key *key); -# define rwlock_init(lock) \ +# define atomic_spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ - __rwlock_init((lock), #lock, &__key); \ + __atomic_spin_lock_init((lock), #lock, &__key); \ } while (0) + #else -# define rwlock_init(lock) \ - do { *(lock) = RW_LOCK_UNLOCKED; } while (0) +# define atomic_spin_lock_init(lock) \ + do { *(lock) = __ATOMIC_SPIN_LOCK_UNLOCKED(lock); } while (0) #endif -#define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) +#define atomic_spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) #ifdef CONFIG_GENERIC_LOCKBREAK -#define spin_is_contended(lock) ((lock)->break_lock) +#define atomic_spin_is_contended(lock) ((lock)->break_lock) #else #ifdef __raw_spin_is_contended -#define spin_is_contended(lock) __raw_spin_is_contended(&(lock)->raw_lock) +#define atomic_spin_is_contended(lock) __raw_spin_is_contended(&(lock)->raw_lock) #else -#define spin_is_contended(lock) (((void)(lock), 0)) +#define atomic_spin_is_contended(lock) (((void)(lock), 0)) #endif /*__raw_spin_is_contended*/ #endif @@ -141,7 +112,7 @@ static inline void smp_mb__after_lock(void) { smp_mb(); } * spin_unlock_wait - wait until the spinlock gets unlocked * @lock: the spinlock in question. 
*/ -#define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) +#define atomic_spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: @@ -153,209 +124,128 @@ static inline void smp_mb__after_lock(void) { smp_mb(); } #endif #ifdef CONFIG_DEBUG_SPINLOCK - extern void _raw_spin_lock(spinlock_t *lock); + extern void _raw_spin_lock(atomic_spinlock_t *lock); #define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) - extern int _raw_spin_trylock(spinlock_t *lock); - extern void _raw_spin_unlock(spinlock_t *lock); - extern void _raw_read_lock(rwlock_t *lock); -#define _raw_read_lock_flags(lock, flags) _raw_read_lock(lock) - extern int _raw_read_trylock(rwlock_t *lock); - extern void _raw_read_unlock(rwlock_t *lock); - extern void _raw_write_lock(rwlock_t *lock); -#define _raw_write_lock_flags(lock, flags) _raw_write_lock(lock) - extern int _raw_write_trylock(rwlock_t *lock); - extern void _raw_write_unlock(rwlock_t *lock); + extern int _raw_spin_trylock(atomic_spinlock_t *lock); + extern void _raw_spin_unlock(atomic_spinlock_t *lock); #else # define _raw_spin_lock(lock) __raw_spin_lock(&(lock)->raw_lock) # define _raw_spin_lock_flags(lock, flags) \ __raw_spin_lock_flags(&(lock)->raw_lock, *(flags)) # define _raw_spin_trylock(lock) __raw_spin_trylock(&(lock)->raw_lock) # define _raw_spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock) -# define _raw_read_lock(rwlock) __raw_read_lock(&(rwlock)->raw_lock) -# define _raw_read_lock_flags(lock, flags) \ - __raw_read_lock_flags(&(lock)->raw_lock, *(flags)) -# define _raw_read_trylock(rwlock) __raw_read_trylock(&(rwlock)->raw_lock) -# define _raw_read_unlock(rwlock) __raw_read_unlock(&(rwlock)->raw_lock) -# define _raw_write_lock(rwlock) __raw_write_lock(&(rwlock)->raw_lock) -# define _raw_write_lock_flags(lock, flags) \ - __raw_write_lock_flags(&(lock)->raw_lock, *(flags)) -# define _raw_write_trylock(rwlock) 
__raw_write_trylock(&(rwlock)->raw_lock) -# define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) #endif -#define read_can_lock(rwlock) __raw_read_can_lock(&(rwlock)->raw_lock) -#define write_can_lock(rwlock) __raw_write_can_lock(&(rwlock)->raw_lock) - /* - * Define the various spin_lock and rw_lock methods. Note we define these - * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various - * methods are defined as nops in the case they are not required. + * Define the various spin_lock methods. Note we define these + * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The + * various methods are defined as nops in the case they are not + * required. */ -#define spin_trylock(lock) __cond_lock(lock, _spin_trylock(lock)) -#define read_trylock(lock) __cond_lock(lock, _read_trylock(lock)) -#define write_trylock(lock) __cond_lock(lock, _write_trylock(lock)) +#define atomic_spin_trylock(lock) __cond_lock(lock, _atomic_spin_trylock(lock)) -#define spin_lock(lock) _spin_lock(lock) +#define atomic_spin_lock(lock) _atomic_spin_lock(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass) -# define spin_lock_nest_lock(lock, nest_lock) \ +# define atomic_spin_lock_nested(lock, subclass) \ + _atomic_spin_lock_nested(lock, subclass) + +# define atomic_spin_lock_nest_lock(lock, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\ - _spin_lock_nest_lock(lock, &(nest_lock)->dep_map); \ + _atomic_spin_lock_nest_lock(lock, &(nest_lock)->dep_map);\ } while (0) #else -# define spin_lock_nested(lock, subclass) _spin_lock(lock) -# define spin_lock_nest_lock(lock, nest_lock) _spin_lock(lock) +# define atomic_spin_lock_nested(lock, subclass) _atomic_spin_lock(lock) +# define atomic_spin_lock_nest_lock(lock, nest_lock) _atomic_spin_lock(lock) #endif -#define write_lock(lock) _write_lock(lock) -#define read_lock(lock) _read_lock(lock) - #if defined(CONFIG_SMP) || 
defined(CONFIG_DEBUG_SPINLOCK) -#define spin_lock_irqsave(lock, flags) \ +#define atomic_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ - flags = _spin_lock_irqsave(lock); \ - } while (0) -#define read_lock_irqsave(lock, flags) \ - do { \ - typecheck(unsigned long, flags); \ - flags = _read_lock_irqsave(lock); \ - } while (0) -#define write_lock_irqsave(lock, flags) \ - do { \ - typecheck(unsigned long, flags); \ - flags = _write_lock_irqsave(lock); \ + flags = _atomic_spin_lock_irqsave(lock);\ } while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC -#define spin_lock_irqsave_nested(lock, flags, subclass) \ +#define atomic_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ - flags = _spin_lock_irqsave_nested(lock, subclass); \ + flags = _atomic_spin_lock_irqsave_nested(lock, subclass);\ } while (0) #else -#define spin_lock_irqsave_nested(lock, flags, subclass) \ +#define atomic_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ - flags = _spin_lock_irqsave(lock); \ + flags = _atomic_spin_lock_irqsave(lock); \ } while (0) #endif #else -#define spin_lock_irqsave(lock, flags) \ - do { \ - typecheck(unsigned long, flags); \ - _spin_lock_irqsave(lock, flags); \ - } while (0) -#define read_lock_irqsave(lock, flags) \ - do { \ - typecheck(unsigned long, flags); \ - _read_lock_irqsave(lock, flags); \ - } while (0) -#define write_lock_irqsave(lock, flags) \ +#define atomic_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ - _write_lock_irqsave(lock, flags); \ + _atomic_spin_lock_irqsave(lock, flags); \ } while (0) -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - spin_lock_irqsave(lock, flags) -#endif - -#define spin_lock_irq(lock) _spin_lock_irq(lock) -#define spin_lock_bh(lock) _spin_lock_bh(lock) +#define atomic_spin_lock_irqsave_nested(lock, flags, subclass) \ + atomic_spin_lock_irqsave(lock, flags) -#define read_lock_irq(lock) 
_read_lock_irq(lock) -#define read_lock_bh(lock) _read_lock_bh(lock) +#endif -#define write_lock_irq(lock) _write_lock_irq(lock) -#define write_lock_bh(lock) _write_lock_bh(lock) +#define atomic_spin_lock_irq(lock) _atomic_spin_lock_irq(lock) +#define atomic_spin_lock_bh(lock) _atomic_spin_lock_bh(lock) /* * We inline the unlock functions in the nondebug case: */ #if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \ !defined(CONFIG_SMP) -# define spin_unlock(lock) _spin_unlock(lock) -# define read_unlock(lock) _read_unlock(lock) -# define write_unlock(lock) _write_unlock(lock) -# define spin_unlock_irq(lock) _spin_unlock_irq(lock) -# define read_unlock_irq(lock) _read_unlock_irq(lock) -# define write_unlock_irq(lock) _write_unlock_irq(lock) +# define atomic_spin_unlock(lock) _atomic_spin_unlock(lock) +# define atomic_spin_unlock_irq(lock) _atomic_spin_unlock_irq(lock) #else -# define spin_unlock(lock) \ +# define atomic_spin_unlock(lock) \ do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define read_unlock(lock) \ - do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define write_unlock(lock) \ - do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define spin_unlock_irq(lock) \ + +# define atomic_spin_unlock_irq(lock) \ do { \ __raw_spin_unlock(&(lock)->raw_lock); \ __release(lock); \ local_irq_enable(); \ } while (0) -# define read_unlock_irq(lock) \ -do { \ - __raw_read_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ -} while (0) -# define write_unlock_irq(lock) \ -do { \ - __raw_write_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ -} while (0) #endif -#define spin_unlock_irqrestore(lock, flags) \ - do { \ - typecheck(unsigned long, flags); \ - _spin_unlock_irqrestore(lock, flags); \ - } while (0) -#define spin_unlock_bh(lock) _spin_unlock_bh(lock) - -#define read_unlock_irqrestore(lock, flags) \ - do { \ - typecheck(unsigned 
long, flags); \ - _read_unlock_irqrestore(lock, flags); \ - } while (0) -#define read_unlock_bh(lock) _read_unlock_bh(lock) - -#define write_unlock_irqrestore(lock, flags) \ +#define atomic_spin_unlock_irqrestore(lock, flags) \ do { \ typecheck(unsigned long, flags); \ - _write_unlock_irqrestore(lock, flags); \ + _atomic_spin_unlock_irqrestore(lock, flags);\ } while (0) -#define write_unlock_bh(lock) _write_unlock_bh(lock) +#define atomic_spin_unlock_bh(lock) _atomic_spin_unlock_bh(lock) -#define spin_trylock_bh(lock) __cond_lock(lock, _spin_trylock_bh(lock)) +#define atomic_spin_trylock_bh(lock) \ + __cond_lock(lock, _atomic_spin_trylock_bh(lock)) -#define spin_trylock_irq(lock) \ +#define atomic_spin_trylock_irq(lock) \ ({ \ local_irq_disable(); \ - spin_trylock(lock) ? \ + atomic_spin_trylock(lock) ? \ 1 : ({ local_irq_enable(); 0; }); \ }) -#define spin_trylock_irqsave(lock, flags) \ +#define atomic_spin_trylock_irqsave(lock, flags) \ ({ \ local_irq_save(flags); \ - spin_trylock(lock) ? \ + atomic_spin_trylock(lock) ? \ 1 : ({ local_irq_restore(flags); 0; }); \ }) -#define write_trylock_irqsave(lock, flags) \ -({ \ - local_irq_save(flags); \ - write_trylock(lock) ? \ - 1 : ({ local_irq_restore(flags); 0; }); \ -}) +/** + * atomic_spin_can_lock - would atomic_spin_trylock() succeed? + * @lock: the atomic spinlock in question. + */ +#define atomic_spin_can_lock(lock) (!atomic_spin_is_locked(lock)) /* * Pull the atomic_t declaration: @@ -370,14 +260,246 @@ do { \ * Decrements @atomic by 1. If the result is 0, returns true and locks * @lock. Returns false for all other cases.
*/ -extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); +extern int +_atomic_dec_and_atomic_lock(atomic_t *atomic, atomic_spinlock_t *lock); + +#define atomic_dec_and_atomic_lock(atomic, lock) \ + __cond_lock(lock, _atomic_dec_and_atomic_lock(atomic, lock)) + +#ifdef CONFIG_PREEMPT_RT + +#include <linux/rt_lock.h> + +#define spin_lock(lock) rt_spin_lock(lock) +#define spin_lock_bh(lock) rt_spin_lock(lock) + +#define spin_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) + +#ifdef CONFIG_LOCKDEP +# define spin_lock_nested(lock, subclass) \ + rt_spin_lock_nested(lock, subclass) + +# define spin_lock_irqsave_nested(lock, flags, subclass) \ +do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + rt_spin_lock_nested(lock, subclass); \ +} while (0) +#else +# define spin_lock_nested(lock, subclass) \ + rt_spin_lock(lock) + +# define spin_lock_irqsave_nested(lock, flags, subclass) \ +do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + rt_spin_lock(lock); \ +} while (0) +#endif + +#define spin_lock_irq(lock) rt_spin_lock(lock) + +#define spin_lock_irqsave(lock, flags) \ +do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + rt_spin_lock(lock); \ +} while (0) + +/* FIXME: we need rt_spin_lock_nested */ +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0) + +#define spin_unlock(lock) rt_spin_unlock(lock) +#define spin_unlock_bh(lock) rt_spin_unlock(lock) +#define spin_unlock_irq(lock) rt_spin_unlock(lock) + +#define spin_unlock_irqrestore(lock, flags) \ +do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_spin_unlock(lock); \ +} while (0) + +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock(lock)) +#define spin_trylock_irq(lock) __cond_lock(lock, rt_spin_trylock(lock)) + +#define spin_trylock_irqsave(lock, flags) \ +({ \ + typecheck(unsigned long, flags); \ + flags = 0; \ + __cond_lock(lock, rt_spin_trylock(lock)); \ +}) + +#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock) + 
+#ifdef CONFIG_GENERIC_LOCKBREAK +# define spin_is_contended(lock) ((lock)->break_lock) +#else +# define spin_is_contended(lock) (((void)(lock), 0)) +#endif + +static inline int spin_can_lock(spinlock_t *lock) +{ + return !rt_mutex_is_locked(&lock->lock); +} + +static inline int spin_is_locked(spinlock_t *lock) +{ + return rt_mutex_is_locked(&lock->lock); +} + +static inline void assert_spin_locked(spinlock_t *lock) +{ + BUG_ON(!spin_is_locked(lock)); +} + #define atomic_dec_and_lock(atomic, lock) \ - __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) + atomic_dec_and_spin_lock(atomic, lock) -/** - * spin_can_lock - would spin_trylock() succeed? - * @lock: the spinlock in question. +#else + +/* + * Map spin* to atomic_spin* for PREEMPT_RT=n + */ +static inline void spin_lockcheck(spinlock_t *lock) { } + +#define spin_lock_init(lock) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_lock_init((atomic_spinlock_t *)lock); \ +} while (0) + +#define spin_lock(lock) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_lock((atomic_spinlock_t *)lock); \ +} while (0) + +#define spin_lock_bh(lock) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_lock_bh((atomic_spinlock_t *)lock); \ +} while (0) + +#define spin_trylock(lock) \ +({ \ + spin_lockcheck(lock); \ + atomic_spin_trylock((atomic_spinlock_t *)lock); \ +}) + +#define spin_lock_nested(lock, subclass) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_lock_nested((atomic_spinlock_t *)lock, subclass); \ +} while (0) + +#define spin_lock_nest_lock(lock, nest_lock) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_lock_nest_lock((atomic_spinlock_t *)lock, nest_lock); \ +} while (0) + +#define spin_lock_irq(lock) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_lock_irq((atomic_spinlock_t *)lock); \ +} while (0) + +#define spin_lock_irqsave(lock, flags) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_lock_irqsave((atomic_spinlock_t *)lock, flags); \ +} while (0) + +#define spin_lock_irqsave_nested(lock, flags,
subclass) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_lock_irqsave_nested((atomic_spinlock_t *)lock, flags, subclass); \ +} while (0) + +#define spin_unlock(lock) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_unlock((atomic_spinlock_t *)lock); \ +} while (0) + +#define spin_unlock_bh(lock) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_unlock_bh((atomic_spinlock_t *)lock); \ +} while (0) + +#define spin_unlock_irq(lock) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_unlock_irq((atomic_spinlock_t *)lock); \ +} while (0) + +#define spin_unlock_irqrestore(lock, flags) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_unlock_irqrestore((atomic_spinlock_t *)lock, flags); \ +} while (0) + +#define spin_trylock_bh(lock) \ +({ \ + spin_lockcheck(lock); \ + atomic_spin_trylock_bh((atomic_spinlock_t *)lock); \ +}) + +#define spin_trylock_irq(lock) \ +({ \ + spin_lockcheck(lock); \ + atomic_spin_trylock_irq((atomic_spinlock_t *)lock); \ +}) + +#define spin_trylock_irqsave(lock, flags) \ +({ \ + spin_lockcheck(lock); \ + atomic_spin_trylock_irqsave((atomic_spinlock_t *)lock, flags); \ +}) + +#define spin_unlock_wait(lock) \ +do { \ + spin_lockcheck(lock); \ + atomic_spin_unlock_wait((atomic_spinlock_t *)lock); \ +} while (0) + +#define spin_is_locked(lock) \ +({ \ + spin_lockcheck(lock); \ + atomic_spin_is_locked((atomic_spinlock_t *)lock); \ +}) + +#define spin_is_contended(lock) \ +({ \ + spin_lockcheck(lock); \ + atomic_spin_is_contended((atomic_spinlock_t *)lock); \ +}) + +#define spin_can_lock(lock) \ +({ \ + spin_lockcheck(lock); \ + atomic_spin_can_lock((atomic_spinlock_t *)lock); \ +}) + +#define assert_spin_locked(lock) \ +do { \ + spin_lockcheck(lock); \ + assert_atomic_spin_locked((atomic_spinlock_t *)lock); \ +} while (0) + +#define atomic_dec_and_lock(atomic, lock) \ +({ \ + spin_lockcheck(lock); \ + atomic_dec_and_atomic_lock(atomic, (atomic_spinlock_t *)lock); \ +}) + +#endif /* !PREEMPT_RT */ + +/* + * Get the rwlock part */ -#define 
spin_can_lock(lock) (!spin_is_locked(lock)) +#include <linux/rwlock.h> #endif /* __LINUX_SPINLOCK_H */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index d79845d..4a9f6e4 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -17,47 +17,74 @@ int in_lock_functions(unsigned long addr); -#define assert_spin_locked(x) BUG_ON(!spin_is_locked(x)) +#define assert_atomic_spin_locked(x) BUG_ON(!atomic_spin_is_locked(x)) -void __lockfunc _spin_lock(spinlock_t *lock) __acquires(lock); -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) +void __lockfunc +_atomic_spin_lock(atomic_spinlock_t *lock) __acquires(lock); + +void __lockfunc +_atomic_spin_lock_nested(atomic_spinlock_t *lock, int subclass) + __acquires(lock); + +void __lockfunc +_atomic_spin_lock_nest_lock(atomic_spinlock_t *lock, struct lockdep_map *map) __acquires(lock); -void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *map) +void __lockfunc +_atomic_spin_lock_bh(atomic_spinlock_t *lock) __acquires(lock); + +void __lockfunc +_atomic_spin_lock_irq(atomic_spinlock_t *lock) __acquires(lock); + +unsigned long __lockfunc +_atomic_spin_lock_irqsave(atomic_spinlock_t *lock) __acquires(lock); + +unsigned long __lockfunc +_atomic_spin_lock_irqsave_nested(atomic_spinlock_t *lock, int subclass) __acquires(lock); + +int __lockfunc _atomic_spin_trylock(atomic_spinlock_t *lock); +int __lockfunc _atomic_spin_trylock_bh(atomic_spinlock_t *lock); + +void __lockfunc +_atomic_spin_unlock(atomic_spinlock_t *lock) __releases(lock); + +void __lockfunc +_atomic_spin_unlock_bh(atomic_spinlock_t *lock) __releases(lock); + +void __lockfunc +_atomic_spin_unlock_irq(atomic_spinlock_t *lock) __releases(lock); + +void __lockfunc +_atomic_spin_unlock_irqrestore(atomic_spinlock_t *lock, unsigned long flags) + __releases(lock); + +#ifndef CONFIG_PREEMPT_RT void __lockfunc _read_lock(rwlock_t *lock) __acquires(lock); void __lockfunc 
_write_lock(rwlock_t *lock) __acquires(lock); -void __lockfunc _spin_lock_bh(spinlock_t *lock) __acquires(lock); void __lockfunc _read_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _write_lock_bh(rwlock_t *lock) __acquires(lock); -void __lockfunc _spin_lock_irq(spinlock_t *lock) __acquires(lock); void __lockfunc _read_lock_irq(rwlock_t *lock) __acquires(lock); void __lockfunc _write_lock_irq(rwlock_t *lock) __acquires(lock); -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) - __acquires(lock); -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) - __acquires(lock); + unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) __acquires(lock); unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) __acquires(lock); -int __lockfunc _spin_trylock(spinlock_t *lock); int __lockfunc _read_trylock(rwlock_t *lock); int __lockfunc _write_trylock(rwlock_t *lock); -int __lockfunc _spin_trylock_bh(spinlock_t *lock); -void __lockfunc _spin_unlock(spinlock_t *lock) __releases(lock); + void __lockfunc _read_unlock(rwlock_t *lock) __releases(lock); void __lockfunc _write_unlock(rwlock_t *lock) __releases(lock); -void __lockfunc _spin_unlock_bh(spinlock_t *lock) __releases(lock); void __lockfunc _read_unlock_bh(rwlock_t *lock) __releases(lock); void __lockfunc _write_unlock_bh(rwlock_t *lock) __releases(lock); -void __lockfunc _spin_unlock_irq(spinlock_t *lock) __releases(lock); + void __lockfunc _read_unlock_irq(rwlock_t *lock) __releases(lock); void __lockfunc _write_unlock_irq(rwlock_t *lock) __releases(lock); -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) - __releases(lock); + void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) __releases(lock); void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) __releases(lock); +#endif #endif /* __LINUX_SPINLOCK_API_SMP_H */ diff --git a/include/linux/spinlock_api_up.h 
b/include/linux/spinlock_api_up.h index 04e1d31..208e474 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -16,7 +16,7 @@ #define in_lock_functions(ADDR) 0 -#define assert_spin_locked(lock) do { (void)(lock); } while (0) +#define assert_atomic_spin_locked(lock) do { (void)(lock); } while (0) /* * In the UP-nondebug case there's no real locking going on, so the @@ -40,7 +40,8 @@ do { preempt_enable(); __release(lock); (void)(lock); } while (0) #define __UNLOCK_BH(lock) \ - do { preempt_enable_no_resched(); local_bh_enable(); __release(lock); (void)(lock); } while (0) + do { __preempt_enable_no_resched(); local_bh_enable(); __release(lock); \ + (void)(lock); } while (0) #define __UNLOCK_IRQ(lock) \ do { local_irq_enable(); __UNLOCK(lock); } while (0) @@ -48,33 +49,35 @@ #define __UNLOCK_IRQRESTORE(lock, flags) \ do { local_irq_restore(flags); __UNLOCK(lock); } while (0) -#define _spin_lock(lock) __LOCK(lock) -#define _spin_lock_nested(lock, subclass) __LOCK(lock) +#define _atomic_spin_lock(lock) __LOCK(lock) +#define _atomic_spin_lock_nested(lock, subclass) \ + __LOCK(lock) #define _read_lock(lock) __LOCK(lock) #define _write_lock(lock) __LOCK(lock) -#define _spin_lock_bh(lock) __LOCK_BH(lock) +#define _atomic_spin_lock_bh(lock) __LOCK_BH(lock) #define _read_lock_bh(lock) __LOCK_BH(lock) #define _write_lock_bh(lock) __LOCK_BH(lock) -#define _spin_lock_irq(lock) __LOCK_IRQ(lock) +#define _atomic_spin_lock_irq(lock) __LOCK_IRQ(lock) #define _read_lock_irq(lock) __LOCK_IRQ(lock) #define _write_lock_irq(lock) __LOCK_IRQ(lock) -#define _spin_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) +#define _atomic_spin_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) #define _read_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) #define _write_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) -#define _spin_trylock(lock) ({ __LOCK(lock); 1; }) +#define _atomic_spin_trylock(lock) ({ __LOCK(lock); 1; }) #define 
_read_trylock(lock) ({ __LOCK(lock); 1; }) #define _write_trylock(lock) ({ __LOCK(lock); 1; }) -#define _spin_trylock_bh(lock) ({ __LOCK_BH(lock); 1; }) -#define _spin_unlock(lock) __UNLOCK(lock) +#define _atomic_spin_trylock_bh(lock) ({ __LOCK_BH(lock); 1; }) +#define _atomic_spin_unlock(lock) __UNLOCK(lock) #define _read_unlock(lock) __UNLOCK(lock) #define _write_unlock(lock) __UNLOCK(lock) -#define _spin_unlock_bh(lock) __UNLOCK_BH(lock) +#define _atomic_spin_unlock_bh(lock) __UNLOCK_BH(lock) #define _write_unlock_bh(lock) __UNLOCK_BH(lock) #define _read_unlock_bh(lock) __UNLOCK_BH(lock) -#define _spin_unlock_irq(lock) __UNLOCK_IRQ(lock) +#define _atomic_spin_unlock_irq(lock) __UNLOCK_IRQ(lock) #define _read_unlock_irq(lock) __UNLOCK_IRQ(lock) #define _write_unlock_irq(lock) __UNLOCK_IRQ(lock) -#define _spin_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) +#define _atomic_spin_unlock_irqrestore(lock, flags) \ + __UNLOCK_IRQRESTORE(lock, flags) #define _read_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) #define _write_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 68d88f7..a9278a9 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -9,6 +9,23 @@ * Released under the General Public License (GPL). 
*/ +/* + * Must define these before including other files, inline functions need them + */ +#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME + +#define LOCK_SECTION_START(extra) \ + ".subsection 1\n\t" \ + extra \ + ".ifndef " LOCK_SECTION_NAME "\n\t" \ + LOCK_SECTION_NAME ":\n\t" \ + ".endif\n" + +#define LOCK_SECTION_END \ + ".previous\n\t" + +#define __lockfunc __attribute__((section(".spinlock.text"))) + #if defined(CONFIG_SMP) # include <asm/spinlock_types.h> #else @@ -17,7 +34,7 @@ #include <linux/lockdep.h> -typedef struct { +typedef struct atomic_spinlock { raw_spinlock_t raw_lock; #ifdef CONFIG_GENERIC_LOCKBREAK unsigned int break_lock; @@ -29,26 +46,10 @@ typedef struct { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif -} spinlock_t; +} atomic_spinlock_t; #define SPINLOCK_MAGIC 0xdead4ead -typedef struct { - raw_rwlock_t raw_lock; -#ifdef CONFIG_GENERIC_LOCKBREAK - unsigned int break_lock; -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - unsigned int magic, owner_cpu; - void *owner; -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} rwlock_t; - -#define RWLOCK_MAGIC 0xdeaf1eed - #define SPINLOCK_OWNER_INIT ((void *)-1L) #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -57,44 +58,75 @@ typedef struct { # define SPIN_DEP_MAP_INIT(lockname) #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } -#else -# define RW_DEP_MAP_INIT(lockname) -#endif - #ifdef CONFIG_DEBUG_SPINLOCK -# define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ +# define __ATOMIC_SPIN_LOCK_UNLOCKED(lockname) \ + (atomic_spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ .magic = SPINLOCK_MAGIC, \ .owner = SPINLOCK_OWNER_INIT, \ .owner_cpu = -1, \ SPIN_DEP_MAP_INIT(lockname) } -#define __RW_LOCK_UNLOCKED(lockname) \ - (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ - .magic = RWLOCK_MAGIC, \ - .owner = SPINLOCK_OWNER_INIT, \ - .owner_cpu = -1, \ - 
RW_DEP_MAP_INIT(lockname) } #else -# define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ +# define __ATOMIC_SPIN_LOCK_UNLOCKED(lockname) \ + (atomic_spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ SPIN_DEP_MAP_INIT(lockname) } -#define __RW_LOCK_UNLOCKED(lockname) \ - (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ - RW_DEP_MAP_INIT(lockname) } #endif /* - * SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED defeat lockdep state tracking and - * are hence deprecated. - * Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or - * __SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate. + * DEFINE_ATOMIC_SPINLOCK(x) defines an atomic_spinlock_t named x and + * statically initializes it with __ATOMIC_SPIN_LOCK_UNLOCKED(x). + * + * The variable name is handed to the initializer, which forwards it + * to SPIN_DEP_MAP_INIT(). + */ +#define DEFINE_ATOMIC_SPINLOCK(x) \ + atomic_spinlock_t x = __ATOMIC_SPIN_LOCK_UNLOCKED(x) + +#ifndef CONFIG_PREEMPT_RT +/* + * For PREEMPT_RT=n we use the same data structures and the spinlock + * functions are mapped to the atomic_spinlock functions + */ +typedef struct spinlock { + raw_spinlock_t raw_lock; +#ifdef CONFIG_GENERIC_LOCKBREAK + unsigned int break_lock; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + unsigned int magic, owner_cpu; + void *owner; +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} spinlock_t; + +#ifdef CONFIG_DEBUG_SPINLOCK +# define __SPIN_LOCK_UNLOCKED(lockname) \ + (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ + .magic = SPINLOCK_MAGIC, \ + .owner = SPINLOCK_OWNER_INIT, \ + .owner_cpu = -1, \ + SPIN_DEP_MAP_INIT(lockname) } +#else +# define __SPIN_LOCK_UNLOCKED(lockname) \ + (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ + SPIN_DEP_MAP_INIT(lockname) } +#endif + +/* + * SPIN_LOCK_UNLOCKED defeats lockdep state tracking and is hence + * deprecated. + * + * Please use DEFINE_SPINLOCK() or __SPIN_LOCK_UNLOCKED() as + * appropriate.
*/ #define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) -#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init) -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) -#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) +#define __DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) +#define DEFINE_SPINLOCK(x) __DEFINE_SPINLOCK(x) + +#include <linux/rwlock_types.h> + +#endif #endif /* __LINUX_SPINLOCK_TYPES_H */ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index aca0eee..3c02050 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -27,6 +27,8 @@ #ifndef _LINUX_SRCU_H #define _LINUX_SRCU_H +#include <linux/wait.h> + struct srcu_struct_array { int c[2]; }; @@ -50,4 +52,24 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); long srcu_batches_completed(struct srcu_struct *sp); +/* + * fully compatible with srcu, but optimized for writers. + */ + +struct qrcu_struct { + int completed; + atomic_t ctr[2]; + wait_queue_head_t wq; + struct mutex mutex; +}; + +int init_qrcu_struct(struct qrcu_struct *qp); +int qrcu_read_lock(struct qrcu_struct *qp); +void qrcu_read_unlock(struct qrcu_struct *qp, int idx); +void synchronize_qrcu(struct qrcu_struct *qp); + +static inline void cleanup_qrcu_struct(struct qrcu_struct *qp) +{ +} + #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 80de700..a8e3782 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -64,6 +64,7 @@ struct perf_counter_attr; #include <linux/sem.h> #include <asm/siginfo.h> #include <asm/signal.h> +#include <linux/unistd.h> #include <linux/quota.h> #include <linux/key.h> #include <trace/syscall.h> @@ -97,6 +98,53 @@ struct perf_counter_attr; #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) #define __SC_TEST6(t6, a6, ...) 
__SC_TEST(t6); __SC_TEST5(__VA_ARGS__) +#ifdef CONFIG_EVENT_PROFILE +#define TRACE_SYS_ENTER_PROFILE(sname) \ +static int prof_sysenter_enable_##sname(struct ftrace_event_call *event_call) \ +{ \ + int ret = 0; \ + if (!atomic_inc_return(&event_enter_##sname.profile_count)) \ + ret = reg_prof_syscall_enter("sys"#sname); \ + return ret; \ +} \ + \ +static void prof_sysenter_disable_##sname(struct ftrace_event_call *event_call)\ +{ \ + if (atomic_add_negative(-1, &event_enter_##sname.profile_count)) \ + unreg_prof_syscall_enter("sys"#sname); \ +} + +#define TRACE_SYS_EXIT_PROFILE(sname) \ +static int prof_sysexit_enable_##sname(struct ftrace_event_call *event_call) \ +{ \ + int ret = 0; \ + if (!atomic_inc_return(&event_exit_##sname.profile_count)) \ + ret = reg_prof_syscall_exit("sys"#sname); \ + return ret; \ +} \ + \ +static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \ +{ \ + if (atomic_add_negative(-1, &event_exit_##sname.profile_count)) \ + unreg_prof_syscall_exit("sys"#sname); \ +} + +#define TRACE_SYS_ENTER_PROFILE_INIT(sname) \ + .profile_count = ATOMIC_INIT(-1), \ + .profile_enable = prof_sysenter_enable_##sname, \ + .profile_disable = prof_sysenter_disable_##sname, + +#define TRACE_SYS_EXIT_PROFILE_INIT(sname) \ + .profile_count = ATOMIC_INIT(-1), \ + .profile_enable = prof_sysexit_enable_##sname, \ + .profile_disable = prof_sysexit_disable_##sname, +#else +#define TRACE_SYS_ENTER_PROFILE(sname) +#define TRACE_SYS_ENTER_PROFILE_INIT(sname) +#define TRACE_SYS_EXIT_PROFILE(sname) +#define TRACE_SYS_EXIT_PROFILE_INIT(sname) +#endif + #ifdef CONFIG_FTRACE_SYSCALLS #define __SC_STR_ADECL1(t, a) #a #define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) @@ -112,7 +160,81 @@ struct perf_counter_attr; #define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) #define __SC_STR_TDECL6(t, a, ...) 
#t, __SC_STR_TDECL5(__VA_ARGS__) +#define SYSCALL_TRACE_ENTER_EVENT(sname) \ + static struct ftrace_event_call event_enter_##sname; \ + struct trace_event enter_syscall_print_##sname = { \ + .trace = print_syscall_enter, \ + }; \ + static int init_enter_##sname(void) \ + { \ + int num, id; \ + num = syscall_name_to_nr("sys"#sname); \ + if (num < 0) \ + return -ENOSYS; \ + id = register_ftrace_event(&enter_syscall_print_##sname);\ + if (!id) \ + return -ENODEV; \ + event_enter_##sname.id = id; \ + set_syscall_enter_id(num, id); \ + INIT_LIST_HEAD(&event_enter_##sname.fields); \ + return 0; \ + } \ + TRACE_SYS_ENTER_PROFILE(sname); \ + static struct ftrace_event_call __used \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_events"))) \ + event_enter_##sname = { \ + .name = "sys_enter"#sname, \ + .system = "syscalls", \ + .event = &event_syscall_enter, \ + .raw_init = init_enter_##sname, \ + .show_format = syscall_enter_format, \ + .define_fields = syscall_enter_define_fields, \ + .regfunc = reg_event_syscall_enter, \ + .unregfunc = unreg_event_syscall_enter, \ + .data = "sys"#sname, \ + TRACE_SYS_ENTER_PROFILE_INIT(sname) \ + } + +#define SYSCALL_TRACE_EXIT_EVENT(sname) \ + static struct ftrace_event_call event_exit_##sname; \ + struct trace_event exit_syscall_print_##sname = { \ + .trace = print_syscall_exit, \ + }; \ + static int init_exit_##sname(void) \ + { \ + int num, id; \ + num = syscall_name_to_nr("sys"#sname); \ + if (num < 0) \ + return -ENOSYS; \ + id = register_ftrace_event(&exit_syscall_print_##sname);\ + if (!id) \ + return -ENODEV; \ + event_exit_##sname.id = id; \ + set_syscall_exit_id(num, id); \ + INIT_LIST_HEAD(&event_exit_##sname.fields); \ + return 0; \ + } \ + TRACE_SYS_EXIT_PROFILE(sname); \ + static struct ftrace_event_call __used \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_events"))) \ + event_exit_##sname = { \ + .name = "sys_exit"#sname, \ + .system = "syscalls", \ + .event = 
&event_syscall_exit, \ + .raw_init = init_exit_##sname, \ + .show_format = syscall_exit_format, \ + .define_fields = syscall_exit_define_fields, \ + .regfunc = reg_event_syscall_exit, \ + .unregfunc = unreg_event_syscall_exit, \ + .data = "sys"#sname, \ + TRACE_SYS_EXIT_PROFILE_INIT(sname) \ + } + #define SYSCALL_METADATA(sname, nb) \ + SYSCALL_TRACE_ENTER_EVENT(sname); \ + SYSCALL_TRACE_EXIT_EVENT(sname); \ static const struct syscall_metadata __used \ __attribute__((__aligned__(4))) \ __attribute__((section("__syscalls_metadata"))) \ @@ -121,18 +243,23 @@ struct perf_counter_attr; .nb_args = nb, \ .types = types_##sname, \ .args = args_##sname, \ - } + .enter_event = &event_enter_##sname, \ + .exit_event = &event_exit_##sname, \ + }; #define SYSCALL_DEFINE0(sname) \ + SYSCALL_TRACE_ENTER_EVENT(_##sname); \ + SYSCALL_TRACE_EXIT_EVENT(_##sname); \ static const struct syscall_metadata __used \ __attribute__((__aligned__(4))) \ __attribute__((section("__syscalls_metadata"))) \ __syscall_meta_##sname = { \ .name = "sys_"#sname, \ .nb_args = 0, \ + .enter_event = &event_enter__##sname, \ + .exit_event = &event_exit__##sname, \ }; \ asmlinkage long sys_##sname(void) - #else #define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) #endif diff --git a/include/linux/time.h b/include/linux/time.h index ea16c1a..2c4acaa 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -75,7 +75,7 @@ extern unsigned long mktime(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec); -extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec); +extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); extern struct timespec timespec_add_safe(const struct timespec lhs, const struct timespec rhs); @@ -99,7 +99,7 @@ static inline struct timespec timespec_sub(struct timespec lhs, extern struct timespec xtime; extern struct timespec 
wall_to_monotonic; -extern seqlock_t xtime_lock; +extern atomic_seqlock_t xtime_lock; extern unsigned long read_persistent_clock(void); extern int update_persistent_clock(struct timespec now); diff --git a/include/linux/timer.h b/include/linux/timer.h index be62ec2..0f3c593 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -230,10 +230,12 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer) extern void add_timer(struct timer_list *timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) + extern int timer_pending_sync(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); extern int del_timer_sync(struct timer_list *timer); #else +# define timer_pending_sync(t) timer_pending(t) # define try_to_del_timer_sync(t) del_timer(t) # define del_timer_sync(t) del_timer(t) #endif diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index b9dc4ca..63a3f7a 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -23,6 +23,8 @@ struct tracepoint; struct tracepoint { const char *name; /* Tracepoint name */ int state; /* State. 
*/ + void (*regfunc)(void); + void (*unregfunc)(void); void **funcs; } __attribute__((aligned(32))); /* * Aligned on 32 bytes because it is @@ -78,12 +80,16 @@ struct tracepoint { return tracepoint_probe_unregister(#name, (void *)probe);\ } -#define DEFINE_TRACE(name) \ + +#define DEFINE_TRACE_FN(name, reg, unreg) \ static const char __tpstrtab_##name[] \ __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ __attribute__((section("__tracepoints"), aligned(32))) = \ - { __tpstrtab_##name, 0, NULL } + { __tpstrtab_##name, 0, reg, unreg, NULL } + +#define DEFINE_TRACE(name) \ + DEFINE_TRACE_FN(name, NULL, NULL); #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ EXPORT_SYMBOL_GPL(__tracepoint_##name) @@ -108,6 +114,7 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin, return -ENOSYS; \ } +#define DEFINE_TRACE_FN(name, reg, unreg) #define DEFINE_TRACE(name) #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) #define EXPORT_TRACEPOINT_SYMBOL(name) @@ -158,6 +165,15 @@ static inline void tracepoint_synchronize_unregister(void) #define PARAMS(args...) args +#endif /* _LINUX_TRACEPOINT_H */ + +/* + * Note: we keep the TRACE_EVENT outside the include file ifdef protection. + * This is due to the way trace events work. If a file includes two + * trace event headers under one "CREATE_TRACE_POINTS" the first include + * will override the TRACE_EVENT and break the second include. + */ + #ifndef TRACE_EVENT /* * For use with the TRACE_EVENT macro: @@ -259,10 +275,15 @@ static inline void tracepoint_synchronize_unregister(void) * can also by used by generic instrumentation like SystemTap), and * it is also used to expose a structured trace record in * /sys/kernel/debug/tracing/events/. + * + * A set of (un)registration functions can be passed to the variant + * TRACE_EVENT_FN to perform any (un)registration work. 
*/ #define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#endif +#define TRACE_EVENT_FN(name, proto, args, struct, \ + assign, print, reg, unreg) \ + DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#endif +#endif /* ifdef TRACE_EVENT (see note above) */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 6b58367..0c7aabb 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -6,37 +6,10 @@ /* * These routines enable/disable the pagefault handler in that - * it will not take any locks and go straight to the fixup table. - * - * They have great resemblance to the preempt_disable/enable calls - * and in fact they are identical; this is because currently there is - * no other way to make the pagefault handlers do this. So we do - * disable preemption but we don't necessarily care about that. + * it will not take any MM locks and go straight to the fixup table. */ -static inline void pagefault_disable(void) -{ - inc_preempt_count(); - /* - * make sure to have issued the store before a pagefault - * can hit. - */ - barrier(); -} - -static inline void pagefault_enable(void) -{ - /* - * make sure to issue those last loads/stores before enabling - * the pagefault handler again. - */ - barrier(); - dec_preempt_count(); - /* - * make sure we do.. 
- */ - barrier(); - preempt_check_resched(); -} +extern void pagefault_disable(void); +extern void pagefault_enable(void); #ifndef ARCH_HAS_NOCACHE_UACCESS diff --git a/include/linux/usb.h b/include/linux/usb.h index b1e3c2f..ee0c481 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -529,9 +529,9 @@ extern struct usb_device *usb_get_dev(struct usb_device *dev); extern void usb_put_dev(struct usb_device *dev); /* USB device locking */ -#define usb_lock_device(udev) down(&(udev)->dev.sem) -#define usb_unlock_device(udev) up(&(udev)->dev.sem) -#define usb_trylock_device(udev) down_trylock(&(udev)->dev.sem) +#define usb_lock_device(udev) mutex_lock(&(udev)->dev.mutex) +#define usb_unlock_device(udev) mutex_unlock(&(udev)->dev.mutex) +#define usb_trylock_device(udev) mutex_trylock(&(udev)->dev.mutex) extern int usb_lock_device_for_reset(struct usb_device *udev, const struct usb_interface *iface); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 81a97cf..49f5691 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -76,7 +76,12 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states); static inline void __count_vm_event(enum vm_event_item item) { +#ifdef CONFIG_PREEMPT_RT + get_cpu_var(vm_event_states).event[item]++; + put_cpu_var(vm_event_states); +#else __get_cpu_var(vm_event_states).event[item]++; +#endif } static inline void count_vm_event(enum vm_event_item item) @@ -87,7 +92,12 @@ static inline void count_vm_event(enum vm_event_item item) static inline void __count_vm_events(enum vm_event_item item, long delta) { +#ifdef CONFIG_PREEMPT_RT + get_cpu_var(vm_event_states).event[item] += delta; + put_cpu_var(vm_event_states); +#else __get_cpu_var(vm_event_states).event[item] += delta; +#endif } static inline void count_vm_events(enum vm_event_item item, long delta) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 6273fa9..2ab0f9f 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -195,6 +195,9 @@ 
__create_workqueue_key(const char *name, int singlethread, #define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1, 0) #define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0, 0) +extern void set_workqueue_prio(struct workqueue_struct *wq, int policy, + int rt_priority, int nice); + extern void destroy_workqueue(struct workqueue_struct *wq); extern int queue_work(struct workqueue_struct *wq, struct work_struct *work); diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c4ca422..e8b60f2 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -117,7 +117,7 @@ struct hci_dev { struct sk_buff *sent_cmd; struct sk_buff *reassembly[3]; - struct semaphore req_lock; + struct mutex req_lock; wait_queue_head_t req_wait_q; __u32 req_status; __u32 req_result; @@ -700,8 +700,8 @@ struct hci_sec_filter { #define HCI_REQ_PEND 1 #define HCI_REQ_CANCELED 2 -#define hci_req_lock(d) down(&d->req_lock) -#define hci_req_unlock(d) up(&d->req_lock) +#define hci_req_lock(d) mutex_lock(&(d)->req_lock) +#define hci_req_unlock(d) mutex_unlock(&(d)->req_lock) void hci_req_complete(struct hci_dev *hdev, int result); diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index f7a7ae1..2a4b3bf 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -26,6 +26,11 @@ #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ DEFINE_TRACE(name) +#undef TRACE_EVENT_FN +#define TRACE_EVENT_FN(name, proto, args, tstruct, \ + assign, print, reg, unreg) \ + DEFINE_TRACE_FN(name, reg, unreg) + #undef DECLARE_TRACE #define DECLARE_TRACE(name, proto, args) \ DEFINE_TRACE(name) @@ -56,6 +61,8 @@ #include <trace/ftrace.h> #endif +#undef TRACE_EVENT +#undef TRACE_EVENT_FN #undef TRACE_HEADER_MULTI_READ /* Only undef what we defined in this file */ diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 9a74b46..d86af94 100644 --- 
a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -171,6 +171,7 @@ TRACE_EVENT(block_rq_complete, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->errors) ); + TRACE_EVENT(block_bio_bounce, TP_PROTO(struct request_queue *q, struct bio *bio), @@ -186,7 +187,8 @@ TRACE_EVENT(block_bio_bounce, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio->bi_bdev ? + bio->bi_bdev->bd_dev : 0; __entry->sector = bio->bi_sector; __entry->nr_sector = bio->bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h new file mode 100644 index 0000000..73b0454 --- /dev/null +++ b/include/trace/events/hist.h @@ -0,0 +1,37 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hist + +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HIST_H + +#include "latency_hist.h" +#include <linux/tracepoint.h> + +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST) +#define trace_preemptirqsoff_hist(a,b) +#else +TRACE_EVENT(preemptirqsoff_hist, + + TP_PROTO(int reason, int starthist), + + TP_ARGS(reason, starthist), + + TP_STRUCT__entry( + __field( int, reason ) + __field( int, starthist ) + ), + + TP_fast_assign( + __entry->reason = reason; + __entry->starthist = starthist; + ), + + TP_printk("reason=%s starthist=%s", getaction(__entry->reason), + __entry->starthist ? 
"start" : "stop") +); +#endif + +#endif /* _TRACE_HIST_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h new file mode 100644 index 0000000..d6b5d77 --- /dev/null +++ b/include/trace/events/latency_hist.h @@ -0,0 +1,30 @@ +#ifndef _LATENCY_HIST_H +#define _LATENCY_HIST_H + +enum hist_action { + IRQS_ON, + PREEMPT_ON, + TRACE_STOP, + IRQS_OFF, + PREEMPT_OFF, + TRACE_START, +}; + +static char *actions[] = { + "IRQS_ON", + "PREEMPT_ON", + "TRACE_STOP", + "IRQS_OFF", + "PREEMPT_OFF", + "TRACE_START", +}; + +static inline char *getaction(int action) +{ + if (action >= 0 && action < sizeof(actions)/sizeof(actions[0])) /* valid indices stop one below the table size */ + return(actions[action]); + return("unknown"); +} + +#endif /* _LATENCY_HIST_H */ + diff --git a/include/trace/events/module.h b/include/trace/events/module.h new file mode 100644 index 0000000..84160fb --- /dev/null +++ b/include/trace/events/module.h @@ -0,0 +1,126 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM module + +#if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MODULE_H + +#include <linux/tracepoint.h> + +#ifdef CONFIG_MODULES + +struct module; + +#define show_module_flags(flags) __print_flags(flags, "", \ + { (1UL << TAINT_PROPRIETARY_MODULE), "P" }, \ + { (1UL << TAINT_FORCED_MODULE), "F" }, \ + { (1UL << TAINT_CRAP), "C" }) + +TRACE_EVENT(module_load, + + TP_PROTO(struct module *mod), + + TP_ARGS(mod), + + TP_STRUCT__entry( + __field( unsigned int, taints ) + __string( name, mod->name ) + ), + + TP_fast_assign( + __entry->taints = mod->taints; + __assign_str(name, mod->name); + ), + + TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints)) +); + +TRACE_EVENT(module_free, + + TP_PROTO(struct module *mod), + + TP_ARGS(mod), + + TP_STRUCT__entry( + __string( name, mod->name ) + ), + + TP_fast_assign( + __assign_str(name, mod->name); + ), + + TP_printk("%s", __get_str(name)) +); + 
+TRACE_EVENT(module_get, + + TP_PROTO(struct module *mod, unsigned long ip, int refcnt), + + TP_ARGS(mod, ip, refcnt), + + TP_STRUCT__entry( + __field( unsigned long, ip ) + __field( int, refcnt ) + __string( name, mod->name ) + ), + + TP_fast_assign( + __entry->ip = ip; + __entry->refcnt = refcnt; + __assign_str(name, mod->name); + ), + + TP_printk("%s call_site=%pf refcnt=%d", + __get_str(name), (void *)__entry->ip, __entry->refcnt) +); + +TRACE_EVENT(module_put, + + TP_PROTO(struct module *mod, unsigned long ip, int refcnt), + + TP_ARGS(mod, ip, refcnt), + + TP_STRUCT__entry( + __field( unsigned long, ip ) + __field( int, refcnt ) + __string( name, mod->name ) + ), + + TP_fast_assign( + __entry->ip = ip; + __entry->refcnt = refcnt; + __assign_str(name, mod->name); + ), + + TP_printk("%s call_site=%pf refcnt=%d", + __get_str(name), (void *)__entry->ip, __entry->refcnt) +); + +TRACE_EVENT(module_request, + + TP_PROTO(char *name, bool wait, unsigned long ip), + + TP_ARGS(name, wait, ip), + + TP_STRUCT__entry( + __field( bool, wait ) + __field( unsigned long, ip ) + __string( name, name ) + ), + + TP_fast_assign( + __entry->wait = wait; + __entry->ip = ip; + __assign_str(name, name); + ), + + TP_printk("%s wait=%d call_site=%pf", + __get_str(name), (int)__entry->wait, (void *)__entry->ip) +); + +#endif /* CONFIG_MODULES */ + +#endif /* _TRACE_MODULE_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> + diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8949bb7..7e1ee53 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -94,6 +94,7 @@ TRACE_EVENT(sched_wakeup, __field( pid_t, pid ) __field( int, prio ) __field( int, success ) + __field( int, cpu ) ), TP_fast_assign( @@ -101,11 +102,12 @@ TRACE_EVENT(sched_wakeup, __entry->pid = p->pid; __entry->prio = p->prio; __entry->success = success; + __entry->cpu = task_cpu(p); ), - TP_printk("task %s:%d [%d] success=%d", + TP_printk("task 
%s:%d [%d] success=%d [%03d]", __entry->comm, __entry->pid, __entry->prio, - __entry->success) + __entry->success, __entry->cpu) ); /* @@ -125,6 +127,7 @@ TRACE_EVENT(sched_wakeup_new, __field( pid_t, pid ) __field( int, prio ) __field( int, success ) + __field( int, cpu ) ), TP_fast_assign( @@ -132,11 +135,12 @@ TRACE_EVENT(sched_wakeup_new, __entry->pid = p->pid; __entry->prio = p->prio; __entry->success = success; + __entry->cpu = task_cpu(p); ), - TP_printk("task %s:%d [%d] success=%d", + TP_printk("task %s:%d [%d] success=%d [%03d]", __entry->comm, __entry->pid, __entry->prio, - __entry->success) + __entry->success, __entry->cpu) ); /* @@ -263,6 +267,37 @@ TRACE_EVENT(sched_process_exit, ); /* + * Tracepoint for priority boosting/deboosting of a task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_task_setprio, + + TP_PROTO(struct rq *rq, struct task_struct *p, int oldprio), + + TP_ARGS(rq, p, oldprio), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, oldprio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->oldprio = oldprio; + ), + + TP_printk("task %s:%d [%d] oldprio=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->oldprio) +); + +/* * Tracepoint for a waiting task: */ TRACE_EVENT(sched_process_wait, diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h new file mode 100644 index 0000000..397dff2 --- /dev/null +++ b/include/trace/events/syscalls.h @@ -0,0 +1,70 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM syscalls + +#if !defined(_TRACE_EVENTS_SYSCALLS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EVENTS_SYSCALLS_H + +#include <linux/tracepoint.h> + +#include <asm/ptrace.h> +#include <asm/syscall.h> + + +#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS + +extern void 
syscall_regfunc(void); +extern void syscall_unregfunc(void); + +TRACE_EVENT_FN(sys_enter, + + TP_PROTO(struct pt_regs *regs, long id), + + TP_ARGS(regs, id), + + TP_STRUCT__entry( + __field( long, id ) + __array( unsigned long, args, 6 ) + ), + + TP_fast_assign( + __entry->id = id; + syscall_get_arguments(current, regs, 0, 6, __entry->args); + ), + + TP_printk("NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)", + __entry->id, + __entry->args[0], __entry->args[1], __entry->args[2], + __entry->args[3], __entry->args[4], __entry->args[5]), + + syscall_regfunc, syscall_unregfunc +); + +TRACE_EVENT_FN(sys_exit, + + TP_PROTO(struct pt_regs *regs, long ret), + + TP_ARGS(regs, ret), + + TP_STRUCT__entry( + __field( long, id ) + __field( long, ret ) + ), + + TP_fast_assign( + __entry->id = syscall_get_nr(current, regs); + __entry->ret = ret; + ), + + TP_printk("NR %ld = %ld", + __entry->id, __entry->ret), + + syscall_regfunc, syscall_unregfunc +); + +#endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */ + +#endif /* _TRACE_EVENTS_SYSCALLS_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> + diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index f64fbaa..72a3b43 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -21,11 +21,14 @@ #undef __field #define __field(type, item) type item; +#undef __field_ext +#define __field_ext(type, item, filter_type) type item; + #undef __array #define __array(type, item, len) type item[len]; #undef __dynamic_array -#define __dynamic_array(type, item, len) unsigned short __data_loc_##item; +#define __dynamic_array(type, item, len) u32 __data_loc_##item; #undef __string #define __string(item, src) __dynamic_array(char, item, -1) @@ -42,6 +45,16 @@ }; \ static struct ftrace_event_call event_##name +#undef __cpparg +#define __cpparg(arg...) arg + +/* Callbacks are meaningless to ftrace. 
*/ +#undef TRACE_EVENT_FN +#define TRACE_EVENT_FN(name, proto, args, tstruct, \ + assign, print, reg, unreg) \ + TRACE_EVENT(name, __cpparg(proto), __cpparg(args), \ + __cpparg(tstruct), __cpparg(assign), __cpparg(print)) \ + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) @@ -51,23 +64,27 @@ * Include the following: * * struct ftrace_data_offsets_<call> { - * int <item1>; - * int <item2>; + * u32 <item1>; + * u32 <item2>; * [...] * }; * - * The __dynamic_array() macro will create each int <item>, this is + * The __dynamic_array() macro will create each u32 <item>, this is * to keep the offset of each array from the beginning of the event. + * The size of an array is also encoded, in the higher 16 bits of <item>. */ #undef __field -#define __field(type, item); +#define __field(type, item) + +#undef __field_ext +#define __field_ext(type, item, filter_type) #undef __array #define __array(type, item, len) #undef __dynamic_array -#define __dynamic_array(type, item, len) int item; +#define __dynamic_array(type, item, len) u32 item; #undef __string #define __string(item, src) __dynamic_array(char, item, -1) @@ -109,6 +126,9 @@ if (!ret) \ return 0; +#undef __field_ext +#define __field_ext(type, item, filter_type) __field(type, item) + #undef __array #define __array(type, item, len) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ @@ -120,7 +140,7 @@ #undef __dynamic_array #define __dynamic_array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t" \ + ret = trace_seq_printf(s, "\tfield:__data_loc " #type "[] " #item ";\t"\ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), \ __data_loc_##item), \ @@ -150,7 +170,8 @@ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ -ftrace_format_##call(struct trace_seq *s) \ +ftrace_format_##call(struct ftrace_event_call *unused, \ + struct trace_seq *s) \ { \ struct ftrace_raw_##call field __attribute__((unused)); \ int ret = 0; \ 
@@ -210,7 +231,7 @@ ftrace_format_##call(struct trace_seq *s) \ #undef __get_dynamic_array #define __get_dynamic_array(field) \ - ((void *)__entry + __entry->__data_loc_##field) + ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) #undef __get_str #define __get_str(field) (char *)__get_dynamic_array(field) @@ -218,9 +239,9 @@ ftrace_format_##call(struct trace_seq *s) \ #undef __print_flags #define __print_flags(flag, delim, flag_array...) \ ({ \ - static const struct trace_print_flags flags[] = \ + static const struct trace_print_flags __flags[] = \ { flag_array, { -1, NULL }}; \ - ftrace_print_flags_seq(p, delim, flag, flags); \ + ftrace_print_flags_seq(p, delim, flag, __flags); \ }) #undef __print_symbolic @@ -233,7 +254,7 @@ ftrace_format_##call(struct trace_seq *s) \ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -enum print_line_t \ +static enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ { \ struct trace_seq *s = &iter->seq; \ @@ -263,46 +284,48 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef __field -#define __field(type, item) \ +#undef __field_ext +#define __field_ext(type, item, filter_type) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ + sizeof(field.item), \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; +#undef __field +#define __field(type, item) __field_ext(type, item, FILTER_OTHER) + #undef __array #define __array(type, item, len) \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), 0); \ + sizeof(field.item), 0, FILTER_OTHER); \ if (ret) \ return ret; #undef __dynamic_array #define __dynamic_array(type, item, len) \ - ret = trace_define_field(event_call, "__data_loc" "[" #type "]", 
#item,\ - offsetof(typeof(field), __data_loc_##item), \ - sizeof(field.__data_loc_##item), 0); + ret = trace_define_field(event_call, "__data_loc " #type "[]", #item, \ + offsetof(typeof(field), __data_loc_##item), \ + sizeof(field.__data_loc_##item), 0, \ + FILTER_OTHER); #undef __string #define __string(item, src) __dynamic_array(char, item, -1) #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -int \ -ftrace_define_fields_##call(void) \ +static int \ +ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ { \ struct ftrace_raw_##call field; \ - struct ftrace_event_call *event_call = &event_##call; \ int ret; \ \ - __common_field(int, type, 1); \ - __common_field(unsigned char, flags, 0); \ - __common_field(unsigned char, preempt_count, 0); \ - __common_field(int, pid, 1); \ - __common_field(int, tgid, 1); \ + ret = trace_define_common_fields(event_call); \ + if (ret) \ + return ret; \ \ tstruct; \ \ @@ -321,6 +344,9 @@ ftrace_define_fields_##call(void) \ #undef __field #define __field(type, item) +#undef __field_ext +#define __field_ext(type, item, filter_type) + #undef __array #define __array(type, item, len) @@ -328,6 +354,7 @@ ftrace_define_fields_##call(void) \ #define __dynamic_array(type, item, len) \ __data_offsets->item = __data_size + \ offsetof(typeof(*entry), __data); \ + __data_offsets->item |= (len * sizeof(type)) << 16; \ __data_size += (len) * sizeof(type); #undef __string @@ -433,13 +460,15 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ * { * struct ring_buffer_event *event; * struct ftrace_raw_<call> *entry; <-- defined in stage 1 + * struct ring_buffer *buffer; * unsigned long irq_flags; * int pc; * * local_save_flags(irq_flags); * pc = preempt_count(); * - * event = trace_current_buffer_lock_reserve(event_<call>.id, + * event = trace_current_buffer_lock_reserve(&buffer, + * event_<call>.id, * sizeof(struct ftrace_raw_<call>), * irq_flags, pc); * if (!event) @@ 
-449,7 +478,7 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ * <assign>; <-- Here we assign the entries by the __field and * __array macros. * - * trace_current_buffer_unlock_commit(event, irq_flags, pc); + * trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc); * } * * static int ftrace_raw_reg_event_<call>(void) @@ -541,6 +570,7 @@ static void ftrace_raw_event_##call(proto) \ struct ftrace_event_call *event_call = &event_##call; \ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ + struct ring_buffer *buffer; \ unsigned long irq_flags; \ int __data_size; \ int pc; \ @@ -550,7 +580,8 @@ static void ftrace_raw_event_##call(proto) \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ \ - event = trace_current_buffer_lock_reserve(event_##call.id, \ + event = trace_current_buffer_lock_reserve(&buffer, \ + event_##call.id, \ sizeof(*entry) + __data_size, \ irq_flags, pc); \ if (!event) \ @@ -562,11 +593,12 @@ static void ftrace_raw_event_##call(proto) \ \ { assign; } \ \ - if (!filter_current_check_discard(event_call, entry, event)) \ - trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ + if (!filter_current_check_discard(buffer, event_call, entry, event)) \ + trace_nowake_buffer_unlock_commit(buffer, \ + event, irq_flags, pc); \ } \ \ -static int ftrace_raw_reg_event_##call(void) \ +static int ftrace_raw_reg_event_##call(void *ptr) \ { \ int ret; \ \ @@ -577,7 +609,7 @@ static int ftrace_raw_reg_event_##call(void) \ return ret; \ } \ \ -static void ftrace_raw_unreg_event_##call(void) \ +static void ftrace_raw_unreg_event_##call(void *ptr) \ { \ unregister_trace_##call(ftrace_raw_event_##call); \ } \ @@ -595,7 +627,6 @@ static int ftrace_raw_init_event_##call(void) \ return -ENODEV; \ event_##call.id = id; \ INIT_LIST_HEAD(&event_##call.fields); \ - init_preds(&event_##call); \ return 0; \ } \ \ diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 
8cfe515..5dc283b 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -1,8 +1,13 @@ #ifndef _TRACE_SYSCALL_H #define _TRACE_SYSCALL_H +#include <linux/tracepoint.h> +#include <linux/unistd.h> +#include <linux/ftrace_event.h> + #include <asm/ptrace.h> + /* * A syscall entry in the ftrace syscalls array. * @@ -10,26 +15,49 @@ * @nb_args: number of parameters it takes * @types: list of types as strings * @args: list of args as strings (args[i] matches types[i]) + * @enter_id: associated ftrace enter event id + * @exit_id: associated ftrace exit event id + * @enter_event: associated syscall_enter trace event + * @exit_event: associated syscall_exit trace event */ struct syscall_metadata { const char *name; int nb_args; const char **types; const char **args; + int enter_id; + int exit_id; + + struct ftrace_event_call *enter_event; + struct ftrace_event_call *exit_event; }; #ifdef CONFIG_FTRACE_SYSCALLS -extern void arch_init_ftrace_syscalls(void); extern struct syscall_metadata *syscall_nr_to_meta(int nr); -extern void start_ftrace_syscalls(void); -extern void stop_ftrace_syscalls(void); -extern void ftrace_syscall_enter(struct pt_regs *regs); -extern void ftrace_syscall_exit(struct pt_regs *regs); -#else -static inline void start_ftrace_syscalls(void) { } -static inline void stop_ftrace_syscalls(void) { } -static inline void ftrace_syscall_enter(struct pt_regs *regs) { } -static inline void ftrace_syscall_exit(struct pt_regs *regs) { } +extern int syscall_name_to_nr(char *name); +void set_syscall_enter_id(int num, int id); +void set_syscall_exit_id(int num, int id); +extern struct trace_event event_syscall_enter; +extern struct trace_event event_syscall_exit; +extern int reg_event_syscall_enter(void *ptr); +extern void unreg_event_syscall_enter(void *ptr); +extern int reg_event_syscall_exit(void *ptr); +extern void unreg_event_syscall_exit(void *ptr); +extern int syscall_enter_format(struct ftrace_event_call *call, + struct trace_seq *s); +extern int 
syscall_exit_format(struct ftrace_event_call *call, + struct trace_seq *s); +extern int syscall_enter_define_fields(struct ftrace_event_call *call); +extern int syscall_exit_define_fields(struct ftrace_event_call *call); +enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags); +enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags); +#endif +#ifdef CONFIG_EVENT_PROFILE +int reg_prof_syscall_enter(char *name); +void unreg_prof_syscall_enter(char *name); +int reg_prof_syscall_exit(char *name); +void unreg_prof_syscall_exit(char *name); + #endif #endif /* _TRACE_SYSCALL_H */ diff --git a/init/Kconfig b/init/Kconfig index 3f7e609..ef7c8db 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -318,6 +318,7 @@ choice config CLASSIC_RCU bool "Classic RCU" + depends on !PREEMPT_RT help This option selects the classic RCU implementation that is designed for best read-side performance on non-realtime @@ -327,6 +328,7 @@ config CLASSIC_RCU config TREE_RCU bool "Tree-based hierarchical RCU" + depends on !PREEMPT_RT help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -1039,6 +1041,7 @@ config SLAB config SLUB bool "SLUB (Unqueued Allocator)" + depends on !PREEMPT_RT help SLUB is a slab allocator that minimizes cache line usage instead of managing queues of cached objects (SLAB approach). @@ -1072,6 +1075,7 @@ config TRACEPOINTS config MARKERS bool "Activate markers" + depends on !PREEMPT_RT select TRACEPOINTS help Place an empty function call at each marker site. 
Can be diff --git a/init/Makefile b/init/Makefile index 4a243df..3f6e894 100644 --- a/init/Makefile +++ b/init/Makefile @@ -33,4 +33,5 @@ silent_chk_compile.h = : include/linux/compile.h: FORCE @$($(quiet)chk_compile.h) $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)" + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT)" \ + "$(CC) $(KBUILD_CFLAGS)" diff --git a/init/main.c b/init/main.c index 11f4f14..bdbdaab 100644 --- a/init/main.c +++ b/init/main.c @@ -37,6 +37,7 @@ #include <linux/workqueue.h> #include <linux/profile.h> #include <linux/rcupdate.h> +#include <linux/posix-timers.h> #include <linux/moduleparam.h> #include <linux/kallsyms.h> #include <linux/writeback.h> @@ -451,6 +452,8 @@ static noinline void __init_refok rest_init(void) { int pid; + system_state = SYSTEM_BOOTING_SCHEDULER_OK; + kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); numa_default_policy(); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); @@ -463,8 +466,7 @@ static noinline void __init_refok rest_init(void) */ init_idle_bootup_task(current); rcu_scheduler_starting(); - preempt_enable_no_resched(); - schedule(); + preempt_enable_and_schedule(); preempt_disable(); /* Call into cpu_idle with preempt disabled */ @@ -715,6 +717,9 @@ asmlinkage void __init start_kernel(void) ftrace_init(); +#ifdef CONFIG_PREEMPT_RT + WARN_ON(irqs_disabled()); +#endif /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -818,9 +823,11 @@ static void __init do_basic_setup(void) static void __init do_pre_smp_initcalls(void) { initcall_t *call; + extern int spawn_desched_task(void); for (call = __initcall_start; call < __early_initcall_end; call++) do_one_initcall(*call); + spawn_desched_task(); } static void run_init_process(char *init_filename) @@ -856,6 +863,9 @@ static noinline int init_post(void) printk(KERN_WARNING "Failed to execute %s\n", 
ramdisk_execute_command); } +#ifdef CONFIG_PREEMPT_RT + WARN_ON(irqs_disabled()); +#endif /* * We try each of these until one succeeds. @@ -922,7 +932,60 @@ static int __init kernel_init(void * unused) ramdisk_execute_command = NULL; prepare_namespace(); } +#ifdef CONFIG_PREEMPT_RT + WARN_ON(irqs_disabled()); +#endif + +#define DEBUG_COUNT (defined(CONFIG_DEBUG_RT_MUTEXES) + defined(CONFIG_IRQSOFF_TRACER) + defined(CONFIG_PREEMPT_TRACER) + defined(CONFIG_STACK_TRACER) + defined(CONFIG_INTERRUPT_OFF_HIST) + defined(CONFIG_PREEMPT_OFF_HIST) + defined(CONFIG_WAKEUP_LATENCY_HIST) + defined(CONFIG_DEBUG_SLAB) + defined(CONFIG_DEBUG_PAGEALLOC) + defined(CONFIG_LOCKDEP) + (defined(CONFIG_FTRACE) - defined(CONFIG_FTRACE_MCOUNT_RECORD))) +#if DEBUG_COUNT > 0 + printk(KERN_ERR "*****************************************************************************\n"); + printk(KERN_ERR "* *\n"); +#if DEBUG_COUNT == 1 + printk(KERN_ERR "* REMINDER, the following debugging option is turned on in your .config: *\n"); +#else + printk(KERN_ERR "* REMINDER, the following debugging options are turned on in your .config: *\n"); +#endif + printk(KERN_ERR "* *\n"); +#ifdef CONFIG_DEBUG_RT_MUTEXES + printk(KERN_ERR "* CONFIG_DEBUG_RT_MUTEXES *\n"); +#endif +#ifdef CONFIG_IRQSOFF_TRACER + printk(KERN_ERR "* CONFIG_IRQSOFF_TRACER *\n"); +#endif +#ifdef CONFIG_PREEMPT_TRACER + printk(KERN_ERR "* CONFIG_PREEMPT_TRACER *\n"); +#endif +#if defined(CONFIG_FTRACE) && !defined(CONFIG_FTRACE_MCOUNT_RECORD) + printk(KERN_ERR "* CONFIG_FTRACE *\n"); +#endif +#ifdef CONFIG_INTERRUPT_OFF_HIST + printk(KERN_ERR "* CONFIG_INTERRUPT_OFF_HIST *\n"); +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST + printk(KERN_ERR "* CONFIG_PREEMPT_OFF_HIST *\n"); +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + printk(KERN_ERR "* CONFIG_WAKEUP_LATENCY_HIST *\n"); +#endif +#ifdef CONFIG_DEBUG_SLAB + printk(KERN_ERR "* CONFIG_DEBUG_SLAB *\n"); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_ERR "* CONFIG_DEBUG_PAGEALLOC *\n"); +#endif 
+#ifdef CONFIG_LOCKDEP + printk(KERN_ERR "* CONFIG_LOCKDEP *\n"); +#endif + printk(KERN_ERR "* *\n"); +#if DEBUG_COUNT == 1 + printk(KERN_ERR "* it may increase runtime overhead and latencies. *\n"); +#else + printk(KERN_ERR "* they may increase runtime overhead and latencies. *\n"); +#endif + printk(KERN_ERR "* *\n"); + printk(KERN_ERR "*****************************************************************************\n"); +#endif /* * Ok, we have completed the initial bootup, and * we're essentially up and running. Get rid of the diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c5e68ad..63a47f7 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -820,12 +820,17 @@ static inline void pipelined_send(struct mqueue_inode_info *info, struct msg_msg *message, struct ext_wait_queue *receiver) { + /* + * Keep them in one critical section for PREEMPT_RT: + */ + preempt_disable_rt(); receiver->msg = message; list_del(&receiver->list); receiver->state = STATE_PENDING; wake_up_process(receiver->task); smp_wmb(); receiver->state = STATE_READY; + preempt_enable_rt(); } /* pipelined_receive() - if there is task waiting in sys_mq_timedsend() diff --git a/ipc/msg.c b/ipc/msg.c index 2ceab7f..6de2720 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -259,12 +259,20 @@ static void expunge_all(struct msg_queue *msq, int res) while (tmp != &msq->q_receivers) { struct msg_receiver *msr; + /* + * Make sure that the wakeup doesn't preempt + * this CPU prematurely. (on PREEMPT_RT) + */ + preempt_disable_rt(); + msr = list_entry(tmp, struct msg_receiver, r_list); tmp = tmp->next; msr->r_msg = NULL; wake_up_process(msr->r_tsk); smp_mb(); msr->r_msg = ERR_PTR(res); + + preempt_enable_rt(); } } @@ -611,6 +619,12 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) !security_msg_queue_msgrcv(msq, msg, msr->r_tsk, msr->r_msgtype, msr->r_mode)) { + /* + * Make sure that the wakeup doesn't preempt + * this CPU prematurely.
(on PREEMPT_RT) + */ + preempt_disable_rt(); + list_del(&msr->r_list); if (msr->r_maxsize < msg->m_ts) { msr->r_msg = NULL; @@ -624,9 +638,11 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) wake_up_process(msr->r_tsk); smp_mb(); msr->r_msg = msg; + preempt_enable_rt(); return 1; } + preempt_enable_rt(); } } return 0; diff --git a/ipc/sem.c b/ipc/sem.c index 87c2b64..3ee3554 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -415,6 +415,11 @@ static void update_queue (struct sem_array * sma) struct sem_queue *n; /* + * make sure that the wakeup doesnt preempt + * _this_ cpu prematurely. (on preempt_rt) + */ + preempt_disable_rt(); + /* * Continue scanning. The next operation * that must be checked depends on the type of the * completed operation: @@ -450,6 +455,7 @@ static void update_queue (struct sem_array * sma) */ smp_wmb(); q->status = error; + preempt_enable_rt(); q = n; } else { q = list_entry(q->list.next, struct sem_queue, list); diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index bf987b9..f4602f8 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,14 +1,13 @@ - choice - prompt "Preemption Model" - default PREEMPT_NONE + prompt "Preemption Mode" + default PREEMPT_RT config PREEMPT_NONE bool "No Forced Preemption (Server)" help - This is the traditional Linux preemption model, geared towards + This is the traditional Linux preemption model geared towards throughput. It will still provide good latencies most of the - time, but there are no guarantees and occasional longer delays + time but there are no guarantees and occasional long delays are possible. Select this option if you are building a kernel for a server or @@ -21,7 +20,7 @@ config PREEMPT_VOLUNTARY help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. 
These new - preemption points have been selected to reduce the maximum + preemption points have been selected to minimize the maximum latency of rescheduling, providing faster application reactions, at the cost of slightly lower throughput. @@ -33,22 +32,91 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT +config PREEMPT_DESKTOP bool "Preemptible Kernel (Low-Latency Desktop)" help This option reduces the latency of the kernel by making - all kernel code (that is not executing in a critical section) + all kernel code that is not executing in a critical section preemptible. This allows reaction to interactive events by permitting a low priority process to be preempted involuntarily even if it is in kernel mode executing a system call and would - otherwise not be about to reach a natural preemption point. - This allows applications to run more 'smoothly' even when the - system is under load, at the cost of slightly lower throughput - and a slight runtime overhead to kernel code. + otherwise not be about to reach a preemption point. This allows + applications to run more 'smoothly' even when the system is + under load, at the cost of slightly lower throughput and a + slight runtime overhead to kernel code. + + (According to profiles, when this mode is selected then even + during kernel-intense workloads the system is in an immediately + preemptible state more than 50% of the time.) Select this if you are building a kernel for a desktop or embedded system with latency requirements in the milliseconds range. +config PREEMPT_RT + bool "Complete Preemption (Real-Time)" + select PREEMPT_SOFTIRQS + select PREEMPT_HARDIRQS + select PREEMPT_RCU + select RT_MUTEXES + help + This option further reduces the scheduling latency of the + kernel by replacing almost every spinlock used by the kernel + with preemptible mutexes and thus making all but the most + critical kernel code involuntarily preemptible.
The remaining + handful of low-level non-preemptible codepaths are short and + have a deterministic latency of a couple of tens of + microseconds (depending on the hardware). This also allows + applications to run more 'smoothly' even when the system is + under load, at the cost of lower throughput and runtime + overhead to kernel code. + + (According to profiles, when this mode is selected then even + during kernel-intense workloads the system is in an immediately + preemptible state more than 95% of the time.) + + Select this if you are building a kernel for a desktop, + embedded or real-time system with guaranteed latency + requirements of 100 usecs or lower. + endchoice +config PREEMPT + bool + default y + depends on PREEMPT_DESKTOP || PREEMPT_RT + +config PREEMPT_SOFTIRQS + bool "Thread Softirqs" + default n +# depends on PREEMPT + help + This option reduces the latency of the kernel by 'threading' + soft interrupts. This means that all softirqs will execute + in softirqd's context. While this helps latency, it can also + reduce performance. + + The threading of softirqs can also be controlled via the + /proc/sys/kernel/softirq_preemption runtime flag and the + softirq-preempt=0/1 boot-time option. + + Say N if you are unsure. + +config PREEMPT_HARDIRQS + bool "Thread Hardirqs" + default n + depends on GENERIC_HARDIRQS_NO__DO_IRQ + select PREEMPT_SOFTIRQS + help + This option reduces the latency of the kernel by 'threading' + hardirqs. This means that all (or selected) hardirqs will run + in their own kernel thread context. While this helps latency, + this feature can also reduce performance. + + The threading of hardirqs can also be controlled via the + /proc/sys/kernel/hardirq_preemption runtime flag and the + hardirq-preempt=0/1 boot-time option. Per-irq threading can + be enabled/disabled via the /proc/irq/<IRQ>/<handler>/threaded + runtime flags. + + Say N if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile index 2093a69..1ed7510 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ async.o @@ -28,7 +28,10 @@ obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +ifneq ($(CONFIG_PREEMPT_RT),y) +obj-y += mutex.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +endif obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o @@ -40,14 +43,15 @@ endif obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o +obj-$(CONFIG_PREEMPT_RT) += rt.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif -obj-$(CONFIG_SMP) += spinlock.o -obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o -obj-$(CONFIG_PROVE_LOCKING) += spinlock.o +obj-$(CONFIG_SMP) += spinlock.o rwlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o rwlock.o +obj-$(CONFIG_PROVE_LOCKING) += spinlock.o rwlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b6eadfe..b1ac8a2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -189,7 +189,7 @@ list_for_each_entry(_root, &roots, root_list) /* the list of cgroups eligible for automatic release. 
Protected by * release_list_lock */ static LIST_HEAD(release_list); -static DEFINE_SPINLOCK(release_list_lock); +static DEFINE_ATOMIC_SPINLOCK(release_list_lock); static void cgroup_release_agent(struct work_struct *work); static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); @@ -2802,11 +2802,11 @@ again: finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - spin_lock(&release_list_lock); + atomic_spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del(&cgrp->release_list); - spin_unlock(&release_list_lock); + atomic_spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ @@ -3342,13 +3342,13 @@ static void check_for_release(struct cgroup *cgrp) * already queued for a userspace notification, queue * it now */ int need_schedule_work = 0; - spin_lock(&release_list_lock); + atomic_spin_lock(&release_list_lock); if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list)) { list_add(&cgrp->release_list, &release_list); need_schedule_work = 1; } - spin_unlock(&release_list_lock); + atomic_spin_unlock(&release_list_lock); if (need_schedule_work) schedule_work(&release_agent_work); } @@ -3395,7 +3395,7 @@ static void cgroup_release_agent(struct work_struct *work) { BUG_ON(work != &release_agent_work); mutex_lock(&cgroup_mutex); - spin_lock(&release_list_lock); + atomic_spin_lock(&release_list_lock); while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; @@ -3404,7 +3404,7 @@ static void cgroup_release_agent(struct work_struct *work) struct cgroup, release_list); list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); + atomic_spin_unlock(&release_list_lock); pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!pathbuf) goto continue_free; @@ -3434,9 +3434,9 @@ static void cgroup_release_agent(struct work_struct *work) continue_free: 
kfree(pathbuf); kfree(agentbuf); - spin_lock(&release_list_lock); + atomic_spin_lock(&release_list_lock); } - spin_unlock(&release_list_lock); + atomic_spin_unlock(&release_list_lock); mutex_unlock(&cgroup_mutex); } diff --git a/kernel/exit.c b/kernel/exit.c index 869dc22..4441e62 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -67,7 +67,9 @@ static void __unhash_process(struct task_struct *p) detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); + preempt_disable(); __get_cpu_var(process_counts)--; + preempt_enable(); } list_del_rcu(&p->thread_group); list_del_init(&p->sibling); @@ -130,7 +132,7 @@ static void __exit_signal(struct task_struct *tsk) * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ - flush_sigqueue(&tsk->pending); + flush_task_sigqueue(tsk); tsk->signal = NULL; tsk->sighand = NULL; @@ -685,9 +687,11 @@ static void exit_mm(struct task_struct * tsk) task_lock(tsk); tsk->mm = NULL; up_read(&mm->mmap_sem); + preempt_disable(); // FIXME enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); + preempt_enable(); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); @@ -930,7 +934,7 @@ NORET_TYPE void do_exit(long code) * an exiting task cleaning up the robust pi futexes. */ smp_mb(); - spin_unlock_wait(&tsk->pi_lock); + atomic_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", @@ -1009,14 +1013,17 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); - preempt_disable(); +again: + local_irq_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; - schedule(); - BUG(); - /* Avoid "noreturn function does return". 
*/ - for (;;) - cpu_relax(); /* For when BUG is null */ + __schedule(); + printk(KERN_ERR "BUG: dead task %s:%d back from the grave!\n", + current->comm, current->pid); + printk(KERN_ERR ".... flags: %08x, count: %d, state: %08lx\n", + current->flags, atomic_read(&current->usage), current->state); + printk(KERN_ERR ".... trying again ...\n"); + goto again; } EXPORT_SYMBOL_GPL(do_exit); @@ -1476,6 +1483,9 @@ static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, int ptrace, struct task_struct *p) { int ret = eligible_child(wo, p); + + BUG_ON(!atomic_read(&p->usage)); + if (!ret) return ret; diff --git a/kernel/fork.c b/kernel/fork.c index e6c04d4..cd875a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -38,6 +38,7 @@ #include <linux/syscalls.h> #include <linux/jiffies.h> #include <linux/tracehook.h> +#include <linux/interrupt.h> #include <linux/futex.h> #include <linux/compat.h> #include <linux/task_io_accounting_ops.h> @@ -48,6 +49,8 @@ #include <linux/memcontrol.h> #include <linux/ftrace.h> #include <linux/profile.h> +#include <linux/kthread.h> +#include <linux/notifier.h> #include <linux/rmap.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> @@ -82,7 +85,19 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; +#ifdef CONFIG_PREEMPT_RT +DEFINE_RWLOCK(tasklist_lock); /* outer */ +#else __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +#endif + +/* + * Delayed mmdrop. In the PREEMPT_RT case we + * don't want to do this from the scheduling + * context.
+ */ +static DEFINE_PER_CPU(struct task_struct *, desched_task); +static DEFINE_PER_CPU(struct list_head, delayed_drop_list); int nr_processes(void) { @@ -160,6 +175,16 @@ void __put_task_struct(struct task_struct *tsk) free_task(tsk); } +#ifdef CONFIG_PREEMPT_RT +void __put_task_struct_cb(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + __put_task_struct(tsk); + +} +#endif + /* * macro override instead of weak attribute alias, to workaround * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. @@ -170,6 +195,8 @@ void __put_task_struct(struct task_struct *tsk) void __init fork_init(unsigned long mempages) { + int i; + #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -200,6 +227,9 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; + + for (i = 0; i < NR_CPUS; i++) + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i)); } int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, @@ -281,6 +311,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; + INIT_LIST_HEAD(&mm->delayed_drop); mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; @@ -894,10 +925,13 @@ SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) static void rt_mutex_init_task(struct task_struct *p) { - spin_lock_init(&p->pi_lock); + atomic_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - plist_head_init(&p->pi_waiters, &p->pi_lock); + plist_head_init_atomic(&p->pi_waiters, &p->pi_lock); p->pi_blocked_on = NULL; +# ifdef CONFIG_DEBUG_RT_MUTEXES + p->last_kernel_lock = NULL; +# endif #endif } @@ -1016,6 +1050,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, 
spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); + p->sigqueue_cache = NULL; p->utime = cputime_zero; p->stime = cputime_zero; @@ -1031,7 +1066,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, acct_clear_integrals(p); posix_cpu_timers_init(p); - + p->posix_timer_list = NULL; p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; @@ -1067,6 +1102,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->hardirq_context = 0; p->softirq_context = 0; #endif + p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; @@ -1108,6 +1144,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; +#ifdef CONFIG_DEBUG_PREEMPT + atomic_set(&p->lock_count, 0); +#endif if (pid != &init_struct_pid) { retval = -ENOMEM; @@ -1145,6 +1184,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; + p->futex_wakeup = NULL; #endif /* * sigaltstack should be cleared when sharing the same VM @@ -1192,11 +1232,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, * to ensure it is on a valid CPU (and if not, just force it back to * parent's CPU). This avoids alot of nasty races. 
*/ + preempt_disable(); p->cpus_allowed = current->cpus_allowed; p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); + preempt_enable(); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { @@ -1246,7 +1288,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); + preempt_disable(); __get_cpu_var(process_counts)++; + preempt_enable(); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; @@ -1723,3 +1767,138 @@ int unshare_files(struct files_struct **displaced) task_unlock(task); return 0; } + +static int mmdrop_complete(void) +{ + struct list_head *head; + int ret = 0; + + head = &get_cpu_var(delayed_drop_list); + while (!list_empty(head)) { + struct mm_struct *mm = list_entry(head->next, + struct mm_struct, delayed_drop); + list_del(&mm->delayed_drop); + put_cpu_var(delayed_drop_list); + + __mmdrop(mm); + ret = 1; + + head = &get_cpu_var(delayed_drop_list); + } + put_cpu_var(delayed_drop_list); + + return ret; +} + +/* + * We dont want to do complex work from the scheduler, thus + * we delay the work to a per-CPU worker thread: + */ +void __mmdrop_delayed(struct mm_struct *mm) +{ + struct task_struct *desched_task; + struct list_head *head; + + head = &get_cpu_var(delayed_drop_list); + list_add_tail(&mm->delayed_drop, head); + desched_task = __get_cpu_var(desched_task); + if (desched_task) + wake_up_process(desched_task); + put_cpu_var(delayed_drop_list); +} + +static void takeover_delayed_drop(int hotcpu) +{ + struct list_head *head = &per_cpu(delayed_drop_list, hotcpu); + + while (!list_empty(head)) { + struct mm_struct *mm = list_entry(head->next, + struct mm_struct, delayed_drop); + + list_del(&mm->delayed_drop); + __mmdrop_delayed(mm); + } +} + 
+static int desched_thread(void * __bind_cpu) +{ + set_user_nice(current, -10); + current->flags |= PF_NOFREEZE | PF_SOFTIRQ; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + if (mmdrop_complete()) + continue; + schedule(); + + /* + * This must be called from time to time on ia64, and is a + * no-op on other archs. Used to be in cpu_idle(), but with + * the new -rt semantics it can't stay there. + */ + check_pgt_cache(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + + BUG_ON(per_cpu(desched_task, hotcpu)); + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu)); + p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu); + if (IS_ERR(p)) { + printk("desched_thread for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(desched_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(desched_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + + /* Unbind so it can run. Fall thru. 
*/ + kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + + p = per_cpu(desched_task, hotcpu); + per_cpu(desched_task, hotcpu) = NULL; + kthread_stop(p); + takeover_delayed_drop(hotcpu); + takeover_tasklets(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init int spawn_desched_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + return 0; +} + diff --git a/kernel/futex.c b/kernel/futex.c index e18cfbd..4ac78d9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -392,9 +392,9 @@ static void free_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - spin_lock_irq(&pi_state->owner->pi_lock); + atomic_spin_lock_irq(&pi_state->owner->pi_lock); list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); + atomic_spin_unlock_irq(&pi_state->owner->pi_lock); rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); } @@ -459,18 +459,18 @@ void exit_pi_state_list(struct task_struct *curr) * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: */ - spin_lock_irq(&curr->pi_lock); + atomic_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; hb = hash_futex(&key); - spin_unlock_irq(&curr->pi_lock); + atomic_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); - spin_lock_irq(&curr->pi_lock); + atomic_spin_lock_irq(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: @@ -484,15 +484,15 @@ void exit_pi_state_list(struct task_struct *curr) WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); 
pi_state->owner = NULL; - spin_unlock_irq(&curr->pi_lock); + atomic_spin_unlock_irq(&curr->pi_lock); rt_mutex_unlock(&pi_state->pi_mutex); spin_unlock(&hb->lock); - spin_lock_irq(&curr->pi_lock); + atomic_spin_lock_irq(&curr->pi_lock); } - spin_unlock_irq(&curr->pi_lock); + atomic_spin_unlock_irq(&curr->pi_lock); } static int @@ -547,7 +547,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, * change of the task flags, we do this protected by * p->pi_lock: */ - spin_lock_irq(&p->pi_lock); + atomic_spin_lock_irq(&p->pi_lock); if (unlikely(p->flags & PF_EXITING)) { /* * The task is on the way out. When PF_EXITPIDONE is @@ -556,7 +556,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, */ int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; - spin_unlock_irq(&p->pi_lock); + atomic_spin_unlock_irq(&p->pi_lock); put_task_struct(p); return ret; } @@ -575,7 +575,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; - spin_unlock_irq(&p->pi_lock); + atomic_spin_unlock_irq(&p->pi_lock); put_task_struct(p); @@ -713,7 +713,7 @@ retry: * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. */ -static void wake_futex(struct futex_q *q) +static void wake_futex(struct task_struct **wake_list, struct futex_q *q) { struct task_struct *p = q->task; @@ -736,8 +736,51 @@ static void wake_futex(struct futex_q *q) smp_wmb(); q->lock_ptr = NULL; - wake_up_state(p, TASK_NORMAL); - put_task_struct(p); + /* + * Atomically grab the task, if ->futex_wakeup is !0 already it means + * its already queued (either by us or someone else) and will get the + * wakeup due to that. + * + * This cmpxchg() implies a full barrier, which pairs with the write + * barrier implied by the wakeup in wake_futex_list(). 
+ */ + if (cmpxchg(&p->futex_wakeup, 0, p) != 0) { + /* + * It was already queued, drop the extra ref and we're done. + */ + put_task_struct(p); + return; + } + + /* + * Put the task on our wakeup list by atomically switching it with + * the list head. (XXX its a local list, no possible concurrency, + * this could be written without cmpxchg). + */ + do { + p->futex_wakeup = *wake_list; + } while (cmpxchg(wake_list, p->futex_wakeup, p) != p->futex_wakeup); +} + +/* + * For each task on the list, deliver the pending wakeup and release the + * task reference obtained in wake_futex(). + */ +static void wake_futex_list(struct task_struct *head) +{ + while (head != &init_task) { + struct task_struct *next = head->futex_wakeup; + + head->futex_wakeup = NULL; + /* + * wake_up_state() implies a wmb() to pair with the queueing + * in wake_futex() so as to not miss wakeups. + */ + wake_up_state(head, TASK_NORMAL); + put_task_struct(head); + + head = next; + } } static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) @@ -749,7 +792,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; - spin_lock(&pi_state->pi_mutex.wait_lock); + atomic_spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -778,23 +821,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) else if (curval != uval) ret = -EINVAL; if (ret) { - spin_unlock(&pi_state->pi_mutex.wait_lock); + atomic_spin_unlock(&pi_state->pi_mutex.wait_lock); return ret; } } - spin_lock_irq(&pi_state->owner->pi_lock); + atomic_spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); + atomic_spin_unlock_irq(&pi_state->owner->pi_lock); - spin_lock_irq(&new_owner->pi_lock); + atomic_spin_lock_irq(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, 
&new_owner->pi_state_list); pi_state->owner = new_owner; - spin_unlock_irq(&new_owner->pi_lock); + atomic_spin_unlock_irq(&new_owner->pi_lock); - spin_unlock(&pi_state->pi_mutex.wait_lock); + atomic_spin_unlock(&pi_state->pi_mutex.wait_lock); rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -851,6 +894,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) struct futex_q *this, *next; struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; + struct task_struct *wake_list = &init_task; int ret; if (!bitset) @@ -875,7 +919,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!(this->bitset & bitset)) continue; - wake_futex(this); + wake_futex(&wake_list, this); if (++ret >= nr_wake) break; } @@ -883,6 +927,8 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) spin_unlock(&hb->lock); put_futex_key(fshared, &key); + + wake_futex_list(wake_list); out: return ret; } @@ -899,6 +945,7 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, struct futex_hash_bucket *hb1, *hb2; struct plist_head *head; struct futex_q *this, *next; + struct task_struct *wake_list = &init_task; int ret, op_ret; retry: @@ -912,8 +959,8 @@ retry: hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); - double_lock_hb(hb1, hb2); retry_private: + double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { @@ -949,7 +996,7 @@ retry_private: plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { - wake_futex(this); + wake_futex(&wake_list, this); if (++ret >= nr_wake) break; } @@ -961,7 +1008,7 @@ retry_private: op_ret = 0; plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key2)) { - wake_futex(this); + wake_futex(&wake_list, this); if (++op_ret >= nr_wake2) break; } @@ -974,6 +1021,8 @@ out_put_keys: put_futex_key(fshared, &key2); out_put_key1: put_futex_key(fshared, &key1); + + 
wake_futex_list(wake_list); out: return ret; } @@ -999,7 +1048,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; #ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb2->lock; + q->list.plist.slock = &hb2->lock; #endif } get_futex_key_refs(key2); @@ -1036,7 +1085,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, q->lock_ptr = &hb->lock; #ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb->lock; + q->list.plist.slock = &hb->lock; #endif wake_up_state(q->task, TASK_NORMAL); @@ -1128,6 +1177,7 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; + struct task_struct *wake_list = &init_task; u32 curval2; if (requeue_pi) { @@ -1272,7 +1322,7 @@ retry_private: * woken by futex_unlock_pi(). */ if (++task_count <= nr_wake && !requeue_pi) { - wake_futex(this); + wake_futex(&wake_list, this); continue; } @@ -1318,6 +1368,8 @@ out_put_keys: put_futex_key(fshared, &key2); out_put_key1: put_futex_key(fshared, &key1); + + wake_futex_list(wake_list); out: if (pi_state != NULL) free_pi_state(pi_state); @@ -1353,7 +1405,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); #ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb->lock; + q->list.plist.slock = &hb->lock; #endif plist_add(&q->list, &hb->chain); q->task = current; @@ -1490,18 +1542,18 @@ retry: * itself. 
*/ if (pi_state->owner != NULL) { - spin_lock_irq(&pi_state->owner->pi_lock); + atomic_spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); + atomic_spin_unlock_irq(&pi_state->owner->pi_lock); } pi_state->owner = newowner; - spin_lock_irq(&newowner->pi_lock); + atomic_spin_lock_irq(&newowner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); - spin_unlock_irq(&newowner->pi_lock); + atomic_spin_unlock_irq(&newowner->pi_lock); return 0; /* diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 49da79a..875327a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,37 +48,6 @@ #include <asm/uaccess.h> -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec now; - - getnstimeofday(&now); - - return timespec_to_ktime(now); -} - -EXPORT_SYMBOL_GPL(ktime_get_real); - /* * The timer bases: * @@ -106,31 +75,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. 
- */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); - /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. @@ -142,10 +86,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + seq = read_atomic_seqbegin(&xtime_lock); xts = current_kernel_time(); tom = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_atomic_seqretry(&xtime_lock, seq)); xtim = timespec_to_ktime(xts); tomono = timespec_to_ktime(tom); @@ -181,11 +125,12 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, for (;;) { base = timer->base; if (likely(base != NULL)) { - spin_lock_irqsave(&base->cpu_base->lock, *flags); + atomic_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ - spin_unlock_irqrestore(&base->cpu_base->lock, *flags); + atomic_spin_unlock_irqrestore(&base->cpu_base->lock, + *flags); } cpu_relax(); } @@ -262,13 +207,13 @@ again: /* See the comment in lock_timer_base() */ timer->base = NULL; - spin_unlock(&base->cpu_base->lock); - spin_lock(&new_base->cpu_base->lock); + atomic_spin_unlock(&base->cpu_base->lock); + atomic_spin_lock(&new_base->cpu_base->lock); if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { cpu = this_cpu; - spin_unlock(&new_base->cpu_base->lock); - spin_lock(&base->cpu_base->lock); + atomic_spin_unlock(&new_base->cpu_base->lock); + atomic_spin_lock(&base->cpu_base->lock); timer->base = base; goto again; } @@ -284,7 +229,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { 
struct hrtimer_clock_base *base = timer->base; - spin_lock_irqsave(&base->cpu_base->lock, *flags); + atomic_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } @@ -532,9 +477,9 @@ static inline int hrtimer_is_hres_enabled(void) /* * Is the high resolution mode active ? */ -static inline int hrtimer_hres_active(void) +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { - return __get_cpu_var(hrtimer_bases).hres_active; + return cpu_base->hres_active; } /* @@ -594,8 +539,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, * When the callback is running, we do not reprogram the clock event * device. The timer callback is either running on a different CPU or * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. + * reprogramming is handled at the end of the hrtimer_interrupt. */ if (hrtimer_callback_running(timer)) return 0; @@ -629,29 +573,27 @@ static int hrtimer_reprogram(struct hrtimer *timer, */ static void retrigger_next_event(void *arg) { - struct hrtimer_cpu_base *base; + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); struct timespec realtime_offset; unsigned long seq; - if (!hrtimer_hres_active()) + if (!hrtimer_hres_active(base)) return; do { - seq = read_seqbegin(&xtime_lock); + seq = read_atomic_seqbegin(&xtime_lock); set_normalized_timespec(&realtime_offset, -wall_to_monotonic.tv_sec, -wall_to_monotonic.tv_nsec); - } while (read_seqretry(&xtime_lock, seq)); - - base = &__get_cpu_var(hrtimer_bases); + } while (read_atomic_seqretry(&xtime_lock, seq)); /* Adjust CLOCK_REALTIME offset */ - spin_lock(&base->lock); + atomic_spin_lock(&base->lock); base->clock_base[CLOCK_REALTIME].offset = timespec_to_ktime(realtime_offset); hrtimer_force_reprogram(base); - spin_unlock(&base->lock); + atomic_spin_unlock(&base->lock); } /* @@ -699,6 +641,8 @@ static inline void 
hrtimer_init_timer_hres(struct hrtimer *timer) { } +static void __run_hrtimer(struct hrtimer *timer); +static int hrtimer_rt_defer(struct hrtimer *timer); /* * When High resolution timers are active, try to reprogram. Note, that in case @@ -710,11 +654,31 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base, int wakeup) { +#ifdef CONFIG_PREEMPT_RT +again: +#endif if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { +#ifdef CONFIG_PREEMPT_RT + /* + * Move softirq based timers away from the rbtree in + * case it expired already. Otherwise we would have a + * stale base->first entry until the softirq runs. + */ + if (!hrtimer_rt_defer(timer)) { + __run_hrtimer(timer); + /* + * __run_hrtimer might have requeued timer and + * it could be base->first again. + */ + if (base->first == &timer->node) + goto again; + return 1; + } +#endif if (wakeup) { - spin_unlock(&base->cpu_base->lock); + atomic_spin_unlock(&base->cpu_base->lock); raise_softirq_irqoff(HRTIMER_SOFTIRQ); - spin_lock(&base->cpu_base->lock); + atomic_spin_lock(&base->cpu_base->lock); } else __raise_softirq_irqoff(HRTIMER_SOFTIRQ); @@ -727,10 +691,8 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, /* * Switch to high resolution mode */ -static int hrtimer_switch_to_hres(void) +static int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) { - int cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; if (base->hres_active) @@ -741,7 +703,7 @@ static int hrtimer_switch_to_hres(void) if (tick_init_highres()) { local_irq_restore(flags); printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); + "mode on CPU %d\n", raw_smp_processor_id()); return 0; } base->hres_active = 1; @@ -753,16 +715,20 @@ static int hrtimer_switch_to_hres(void) /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); - 
printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", - smp_processor_id()); return 1; } #else -static inline int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *base) +{ + return 0; +} static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline int hrtimer_switch_to_hres(void) { return 0; } +static inline int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) +{ + return 0; +} static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base, @@ -770,6 +736,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, { return 0; } + +static inline int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + return 0; +} + static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } @@ -793,7 +766,7 @@ void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); + atomic_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** @@ -892,6 +865,32 @@ static int enqueue_hrtimer(struct hrtimer *timer, return leftmost; } +#ifdef CONFIG_PREEMPT_SOFTIRQS +# define wake_up_timer_waiters(b) wake_up(&(b)->wait) + +/** + * hrtimer_wait_for_timer - Wait for a running timer + * + * @timer: timer to wait for + * + * The function waits in case the timers callback function is + * currently executed on the waitqueue of the timer base. The + * waitqueue is woken up after the timer callback function has + * finished execution. 
+ */ +void hrtimer_wait_for_timer(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + + if (base && base->cpu_base && !timer->irqsafe) + wait_event(base->cpu_base->wait, + !(timer->state & HRTIMER_STATE_CALLBACK)); +} + +#else +# define wake_up_timer_waiters(b) do { } while (0) +#endif + /* * __remove_hrtimer - internal function to remove a timer * @@ -907,6 +906,11 @@ static void __remove_hrtimer(struct hrtimer *timer, unsigned long newstate, int reprogram) { if (timer->state & HRTIMER_STATE_ENQUEUED) { + + if (unlikely(!list_empty(&timer->cb_entry))) { + list_del_init(&timer->cb_entry); + goto out; + } /* * Remove the timer from the rbtree and replace the * first entry pointer if necessary. @@ -914,11 +918,12 @@ static void __remove_hrtimer(struct hrtimer *timer, if (base->first == &timer->node) { base->first = rb_next(&timer->node); /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) + if (reprogram && hrtimer_hres_active(base->cpu_base)) hrtimer_force_reprogram(base->cpu_base); } rb_erase(&timer->node, &base->active); } +out: timer->state = newstate; } @@ -1078,7 +1083,7 @@ int hrtimer_cancel(struct hrtimer *timer) if (ret >= 0) return ret; - cpu_relax(); + hrtimer_wait_for_timer(timer); } } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -1116,9 +1121,9 @@ ktime_t hrtimer_get_next_event(void) unsigned long flags; int i; - spin_lock_irqsave(&cpu_base->lock, flags); + atomic_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) { + if (!hrtimer_hres_active(cpu_base)) { for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; @@ -1133,7 +1138,7 @@ ktime_t hrtimer_get_next_event(void) } } - spin_unlock_irqrestore(&cpu_base->lock, flags); + atomic_spin_unlock_irqrestore(&cpu_base->lock, flags); if (mindelta.tv64 < 0) mindelta.tv64 = 0; @@ -1216,9 +1221,9 @@ static void __run_hrtimer(struct hrtimer *timer) * they get migrated to another cpu, therefore its safe to 
unlock * the timer base. */ - spin_unlock(&cpu_base->lock); + atomic_spin_unlock(&cpu_base->lock); restart = fn(timer); - spin_lock(&cpu_base->lock); + atomic_spin_lock(&cpu_base->lock); /* * Note: We clear the CALLBACK bit after enqueue_hrtimer and @@ -1232,6 +1237,115 @@ static void __run_hrtimer(struct hrtimer *timer) timer->state &= ~HRTIMER_STATE_CALLBACK; } +#ifdef CONFIG_PREEMPT_RT + +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + /* + * Note, we clear the callback flag before we requeue the + * timer otherwise we trigger the callback_running() check + * in hrtimer_reprogram(). + */ + timer->state &= ~HRTIMER_STATE_CALLBACK; + + if (restart != HRTIMER_NORESTART) { + BUG_ON(hrtimer_active(timer)); + /* + * Enqueue the timer, if it's the leftmost timer then + * we need to reprogram it. + */ + if (!enqueue_hrtimer(timer, base)) + return; + + if (hrtimer_reprogram(timer, base)) + goto requeue; + + } else if (hrtimer_active(timer)) { + /* + * If the timer was rearmed on another CPU, reprogram + * the event device. + */ + if (base->first == &timer->node && + hrtimer_reprogram(timer, base)) + goto requeue; + } + return; + +requeue: + /* + * Timer is expired. Thus move it from tree to pending list + * again. + */ + __remove_hrtimer(timer, base, timer->state, 0); + list_add_tail(&timer->cb_entry, &base->expired); +} + +/* + * The changes in mainline which removed the callback modes from + * hrtimer are not yet working with -rt. The non wakeup_process() + * based callbacks which involve sleeping locks need to be treated + * separately. 
+ */ +static void hrtimer_rt_run_pending(void) +{ + enum hrtimer_restart (*fn)(struct hrtimer *); + struct hrtimer_cpu_base *cpu_base; + struct hrtimer_clock_base *base; + struct hrtimer *timer; + int index, restart; + + local_irq_disable(); + cpu_base = &per_cpu(hrtimer_bases, smp_processor_id()); + + atomic_spin_lock(&cpu_base->lock); + + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { + base = &cpu_base->clock_base[index]; + + while (!list_empty(&base->expired)) { + timer = list_first_entry(&base->expired, + struct hrtimer, cb_entry); + + /* + * Same as the above __run_hrtimer function + * just we run with interrupts enabled. + */ + debug_hrtimer_deactivate(timer); + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + fn = timer->function; + + atomic_spin_unlock_irq(&cpu_base->lock); + restart = fn(timer); + atomic_spin_lock_irq(&cpu_base->lock); + + hrtimer_rt_reprogram(restart, timer, base); + } + } + + atomic_spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); +} + +static int hrtimer_rt_defer(struct hrtimer *timer) +{ + if (timer->irqsafe) + return 0; + + __remove_hrtimer(timer, timer->base, timer->state, 0); + list_add_tail(&timer->cb_entry, &timer->base->expired); + return 1; +} + +#else + +static inline void hrtimer_rt_run_pending(void) { } +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; } + +#endif + #ifdef CONFIG_HIGH_RES_TIMERS static int force_clock_reprogram; @@ -1267,7 +1381,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) struct hrtimer_clock_base *base; ktime_t expires_next, now; int nr_retries = 0; - int i; + int i, raise = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; @@ -1282,7 +1396,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) expires_next.tv64 = KTIME_MAX; - spin_lock(&cpu_base->lock); + atomic_spin_lock(&cpu_base->lock); /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that 
a timer is enqueued in our queue via @@ -1328,7 +1442,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) break; } - __run_hrtimer(timer); + if (!hrtimer_rt_defer(timer)) + __run_hrtimer(timer); + else + raise = 1; } base++; } @@ -1338,13 +1455,16 @@ void hrtimer_interrupt(struct clock_event_device *dev) * against it. */ cpu_base->expires_next = expires_next; - spin_unlock(&cpu_base->lock); + atomic_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ if (expires_next.tv64 != KTIME_MAX) { if (tick_program_event(expires_next, force_clock_reprogram)) goto retry; } + + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); } /* @@ -1353,9 +1473,11 @@ void hrtimer_interrupt(struct clock_event_device *dev) */ static void __hrtimer_peek_ahead_timers(void) { + struct hrtimer_cpu_base *cpu_base; struct tick_device *td; - if (!hrtimer_hres_active()) + cpu_base = &__get_cpu_var(hrtimer_bases); + if (!hrtimer_hres_active(cpu_base)) return; td = &__get_cpu_var(tick_cpu_device); @@ -1381,17 +1503,17 @@ void hrtimer_peek_ahead_timers(void) local_irq_restore(flags); } -static void run_hrtimer_softirq(struct softirq_action *h) -{ - hrtimer_peek_ahead_timers(); -} - #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ +static void run_hrtimer_softirq(struct softirq_action *h) +{ + hrtimer_rt_run_pending(); +} + /* * Called from timer softirq every jiffy, expire hrtimers: * @@ -1401,7 +1523,9 @@ static inline void __hrtimer_peek_ahead_timers(void) { } */ void hrtimer_run_pending(void) { - if (hrtimer_hres_active()) + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + if (hrtimer_hres_active(cpu_base)) return; /* @@ -1413,7 +1537,7 @@ void hrtimer_run_pending(void) * deadlock vs. xtime_lock. 
*/ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - hrtimer_switch_to_hres(); + hrtimer_switch_to_hres(cpu_base); } /* @@ -1422,11 +1546,12 @@ void hrtimer_run_pending(void) void hrtimer_run_queues(void) { struct rb_node *node; - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base; struct hrtimer_clock_base *base; - int index, gettime = 1; + int index, gettime = 1, raise = 0; - if (hrtimer_hres_active()) + cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id()); + if (hrtimer_hres_active(cpu_base)) return; for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { @@ -1440,7 +1565,7 @@ void hrtimer_run_queues(void) gettime = 0; } - spin_lock(&cpu_base->lock); + atomic_spin_lock(&cpu_base->lock); while ((node = base->first)) { struct hrtimer *timer; @@ -1450,10 +1575,16 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - __run_hrtimer(timer); + if (!hrtimer_rt_defer(timer)) + __run_hrtimer(timer); + else + raise = 1; } - spin_unlock(&cpu_base->lock); + atomic_spin_unlock(&cpu_base->lock); } + + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); } /* @@ -1475,6 +1606,7 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; + sl->timer.irqsafe = 1; sl->task = task; } @@ -1607,12 +1739,17 @@ static void __cpuinit init_hrtimers_cpu(int cpu) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - spin_lock_init(&cpu_base->lock); + atomic_spin_lock_init(&cpu_base->lock); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; + INIT_LIST_HEAD(&cpu_base->clock_base[i].expired); + } hrtimer_init_hres(cpu_base); +#ifdef CONFIG_PREEMPT_RT + init_waitqueue_head(&cpu_base->wait); +#endif } #ifdef CONFIG_HOTPLUG_CPU @@ -1665,16 +1802,16 @@ static void 
migrate_hrtimers(int scpu) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - spin_lock(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + atomic_spin_lock(&new_base->lock); + atomic_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); + atomic_spin_unlock(&old_base->lock); + atomic_spin_unlock(&new_base->lock); /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); @@ -1725,9 +1862,7 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif } /** diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 1de9700..ed0377d 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -45,7 +45,7 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious. */ for_each_irq_desc_reverse(i, desc) { - spin_lock_irq(&desc->lock); + atomic_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { /* * An old-style architecture might still have @@ -61,7 +61,7 @@ unsigned long probe_irq_on(void) desc->chip->set_type(i, IRQ_TYPE_PROBE); desc->chip->startup(i); } - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); } /* Wait for longstanding interrupts to trigger. 
*/ @@ -73,13 +73,13 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for_each_irq_desc_reverse(i, desc) { - spin_lock_irq(&desc->lock); + atomic_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; if (desc->chip->startup(i)) desc->status |= IRQ_PENDING; } - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); } /* @@ -91,7 +91,7 @@ unsigned long probe_irq_on(void) * Now filter out any obviously spurious interrupts */ for_each_irq_desc(i, desc) { - spin_lock_irq(&desc->lock); + atomic_spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -103,7 +103,7 @@ unsigned long probe_irq_on(void) if (i < 32) mask |= 1 << i; } - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); } return mask; @@ -129,7 +129,7 @@ unsigned int probe_irq_mask(unsigned long val) int i; for_each_irq_desc(i, desc) { - spin_lock_irq(&desc->lock); + atomic_spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -139,7 +139,7 @@ unsigned int probe_irq_mask(unsigned long val) desc->status = status & ~IRQ_AUTODETECT; desc->chip->shutdown(i); } - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); @@ -171,7 +171,7 @@ int probe_irq_off(unsigned long val) unsigned int status; for_each_irq_desc(i, desc) { - spin_lock_irq(&desc->lock); + atomic_spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -183,7 +183,7 @@ int probe_irq_off(unsigned long val) desc->status = status & ~IRQ_AUTODETECT; desc->chip->shutdown(i); } - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 13c68e7..c6f2d0a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -34,7 +34,7 @@ void dynamic_irq_init(unsigned int irq) } /* 
Ensure we don't have left over values from a previous use of this irq */ - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); desc->status = IRQ_DISABLED; desc->chip = &no_irq_chip; desc->handle_irq = handle_bad_irq; @@ -51,7 +51,7 @@ void dynamic_irq_init(unsigned int irq) cpumask_clear(desc->pending_mask); #endif #endif - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } /** @@ -68,9 +68,9 @@ void dynamic_irq_cleanup(unsigned int irq) return; } - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); if (desc->action) { - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", irq); return; @@ -82,7 +82,7 @@ void dynamic_irq_cleanup(unsigned int irq) desc->chip = &no_irq_chip; desc->name = NULL; clear_kstat_irqs(desc); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } @@ -104,10 +104,10 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -133,9 +133,9 @@ int set_irq_type(unsigned int irq, unsigned int type) if (type == IRQ_TYPE_NONE) return 0; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); ret = __irq_set_trigger(desc, irq, type); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return ret; } EXPORT_SYMBOL(set_irq_type); @@ -158,9 +158,9 @@ int set_irq_data(unsigned int irq, void *data) return -EINVAL; } - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); desc->handler_data = data; 
- spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(set_irq_data); @@ -183,11 +183,11 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) return -EINVAL; } - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; if (entry) entry->irq = irq; - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -214,14 +214,42 @@ int set_irq_chip_data(unsigned int irq, void *data) return -EINVAL; } - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); desc->chip_data = data; - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(set_irq_chip_data); +/** + * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq + * + * @irq: Interrupt number + * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag + * + * The IRQ_NESTED_THREAD flag indicates that on + * request_threaded_irq() no separate interrupt thread should be + * created for the irq as the handler are called nested in the + * context of a demultiplexing interrupt handler thread. 
+ */ +void set_irq_nested_thread(unsigned int irq, int nest) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + if (!desc) + return; + + atomic_spin_lock_irqsave(&desc->lock, flags); + if (nest) + desc->status |= IRQ_NESTED_THREAD; + else + desc->status &= ~IRQ_NESTED_THREAD; + atomic_spin_unlock_irqrestore(&desc->lock, flags); +} +EXPORT_SYMBOL_GPL(set_irq_nested_thread); + /* * default enable function */ @@ -247,6 +275,7 @@ static unsigned int default_startup(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); + desc->status &= ~IRQ_MASKED; desc->chip->enable(irq); return 0; } @@ -299,6 +328,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) } } +/* + * handle_nested_irq - Handle a nested irq from a irq thread + * @irq: the interrupt number + * + * Handle interrupts which are nested into a threaded interrupt + * handler. The handler function is called inside the calling + * threads context. + */ +void handle_nested_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + irqreturn_t action_ret; + + might_sleep(); + + atomic_spin_lock_irq(&desc->lock); + + kstat_incr_irqs_this_cpu(irq, desc); + + action = desc->action; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) + goto out_unlock; + + desc->status |= IRQ_INPROGRESS; + atomic_spin_unlock_irq(&desc->lock); + + action_ret = action->thread_fn(action->irq, action->dev_id); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + + atomic_spin_lock_irq(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; + +out_unlock: + atomic_spin_unlock_irq(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_nested_irq); + /** * handle_simple_irq - Simple and software-decoded IRQs. 
* @irq: the interrupt number @@ -317,7 +385,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -329,16 +397,16 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->status |= IRQ_INPROGRESS; - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out_unlock: - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); } /** @@ -357,7 +425,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); mask_ack_irq(desc, irq); if (unlikely(desc->status & IRQ_INPROGRESS)) @@ -374,18 +442,21 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->status |= IRQ_INPROGRESS; - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; - if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + + if (unlikely(desc->status & IRQ_ONESHOT)) + desc->status |= IRQ_MASKED; + else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) desc->chip->unmask(irq); out_unlock: - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_level_irq); @@ -405,7 +476,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out; @@ -425,20 +496,23 @@ 
handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; } + if ((desc->status & IRQ_ONESHOT) && desc->chip->mask) + desc->chip->mask(irq); + desc->status |= IRQ_INPROGRESS; desc->status &= ~IRQ_PENDING; - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); } /** @@ -460,9 +534,14 @@ out: void handle_edge_irq(unsigned int irq, struct irq_desc *desc) { - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + /* + * Edge irqs can be requested with IRQF_ONESHOT set. RT + * (ab)uses this for enforced irq threading, but we do not + * want to mask edge type interrupts. Clear the oneshot flag. + */ + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_ONESHOT); /* * If we're currently running this IRQ, or its disabled, @@ -477,7 +556,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) } kstat_incr_irqs_this_cpu(irq, desc); - /* Start handling the irq */ if (desc->chip->ack) desc->chip->ack(irq); @@ -506,17 +584,17 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) } desc->status &= ~IRQ_PENDING; - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); desc->status &= ~IRQ_INPROGRESS; out_unlock: - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); } /** @@ -572,7 +650,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->chip = &dummy_irq_chip; } - spin_lock_irqsave(&desc->lock, flags); + chip_bus_lock(irq, desc); + 
atomic_spin_lock_irqsave(&desc->lock, flags); /* Uninstall? */ if (handle == handle_bad_irq) { @@ -590,7 +669,9 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->depth = 0; desc->chip->startup(irq); } - spin_unlock_irqrestore(&desc->lock, flags); + + atomic_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL_GPL(__set_irq_handler); @@ -620,9 +701,9 @@ void __init set_irq_noprobe(unsigned int irq) return; } - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_NOPROBE; - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } void __init set_irq_probe(unsigned int irq) @@ -635,7 +716,7 @@ void __init set_irq_probe(unsigned int irq) return; } - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); desc->status &= ~IRQ_NOPROBE; - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 065205b..2e3b251 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -79,7 +79,7 @@ static struct irq_desc irq_desc_init = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + .lock = __ATOMIC_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) @@ -107,7 +107,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); - spin_lock_init(&desc->lock); + atomic_spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP desc->node = node; @@ -129,7 +129,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) /* * Protect the sparse_irqs: */ -DEFINE_SPINLOCK(sparse_irq_lock); +DEFINE_ATOMIC_SPINLOCK(sparse_irq_lock); struct irq_desc **irq_desc_ptrs 
__read_mostly; @@ -140,7 +140,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + .lock = __ATOMIC_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), } }; @@ -208,7 +208,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) if (desc) return desc; - spin_lock_irqsave(&sparse_irq_lock, flags); + atomic_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ desc = irq_desc_ptrs[irq]; @@ -230,7 +230,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) irq_desc_ptrs[irq] = desc; out_unlock: - spin_unlock_irqrestore(&sparse_irq_lock, flags); + atomic_spin_unlock_irqrestore(&sparse_irq_lock, flags); return desc; } @@ -243,7 +243,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), + .lock = __ATOMIC_SPIN_LOCK_UNLOCKED(irq_desc->lock), } }; @@ -356,6 +356,25 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } +/* + * Momentary workaround until I have a brighter idea how to handle the + * accounting of forced threaded (shared) handlers. 
+ */ +irqreturn_t handle_irq_action(unsigned int irq, struct irqaction *action) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc->status & IRQ_ONESHOT) { + unsigned long flags; + + atomic_spin_lock_irqsave(&desc->lock, flags); + desc->forced_threads_active |= action->thread_mask; + atomic_spin_unlock_irqrestore(&desc->lock, flags); + return IRQ_WAKE_THREAD; + } + return action->handler(irq, action->dev_id); +} + /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number @@ -373,7 +392,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) do { trace_irq_handler_entry(irq, action); - ret = action->handler(irq, action->dev_id); + ret = handle_irq_action(irq, action); trace_irq_handler_exit(irq, action, ret); switch (ret) { @@ -420,8 +439,11 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) action = action->next; } while (action); +#ifndef CONFIG_PREEMPT_RT + /* FIXME: Can we unbreak that ? */ if (status & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); +#endif local_irq_disable(); return retval; @@ -450,6 +472,11 @@ unsigned int __do_IRQ(unsigned int irq) struct irqaction *action; unsigned int status; +#ifdef CONFIG_PREEMPT_RT + printk(KERN_WARNING "__do_IRQ called for irq %d. 
" + "PREEMPT_RT will crash your system soon\n", irq); + printk(KERN_WARNING "I hope you have a fire-extinguisher handy!\n"); +#endif kstat_incr_irqs_this_cpu(irq, desc); if (CHECK_IRQ_PER_CPU(desc->status)) { @@ -469,7 +496,7 @@ unsigned int __do_IRQ(unsigned int irq) return 1; } - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); if (desc->chip->ack) desc->chip->ack(irq); /* @@ -513,13 +540,13 @@ unsigned int __do_IRQ(unsigned int irq) for (;;) { irqreturn_t action_ret; - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); if (likely(!(desc->status & IRQ_PENDING))) break; desc->status &= ~IRQ_PENDING; @@ -532,7 +559,7 @@ out: * disabled while the handler was running. */ desc->chip->end(irq); - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); return 1; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e70ed55..eed93c3 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -18,7 +18,7 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); extern void clear_kstat_irqs(struct irq_desc *desc); -extern spinlock_t sparse_irq_lock; +extern atomic_spinlock_t sparse_irq_lock; #ifdef CONFIG_SPARSE_IRQ /* irq_desc_ptrs allocated at boot time */ @@ -44,6 +44,19 @@ extern int irq_select_affinity_usr(unsigned int irq); extern void irq_set_thread_affinity(struct irq_desc *desc); +/* Inline functions for support of irq chips on slow busses */ +static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) +{ + if (unlikely(desc->chip->bus_lock)) + desc->chip->bus_lock(irq); +} + +static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) +{ + if (unlikely(desc->chip->bus_sync_unlock)) + 
desc->chip->bus_sync_unlock(irq); +} + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0ec9ed8..3a5a785 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -46,9 +46,9 @@ void synchronize_irq(unsigned int irq) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); status = desc->status; - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ } while (status & IRQ_INPROGRESS); @@ -114,7 +114,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) if (!desc->chip->set_affinity) return -EINVAL; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT) { @@ -134,7 +134,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) } #endif desc->status |= IRQ_AFFINITY_SET; - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -181,11 +181,11 @@ int irq_select_affinity_usr(unsigned int irq) unsigned long flags; int ret; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); if (!ret) irq_set_thread_affinity(desc); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return ret; } @@ -230,9 +230,11 @@ void disable_irq_nosync(unsigned int irq) if (!desc) return; - spin_lock_irqsave(&desc->lock, flags); + chip_bus_lock(irq, desc); + atomic_spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, false); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(disable_irq_nosync); @@ -278,7 +280,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int 
irq, bool resume) goto err_out; /* Prevent probing on this irq: */ desc->status = status | IRQ_NOPROBE; - check_irq_resend(desc, irq); + if (!desc->forced_threads_active) + check_irq_resend(desc, irq); /* fall-through */ } default: @@ -294,7 +297,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) * matches the last disable, processing of interrupts on this * IRQ line is re-enabled. * - * This function may be called from IRQ context. + * This function may be called from IRQ context only when + * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { @@ -304,9 +308,11 @@ void enable_irq(unsigned int irq) if (!desc) return; - spin_lock_irqsave(&desc->lock, flags); + chip_bus_lock(irq, desc); + atomic_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(enable_irq); @@ -342,7 +348,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); @@ -363,7 +369,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) } } - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return ret; } EXPORT_SYMBOL(set_irq_wake); @@ -436,7 +442,108 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } -static int irq_wait_for_interrupt(struct irqaction *action) +/* + * Default primary interrupt handler for threaded interrupts. Is + * assigned as primary handler when request_threaded_irq is called + * with handler == NULL. Useful for oneshot interrupts. 
+ */ +static irqreturn_t irq_default_primary_handler(int irq, void *dev_id) +{ + return IRQ_WAKE_THREAD; +} + +/* + * Primary handler for nested threaded interrupts. Should never be + * called. + */ +static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) +{ + WARN(1, "Primary handler called for nested irq %d\n", irq); + return IRQ_NONE; +} + +#ifdef CONFIG_PREEMPT_HARDIRQS +/* + * If the caller does not request irq threading then the handler + * becomes the thread function and we use the above handler as the + * primary hardirq context handler. + */ +static void preempt_hardirq_setup(struct irqaction *new) +{ + if (new->thread_fn || (new->flags & (IRQF_NODELAY | IRQF_PERCPU))) + return; + + new->flags |= IRQF_ONESHOT; + new->thread_fn = new->handler; + new->handler = irq_default_primary_handler; +} + +#else +static inline void preempt_hardirq_setup(struct irqaction *new) { } +#endif + +/* + * forced threaded interrupts need to unmask the interrupt line + */ +static int preempt_hardirq_thread_done(struct irq_desc *desc, + struct irqaction *action) +{ + unsigned long masked; + + if (!(desc->status & IRQ_ONESHOT)) + return 0; +again: + atomic_spin_lock_irq(&desc->lock); + /* + * Be careful. The hardirq handler might be running on the + * other CPU. + */ + if (desc->status & IRQ_INPROGRESS) { + atomic_spin_unlock_irq(&desc->lock); + cpu_relax(); + goto again; + } + + /* + * Now check again, whether the thread should run. Otherwise + * we would clear the forced_threads_active bit which was just + * set. + */ + if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { + atomic_spin_unlock_irq(&desc->lock); + return 1; + } + + masked = desc->forced_threads_active; + desc->forced_threads_active &= ~action->thread_mask; + + /* + * Unmask the interrupt line when this is the last active + * thread and the interrupt is not disabled. 
+ */ + if (masked && !desc->forced_threads_active && + !(desc->status & IRQ_DISABLED)) { + if (desc->chip->unmask) + desc->chip->unmask(action->irq); + /* + * Do we need to call check_irq_resend() here ? + * No. check_irq_resend needs only to be checked when + * we go from IRQ_DISABLED to IRQ_ENABLED state. + */ + } + atomic_spin_unlock_irq(&desc->lock); + return 0; +} + +static inline void +preempt_hardirq_cleanup(struct irq_desc *desc, struct irqaction *action) +{ + clear_bit(IRQTF_RUNTHREAD, &action->thread_flags); + preempt_hardirq_thread_done(desc, action); +} + +static int +irq_wait_for_interrupt(struct irq_desc *desc, struct irqaction *action) { while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); @@ -446,7 +553,8 @@ static int irq_wait_for_interrupt(struct irqaction *action) __set_current_state(TASK_RUNNING); return 0; } - schedule(); + if (!preempt_hardirq_thread_done(desc, action)) + schedule(); } return -1; } @@ -472,9 +580,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) return; } - spin_lock_irq(&desc->lock); + atomic_spin_lock_irq(&desc->lock); cpumask_copy(mask, desc->affinity); - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); @@ -495,15 +603,16 @@ static int irq_thread(void *data) int wake; sched_setscheduler(current, SCHED_FIFO, ¶m); + current->flags |= PF_HARDIRQ; current->irqaction = action; - while (!irq_wait_for_interrupt(action)) { + while (!irq_wait_for_interrupt(desc, action)) { irq_thread_check_affinity(desc, action); atomic_inc(&desc->threads_active); - spin_lock_irq(&desc->lock); + atomic_spin_lock_irq(&desc->lock); if (unlikely(desc->status & IRQ_DISABLED)) { /* * CHECKME: We might need a dedicated @@ -513,9 +622,9 @@ static int irq_thread(void *data) * retriggers the interrupt itself --- tglx */ desc->status |= IRQ_PENDING; - spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); } else { - 
spin_unlock_irq(&desc->lock); + atomic_spin_unlock_irq(&desc->lock); action->thread_fn(action->irq, action->dev_id); } @@ -526,6 +635,8 @@ static int irq_thread(void *data) wake_up(&desc->wait_for_threads); } + preempt_hardirq_cleanup(desc, action); + /* * Clear irqaction. Otherwise exit_irq_thread() would make * fuzz about an active irq thread going into nirvana. @@ -564,8 +675,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; const char *old_name = NULL; - unsigned long flags; - int shared = 0; + unsigned long flags, thread_mask = 0; + int nested, shared = 0; int ret; if (!desc) @@ -590,10 +701,32 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } + + /* Preempt-RT setup for forced threading */ + preempt_hardirq_setup(new); + + /* + * Check whether the interrupt nests into another interrupt + * thread. + */ + nested = desc->status & IRQ_NESTED_THREAD; + if (nested) { + if (!new->thread_fn) + return -EINVAL; + /* + * Replace the primary handler which was provided from + * the driver for non nested interrupt handling by the + * dummy function which warns when called. + */ + new->handler = irq_nested_primary_handler; + } + /* - * Threaded handler ? + * Create a handler thread when a thread function is supplied + * and the interrupt does not nest into another interrupt + * thread. 
*/ - if (new->thread_fn) { + if (new->thread_fn && !nested) { struct task_struct *t; t = kthread_create(irq_thread, new, "irq/%d-%s", irq, @@ -612,7 +745,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* * The following block of code has to be executed atomically */ - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; old = *old_ptr; if (old) { @@ -637,12 +770,20 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* add new interrupt at end of irq queue */ do { + thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } + /* + * Setup the thread mask for this irqaction. No risk that ffz + * will fail. If we have 32 resp. 64 devices sharing one irq + * then ..... + */ + new->thread_mask = 1 << ffz(thread_mask); + if (!shared) { irq_chip_set_defaults(desc->chip); @@ -662,9 +803,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | + desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); + if (new->flags & IRQF_ONESHOT) + desc->status |= IRQ_ONESHOT; + if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; @@ -705,7 +849,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) __enable_irq(desc, irq, false); } - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); /* * Strictly no need to wake it up, but hung_task complains @@ -732,7 +876,7 @@ mismatch: ret = -EBUSY; out_thread: - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); if (new->thread) { struct task_struct *t = new->thread; @@ -774,7 +918,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc) return NULL; - 
spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); /* * There can be multiple actions per IRQ descriptor, find the right @@ -786,7 +930,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return NULL; } @@ -814,7 +958,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -875,7 +1019,14 @@ EXPORT_SYMBOL_GPL(remove_irq); */ void free_irq(unsigned int irq, void *dev_id) { + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc) + return; + + chip_bus_lock(irq, desc); kfree(__free_irq(irq, dev_id)); + chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(free_irq); @@ -884,6 +1035,8 @@ EXPORT_SYMBOL(free_irq); * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. 
* Primary handler for threaded interrupts + * If NULL and thread_fn != NULL the default + * primary handler is installed * @thread_fn: Function called from the irq handler thread * If NULL, no irq thread is created * @irqflags: Interrupt type flags @@ -963,8 +1116,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (desc->status & IRQ_NOREQUEST) return -EINVAL; - if (!handler) - return -EINVAL; + + if (!handler) { + if (!thread_fn) + return -EINVAL; + handler = irq_default_primary_handler; + } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) @@ -976,12 +1133,15 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->dev_id = dev_id; + chip_bus_lock(irq, desc); retval = __setup_irq(irq, desc, action); + chip_bus_sync_unlock(irq, desc); + if (retval) kfree(action); #ifdef CONFIG_DEBUG_SHIRQ - if (irqflags & IRQF_SHARED) { + if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... @@ -989,13 +1149,18 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, * run in parallel with our fake. 
*/ unsigned long flags; + irqreturn_t ret; disable_irq(irq); local_irq_save(flags); - handler(irq, dev_id); + ret = action->handler(irq, dev_id); local_irq_restore(flags); + + if (ret == IRQ_WAKE_THREAD) + action->thread_fn(irq, dev_id); + enable_irq(irq); } #endif diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index fcb6c96..1d9ff65 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -27,7 +27,7 @@ void move_masked_irq(int irq) if (!desc->chip->set_affinity) return; - assert_spin_locked(&desc->lock); + assert_atomic_spin_locked(&desc->lock); /* * If there was a valid mask to work with, please @@ -54,6 +54,7 @@ void move_masked_irq(int irq) void move_native_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); + int mask = 1; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -61,8 +62,18 @@ void move_native_irq(int irq) if (unlikely(desc->status & IRQ_DISABLED)) return; - desc->chip->mask(irq); + /* + * If the irq is already in progress, it should be masked. + * If we unmask it, we might cause an interrupt storm on RT. 
+ */ + if (unlikely(desc->status & IRQ_INPROGRESS || + desc->forced_threads_active)) + mask = 0; + + if (mask) + desc->chip->mask(irq); move_masked_irq(irq); - desc->chip->unmask(irq); + if (mask) + desc->chip->unmask(irq); } diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 3fd3019..23bad34 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -42,7 +42,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, "for migration.\n", irq); return false; } - spin_lock_init(&desc->lock); + atomic_spin_lock_init(&desc->lock); desc->node = node; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); @@ -67,7 +67,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, irq = old_desc->irq; - spin_lock_irqsave(&sparse_irq_lock, flags); + atomic_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ desc = irq_desc_ptrs[irq]; @@ -91,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, } irq_desc_ptrs[irq] = desc; - spin_unlock_irqrestore(&sparse_irq_lock, flags); + atomic_spin_unlock_irqrestore(&sparse_irq_lock, flags); /* free the old one */ free_one_irq_desc(old_desc, desc); @@ -100,7 +100,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, return desc; out_unlock: - spin_unlock_irqrestore(&sparse_irq_lock, flags); + atomic_spin_unlock_irqrestore(&sparse_irq_lock, flags); return desc; } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 638d8be..e4e3783 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -28,9 +28,9 @@ void suspend_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, true); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } 
for_each_irq_desc(irq, desc) @@ -56,9 +56,9 @@ void resume_device_irqs(void) if (!(desc->status & IRQ_SUSPENDED)) continue; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); } } EXPORT_SYMBOL_GPL(resume_device_irqs); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 692363d..d4ae675 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -169,7 +169,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) unsigned long flags; int ret = 1; - spin_lock_irqsave(&desc->lock, flags); + atomic_spin_lock_irqsave(&desc->lock, flags); for (action = desc->action ; action; action = action->next) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { @@ -177,7 +177,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) break; } } - spin_unlock_irqrestore(&desc->lock, flags); + atomic_spin_unlock_irqrestore(&desc->lock, flags); return ret; } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 4d56829..d8b2df0 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -28,7 +28,7 @@ static int try_one_irq(int irq, struct irq_desc *desc) struct irqaction *action; int ok = 0, work = 0; - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); /* Already running on another processor */ if (desc->status & IRQ_INPROGRESS) { /* @@ -37,13 +37,13 @@ static int try_one_irq(int irq, struct irq_desc *desc) */ if (desc->action && (desc->action->flags & IRQF_SHARED)) desc->status |= IRQ_PENDING; - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); return ok; } /* Honour the normal IRQ locking */ desc->status |= IRQ_INPROGRESS; action = desc->action; - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); while (action) { /* Only shared IRQ handlers are safe to call */ @@ -54,9 +54,9 @@ static int 
try_one_irq(int irq, struct irq_desc *desc) } action = action->next; } - local_irq_disable(); + /* Now clean up the flags */ - spin_lock(&desc->lock); + atomic_spin_lock_irq(&desc->lock); action = desc->action; /* @@ -68,9 +68,9 @@ static int try_one_irq(int irq, struct irq_desc *desc) * Perform real IRQ processing for the IRQ we deferred */ work = 1; - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); handle_IRQ_event(irq, action); - spin_lock(&desc->lock); + atomic_spin_lock(&desc->lock); desc->status &= ~IRQ_PENDING; } desc->status &= ~IRQ_INPROGRESS; @@ -80,7 +80,7 @@ static int try_one_irq(int irq, struct irq_desc *desc) */ if (work && desc->chip && desc->chip->end) desc->chip->end(irq); - spin_unlock(&desc->lock); + atomic_spin_unlock(&desc->lock); return ok; } @@ -288,6 +288,11 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); static int __init irqfixup_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT + printk(KERN_WARNING "irqfixup boot option not supported " + "w/ CONFIG_PREEMPT_RT\n"); + return 1; +#endif irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @@ -301,6 +306,11 @@ MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode"); static int __init irqpoll_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT + printk(KERN_WARNING "irqpoll boot option not supported " + "w/ CONFIG_PREEMPT_RT\n"); + return 1; +#endif irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); diff --git a/kernel/itimer.c b/kernel/itimer.c index 58762f7..b4d3998 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -161,6 +161,7 @@ again: /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); + hrtimer_wait_for_timer(&tsk->signal->real_timer); goto again; } expires = timeval_to_ktime(value->it_value); diff --git a/kernel/kmod.c 
b/kernel/kmod.c index 385c31a..a922808 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -37,6 +37,8 @@ #include <linux/suspend.h> #include <asm/uaccess.h> +#include <trace/events/module.h> + extern int max_threads; static struct workqueue_struct *khelper_wq; @@ -108,6 +110,8 @@ int __request_module(bool wait, const char *fmt, ...) return -ENOMEM; } + trace_module_request(module_name, wait, _RET_IP_); + ret = call_usermodehelper(modprobe_path, argv, envp, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_dec(&kmod_concurrent); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0540948..03fe489 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -73,10 +73,10 @@ static bool kprobes_all_disarmed; static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { - spinlock_t lock ____cacheline_aligned_in_smp; + atomic_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; -static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) +static atomic_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) { return &(kretprobe_table_locks[hash].lock); } @@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) struct kprobe_insn_page { - struct hlist_node hlist; + struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ char slot_used[INSNS_PER_PAGE]; int nused; @@ -117,7 +117,7 @@ enum kprobe_slot_state { }; static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ -static struct hlist_head kprobe_insn_pages; +static LIST_HEAD(kprobe_insn_pages); static int kprobe_garbage_slots; static int collect_garbage_slots(void); @@ -152,10 +152,9 @@ loop_end: static kprobe_opcode_t __kprobes *__get_insn_slot(void) { struct kprobe_insn_page *kip; - struct hlist_node *pos; retry: - hlist_for_each_entry(kip, pos, &kprobe_insn_pages, 
hlist) { + list_for_each_entry(kip, &kprobe_insn_pages, list) { if (kip->nused < INSNS_PER_PAGE) { int i; for (i = 0; i < INSNS_PER_PAGE; i++) { @@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void) kfree(kip); return NULL; } - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, &kprobe_insn_pages); + INIT_LIST_HEAD(&kip->list); + list_add(&kip->list, &kprobe_insn_pages); memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); kip->slot_used[0] = SLOT_USED; kip->nused = 1; @@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) * so as not to have to set it up again the * next time somebody inserts a probe. */ - hlist_del(&kip->hlist); - if (hlist_empty(&kprobe_insn_pages)) { - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { + if (!list_is_singular(&kprobe_insn_pages)) { + list_del(&kip->list); module_free(NULL, kip->insns); kfree(kip); } @@ -235,14 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) static int __kprobes collect_garbage_slots(void) { - struct kprobe_insn_page *kip; - struct hlist_node *pos, *next; + struct kprobe_insn_page *kip, *next; /* Ensure no-one is preepmted on the garbages */ if (check_safety()) return -EAGAIN; - hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { + list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { int i; if (kip->ngarbage == 0) continue; @@ -260,19 +254,17 @@ static int __kprobes collect_garbage_slots(void) void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) { struct kprobe_insn_page *kip; - struct hlist_node *pos; mutex_lock(&kprobe_insn_mutex); - hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { + list_for_each_entry(kip, &kprobe_insn_pages, list) { if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { int i = (slot - kip->insns) / MAX_INSN_SIZE; if (dirty) { kip->slot_used[i] = SLOT_DIRTY; 
kip->ngarbage++; - } else { + } else collect_one_slot(kip, i); - } break; } } @@ -415,9 +407,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, hlist_del(&ri->hlist); INIT_HLIST_NODE(&ri->hlist); if (likely(rp)) { - spin_lock(&rp->lock); + atomic_spin_lock(&rp->lock); hlist_add_head(&ri->hlist, &rp->free_instances); - spin_unlock(&rp->lock); + atomic_spin_unlock(&rp->lock); } else /* Unregistering */ hlist_add_head(&ri->hlist, head); @@ -427,34 +419,34 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + atomic_spinlock_t *hlist_lock; *head = &kretprobe_inst_table[hash]; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + atomic_spin_lock_irqsave(hlist_lock, *flags); } static void __kprobes kretprobe_table_lock(unsigned long hash, unsigned long *flags) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + atomic_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + atomic_spin_lock_irqsave(hlist_lock, *flags); } void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + atomic_spinlock_t *hlist_lock; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + atomic_spin_unlock_irqrestore(hlist_lock, *flags); } void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + atomic_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + atomic_spin_unlock_irqrestore(hlist_lock, *flags); } /* @@ -969,12 +961,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, /*TODO: consider to only swap the RA after the last pre_handler fired */ hash = 
hash_ptr(current, KPROBE_HASH_BITS); - spin_lock_irqsave(&rp->lock, flags); + atomic_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance, hlist); hlist_del(&ri->hlist); - spin_unlock_irqrestore(&rp->lock, flags); + atomic_spin_unlock_irqrestore(&rp->lock, flags); ri->rp = rp; ri->task = current; @@ -991,7 +983,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, kretprobe_table_unlock(hash, &flags); } else { rp->nmissed++; - spin_unlock_irqrestore(&rp->lock, flags); + atomic_spin_unlock_irqrestore(&rp->lock, flags); } return 0; } @@ -1027,7 +1019,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) rp->maxactive = NR_CPUS; #endif } - spin_lock_init(&rp->lock); + atomic_spin_lock_init(&rp->lock); INIT_HLIST_HEAD(&rp->free_instances); for (i = 0; i < rp->maxactive; i++) { inst = kmalloc(sizeof(struct kretprobe_instance) + @@ -1207,7 +1199,7 @@ static int __init init_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); - spin_lock_init(&(kretprobe_table_locks[i].lock)); + atomic_spin_lock_init(&(kretprobe_table_locks[i].lock)); } /* diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ca07c5c..34311e1 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -59,7 +59,7 @@ #include <linux/slab.h> #include <linux/stacktrace.h> -static DEFINE_SPINLOCK(latency_lock); +static DEFINE_ATOMIC_SPINLOCK(latency_lock); #define MAXLR 128 static struct latency_record latency_record[MAXLR]; @@ -73,19 +73,19 @@ void clear_all_latency_tracing(struct task_struct *p) if (!latencytop_enabled) return; - spin_lock_irqsave(&latency_lock, flags); + atomic_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; - spin_unlock_irqrestore(&latency_lock, flags); + atomic_spin_unlock_irqrestore(&latency_lock, flags); } 
static void clear_global_latency_tracing(void) { unsigned long flags; - spin_lock_irqsave(&latency_lock, flags); + atomic_spin_lock_irqsave(&latency_lock, flags); memset(&latency_record, 0, sizeof(latency_record)); - spin_unlock_irqrestore(&latency_lock, flags); + atomic_spin_unlock_irqrestore(&latency_lock, flags); } static void __sched @@ -191,7 +191,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) lat.max = usecs; store_stacktrace(tsk, &lat); - spin_lock_irqsave(&latency_lock, flags); + atomic_spin_lock_irqsave(&latency_lock, flags); account_global_scheduler_latency(tsk, &lat); @@ -233,7 +233,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: - spin_unlock_irqrestore(&latency_lock, flags); + atomic_spin_unlock_irqrestore(&latency_lock, flags); } static int lstats_show(struct seq_file *m, void *v) diff --git a/kernel/lock-internals.h b/kernel/lock-internals.h new file mode 100644 index 0000000..76f694c --- /dev/null +++ b/kernel/lock-internals.h @@ -0,0 +1,75 @@ +/* + * Macros shared by spinlock.c and rwlock.c + */ +/* + * This could be a long-held lock. We both prepare to spin for a long + * time (making _this_ CPU preemptable if possible), and we also signal + * towards that other CPU that it should break the lock ASAP. + * + * (We do this in a function because inlining it would be excessive.) 
+ */ + +#define BUILD_LOCK_OPS(prefix, op, locktype) \ +void __lockfunc _##prefix##_lock(locktype##_t *lock) \ +{ \ + for (;;) { \ + preempt_disable(); \ + if (likely(_raw_##op##_trylock(lock))) \ + break; \ + preempt_enable(); \ + \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ + while (!prefix##_can_lock(lock) && (lock)->break_lock) \ + _raw_##op##_relax(&lock->raw_lock); \ + } \ + (lock)->break_lock = 0; \ +} \ + \ +EXPORT_SYMBOL(_##prefix##_lock); \ + \ +unsigned long __lockfunc _##prefix##_lock_irqsave(locktype##_t *lock) \ +{ \ + unsigned long flags; \ + \ + for (;;) { \ + preempt_disable(); \ + local_irq_save(flags); \ + if (likely(_raw_##op##_trylock(lock))) \ + break; \ + local_irq_restore(flags); \ + preempt_enable(); \ + \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ + while (!prefix##_can_lock(lock) && (lock)->break_lock) \ + _raw_##op##_relax(&lock->raw_lock); \ + } \ + (lock)->break_lock = 0; \ + return flags; \ +} \ + \ +EXPORT_SYMBOL(_##prefix##_lock_irqsave); \ + \ +void __lockfunc _##prefix##_lock_irq(locktype##_t *lock) \ +{ \ + _##prefix##_lock_irqsave(lock); \ +} \ + \ +EXPORT_SYMBOL(_##prefix##_lock_irq); \ + \ +void __lockfunc _##prefix##_lock_bh(locktype##_t *lock) \ +{ \ + unsigned long flags; \ + \ + /* */ \ + /* Careful: we must exclude softirqs too, hence the */ \ + /* irq-disabling. 
We use the generic preemption-aware */ \ + /* function: */ \ + /**/ \ + flags = _##prefix##_lock_irqsave(lock); \ + local_bh_disable(); \ + local_irq_restore(flags); \ +} \ + \ +EXPORT_SYMBOL(_##prefix##_lock_bh) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8bbeef9..a53d4fb 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -846,6 +846,21 @@ out_unlock_set: return class; } +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_TRACE_IRQFLAGS) + +#define RECURSION_LIMIT 40 + +static int noinline print_infinite_recursion_bug(void) +{ + if (!debug_locks_off_graph_unlock()) + return 0; + + WARN_ON(1); + + return 0; +} +#endif /* CONFIG_PROVE_LOCKING || CONFIG_TRACE_IRQFLAGS */ + #ifdef CONFIG_PROVE_LOCKING /* * Allocate a lockdep entry. (assumes the graph_lock held, returns @@ -977,18 +992,6 @@ static noinline int print_circular_bug_tail(void) return 0; } -#define RECURSION_LIMIT 40 - -static int noinline print_infinite_recursion_bug(void) -{ - if (!debug_locks_off_graph_unlock()) - return 0; - - WARN_ON(1); - - return 0; -} - unsigned long __lockdep_count_forward_deps(struct lock_class *class, unsigned int depth) { @@ -1181,6 +1184,7 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) return 1; } +#ifdef CONFIG_PROVE_LOCKING static int print_bad_irq_dependency(struct task_struct *curr, struct held_lock *prev, @@ -1241,6 +1245,7 @@ print_bad_irq_dependency(struct task_struct *curr, return 0; } +#endif /* CONFIG_PROVE_LOCKING */ static int check_usage(struct task_struct *curr, struct held_lock *prev, diff --git a/kernel/module.c b/kernel/module.c index 2d53718..46580ed 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -55,6 +55,11 @@ #include <linux/percpu.h> #include <linux/kmemleak.h> +#define CREATE_TRACE_POINTS +#include <trace/events/module.h> + +EXPORT_TRACEPOINT_SYMBOL(module_get); + #if 0 #define DEBUGP printk #else @@ -942,6 +947,8 @@ void module_put(struct module *module) if (module) { unsigned int cpu = get_cpu(); 
local_dec(__module_ref_addr(module, cpu)); + trace_module_put(module, _RET_IP_, + local_read(__module_ref_addr(module, cpu))); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); @@ -1497,6 +1504,8 @@ static int __unlink_module(void *_mod) /* Free a module, remove from lists, etc (must hold module_mutex). */ static void free_module(struct module *mod) { + trace_module_free(mod); + /* Delete from various lists */ stop_machine(__unlink_module, mod, NULL); remove_notes_attrs(mod); @@ -2364,6 +2373,8 @@ static noinline struct module *load_module(void __user *umod, /* Get rid of temporary copy */ vfree(hdr); + trace_module_load(mod); + /* Done! */ return mod; diff --git a/kernel/mutex.c b/kernel/mutex.c index 947b3ad..73ad8a6 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -249,9 +249,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - preempt_enable_no_resched(); - schedule(); + + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); + spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/kernel/notifier.c b/kernel/notifier.c index 61d5aa5..cf40c2d 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -71,7 +71,7 @@ static int notifier_chain_unregister(struct notifier_block **nl, * @returns: notifier_call_chain returns the value returned by the * last notifier function called. */ -static int __kprobes notifier_call_chain(struct notifier_block **nl, +static int __kprobes notrace notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v, int nr_to_call, int *nr_calls) { @@ -217,7 +217,7 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). 
*/ - if (unlikely(system_state == SYSTEM_BOOTING)) + if (unlikely(system_state < SYSTEM_RUNNING)) return notifier_chain_register(&nh->head, n); down_write(&nh->rwsem); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3f49f53..225f8f2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -194,14 +194,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * if so. If we locked the right context, then it * can't get swapped on us any more. */ - spin_lock_irqsave(&ctx->lock, *flags); + atomic_spin_lock_irqsave(&ctx->lock, *flags); if (ctx != rcu_dereference(task->perf_counter_ctxp)) { - spin_unlock_irqrestore(&ctx->lock, *flags); + atomic_spin_unlock_irqrestore(&ctx->lock, *flags); goto retry; } if (!atomic_inc_not_zero(&ctx->refcount)) { - spin_unlock_irqrestore(&ctx->lock, *flags); + atomic_spin_unlock_irqrestore(&ctx->lock, *flags); ctx = NULL; } } @@ -222,7 +222,7 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); + atomic_spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; } @@ -231,9 +231,9 @@ static void perf_unpin_context(struct perf_counter_context *ctx) { unsigned long flags; - spin_lock_irqsave(&ctx->lock, flags); + atomic_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); + atomic_spin_unlock_irqrestore(&ctx->lock, flags); put_ctx(ctx); } @@ -364,7 +364,7 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + atomic_spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the * counters on a global level. 
@@ -386,7 +386,7 @@ static void __perf_counter_remove_from_context(void *info) } perf_enable(); - spin_unlock(&ctx->lock); + atomic_spin_unlock(&ctx->lock); } @@ -425,12 +425,12 @@ retry: task_oncpu_function_call(task, __perf_counter_remove_from_context, counter); - spin_lock_irq(&ctx->lock); + atomic_spin_lock_irq(&ctx->lock); /* * If the context is active we need to retry the smp call. */ if (ctx->nr_active && !list_empty(&counter->list_entry)) { - spin_unlock_irq(&ctx->lock); + atomic_spin_unlock_irq(&ctx->lock); goto retry; } @@ -442,7 +442,7 @@ retry: if (!list_empty(&counter->list_entry)) { list_del_counter(counter, ctx); } - spin_unlock_irq(&ctx->lock); + atomic_spin_unlock_irq(&ctx->lock); } static inline u64 perf_clock(void) @@ -511,7 +511,7 @@ static void __perf_counter_disable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + atomic_spin_lock(&ctx->lock); /* * If the counter is on, turn it off. @@ -527,7 +527,7 @@ static void __perf_counter_disable(void *info) counter->state = PERF_COUNTER_STATE_OFF; } - spin_unlock(&ctx->lock); + atomic_spin_unlock(&ctx->lock); } /* @@ -560,12 +560,12 @@ static void perf_counter_disable(struct perf_counter *counter) retry: task_oncpu_function_call(task, __perf_counter_disable, counter); - spin_lock_irq(&ctx->lock); + atomic_spin_lock_irq(&ctx->lock); /* * If the counter is still active, we need to retry the cross-call. 
*/ if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - spin_unlock_irq(&ctx->lock); + atomic_spin_unlock_irq(&ctx->lock); goto retry; } @@ -578,7 +578,7 @@ static void perf_counter_disable(struct perf_counter *counter) counter->state = PERF_COUNTER_STATE_OFF; } - spin_unlock_irq(&ctx->lock); + atomic_spin_unlock_irq(&ctx->lock); } static int @@ -746,7 +746,7 @@ static void __perf_install_in_context(void *info) cpuctx->task_ctx = ctx; } - spin_lock(&ctx->lock); + atomic_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); @@ -796,7 +796,7 @@ static void __perf_install_in_context(void *info) unlock: perf_enable(); - spin_unlock(&ctx->lock); + atomic_spin_unlock(&ctx->lock); } /* @@ -832,12 +832,12 @@ retry: task_oncpu_function_call(task, __perf_install_in_context, counter); - spin_lock_irq(&ctx->lock); + atomic_spin_lock_irq(&ctx->lock); /* * we need to retry the smp call. */ if (ctx->is_active && list_empty(&counter->list_entry)) { - spin_unlock_irq(&ctx->lock); + atomic_spin_unlock_irq(&ctx->lock); goto retry; } @@ -848,7 +848,7 @@ retry: */ if (list_empty(&counter->list_entry)) add_counter_to_ctx(counter, ctx); - spin_unlock_irq(&ctx->lock); + atomic_spin_unlock_irq(&ctx->lock); } /* @@ -893,7 +893,7 @@ static void __perf_counter_enable(void *info) cpuctx->task_ctx = ctx; } - spin_lock(&ctx->lock); + atomic_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); @@ -935,7 +935,7 @@ static void __perf_counter_enable(void *info) } unlock: - spin_unlock(&ctx->lock); + atomic_spin_unlock(&ctx->lock); } /* @@ -961,7 +961,7 @@ static void perf_counter_enable(struct perf_counter *counter) return; } - spin_lock_irq(&ctx->lock); + atomic_spin_lock_irq(&ctx->lock); if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto out; @@ -976,10 +976,10 @@ static void perf_counter_enable(struct perf_counter *counter) counter->state = PERF_COUNTER_STATE_OFF; retry: - spin_unlock_irq(&ctx->lock); + atomic_spin_unlock_irq(&ctx->lock); 
task_oncpu_function_call(task, __perf_counter_enable, counter); - spin_lock_irq(&ctx->lock); + atomic_spin_lock_irq(&ctx->lock); /* * If the context is active and the counter is still off, @@ -996,7 +996,7 @@ static void perf_counter_enable(struct perf_counter *counter) __perf_counter_mark_enabled(counter, ctx); out: - spin_unlock_irq(&ctx->lock); + atomic_spin_unlock_irq(&ctx->lock); } static int perf_counter_refresh(struct perf_counter *counter, int refresh) @@ -1018,7 +1018,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, { struct perf_counter *counter; - spin_lock(&ctx->lock); + atomic_spin_lock(&ctx->lock); ctx->is_active = 0; if (likely(!ctx->nr_counters)) goto out; @@ -1035,7 +1035,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, } perf_enable(); out: - spin_unlock(&ctx->lock); + atomic_spin_unlock(&ctx->lock); } /* @@ -1175,8 +1175,8 @@ void perf_counter_task_sched_out(struct task_struct *task, * order we take the locks because no other cpu could * be trying to lock both of these tasks. 
*/ - spin_lock(&ctx->lock); - spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + atomic_spin_lock(&ctx->lock); + atomic_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { /* * XXX do we need a memory barrier of sorts @@ -1190,8 +1190,8 @@ void perf_counter_task_sched_out(struct task_struct *task, perf_counter_sync_stat(ctx, next_ctx); } - spin_unlock(&next_ctx->lock); - spin_unlock(&ctx->lock); + atomic_spin_unlock(&next_ctx->lock); + atomic_spin_unlock(&ctx->lock); } rcu_read_unlock(); @@ -1233,7 +1233,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_counter *counter; int can_add_hw = 1; - spin_lock(&ctx->lock); + atomic_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_counters)) goto out; @@ -1298,7 +1298,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, } perf_enable(); out: - spin_unlock(&ctx->lock); + atomic_spin_unlock(&ctx->lock); } /* @@ -1362,7 +1362,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) struct hw_perf_counter *hwc; u64 interrupts, freq; - spin_lock(&ctx->lock); + atomic_spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state != PERF_COUNTER_STATE_ACTIVE) continue; @@ -1417,7 +1417,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) perf_enable(); } } - spin_unlock(&ctx->lock); + atomic_spin_unlock(&ctx->lock); } /* @@ -1430,7 +1430,7 @@ static void rotate_ctx(struct perf_counter_context *ctx) if (!ctx->nr_counters) return; - spin_lock(&ctx->lock); + atomic_spin_lock(&ctx->lock); /* * Rotate the first entry last (works just fine for group counters too): */ @@ -1441,7 +1441,7 @@ static void rotate_ctx(struct perf_counter_context *ctx) } perf_enable(); - spin_unlock(&ctx->lock); + atomic_spin_unlock(&ctx->lock); } void perf_counter_task_tick(struct task_struct *curr, int cpu) @@ -1490,7 +1490,7 @@ static void perf_counter_enable_on_exec(struct task_struct 
*task) __perf_counter_task_sched_out(ctx); - spin_lock(&ctx->lock); + atomic_spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (!counter->attr.enable_on_exec) @@ -1508,7 +1508,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task) if (enabled) unclone_ctx(ctx); - spin_unlock(&ctx->lock); + atomic_spin_unlock(&ctx->lock); perf_counter_task_sched_in(task, smp_processor_id()); out: @@ -1567,7 +1567,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, struct task_struct *task) { memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); + atomic_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); INIT_LIST_HEAD(&ctx->event_list); @@ -1637,7 +1637,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) ctx = perf_lock_task_context(task, &flags); if (ctx) { unclone_ctx(ctx); - spin_unlock_irqrestore(&ctx->lock, flags); + atomic_spin_unlock_irqrestore(&ctx->lock, flags); } if (!ctx) { @@ -1959,7 +1959,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) if (!value) return -EINVAL; - spin_lock_irq(&ctx->lock); + atomic_spin_lock_irq(&ctx->lock); if (counter->attr.freq) { if (value > sysctl_perf_counter_sample_rate) { ret = -EINVAL; @@ -1972,7 +1972,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) counter->hw.sample_period = value; } unlock: - spin_unlock_irq(&ctx->lock); + atomic_spin_unlock_irq(&ctx->lock); return ret; } @@ -2382,11 +2382,26 @@ static void perf_pending_counter(struct perf_pending_entry *entry) __perf_counter_disable(counter); } +#ifndef CONFIG_PREEMPT_RT + if (counter->pending_wakeup) { + counter->pending_wakeup = 0; + perf_counter_wakeup(counter); + } +#endif +} + +#ifdef CONFIG_PREEMPT_RT +static void perf_pending_counter_softirq(struct perf_pending_entry *entry) +{ + struct perf_counter *counter = container_of(entry, + struct perf_counter, pending_softirq); + if 
(counter->pending_wakeup) { counter->pending_wakeup = 0; perf_counter_wakeup(counter); } } +#endif #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) @@ -2394,33 +2409,43 @@ static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { PENDING_TAIL, }; -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_softirq_head) = { - struct perf_pending_entry **head; + PENDING_TAIL, +}; +static void __perf_pending_queue(struct perf_pending_entry **head, + struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) return; entry->func = func; - head = &get_cpu_var(perf_pending_head); - do { entry->next = *head; } while (cmpxchg(head, entry->next, entry) != entry->next); +} - set_perf_counter_pending(); +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ + struct perf_pending_entry **head; + head = &get_cpu_var(perf_pending_head); + __perf_pending_queue(head, entry, func); put_cpu_var(perf_pending_head); + + set_perf_counter_pending(); } -static int __perf_pending_run(void) +static int __perf_pending_run(struct perf_pending_entry **head) { struct perf_pending_entry *list; int nr = 0; - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); + list = xchg(head, PENDING_TAIL); while (list != PENDING_TAIL) { void (*func)(struct perf_pending_entry *); struct perf_pending_entry *entry = list; @@ -2449,9 +2474,8 @@ static inline int perf_not_pending(struct perf_counter *counter) * If we flush on whatever cpu we run, there is a chance we don't * need to wait. 
*/ - get_cpu(); - __perf_pending_run(); - put_cpu(); + __perf_pending_run(&__raw_get_cpu_var(perf_pending_head)); + __perf_pending_run(&__raw_get_cpu_var(perf_pending_softirq_head)); /* * Ensure we see the proper queue state before going to sleep @@ -2468,7 +2492,13 @@ static void perf_pending_sync(struct perf_counter *counter) void perf_counter_do_pending(void) { - __perf_pending_run(); + __perf_pending_run(&__get_cpu_var(perf_pending_head)); +} + +void perf_counter_do_pending_softirq(void) +{ + __perf_pending_run(&__raw_get_cpu_var(perf_pending_head)); + __perf_pending_run(&__raw_get_cpu_var(perf_pending_softirq_head)); } /* @@ -2526,12 +2556,23 @@ static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->data->poll, POLL_IN); +#ifndef CONFIG_PREEMPT_RT if (handle->nmi) { handle->counter->pending_wakeup = 1; perf_pending_queue(&handle->counter->pending, perf_pending_counter); } else perf_counter_wakeup(handle->counter); +#else + /* + * Move it always to the softirq. This code is called with + * interrupts disabled. + */ + handle->counter->pending_wakeup = 1; + __perf_pending_queue(&__get_cpu_var(perf_pending_softirq_head), + &handle->counter->pending_softirq, + perf_pending_counter_softirq); +#endif } /* @@ -4517,7 +4558,7 @@ void perf_counter_exit_task(struct task_struct *child) * reading child->perf_counter_ctxp, we wait until it has * incremented the context's refcount before we do put_ctx below. */ - spin_lock(&child_ctx->lock); + atomic_spin_lock(&child_ctx->lock); child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get @@ -4525,7 +4566,7 @@ void perf_counter_exit_task(struct task_struct *child) * the counters from it. 
*/ unclone_ctx(child_ctx); - spin_unlock_irqrestore(&child_ctx->lock, flags); + atomic_spin_unlock_irqrestore(&child_ctx->lock, flags); /* * Report the task dead after unscheduling the counters so that we @@ -4811,11 +4852,11 @@ perf_set_reserve_percpu(struct sysdev_class *class, perf_reserved_percpu = val; for_each_online_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_irq(&cpuctx->ctx.lock); + atomic_spin_lock_irq(&cpuctx->ctx.lock); mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, perf_max_counters - perf_reserved_percpu); cpuctx->max_pertask = mpt; - spin_unlock_irq(&cpuctx->ctx.lock); + atomic_spin_unlock_irq(&cpuctx->ctx.lock); } spin_unlock(&perf_resource_lock); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e33a21c..ca750c7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -279,7 +279,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) struct task_cputime sum; unsigned long flags; - spin_lock_irqsave(&cputimer->lock, flags); + atomic_spin_lock_irqsave(&cputimer->lock, flags); if (!cputimer->running) { cputimer->running = 1; /* @@ -292,7 +292,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) update_gt_cputime(&cputimer->cputime, &sum); } *times = cputimer->cputime; - spin_unlock_irqrestore(&cputimer->lock, flags); + atomic_spin_unlock_irqrestore(&cputimer->lock, flags); } /* @@ -559,7 +559,7 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) p->cpu_timers : p->signal->cpu_timers); head += CPUCLOCK_WHICH(timer->it_clock); - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); spin_lock(&p->sighand->siglock); listpos = head; @@ -747,7 +747,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, /* * Disarm any old timer after extracting its expiry time. 
*/ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); ret = 0; spin_lock(&p->sighand->siglock); @@ -1066,9 +1066,9 @@ static void stop_process_timers(struct task_struct *tsk) if (!cputimer->running) return; - spin_lock_irqsave(&cputimer->lock, flags); + atomic_spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; - spin_unlock_irqrestore(&cputimer->lock, flags); + atomic_spin_unlock_irqrestore(&cputimer->lock, flags); } /* @@ -1381,12 +1381,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk) * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. */ -void run_posix_cpu_timers(struct task_struct *tsk) +void __run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; - BUG_ON(!irqs_disabled()); /* * The fast path checks that there are no expired thread or thread @@ -1438,6 +1437,177 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } +#include <linux/kthread.h> +#include <linux/cpu.h> +DEFINE_PER_CPU(struct task_struct *, posix_timer_task); +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); + +static int posix_cpu_timers_thread(void *data) +{ + int cpu = (long)data; + + BUG_ON(per_cpu(posix_timer_task,cpu) != current); + + while (!kthread_should_stop()) { + struct task_struct *tsk = NULL; + struct task_struct *next = NULL; + + if (cpu_is_offline(cpu)) + goto wait_to_die; + + /* grab task list */ + raw_local_irq_disable(); + tsk = per_cpu(posix_timer_tasklist, cpu); + per_cpu(posix_timer_tasklist, cpu) = NULL; + raw_local_irq_enable(); + + /* its possible the list is empty, just return */ + if (!tsk) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + continue; + } + + /* Process task list */ + while (1) { + /* save next */ + next = tsk->posix_timer_list; + + /* run the task timers, clear its ptr and + * unreference it + */ + __run_posix_cpu_timers(tsk); + tsk->posix_timer_list = NULL; + 
put_task_struct(tsk); + + /* check if this is the last on the list */ + if (next == tsk) + break; + tsk = next; + } + } + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static inline int __fastpath_timer_check(struct task_struct *tsk) +{ + /* tsk == current, ensure it is safe to use ->signal/sighand */ + if (unlikely(tsk->exit_state)) + return 0; + + if (!task_cputime_zero(&tsk->cputime_expires)) + return 1; + + if (!task_cputime_zero(&tsk->signal->cputime_expires)) + return 1; + + return 0; +} + +void run_posix_cpu_timers(struct task_struct *tsk) +{ + unsigned long cpu = smp_processor_id(); + struct task_struct *tasklist; + + BUG_ON(!irqs_disabled()); + if(!per_cpu(posix_timer_task, cpu)) + return; + /* get per-cpu references */ + tasklist = per_cpu(posix_timer_tasklist, cpu); + + /* check to see if we're already queued */ + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) { + get_task_struct(tsk); + if (tasklist) { + tsk->posix_timer_list = tasklist; + } else { + /* + * The list is terminated by a self-pointing + * task_struct + */ + tsk->posix_timer_list = tsk; + } + per_cpu(posix_timer_tasklist, cpu) = tsk; + + wake_up_process(per_cpu(posix_timer_task, cpu)); + } +} + +/* + * posix_cpu_thread_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. 
+ */ +static int posix_cpu_thread_call(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct sched_param param; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(posix_cpu_timers_thread, hcpu, + "posixcputmr/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio to avoid getting starved */ + param.sched_priority = MAX_RT_PRIO-1; + sched_setscheduler(p, SCHED_FIFO, &param); + per_cpu(posix_timer_task,cpu) = p; + break; + case CPU_ONLINE: + /* Strictly unnecessary, as first user will wake it. */ + wake_up_process(per_cpu(posix_timer_task,cpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. */ + kthread_bind(per_cpu(posix_timer_task,cpu), + any_online_cpu(cpu_online_map)); + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; + case CPU_DEAD: + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block __devinitdata posix_cpu_thread_notifier = { + .notifier_call = posix_cpu_thread_call, + .priority = 10 +}; + +static int __init posix_cpu_thread_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, cpu); + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&posix_cpu_thread_notifier); + return 0; +} +early_initcall(posix_cpu_thread_init); + /* * Set one of the process-wide special case CPU timers. * The tsk->sighand->siglock must be held by the caller. 
@@ -1703,6 +1873,12 @@ static __init int init_posix_cpu_timers(void) .nsleep = thread_cpu_nsleep, .nsleep_restart = thread_cpu_nsleep_restart, }; + unsigned long cpu; + + /* init the per-cpu posix_timer_tasklets */ + for_each_cpu_mask(cpu, cpu_possible_map) { + per_cpu(posix_timer_tasklist, cpu) = NULL; + } register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d089d05..2817dd3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -427,6 +427,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; + int sig = event->sigev_signo; if ((event->sigev_notify & SIGEV_THREAD_ID ) && (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || @@ -435,7 +436,8 @@ static struct pid *good_sigevent(sigevent_t * event) return NULL; if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) + (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) || + sig_kernel_coredump(sig))) return NULL; return task_pid(rtn); @@ -794,6 +796,7 @@ retry: unlock_timer(timr, flag); if (error == TIMER_RETRY) { + hrtimer_wait_for_timer(&timr->it.real.timer); rtn = NULL; // We already got the old time... 
goto retry; } @@ -832,6 +835,7 @@ retry_delete: if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + hrtimer_wait_for_timer(&timer->it.real.timer); goto retry_delete; } @@ -861,6 +865,7 @@ retry_delete: if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + hrtimer_wait_for_timer(&timer->it.real.timer); goto retry_delete; } list_del(&timer->list); diff --git a/kernel/printk.c b/kernel/printk.c index b4d97b5..1fa5c42 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -33,6 +33,7 @@ #include <linux/bootmem.h> #include <linux/syscalls.h> #include <linux/kexec.h> +#include <linux/semaphore.h> #include <asm/uaccess.h> @@ -73,7 +74,7 @@ EXPORT_SYMBOL(oops_in_progress); * provides serialisation for access to the entire console * driver system. */ -static DECLARE_MUTEX(console_sem); +static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); @@ -92,7 +93,7 @@ static int console_locked, console_suspended; * It is also used in interesting ways to provide interlocking in * release_console_sem(). 
*/ -static DEFINE_SPINLOCK(logbuf_lock); +static DEFINE_ATOMIC_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -171,7 +172,7 @@ static int __init log_buf_len_setup(char *str) goto out; } - spin_lock_irqsave(&logbuf_lock, flags); + atomic_spin_lock_irqsave(&logbuf_lock, flags); log_buf_len = size; log_buf = new_log_buf; @@ -185,7 +186,7 @@ static int __init log_buf_len_setup(char *str) log_start -= offset; con_start -= offset; log_end -= offset; - spin_unlock_irqrestore(&logbuf_lock, flags); + atomic_spin_unlock_irqrestore(&logbuf_lock, flags); printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); } @@ -297,18 +298,18 @@ int do_syslog(int type, char __user *buf, int len) if (error) goto out; i = 0; - spin_lock_irq(&logbuf_lock); + atomic_spin_lock_irq(&logbuf_lock); while (!error && (log_start != log_end) && i < len) { c = LOG_BUF(log_start); log_start++; - spin_unlock_irq(&logbuf_lock); + atomic_spin_unlock_irq(&logbuf_lock); error = __put_user(c,buf); buf++; i++; cond_resched(); - spin_lock_irq(&logbuf_lock); + atomic_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + atomic_spin_unlock_irq(&logbuf_lock); if (!error) error = i; break; @@ -329,7 +330,7 @@ int do_syslog(int type, char __user *buf, int len) count = len; if (count > log_buf_len) count = log_buf_len; - spin_lock_irq(&logbuf_lock); + atomic_spin_lock_irq(&logbuf_lock); if (count > logged_chars) count = logged_chars; if (do_clear) @@ -346,12 +347,12 @@ int do_syslog(int type, char __user *buf, int len) if (j + log_buf_len < log_end) break; c = LOG_BUF(j); - spin_unlock_irq(&logbuf_lock); + atomic_spin_unlock_irq(&logbuf_lock); error = __put_user(c,&buf[count-1-i]); cond_resched(); - spin_lock_irq(&logbuf_lock); + atomic_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + atomic_spin_unlock_irq(&logbuf_lock); if (error) break; error = i; @@ -414,9 +415,13 @@ static void __call_console_drivers(unsigned start, 
unsigned end) for (con = console_drivers; con; con = con->next) { if ((con->flags & CON_ENABLED) && con->write && - (cpu_online(smp_processor_id()) || - (con->flags & CON_ANYTIME))) + console_atomic_safe(con) && + (cpu_online(raw_smp_processor_id()) || + (con->flags & CON_ANYTIME))) { + set_printk_might_sleep(1); con->write(con, &LOG_BUF(start), end - start); + set_printk_might_sleep(0); + } } } @@ -527,9 +532,10 @@ static void zap_locks(void) oops_timestamp = jiffies; /* If a crash is occurring, make sure we can't deadlock */ - spin_lock_init(&logbuf_lock); + atomic_spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ - init_MUTEX(&console_sem); + semaphore_init(&console_sem); + zap_rt_locks(); } #if defined(CONFIG_PRINTK_TIME) @@ -611,7 +617,8 @@ static inline int can_use_console(unsigned int cpu) * interrupts disabled. It should return with 'lockbuf_lock' * released but interrupts still disabled. */ -static int acquire_console_semaphore_for_printk(unsigned int cpu) +static int acquire_console_semaphore_for_printk(unsigned int cpu, + unsigned long flags) { int retval = 0; @@ -631,7 +638,9 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu) } } printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); + atomic_spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); return retval; } static const char recursion_bug_msg [] = @@ -653,7 +662,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) preempt_disable(); /* This stops the holder of console_sem just where we want him */ raw_local_irq_save(flags); - this_cpu = smp_processor_id(); + this_cpu = raw_smp_processor_id(); /* * Ouch, printk recursed into itself! 
@@ -668,14 +677,16 @@ asmlinkage int vprintk(const char *fmt, va_list args) */ if (!oops_in_progress) { recursion_bug = 1; - goto out_restore_irqs; + raw_local_irq_restore(flags); + goto out; } zap_locks(); } lockdep_off(); - spin_lock(&logbuf_lock); + atomic_spin_lock(&logbuf_lock); printk_cpu = this_cpu; + preempt_enable(); if (recursion_bug) { recursion_bug = 0; @@ -760,14 +771,10 @@ asmlinkage int vprintk(const char *fmt, va_list args) * will release 'logbuf_lock' regardless of whether it * actually gets the semaphore or not. */ - if (acquire_console_semaphore_for_printk(this_cpu)) + if (acquire_console_semaphore_for_printk(this_cpu, flags)) release_console_sem(); - lockdep_on(); -out_restore_irqs: - raw_local_irq_restore(flags); - - preempt_enable(); +out: return printed_len; } EXPORT_SYMBOL(printk); @@ -1023,22 +1030,43 @@ void release_console_sem(void) console_may_schedule = 0; for ( ; ; ) { - spin_lock_irqsave(&logbuf_lock, flags); + atomic_spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; if (con_start == log_end) break; /* Nothing to print */ _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ - spin_unlock(&logbuf_lock); + + /* + * on PREEMPT_RT, call console drivers with + * interrupts enabled (if printk was called + * with interrupts disabled): + */ +#ifdef CONFIG_PREEMPT_RT + atomic_spin_unlock_irqrestore(&logbuf_lock, flags); +#else + atomic_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ +#endif call_console_drivers(_con_start, _log_end); start_critical_timings(); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } console_locked = 0; + atomic_spin_unlock_irqrestore(&logbuf_lock, flags); up(&console_sem); - spin_unlock_irqrestore(&logbuf_lock, flags); + /* + * On PREEMPT_RT kernels __wake_up may sleep, so wake syslogd + * up only if we are in a preemptible section. 
We normally dont + * printk from non-preemptible sections so this is for the emergency + * case only. + */ +#ifdef CONFIG_PREEMPT_RT + if (!in_atomic() && !irqs_disabled()) +#endif if (wake_klogd) wake_up_klogd(); } @@ -1240,9 +1268,9 @@ void register_console(struct console *console) * release_console_sem() will print out the buffered messages * for us. */ - spin_lock_irqsave(&logbuf_lock, flags); + atomic_spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; - spin_unlock_irqrestore(&logbuf_lock, flags); + atomic_spin_unlock_irqrestore(&logbuf_lock, flags); } release_console_sem(); } @@ -1314,6 +1342,23 @@ int printk_ratelimit(void) } EXPORT_SYMBOL(printk_ratelimit); +static DEFINE_ATOMIC_SPINLOCK(warn_lock); + +void __WARN_ON(const char *func, const char *file, const int line) +{ + unsigned long flags; + + atomic_spin_lock_irqsave(&warn_lock, flags); + printk("%s/%d[CPU#%d]: BUG in %s at %s:%d\n", + current->comm, current->pid, raw_smp_processor_id(), + func, file, line); + dump_stack(); + atomic_spin_unlock_irqrestore(&warn_lock, flags); +} + +EXPORT_SYMBOL(__WARN_ON); + + /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index beb0e65..e520176 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -71,7 +71,7 @@ */ #define GP_STAGES 2 struct rcu_data { - spinlock_t lock; /* Protect rcu_data fields. */ + atomic_spinlock_t lock; /* Protect rcu_data fields. */ long completed; /* Number of last completed batch. */ int waitlistcount; struct rcu_head *nextlist; @@ -138,7 +138,7 @@ enum rcu_sched_sleep_states { }; struct rcu_ctrlblk { - spinlock_t fliplock; /* Protect state-machine transitions. */ + atomic_spinlock_t fliplock; /* Protect state-machine transitions. */ long completed; /* Number of last completed batch. 
*/ enum rcu_try_flip_states rcu_try_flip_state; /* The current state of the rcu state machine */ @@ -193,7 +193,7 @@ void rcu_exit_nohz(void) static DEFINE_PER_CPU(struct rcu_data, rcu_data); static struct rcu_ctrlblk rcu_ctrlblk = { - .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), + .fliplock = __ATOMIC_SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), .completed = 0, .rcu_try_flip_state = rcu_try_flip_idle_state, .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock), @@ -910,7 +910,7 @@ static void rcu_try_flip(void) unsigned long flags; RCU_TRACE_ME(rcupreempt_trace_try_flip_1); - if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { + if (unlikely(!atomic_spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); return; } @@ -941,7 +941,7 @@ static void rcu_try_flip(void) rcu_ctrlblk.rcu_try_flip_state = rcu_try_flip_idle_state; } - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); + atomic_spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); } /* @@ -986,13 +986,13 @@ void rcu_check_callbacks(int cpu, int user) rcu_check_mb(cpu); if (rcu_ctrlblk.completed == rdp->completed) rcu_try_flip(); - spin_lock_irqsave(&rdp->lock, flags); + atomic_spin_lock_irqsave(&rdp->lock, flags); RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); __rcu_advance_callbacks(rdp); if (rdp->donelist == NULL) { - spin_unlock_irqrestore(&rdp->lock, flags); + atomic_spin_unlock_irqrestore(&rdp->lock, flags); } else { - spin_unlock_irqrestore(&rdp->lock, flags); + atomic_spin_unlock_irqrestore(&rdp->lock, flags); raise_softirq(RCU_SOFTIRQ); } } @@ -1011,10 +1011,10 @@ void rcu_advance_callbacks(int cpu, int user) if (rcu_ctrlblk.completed == rdp->completed) return; } - spin_lock_irqsave(&rdp->lock, flags); + atomic_spin_lock_irqsave(&rdp->lock, flags); RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); __rcu_advance_callbacks(rdp); - spin_unlock_irqrestore(&rdp->lock, flags); + atomic_spin_unlock_irqrestore(&rdp->lock, 
flags); } #ifdef CONFIG_HOTPLUG_CPU @@ -1042,7 +1042,7 @@ void rcu_offline_cpu(int cpu) * Otherwise rcu_barrier() will fail */ - spin_lock_irqsave(&rdp->lock, flags); + atomic_spin_lock_irqsave(&rdp->lock, flags); rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail); for (i = GP_STAGES - 1; i >= 0; i--) rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], @@ -1053,12 +1053,12 @@ void rcu_offline_cpu(int cpu) rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail, schedlist, schedtail); rdp->rcu_sched_sleeping = 0; - spin_unlock_irqrestore(&rdp->lock, flags); + atomic_spin_unlock_irqrestore(&rdp->lock, flags); rdp->waitlistcount = 0; /* Disengage the newly dead CPU from the grace-period computation. */ - spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); + atomic_spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); rcu_check_mb(cpu); if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { smp_mb(); /* Subsequent counter accesses must see new value */ @@ -1075,7 +1075,7 @@ void rcu_offline_cpu(int cpu) cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map)); - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); + atomic_spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); /* * Place the removed callbacks on the current CPU's queue. @@ -1089,14 +1089,14 @@ void rcu_offline_cpu(int cpu) local_irq_save(flags); /* disable preempt till we know what lock. 
*/ rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); + atomic_spin_lock(&rdp->lock); *rdp->nexttail = list; if (list) rdp->nexttail = tail; *rdp->nextschedtail = schedlist; if (schedlist) rdp->nextschedtail = schedtail; - spin_unlock_irqrestore(&rdp->lock, flags); + atomic_spin_unlock_irqrestore(&rdp->lock, flags); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -1112,9 +1112,9 @@ void __cpuinit rcu_online_cpu(int cpu) unsigned long flags; struct rcu_data *rdp; - spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); + atomic_spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map)); - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); + atomic_spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); /* * The rcu_sched grace-period processing might have bypassed @@ -1126,9 +1126,9 @@ void __cpuinit rcu_online_cpu(int cpu) */ rdp = RCU_DATA_CPU(cpu); - spin_lock_irqsave(&rdp->lock, flags); + atomic_spin_lock_irqsave(&rdp->lock, flags); rdp->rcu_sched_sleeping = 1; - spin_unlock_irqrestore(&rdp->lock, flags); + atomic_spin_unlock_irqrestore(&rdp->lock, flags); } static void rcu_process_callbacks(struct softirq_action *unused) @@ -1139,16 +1139,16 @@ static void rcu_process_callbacks(struct softirq_action *unused) local_irq_save(flags); rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); + atomic_spin_lock(&rdp->lock); list = rdp->donelist; if (list == NULL) { - spin_unlock_irqrestore(&rdp->lock, flags); + atomic_spin_unlock_irqrestore(&rdp->lock, flags); return; } rdp->donelist = NULL; rdp->donetail = &rdp->donelist; RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp); - spin_unlock_irqrestore(&rdp->lock, flags); + atomic_spin_unlock_irqrestore(&rdp->lock, flags); while (list) { next = list->next; list->func(list); @@ -1166,12 +1166,12 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) head->next = NULL; local_irq_save(flags); rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); + atomic_spin_lock(&rdp->lock); 
__rcu_advance_callbacks(rdp); *rdp->nexttail = head; rdp->nexttail = &head->next; RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); - spin_unlock_irqrestore(&rdp->lock, flags); + atomic_spin_unlock_irqrestore(&rdp->lock, flags); } EXPORT_SYMBOL_GPL(call_rcu); @@ -1185,7 +1185,7 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) head->next = NULL; local_irq_save(flags); rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); + atomic_spin_lock(&rdp->lock); *rdp->nextschedtail = head; rdp->nextschedtail = &head->next; if (rdp->rcu_sched_sleeping) { @@ -1195,7 +1195,7 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) rdp->rcu_sched_sleeping = 0; wake_gp = 1; } - spin_unlock_irqrestore(&rdp->lock, flags); + atomic_spin_unlock_irqrestore(&rdp->lock, flags); if (wake_gp) { /* Wake up grace-period processing, unless someone beat us. */ @@ -1291,7 +1291,7 @@ static int rcu_sched_grace_period(void *arg) for_each_online_cpu(cpu) { rdp = RCU_DATA_CPU(cpu); - spin_lock_irqsave(&rdp->lock, flags); + atomic_spin_lock_irqsave(&rdp->lock, flags); /* * We are running on this CPU irq-disabled, so no @@ -1330,7 +1330,7 @@ static int rcu_sched_grace_period(void *arg) rdp->rcu_sched_sleeping = couldsleep; - spin_unlock_irqrestore(&rdp->lock, flags); + atomic_spin_unlock_irqrestore(&rdp->lock, flags); } /* If we saw callbacks on the last scan, go deal with them. 
*/ @@ -1452,7 +1452,7 @@ void __init __rcu_init(void) printk(KERN_NOTICE "Preemptible RCU implementation.\n"); for_each_possible_cpu(cpu) { rdp = RCU_DATA_CPU(cpu); - spin_lock_init(&rdp->lock); + atomic_spin_lock_init(&rdp->lock); rdp->completed = 0; rdp->waitlistcount = 0; rdp->nextlist = NULL; diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9b4a975..7a4e912 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -752,7 +752,7 @@ rcu_torture_reader(void *arg) if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); - schedule_timeout_interruptible(HZ); + schedule_timeout_interruptible(round_jiffies_relative(HZ)); continue; } if (p->rtort_mbtest == 0) diff --git a/kernel/relay.c b/kernel/relay.c index bc18854..05fd6d5 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -343,6 +343,10 @@ static void wakeup_readers(unsigned long data) { struct rchan_buf *buf = (struct rchan_buf *)data; wake_up_interruptible(&buf->read_wait); + /* + * Stupid polling for now: + */ + mod_timer(&buf->timer, jiffies + 1); } /** @@ -360,6 +364,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); + mod_timer(&buf->timer, jiffies + 1); } else del_timer_sync(&buf->timer); @@ -740,15 +745,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) else buf->early_bytes += buf->chan->subbuf_size - buf->padding[old_subbuf]; - smp_mb(); - if (waitqueue_active(&buf->read_wait)) - /* - * Calling wake_up_interruptible() from here - * will deadlock if we happen to be logging - * from the scheduler (trying to re-grab - * rq->lock), so defer it. 
- */ - mod_timer(&buf->timer, jiffies + 1); } old = buf->data; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index e1338f0..988a919 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -14,6 +14,7 @@ #include <linux/res_counter.h> #include <linux/uaccess.h> #include <linux/mm.h> +#include <linux/interrupt.h> void res_counter_init(struct res_counter *counter, struct res_counter *parent) { @@ -43,7 +44,7 @@ int res_counter_charge(struct res_counter *counter, unsigned long val, struct res_counter *c, *u; *limit_fail_at = NULL; - local_irq_save(flags); + local_irq_save_nort(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); ret = res_counter_charge_locked(c, val); @@ -62,7 +63,7 @@ undo: spin_unlock(&u->lock); } done: - local_irq_restore(flags); + local_irq_restore_nort(flags); return ret; } @@ -79,13 +80,13 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val) unsigned long flags; struct res_counter *c; - local_irq_save(flags); + local_irq_save_nort(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); res_counter_uncharge_locked(c, val); spin_unlock(&c->lock); } - local_irq_restore(flags); + local_irq_restore_nort(flags); } diff --git a/kernel/rt.c b/kernel/rt.c new file mode 100644 index 0000000..fd033a9 --- /dev/null +++ b/kernel/rt.c @@ -0,0 +1,566 @@ +/* + * kernel/rt.c + * + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * historic credit for proving that Linux spinlocks can be implemented via + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow + * and others) who prototyped it on 2.4 and did lots of comparative + * research and analysis; TimeSys, for proving that you can implement a + * fully preemptible kernel via the use of IRQ threading and mutexes; + * Bill Huey for 
persuasively arguing on lkml that the mutex model is the + * right one; and to MontaVista, who ported pmutexes to 2.6. + * + * This code is a from-scratch implementation and is not based on pmutexes, + * but the idea of converting spinlocks to mutexes is used here too. + * + * lock debugging, locking tree, deadlock detection: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + * + * Includes portions of the generic R/W semaphore implementation from: + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> + * - Derived also from comments by Linus + * + * Pending ownership of locks and ownership stealing: + * + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt + * + * (also by Steven Rostedt) + * - Converted single pi_lock to individual task locks. + * + * By Esben Nielsen: + * Doing priority inheritance with help of the scheduler. + * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * - major rework based on Esben Nielsens initial patch + * - replaced thread_info references by task_struct refs + * - removed task->pending_owner dependency + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks + * in the scheduler return path as discussed with Steven Rostedt + * + * Copyright (C) 2006, Kihon Technologies Inc. + * Steven Rostedt <rostedt@goodmis.org> + * - debugged and patched Thomas Gleixner's rework. + * - added back the cmpxchg to the rework. + * - turned atomic require back on for SMP. 
+ */ + +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/kallsyms.h> +#include <linux/syscalls.h> +#include <linux/interrupt.h> +#include <linux/plist.h> +#include <linux/fs.h> +#include <linux/futex.h> +#include <linux/hrtimer.h> + +#include "rtmutex_common.h" + +#ifdef CONFIG_PREEMPT_RT +/* + * Unlock these on crash: + */ +void zap_rt_locks(void) +{ + //trace_lock_init(); +} +#endif + +/* + * struct mutex functions + */ +void __mutex_init(struct mutex *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&lock->lock, name); +} +EXPORT_SYMBOL(__mutex_init); + +void __lockfunc _mutex_lock(struct mutex *lock) +{ + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock); + +int __lockfunc _mutex_lock_interruptible(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_lock_interruptible(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible); + +int __lockfunc _mutex_lock_killable(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_lock_killable(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) +{ + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock_nested); + +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) +{ + int ret; + + 
mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = rt_mutex_lock_interruptible(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible_nested); + +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = rt_mutex_lock_killable(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable_nested); +#endif + +int __lockfunc _mutex_trylock(struct mutex *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(_mutex_trylock); + +void __lockfunc _mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_unlock); + +/* + * rwlock_t functions + */ +int __lockfunc rt_write_trylock(rwlock_t *rwlock) +{ + int ret = rt_mutex_trylock(&rwlock->lock); + + if (ret) + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) +{ + *flags = 0; + return rt_write_trylock(rwlock); +} +EXPORT_SYMBOL(rt_write_trylock_irqsave); + +int __lockfunc rt_read_trylock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + int ret = 1; + + /* + * recursive read locks succeed when current owns the lock, + * but not when read_depth == 0 which means that the lock is + * write locked. 
+ */ + if (rt_mutex_real_owner(lock) != current) + ret = rt_mutex_trylock(lock); + else if (!rwlock->read_depth) + ret = 0; + + if (ret) { + rwlock->read_depth++; + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + } + + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +void __lockfunc rt_write_lock(rwlock_t *rwlock) +{ + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + __rt_spin_lock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_lock); + +void __lockfunc rt_read_lock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + + /* + * recursive read locks succeed when current owns the lock + */ + if (rt_mutex_real_owner(lock) != current) + __rt_spin_lock(lock); + rwlock->read_depth++; +} + +EXPORT_SYMBOL(rt_read_lock); + +void __lockfunc rt_write_unlock(rwlock_t *rwlock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_unlock); + +void __lockfunc rt_read_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + + /* Release the lock only when read_depth is down to 0 */ + if (--rwlock->read_depth == 0) + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_read_unlock); + +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock) +{ + rt_write_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_write_lock_irqsave); + +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock) +{ + rt_read_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_read_lock_irqsave); + +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); + lockdep_init_map(&rwlock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&rwlock->lock, name); + rwlock->read_depth = 0; +} 
+EXPORT_SYMBOL(__rt_rwlock_init); + +/* + * rw_semaphores + */ + +void rt_up_write(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_write); + +void rt_up_read(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + if (--rwsem->read_depth == 0) + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_read); + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void rt_downgrade_write(struct rw_semaphore *rwsem) +{ + BUG_ON(rt_mutex_real_owner(&rwsem->lock) != current); + rwsem->read_depth = 1; +} +EXPORT_SYMBOL(rt_downgrade_write); + +int rt_down_write_trylock(struct rw_semaphore *rwsem) +{ + int ret = rt_mutex_trylock(&rwsem->lock); + + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(rt_down_write_trylock); + +void rt_down_write(struct rw_semaphore *rwsem) +{ + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write); + +void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) +{ + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write_nested); + +int rt_down_read_trylock(struct rw_semaphore *rwsem) +{ + struct rt_mutex *lock = &rwsem->lock; + int ret = 1; + + /* + * recursive read locks succeed when current owns the rwsem, + * but not when read_depth == 0 which means that the rwsem is + * write locked. 
+ */ + if (rt_mutex_real_owner(lock) != current) + ret = rt_mutex_trylock(&rwsem->lock); + else if (!rwsem->read_depth) + ret = 0; + + if (ret) { + rwsem->read_depth++; + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + } + return ret; +} +EXPORT_SYMBOL(rt_down_read_trylock); + +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) +{ + struct rt_mutex *lock = &rwsem->lock; + + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); + + if (rt_mutex_real_owner(lock) != current) + rt_mutex_lock(&rwsem->lock); + rwsem->read_depth++; +} + +void rt_down_read(struct rw_semaphore *rwsem) +{ + __rt_down_read(rwsem, 0); +} +EXPORT_SYMBOL(rt_down_read); + +void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass) +{ + __rt_down_read(rwsem, subclass); +} +EXPORT_SYMBOL(rt_down_read_nested); + +void __rt_rwsem_init(struct rw_semaphore *rwsem, char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem)); + lockdep_init_map(&rwsem->dep_map, name, key, 0); +#endif + __rt_mutex_init(&rwsem->lock, name); + rwsem->read_depth = 0; +} +EXPORT_SYMBOL(__rt_rwsem_init); + +/* + * Semaphores + */ +/* + * Linux Semaphores implemented via RT-mutexes. + * + * In the down() variants we use the mutex as the semaphore blocking + * object: we always acquire it, decrease the counter and keep the lock + * locked if we did the 1->0 transition. The next down() will then block. + * + * In the up() path we atomically increase the counter and do the + * unlock if we were the one doing the 0->1 transition. 
+ */ + +static inline void __down_complete(struct semaphore *sem) +{ + int count = atomic_dec_return(&sem->count); + + if (unlikely(count > 0)) + rt_mutex_unlock(&sem->lock); +} + +void rt_down(struct semaphore *sem) +{ + rt_mutex_lock(&sem->lock); + __down_complete(sem); +} +EXPORT_SYMBOL(rt_down); + +int rt_down_interruptible(struct semaphore *sem) +{ + int ret; + + ret = rt_mutex_lock_interruptible(&sem->lock, 0); + if (ret) + return ret; + __down_complete(sem); + return 0; +} +EXPORT_SYMBOL(rt_down_interruptible); + +int rt_down_timeout(struct semaphore *sem, long jiff) +{ + struct hrtimer_sleeper t; + struct timespec ts; + unsigned long expires = jiffies + jiff + 1; + int ret; + + /* + * rt_mutex_slowlock can use an interruptible, but this needs to + * be TASK_INTERRUPTIBLE. The down_timeout uses TASK_UNINTERRUPTIBLE. + * To handle this we loop if a signal caused the timeout and then + * we recalculate the new timeout. + * Yes Thomas, this is a hack! But we can fix it right later. + */ + do { + jiffies_to_timespec(jiff, &ts); + hrtimer_init_on_stack(&t.timer, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + t.timer._expires = timespec_to_ktime(ts); + + ret = rt_mutex_timed_lock(&sem->lock, &t, 0); + if (ret != -EINTR) + break; + + /* signal occurred, but the down_timeout doesn't handle them */ + jiff = expires - jiffies; + + } while (jiff > 0); + + if (!ret) + __down_complete(sem); + else + ret = -ETIME; + + return ret; +} +EXPORT_SYMBOL(rt_down_timeout); + +/* + * try to down the semaphore, 0 on success and 1 on failure. (inverted) + */ +int rt_down_trylock(struct semaphore *sem) +{ + /* + * Here we are a tiny bit different from ordinary Linux semaphores, + * because we can get 'transient' locking-failures when say a + * process decreases the count from 9 to 8 and locks/releases the + * embedded mutex internally. 
It would be quite complex to remove + * these transient failures so lets try it the simple way first: + */ + if (rt_mutex_trylock(&sem->lock)) { + __down_complete(sem); + return 0; + } + return 1; +} +EXPORT_SYMBOL(rt_down_trylock); + +void rt_up(struct semaphore *sem) +{ + int count; + + /* + * Disable preemption to make sure a highprio trylock-er cannot + * preempt us here and get into an infinite loop: + */ + preempt_disable(); + count = atomic_inc_return(&sem->count); + /* + * If we did the 0 -> 1 transition then we are the ones to unlock it: + */ + if (likely(count == 1)) + rt_mutex_unlock(&sem->lock); + preempt_enable(); +} +EXPORT_SYMBOL(rt_up); + +void __sema_init(struct semaphore *sem, int val, + char *name, char *file, int line) +{ + atomic_set(&sem->count, val); + switch (val) { + case 0: + __rt_mutex_init(&sem->lock, name); + rt_mutex_lock(&sem->lock); + break; + default: + __rt_mutex_init(&sem->lock, name); + break; + } +} +EXPORT_SYMBOL(__sema_init); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 5fcb4fe..e7e6314 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -29,61 +29,6 @@ #include "rtmutex_common.h" -# define TRACE_WARN_ON(x) WARN_ON(x) -# define TRACE_BUG_ON(x) BUG_ON(x) - -# define TRACE_OFF() \ -do { \ - if (rt_trace_on) { \ - 
rt_trace_on = 0; \ - console_verbose(); \ - if (spin_is_locked(&current->pi_lock)) \ - spin_unlock(&current->pi_lock); \ - } \ -} while (0) - -# define TRACE_OFF_NOLOCK() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - } \ -} while (0) - -# define TRACE_BUG_LOCKED() \ -do { \ - TRACE_OFF(); \ - BUG(); \ -} while (0) - -# define TRACE_WARN_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) { \ - TRACE_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -# define TRACE_BUG_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) \ - TRACE_BUG_LOCKED(); \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) -#else -# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) -#endif - -/* - * deadlock detection flag. We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -static int rt_trace_on = 1; - static void printk_task(struct task_struct *p) { if (p) @@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - WARN_ON(!plist_head_empty(&task->pi_waiters)); - WARN_ON(task->pi_blocked_on); + DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } /* @@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, { struct task_struct *task; - if (!rt_trace_on || detect || !act_waiter) + if (!debug_locks || detect || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); @@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) { struct task_struct *task; - if (!waiter->deadlock_lock || !rt_trace_on) + if (!waiter->deadlock_lock || !debug_locks) return; rcu_read_lock(); @@ -149,7 +94,8 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) return; } - TRACE_OFF_NOLOCK(); + if (!debug_locks_off()) + return; 
printk("\n============================================\n"); printk( "[ BUG: circular locking deadlock detected! ]\n"); @@ -180,7 +126,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("[ turning off deadlock detection." "Please report this trace. ]\n\n"); - local_irq_disable(); } void debug_rt_mutex_lock(struct rt_mutex *lock) @@ -189,7 +134,8 @@ void debug_rt_mutex_lock(struct rt_mutex *lock) void debug_rt_mutex_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + if (debug_locks) + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } void @@ -199,7 +145,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); } void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) @@ -213,9 +159,9 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { put_pid(waiter->deadlock_task_pid); - TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); - TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - TRACE_WARN_ON(waiter->task); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + DEBUG_LOCKS_WARN_ON(waiter->task); memset(waiter, 0x22, sizeof(*waiter)); } @@ -231,9 +177,36 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) { +#ifdef CONFIG_DEBUG_PREEMPT + if (atomic_read(&task->lock_count) >= MAX_LOCK_STACK) { + if (!debug_locks_off()) + return; + printk("BUG: %s/%d: lock count overflow!\n", + task->comm, task->pid); + dump_stack(); + return; + } +#ifdef CONFIG_PREEMPT_RT + task->owned_lock[atomic_read(&task->lock_count)] = lock; +#endif + atomic_inc(&task->lock_count); 
+#endif } void rt_mutex_deadlock_account_unlock(struct task_struct *task) { +#ifdef CONFIG_DEBUG_PREEMPT + if (!atomic_read(&task->lock_count)) { + if (!debug_locks_off()) + return; + printk("BUG: %s/%d: lock count underflow!\n", + task->comm, task->pid); + dump_stack(); + return; + } + atomic_dec(&task->lock_count); +#ifdef CONFIG_PREEMPT_RT + task->owned_lock[atomic_read(&task->lock_count)] = NULL; +#endif +#endif } - diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h index 14193d5..b031c8a 100644 --- a/kernel/rtmutex-debug.h +++ b/kernel/rtmutex-debug.h @@ -17,17 +17,17 @@ extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner); +extern void +debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, struct rt_mutex *lock); extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); -# define debug_rt_mutex_reset_waiter(w) \ +# define debug_rt_mutex_reset_waiter(w) \ do { (w)->deadlock_lock = NULL; } while (0) -static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - int detect) +static inline int +debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, int detect) { - return (waiter != NULL); + return waiter != NULL; } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 29bd4ba..f66f98d 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -8,12 +8,20 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * + * Adaptive Spinlocks: + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, + * and 
Peter Morreale, + * Adaptive Spinlocks simplification: + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com> + * * See Documentation/rt-mutex-design.txt for details. */ #include <linux/spinlock.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/timer.h> +#include <linux/hardirq.h> +#include <linux/semaphore.h> #include "rtmutex_common.h" @@ -97,6 +105,22 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) } #endif +int pi_initialized; + +/* + * we initialize the wait_list runtime. (Could be done build-time and/or + * boot-time.) + */ +static inline void init_lists(struct rt_mutex *lock) +{ + if (unlikely(!lock->wait_list.prio_list.prev)) { + plist_head_init_atomic(&lock->wait_list, &lock->wait_lock); +#ifdef CONFIG_DEBUG_RT_MUTEXES + pi_initialized++; +#endif + } +} + /* * Calculate task priority from the waiter list priority * @@ -131,16 +155,16 @@ static void __rt_mutex_adjust_prio(struct task_struct *task) * * (Note: We do this outside of the protection of lock->wait_lock to * allow the lock to be taken while or before we readjust the priority - * of task. We do not use the spin_xx_mutex() variants here as we are + * of task. We do not use the atomic_spin_xx_mutex() variants here as we are * outside of the debug path.) */ static void rt_mutex_adjust_prio(struct task_struct *task) { unsigned long flags; - spin_lock_irqsave(&task->pi_lock, flags); + atomic_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); - spin_unlock_irqrestore(&task->pi_lock, flags); + atomic_spin_unlock_irqrestore(&task->pi_lock, flags); } /* @@ -195,7 +219,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * Task can not go away as we did a get_task() before ! 
*/ - spin_lock_irqsave(&task->pi_lock, flags); + atomic_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; /* @@ -231,8 +255,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto out_unlock_pi; lock = waiter->lock; - if (!spin_trylock(&lock->wait_lock)) { - spin_unlock_irqrestore(&task->pi_lock, flags); + if (!atomic_spin_trylock(&lock->wait_lock)) { + atomic_spin_unlock_irqrestore(&task->pi_lock, flags); cpu_relax(); goto retry; } @@ -240,7 +264,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Deadlock detection */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); - spin_unlock(&lock->wait_lock); + atomic_spin_unlock(&lock->wait_lock); ret = deadlock_detect ? -EDEADLK : 0; goto out_unlock_pi; } @@ -253,13 +277,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, plist_add(&waiter->list_entry, &lock->wait_list); /* Release the task */ - spin_unlock_irqrestore(&task->pi_lock, flags); + atomic_spin_unlock(&task->pi_lock); put_task_struct(task); /* Grab the next task */ task = rt_mutex_owner(lock); get_task_struct(task); - spin_lock_irqsave(&task->pi_lock, flags); + atomic_spin_lock(&task->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { /* Boost the owner */ @@ -277,10 +301,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } - spin_unlock_irqrestore(&task->pi_lock, flags); + atomic_spin_unlock(&task->pi_lock); top_waiter = rt_mutex_top_waiter(lock); - spin_unlock(&lock->wait_lock); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -288,7 +312,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto again; out_unlock_pi: - spin_unlock_irqrestore(&task->pi_lock, flags); + atomic_spin_unlock_irqrestore(&task->pi_lock, flags); out_put_task: put_task_struct(task); @@ -301,11 
+325,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * lock yet]: */ static inline int try_to_steal_lock(struct rt_mutex *lock, - struct task_struct *task) + struct task_struct *task, int mode) { struct task_struct *pendowner = rt_mutex_owner(lock); struct rt_mutex_waiter *next; - unsigned long flags; if (!rt_mutex_owner_pending(lock)) return 0; @@ -313,9 +336,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, if (pendowner == task) return 1; - spin_lock_irqsave(&pendowner->pi_lock, flags); - if (task->prio >= pendowner->prio) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + atomic_spin_lock(&pendowner->pi_lock); + if (!lock_is_stealable(task, pendowner, mode)) { + atomic_spin_unlock(&pendowner->pi_lock); return 0; } @@ -325,7 +348,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, * priority. */ if (likely(!rt_mutex_has_waiters(lock))) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + atomic_spin_unlock(&pendowner->pi_lock); return 1; } @@ -333,7 +356,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, next = rt_mutex_top_waiter(lock); plist_del(&next->pi_list_entry, &pendowner->pi_waiters); __rt_mutex_adjust_prio(pendowner); - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + atomic_spin_unlock(&pendowner->pi_lock); /* * We are going to steal the lock and a waiter was @@ -350,10 +373,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, * might be task: */ if (likely(next->task != task)) { - spin_lock_irqsave(&task->pi_lock, flags); + atomic_spin_lock(&task->pi_lock); plist_add(&next->pi_list_entry, &task->pi_waiters); __rt_mutex_adjust_prio(task); - spin_unlock_irqrestore(&task->pi_lock, flags); + atomic_spin_unlock(&task->pi_lock); } return 1; } @@ -367,7 +390,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, * * Must be called with lock->wait_lock held. 
*/ -static int try_to_take_rt_mutex(struct rt_mutex *lock) +static int do_try_to_take_rt_mutex(struct rt_mutex *lock, int mode) { /* * We have to be careful here if the atomic speedups are @@ -390,7 +413,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current, mode)) return 0; /* We got the lock. */ @@ -403,6 +426,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) return 1; } +static inline int try_to_take_rt_mutex(struct rt_mutex *lock) +{ + return do_try_to_take_rt_mutex(lock, STEAL_NORMAL); +} + /* * Task blocks on lock. * @@ -413,14 +441,13 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, - int detect_deadlock) + int detect_deadlock, unsigned long flags) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(&task->pi_lock, flags); + atomic_spin_lock(&task->pi_lock); __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -434,17 +461,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - spin_unlock_irqrestore(&task->pi_lock, flags); + atomic_spin_unlock(&task->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { - spin_lock_irqsave(&owner->pi_lock, flags); + atomic_spin_lock(&owner->pi_lock); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); + atomic_spin_unlock(&owner->pi_lock); } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) chain_walk = 1; @@ -459,12 +486,12 @@ static int 
task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, task); - spin_lock(&lock->wait_lock); + atomic_spin_lock_irq(&lock->wait_lock); return res; } @@ -477,13 +504,13 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * * Called with lock->wait_lock held. */ -static void wakeup_next_waiter(struct rt_mutex *lock) +static void wakeup_next_waiter(struct rt_mutex *lock, int savestate) { struct rt_mutex_waiter *waiter; struct task_struct *pendowner; - unsigned long flags; + struct rt_mutex_waiter *next; - spin_lock_irqsave(&current->pi_lock, flags); + atomic_spin_lock(&current->pi_lock); waiter = rt_mutex_top_waiter(lock); plist_del(&waiter->list_entry, &lock->wait_list); @@ -498,9 +525,44 @@ static void wakeup_next_waiter(struct rt_mutex *lock) pendowner = waiter->task; waiter->task = NULL; + /* + * Do the wakeup before the ownership change to give any spinning + * waiter grantees a headstart over the other threads that will + * trigger once owner changes. + */ + if (!savestate) + wake_up_process(pendowner); + else { + /* + * We can skip the actual (expensive) wakeup if the + * waiter is already running, but we have to be careful + * of race conditions because they may be about to sleep. + * + * The waiter-side protocol has the following pattern: + * 1: Set state != RUNNING + * 2: Conditionally sleep if waiter->task != NULL; + * + * And the owner-side has the following: + * A: Set waiter->task = NULL + * B: Conditionally wake if the state != RUNNING + * + * As long as we ensure 1->2 order, and A->B order, we + * will never miss a wakeup. + * + * Therefore, this barrier ensures that waiter->task = NULL + * is visible before we test the pendowner->state. The + * corresponding barrier is in the sleep logic. 
+ */ + smp_mb(); + + /* If !RUNNING && !RUNNING_MUTEX */ + if (pendowner->state & ~TASK_RUNNING_MUTEX) + wake_up_process_mutex(pendowner); + } + rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); - spin_unlock_irqrestore(&current->pi_lock, flags); + atomic_spin_unlock(&current->pi_lock); /* * Clear the pi_blocked_on variable and enqueue a possible @@ -509,7 +571,13 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * waiter with higher priority than pending-owner->normal_prio * is blocked on the unboosted (pending) owner. */ - spin_lock_irqsave(&pendowner->pi_lock, flags); + + if (rt_mutex_has_waiters(lock)) + next = rt_mutex_top_waiter(lock); + else + next = NULL; + + atomic_spin_lock(&pendowner->pi_lock); WARN_ON(!pendowner->pi_blocked_on); WARN_ON(pendowner->pi_blocked_on != waiter); @@ -517,15 +585,10 @@ static void wakeup_next_waiter(struct rt_mutex *lock) pendowner->pi_blocked_on = NULL; - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); + if (next) plist_add(&next->pi_list_entry, &pendowner->pi_waiters); - } - spin_unlock_irqrestore(&pendowner->pi_lock, flags); - wake_up_process(pendowner); + atomic_spin_unlock(&pendowner->pi_lock); } /* @@ -534,22 +597,22 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * Must be called with lock->wait_lock held */ static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + unsigned long flags) { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); - unsigned long flags; int chain_walk = 0; - spin_lock_irqsave(&current->pi_lock, flags); + atomic_spin_lock(&current->pi_lock); plist_del(&waiter->list_entry, &lock->wait_list); waiter->task = NULL; current->pi_blocked_on = NULL; - spin_unlock_irqrestore(&current->pi_lock, flags); + atomic_spin_unlock(&current->pi_lock); if (first && owner != current) { - spin_lock_irqsave(&owner->pi_lock, flags); + 
atomic_spin_lock(&owner->pi_lock); plist_del(&waiter->pi_list_entry, &owner->pi_waiters); @@ -564,7 +627,7 @@ static void remove_waiter(struct rt_mutex *lock, if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); + atomic_spin_unlock(&owner->pi_lock); } WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); @@ -575,11 +638,11 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); - spin_lock(&lock->wait_lock); + atomic_spin_lock_irq(&lock->wait_lock); } /* @@ -592,26 +655,399 @@ void rt_mutex_adjust_pi(struct task_struct *task) struct rt_mutex_waiter *waiter; unsigned long flags; - spin_lock_irqsave(&task->pi_lock, flags); + atomic_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; if (!waiter || waiter->list_entry.prio == task->prio) { - spin_unlock_irqrestore(&task->pi_lock, flags); + atomic_spin_unlock_irqrestore(&task->pi_lock, flags); return; } - spin_unlock_irqrestore(&task->pi_lock, flags); - /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); + atomic_spin_unlock_irqrestore(&task->pi_lock, flags); rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } +/* + * preemptible spin_lock functions: + */ + +#ifdef CONFIG_PREEMPT_RT + +static inline void +rt_spin_lock_fastlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + /* Temporary HACK! 
*/ + if (likely(!current->in_printk)) + might_sleep(); + else if (in_atomic() || irqs_disabled()) + /* don't grab locks for printk in atomic */ + return; + + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) + rt_mutex_deadlock_account_lock(lock, current); + else + slowfn(lock); +} + +static inline void +rt_spin_lock_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + /* Temporary HACK! */ + if (unlikely(rt_mutex_owner(lock) != current) && current->in_printk) + /* don't grab locks for printk in atomic */ + return; + + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + + +#ifdef CONFIG_SMP +static int adaptive_wait(struct rt_mutex_waiter *waiter, + struct task_struct *orig_owner) +{ + for (;;) { + + /* we are the owner? */ + if (!waiter->task) + return 0; + + /* Owner changed? Then lets update the original */ + if (orig_owner != rt_mutex_owner(waiter->lock)) + return 0; + + /* Owner went to bed, so should we */ + if (!task_is_current(orig_owner)) + return 1; + + cpu_relax(); + } +} +#else +static int adaptive_wait(struct rt_mutex_waiter *waiter, + struct task_struct *orig_owner) +{ + return 1; +} +#endif + +/* + * The state setting needs to preserve the original state and needs to + * take care of non rtmutex wakeups. + * + * Called with rtmutex->wait_lock held to serialize against rtmutex + * wakeups(). + */ +static inline unsigned long +rt_set_current_blocked_state(unsigned long saved_state) +{ + unsigned long state, block_state; + + /* + * If state is TASK_INTERRUPTIBLE, then we set the state for + * blocking to TASK_INTERRUPTIBLE as well, otherwise we would + * miss real wakeups via wake_up_interruptible(). If such a + * wakeup happens we see the running state and preserve it in + * saved_state. Now we can ignore further wakeups as we will + * return in state running from our "spin" sleep. 
+ */ + if (saved_state == TASK_INTERRUPTIBLE) + block_state = TASK_INTERRUPTIBLE; + else + block_state = TASK_UNINTERRUPTIBLE; + + state = xchg(&current->state, block_state); + /* + * Take care of non rtmutex wakeups. rtmutex wakeups + * or TASK_RUNNING_MUTEX to (UN)INTERRUPTIBLE. + */ + if (state == TASK_RUNNING) + saved_state = TASK_RUNNING; + + return saved_state; +} + +static inline void rt_restore_current_state(unsigned long saved_state) +{ + unsigned long state = xchg(&current->state, saved_state); + + if (state == TASK_RUNNING) + current->state = TASK_RUNNING; +} + +/* + * Slow path lock function spin_lock style: this variant is very + * careful not to miss any non-lock wakeups. + * + * The wakeup side uses wake_up_process_mutex, which, combined with + * the xchg code of this function is a transparent sleep/wakeup + * mechanism nested within any existing sleep/wakeup mechanism. This + * enables the seamless use of arbitrary (blocking) spinlocks within + * sleep/wakeup event loops. + */ +static void noinline __sched +rt_spin_lock_slowlock(struct rt_mutex *lock) +{ + struct rt_mutex_waiter waiter; + unsigned long saved_state, flags; + struct task_struct *orig_owner; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + atomic_spin_lock_irqsave(&lock->wait_lock, flags); + init_lists(lock); + + BUG_ON(rt_mutex_owner(lock) == current); + + /* + * Here we save whatever state the task was in originally, + * we'll restore it at the end of the function and we'll take + * any intermediate wakeup into account as well, independently + * of the lock sleep/wakeup mechanism. When we get a real + * wakeup the task->state is TASK_RUNNING and we change + * saved_state accordingly. If we did not get a real wakeup + * then we return with the saved state. 
We need to be careful + * about original state TASK_INTERRUPTIBLE as well, as we + * could miss a wake_up_interruptible() + */ + saved_state = rt_set_current_blocked_state(current->state); + + for (;;) { + int saved_lock_depth = current->lock_depth; + + /* Try to acquire the lock */ + if (do_try_to_take_rt_mutex(lock, STEAL_LATERAL)) + break; + + /* + * waiter.task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ + if (!waiter.task) { + task_blocks_on_rt_mutex(lock, &waiter, current, 0, + flags); + /* Wakeup during boost ? */ + if (unlikely(!waiter.task)) + continue; + } + + /* + * Prevent schedule() to drop BKL, while waiting for + * the lock ! We restore lock_depth when we come back. + */ + current->lock_depth = -1; + orig_owner = rt_mutex_owner(lock); + get_task_struct(orig_owner); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); + + debug_rt_mutex_print_deadlock(&waiter); + + if (adaptive_wait(&waiter, orig_owner)) { + put_task_struct(orig_owner); + + if (waiter.task) + schedule_rt_mutex(lock); + } else + put_task_struct(orig_owner); + + atomic_spin_lock_irqsave(&lock->wait_lock, flags); + current->lock_depth = saved_lock_depth; + saved_state = rt_set_current_blocked_state(saved_state); + } + + rt_restore_current_state(saved_state); + + /* + * Extremely rare case, if we got woken up by a non-mutex wakeup, + * and we managed to steal the lock despite us not being the + * highest-prio waiter (due to SCHED_OTHER changing prio), then we + * can end up with a non-NULL waiter.task: + */ + if (unlikely(waiter.task)) + remove_waiter(lock, &waiter, flags); + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. 
We might have to fix that up: + */ + fixup_rt_mutex_waiters(lock); + + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); + + debug_rt_mutex_free_waiter(&waiter); +} + +/* + * Slow path to release a rt_mutex spin_lock style + */ +static void noinline __sched +rt_spin_lock_slowunlock(struct rt_mutex *lock) +{ + unsigned long flags; + + atomic_spin_lock_irqsave(&lock->wait_lock, flags); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); + return; + } + + wakeup_next_waiter(lock, 1); + + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); + + /* Undo pi boosting when necessary */ + rt_mutex_adjust_prio(current); +} + +void __lockfunc rt_spin_lock(spinlock_t *lock) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock); + +void __lockfunc __rt_spin_lock(struct rt_mutex *lock) +{ + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); +} +EXPORT_SYMBOL(__rt_spin_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock_nested); + +#endif + +void __lockfunc rt_spin_unlock(spinlock_t *lock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + spin_release(&lock->dep_map, 1, _RET_IP_); + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) +{ + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(__rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * (like raw spinlocks do), we lock and unlock, to force the kernel to + * schedule if there's
contention: + */ +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_unlock_wait); + +int __lockfunc rt_spin_trylock(spinlock_t *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) +{ + int ret; + + *flags = 0; + ret = rt_mutex_trylock(&lock->lock); + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_irqsave); + +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + rt_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + rt_spin_unlock(lock); + return 0; +} +EXPORT_SYMBOL(atomic_dec_and_spin_lock); + +void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&lock->lock, name); +} +EXPORT_SYMBOL(__rt_spin_lock_init); + +#endif + +static inline int rt_release_bkl(struct rt_mutex *lock, unsigned long flags) +{ + int saved_lock_depth = current->lock_depth; + +#ifdef CONFIG_LOCK_KERNEL + current->lock_depth = -1; + /* + * try_to_take_lock set the waiters, make sure it's + * still correct. 
+ */ + fixup_rt_mutex_waiters(lock); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); + + up(&kernel_sem); + + atomic_spin_lock_irq(&lock->wait_lock); +#endif + return saved_lock_depth; +} + +static inline void rt_reacquire_bkl(int saved_lock_depth) +{ +#ifdef CONFIG_LOCK_KERNEL + down(&kernel_sem); + current->lock_depth = saved_lock_depth; +#endif +} + /** * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * @detect_deadlock: passed to task_blocks_on_rt_mutex @@ -622,7 +1058,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, struct rt_mutex_waiter *waiter, - int detect_deadlock) + int detect_deadlock, unsigned long flags) { int ret = 0; @@ -652,7 +1088,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, */ if (!waiter->task) { ret = task_blocks_on_rt_mutex(lock, waiter, current, - detect_deadlock); + detect_deadlock, flags); /* * If we got woken up by the owner then start loop * all over without going into schedule to try @@ -672,14 +1108,15 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - spin_unlock(&lock->wait_lock); + atomic_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); if (waiter->task) schedule_rt_mutex(lock); - spin_lock(&lock->wait_lock); + atomic_spin_lock_irq(&lock->wait_lock); + set_current_state(state); } @@ -694,20 +1131,29 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, int detect_deadlock) { + int ret = 0, saved_lock_depth = -1; struct rt_mutex_waiter waiter; - int ret = 0; + unsigned long flags; debug_rt_mutex_init_waiter(&waiter); waiter.task = NULL; - spin_lock(&lock->wait_lock); + 
atomic_spin_lock_irqsave(&lock->wait_lock, flags); + init_lists(lock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock)) { - spin_unlock(&lock->wait_lock); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } + /* + * We drop the BKL here before we go into the wait loop to avoid a + * possible deadlock in the scheduler. + */ + if (unlikely(current->lock_depth >= 0)) + saved_lock_depth = rt_release_bkl(lock, flags); + set_current_state(state); /* Setup the timer, when timeout != NULL */ @@ -718,12 +1164,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, } ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, - detect_deadlock); + detect_deadlock, flags); set_current_state(TASK_RUNNING); if (unlikely(waiter.task)) - remove_waiter(lock, &waiter); + remove_waiter(lock, &waiter, flags); /* * try_to_take_rt_mutex() sets the waiter bit @@ -731,7 +1177,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - spin_unlock(&lock->wait_lock); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ if (unlikely(timeout)) @@ -745,6 +1191,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(ret)) rt_mutex_adjust_prio(current); + /* Must we reacquire the BKL?
*/ + if (unlikely(saved_lock_depth >= 0)) + rt_reacquire_bkl(saved_lock_depth); + debug_rt_mutex_free_waiter(&waiter); return ret; @@ -756,12 +1206,15 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { + unsigned long flags; int ret = 0; - spin_lock(&lock->wait_lock); + atomic_spin_lock_irqsave(&lock->wait_lock, flags); if (likely(rt_mutex_owner(lock) != current)) { + init_lists(lock); + ret = try_to_take_rt_mutex(lock); /* * try_to_take_rt_mutex() sets the lock waiters @@ -770,7 +1223,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) fixup_rt_mutex_waiters(lock); } - spin_unlock(&lock->wait_lock); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } @@ -781,7 +1234,9 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) { - spin_lock(&lock->wait_lock); + unsigned long flags; + + atomic_spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -789,13 +1244,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock) if (!rt_mutex_has_waiters(lock)) { lock->owner = NULL; - spin_unlock(&lock->wait_lock); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); return; } - wakeup_next_waiter(lock); + wakeup_next_waiter(lock, 0); - spin_unlock(&lock->wait_lock); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); /* Undo pi boosting if necessary: */ rt_mutex_adjust_prio(current); @@ -857,6 +1312,27 @@ rt_mutex_fastunlock(struct rt_mutex *lock, } /** + * rt_mutex_lock_killable - lock a rt_mutex killable + * + * @lock: the rt_mutex to be locked + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int __sched rt_mutex_lock_killable(struct rt_mutex *lock, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_fastlock(lock, TASK_KILLABLE, + detect_deadlock, 
rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + +/** * rt_mutex_lock - lock a rt_mutex * * @lock: the rt_mutex to be locked @@ -970,8 +1446,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; - spin_lock_init(&lock->wait_lock); - plist_head_init(&lock->wait_list, &lock->wait_lock); + atomic_spin_lock_init(&lock->wait_lock); + plist_head_init_atomic(&lock->wait_list, &lock->wait_lock); debug_rt_mutex_init(lock, name); } @@ -1030,22 +1506,25 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, int detect_deadlock) { + unsigned long flags; int ret; - spin_lock(&lock->wait_lock); + atomic_spin_lock_irqsave(&lock->wait_lock, flags); mark_rt_mutex_waiters(lock); - if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { + if (!rt_mutex_owner(lock) || + try_to_steal_lock(lock, task, STEAL_NORMAL)) { /* We got the lock for task. */ debug_rt_mutex_lock(lock); rt_mutex_set_owner(lock, task, 0); - spin_unlock(&lock->wait_lock); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); rt_mutex_deadlock_account_lock(lock, task); return 1; } - ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock, + flags); if (ret && !waiter->task) { /* @@ -1056,7 +1535,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, */ ret = 0; } - spin_unlock(&lock->wait_lock); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); debug_rt_mutex_print_deadlock(waiter); @@ -1104,19 +1583,20 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int detect_deadlock) { + unsigned long flags; int ret; - spin_lock(&lock->wait_lock); + atomic_spin_lock_irqsave(&lock->wait_lock, flags); set_current_state(TASK_INTERRUPTIBLE); ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, - detect_deadlock); + detect_deadlock, flags);
set_current_state(TASK_RUNNING); if (unlikely(waiter->task)) - remove_waiter(lock, waiter); + remove_waiter(lock, waiter, flags); /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might @@ -1124,7 +1604,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - spin_unlock(&lock->wait_lock); + atomic_spin_unlock_irqrestore(&lock->wait_lock, flags); /* * Readjust priority, when we did not get the lock. We might have been diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 97a2f81..4df690c 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -129,6 +129,26 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int detect_deadlock); + +#define STEAL_LATERAL 1 +#define STEAL_NORMAL 0 + +/* + * Note that RT tasks are excluded from lateral-steals to prevent the + * introduction of an unbounded latency + */ +static inline int lock_is_stealable(struct task_struct *task, + struct task_struct *pendowner, int mode) +{ + if (mode == STEAL_NORMAL || rt_task(task)) { + if (task->prio >= pendowner->prio) + return 0; + } else if (task->prio > pendowner->prio) + return 0; + + return 1; +} + #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" #else diff --git a/kernel/rwlock.c b/kernel/rwlock.c new file mode 100644 index 0000000..20a357d --- /dev/null +++ b/kernel/rwlock.c @@ -0,0 +1,226 @@ +/* + * Copyright (2004) Linus Torvalds + * + * Author: Zwane Mwaikambo <zwane@fsmlabs.com> + * + * Copyright (2004, 2005) Ingo Molnar + * + * This file contains the spinlock/rwlock implementations for the + * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) + * + * Note that some architectures have special knowledge about the + * stack frames of these functions in their profile_pc. If you + * change anything significant here that could change the stack + * frame contact the architecture maintainers. 
+ */ + +#ifndef CONFIG_PREEMPT_RT + +#include <linux/linkage.h> +#include <linux/preempt.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/debug_locks.h> +#include <linux/module.h> + +#include "lock-internals.h" + +int __lockfunc _read_trylock(rwlock_t *lock) +{ + preempt_disable(); + if (_raw_read_trylock(lock)) { + rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); + return 1; + } + + preempt_enable(); + return 0; +} +EXPORT_SYMBOL(_read_trylock); + +int __lockfunc _write_trylock(rwlock_t *lock) +{ + preempt_disable(); + if (_raw_write_trylock(lock)) { + rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); + return 1; + } + + preempt_enable(); + return 0; +} +EXPORT_SYMBOL(_write_trylock); + +/* + * If lockdep is enabled then we use the non-preemption spin-ops + * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are + * not re-enabled during lock-acquire (which the preempt-spin-ops do): + */ +#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) + +void __lockfunc _read_lock(rwlock_t *lock) +{ + preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); +} +EXPORT_SYMBOL(_read_lock); + +unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock, + _raw_read_lock_flags, &flags); + return flags; +} +EXPORT_SYMBOL(_read_lock_irqsave); + +void __lockfunc _read_lock_irq(rwlock_t *lock) +{ + local_irq_disable(); + preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); +} +EXPORT_SYMBOL(_read_lock_irq); + +void __lockfunc _read_lock_bh(rwlock_t *lock) +{ + local_bh_disable(); + preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); + 
LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); +} +EXPORT_SYMBOL(_read_lock_bh); + +unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock, + _raw_write_lock_flags, &flags); + return flags; +} +EXPORT_SYMBOL(_write_lock_irqsave); + +void __lockfunc _write_lock_irq(rwlock_t *lock) +{ + local_irq_disable(); + preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); +} +EXPORT_SYMBOL(_write_lock_irq); + +void __lockfunc _write_lock_bh(rwlock_t *lock) +{ + local_bh_disable(); + preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); +} +EXPORT_SYMBOL(_write_lock_bh); + +void __lockfunc _write_lock(rwlock_t *lock) +{ + preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); +} + +EXPORT_SYMBOL(_write_lock); + +#else /* CONFIG_PREEMPT: */ + +/* + * Build preemption-friendly versions of the following + * lock-spinning functions: + * + * _[read|write]_lock() + * _[read|write]_lock_irq() + * _[read|write]_lock_irqsave() + * _[read|write]_lock_bh() + */ +BUILD_LOCK_OPS(read, read, rwlock); +BUILD_LOCK_OPS(write, write, rwlock); + +#endif /* CONFIG_PREEMPT */ + +void __lockfunc _write_unlock(rwlock_t *lock) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_write_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(_write_unlock); + +void __lockfunc _read_unlock(rwlock_t *lock) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_read_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(_read_unlock); + +void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + 
_raw_read_unlock(lock); + local_irq_restore(flags); + preempt_enable(); +} +EXPORT_SYMBOL(_read_unlock_irqrestore); + +void __lockfunc _read_unlock_irq(rwlock_t *lock) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_read_unlock(lock); + local_irq_enable(); + preempt_enable(); +} +EXPORT_SYMBOL(_read_unlock_irq); + +void __lockfunc _read_unlock_bh(rwlock_t *lock) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_read_unlock(lock); + __preempt_enable_no_resched(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); +} +EXPORT_SYMBOL(_read_unlock_bh); + +void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_write_unlock(lock); + local_irq_restore(flags); + preempt_enable(); +} +EXPORT_SYMBOL(_write_unlock_irqrestore); + +void __lockfunc _write_unlock_irq(rwlock_t *lock) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_write_unlock(lock); + local_irq_enable(); + preempt_enable(); +} +EXPORT_SYMBOL(_write_unlock_irq); + +void __lockfunc _write_unlock_bh(rwlock_t *lock) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_write_unlock(lock); + __preempt_enable_no_resched(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); +} +EXPORT_SYMBOL(_write_unlock_bh); + +#endif diff --git a/kernel/rwsem.c b/kernel/rwsem.c index cae050b..6c6e7fa 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -16,20 +16,19 @@ /* * lock for reading */ -void __sched down_read(struct rw_semaphore *sem) +void __sched anon_down_read(struct rw_anon_semaphore *sem) { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } - -EXPORT_SYMBOL(down_read); +EXPORT_SYMBOL(anon_down_read); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -int down_read_trylock(struct rw_semaphore *sem) +int anon_down_read_trylock(struct rw_anon_semaphore *sem) { int ret = __down_read_trylock(sem); @@ 
-37,26 +36,24 @@ int down_read_trylock(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); return ret; } - -EXPORT_SYMBOL(down_read_trylock); +EXPORT_SYMBOL(anon_down_read_trylock); /* * lock for writing */ -void __sched down_write(struct rw_semaphore *sem) +void __sched anon_down_write(struct rw_anon_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } - -EXPORT_SYMBOL(down_write); +EXPORT_SYMBOL(anon_down_write); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -int down_write_trylock(struct rw_semaphore *sem) +int anon_down_write_trylock(struct rw_anon_semaphore *sem) { int ret = __down_write_trylock(sem); @@ -64,37 +61,34 @@ int down_write_trylock(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); return ret; } - -EXPORT_SYMBOL(down_write_trylock); +EXPORT_SYMBOL(anon_down_write_trylock); /* * release a read lock */ -void up_read(struct rw_semaphore *sem) +void anon_up_read(struct rw_anon_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); __up_read(sem); } - -EXPORT_SYMBOL(up_read); +EXPORT_SYMBOL(anon_up_read); /* * release a write lock */ -void up_write(struct rw_semaphore *sem) +void anon_up_write(struct rw_anon_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); __up_write(sem); } - -EXPORT_SYMBOL(up_write); +EXPORT_SYMBOL(anon_up_write); /* * downgrade write lock to read lock */ -void downgrade_write(struct rw_semaphore *sem) +void anon_downgrade_write(struct rw_anon_semaphore *sem) { /* * lockdep: a downgraded write will live on as a write @@ -102,46 +96,41 @@ void downgrade_write(struct rw_semaphore *sem) */ __downgrade_write(sem); } - -EXPORT_SYMBOL(downgrade_write); +EXPORT_SYMBOL(anon_downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void down_read_nested(struct rw_semaphore *sem, int subclass) +void anon_down_read_nested(struct rw_anon_semaphore *sem, int subclass) { 
might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } +EXPORT_SYMBOL(anon_down_read_nested); -EXPORT_SYMBOL(down_read_nested); - -void down_read_non_owner(struct rw_semaphore *sem) +void anon_down_read_non_owner(struct rw_anon_semaphore *sem) { might_sleep(); __down_read(sem); } +EXPORT_SYMBOL(anon_down_read_non_owner); -EXPORT_SYMBOL(down_read_non_owner); - -void down_write_nested(struct rw_semaphore *sem, int subclass) +void anon_down_write_nested(struct rw_anon_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } +EXPORT_SYMBOL(anon_down_write_nested); -EXPORT_SYMBOL(down_write_nested); - -void up_read_non_owner(struct rw_semaphore *sem) +void anon_up_read_non_owner(struct rw_anon_semaphore *sem) { __up_read(sem); } - -EXPORT_SYMBOL(up_read_non_owner); +EXPORT_SYMBOL(anon_up_read_non_owner); #endif diff --git a/kernel/sched.c b/kernel/sched.c index 1b59e26..2122102 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4,6 +4,7 @@ * Kernel scheduler and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe @@ -16,6 +17,7 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2004-10-13 Real-Time Preemption support by Ingo Molnar * 2007-04-15 Work begun on replacing all interactivity tuning with a * fair scheduling design by Con Kolivas. 
* 2007-05-05 Load balancing (smp-nice) and other improvements @@ -61,6 +63,7 @@ #include <linux/sysctl.h> #include <linux/syscalls.h> #include <linux/times.h> +#include <linux/kallsyms.h> #include <linux/tsacct_kern.h> #include <linux/kprobes.h> #include <linux/delayacct.h> @@ -107,6 +110,20 @@ #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT +#if (BITS_PER_LONG < 64) +#define JIFFIES_TO_NS64(TIME) \ + ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ))) + +#define NS64_TO_JIFFIES(TIME) \ + ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \ + (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME))) +#else /* BITS_PER_LONG < 64 */ + +#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME) +#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME) + +#endif /* BITS_PER_LONG < 64 */ + /* * These are the 'tuning knobs' of the scheduler: * @@ -144,6 +161,32 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) } #endif +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * Tweaks for current + */ + +#ifdef CURRENT_PTR +struct task_struct * const ___current = &init_task; +struct task_struct ** const current_ptr = (struct task_struct ** const)&___current; +struct thread_info * const current_ti = &init_thread_union.thread_info; +struct thread_info ** const current_ti_ptr = (struct thread_info ** const)&current_ti; + +EXPORT_SYMBOL(___current); +EXPORT_SYMBOL(current_ti); + +/* + * The scheduler itself doesn't want 'current' to be cached + * during context-switches: + */ +# undef current +# define current __current() +# undef current_thread_info +# define current_thread_info() __current_thread_info() +#endif + static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) @@ -166,7 +209,7 @@ struct rt_prio_array { struct rt_bandwidth { /* nests inside the rq lock: */ - spinlock_t rt_runtime_lock; + atomic_spinlock_t rt_runtime_lock; ktime_t rt_period; u64
rt_runtime; struct hrtimer rt_period_timer; @@ -203,10 +246,11 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period = ns_to_ktime(period); rt_b->rt_runtime = runtime; - spin_lock_init(&rt_b->rt_runtime_lock); + atomic_spin_lock_init(&rt_b->rt_runtime_lock); hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.irqsafe = 1; rt_b->rt_period_timer.function = sched_rt_period_timer; } @@ -225,7 +269,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) if (hrtimer_active(&rt_b->rt_period_timer)) return; - spin_lock(&rt_b->rt_runtime_lock); + atomic_spin_lock(&rt_b->rt_runtime_lock); for (;;) { unsigned long delta; ktime_t soft, hard; @@ -242,7 +286,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, HRTIMER_MODE_ABS_PINNED, 0); } - spin_unlock(&rt_b->rt_runtime_lock); + atomic_spin_unlock(&rt_b->rt_runtime_lock); } #ifdef CONFIG_RT_GROUP_SCHED @@ -497,11 +541,12 @@ struct rt_rq { int overloaded; struct plist_head pushable_tasks; #endif + unsigned long rt_nr_uninterruptible; int rt_throttled; u64 rt_time; u64 rt_runtime; /* Nests inside the rq lock: */ - spinlock_t rt_runtime_lock; + atomic_spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; @@ -564,7 +609,7 @@ static struct root_domain def_root_domain; */ struct rq { /* runqueue lock: */ - spinlock_t lock; + atomic_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -602,6 +647,8 @@ struct rq { */ unsigned long nr_uninterruptible; + unsigned long switch_timestamp; + unsigned long slice_avg; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; @@ -660,9 +707,21 @@ struct rq { /* BKL stats */ unsigned int bkl_count; + + /* RT-overload stats: */ + unsigned long rto_schedule; + unsigned long rto_schedule_tail; + unsigned long rto_wakeup; + unsigned long 
rto_pulled; + unsigned long rto_pushed; #endif }; +struct task_struct *rq_curr(struct rq *rq) +{ + return rq->curr; +} + static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) @@ -699,6 +758,13 @@ inline void update_rq_clock(struct rq *rq) rq->clock = sched_clock_cpu(cpu_of(rq)); } +#ifndef CONFIG_SMP +int task_is_current(struct task_struct *task) +{ + return task_rq(task)->curr == task; +} +#endif + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -721,7 +787,7 @@ int runqueue_is_locked(void) struct rq *rq = cpu_rq(cpu); int ret; - ret = spin_is_locked(&rq->lock); + ret = atomic_spin_is_locked(&rq->lock); put_cpu(); return ret; } @@ -887,11 +953,23 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +/* + * We really dont want to do anything complex within switch_to() + * on PREEMPT_RT - this check enforces this. + */ +#ifdef prepare_arch_switch +# ifdef CONFIG_PREEMPT_RT +# error FIXME +# else +# define _finish_arch_switch finish_arch_switch +# endif +#endif + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif #ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) +# define _finish_arch_switch(prev) do { } while (0) #endif static inline int task_current(struct rq *rq, struct task_struct *p) @@ -899,18 +977,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { +#ifdef CONFIG_SMP + return p->oncpu; +#else return task_current(rq, p); +#endif } +#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing 
that cares + * here. + */ + next->oncpu = 1; +#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; @@ -922,18 +1021,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - spin_unlock_irq(&rq->lock); + atomic_spin_unlock(&rq->lock); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { @@ -946,9 +1037,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) next->oncpu = 1; #endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - spin_unlock_irq(&rq->lock); + atomic_spin_unlock_irq(&rq->lock); #else - spin_unlock(&rq->lock); + atomic_spin_unlock(&rq->lock); #endif } @@ -963,8 +1054,8 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) smp_wmb(); prev->oncpu = 0; #endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_disable(); #endif } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -978,10 +1069,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) { for (;;) { struct rq *rq = task_rq(p); - spin_lock(&rq->lock); + atomic_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - spin_unlock(&rq->lock); + atomic_spin_unlock(&rq->lock); } } @@ -998,10 +1089,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) for (;;) { 
local_irq_save(*flags); rq = task_rq(p); - spin_lock(&rq->lock); + atomic_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - spin_unlock_irqrestore(&rq->lock, *flags); + atomic_spin_unlock_irqrestore(&rq->lock, *flags); } } @@ -1010,19 +1101,19 @@ void task_rq_unlock_wait(struct task_struct *p) struct rq *rq = task_rq(p); smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - spin_unlock_wait(&rq->lock); + atomic_spin_unlock_wait(&rq->lock); } static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { - spin_unlock(&rq->lock); + atomic_spin_unlock(&rq->lock); } static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) __releases(rq->lock) { - spin_unlock_irqrestore(&rq->lock, *flags); + atomic_spin_unlock_irqrestore(&rq->lock, *flags); } /* @@ -1035,7 +1126,7 @@ static struct rq *this_rq_lock(void) local_irq_disable(); rq = this_rq(); - spin_lock(&rq->lock); + atomic_spin_lock(&rq->lock); return rq; } @@ -1082,10 +1173,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - spin_lock(&rq->lock); + atomic_spin_lock(&rq->lock); update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); - spin_unlock(&rq->lock); + atomic_spin_unlock(&rq->lock); return HRTIMER_NORESTART; } @@ -1098,10 +1189,10 @@ static void __hrtick_start(void *arg) { struct rq *rq = arg; - spin_lock(&rq->lock); + atomic_spin_lock(&rq->lock); hrtimer_restart(&rq->hrtick_timer); rq->hrtick_csd_pending = 0; - spin_unlock(&rq->lock); + atomic_spin_unlock(&rq->lock); } /* @@ -1176,6 +1267,7 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.irqsafe = 1; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -1208,7 +1300,7 @@ static void resched_task(struct task_struct *p) { int cpu; - assert_spin_locked(&task_rq(p)->lock); + 
assert_atomic_spin_locked(&task_rq(p)->lock); if (test_tsk_need_resched(p)) return; @@ -1230,10 +1322,10 @@ static void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!spin_trylock_irqsave(&rq->lock, flags)) + if (!atomic_spin_trylock_irqsave(&rq->lock, flags)) return; resched_task(cpu_curr(cpu)); - spin_unlock_irqrestore(&rq->lock, flags); + atomic_spin_unlock_irqrestore(&rq->lock, flags); } #ifdef CONFIG_NO_HZ @@ -1251,7 +1343,7 @@ void wake_up_idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); - if (cpu == smp_processor_id()) + if (cpu == raw_smp_processor_id()) return; /* @@ -1281,7 +1373,7 @@ void wake_up_idle_cpu(int cpu) #else /* !CONFIG_SMP */ static void resched_task(struct task_struct *p) { - assert_spin_locked(&task_rq(p)->lock); + assert_atomic_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } #endif /* CONFIG_SMP */ @@ -1544,11 +1636,11 @@ update_group_shares_cpu(struct task_group *tg, int cpu, struct rq *rq = cpu_rq(cpu); unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + atomic_spin_lock_irqsave(&rq->lock, flags); tg->cfs_rq[cpu]->shares = shares; __set_se_shares(tg->se[cpu], shares); - spin_unlock_irqrestore(&rq->lock, flags); + atomic_spin_unlock_irqrestore(&rq->lock, flags); } } @@ -1627,9 +1719,9 @@ static void update_shares(struct sched_domain *sd) static void update_shares_locked(struct rq *rq, struct sched_domain *sd) { - spin_unlock(&rq->lock); + atomic_spin_unlock(&rq->lock); update_shares(sd); - spin_lock(&rq->lock); + atomic_spin_lock(&rq->lock); } static void update_h_load(long cpu) @@ -1664,7 +1756,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - spin_unlock(&this_rq->lock); + atomic_spin_unlock(&this_rq->lock); double_rq_lock(this_rq, busiest); return 1; @@ -1685,14 +1777,14 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) { int ret = 0; - if 
(unlikely(!spin_trylock(&busiest->lock))) { + if (unlikely(!atomic_spin_trylock(&busiest->lock))) { if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); + atomic_spin_unlock(&this_rq->lock); + atomic_spin_lock(&busiest->lock); + atomic_spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); ret = 1; } else - spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); + atomic_spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); } return ret; } @@ -1706,7 +1798,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) { if (unlikely(!irqs_disabled())) { /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); + atomic_spin_unlock(&this_rq->lock); BUG_ON(1); } @@ -1716,7 +1808,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - spin_unlock(&busiest->lock); + atomic_spin_unlock(&busiest->lock); lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } #endif @@ -1832,6 +1924,8 @@ static inline int normal_prio(struct task_struct *p) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); + +// trace_special_pid(p->pid, PRIO(p), __PRIO(prio)); return prio; } @@ -2156,7 +2250,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * yield - it could be a while. */ if (unlikely(on_rq)) { - schedule_timeout_uninterruptible(1); + ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); continue; } @@ -2412,7 +2509,8 @@ void task_oncpu_function_call(struct task_struct *p, * * returns failure only if the task is already active. 
*/ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int sync, int mutex) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; @@ -2438,6 +2536,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) } #endif +#ifdef CONFIG_PREEMPT_RT + /* + * sync wakeups can increase wakeup latencies: + */ + if (rt_task(p)) + sync = 0; +#endif smp_wmb(); rq = task_rq_lock(p, &flags); update_rq_clock(rq); @@ -2521,7 +2626,18 @@ out_running: trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); - p->state = TASK_RUNNING; + /* + * For a mutex wakeup we or TASK_RUNNING_MUTEX to the task + * state to preserve the original state, so a real wakeup + * still can see the (UN)INTERRUPTIBLE bits in the state check + * above. We dont have to worry about the | TASK_RUNNING_MUTEX + * here. The waiter is serialized by the mutex lock and nobody + * else can fiddle with p->state as we hold rq lock. 
+ */ + if (mutex) + p->state |= TASK_RUNNING_MUTEX; + else + p->state = TASK_RUNNING; #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2545,13 +2661,31 @@ out: */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_ALL, 0); + return try_to_wake_up(p, TASK_ALL, 0, 0); } EXPORT_SYMBOL(wake_up_process); +int wake_up_process_sync(struct task_struct * p) +{ + return try_to_wake_up(p, TASK_ALL, 1, 0); +} +EXPORT_SYMBOL(wake_up_process_sync); + +int wake_up_process_mutex(struct task_struct * p) +{ + return try_to_wake_up(p, TASK_ALL, 0, 1); +} +EXPORT_SYMBOL(wake_up_process_mutex); + +int wake_up_process_mutex_sync(struct task_struct * p) +{ + return try_to_wake_up(p, TASK_ALL, 1, 1); +} +EXPORT_SYMBOL(wake_up_process_mutex_sync); + int wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + return try_to_wake_up(p, state, 0, 0); } /* @@ -2647,7 +2781,7 @@ void sched_fork(struct task_struct *p, int clone_flags) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) p->oncpu = 0; #endif #ifdef CONFIG_PREEMPT @@ -2725,8 +2859,17 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr) struct preempt_notifier *notifier; struct hlist_node *node; + if (hlist_empty(&curr->preempt_notifiers)) + return; + + /* + * The KVM sched in notifier expects to be called with + * interrupts enabled. 
+ */ + local_irq_enable(); hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) notifier->ops->sched_in(notifier, raw_smp_processor_id()); + local_irq_disable(); } static void @@ -2817,7 +2960,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * Manfred Spraul <manfred@colorfullife.com> */ prev_state = prev->state; - finish_arch_switch(prev); + _finish_arch_switch(prev); perf_counter_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP @@ -2826,8 +2969,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) #endif fire_sched_in_preempt_notifiers(current); + /* + * Delay the final freeing of the mm or task, so that we dont have + * to do complex work from within the scheduler: + */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this @@ -2845,12 +2992,15 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); + preempt_disable(); + finish_task_switch(this_rq(), prev); + __preempt_enable_no_resched(); + local_irq_enable(); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); +#else + preempt_check_resched(); #endif if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); @@ -2898,6 +3048,11 @@ context_switch(struct rq *rq, struct task_struct *prev, spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif +#ifdef CURRENT_PTR + barrier(); + *current_ptr = next; + *current_ti_ptr = next->thread_info; +#endif /* Here we just switch the register state and the stack. 
*/ switch_to(prev, next, prev); @@ -2944,6 +3099,11 @@ unsigned long nr_uninterruptible(void) return sum; } +unsigned long nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_uninterruptible; +} + unsigned long long nr_context_switches(void) { int i; @@ -2962,6 +3122,13 @@ unsigned long nr_iowait(void) for_each_possible_cpu(i) sum += atomic_read(&cpu_rq(i)->nr_iowait); + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + return sum; } @@ -3091,15 +3258,17 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) { BUG_ON(!irqs_disabled()); if (rq1 == rq2) { - spin_lock(&rq1->lock); + atomic_spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ } else { if (rq1 < rq2) { - spin_lock(&rq1->lock); - spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + atomic_spin_lock(&rq1->lock); + atomic_spin_lock_nested(&rq2->lock, + SINGLE_DEPTH_NESTING); } else { - spin_lock(&rq2->lock); - spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + atomic_spin_lock(&rq2->lock); + atomic_spin_lock_nested(&rq1->lock, + SINGLE_DEPTH_NESTING); } } update_rq_clock(rq1); @@ -3116,9 +3285,9 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - spin_unlock(&rq1->lock); + atomic_spin_unlock(&rq1->lock); if (rq1 != rq2) - spin_unlock(&rq2->lock); + atomic_spin_unlock(&rq2->lock); else __release(rq2->lock); } @@ -4105,14 +4274,15 @@ redo: if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - spin_lock_irqsave(&busiest->lock, flags); + atomic_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { - spin_unlock_irqrestore(&busiest->lock, flags); + atomic_spin_unlock_irqrestore(&busiest->lock, + flags); all_pinned = 1; goto out_one_pinned; 
} @@ -4122,7 +4292,7 @@ redo: busiest->push_cpu = this_cpu; active_balance = 1; } - spin_unlock_irqrestore(&busiest->lock, flags); + atomic_spin_unlock_irqrestore(&busiest->lock, flags); if (active_balance) wake_up_process(busiest->migration_thread); @@ -4304,10 +4474,10 @@ redo: /* * Should not call ttwu while holding a rq->lock */ - spin_unlock(&this_rq->lock); + atomic_spin_unlock(&this_rq->lock); if (active_balance) wake_up_process(busiest->migration_thread); - spin_lock(&this_rq->lock); + atomic_spin_lock(&this_rq->lock); } else sd->nr_balance_failed = 0; @@ -4712,7 +4882,7 @@ out: */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); + int this_cpu = raw_smp_processor_id(); struct rq *this_rq = cpu_rq(this_cpu); enum cpu_idle_type idle = this_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; @@ -4921,7 +5091,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime, /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) + if (rt_task(p)) + cpustat->user_rt = cputime64_add(cpustat->user_rt, tmp); + else if (TASK_NICE(p) > 0) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); @@ -4983,8 +5155,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset, tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (softirq_count() || (p->flags & PF_SOFTIRQ)) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else if (rt_task(p)) + cpustat->system_rt = cputime64_add(cpustat->system_rt, tmp); else cpustat->system = cputime64_add(cpustat->system, tmp); @@ -5139,11 +5313,14 @@ void scheduler_tick(void) sched_clock_tick(); - spin_lock(&rq->lock); + BUG_ON(!irqs_disabled()); + + atomic_spin_lock(&rq->lock); update_rq_clock(rq); update_cpu_load(rq); - curr->sched_class->task_tick(rq, curr, 0); - 
spin_unlock(&rq->lock); + if (curr != rq->idle && curr->se.on_rq) + curr->sched_class->task_tick(rq, curr, 0); + atomic_spin_unlock(&rq->lock); perf_counter_task_tick(curr, cpu); @@ -5163,6 +5340,19 @@ notrace unsigned long get_parent_ip(unsigned long addr) return addr; } +#ifdef CONFIG_DEBUG_PREEMPT +void notrace preempt_enable_no_resched(void) +{ + barrier(); + dec_preempt_count(); + + WARN_ONCE(!preempt_count(), + KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n", + current->comm, current->pid); +} +EXPORT_SYMBOL(preempt_enable_no_resched); +#endif + #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) @@ -5219,8 +5409,8 @@ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); + printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n", + prev->comm, preempt_count(), prev->pid, smp_processor_id()); debug_show_held_locks(prev); print_modules(); @@ -5238,12 +5428,14 @@ static noinline void __schedule_bug(struct task_struct *prev) */ static inline void schedule_debug(struct task_struct *prev) { +// WARN_ON(system_state == SYSTEM_BOOTING); + /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) + if (unlikely(in_atomic() && !prev->exit_state)) __schedule_bug(prev); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -5314,15 +5506,13 @@ pick_next_task(struct rq *rq) /* * schedule() is the main scheduler function. 
*/ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; -need_resched: - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -5330,25 +5520,32 @@ need_resched: switch_count = &prev->nivcsw; release_kernel_lock(prev); -need_resched_nonpreemptible: schedule_debug(prev); + preempt_disable(); + if (sched_feat(HRTICK)) hrtick_clear(rq); - spin_lock_irq(&rq->lock); + atomic_spin_lock_irq(&rq->lock); update_rq_clock(rq); clear_tsk_need_resched(prev); - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if (!(prev->state & TASK_RUNNING_MUTEX) && prev->state && + !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; - else + else { + touch_softlockup_watchdog(); deactivate_task(rq, prev, 1); + } switch_count = &prev->nvcsw; } + if (preempt_count() & PREEMPT_ACTIVE) + sub_preempt_count(PREEMPT_ACTIVE); + #ifdef CONFIG_SMP if (prev->sched_class->pre_schedule) prev->sched_class->pre_schedule(rq, prev); @@ -5375,19 +5572,28 @@ need_resched_nonpreemptible: */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else - spin_unlock_irq(&rq->lock); + __preempt_enable_no_resched(); + } else { + __preempt_enable_no_resched(); + atomic_spin_unlock(&rq->lock); + } + + reacquire_kernel_lock(current); +} - if (unlikely(reacquire_kernel_lock(current) < 0)) - goto need_resched_nonpreemptible; +asmlinkage void __sched schedule(void) +{ +need_resched: + local_irq_disable(); + __schedule(); + local_irq_enable(); - preempt_enable_no_resched(); if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT) /* * Look out! "owner" is an entirely speculative pointer * access and not reliable. 
@@ -5449,6 +5655,35 @@ out: #endif #ifdef CONFIG_PREEMPT + +/* + * Global flag to turn preemption off on a CONFIG_PREEMPT kernel: + */ +int kernel_preemption = 1; + +static int __init preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) { + if (kernel_preemption) { + printk(KERN_INFO "turning off kernel preemption!\n"); + kernel_preemption = 0; + } + return 1; + } + if (!strncmp(str, "on", 2)) { + if (!kernel_preemption) { + printk(KERN_INFO "turning on kernel preemption!\n"); + kernel_preemption = 1; + } + return 1; + } + get_option(&str, &kernel_preemption); + + return 1; +} + +__setup("preempt=", preempt_setup); + /* * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt @@ -5457,7 +5692,11 @@ out: asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); + struct task_struct *task = current; + int saved_lock_depth; + if (!kernel_preemption) + return; /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. @@ -5466,9 +5705,19 @@ asmlinkage void __sched preempt_schedule(void) return; do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; + __schedule(); + task->lock_depth = saved_lock_depth; + local_irq_enable(); /* * Check again in case we missed a preemption opportunity @@ -5480,24 +5729,40 @@ asmlinkage void __sched preempt_schedule(void) EXPORT_SYMBOL(preempt_schedule); /* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. 
+ * this is the entry point for the IRQ return path. Called with + * interrupts disabled. To avoid infinite irq-entry recursion problems + * with fast-paced IRQ sources we do all of this carefully to never + * enable interrupts again. */ asmlinkage void __sched preempt_schedule_irq(void) { struct thread_info *ti = current_thread_info(); + struct task_struct *task = current; + int saved_lock_depth; - /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + if (!kernel_preemption) + return; + /* + * If there is a non-zero preempt_count then just return. + * (interrupts are disabled) + */ + if (unlikely(ti->preempt_count)) + return; do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); - local_irq_enable(); - schedule(); + + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; + __schedule(); local_irq_disable(); - sub_preempt_count(PREEMPT_ACTIVE); + task->lock_depth = saved_lock_depth; /* * Check again in case we missed a preemption opportunity @@ -5512,7 +5777,7 @@ asmlinkage void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { - return try_to_wake_up(curr->private, mode, sync); + return try_to_wake_up(curr->private, mode, sync, 0); } EXPORT_SYMBOL(default_wake_function); @@ -5555,7 +5820,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode, unsigned long flags; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); + __wake_up_common(q, mode, nr_exclusive, 1, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(__wake_up); @@ -5635,7 +5900,7 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done++; - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); + __wake_up_common(&x->wait, TASK_NORMAL, 1, 1, NULL); 
spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -5655,7 +5920,7 @@ void complete_all(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); + __wake_up_common(&x->wait, TASK_NORMAL, 0, 1, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -5869,19 +6134,19 @@ long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) } EXPORT_SYMBOL(sleep_on_timeout); -#ifdef CONFIG_RT_MUTEXES - /* - * rt_mutex_setprio - set the current priority of a task + * task_setprio - set the current priority of a task * @p: task * @prio: prio value (kernel-internal form) * * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance logic + * and by rcupreempt-boost to boost priorities of tasks sleeping + * with rcu locks. */ -void rt_mutex_setprio(struct task_struct *p, int prio) +void task_setprio(struct task_struct *p, int prio) { unsigned long flags; int oldprio, on_rq, running; @@ -5891,6 +6156,25 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); + + /* + * Idle task boosting is a nono in general. There is one + * exception, when NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. 
+ */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + update_rq_clock(rq); oldprio = p->prio; @@ -5908,6 +6192,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; + trace_sched_task_setprio(rq, p, oldprio); + if (running) p->sched_class->set_curr_task(rq); if (on_rq) { @@ -5915,11 +6201,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) check_class_changed(rq, p, prev_class, oldprio, running); } + +out_unlock: task_rq_unlock(rq, &flags); } -#endif - void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; @@ -6199,7 +6485,7 @@ recheck: * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: */ - spin_lock_irqsave(&p->pi_lock, flags); + atomic_spin_lock_irqsave(&p->pi_lock, flags); /* * To be able to change p->policy safely, the apropriate * runqueue lock must be held. @@ -6209,7 +6495,7 @@ recheck: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + atomic_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } update_rq_clock(rq); @@ -6231,7 +6517,7 @@ recheck: check_class_changed(rq, p, prev_class, oldprio, running); } __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + atomic_spin_unlock_irqrestore(&p->pi_lock, flags); rt_mutex_adjust_pi(p); @@ -6557,9 +6843,9 @@ SYSCALL_DEFINE0(sched_yield) __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); _raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + local_irq_enable(); - schedule(); + preempt_enable_and_schedule(); return 0; } @@ -6569,9 +6855,40 @@ static inline int should_resched(void) return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); } +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) +void __might_sleep(char *file, int line) +{ +#ifdef in_atomic + static unsigned long 
prev_jiffy; /* ratelimiting */ + + if ((!in_atomic() && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); +#endif +} +EXPORT_SYMBOL(__might_sleep); +#endif + static void __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) __might_sleep(__FILE__, __LINE__); #endif /* @@ -6580,10 +6897,11 @@ static void __cond_resched(void) * cond_resched() call. */ do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + __schedule(); } while (need_resched()); + local_irq_enable(); } int __sched _cond_resched(void) @@ -6622,9 +6940,16 @@ int cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(cond_resched_lock); +/* + * Voluntarily preempt a process context that has softirqs disabled: + */ int __sched cond_resched_softirq(void) { - BUG_ON(!in_softirq()); +#ifndef CONFIG_PREEMPT_SOFTIRQS + WARN_ON_ONCE(!in_softirq()); + if (!in_softirq()) + return 0; +#endif if (should_resched()) { local_bh_enable(); @@ -6636,17 +6961,75 @@ int __sched cond_resched_softirq(void) } EXPORT_SYMBOL(cond_resched_softirq); +/* + * Voluntarily preempt a softirq context (possible with softirq threading): + */ +int __sched cond_resched_softirq_context(void) +{ + WARN_ON_ONCE(!in_softirq()); + + if (softirq_need_resched() && system_state == SYSTEM_RUNNING) { + raw_local_irq_disable(); + _local_bh_enable(); + raw_local_irq_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + 
return 0; +} +EXPORT_SYMBOL(cond_resched_softirq_context); + +#ifdef CONFIG_PREEMPT_VOLUNTARY +int voluntary_preemption = 1; +EXPORT_SYMBOL(voluntary_preemption); + +static int __init voluntary_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + voluntary_preemption = 0; + else + get_option(&str, &voluntary_preemption); + if (!voluntary_preemption) + printk("turning off voluntary preemption!\n"); + + return 1; +} + +__setup("voluntary-preempt=", voluntary_preempt_setup); + +#endif + /** * yield - yield the current processor to other threads. * * This is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ -void __sched yield(void) +void __sched __yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); } + +void __sched yield(void) +{ + static int once = 1; + + /* + * it's a bug to rely on yield() with RT priorities. We print + * the first occurrence after bootup ... this will still give + * us an idea about the scope of the problem, without spamming + * the syslog: + */ + if (once && rt_task(current)) { + once = 0; + printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n", + current->comm, current->pid); + dump_stack(); + } + __yield(); +} EXPORT_SYMBOL(yield); /* @@ -6820,6 +7203,7 @@ void sched_show_task(struct task_struct *p) void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; + int do_unlock = 1; #if BITS_PER_LONG == 32 printk(KERN_INFO @@ -6828,7 +7212,16 @@ void show_state_filter(unsigned long state_filter) printk(KERN_INFO " task PC stack pid father\n"); #endif +#ifdef CONFIG_PREEMPT_RT + if (!read_trylock(&tasklist_lock)) { + printk("hm, tasklist_lock write-locked.\n"); + printk("ignoring ...\n"); + do_unlock = 0; + } +#else read_lock(&tasklist_lock); +#endif + do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -6844,7 +7237,8 @@ void show_state_filter(unsigned long state_filter) #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); 
#endif - read_unlock(&tasklist_lock); + if (do_unlock) + read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: */ @@ -6870,7 +7264,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + atomic_spin_lock_irqsave(&rq->lock, flags); __sched_fork(idle); idle->se.exec_start = sched_clock(); @@ -6880,17 +7274,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) __set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) idle->oncpu = 1; #endif - spin_unlock_irqrestore(&rq->lock, flags); + atomic_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) - task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); -#else task_thread_info(idle)->preempt_count = 0; -#endif + /* * The idle tasks have their own, simple scheduling class: */ @@ -7019,11 +7410,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; + unsigned long flags; int ret = 0, on_rq; if (unlikely(!cpu_active(dest_cpu))) return ret; + /* + * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock) + * disabling interrupts - which on PREEMPT_RT does not do: + */ + local_irq_save(flags); + rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); @@ -7048,6 +7446,8 @@ done: ret = 1; fail: double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); + return ret; } @@ -7069,10 +7469,10 @@ static int migration_thread(void *data) struct migration_req *req; struct list_head *head; - spin_lock_irq(&rq->lock); + atomic_spin_lock_irq(&rq->lock); if (cpu_is_offline(cpu)) { - spin_unlock_irq(&rq->lock); + atomic_spin_unlock_irq(&rq->lock); break; } @@ -7084,7 +7484,7 @@ static int migration_thread(void *data) head = &rq->migration_queue; 
if (list_empty(head)) { - spin_unlock_irq(&rq->lock); + atomic_spin_unlock_irq(&rq->lock); schedule(); set_current_state(TASK_INTERRUPTIBLE); continue; @@ -7092,7 +7492,7 @@ static int migration_thread(void *data) req = list_entry(head->next, struct migration_req, list); list_del_init(head->next); - spin_unlock(&rq->lock); + atomic_spin_unlock(&rq->lock); __migrate_task(req->task, cpu, req->dest_cpu); local_irq_enable(); @@ -7214,14 +7614,14 @@ void sched_idle_next(void) * Strictly not necessary since rest of the CPUs are stopped by now * and interrupts disabled on the current cpu. */ - spin_lock_irqsave(&rq->lock, flags); + atomic_spin_lock_irqsave(&rq->lock, flags); __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); update_rq_clock(rq); activate_task(rq, p, 0); - spin_unlock_irqrestore(&rq->lock, flags); + atomic_spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -7236,7 +7636,11 @@ void idle_task_exit(void) if (mm != &init_mm) switch_mm(mm, &init_mm, current); +#ifdef CONFIG_PREEMPT_RT + mmdrop_delayed(mm); +#else mmdrop(mm); +#endif } /* called under rq->lock with disabled interrupts */ @@ -7257,9 +7661,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) * that's OK. No task can be added to this CPU, so iteration is * fine. 
*/ - spin_unlock_irq(&rq->lock); + atomic_spin_unlock_irq(&rq->lock); move_task_off_dead_cpu(dead_cpu, p); - spin_lock_irq(&rq->lock); + atomic_spin_lock_irq(&rq->lock); put_task_struct(p); } @@ -7526,13 +7930,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); + atomic_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } - spin_unlock_irqrestore(&rq->lock, flags); + atomic_spin_unlock_irqrestore(&rq->lock, flags); break; #ifdef CONFIG_HOTPLUG_CPU @@ -7557,14 +7961,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) put_task_struct(rq->migration_thread); rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ - spin_lock_irq(&rq->lock); + atomic_spin_lock_irq(&rq->lock); update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); - spin_unlock_irq(&rq->lock); + atomic_spin_unlock_irq(&rq->lock); cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); @@ -7574,30 +7978,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) * they didn't take sched_hotcpu_mutex. Just wake up * the requestors. 
*/ - spin_lock_irq(&rq->lock); + atomic_spin_lock_irq(&rq->lock); while (!list_empty(&rq->migration_queue)) { struct migration_req *req; req = list_entry(rq->migration_queue.next, struct migration_req, list); list_del_init(&req->list); - spin_unlock_irq(&rq->lock); + atomic_spin_unlock_irq(&rq->lock); complete(&req->done); - spin_lock_irq(&rq->lock); + atomic_spin_lock_irq(&rq->lock); } - spin_unlock_irq(&rq->lock); + atomic_spin_unlock_irq(&rq->lock); break; case CPU_DYING: case CPU_DYING_FROZEN: /* Update our root-domain */ rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); + atomic_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - spin_unlock_irqrestore(&rq->lock, flags); + atomic_spin_unlock_irqrestore(&rq->lock, flags); break; #endif } @@ -7625,7 +8029,7 @@ static int __init migration_init(void) migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); - return err; + return 0; } early_initcall(migration_init); #endif @@ -7818,7 +8222,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) struct root_domain *old_rd = NULL; unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + atomic_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { old_rd = rq->rd; @@ -7844,7 +8248,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) set_rq_online(rq); - spin_unlock_irqrestore(&rq->lock, flags); + atomic_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) free_rootdomain(old_rd); @@ -9097,13 +9501,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks, &rq->lock); + plist_head_init_atomic(&rt_rq->pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; - spin_lock_init(&rt_rq->rt_runtime_lock); + 
atomic_spin_lock_init(&rt_rq->rt_runtime_lock); #ifdef CONFIG_RT_GROUP_SCHED rt_rq->rt_nr_boosted = 0; @@ -9263,7 +9667,7 @@ void __init sched_init(void) struct rq *rq; rq = cpu_rq(i); - spin_lock_init(&rq->lock); + atomic_spin_lock_init(&rq->lock); rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; @@ -9358,7 +9762,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_MUTEXES - plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); + plist_head_init_atomic(&init_task.pi_waiters, &init_task.pi_lock); #endif /* @@ -9367,6 +9771,9 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); +#ifdef CONFIG_PREEMPT_RT + printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n"); +#endif /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, @@ -9397,36 +9804,6 @@ void __init sched_init(void) scheduler_running = 1; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line) -{ -#ifdef in_atomic - static unsigned long prev_jiffy; /* ratelimiting */ - - if ((!in_atomic() && !irqs_disabled()) || - system_state != SYSTEM_RUNNING || oops_in_progress) - return; - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); - - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); -#endif -} -EXPORT_SYMBOL(__might_sleep); -#endif - #ifdef CONFIG_MAGIC_SYSRQ static void normalize_task(struct rq *rq, struct task_struct *p) { @@ -9474,13 +9851,13 @@ void normalize_rt_tasks(void) continue; } - spin_lock(&p->pi_lock); + atomic_spin_lock(&p->pi_lock); rq = __task_rq_lock(p); 
normalize_task(rq, p); __task_rq_unlock(rq); - spin_unlock(&p->pi_lock); + atomic_spin_unlock(&p->pi_lock); } while_each_thread(g, p); read_unlock_irqrestore(&tasklist_lock, flags); @@ -9839,9 +10216,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) struct rq *rq = cfs_rq->rq; unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + atomic_spin_lock_irqsave(&rq->lock, flags); __set_se_shares(se, shares); - spin_unlock_irqrestore(&rq->lock, flags); + atomic_spin_unlock_irqrestore(&rq->lock, flags); } static DEFINE_MUTEX(shares_mutex); @@ -10026,18 +10403,18 @@ static int tg_set_bandwidth(struct task_group *tg, if (err) goto unlock; - spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + atomic_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); tg->rt_bandwidth.rt_runtime = rt_runtime; for_each_possible_cpu(i) { struct rt_rq *rt_rq = tg->rt_rq[i]; - spin_lock(&rt_rq->rt_runtime_lock); + atomic_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_runtime; - spin_unlock(&rt_rq->rt_runtime_lock); + atomic_spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); + atomic_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); @@ -10142,15 +10519,15 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_runtime == 0) return -EBUSY; - spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + atomic_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; - spin_lock(&rt_rq->rt_runtime_lock); + atomic_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = global_rt_runtime(); - spin_unlock(&rt_rq->rt_runtime_lock); + atomic_spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + atomic_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, 
flags); return 0; } @@ -10412,9 +10789,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. */ - spin_lock_irq(&cpu_rq(cpu)->lock); + atomic_spin_lock_irq(&cpu_rq(cpu)->lock); data = *cpuusage; - spin_unlock_irq(&cpu_rq(cpu)->lock); + atomic_spin_unlock_irq(&cpu_rq(cpu)->lock); #else data = *cpuusage; #endif @@ -10430,9 +10807,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ - spin_lock_irq(&cpu_rq(cpu)->lock); + atomic_spin_lock_irq(&cpu_rq(cpu)->lock); *cpuusage = val; - spin_unlock_irq(&cpu_rq(cpu)->lock); + atomic_spin_unlock_irq(&cpu_rq(cpu)->lock); #else *cpuusage = val; #endif diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index d014efb..a75d990 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -132,27 +132,27 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) if (likely(oldpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - spin_lock_irqsave(&vec->lock, flags); + atomic_spin_lock_irqsave(&vec->lock, flags); vec->count--; if (!vec->count) clear_bit(oldpri, cp->pri_active); cpumask_clear_cpu(cpu, vec->mask); - spin_unlock_irqrestore(&vec->lock, flags); + atomic_spin_unlock_irqrestore(&vec->lock, flags); } if (likely(newpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; - spin_lock_irqsave(&vec->lock, flags); + atomic_spin_lock_irqsave(&vec->lock, flags); cpumask_set_cpu(cpu, vec->mask); vec->count++; if (vec->count == 1) set_bit(newpri, cp->pri_active); - spin_unlock_irqrestore(&vec->lock, flags); + atomic_spin_unlock_irqrestore(&vec->lock, flags); } *currpri = newpri; @@ -178,7 +178,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { struct cpupri_vec *vec = &cp->pri_to_cpu[i]; - spin_lock_init(&vec->lock); + atomic_spin_lock_init(&vec->lock); 
vec->count = 0; if (!zalloc_cpumask_var(&vec->mask, gfp)) goto cleanup; diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 9a7e859..9a4c0f3 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -12,7 +12,7 @@ /* values 2-101 are RT priorities 0-99 */ struct cpupri_vec { - spinlock_t lock; + atomic_spinlock_t lock; int count; cpumask_var_t mask; }; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 70c7e0b..97216fb 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -184,7 +184,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); - spin_lock_irqsave(&rq->lock, flags); + atomic_spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); @@ -192,7 +192,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) max_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; - spin_unlock_irqrestore(&rq->lock, flags); + atomic_spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", @@ -280,6 +280,19 @@ static void print_cpu(struct seq_file *m, int cpu) P(cpu_load[2]); P(cpu_load[3]); P(cpu_load[4]); +#ifdef CONFIG_PREEMPT_RT + /* Print rt related rq stats */ + P(rt.rt_nr_running); + P(rt.rt_nr_uninterruptible); +# ifdef CONFIG_SCHEDSTATS + P(rto_schedule); + P(rto_schedule_tail); + P(rto_wakeup); + P(rto_pulled); + P(rto_pushed); +# endif +#endif + #undef P #undef PN diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 499672c..467d6d2 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -34,10 +34,10 @@ static struct task_struct *pick_next_task_idle(struct rq *rq) static void dequeue_task_idle(struct rq *rq, 
struct task_struct *p, int sleep) { - spin_unlock_irq(&rq->lock); + atomic_spin_unlock_irq(&rq->lock); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); - spin_lock_irq(&rq->lock); + atomic_spin_lock_irq(&rq->lock); } static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3918e01..adcbc68 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -314,7 +314,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq) weight = cpumask_weight(rd->span); - spin_lock(&rt_b->rt_runtime_lock); + atomic_spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); @@ -323,7 +323,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq) if (iter == rt_rq) continue; - spin_lock(&iter->rt_runtime_lock); + atomic_spin_lock(&iter->rt_runtime_lock); /* * Either all rqs have inf runtime and there's nothing to steal * or __disable_runtime() below sets a specific rq to inf to @@ -345,14 +345,14 @@ static int do_balance_runtime(struct rt_rq *rt_rq) rt_rq->rt_runtime += diff; more = 1; if (rt_rq->rt_runtime == rt_period) { - spin_unlock(&iter->rt_runtime_lock); + atomic_spin_unlock(&iter->rt_runtime_lock); break; } } next: - spin_unlock(&iter->rt_runtime_lock); + atomic_spin_unlock(&iter->rt_runtime_lock); } - spin_unlock(&rt_b->rt_runtime_lock); + atomic_spin_unlock(&rt_b->rt_runtime_lock); return more; } @@ -373,8 +373,8 @@ static void __disable_runtime(struct rq *rq) s64 want; int i; - spin_lock(&rt_b->rt_runtime_lock); - spin_lock(&rt_rq->rt_runtime_lock); + atomic_spin_lock(&rt_b->rt_runtime_lock); + atomic_spin_lock(&rt_rq->rt_runtime_lock); /* * Either we're all inf and nobody needs to borrow, or we're * already disabled and thus have nothing to do, or we have @@ -383,7 +383,7 @@ static void __disable_runtime(struct rq *rq) if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == 
rt_b->rt_runtime) goto balanced; - spin_unlock(&rt_rq->rt_runtime_lock); + atomic_spin_unlock(&rt_rq->rt_runtime_lock); /* * Calculate the difference between what we started out with @@ -405,7 +405,7 @@ static void __disable_runtime(struct rq *rq) if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; - spin_lock(&iter->rt_runtime_lock); + atomic_spin_lock(&iter->rt_runtime_lock); if (want > 0) { diff = min_t(s64, iter->rt_runtime, want); iter->rt_runtime -= diff; @@ -414,13 +414,13 @@ static void __disable_runtime(struct rq *rq) iter->rt_runtime -= want; want -= want; } - spin_unlock(&iter->rt_runtime_lock); + atomic_spin_unlock(&iter->rt_runtime_lock); if (!want) break; } - spin_lock(&rt_rq->rt_runtime_lock); + atomic_spin_lock(&rt_rq->rt_runtime_lock); /* * We cannot be left wanting - that would mean some runtime * leaked out of the system. @@ -432,8 +432,8 @@ balanced: * runtime - in which case borrowing doesn't make sense. */ rt_rq->rt_runtime = RUNTIME_INF; - spin_unlock(&rt_rq->rt_runtime_lock); - spin_unlock(&rt_b->rt_runtime_lock); + atomic_spin_unlock(&rt_rq->rt_runtime_lock); + atomic_spin_unlock(&rt_b->rt_runtime_lock); } } @@ -441,9 +441,9 @@ static void disable_runtime(struct rq *rq) { unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + atomic_spin_lock_irqsave(&rq->lock, flags); __disable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); + atomic_spin_unlock_irqrestore(&rq->lock, flags); } static void __enable_runtime(struct rq *rq) @@ -459,13 +459,13 @@ static void __enable_runtime(struct rq *rq) for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - spin_lock(&rt_b->rt_runtime_lock); - spin_lock(&rt_rq->rt_runtime_lock); + atomic_spin_lock(&rt_b->rt_runtime_lock); + atomic_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; - spin_unlock(&rt_rq->rt_runtime_lock); - spin_unlock(&rt_b->rt_runtime_lock); + 
atomic_spin_unlock(&rt_rq->rt_runtime_lock); + atomic_spin_unlock(&rt_b->rt_runtime_lock); } } @@ -473,9 +473,9 @@ static void enable_runtime(struct rq *rq) { unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + atomic_spin_lock_irqsave(&rq->lock, flags); __enable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); + atomic_spin_unlock_irqrestore(&rq->lock, flags); } static int balance_runtime(struct rt_rq *rt_rq) @@ -483,9 +483,9 @@ static int balance_runtime(struct rt_rq *rt_rq) int more = 0; if (rt_rq->rt_time > rt_rq->rt_runtime) { - spin_unlock(&rt_rq->rt_runtime_lock); + atomic_spin_unlock(&rt_rq->rt_runtime_lock); more = do_balance_runtime(rt_rq); - spin_lock(&rt_rq->rt_runtime_lock); + atomic_spin_lock(&rt_rq->rt_runtime_lock); } return more; @@ -511,11 +511,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); - spin_lock(&rq->lock); + atomic_spin_lock(&rq->lock); if (rt_rq->rt_time) { u64 runtime; - spin_lock(&rt_rq->rt_runtime_lock); + atomic_spin_lock(&rt_rq->rt_runtime_lock); if (rt_rq->rt_throttled) balance_runtime(rt_rq); runtime = rt_rq->rt_runtime; @@ -526,13 +526,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; - spin_unlock(&rt_rq->rt_runtime_lock); + atomic_spin_unlock(&rt_rq->rt_runtime_lock); } else if (rt_rq->rt_nr_running) idle = 0; if (enqueue) sched_rt_rq_enqueue(rt_rq); - spin_unlock(&rq->lock); + atomic_spin_unlock(&rq->lock); } return idle; @@ -609,11 +609,11 @@ static void update_curr_rt(struct rq *rq) rt_rq = rt_rq_of_se(rt_se); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { - spin_lock(&rt_rq->rt_runtime_lock); + atomic_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); - spin_unlock(&rt_rq->rt_runtime_lock); + 
atomic_spin_unlock(&rt_rq->rt_runtime_lock); } } } @@ -860,6 +860,55 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) } } +static inline void incr_rt_nr_uninterruptible(struct task_struct *p, + struct rq *rq) +{ + rq->rt.rt_nr_uninterruptible++; +} + +static inline void decr_rt_nr_uninterruptible(struct task_struct *p, + struct rq *rq) +{ + rq->rt.rt_nr_uninterruptible--; +} + +unsigned long rt_nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->rt.rt_nr_running; + + return sum; +} + +unsigned long rt_nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->rt.rt_nr_running; +} + +unsigned long rt_nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->rt.rt_nr_uninterruptible; + + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + + return sum; +} + +unsigned long rt_nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->rt.rt_nr_uninterruptible; +} + /* * Adding/removing a task to/from a priority array: */ @@ -872,6 +921,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) enqueue_rt_entity(rt_se); + if (p->state == TASK_UNINTERRUPTIBLE) + decr_rt_nr_uninterruptible(p, rq); + if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -883,6 +935,10 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); + + if (p->state == TASK_UNINTERRUPTIBLE) + incr_rt_nr_uninterruptible(p, rq); + dequeue_rt_entity(rt_se); dequeue_pushable_task(rq, p); @@ -1244,7 +1300,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) task_running(rq, task) || !task->se.on_rq)) { - spin_unlock(&lowest_rq->lock); + atomic_spin_unlock(&lowest_rq->lock); lowest_rq = NULL; break; } @@ -1462,8 +1518,10 @@ 
static int pull_rt_task(struct rq *this_rq) static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) + if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) { pull_rt_task(rq); + schedstat_inc(rq, rto_schedule); + } } /* @@ -1480,9 +1538,9 @@ static void post_schedule_rt(struct rq *rq) * This is only called if needs_post_schedule_rt() indicates that * we need to push tasks away */ - spin_lock_irq(&rq->lock); + atomic_spin_lock_irq(&rq->lock); push_rt_tasks(rq); - spin_unlock_irq(&rq->lock); + atomic_spin_unlock_irq(&rq->lock); } /* @@ -1545,7 +1603,6 @@ static void set_cpus_allowed_rt(struct task_struct *p, */ if (weight > 1) enqueue_pushable_task(rq, p); - } if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 32d2bd4..7f69cea 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -306,10 +306,10 @@ static inline void account_group_user_time(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + atomic_spin_lock(&cputimer->lock); cputimer->cputime.utime = cputime_add(cputimer->cputime.utime, cputime); - spin_unlock(&cputimer->lock); + atomic_spin_unlock(&cputimer->lock); } /** @@ -336,10 +336,10 @@ static inline void account_group_system_time(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + atomic_spin_lock(&cputimer->lock); cputimer->cputime.stime = cputime_add(cputimer->cputime.stime, cputime); - spin_unlock(&cputimer->lock); + atomic_spin_unlock(&cputimer->lock); } /** @@ -369,7 +369,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + atomic_spin_lock(&cputimer->lock); cputimer->cputime.sum_exec_runtime += ns; - spin_unlock(&cputimer->lock); + 
atomic_spin_unlock(&cputimer->lock); } diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 94a62c0..283b586 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -33,11 +33,11 @@ #include <linux/spinlock.h> #include <linux/ftrace.h> -static noinline void __down(struct semaphore *sem); -static noinline int __down_interruptible(struct semaphore *sem); -static noinline int __down_killable(struct semaphore *sem); -static noinline int __down_timeout(struct semaphore *sem, long jiffies); -static noinline void __up(struct semaphore *sem); +static noinline void __down(struct anon_semaphore *sem); +static noinline int __down_interruptible(struct anon_semaphore *sem); +static noinline int __down_killable(struct anon_semaphore *sem); +static noinline int __down_timeout(struct anon_semaphore *sem, long jiffies); +static noinline void __up(struct anon_semaphore *sem); /** * down - acquire the semaphore @@ -50,7 +50,7 @@ static noinline void __up(struct semaphore *sem); * Use of this function is deprecated, please use down_interruptible() or * down_killable() instead. */ -void down(struct semaphore *sem) +void anon_down(struct anon_semaphore *sem) { unsigned long flags; @@ -61,7 +61,7 @@ void down(struct semaphore *sem) __down(sem); spin_unlock_irqrestore(&sem->lock, flags); } -EXPORT_SYMBOL(down); +EXPORT_SYMBOL(anon_down); /** * down_interruptible - acquire the semaphore unless interrupted @@ -72,7 +72,7 @@ EXPORT_SYMBOL(down); * If the sleep is interrupted by a signal, this function will return -EINTR. * If the semaphore is successfully acquired, this function returns 0. 
*/ -int down_interruptible(struct semaphore *sem) +int anon_down_interruptible(struct anon_semaphore *sem) { unsigned long flags; int result = 0; @@ -86,7 +86,7 @@ int down_interruptible(struct semaphore *sem) return result; } -EXPORT_SYMBOL(down_interruptible); +EXPORT_SYMBOL(anon_down_interruptible); /** * down_killable - acquire the semaphore unless killed @@ -98,7 +98,7 @@ EXPORT_SYMBOL(down_interruptible); * -EINTR. If the semaphore is successfully acquired, this function returns * 0. */ -int down_killable(struct semaphore *sem) +int anon_down_killable(struct anon_semaphore *sem) { unsigned long flags; int result = 0; @@ -112,7 +112,7 @@ int down_killable(struct semaphore *sem) return result; } -EXPORT_SYMBOL(down_killable); +EXPORT_SYMBOL(anon_down_killable); /** * down_trylock - try to acquire the semaphore, without waiting @@ -127,7 +127,7 @@ EXPORT_SYMBOL(down_killable); * Unlike mutex_trylock, this function can be used from interrupt context, * and the semaphore can be released by any task or interrupt. */ -int down_trylock(struct semaphore *sem) +int anon_down_trylock(struct anon_semaphore *sem) { unsigned long flags; int count; @@ -140,7 +140,7 @@ int down_trylock(struct semaphore *sem) return (count < 0); } -EXPORT_SYMBOL(down_trylock); +EXPORT_SYMBOL(anon_down_trylock); /** * down_timeout - acquire the semaphore within a specified time @@ -152,7 +152,7 @@ EXPORT_SYMBOL(down_trylock); * If the semaphore is not released within the specified number of jiffies, * this function returns -ETIME. It returns 0 if the semaphore was acquired. */ -int down_timeout(struct semaphore *sem, long jiffies) +int anon_down_timeout(struct anon_semaphore *sem, long jiffies) { unsigned long flags; int result = 0; @@ -166,7 +166,7 @@ int down_timeout(struct semaphore *sem, long jiffies) return result; } -EXPORT_SYMBOL(down_timeout); +EXPORT_SYMBOL(anon_down_timeout); /** * up - release the semaphore @@ -175,7 +175,7 @@ EXPORT_SYMBOL(down_timeout); * Release the semaphore. 
Unlike mutexes, up() may be called from any * context and even by tasks which have never called down(). */ -void up(struct semaphore *sem) +void anon_up(struct anon_semaphore *sem) { unsigned long flags; @@ -186,7 +186,7 @@ void up(struct semaphore *sem) __up(sem); spin_unlock_irqrestore(&sem->lock, flags); } -EXPORT_SYMBOL(up); +EXPORT_SYMBOL(anon_up); /* Functions for the contended case */ @@ -201,7 +201,7 @@ struct semaphore_waiter { * constant, and thus optimised away by the compiler. Likewise the * 'timeout' parameter for the cases without timeouts. */ -static inline int __sched __down_common(struct semaphore *sem, long state, +static inline int __sched __down_common(struct anon_semaphore *sem, long state, long timeout) { struct task_struct *task = current; @@ -233,27 +233,27 @@ static inline int __sched __down_common(struct semaphore *sem, long state, return -EINTR; } -static noinline void __sched __down(struct semaphore *sem) +static noinline void __sched __down(struct anon_semaphore *sem) { __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } -static noinline int __sched __down_interruptible(struct semaphore *sem) +static noinline int __sched __down_interruptible(struct anon_semaphore *sem) { return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } -static noinline int __sched __down_killable(struct semaphore *sem) +static noinline int __sched __down_killable(struct anon_semaphore *sem) { return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); } -static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) +static noinline int __sched __down_timeout(struct anon_semaphore *sem, long jiffies) { return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); } -static noinline void __sched __up(struct semaphore *sem) +static noinline void __sched __up(struct anon_semaphore *sem) { struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, struct semaphore_waiter, list); diff --git a/kernel/signal.c 
b/kernel/signal.c index 64c5dee..88a4ee3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -188,13 +188,46 @@ int next_signal(struct sigpending *pending, sigset_t *mask) return sig; } +#ifdef __HAVE_ARCH_CMPXCHG +static inline struct sigqueue *get_task_cache(struct task_struct *t) +{ + struct sigqueue *q = t->sigqueue_cache; + + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) + return NULL; + + return q; +} + +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) +{ + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) + return 0; + + return 1; +} + +#else + +static inline struct sigqueue *get_task_cache(struct task_struct *t) +{ + return NULL; +} + +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) +{ + return 1; +} + +#endif + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appopriate lock must be held to stop the target task from exiting */ -static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, - int override_rlimit) +static struct sigqueue *__sigqueue_do_alloc(struct task_struct *t, gfp_t flags, + int override_rlimit, int fromslab) { struct sigqueue *q = NULL; struct user_struct *user; @@ -209,8 +242,14 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, atomic_inc(&user->sigpending); if (override_rlimit || atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) - q = kmem_cache_alloc(sigqueue_cachep, flags); + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { + + if (!fromslab) + q = get_task_cache(t); + if (!q) + q = kmem_cache_alloc(sigqueue_cachep, flags); + } + if (unlikely(q == NULL)) { atomic_dec(&user->sigpending); free_uid(user); @@ -223,6 +262,12 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, return q; } +static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, + int override_rlimit) +{ + return 
__sigqueue_do_alloc(t, flags, override_rlimit, 0); +} + static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) @@ -232,6 +277,21 @@ static void __sigqueue_free(struct sigqueue *q) kmem_cache_free(sigqueue_cachep, q); } +static void sigqueue_free_current(struct sigqueue *q) +{ + struct user_struct *up; + + if (q->flags & SIGQUEUE_PREALLOC) + return; + + up = q->user; + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { + atomic_dec(&up->sigpending); + free_uid(up); + } else + __sigqueue_free(q); +} + void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; @@ -245,6 +305,21 @@ void flush_sigqueue(struct sigpending *queue) } /* + * Called from __exit_signal. Flush tsk->pending and + * tsk->sigqueue_cache + */ +void flush_task_sigqueue(struct task_struct *tsk) +{ + struct sigqueue *q; + + flush_sigqueue(&tsk->pending); + + q = get_task_cache(tsk); + if (q) + kmem_cache_free(sigqueue_cachep, q); +} + +/* * Flush all pending signals for a task. */ void __flush_signals(struct task_struct *t) @@ -392,7 +467,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); - __sigqueue_free(first); + sigqueue_free_current(first); } else { /* Ok, it wasn't in the queue. This must be a fast-pathed signal or we must have been @@ -437,6 +512,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { int signr; + WARN_ON_ONCE(tsk != current); + /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ @@ -519,6 +596,9 @@ void signal_wake_up(struct task_struct *t, int resume) set_tsk_thread_flag(t, TIF_SIGPENDING); + if (unlikely(t == current)) + return; + /* * For SIGKILL, we want to wake it up in the stopped/traced/killable * case. 
We don't check t->state here because there is a race with it @@ -836,8 +916,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, trace_sched_signal_send(sig, t); +#ifdef CONFIG_SMP assert_spin_locked(&t->sighand->siglock); - +#endif if (!prepare_signal(sig, t, from_ancestor_ns)) return 0; @@ -1312,7 +1393,8 @@ struct sigqueue *sigqueue_alloc(void) { struct sigqueue *q; - if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) + /* Preallocated sigqueue objects always from the slabcache ! */ + if ((q = __sigqueue_do_alloc(current, GFP_KERNEL, 0, 1))) q->flags |= SIGQUEUE_PREALLOC; return(q); } @@ -1611,15 +1693,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) read_lock(&tasklist_lock); if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); read_unlock(&tasklist_lock); - preempt_enable_no_resched(); schedule(); } else { /* diff --git a/kernel/smp.c b/kernel/smp.c index 94188b8..ac897a8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -16,11 +16,11 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); static struct { struct list_head queue; - spinlock_t lock; + atomic_spinlock_t lock; } call_function __cacheline_aligned_in_smp = { - .queue = LIST_HEAD_INIT(call_function.queue), - .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), + .queue = LIST_HEAD_INIT(call_function.queue), + .lock = __ATOMIC_SPIN_LOCK_UNLOCKED(call_function.lock), }; enum { @@ -29,18 +29,18 @@ enum { struct call_function_data { struct call_single_data csd; - spinlock_t lock; + atomic_spinlock_t lock; unsigned int refs; cpumask_var_t cpumask; }; struct call_single_queue { struct list_head list; - spinlock_t lock; + atomic_spinlock_t lock; }; static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { - .lock = 
__SPIN_LOCK_UNLOCKED(cfd_data.lock), + .lock = __ATOMIC_SPIN_LOCK_UNLOCKED(cfd_data.lock), }; static int @@ -83,7 +83,7 @@ static int __cpuinit init_call_single_data(void) for_each_possible_cpu(i) { struct call_single_queue *q = &per_cpu(call_single_queue, i); - spin_lock_init(&q->lock); + atomic_spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->list); } @@ -144,10 +144,10 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) unsigned long flags; int ipi; - spin_lock_irqsave(&dst->lock, flags); + atomic_spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); list_add_tail(&data->list, &dst->list); - spin_unlock_irqrestore(&dst->lock, flags); + atomic_spin_unlock_irqrestore(&dst->lock, flags); /* * The list addition should be visible before sending the IPI @@ -191,25 +191,25 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; - spin_lock(&data->lock); + atomic_spin_lock(&data->lock); if (!cpumask_test_cpu(cpu, data->cpumask)) { - spin_unlock(&data->lock); + atomic_spin_unlock(&data->lock); continue; } cpumask_clear_cpu(cpu, data->cpumask); - spin_unlock(&data->lock); + atomic_spin_unlock(&data->lock); data->csd.func(data->csd.info); - spin_lock(&data->lock); + atomic_spin_lock(&data->lock); WARN_ON(data->refs == 0); refs = --data->refs; if (!refs) { - spin_lock(&call_function.lock); + atomic_spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); - spin_unlock(&call_function.lock); + atomic_spin_unlock(&call_function.lock); } - spin_unlock(&data->lock); + atomic_spin_unlock(&data->lock); if (refs) continue; @@ -230,9 +230,9 @@ void generic_smp_call_function_single_interrupt(void) unsigned int data_flags; LIST_HEAD(list); - spin_lock(&q->lock); + atomic_spin_lock(&q->lock); list_replace_init(&q->list, &list); - spin_unlock(&q->lock); + atomic_spin_unlock(&q->lock); while (!list_empty(&list)) { struct call_single_data *data; @@ -391,23 +391,23 @@ void 
smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); - spin_lock_irqsave(&data->lock, flags); + atomic_spin_lock_irqsave(&data->lock, flags); data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); data->refs = cpumask_weight(data->cpumask); - spin_lock(&call_function.lock); + atomic_spin_lock(&call_function.lock); /* * Place entry at the _HEAD_ of the list, so that any cpu still * observing the entry in generic_smp_call_function_interrupt() * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); - spin_unlock(&call_function.lock); + atomic_spin_unlock(&call_function.lock); - spin_unlock_irqrestore(&data->lock, flags); + atomic_spin_unlock_irqrestore(&data->lock, flags); /* * Make the list addition visible before sending the ipi. @@ -453,20 +453,20 @@ EXPORT_SYMBOL(smp_call_function); void ipi_call_lock(void) { - spin_lock(&call_function.lock); + atomic_spin_lock(&call_function.lock); } void ipi_call_unlock(void) { - spin_unlock(&call_function.lock); + atomic_spin_unlock(&call_function.lock); } void ipi_call_lock_irq(void) { - spin_lock_irq(&call_function.lock); + atomic_spin_lock_irq(&call_function.lock); } void ipi_call_unlock_irq(void) { - spin_unlock_irq(&call_function.lock); + atomic_spin_unlock_irq(&call_function.lock); } diff --git a/kernel/softirq.c b/kernel/softirq.c index eb5e131..aae8d45 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -8,15 +8,23 @@ * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) * * Remote softirq infrastructure is by Jens Axboe. 
+ * + * Softirq-split implementation by + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar */ #include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/syscalls.h> +#include <linux/wait.h> #include <linux/kernel_stat.h> #include <linux/interrupt.h> #include <linux/init.h> +#include <linux/delay.h> #include <linux/mm.h> #include <linux/notifier.h> #include <linux/percpu.h> +#include <linux/delay.h> #include <linux/cpu.h> #include <linux/freezer.h> #include <linux/kthread.h> @@ -54,29 +62,122 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +struct softirqdata { + int nr; + unsigned long cpu; + struct task_struct *tsk; + int running; +}; + +static DEFINE_PER_CPU(struct softirqdata [NR_SOFTIRQS], ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "TASKLET", "SCHED", "HRTIMER", "RCU" }; +#ifdef CONFIG_PREEMPT_RT +/* + * On preempt-rt a softirq might be blocked on a lock. There might be + * no other runnable task on this CPU because the lock owner runs on + * some other CPU. So we have to go into idle with the pending bit + * set. Therefore we need to check this otherwise we warn about false + * positives which confuses users and defeats the whole purpose of + * this test. + * + * This code is called with interrupts disabled. + */ +void softirq_check_pending_idle(void) +{ + static int rate_limit; + u32 warnpending = 0, pending = local_softirq_pending(); + int curr = 0; + + if (rate_limit >= 10) + return; + + while (pending) { + if (pending & 1) { + struct task_struct *tsk; + + tsk = __get_cpu_var(ksoftirqd)[curr].tsk; + /* + * The wakeup code in rtmutex.c wakes up the + * task _before_ it sets pi_blocked_on to NULL + * under tsk->pi_lock. So we need to check for + * both: state and pi_blocked_on.
+ */ + atomic_spin_lock(&tsk->pi_lock); + + if (!tsk->pi_blocked_on && + !(tsk->state == TASK_RUNNING) && + !(tsk->state & TASK_RUNNING_MUTEX)) + warnpending |= 1 << curr; + + atomic_spin_unlock(&tsk->pi_lock); + } + pending >>= 1; + curr++; + } + + if (warnpending) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + warnpending); + rate_limit++; + } +} + +#else +/* + * On !PREEMPT_RT we just printk rate limited: + */ +void softirq_check_pending_idle(void) +{ + static int rate_limit; + + if (rate_limit < 10) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + local_softirq_pending()); + rate_limit++; + } +} + +#endif + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency * to the pending events, so lets the scheduler to balance * the softirq load for us. */ -void wakeup_softirqd(void) +static void wakeup_softirqd(int softirq) { /* Interrupts are disabled: no need to stop preemption */ - struct task_struct *tsk = __get_cpu_var(ksoftirqd); + struct task_struct *tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; if (tsk && tsk->state != TASK_RUNNING) wake_up_process(tsk); } /* + * Wake up the softirq threads which have work + */ +static void trigger_softirqs(void) +{ + u32 pending = local_softirq_pending(); + int curr = 0; + + while (pending) { + if (pending & 1) + wakeup_softirqd(curr); + pending >>= 1; + curr++; + } +} + +#ifndef CONFIG_PREEMPT_HARDIRQS + +/* * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: */ @@ -128,7 +229,6 @@ EXPORT_SYMBOL(local_bh_disable); */ void _local_bh_enable(void) { - WARN_ON_ONCE(in_irq()); WARN_ON_ONCE(!irqs_disabled()); if (softirq_count() == SOFTIRQ_OFFSET) @@ -138,45 +238,72 @@ void _local_bh_enable(void) EXPORT_SYMBOL(_local_bh_enable); -static inline void _local_bh_enable_ip(unsigned long ip) +void local_bh_enable(void) { - WARN_ON_ONCE(in_irq() || irqs_disabled()); #ifdef CONFIG_TRACE_IRQFLAGS - 
local_irq_disable(); + unsigned long flags; + + WARN_ON_ONCE(in_irq()); +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS + local_irq_save(flags); #endif /* * Are softirqs going to be turned on now: */ if (softirq_count() == SOFTIRQ_OFFSET) - trace_softirqs_on(ip); + trace_softirqs_on((unsigned long)__builtin_return_address(0)); /* * Keep preemption disabled until we are done with * softirq processing: - */ - sub_preempt_count(SOFTIRQ_OFFSET - 1); + */ + sub_preempt_count(SOFTIRQ_OFFSET - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) do_softirq(); dec_preempt_count(); #ifdef CONFIG_TRACE_IRQFLAGS - local_irq_enable(); + local_irq_restore(flags); #endif preempt_check_resched(); } - -void local_bh_enable(void) -{ - _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); -} EXPORT_SYMBOL(local_bh_enable); void local_bh_enable_ip(unsigned long ip) { - _local_bh_enable_ip(ip); +#ifdef CONFIG_TRACE_IRQFLAGS + unsigned long flags; + + WARN_ON_ONCE(in_irq()); + + local_irq_save(flags); +#endif + /* + * Are softirqs going to be turned on now: + */ + if (softirq_count() == SOFTIRQ_OFFSET) + trace_softirqs_on(ip); + /* + * Keep preemption disabled until we are done with + * softirq processing: + */ + sub_preempt_count(SOFTIRQ_OFFSET - 1); + + if (unlikely(!in_interrupt() && local_softirq_pending())) + do_softirq(); + + dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS + local_irq_restore(flags); +#endif + preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable_ip); +#endif + /* * We restart softirq processing MAX_SOFTIRQ_RESTART times, * and we fall back to softirqd after that. @@ -186,66 +313,148 @@ EXPORT_SYMBOL(local_bh_enable_ip); * we want to handle softirqs as soon as possible, but they * should not be able to lock up the box. 
*/ -#define MAX_SOFTIRQ_RESTART 10 +#define MAX_SOFTIRQ_RESTART 20 -asmlinkage void __do_softirq(void) +static DEFINE_PER_CPU(u32, softirq_running); + +/* + * Debug check for leaking preempt counts in h->action handlers: + */ + +static inline void debug_check_preempt_count_start(__u32 *preempt_count) { - struct softirq_action *h; - __u32 pending; +#ifdef CONFIG_DEBUG_PREEMPT + *preempt_count = preempt_count(); +#endif +} + +static inline void +debug_check_preempt_count_stop(__u32 *preempt_count, struct softirq_action *h) +{ +#ifdef CONFIG_DEBUG_PREEMPT + if (*preempt_count == preempt_count()) + return; + + print_symbol("BUG: %Ps exited with wrong preemption count!\n", + (unsigned long)h->action); + printk("=> enter: %08x, exit: %08x.\n", *preempt_count, preempt_count()); + preempt_count() = *preempt_count; +#endif +} + +/* + * Execute softirq handlers: + */ +static void ___do_softirq(const int same_prio_only) +{ + __u32 pending, available_mask, same_prio_skipped, preempt_count; int max_restart = MAX_SOFTIRQ_RESTART; - int cpu; + struct softirq_action *h; + int cpu, softirq; pending = local_softirq_pending(); account_system_vtime(current); - __local_bh_disable((unsigned long)__builtin_return_address(0)); - lockdep_softirq_enter(); - cpu = smp_processor_id(); restart: + available_mask = -1; + softirq = 0; + same_prio_skipped = 0; + /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - local_irq_enable(); - h = softirq_vec; do { - if (pending & 1) { - int prev_count = preempt_count(); - kstat_incr_softirqs_this_cpu(h - softirq_vec); - - trace_softirq_entry(h, softirq_vec); - h->action(h); - trace_softirq_exit(h, softirq_vec); - if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %td %s %p" - "with preempt_count %08x," - " exited with %08x?\n", h - softirq_vec, - softirq_to_name[h - softirq_vec], - h->action, prev_count, preempt_count()); - preempt_count() = prev_count; + u32 softirq_mask = 1 << softirq; + + 
if (!(pending & 1)) + goto next; + + debug_check_preempt_count_start(&preempt_count); + +#if defined(CONFIG_PREEMPT_SOFTIRQS) && defined(CONFIG_PREEMPT_HARDIRQS) + /* + * If executed by a same-prio hardirq thread + * then skip pending softirqs that belong + * to softirq threads with different priority: + */ + if (same_prio_only) { + struct task_struct *tsk; + + tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; + if (tsk && tsk->normal_prio != current->normal_prio) { + same_prio_skipped |= softirq_mask; + available_mask &= ~softirq_mask; + goto next; } - - rcu_bh_qsctr_inc(cpu); } +#endif + /* + * Is this softirq already being processed? + */ + if (per_cpu(softirq_running, cpu) & softirq_mask) { + available_mask &= ~softirq_mask; + goto next; + } + per_cpu(softirq_running, cpu) |= softirq_mask; + kstat_incr_softirqs_this_cpu(h - softirq_vec); + local_irq_enable(); + + trace_softirq_entry(h, softirq_vec); + h->action(h); + trace_softirq_exit(h, softirq_vec); + + debug_check_preempt_count_stop(&preempt_count, h); + + rcu_bh_qsctr_inc(cpu); + cond_resched_softirq_context(); + local_irq_disable(); + per_cpu(softirq_running, cpu) &= ~softirq_mask; + +next: h++; + softirq++; pending >>= 1; } while (pending); - local_irq_disable(); - + or_softirq_pending(same_prio_skipped); pending = local_softirq_pending(); - if (pending && --max_restart) - goto restart; + if (pending & available_mask) { + if (--max_restart) + goto restart; + } if (pending) - wakeup_softirqd(); + trigger_softirqs(); +} + +asmlinkage void __do_softirq(void) +{ +#ifdef CONFIG_PREEMPT_SOFTIRQS + /* + * 'preempt harder'. Push all softirq processing off to ksoftirqd. 
+ */ + if (softirq_preemption) { + if (local_softirq_pending()) + trigger_softirqs(); + return; + } +#endif + /* + * 'immediate' softirq execution: + */ + __local_bh_disable((unsigned long)__builtin_return_address(0)); + lockdep_softirq_enter(); + + ___do_softirq(0); lockdep_softirq_exit(); account_system_vtime(current); _local_bh_enable(); + } #ifndef __ARCH_HAS_DO_SOFTIRQ @@ -308,7 +517,7 @@ void irq_exit(void) if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) tick_nohz_stop_sched_tick(0); #endif - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } /* @@ -316,19 +525,11 @@ void irq_exit(void) */ inline void raise_softirq_irqoff(unsigned int nr) { - __raise_softirq_irqoff(nr); + __do_raise_softirq_irqoff(nr); - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. - * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. - */ - if (!in_interrupt()) - wakeup_softirqd(); +#ifdef CONFIG_PREEMPT_SOFTIRQS + wakeup_softirqd(nr); +#endif } void raise_softirq(unsigned int nr) @@ -357,15 +558,45 @@ struct tasklet_head static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); +static void inline +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) +{ + if (tasklet_trylock(t)) { +again: + /* We may have been preempted before tasklet_trylock + * and __tasklet_action may have already run. + * So double check the sched bit while the tasklet + * is locked before adding it to the list. + */ + if (test_bit(TASKLET_STATE_SCHED, &t->state)) { + t->next = NULL; + *head->tail = t; + head->tail = &(t->next); + raise_softirq_irqoff(nr); + tasklet_unlock(t); + } else { + /* This is subtle.
If we hit the corner case above + * It is possible that we get preempted right here, + * and another task has successfully called + * tasklet_schedule(), then this function, and + * failed on the trylock. Thus we must be sure + * before releasing the tasklet lock, that the + * SCHED_BIT is clear. Otherwise the tasklet + * may get its SCHED_BIT set, but not added to the + * list + */ + if (!tasklet_tryunlock(t)) + goto again; + } + } +} + void __tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; local_irq_save(flags); - t->next = NULL; - *__get_cpu_var(tasklet_vec).tail = t; - __get_cpu_var(tasklet_vec).tail = &(t->next); - raise_softirq_irqoff(TASKLET_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -376,10 +607,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - t->next = NULL; - *__get_cpu_var(tasklet_hi_vec).tail = t; - __get_cpu_var(tasklet_hi_vec).tail = &(t->next); - raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ); local_irq_restore(flags); } @@ -387,50 +615,119 @@ EXPORT_SYMBOL(__tasklet_hi_schedule); void __tasklet_hi_schedule_first(struct tasklet_struct *t) { - BUG_ON(!irqs_disabled()); - - t->next = __get_cpu_var(tasklet_hi_vec).head; - __get_cpu_var(tasklet_hi_vec).head = t; - __raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_hi_schedule(t); } EXPORT_SYMBOL(__tasklet_hi_schedule_first); -static void tasklet_action(struct softirq_action *a) +void tasklet_enable(struct tasklet_struct *t) { - struct tasklet_struct *list; + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_schedule(t); +} - local_irq_disable(); - list = __get_cpu_var(tasklet_vec).head; - __get_cpu_var(tasklet_vec).head = NULL; - __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; - local_irq_enable(); 
+EXPORT_SYMBOL(tasklet_enable); + +void tasklet_hi_enable(struct tasklet_struct *t) +{ + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_hi_schedule(t); +} + +EXPORT_SYMBOL(tasklet_hi_enable); + +static void +__tasklet_action(struct softirq_action *a, struct tasklet_struct *list) +{ + int loops = 1000000; while (list) { struct tasklet_struct *t = list; list = list->next; - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); + /* + * Should always succeed - after a tasklist got on the + * list (after getting the SCHED bit set from 0 to 1), + * nothing but the tasklet softirq it got queued to can + * lock it: + */ + if (!tasklet_trylock(t)) { + WARN_ON(1); + continue; } - local_irq_disable(); t->next = NULL; - *__get_cpu_var(tasklet_vec).tail = t; - __get_cpu_var(tasklet_vec).tail = &(t->next); - __raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_enable(); + + /* + * If we cannot handle the tasklet because it's disabled, + * mark it as pending. tasklet_enable() will later + * re-schedule the tasklet. + */ + if (unlikely(atomic_read(&t->count))) { +out_disabled: + /* implicit unlock: */ + wmb(); + t->state = TASKLET_STATEF_PENDING; + continue; + } + + /* + * After this point on the tasklet might be rescheduled + * on another CPU, but it can only be added to another + * CPU's tasklet list if we unlock the tasklet (which we + * dont do yet). + */ + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + WARN_ON(1); + +again: + t->func(t->data); + + /* + * Try to unlock the tasklet. We must use cmpxchg, because + * another CPU might have scheduled or disabled the tasklet. + * We only allow the STATE_RUN -> 0 transition here. 
+ */ + while (!tasklet_tryunlock(t)) { + /* + * If it got disabled meanwhile, bail out: + */ + if (atomic_read(&t->count)) + goto out_disabled; + /* + * If it got scheduled meanwhile, re-execute + * the tasklet function: + */ + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + goto again; + if (!--loops) { + printk("hm, tasklet state: %08lx\n", t->state); + WARN_ON(1); + tasklet_unlock(t); + break; + } + } } } +static void tasklet_action(struct softirq_action *a) +{ + struct tasklet_struct *list; + + local_irq_disable(); + list = __get_cpu_var(tasklet_vec).head; + __get_cpu_var(tasklet_vec).head = NULL; + __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; + local_irq_enable(); + + __tasklet_action(a, list); +} + static void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -441,29 +738,7 @@ static void tasklet_hi_action(struct softirq_action *a) __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; local_irq_enable(); - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } - - local_irq_disable(); - t->next = NULL; - *__get_cpu_var(tasklet_hi_vec).tail = t; - __get_cpu_var(tasklet_hi_vec).tail = &(t->next); - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } + __tasklet_action(a, list); } @@ -486,7 +761,7 @@ void tasklet_kill(struct tasklet_struct *t) while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { do { - yield(); + msleep(1); } while (test_bit(TASKLET_STATE_SCHED, &t->state)); } tasklet_unlock_wait(t); @@ -697,34 +972,89 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -static int ksoftirqd(void * __bind_cpu) +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) + +void tasklet_unlock_wait(struct 
tasklet_struct *t) { + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { + /* + * Hack for now to avoid this busy-loop: + */ +#ifdef CONFIG_PREEMPT_RT + msleep(1); +#else + barrier(); +#endif + } +} +EXPORT_SYMBOL(tasklet_unlock_wait); + +#endif + +static int ksoftirqd(void * __data) +{ + /* Priority needs to be below hardirqs */ + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 - 1}; + struct softirqdata *data = __data; + u32 softirq_mask = (1 << data->nr); + struct softirq_action *h; + int cpu = data->cpu; + + sys_sched_setscheduler(current->pid, SCHED_FIFO, &param); + current->flags |= PF_SOFTIRQ; set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { preempt_disable(); - if (!local_softirq_pending()) { - preempt_enable_no_resched(); - schedule(); + if (!(local_softirq_pending() & softirq_mask)) { +sleep_more: + preempt_enable_and_schedule(); preempt_disable(); } __set_current_state(TASK_RUNNING); + data->running = 1; - while (local_softirq_pending()) { + while (local_softirq_pending() & softirq_mask) { /* Preempt disable stops cpu going offline. If already offline, we'll be on wrong CPU: don't process */ - if (cpu_is_offline((long)__bind_cpu)) + if (cpu_is_offline(cpu)) goto wait_to_die; - do_softirq(); - preempt_enable_no_resched(); + + /* + * Is the softirq already being executed by + * a hardirq context?
+ */ + local_irq_disable(); + if (per_cpu(softirq_running, cpu) & softirq_mask) { + local_irq_enable(); + set_current_state(TASK_INTERRUPTIBLE); + goto sleep_more; + } + per_cpu(softirq_running, cpu) |= softirq_mask; + __preempt_enable_no_resched(); + set_softirq_pending(local_softirq_pending() & ~softirq_mask); + local_bh_disable(); + local_irq_enable(); + + h = &softirq_vec[data->nr]; + if (h) + h->action(h); + rcu_bh_qsctr_inc(data->cpu); + + local_irq_disable(); + per_cpu(softirq_running, cpu) &= ~softirq_mask; + _local_bh_enable(); + local_irq_enable(); + cond_resched(); preempt_disable(); - rcu_qsctr_inc((long)__bind_cpu); + rcu_qsctr_inc(data->cpu); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); + data->running = 0; } __set_current_state(TASK_RUNNING); return 0; @@ -774,7 +1104,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) BUG(); } -static void takeover_tasklets(unsigned int cpu) +void takeover_tasklets(unsigned int cpu) { /* CPU is dead, so no lock needed. 
*/ local_irq_disable(); @@ -800,49 +1130,77 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ +static const char *softirq_names [] = +{ + [HI_SOFTIRQ] = "high", + [SCHED_SOFTIRQ] = "sched", + [TIMER_SOFTIRQ] = "timer", + [NET_TX_SOFTIRQ] = "net-tx", + [NET_RX_SOFTIRQ] = "net-rx", + [BLOCK_SOFTIRQ] = "block", + [TASKLET_SOFTIRQ] = "tasklet", +#ifdef CONFIG_HIGH_RES_TIMERS + [HRTIMER_SOFTIRQ] = "hrtimer", +#endif + [RCU_SOFTIRQ] = "rcu", +}; + static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; + int hotcpu = (unsigned long)hcpu, i; struct task_struct *p; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; + for (i = 0; i < NR_SOFTIRQS; i++) { + per_cpu(ksoftirqd, hotcpu)[i].nr = i; + per_cpu(ksoftirqd, hotcpu)[i].cpu = hotcpu; + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + } + for (i = 0; i < NR_SOFTIRQS; i++) { + p = kthread_create(ksoftirqd, + &per_cpu(ksoftirqd, hotcpu)[i], + "sirq-%s/%d", softirq_names[i], + hotcpu); + if (IS_ERR(p)) { + printk("ksoftirqd %d for %i failed\n", i, + hotcpu); + return NOTIFY_BAD; + } + kthread_bind(p, hotcpu); + per_cpu(ksoftirqd, hotcpu)[i].tsk = p; } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; + break; + break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); + for (i = 0; i < NR_SOFTIRQS; i++) + wake_up_process(per_cpu(ksoftirqd, hotcpu)[i].tsk); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. 
*/ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - cpumask_any(cpu_online_mask)); + /* Fall through */ + case CPU_DEAD: case CPU_DEAD_FROZEN: { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct sched_param param; - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler_nocheck(p, SCHED_FIFO, &param); - kthread_stop(p); + for (i = 0; i < NR_SOFTIRQS; i++) { + param.sched_priority = MAX_RT_PRIO-1; + p = per_cpu(ksoftirqd, hotcpu)[i].tsk; + sched_setscheduler(p, SCHED_FIFO, &param); + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + kthread_stop(p); + } takeover_tasklets(hotcpu); break; } #endif /* CONFIG_HOTPLUG_CPU */ - } + } return NOTIFY_OK; } @@ -862,6 +1220,34 @@ static __init int spawn_ksoftirqd(void) } early_initcall(spawn_ksoftirqd); + +#ifdef CONFIG_PREEMPT_SOFTIRQS + +int softirq_preemption = 1; + +EXPORT_SYMBOL(softirq_preemption); + +/* + * Real-Time Preemption depends on softirq threading: + */ +#ifndef CONFIG_PREEMPT_RT + +static int __init softirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + softirq_preemption = 0; + else + get_option(&str, &softirq_preemption); + if (!softirq_preemption) + printk("turning off softirq preemption!\n"); + + return 1; +} + +__setup("softirq-preempt=", softirq_preempt_setup); +#endif +#endif + #ifdef CONFIG_SMP /* * Call a function on all processors diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 88796c3..6299617 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -20,7 +20,7 @@ #include <asm/irq_regs.h> -static DEFINE_SPINLOCK(print_lock); +static DEFINE_ATOMIC_SPINLOCK(print_lock); static DEFINE_PER_CPU(unsigned long, touch_timestamp); static DEFINE_PER_CPU(unsigned long, print_timestamp); @@ -149,7 +149,7 @@ void softlockup_tick(void) per_cpu(print_timestamp, this_cpu) = touch_timestamp; - spin_lock(&print_lock); + atomic_spin_lock(&print_lock); printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus!
[%s:%d]\n", this_cpu, now - touch_timestamp, current->comm, task_pid_nr(current)); @@ -159,7 +159,7 @@ void softlockup_tick(void) show_regs(regs); else dump_stack(); - spin_unlock(&print_lock); + atomic_spin_unlock(&print_lock); if (softlockup_panic) panic("softlockup: hung tasks"); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 7932653..79c6581 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -21,44 +21,19 @@ #include <linux/debug_locks.h> #include <linux/module.h> -int __lockfunc _spin_trylock(spinlock_t *lock) +#include "lock-internals.h" + +int __lockfunc _atomic_spin_trylock(atomic_spinlock_t *lock) { preempt_disable(); if (_raw_spin_trylock(lock)) { spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); return 1; } - - preempt_enable(); - return 0; -} -EXPORT_SYMBOL(_spin_trylock); - -int __lockfunc _read_trylock(rwlock_t *lock) -{ - preempt_disable(); - if (_raw_read_trylock(lock)) { - rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - preempt_enable(); return 0; } -EXPORT_SYMBOL(_read_trylock); - -int __lockfunc _write_trylock(rwlock_t *lock) -{ - preempt_disable(); - if (_raw_write_trylock(lock)) { - rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable(); - return 0; -} -EXPORT_SYMBOL(_write_trylock); +EXPORT_SYMBOL(_atomic_spin_trylock); /* * If lockdep is enabled then we use the non-preemption spin-ops @@ -67,15 +42,7 @@ EXPORT_SYMBOL(_write_trylock); */ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) -void __lockfunc _read_lock(rwlock_t *lock) -{ - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); -} -EXPORT_SYMBOL(_read_lock); - -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +unsigned long __lockfunc _atomic_spin_lock_irqsave(atomic_spinlock_t *lock) { unsigned long flags; @@ -94,207 +61,61 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) #endif return 
flags; } -EXPORT_SYMBOL(_spin_lock_irqsave); +EXPORT_SYMBOL(_atomic_spin_lock_irqsave); -void __lockfunc _spin_lock_irq(spinlock_t *lock) +void __lockfunc _atomic_spin_lock_irq(atomic_spinlock_t *lock) { local_irq_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } -EXPORT_SYMBOL(_spin_lock_irq); +EXPORT_SYMBOL(_atomic_spin_lock_irq); -void __lockfunc _spin_lock_bh(spinlock_t *lock) +void __lockfunc _atomic_spin_lock_bh(atomic_spinlock_t *lock) { local_bh_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } -EXPORT_SYMBOL(_spin_lock_bh); - -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) -{ - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock, - _raw_read_lock_flags, &flags); - return flags; -} -EXPORT_SYMBOL(_read_lock_irqsave); - -void __lockfunc _read_lock_irq(rwlock_t *lock) -{ - local_irq_disable(); - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); -} -EXPORT_SYMBOL(_read_lock_irq); - -void __lockfunc _read_lock_bh(rwlock_t *lock) -{ - local_bh_disable(); - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); -} -EXPORT_SYMBOL(_read_lock_bh); - -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) -{ - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock, - _raw_write_lock_flags, &flags); - return flags; -} -EXPORT_SYMBOL(_write_lock_irqsave); - -void __lockfunc _write_lock_irq(rwlock_t *lock) -{ - local_irq_disable(); - preempt_disable(); - 
rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); -} -EXPORT_SYMBOL(_write_lock_irq); +EXPORT_SYMBOL(_atomic_spin_lock_bh); -void __lockfunc _write_lock_bh(rwlock_t *lock) -{ - local_bh_disable(); - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); -} -EXPORT_SYMBOL(_write_lock_bh); - -void __lockfunc _spin_lock(spinlock_t *lock) +void __lockfunc _atomic_spin_lock(atomic_spinlock_t *lock) { preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } - -EXPORT_SYMBOL(_spin_lock); - -void __lockfunc _write_lock(rwlock_t *lock) -{ - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); -} - -EXPORT_SYMBOL(_write_lock); +EXPORT_SYMBOL(_atomic_spin_lock); #else /* CONFIG_PREEMPT: */ /* - * This could be a long-held lock. We both prepare to spin for a long - * time (making _this_ CPU preemptable if possible), and we also signal - * towards that other CPU that it should break the lock ASAP. - * - * (We do this in a function because inlining it would be excessive.) 
- */ - -#define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc _##op##_lock(locktype##_t *lock) \ -{ \ - for (;;) { \ - preempt_disable(); \ - if (likely(_raw_##op##_trylock(lock))) \ - break; \ - preempt_enable(); \ - \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ - } \ - (lock)->break_lock = 0; \ -} \ - \ -EXPORT_SYMBOL(_##op##_lock); \ - \ -unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ -{ \ - unsigned long flags; \ - \ - for (;;) { \ - preempt_disable(); \ - local_irq_save(flags); \ - if (likely(_raw_##op##_trylock(lock))) \ - break; \ - local_irq_restore(flags); \ - preempt_enable(); \ - \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ - } \ - (lock)->break_lock = 0; \ - return flags; \ -} \ - \ -EXPORT_SYMBOL(_##op##_lock_irqsave); \ - \ -void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ -{ \ - _##op##_lock_irqsave(lock); \ -} \ - \ -EXPORT_SYMBOL(_##op##_lock_irq); \ - \ -void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ -{ \ - unsigned long flags; \ - \ - /* */ \ - /* Careful: we must exclude softirqs too, hence the */ \ - /* irq-disabling. 
We use the generic preemption-aware */ \ - /* function: */ \ - /**/ \ - flags = _##op##_lock_irqsave(lock); \ - local_bh_disable(); \ - local_irq_restore(flags); \ -} \ - \ -EXPORT_SYMBOL(_##op##_lock_bh) - -/* * Build preemption-friendly versions of the following * lock-spinning functions: * - * _[spin|read|write]_lock() - * _[spin|read|write]_lock_irq() - * _[spin|read|write]_lock_irqsave() - * _[spin|read|write]_lock_bh() + * _atomic_spin_lock() + * _atomic_spin_lock_irq() + * _atomic_spin_lock_irqsave() + * _atomic_spin_lock_bh() */ -BUILD_LOCK_OPS(spin, spinlock); -BUILD_LOCK_OPS(read, rwlock); -BUILD_LOCK_OPS(write, rwlock); +BUILD_LOCK_OPS(atomic_spin, spin, atomic_spinlock); #endif /* CONFIG_PREEMPT */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) +void __lockfunc _atomic_spin_lock_nested(atomic_spinlock_t *lock, int subclass) { preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } -EXPORT_SYMBOL(_spin_lock_nested); +EXPORT_SYMBOL(_atomic_spin_lock_nested); -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +unsigned long __lockfunc +_atomic_spin_lock_irqsave_nested(atomic_spinlock_t *lock, int subclass) { unsigned long flags; @@ -305,125 +126,56 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas _raw_spin_lock_flags, &flags); return flags; } -EXPORT_SYMBOL(_spin_lock_irqsave_nested); +EXPORT_SYMBOL(_atomic_spin_lock_irqsave_nested); -void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, +void __lockfunc _atomic_spin_lock_nest_lock(atomic_spinlock_t *lock, struct lockdep_map *nest_lock) { preempt_disable(); spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } -EXPORT_SYMBOL(_spin_lock_nest_lock); +EXPORT_SYMBOL(_atomic_spin_lock_nest_lock); #endif -void __lockfunc _spin_unlock(spinlock_t *lock) 
+void __lockfunc _atomic_spin_unlock(atomic_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_spin_unlock); - -void __lockfunc _write_unlock(rwlock_t *lock) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(_write_unlock); +EXPORT_SYMBOL(_atomic_spin_unlock); -void __lockfunc _read_unlock(rwlock_t *lock) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(_read_unlock); - -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +void __lockfunc +_atomic_spin_unlock_irqrestore(atomic_spinlock_t *lock, unsigned long flags) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); local_irq_restore(flags); preempt_enable(); } -EXPORT_SYMBOL(_spin_unlock_irqrestore); +EXPORT_SYMBOL(_atomic_spin_unlock_irqrestore); -void __lockfunc _spin_unlock_irq(spinlock_t *lock) +void __lockfunc _atomic_spin_unlock_irq(atomic_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); local_irq_enable(); preempt_enable(); } -EXPORT_SYMBOL(_spin_unlock_irq); +EXPORT_SYMBOL(_atomic_spin_unlock_irq); -void __lockfunc _spin_unlock_bh(spinlock_t *lock) +void __lockfunc _atomic_spin_unlock_bh(atomic_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); -} -EXPORT_SYMBOL(_spin_unlock_bh); - -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - local_irq_restore(flags); - preempt_enable(); -} -EXPORT_SYMBOL(_read_unlock_irqrestore); - -void __lockfunc _read_unlock_irq(rwlock_t *lock) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - local_irq_enable(); - preempt_enable(); -} 
-EXPORT_SYMBOL(_read_unlock_irq); - -void __lockfunc _read_unlock_bh(rwlock_t *lock) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); -} -EXPORT_SYMBOL(_read_unlock_bh); - -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - local_irq_restore(flags); - preempt_enable(); -} -EXPORT_SYMBOL(_write_unlock_irqrestore); - -void __lockfunc _write_unlock_irq(rwlock_t *lock) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - local_irq_enable(); - preempt_enable(); -} -EXPORT_SYMBOL(_write_unlock_irq); - -void __lockfunc _write_unlock_bh(rwlock_t *lock) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } -EXPORT_SYMBOL(_write_unlock_bh); +EXPORT_SYMBOL(_atomic_spin_unlock_bh); -int __lockfunc _spin_trylock_bh(spinlock_t *lock) +int __lockfunc _atomic_spin_trylock_bh(atomic_spinlock_t *lock) { local_bh_disable(); preempt_disable(); @@ -432,11 +184,11 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock) return 1; } - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); return 0; } -EXPORT_SYMBOL(_spin_trylock_bh); +EXPORT_SYMBOL(_atomic_spin_trylock_bh); notrace int in_lock_functions(unsigned long addr) { diff --git a/kernel/srcu.c b/kernel/srcu.c index b0aeeaf..6b4b325 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -255,3 +255,89 @@ EXPORT_SYMBOL_GPL(srcu_read_lock); EXPORT_SYMBOL_GPL(srcu_read_unlock); EXPORT_SYMBOL_GPL(synchronize_srcu); EXPORT_SYMBOL_GPL(srcu_batches_completed); + +int init_qrcu_struct(struct qrcu_struct *qp) +{ + qp->completed = 0; + atomic_set(qp->ctr + 0, 1); + 
atomic_set(qp->ctr + 1, 0); + init_waitqueue_head(&qp->wq); + mutex_init(&qp->mutex); + + return 0; +} + +int qrcu_read_lock(struct qrcu_struct *qp) +{ + for (;;) { + int idx = qp->completed & 0x1; + if (likely(atomic_inc_not_zero(qp->ctr + idx))) + return idx; + } +} + +void qrcu_read_unlock(struct qrcu_struct *qp, int idx) +{ + if (atomic_dec_and_test(qp->ctr + idx)) + wake_up(&qp->wq); +} + +void synchronize_qrcu(struct qrcu_struct *qp) +{ + int idx; + + smp_mb(); /* Force preceding change to happen before fastpath check. */ + + /* + * Fastpath: If the two counters sum to "1" at a given point in + * time, there are no readers. However, it takes two separate + * loads to sample both counters, which won't occur simultaneously. + * So we might race with a counter switch, so that we might see + * ctr[0]==0, then the counter might switch, then we might see + * ctr[1]==1 (unbeknownst to us because there is a reader still + * there). So we do a read memory barrier and recheck. If the + * same race happens again, there must have been a second counter + * switch. This second counter switch could not have happened + * until all preceding readers finished, so if the condition + * is true both times, we may safely proceed. + * + * This relies critically on the atomic increment and atomic + * decrement being seen as executing in order. + */ + + if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) { + smp_rmb(); /* Keep two checks independent. */ + if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) + goto out; + } + + mutex_lock(&qp->mutex); + + idx = qp->completed & 0x1; + if (atomic_read(qp->ctr + idx) == 1) + goto out_unlock; + + atomic_inc(qp->ctr + (idx ^ 0x1)); + + /* + * Prevent subsequent decrement from being seen before previous + * increment -- such an inversion could cause the fastpath + * above to falsely conclude that there were no readers. Also, + * reduce the likelihood that qrcu_read_lock() will loop. 
+ */ + + smp_mb__after_atomic_inc(); + qp->completed++; + + atomic_dec(qp->ctr + idx); + __wait_event(qp->wq, !atomic_read(qp->ctr + idx)); +out_unlock: + mutex_unlock(&qp->mutex); +out: + smp_mb(); /* force subsequent free after qrcu_read_unlock(). */ +} + +EXPORT_SYMBOL_GPL(init_qrcu_struct); +EXPORT_SYMBOL_GPL(qrcu_read_lock); +EXPORT_SYMBOL_GPL(qrcu_read_unlock); +EXPORT_SYMBOL_GPL(synchronize_qrcu); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 912823e..22d1d77 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -40,6 +40,8 @@ static atomic_t thread_ack; static DEFINE_MUTEX(lock); /* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */ static DEFINE_MUTEX(setup_lock); +/* do not start up until all worklets have been placed: */ +static DEFINE_MUTEX(startup_lock); /* Users of stop_machine. */ static int refcount; static struct workqueue_struct *stop_machine_wq; @@ -71,6 +73,15 @@ static void stop_cpu(struct work_struct *unused) int cpu = smp_processor_id(); int err; + /* + * Wait for the startup loop to finish: + */ + mutex_lock(&startup_lock); + /* + * Let other threads continue too: + */ + mutex_unlock(&startup_lock); + if (!active_cpus) { if (cpu == cpumask_first(cpu_online_mask)) smdata = &active; @@ -166,16 +177,21 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) set_state(STOPMACHINE_PREPARE); - /* Schedule the stop_cpu work on all cpus: hold this CPU so one - * doesn't hit this CPU until we're ready. */ - get_cpu(); + /* + * Schedule the stop_cpu work on all cpus before allowing any + * of the CPUs to execute it: + */ + mutex_lock(&startup_lock); + for_each_online_cpu(i) { sm_work = per_cpu_ptr(stop_machine_work, i); INIT_WORK(sm_work, stop_cpu); queue_work_on(i, stop_machine_wq, sm_work); } - /* This will release the thread on our CPU. 
*/ - put_cpu(); + + /* This will release the thread on all CPUs: */ + mutex_unlock(&startup_lock); + flush_workqueue(stop_machine_wq); ret = active.fnret; mutex_unlock(&lock); diff --git a/kernel/sys.c b/kernel/sys.c index b3f1097..eb040a4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -33,6 +33,7 @@ #include <linux/getcpu.h> #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> +#include <linux/hardirq.h> #include <linux/cpu.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> @@ -280,6 +281,15 @@ out_unlock: */ void emergency_restart(void) { + /* + * Call the notifier chain if we are not in an + * atomic context: + */ +#ifdef CONFIG_PREEMPT + if (!in_atomic() && !irqs_disabled()) + blocking_notifier_call_chain(&reboot_notifier_list, + SYS_RESTART, NULL); +#endif machine_emergency_restart(); } EXPORT_SYMBOL_GPL(emergency_restart); diff --git a/kernel/time.c b/kernel/time.c index 2951194..35d1aaa 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -133,11 +133,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, */ static inline void warp_clock(void) { - write_seqlock_irq(&xtime_lock); + write_atomic_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; update_xtime_cache(0); - write_sequnlock_irq(&xtime_lock); + write_atomic_sequnlock_irq(&xtime_lock); clock_was_set(); } @@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime); * 0 <= tv_nsec < NSEC_PER_SEC * For negative values only the tv_sec field is negative ! */ -void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) +void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec) { while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. 
See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); nsec -= NSEC_PER_SEC; ++sec; } while (nsec < 0) { + asm("" : "+rm"(nsec)); nsec += NSEC_PER_SEC; --sec; } @@ -662,9 +669,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&xtime_lock); + seq = read_atomic_seqbegin(&xtime_lock); ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_atomic_seqretry(&xtime_lock, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 620b58a..05097c4 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -28,7 +28,7 @@ static LIST_HEAD(clockevents_released); static RAW_NOTIFIER_HEAD(clockevents_chain); /* Protection for the above */ -static DEFINE_SPINLOCK(clockevents_lock); +static DEFINE_ATOMIC_SPINLOCK(clockevents_lock); /** * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds @@ -140,9 +140,9 @@ int clockevents_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - spin_lock_irqsave(&clockevents_lock, flags); + atomic_spin_lock_irqsave(&clockevents_lock, flags); ret = raw_notifier_chain_register(&clockevents_chain, nb); - spin_unlock_irqrestore(&clockevents_lock, flags); + atomic_spin_unlock_irqrestore(&clockevents_lock, flags); return ret; } @@ -184,13 +184,13 @@ void clockevents_register_device(struct clock_event_device *dev) BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); BUG_ON(!dev->cpumask); - spin_lock_irqsave(&clockevents_lock, flags); + atomic_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); clockevents_notify_released(); - spin_unlock_irqrestore(&clockevents_lock, flags); + atomic_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); @@ -240,7 +240,8 @@ void clockevents_notify(unsigned long reason, void *arg) struct list_head *node, 
*tmp; unsigned long flags; - spin_lock_irqsave(&clockevents_lock, flags); + atomic_spin_lock_irqsave(&clockevents_lock, flags); + clockevents_do_notify(reason, arg); switch (reason) { @@ -255,7 +256,7 @@ void clockevents_notify(unsigned long reason, void *arg) default: break; } - spin_unlock_irqrestore(&clockevents_lock, flags); + atomic_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_notify); #endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7466cb8..8a42731 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -127,7 +127,7 @@ static struct clocksource *curr_clocksource = &clocksource_jiffies; static struct clocksource *next_clocksource; static struct clocksource *clocksource_override; static LIST_HEAD(clocksource_list); -static DEFINE_SPINLOCK(clocksource_lock); +static DEFINE_ATOMIC_SPINLOCK(clocksource_lock); static char override_name[32]; static int finished_booting; @@ -296,7 +296,7 @@ void clocksource_resume(void) struct clocksource *cs; unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + atomic_spin_lock_irqsave(&clocksource_lock, flags); list_for_each_entry(cs, &clocksource_list, list) { if (cs->resume) @@ -305,7 +305,7 @@ void clocksource_resume(void) clocksource_resume_watchdog(); - spin_unlock_irqrestore(&clocksource_lock, flags); + atomic_spin_unlock_irqrestore(&clocksource_lock, flags); } /** @@ -328,12 +328,12 @@ struct clocksource *clocksource_get_next(void) { unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + atomic_spin_lock_irqsave(&clocksource_lock, flags); if (next_clocksource && finished_booting) { curr_clocksource = next_clocksource; next_clocksource = NULL; } - spin_unlock_irqrestore(&clocksource_lock, flags); + atomic_spin_unlock_irqrestore(&clocksource_lock, flags); return curr_clocksource; } @@ -402,11 +402,11 @@ int clocksource_register(struct clocksource *c) unsigned long flags; int ret; - 
spin_lock_irqsave(&clocksource_lock, flags); + atomic_spin_lock_irqsave(&clocksource_lock, flags); ret = clocksource_enqueue(c); if (!ret) next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); + atomic_spin_unlock_irqrestore(&clocksource_lock, flags); if (!ret) clocksource_check_watchdog(c); return ret; @@ -421,12 +421,12 @@ void clocksource_change_rating(struct clocksource *cs, int rating) { unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + atomic_spin_lock_irqsave(&clocksource_lock, flags); list_del(&cs->list); cs->rating = rating; clocksource_enqueue(cs); next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); + atomic_spin_unlock_irqrestore(&clocksource_lock, flags); } /** @@ -436,12 +436,12 @@ void clocksource_unregister(struct clocksource *cs) { unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + atomic_spin_lock_irqsave(&clocksource_lock, flags); list_del(&cs->list); if (clocksource_override == cs) clocksource_override = NULL; next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); + atomic_spin_unlock_irqrestore(&clocksource_lock, flags); } #ifdef CONFIG_SYSFS @@ -458,9 +458,9 @@ sysfs_show_current_clocksources(struct sys_device *dev, { ssize_t count = 0; - spin_lock_irq(&clocksource_lock); + atomic_spin_lock_irq(&clocksource_lock); count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); - spin_unlock_irq(&clocksource_lock); + atomic_spin_unlock_irq(&clocksource_lock); return count; } @@ -490,7 +490,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, if (buf[count-1] == '\n') count--; - spin_lock_irq(&clocksource_lock); + atomic_spin_lock_irq(&clocksource_lock); if (count > 0) memcpy(override_name, buf, count); @@ -527,7 +527,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, next_clocksource = select_clocksource(); } - spin_unlock_irq(&clocksource_lock); + 
atomic_spin_unlock_irq(&clocksource_lock); return ret; } @@ -547,7 +547,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, struct clocksource *src; ssize_t count = 0; - spin_lock_irq(&clocksource_lock); + atomic_spin_lock_irq(&clocksource_lock); list_for_each_entry(src, &clocksource_list, list) { /* * Don't show non-HRES clocksource if the tick code is @@ -559,7 +559,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "%s ", src->name); } - spin_unlock_irq(&clocksource_lock); + atomic_spin_unlock_irq(&clocksource_lock); count += snprintf(buf + count, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); @@ -615,10 +615,10 @@ device_initcall(init_clocksource_sysfs); static int __init boot_override_clocksource(char* str) { unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + atomic_spin_lock_irqsave(&clocksource_lock, flags); if (str) strlcpy(override_name, str, sizeof(override_name)); - spin_unlock_irqrestore(&clocksource_lock, flags); + atomic_spin_unlock_irqrestore(&clocksource_lock, flags); return 1; } diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7fc6437..c195ede 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -188,7 +188,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) { enum hrtimer_restart res = HRTIMER_NORESTART; - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); switch (time_state) { case TIME_OK: @@ -221,7 +221,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) } update_vsyscall(&xtime, clock); - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); return res; } @@ -479,7 +479,7 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); - write_seqlock_irq(&xtime_lock); + write_atomic_seqlock_irq(&xtime_lock); if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -527,7 +527,7 @@ int do_adjtimex(struct timex *txc) txc->errcnt = 0; txc->stbcnt = 0; - 
write_sequnlock_irq(&xtime_lock); + write_atomic_sequnlock_irq(&xtime_lock); txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index c2ec250..e6d7286 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -31,7 +31,7 @@ static struct tick_device tick_broadcast_device; /* FIXME: Use cpumask_var_t. */ static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); static DECLARE_BITMAP(tmpmask, NR_CPUS); -static DEFINE_SPINLOCK(tick_broadcast_lock); +static DEFINE_ATOMIC_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; #ifdef CONFIG_TICK_ONESHOT @@ -96,7 +96,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) unsigned long flags; int ret = 0; - spin_lock_irqsave(&tick_broadcast_lock, flags); + atomic_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Devices might be registered with both periodic and oneshot @@ -122,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) tick_broadcast_clear_oneshot(cpu); } } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + atomic_spin_unlock_irqrestore(&tick_broadcast_lock, flags); return ret; } @@ -161,13 +161,13 @@ static void tick_do_broadcast(struct cpumask *mask) */ static void tick_do_periodic_broadcast(void) { - spin_lock(&tick_broadcast_lock); + atomic_spin_lock(&tick_broadcast_lock); cpumask_and(to_cpumask(tmpmask), cpu_online_mask, tick_get_broadcast_mask()); tick_do_broadcast(to_cpumask(tmpmask)); - spin_unlock(&tick_broadcast_lock); + atomic_spin_unlock(&tick_broadcast_lock); } /* @@ -212,7 +212,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) unsigned long flags; int cpu, bc_stopped; - spin_lock_irqsave(&tick_broadcast_lock, flags); + atomic_spin_lock_irqsave(&tick_broadcast_lock, flags); cpu = smp_processor_id(); td = &per_cpu(tick_cpu_device, cpu); @@ -263,7 +263,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) 
tick_broadcast_setup_oneshot(bc); } out: - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + atomic_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* @@ -299,7 +299,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) unsigned long flags; unsigned int cpu = *cpup; - spin_lock_irqsave(&tick_broadcast_lock, flags); + atomic_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); @@ -309,7 +309,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) clockevents_shutdown(bc); } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + atomic_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } void tick_suspend_broadcast(void) @@ -317,13 +317,13 @@ void tick_suspend_broadcast(void) struct clock_event_device *bc; unsigned long flags; - spin_lock_irqsave(&tick_broadcast_lock, flags); + atomic_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc) clockevents_shutdown(bc); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + atomic_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } int tick_resume_broadcast(void) @@ -332,7 +332,7 @@ int tick_resume_broadcast(void) unsigned long flags; int broadcast = 0; - spin_lock_irqsave(&tick_broadcast_lock, flags); + atomic_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; @@ -351,7 +351,7 @@ int tick_resume_broadcast(void) break; } } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + atomic_spin_unlock_irqrestore(&tick_broadcast_lock, flags); return broadcast; } @@ -405,7 +405,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) ktime_t now, next_event; int cpu; - spin_lock(&tick_broadcast_lock); + atomic_spin_lock(&tick_broadcast_lock); again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; @@ -443,7 +443,7 @@ again: if (tick_broadcast_set_event(next_event, 0)) goto again; } - spin_unlock(&tick_broadcast_lock); + 
atomic_spin_unlock(&tick_broadcast_lock); } /* @@ -457,7 +457,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) unsigned long flags; int cpu; - spin_lock_irqsave(&tick_broadcast_lock, flags); + atomic_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Periodic mode does not care about the enter/exit of power @@ -492,7 +492,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) } out: - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + atomic_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* @@ -563,13 +563,13 @@ void tick_broadcast_switch_to_oneshot(void) struct clock_event_device *bc; unsigned long flags; - spin_lock_irqsave(&tick_broadcast_lock, flags); + atomic_spin_lock_irqsave(&tick_broadcast_lock, flags); tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + atomic_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -581,7 +581,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) unsigned long flags; unsigned int cpu = *cpup; - spin_lock_irqsave(&tick_broadcast_lock, flags); + atomic_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Clear the broadcast mask flag for the dead cpu, but do not @@ -589,7 +589,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) */ cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + atomic_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 83c4417..1d3068a 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -DEFINE_SPINLOCK(tick_device_lock); +DEFINE_ATOMIC_SPINLOCK(tick_device_lock); /* * Debugging: see 
timer_list.c @@ -60,13 +60,13 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); } update_process_times(user_mode(get_irq_regs())); @@ -127,9 +127,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&xtime_lock); + seq = read_atomic_seqbegin(&xtime_lock); next = tick_next_period; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_atomic_seqretry(&xtime_lock, seq)); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); @@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) int cpu, ret = NOTIFY_OK; unsigned long flags; - spin_lock_irqsave(&tick_device_lock, flags); + atomic_spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -268,7 +268,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); - spin_unlock_irqrestore(&tick_device_lock, flags); + atomic_spin_unlock_irqrestore(&tick_device_lock, flags); return NOTIFY_STOP; out_bc: @@ -278,7 +278,7 @@ out_bc: if (tick_check_broadcast_device(newdev)) ret = NOTIFY_STOP; - spin_unlock_irqrestore(&tick_device_lock, flags); + atomic_spin_unlock_irqrestore(&tick_device_lock, flags); return ret; } @@ -311,7 +311,7 @@ static void tick_shutdown(unsigned int *cpup) struct clock_event_device *dev = td->evtdev; unsigned long flags; - spin_lock_irqsave(&tick_device_lock, flags); + atomic_spin_lock_irqsave(&tick_device_lock, flags); td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* @@ -322,7 +322,7 @@ static void tick_shutdown(unsigned int *cpup) 
clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } - spin_unlock_irqrestore(&tick_device_lock, flags); + atomic_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_suspend(void) @@ -330,9 +330,9 @@ static void tick_suspend(void) struct tick_device *td = &__get_cpu_var(tick_cpu_device); unsigned long flags; - spin_lock_irqsave(&tick_device_lock, flags); + atomic_spin_lock_irqsave(&tick_device_lock, flags); clockevents_shutdown(td->evtdev); - spin_unlock_irqrestore(&tick_device_lock, flags); + atomic_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_resume(void) @@ -341,7 +341,7 @@ static void tick_resume(void) unsigned long flags; int broadcast = tick_resume_broadcast(); - spin_lock_irqsave(&tick_device_lock, flags); + atomic_spin_lock_irqsave(&tick_device_lock, flags); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); if (!broadcast) { @@ -350,7 +350,7 @@ static void tick_resume(void) else tick_resume_oneshot(); } - spin_unlock_irqrestore(&tick_device_lock, flags); + atomic_spin_unlock_irqrestore(&tick_device_lock, flags); } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index b1c05bf..e0726c7 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,7 +6,7 @@ #define TICK_DO_TIMER_BOOT -2 DECLARE_PER_CPU(struct tick_device, tick_cpu_device); -extern spinlock_t tick_device_lock; +extern atomic_spinlock_t tick_device_lock; extern ktime_t tick_next_period; extern ktime_t tick_period; extern int tick_do_timer_cpu __read_mostly; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e0f59a2..8e15027 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -57,7 +57,7 @@ static void tick_do_update_jiffies64(ktime_t now) return; /* Reevalute with xtime_lock held */ - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 >= tick_period.tv64) { @@ -80,7 +80,7 @@ static 
void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); } /* @@ -90,12 +90,12 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&xtime_lock); + write_atomic_seqlock(&xtime_lock); /* Did we start the jiffies update yet ? */ if (last_jiffies_update.tv64 == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&xtime_lock); + write_atomic_sequnlock(&xtime_lock); return period; } @@ -254,23 +254,17 @@ void tick_nohz_stop_sched_tick(int inidle) goto end; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; - - if (ratelimit < 10) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - local_softirq_pending()); - ratelimit++; - } + softirq_check_pending_idle(); goto end; } ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&xtime_lock); + seq = read_atomic_seqbegin(&xtime_lock); last_update = last_jiffies_update; last_jiffies = jiffies; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_atomic_seqretry(&xtime_lock, seq)); /* Get the next timer wheel timer */ next_jiffies = get_next_timer_interrupt(last_jiffies); @@ -693,6 +687,7 @@ void tick_setup_sched_timer(void) * Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ts->sched_timer.irqsafe = 1; ts->sched_timer.function = tick_sched_timer; /* Get the next period (per cpu) */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e8c77d9..9d1bac7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -24,8 +24,7 @@ * This read-write spinlock protects us from races in SMP while * playing with xtime. 
*/ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - +__cacheline_aligned_in_smp DEFINE_ATOMIC_SEQLOCK(xtime_lock); /* * The current time @@ -102,7 +101,7 @@ void getnstimeofday(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); + seq = read_atomic_seqbegin(&xtime_lock); *ts = xtime; @@ -118,13 +117,82 @@ void getnstimeofday(struct timespec *ts) /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_atomic_seqretry(&xtime_lock, seq)); timespec_add_ns(ts, nsecs); } EXPORT_SYMBOL(getnstimeofday); +ktime_t ktime_get(void) +{ + cycle_t cycle_now, cycle_delta; + unsigned int seq; + s64 secs, nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_atomic_seqbegin(&xtime_lock); + secs = xtime.tv_sec + wall_to_monotonic.tv_sec; + nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs += cyc2ns(clock, cycle_delta); + + } while (read_atomic_seqretry(&xtime_lock, seq)); + /* + * Use ktime_set/ktime_add_ns to create a proper ktime on + * 32-bit architectures without CONFIG_KTIME_SCALAR. + */ + return ktime_add_ns(ktime_set(secs, 0), nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get); + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. 
+ */ +void ktime_get_ts(struct timespec *ts) +{ + cycle_t cycle_now, cycle_delta; + struct timespec tomono; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_atomic_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = cyc2ns(clock, cycle_delta); + + } while (read_atomic_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_ts); + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set @@ -155,7 +223,7 @@ int do_settimeofday(struct timespec *tv) if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&xtime_lock, flags); + write_atomic_seqlock_irqsave(&xtime_lock, flags); clocksource_forward_now(); @@ -172,7 +240,7 @@ int do_settimeofday(struct timespec *tv) update_vsyscall(&xtime, clock); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_atomic_sequnlock_irqrestore(&xtime_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -221,10 +289,65 @@ static void change_clocksource(void) clock->name); */ } -#else +#else /* GENERIC_TIME */ static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } -#endif + +/** + * ktime_get - get the monotonic time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get(void) +{ + struct timespec now; + + ktime_get_ts(&now); + + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get); + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * 
clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + struct timespec tomono; + unsigned long seq; + + do { + seq = read_atomic_seqbegin(&xtime_lock); + getnstimeofday(ts); + tomono = wall_to_monotonic; + + } while (read_atomic_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec); +} +EXPORT_SYMBOL_GPL(ktime_get_ts); +#endif /* !GENERIC_TIME */ + +/** + * ktime_get_real - get the real (wall-) time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get_real(void) +{ + struct timespec now; + + getnstimeofday(&now); + + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get_real); /** * getrawmonotonic - Returns the raw monotonic time in a timespec @@ -239,7 +362,7 @@ void getrawmonotonic(struct timespec *ts) cycle_t cycle_now, cycle_delta; do { - seq = read_seqbegin(&xtime_lock); + seq = read_atomic_seqbegin(&xtime_lock); /* read clocksource: */ cycle_now = clocksource_read(clock); @@ -252,7 +375,7 @@ void getrawmonotonic(struct timespec *ts) *ts = clock->raw_time; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_atomic_seqretry(&xtime_lock, seq)); timespec_add_ns(ts, nsecs); } @@ -268,11 +391,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&xtime_lock); + seq = read_atomic_seqbegin(&xtime_lock); ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_atomic_seqretry(&xtime_lock, seq)); return ret; } @@ -299,7 +422,7 @@ void __init timekeeping_init(void) unsigned long flags; unsigned long sec = read_persistent_clock(); - write_seqlock_irqsave(&xtime_lock, flags); + write_atomic_seqlock_irqsave(&xtime_lock, flags); ntp_init(); @@ -314,7 +437,7 @@ void __init timekeeping_init(void) -xtime.tv_sec, -xtime.tv_nsec); update_xtime_cache(0); 
total_sleep_time = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); + write_atomic_sequnlock_irqrestore(&xtime_lock, flags); } /* time in seconds when suspend began */ @@ -335,7 +458,7 @@ static int timekeeping_resume(struct sys_device *dev) clocksource_resume(); - write_seqlock_irqsave(&xtime_lock, flags); + write_atomic_seqlock_irqsave(&xtime_lock, flags); if (now && (now > timekeeping_suspend_time)) { unsigned long sleep_length = now - timekeeping_suspend_time; @@ -350,7 +473,7 @@ static int timekeeping_resume(struct sys_device *dev) clock->cycle_last = clocksource_read(clock); clock->error = 0; timekeeping_suspended = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); + write_atomic_sequnlock_irqrestore(&xtime_lock, flags); touch_softlockup_watchdog(); @@ -368,10 +491,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) timekeeping_suspend_time = read_persistent_clock(); - write_seqlock_irqsave(&xtime_lock, flags); + write_atomic_seqlock_irqsave(&xtime_lock, flags); clocksource_forward_now(); timekeeping_suspended = 1; - write_sequnlock_irqrestore(&xtime_lock, flags); + write_atomic_sequnlock_irqrestore(&xtime_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); @@ -610,10 +733,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + seq = read_atomic_seqbegin(&xtime_lock); now = xtime_cache; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_atomic_seqretry(&xtime_lock, seq)); return now; } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index fddd69d..9b20c72 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -84,7 +84,7 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, next_one: i = 0; - spin_lock_irqsave(&base->cpu_base->lock, flags); + atomic_spin_lock_irqsave(&base->cpu_base->lock, flags); curr = base->first; /* @@ -100,13 +100,13 @@ next_one: timer = rb_entry(curr, struct hrtimer, 
node); tmp = *timer; - spin_unlock_irqrestore(&base->cpu_base->lock, flags); + atomic_spin_unlock_irqrestore(&base->cpu_base->lock, flags); print_timer(m, timer, &tmp, i, now); next++; goto next_one; } - spin_unlock_irqrestore(&base->cpu_base->lock, flags); + atomic_spin_unlock_irqrestore(&base->cpu_base->lock, flags); } static void diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 4cde8b9..0654f94 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -81,12 +81,12 @@ struct entry { /* * Spinlock protecting the tables - not taken during lookup: */ -static DEFINE_SPINLOCK(table_lock); +static DEFINE_ATOMIC_SPINLOCK(table_lock); /* * Per-CPU lookup locks for fast hash lookup: */ -static DEFINE_PER_CPU(spinlock_t, lookup_lock); +static DEFINE_PER_CPU(atomic_spinlock_t, lookup_lock); /* * Mutex to serialize state changes with show-stats activities: @@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) prev = NULL; curr = *head; - spin_lock(&table_lock); + atomic_spin_lock(&table_lock); /* * Make sure we have not raced with another CPU: */ @@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) *head = curr; } out_unlock: - spin_unlock(&table_lock); + atomic_spin_unlock(&table_lock); return curr; } @@ -238,7 +238,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, /* * It doesnt matter which lock we take: */ - spinlock_t *lock; + atomic_spinlock_t *lock; struct entry *entry, input; unsigned long flags; @@ -253,7 +253,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.pid = pid; input.timer_flag = timer_flag; - spin_lock_irqsave(lock, flags); + atomic_spin_lock_irqsave(lock, flags); if (!timer_stats_active) goto out_unlock; @@ -264,7 +264,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, atomic_inc(&overflow_count); out_unlock: - spin_unlock_irqrestore(lock, flags); + 
atomic_spin_unlock_irqrestore(lock, flags); } static void print_name_offset(struct seq_file *m, unsigned long addr) @@ -348,9 +348,9 @@ static void sync_access(void) int cpu; for_each_online_cpu(cpu) { - spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); + atomic_spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); /* nothing */ - spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); + atomic_spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); } } @@ -408,7 +408,7 @@ void __init init_timer_stats(void) int cpu; for_each_possible_cpu(cpu) - spin_lock_init(&per_cpu(lookup_lock, cpu)); + atomic_spin_lock_init(&per_cpu(lookup_lock, cpu)); } static int __init init_tstats_procfs(void) diff --git a/kernel/timer.c b/kernel/timer.c index a7f07d5..085a6a7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -34,6 +34,7 @@ #include <linux/posix-timers.h> #include <linux/cpu.h> #include <linux/syscalls.h> +#include <linux/kallsyms.h> #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> @@ -71,6 +72,7 @@ struct tvec_root { struct tvec_base { spinlock_t lock; struct timer_list *running_timer; + wait_queue_head_t wait_for_running_timer; unsigned long timer_jiffies; struct tvec_root tv1; struct tvec tv2; @@ -318,9 +320,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative); static inline void set_running_timer(struct tvec_base *base, struct timer_list *timer) { -#ifdef CONFIG_SMP base->running_timer = timer; -#endif } static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) @@ -630,8 +630,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_timer_activate(timer); + preempt_disable(); new_base = __get_cpu_var(tvec_bases); - cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) @@ -642,6 +642,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = preferred_cpu; } #endif + preempt_enable(); + new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { @@ -661,7 +663,6 @@ 
__mod_timer(struct timer_list *timer, unsigned long expires, timer_set_base(timer, base); } } - timer->expires = expires; internal_add_timer(base, timer); @@ -795,6 +796,18 @@ void add_timer_on(struct timer_list *timer, int cpu) } EXPORT_SYMBOL_GPL(add_timer_on); +/* + * Wait for a running timer + */ +void wait_for_running_timer(struct timer_list *timer) +{ + struct tvec_base *base = timer->base; + + if (base->running_timer == timer) + wait_event(base->wait_for_running_timer, + base->running_timer != timer); +} + /** * del_timer - deactive a timer. * @timer: the timer to be deactivated @@ -826,7 +839,34 @@ int del_timer(struct timer_list *timer) } EXPORT_SYMBOL(del_timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) +/* + * This function checks whether a timer is active and not running on any + * CPU. Upon successful (ret >= 0) exit the timer is not queued and the + * handler is not running on any CPU. + * + * It must not be called from interrupt contexts. 
+ */ +int timer_pending_sync(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = -1; + + base = lock_timer_base(timer, &flags); + + if (base->running_timer == timer) + goto out; + + ret = 0; + if (timer_pending(timer)) + ret = 1; +out: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: timer do del @@ -891,7 +931,7 @@ int del_timer_sync(struct timer_list *timer) int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; - cpu_relax(); + wait_for_running_timer(timer); } } EXPORT_SYMBOL(del_timer_sync); @@ -936,6 +976,20 @@ static inline void __run_timers(struct tvec_base *base) struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; + if (softirq_need_resched()) { + spin_unlock_irq(&base->lock); + wake_up(&base->wait_for_running_timer); + cond_resched_softirq_context(); + cpu_relax(); + spin_lock_irq(&base->lock); + /* + * We can simply continue after preemption, nobody + * else can touch timer_jiffies so 'index' is still + * valid. 
Any new jiffy will be taken care of in + * subsequent loops: + */ + } + /* * Cascade timers: */ @@ -989,18 +1043,17 @@ static inline void __run_timers(struct tvec_base *base) lock_map_release(&lockdep_map); if (preempt_count != preempt_count()) { - printk(KERN_ERR "huh, entered %p " - "with preempt_count %08x, exited" - " with %08x?\n", - fn, preempt_count, - preempt_count()); - BUG(); + print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; } } + set_running_timer(base, NULL); + cond_resched_softirq_context(); spin_lock_irq(&base->lock); } } - set_running_timer(base, NULL); + wake_up(&base->wait_for_running_timer); spin_unlock_irq(&base->lock); } @@ -1133,9 +1186,22 @@ unsigned long get_next_timer_interrupt(unsigned long now) struct tvec_base *base = __get_cpu_var(tvec_bases); unsigned long expires; +#ifdef CONFIG_PREEMPT_RT + /* + * On PREEMPT_RT we cannot sleep here. If the trylock does not + * succeed then we return the worst-case 'expires in 1 tick' + * value: + */ + if (spin_trylock(&base->lock)) { + expires = __next_timer_interrupt(base); + spin_unlock(&base->lock); + } else + expires = now + 1; +#else spin_lock(&base->lock); expires = __next_timer_interrupt(base); spin_unlock(&base->lock); +#endif if (time_before_eq(expires, now)) return now; @@ -1155,11 +1221,10 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. 
*/ account_process_tick(p, user_tick); + scheduler_tick(); run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); - printk_tick(); - scheduler_tick(); run_posix_cpu_timers(p); } @@ -1168,10 +1233,11 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = per_cpu(tvec_bases, raw_smp_processor_id()); - perf_counter_do_pending(); + perf_counter_do_pending_softirq(); + printk_tick(); hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) @@ -1512,6 +1578,7 @@ static int __cpuinit init_timers_cpu(int cpu) } spin_lock_init(&base->lock); + init_waitqueue_head(&base->wait_for_running_timer); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); @@ -1543,6 +1610,7 @@ static void __cpuinit migrate_timers(int cpu) { struct tvec_base *old_base; struct tvec_base *new_base; + unsigned long flags; int i; BUG_ON(cpu_online(cpu)); @@ -1552,8 +1620,11 @@ static void __cpuinit migrate_timers(int cpu) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
*/ - spin_lock_irq(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + local_irq_save(flags); + while (!spin_trylock(&new_base->lock)) + cpu_relax(); + while (!spin_trylock(&old_base->lock)) + cpu_relax(); BUG_ON(old_base->running_timer); @@ -1567,7 +1638,9 @@ static void __cpuinit migrate_timers(int cpu) } spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); + spin_unlock(&new_base->lock); + local_irq_restore(flags); + put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 019f380..fea2f14 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -41,7 +41,7 @@ config HAVE_FTRACE_MCOUNT_RECORD config HAVE_HW_BRANCH_TRACER bool -config HAVE_FTRACE_SYSCALLS +config HAVE_SYSCALL_TRACEPOINTS bool config TRACER_MAX_TRACE @@ -60,9 +60,14 @@ config EVENT_TRACING bool config CONTEXT_SWITCH_TRACER - select MARKERS bool +config RING_BUFFER_ALLOW_SWAP + bool + help + Allow the use of ring_buffer_swap_cpu. + Adds a very slight overhead to tracing when enabled. + # All tracer options should select GENERIC_TRACER. For those options that are # enabled by all tracers (context switch and event tracer) they select TRACING. # This allows those options to appear when no other tracer is selected. But the @@ -147,6 +152,7 @@ config IRQSOFF_TRACER select TRACE_IRQFLAGS select GENERIC_TRACER select TRACER_MAX_TRACE + select RING_BUFFER_ALLOW_SWAP help This option measures the time spent in irqs-off critical sections, with microsecond accuracy. @@ -161,6 +167,21 @@ config IRQSOFF_TRACER enabled. This option and the preempt-off timing option can be used together or separately.) +config INTERRUPT_OFF_HIST + bool "Interrupts-off Latency Histogram" + depends on IRQSOFF_TRACER + help + This option generates a continuously updated histogram (one per cpu) + of the duration of time periods with interrupts disabled. The + histogram is disabled by default. 
To enable it, write a non-zero + number to the related file in + + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff + + If PREEMPT_OFF_HIST is also selected, an additional histogram (one + per cpu) is generated that accumulates the duration of time periods + when both interrupts and preemption are disabled. + config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n @@ -168,6 +189,7 @@ config PREEMPT_TRACER depends on PREEMPT select GENERIC_TRACER select TRACER_MAX_TRACE + select RING_BUFFER_ALLOW_SWAP help This option measures the time spent in preemption off critical sections, with microsecond accuracy. @@ -182,14 +204,20 @@ config PREEMPT_TRACER enabled. This option and the irqs-off timing option can be used together or separately.) -config SYSPROF_TRACER - bool "Sysprof Tracer" - depends on X86 - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER +config PREEMPT_OFF_HIST + bool "Preemption-off Latency Histogram" + depends on PREEMPT_TRACER help - This tracer provides the trace needed by the 'Sysprof' userspace - tool. + This option generates a continuously updated histogram (one per cpu) + of the duration of time periods with preemption disabled. The + histogram is disabled by default. To enable it, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff + + If INTERRUPT_OFF_HIST is also selected, an additional histogram (one + per cpu) is generated that accumulates the duration of time periods + when both interrupts and preemption are disabled. config SCHED_TRACER bool "Scheduling Latency Tracer" @@ -200,6 +228,25 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. +config WAKEUP_LATENCY_HIST + bool "Scheduling Latency Histogram" + depends on SCHED_TRACER + help + This option generates a continuously updated histogram (one per cpu) + of the scheduling latency of the highest priority task. 
The histogram + is disabled by default. To enable it, write a non-zero number to + + /sys/kernel/debug/tracing/latency_hist/enable/wakeup + +config SYSPROF_TRACER + bool "Sysprof Tracer" + depends on X86 + select GENERIC_TRACER + select CONTEXT_SWITCH_TRACER + help + This tracer provides the trace needed by the 'Sysprof' userspace + tool. + config ENABLE_DEFAULT_TRACERS bool "Trace process context switches and events" depends on !GENERIC_TRACER @@ -211,7 +258,7 @@ config ENABLE_DEFAULT_TRACERS config FTRACE_SYSCALLS bool "Trace syscalls" - depends on HAVE_FTRACE_SYSCALLS + depends on HAVE_SYSCALL_TRACEPOINTS select GENERIC_TRACER select KALLSYMS help @@ -349,6 +396,7 @@ config STACK_TRACER config HW_BRANCH_TRACER depends on HAVE_HW_BRANCH_TRACER + depends on !PREEMPT_RT bool "Trace hw branches" select GENERIC_TRACER help @@ -376,7 +424,7 @@ config KMEMTRACE If unsure, say N. config WORKQUEUE_TRACER - bool "Trace workqueues" + bool "Trace workqueues" if !PREEMPT_RT select GENERIC_TRACER help The workqueue tracer provides some statistical informations @@ -462,6 +510,18 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config EVENT_TRACE_TEST_SYSCALLS + bool "Run selftest on syscall events" + depends on FTRACE_STARTUP_TEST + help + This option will also enable testing every syscall event. + It only enables the event and disables it and runs various loads + with the event enabled. This adds a bit more time for kernel boot + up since it runs this on every system call defined. 
+ + TBD - enable a way to actually call the syscalls as we test their + events + config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 844164d..c71519b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -35,6 +35,9 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7a34cb5..3eb159c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -65,13 +65,15 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; + struct ring_buffer *buffer = NULL; int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; if (blk_tracer) { + buffer = blk_tr->buffer; pc = preempt_count(); - event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len, 0, pc); if (!event) @@ -96,7 +98,7 @@ record_it: memcpy((void *) t + sizeof(*t), data, len); if (blk_tracer) - trace_buffer_unlock_commit(blk_tr, event, 0, pc); + trace_buffer_unlock_commit(buffer, event, 0, pc); } } @@ -179,6 +181,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; + struct ring_buffer *buffer = NULL; struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; @@ -204,8 +207,9 @@ static void __blk_add_trace(struct 
blk_trace *bt, sector_t sector, int bytes, if (blk_tracer) { tracing_record_cmdline(current); + buffer = blk_tr->buffer; pc = preempt_count(); - event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len, 0, pc); if (!event) @@ -252,7 +256,7 @@ record_it: memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); if (blk_tracer) { - trace_buffer_unlock_commit(blk_tr, event, 0, pc); + trace_buffer_unlock_commit(buffer, event, 0, pc); return; } } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 25edd5c..b10556a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -377,7 +377,8 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_printf(m, " "); avg = rec->time; - do_div(avg, rec->counter); + if (rec->counter) + do_div(avg, rec->counter); mutex_lock(&mutex); trace_seq_init(&s); @@ -1016,71 +1017,35 @@ static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_addr; - unsigned long ip, fl; + unsigned long flag = 0UL; ftrace_addr = (unsigned long)FTRACE_ADDR; - ip = rec->ip; - /* - * If this record is not to be traced and - * it is not enabled then do nothing. + * If this record is not to be traced or we want to disable it, + * then disable it. * - * If this record is not to be traced and - * it is enabled then disable it. + * If we want to enable it and filtering is off, then enable it. 
* + * If we want to enable it and filtering is on, enable it only if + * it's filtered */ - if (rec->flags & FTRACE_FL_NOTRACE) { - if (rec->flags & FTRACE_FL_ENABLED) - rec->flags &= ~FTRACE_FL_ENABLED; - else - return 0; - - } else if (ftrace_filtered && enable) { - /* - * Filtering is on: - */ - - fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); - - /* Record is filtered and enabled, do nothing */ - if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) - return 0; - - /* Record is not filtered or enabled, do nothing */ - if (!fl) - return 0; - - /* Record is not filtered but enabled, disable it */ - if (fl == FTRACE_FL_ENABLED) - rec->flags &= ~FTRACE_FL_ENABLED; - else - /* Otherwise record is filtered but not enabled, enable it */ - rec->flags |= FTRACE_FL_ENABLED; - } else { - /* Disable or not filtered */ - - if (enable) { - /* if record is enabled, do nothing */ - if (rec->flags & FTRACE_FL_ENABLED) - return 0; - - rec->flags |= FTRACE_FL_ENABLED; - - } else { + if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { + if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) + flag = FTRACE_FL_ENABLED; + } - /* if record is not enabled, do nothing */ - if (!(rec->flags & FTRACE_FL_ENABLED)) - return 0; + /* If the state of this record hasn't changed, then do nothing */ + if ((rec->flags & FTRACE_FL_ENABLED) == flag) + return 0; - rec->flags &= ~FTRACE_FL_ENABLED; - } + if (flag) { + rec->flags |= FTRACE_FL_ENABLED; + return ftrace_make_call(rec, ftrace_addr); } - if (rec->flags & FTRACE_FL_ENABLED) - return ftrace_make_call(rec, ftrace_addr); - else - return ftrace_make_nop(NULL, rec, ftrace_addr); + rec->flags &= ~FTRACE_FL_ENABLED; + return ftrace_make_nop(NULL, rec, ftrace_addr); } static void ftrace_replace_code(int enable) @@ -1359,11 +1324,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) enum { FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_CONT = (1 << 1), - FTRACE_ITER_NOTRACE = (1 << 2), - FTRACE_ITER_FAILURES = (1 << 3), - 
FTRACE_ITER_PRINTALL = (1 << 4), - FTRACE_ITER_HASH = (1 << 5), + FTRACE_ITER_NOTRACE = (1 << 1), + FTRACE_ITER_FAILURES = (1 << 2), + FTRACE_ITER_PRINTALL = (1 << 3), + FTRACE_ITER_HASH = (1 << 4), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1373,9 +1337,7 @@ struct ftrace_iterator { int hidx; int idx; unsigned flags; - unsigned char buffer[FTRACE_BUFF_MAX+1]; - unsigned buffer_idx; - unsigned filtered; + struct trace_parser parser; }; static void * @@ -1438,18 +1400,13 @@ static int t_hash_show(struct seq_file *m, void *v) { struct ftrace_func_probe *rec; struct hlist_node *hnd = v; - char str[KSYM_SYMBOL_LEN]; rec = hlist_entry(hnd, struct ftrace_func_probe, node); if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - seq_printf(m, "%s:", str); - - kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str); - seq_printf(m, "%s", str); + seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); if (rec->data) seq_printf(m, ":%p", rec->data); @@ -1547,7 +1504,6 @@ static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; - char str[KSYM_SYMBOL_LEN]; if (iter->flags & FTRACE_ITER_HASH) return t_hash_show(m, v); @@ -1560,9 +1516,7 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - - seq_printf(m, "%s\n", str); + seq_printf(m, "%pf\n", (void *)rec->ip); return 0; } @@ -1601,17 +1555,6 @@ ftrace_avail_open(struct inode *inode, struct file *file) return ret; } -int ftrace_avail_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - struct ftrace_iterator *iter = m->private; - - seq_release(inode, file); - kfree(iter); - - return 0; -} - static int ftrace_failures_open(struct inode *inode, struct file *file) { @@ -1660,6 +1603,11 @@ 
ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (!iter) return -ENOMEM; + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { + kfree(iter); + return -ENOMEM; + } + mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) @@ -2252,9 +2200,8 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos, int enable) { struct ftrace_iterator *iter; - char ch; - size_t read = 0; - ssize_t ret; + struct trace_parser *parser; + ssize_t ret, read; if (!cnt || cnt < 0) return 0; @@ -2267,73 +2214,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, } else iter = file->private_data; - if (!*ppos) { - iter->flags &= ~FTRACE_ITER_CONT; - iter->buffer_idx = 0; - } - - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; + parser = &iter->parser; + read = trace_get_user(parser, ubuf, cnt, ppos); - /* - * If the parser haven't finished with the last write, - * continue reading the user input without skipping spaces. 
- */ - if (!(iter->flags & FTRACE_ITER_CONT)) { - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - } - - /* only spaces were written */ - if (isspace(ch)) { - *ppos += read; - ret = read; - goto out; - } - - iter->buffer_idx = 0; - } - - while (cnt && !isspace(ch)) { - if (iter->buffer_idx < FTRACE_BUFF_MAX) - iter->buffer[iter->buffer_idx++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); + if (trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { + ret = ftrace_process_regex(parser->buffer, + parser->idx, enable); if (ret) goto out; - read++; - cnt--; - } - if (isspace(ch)) { - iter->filtered++; - iter->buffer[iter->buffer_idx] = 0; - ret = ftrace_process_regex(iter->buffer, - iter->buffer_idx, enable); - if (ret) - goto out; - iter->buffer_idx = 0; - } else { - iter->flags |= FTRACE_ITER_CONT; - iter->buffer[iter->buffer_idx++] = ch; + trace_parser_clear(parser); } - *ppos += read; ret = read; - out: - mutex_unlock(&ftrace_regex_lock); +out: + mutex_unlock(&ftrace_regex_lock); /* also reached via 'goto out' on regex failure */ return ret; } @@ -2438,6 +2335,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; + struct trace_parser *parser; mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { @@ -2447,10 +2345,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) } else iter = file->private_data; - if (iter->buffer_idx) { - iter->filtered++; - iter->buffer[iter->buffer_idx] = 0; - ftrace_match_records(iter->buffer, iter->buffer_idx, enable); + parser = &iter->parser; + if (trace_parser_loaded(parser)) { + parser->buffer[parser->idx] = 0; + ftrace_match_records(parser->buffer, parser->idx, enable); } mutex_lock(&ftrace_lock); @@ -2458,7 +2356,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_lock); + trace_parser_put(parser); kfree(iter); + mutex_unlock(&ftrace_regex_lock); return 0; } @@ -2479,14 +2379,14 @@ static const struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, .llseek = seq_lseek, - .release = ftrace_avail_release, + .release = seq_release_private, }; static const struct file_operations ftrace_failures_fops = { .open = ftrace_failures_open, .read = seq_read, .llseek = seq_lseek, - .release = ftrace_avail_release, + .release = seq_release_private, }; static const struct file_operations ftrace_filter_fops = { @@ -2548,7 +2448,6 @@ static void g_stop(struct seq_file *m, void *p) static int g_show(struct seq_file *m, void *v) { unsigned long *ptr = v; - char str[KSYM_SYMBOL_LEN]; if (!ptr) return 0; @@ -2558,9 +2457,7 @@ static int g_show(struct seq_file *m, void *v) return 0; } - kallsyms_lookup(*ptr, NULL, NULL, NULL, str); - - seq_printf(m, "%s\n", str); + seq_printf(m, "%pf\n", (void *)*ptr); return 0; } @@ -2663,12 +2560,10 @@ static ssize_t ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - unsigned char buffer[FTRACE_BUFF_MAX+1]; + struct trace_parser parser; unsigned long *array; size_t read = 0; ssize_t ret; - int index = 0; - char ch; if (!cnt || cnt < 0) return 0; @@ -2686,51 +2581,26 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, } else array = file->private_data; - ret = get_user(ch, ubuf++); - if (ret) + if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { + ret = -ENOMEM; goto out; - read++; - cnt--; - - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; } - if (isspace(ch)) { - *ppos += read; - ret = read; - goto out; - } + read = trace_get_user(&parser, ubuf, cnt, ppos); - while (cnt && !isspace(ch)) { - if (index < FTRACE_BUFF_MAX) - buffer[index++] = ch; - else { - ret = -EINVAL; - goto 
out; - } - ret = get_user(ch, ubuf++); + if (trace_parser_loaded((&parser))) { + parser.buffer[parser.idx] = 0; + + /* we allow only one expression at a time */ + ret = ftrace_set_func(array, &ftrace_graph_count, + parser.buffer); if (ret) goto out; - read++; - cnt--; } - buffer[index] = 0; - - /* we allow only one expression at a time */ - ret = ftrace_set_func(array, &ftrace_graph_count, buffer); - if (ret) - goto out; - - file->f_pos += read; ret = read; out: + trace_parser_put(&parser); mutex_unlock(&graph_lock); return ret; diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 1edaa95..81b1645 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -183,11 +183,9 @@ static void kmemtrace_stop_probes(void) static int kmem_trace_init(struct trace_array *tr) { - int cpu; kmemtrace_array = tr; - for_each_cpu(cpu, cpu_possible_mask) - tracing_reset(tr, cpu); + tracing_reset_online_cpus(tr); kmemtrace_start_probes(); @@ -239,12 +237,52 @@ struct kmemtrace_user_event_alloc { }; static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) +kmemtrace_print_alloc(struct trace_iterator *iter, int flags) { - struct kmemtrace_user_event_alloc *ev_alloc; struct trace_seq *s = &iter->seq; + struct kmemtrace_alloc_entry *entry; + int ret; + + trace_assign_type(entry, iter->ent); + + ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", + entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, + (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, + (unsigned long)entry->gfp_flags, entry->node); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct kmemtrace_free_entry *entry; + int ret; + + trace_assign_type(entry, iter->ent); + + 
ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", + entry->type_id, (void *)entry->call_site, + (unsigned long)entry->ptr); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct kmemtrace_alloc_entry *entry; struct kmemtrace_user_event *ev; + struct kmemtrace_user_event_alloc *ev_alloc; + + trace_assign_type(entry, iter->ent); ev = trace_seq_reserve(s, sizeof(*ev)); if (!ev) @@ -271,12 +309,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, } static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) +kmemtrace_print_free_user(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; + struct kmemtrace_free_entry *entry; struct kmemtrace_user_event *ev; + trace_assign_type(entry, iter->ent); + ev = trace_seq_reserve(s, sizeof(*ev)); if (!ev) return TRACE_TYPE_PARTIAL_LINE; @@ -294,12 +334,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter, /* The two other following provide a more minimalistic output */ static enum print_line_t -kmemtrace_print_alloc_compress(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) +kmemtrace_print_alloc_compress(struct trace_iterator *iter) { + struct kmemtrace_alloc_entry *entry; struct trace_seq *s = &iter->seq; int ret; + trace_assign_type(entry, iter->ent); + /* Alloc entry */ ret = trace_seq_printf(s, " + "); if (!ret) @@ -345,29 +387,24 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Node */ - ret = trace_seq_printf(s, "%4d ", entry->node); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Call site */ - ret = seq_print_ip_sym(s, entry->call_site, 0); + /* Node and call site*/ + ret = trace_seq_printf(s, "%4d %pf\n", entry->node, + (void *)entry->call_site); if (!ret) 
return TRACE_TYPE_PARTIAL_LINE; - if (!trace_seq_printf(s, "\n")) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; } static enum print_line_t -kmemtrace_print_free_compress(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) +kmemtrace_print_free_compress(struct trace_iterator *iter) { + struct kmemtrace_free_entry *entry; struct trace_seq *s = &iter->seq; int ret; + trace_assign_type(entry, iter->ent); + /* Free entry */ ret = trace_seq_printf(s, " - "); if (!ret) @@ -401,19 +438,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Skip node */ - ret = trace_seq_printf(s, " "); + /* Skip node and print call site*/ + ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Call site */ - ret = seq_print_ip_sym(s, entry->call_site, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - if (!trace_seq_printf(s, "\n")) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; } @@ -421,32 +450,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - switch (entry->type) { - case TRACE_KMEM_ALLOC: { - struct kmemtrace_alloc_entry *field; - - trace_assign_type(field, entry); - if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) - return kmemtrace_print_alloc_compress(iter, field); - else - return kmemtrace_print_alloc_user(iter, field); - } - - case TRACE_KMEM_FREE: { - struct kmemtrace_free_entry *field; - - trace_assign_type(field, entry); - if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) - return kmemtrace_print_free_compress(iter, field); - else - return kmemtrace_print_free_user(iter, field); - } + if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) + return TRACE_TYPE_UNHANDLED; + switch (entry->type) { + case TRACE_KMEM_ALLOC: + return kmemtrace_print_alloc_compress(iter); + case TRACE_KMEM_FREE: + return kmemtrace_print_free_compress(iter); 
default: return TRACE_TYPE_UNHANDLED; } } +static struct trace_event kmem_trace_alloc = { + .type = TRACE_KMEM_ALLOC, + .trace = kmemtrace_print_alloc, + .binary = kmemtrace_print_alloc_user, +}; + +static struct trace_event kmem_trace_free = { + .type = TRACE_KMEM_FREE, + .trace = kmemtrace_print_free, + .binary = kmemtrace_print_free_user, +}; + static struct tracer kmem_tracer __read_mostly = { .name = "kmemtrace", .init = kmem_trace_init, @@ -463,6 +491,21 @@ void kmemtrace_init(void) static int __init init_kmem_tracer(void) { - return register_tracer(&kmem_tracer); + if (!register_ftrace_event(&kmem_trace_alloc)) { + pr_warning("Warning: could not register kmem events\n"); + return 1; + } + + if (!register_ftrace_event(&kmem_trace_free)) { + pr_warning("Warning: could not register kmem events\n"); + return 1; + } + + if (register_tracer(&kmem_tracer) != 0) { /* non-zero means registration failed */ + pr_warning("Warning: could not register the kmem tracer\n"); + return 1; + } + + return 0; } device_initcall(init_kmem_tracer); diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c new file mode 100644 index 0000000..c83463e --- /dev/null +++ b/kernel/trace/latency_hist.c @@ -0,0 +1,822 @@ +/* + * kernel/trace/latency_hist.c + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang <yyang@ch.mvista.com> + * + * Converted to work with the new latency tracer. + * Copyright (C) 2008 Red Hat, Inc.
+ * Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/percpu.h> +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <asm/atomic.h> +#include <asm/div64.h> + +#include "trace.h" +#include <trace/events/sched.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/hist.h> + +enum { + IRQSOFF_LATENCY = 0, + PREEMPTOFF_LATENCY, + PREEMPTIRQSOFF_LATENCY, + WAKEUP_LATENCY, + MAX_LATENCY_TYPE, +}; + +#define MAX_ENTRY_NUM 10240 + +struct hist_data { + atomic_t hist_mode; /* 0 log, 1 don't log */ + unsigned long min_lat; + unsigned long max_lat; + unsigned long long beyond_hist_bound_samples; + unsigned long long accumulate_lat; + unsigned long long total_samples; + unsigned long long hist_array[MAX_ENTRY_NUM]; +}; + +struct enable_data { + int latency_type; + int enabled; +}; + +static char *latency_hist_dir_root = "latency_hist"; + +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist); +static char *irqsoff_hist_dir = "irqsoff"; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist); +static char *preemptoff_hist_dir = "preemptoff"; +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist); +static char *preemptirqsoff_hist_dir = "preemptirqsoff"; +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST) +static notrace void probe_preemptirqsoff_hist(int reason, int start); +static struct enable_data preemptirqsoff_enabled_data = { + .latency_type = PREEMPTIRQSOFF_LATENCY, + .enabled = 0, +}; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist); +static char *wakeup_latency_hist_dir = "wakeup"; +static notrace void probe_wakeup_latency_hist_start(struct rq *rq, + struct task_struct *p, 
int success); +static notrace void probe_wakeup_latency_hist_stop(struct rq *rq, + struct task_struct *prev, struct task_struct *next); +static struct enable_data wakeup_latency_enabled_data = { + .latency_type = WAKEUP_LATENCY, + .enabled = 0, +}; +static struct task_struct *ts; +struct maxlatproc_data { + char comm[sizeof(ts->comm)]; + unsigned int pid; + unsigned int prio; + unsigned long latency; +}; +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc); +static unsigned wakeup_prio = (unsigned)-1; +static struct task_struct *wakeup_task; +static int wakeup_pid; +#endif + +void notrace latency_hist(int latency_type, int cpu, unsigned long latency, + struct task_struct *p) +{ + struct hist_data *my_hist; + + if (cpu < 0 || cpu >= NR_CPUS || latency_type < 0 || + latency_type >= MAX_LATENCY_TYPE) + return; + + switch (latency_type) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + case IRQSOFF_LATENCY: + my_hist = &per_cpu(irqsoff_hist, cpu); + break; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPTOFF_LATENCY: + my_hist = &per_cpu(preemptoff_hist, cpu); + break; +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + my_hist = &per_cpu(preemptirqsoff_hist, cpu); + break; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + my_hist = &per_cpu(wakeup_latency_hist, cpu); + break; +#endif + default: + return; + } + + if (atomic_read(&my_hist->hist_mode) == 0) + return; + + if (latency >= MAX_ENTRY_NUM) + my_hist->beyond_hist_bound_samples++; + else + my_hist->hist_array[latency]++; + + if (latency < my_hist->min_lat) + my_hist->min_lat = latency; + else if (latency > my_hist->max_lat) { +#ifdef CONFIG_WAKEUP_LATENCY_HIST + if (latency_type == WAKEUP_LATENCY) { + struct maxlatproc_data *mp = + &per_cpu(wakeup_maxlatproc, cpu); + strncpy(mp->comm, p->comm, sizeof(mp->comm)); + mp->pid = task_pid_nr(p); + mp->prio = p->prio; + mp->latency = latency; + } +#endif + my_hist->max_lat = 
latency; + } + + my_hist->total_samples++; + my_hist->accumulate_lat += latency; + return; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + loff_t *index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL); + loff_t index = *pos; + struct hist_data *my_hist = m->private; + + if (!index_ptr) + return NULL; + + if (index == 0) { + char avgstr[32]; + + atomic_dec(&my_hist->hist_mode); + if (likely(my_hist->total_samples)) { + unsigned long avg = (unsigned long) + div64_u64(my_hist->accumulate_lat, + my_hist->total_samples); + sprintf(avgstr, "%lu", avg); + } else + strcpy(avgstr, "<undef>"); + + seq_printf(m, "#Minimum latency: %lu microseconds.\n" + "#Average latency: %s microseconds.\n" + "#Maximum latency: %lu microseconds.\n" + "#Total samples: %llu\n" + "#There are %llu samples greater or equal" + " than %d microseconds\n" + "#usecs\t%16s\n" + , my_hist->min_lat + , avgstr + , my_hist->max_lat + , my_hist->total_samples + , my_hist->beyond_hist_bound_samples + , MAX_ENTRY_NUM, "samples"); + } + if (index >= MAX_ENTRY_NUM) + return NULL; + + *index_ptr = index; + return index_ptr; +} + +static void *l_next(struct seq_file *m, void *p, loff_t *pos) +{ + loff_t *index_ptr = p; + struct hist_data *my_hist = m->private; + + if (++*pos >= MAX_ENTRY_NUM) { + atomic_inc(&my_hist->hist_mode); + return NULL; + } + *index_ptr = *pos; + return index_ptr; +} + +static void l_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static int l_show(struct seq_file *m, void *p) +{ + int index = *(loff_t *) p; + struct hist_data *my_hist = m->private; + + seq_printf(m, "%5d\t%16llu\n", index, my_hist->hist_array[index]); + return 0; +} + +static struct seq_operations latency_hist_seq_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static int latency_hist_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &latency_hist_seq_op); + if (!ret) { + struct seq_file *seq = file->private_data; + seq->private = 
inode->i_private; + } + return ret; +} + +static struct file_operations latency_hist_fops = { + .open = latency_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void hist_reset(struct hist_data *hist) +{ + atomic_dec(&hist->hist_mode); + + memset(hist->hist_array, 0, sizeof(hist->hist_array)); + hist->beyond_hist_bound_samples = 0ULL; + hist->min_lat = 0xFFFFFFFFUL; + hist->max_lat = 0UL; + hist->total_samples = 0ULL; + hist->accumulate_lat = 0ULL; + + atomic_inc(&hist->hist_mode); +} + +static ssize_t +latency_hist_reset(struct file *file, const char __user *a, + size_t size, loff_t *off) +{ + int cpu; + struct hist_data *hist; + int latency_type = (int) file->private_data; + + switch (latency_type) { + +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPTOFF_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(preemptoff_hist, cpu); + hist_reset(hist); + } + break; +#endif + +#ifdef CONFIG_INTERRUPT_OFF_HIST + case IRQSOFF_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(irqsoff_hist, cpu); + hist_reset(hist); + } + break; +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(preemptirqsoff_hist, cpu); + hist_reset(hist); + } + break; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + for_each_online_cpu(cpu) { + struct maxlatproc_data *mp = + &per_cpu(wakeup_maxlatproc, cpu); + mp->comm[0] = '\0'; + mp->prio = mp->pid = mp->latency = 0; + hist = &per_cpu(wakeup_latency_hist, cpu); + hist_reset(hist); + } + break; +#endif + } + + return size; +} + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static ssize_t +latency_hist_show_pid(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%u\n", wakeup_pid); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t 
+latency_hist_pid(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + unsigned long pid; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = '\0'; + + if (strict_strtoul(buf, 10, &pid)) + return(-EINVAL); + + wakeup_pid = pid; + return cnt; +} + +static ssize_t +latency_hist_show_maxlatproc(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[1024]; + int r; + struct maxlatproc_data *mp = (struct maxlatproc_data *) + filp->private_data; + + r = snprintf(buf, sizeof(buf), "%5d %3d %ld %s\n", + mp->pid, mp->prio, mp->latency, mp->comm); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start); +static DEFINE_PER_CPU(int, hist_irqsoff_counting); +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start); +static DEFINE_PER_CPU(int, hist_preemptoff_counting); +#endif +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start); +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting); +#endif + +static ssize_t +latency_hist_show_enable(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + struct enable_data *ed = (struct enable_data *) filp->private_data; + int r; + + r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +latency_hist_enable(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + long enable; + struct enable_data *ed = (struct enable_data *) filp->private_data; + + if (cnt >= sizeof(buf)) + return 
-EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strict_strtol(buf, 10, &enable)) + return(-EINVAL); + + if ((enable && ed->enabled) || (!enable && !ed->enabled)) + return cnt; + + if (enable) { + int ret; + + switch (ed->latency_type) { + case WAKEUP_LATENCY: + ret = register_trace_sched_wakeup( + probe_wakeup_latency_hist_start); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_wakeup_latency_hist_start " + "to trace_sched_wakeup\n"); + return ret; + } + ret = register_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_wakeup_latency_hist_start " + "to trace_sched_wakeup_new\n"); + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start); + return ret; + } + ret = register_trace_sched_switch( + probe_wakeup_latency_hist_stop); + if (ret) { /* roll back both wakeup probes registered above */ + pr_info("wakeup trace: Couldn't assign " + "probe_wakeup_latency_hist_stop " + "to trace_sched_switch\n"); + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start); + unregister_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start); + return ret; + } + break; + case PREEMPTIRQSOFF_LATENCY: + ret = register_trace_preemptirqsoff_hist( + probe_preemptirqsoff_hist); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_preemptirqsoff_hist " + "to trace_preemptirqsoff_hist\n"); + return ret; + } + break; + default: + break; + } + } else { + int cpu; + + switch (ed->latency_type) { + case WAKEUP_LATENCY: + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start); + unregister_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start); + unregister_trace_sched_switch( + probe_wakeup_latency_hist_stop); + wakeup_task = NULL; + wakeup_prio = (unsigned)-1; + break; + case PREEMPTIRQSOFF_LATENCY: + unregister_trace_preemptirqsoff_hist( + probe_preemptirqsoff_hist); + for_each_online_cpu(cpu) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + per_cpu(hist_irqsoff_counting, cpu) = 0; +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST + per_cpu(hist_preemptoff_counting, cpu) = 0; +#endif +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + per_cpu(hist_preemptirqsoff_counting, cpu)
= 0; +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST + per_cpu(hist_preemptoff_counting, cpu) = 0; +#endif +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + per_cpu(hist_preemptirqsoff_counting, cpu) = 0; +#endif + } + break; + default: + break; + } + } + ed->enabled = enable; + return cnt; +} + +static struct file_operations latency_hist_reset_fops = { + .open = tracing_open_generic, + .write = latency_hist_reset, +}; + +static struct file_operations latency_hist_pid_fops = { + .open = tracing_open_generic, + .read = latency_hist_show_pid, + .write = latency_hist_pid, +}; + +static struct file_operations latency_hist_maxlatproc_fops = { + .open = tracing_open_generic, + .read = latency_hist_show_maxlatproc, +}; + +static struct file_operations latency_hist_enable_fops = { + .open = tracing_open_generic, + .read = latency_hist_show_enable, + .write = latency_hist_enable, +}; + +notrace void probe_preemptirqsoff_hist(int reason, int starthist) +{ + int cpu = raw_smp_processor_id(); + int time_set = 0; + + if (starthist) { + cycle_t uninitialized_var(start); + + if (!preempt_count() && !irqs_disabled()) + return; + +#ifdef CONFIG_INTERRUPT_OFF_HIST + if ((reason == IRQS_OFF || reason == TRACE_START) && + !per_cpu(hist_irqsoff_counting, cpu)) { + per_cpu(hist_irqsoff_counting, cpu) = 1; + start = ftrace_now(cpu); + time_set++; + per_cpu(hist_irqsoff_start, cpu) = start; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + if ((reason == PREEMPT_OFF || reason == TRACE_START) && + !per_cpu(hist_preemptoff_counting, cpu)) { + per_cpu(hist_preemptoff_counting, cpu) = 1; + if (!(time_set++)) + start = ftrace_now(cpu); + per_cpu(hist_preemptoff_start, cpu) = start; + } +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + if (per_cpu(hist_irqsoff_counting, cpu) && + per_cpu(hist_preemptoff_counting, cpu) && + !per_cpu(hist_preemptirqsoff_counting, cpu)) { + per_cpu(hist_preemptirqsoff_counting, cpu) = 1; + if (!time_set) + 
start = ftrace_now(cpu); + per_cpu(hist_preemptirqsoff_start, cpu) = start; + } +#endif + } else { + cycle_t uninitialized_var(stop); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + if ((reason == IRQS_ON || reason == TRACE_STOP) && + per_cpu(hist_irqsoff_counting, cpu)) { + cycle_t start = per_cpu(hist_irqsoff_start, cpu); + + stop = ftrace_now(cpu); + time_set++; + if (start && stop >= start) { + unsigned long latency = + nsecs_to_usecs(stop - start); + + latency_hist(IRQSOFF_LATENCY, cpu, latency, + NULL); + } + per_cpu(hist_irqsoff_counting, cpu) = 0; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + if ((reason == PREEMPT_ON || reason == TRACE_STOP) && + per_cpu(hist_preemptoff_counting, cpu)) { + cycle_t start = per_cpu(hist_preemptoff_start, cpu); + + if (!(time_set++)) + stop = ftrace_now(cpu); + if (start && stop >= start) { + unsigned long latency = + nsecs_to_usecs(stop - start); + + latency_hist(PREEMPTOFF_LATENCY, cpu, latency, + NULL); + } + per_cpu(hist_preemptoff_counting, cpu) = 0; + } +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + if ((!per_cpu(hist_irqsoff_counting, cpu) || + !per_cpu(hist_preemptoff_counting, cpu)) && + per_cpu(hist_preemptirqsoff_counting, cpu)) { + cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu); + + if (!time_set) + stop = ftrace_now(cpu); + if (start && stop >= start) { + unsigned long latency = + nsecs_to_usecs(stop - start); + latency_hist(PREEMPTIRQSOFF_LATENCY, cpu, + latency, NULL); + } + per_cpu(hist_preemptirqsoff_counting, cpu) = 0; + } +#endif + } +} + +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static cycle_t wakeup_start; +static DEFINE_ATOMIC_SPINLOCK(wakeup_lock); + +notrace void probe_wakeup_latency_hist_start(struct rq *rq, + struct task_struct *p, int success) +{ + unsigned long flags; + struct task_struct *curr = rq_curr(rq); + + if (wakeup_pid) { + if (likely(wakeup_pid != task_pid_nr(p))) + return; + } else { + if (likely(!rt_task(p)) || + p->prio >= wakeup_prio || + 
p->prio >= curr->prio) + return; + } + + atomic_spin_lock_irqsave(&wakeup_lock, flags); + if (wakeup_task) + put_task_struct(wakeup_task); + + get_task_struct(p); + wakeup_task = p; + wakeup_prio = p->prio; + wakeup_start = ftrace_now(raw_smp_processor_id()); + atomic_spin_unlock_irqrestore(&wakeup_lock, flags); +} + +notrace void probe_wakeup_latency_hist_stop(struct rq *rq, + struct task_struct *prev, struct task_struct *next) +{ + unsigned long flags; + int cpu; + unsigned long latency; + cycle_t stop; + + if (next != wakeup_task) + return; + + cpu = raw_smp_processor_id(); + stop = ftrace_now(cpu); + + atomic_spin_lock_irqsave(&wakeup_lock, flags); + if (next != wakeup_task) + goto out; + + latency = nsecs_to_usecs(stop - wakeup_start); + latency_hist(WAKEUP_LATENCY, cpu, latency, next); + + put_task_struct(wakeup_task); + wakeup_task = NULL; + wakeup_prio = (unsigned)-1; +out: + atomic_spin_unlock_irqrestore(&wakeup_lock, flags); +} + +#endif + +static __init int latency_hist_init(void) +{ + struct dentry *latency_hist_root = NULL; + struct dentry *dentry; + struct dentry *entry; + struct dentry *latency_hist_enable_root; + int i = 0, len = 0; + struct hist_data *my_hist; + char name[64]; + char *cpufmt = "CPU%d"; + + dentry = tracing_init_dentry(); + + latency_hist_root = + debugfs_create_dir(latency_hist_dir_root, dentry); + + latency_hist_enable_root = + debugfs_create_dir("enable", latency_hist_root); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root); + for_each_possible_cpu(i) { + len = sprintf(name, cpufmt, i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(irqsoff_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(irqsoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)IRQSOFF_LATENCY, + &latency_hist_reset_fops); +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + dentry 
= debugfs_create_dir(preemptoff_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + len = sprintf(name, cpufmt, i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preemptoff_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(preemptoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)PREEMPTOFF_LATENCY, + &latency_hist_reset_fops); +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + dentry = debugfs_create_dir(preemptirqsoff_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + len = sprintf(name, cpufmt, i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preemptirqsoff_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(preemptirqsoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)PREEMPTIRQSOFF_LATENCY, + &latency_hist_reset_fops); +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) + entry = debugfs_create_file("preemptirqsoff", 0644, + latency_hist_enable_root, + (void *)&preemptirqsoff_enabled_data, + &latency_hist_enable_fops); +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + dentry = debugfs_create_dir(wakeup_latency_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + len = sprintf(name, cpufmt, i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(wakeup_latency_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(wakeup_latency_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + + len = sprintf(name, "max_latency-CPU%d", i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(wakeup_maxlatproc, i), + &latency_hist_maxlatproc_fops); + } + entry = debugfs_create_file("pid", 0644, dentry, + (void 
*)&wakeup_pid, + &latency_hist_pid_fops); + entry = debugfs_create_file("reset", 0644, dentry, + (void *)WAKEUP_LATENCY, + &latency_hist_reset_fops); + entry = debugfs_create_file("wakeup", 0644, + latency_hist_enable_root, + (void *)&wakeup_latency_enabled_data, + &latency_hist_enable_fops); +#endif + return 0; +} + +__initcall(latency_hist_init); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a330513..f780675 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -218,17 +218,12 @@ enum { static inline int rb_null_event(struct ring_buffer_event *event) { - return event->type_len == RINGBUF_TYPE_PADDING - && event->time_delta == 0; -} - -static inline int rb_discarded_event(struct ring_buffer_event *event) -{ - return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta; + return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { + /* padding has a NULL time_delta */ event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } @@ -322,6 +317,14 @@ struct buffer_data_page { unsigned char data[]; /* data of buffer page */ }; +/* + * Note, the buffer_page list must be first. The buffer pages + * are allocated in cache lines, which means that each buffer + * page will be at the beginning of a cache line, and thus + * the least significant bits will be zero. We use this to + * add flags in the list struct pointers, to make the ring buffer + * lockless. + */ struct buffer_page { struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ @@ -330,6 +333,21 @@ struct buffer_page { struct buffer_data_page *page; /* Actual data page */ }; +/* + * The buffer page counters, write and entries, must be reset + * atomically when crossing page boundaries. To synchronize this + * update, two counters are inserted into the number. One is + * the actual counter for the write position or count on the page. 
+ * + * The other is a counter of updaters. Before an update happens + * the update partition of the counter is incremented. This will + * allow the updater to update the counter atomically. + * + * The counter is 20 bits, and the state data is 12. + */ +#define RB_WRITE_MASK 0xfffff +#define RB_WRITE_INTCNT (1 << 20) + static void rb_init_page(struct buffer_data_page *bpage) { local_set(&bpage->commit, 0); @@ -403,21 +421,20 @@ int ring_buffer_print_page_header(struct trace_seq *s) struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; - spinlock_t reader_lock; /* serialize readers */ + atomic_spinlock_t reader_lock; /* serialize readers */ raw_spinlock_t lock; struct lock_class_key lock_key; - struct list_head pages; + struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; - unsigned long nmi_dropped; - unsigned long commit_overrun; - unsigned long overrun; - unsigned long read; + local_t commit_overrun; + local_t overrun; local_t entries; local_t committing; local_t commits; + unsigned long read; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -450,14 +467,19 @@ struct ring_buffer_iter { }; /* buffer may be either ring_buffer or ring_buffer_per_cpu */ -#define RB_WARN_ON(buffer, cond) \ - ({ \ - int _____ret = unlikely(cond); \ - if (_____ret) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - } \ - _____ret; \ +#define RB_WARN_ON(b, cond) \ + ({ \ + int _____ret = unlikely(cond); \ + if (_____ret) { \ + if (__same_type(*(b), struct ring_buffer_per_cpu)) { \ + struct ring_buffer_per_cpu *__b = \ + (void *)b; \ + atomic_inc(&__b->buffer->record_disabled); \ + } else \ + atomic_inc(&b->record_disabled); \ + WARN_ON(1); \ + } \ + _____ret; \ }) /* Up this if you want to test the TIME_EXTENTS and normalization */ @@ -489,6 +511,390 @@ void 
ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); +/* + * Making the ring buffer lockless makes things tricky. + * Although writes only happen on the CPU that they are on, + * and they only need to worry about interrupts. Reads can + * happen on any CPU. + * + * The reader page is always off the ring buffer, but when the + * reader finishes with a page, it needs to swap its page with + * a new one from the buffer. The reader needs to take from + * the head (writes go to the tail). But if a writer is in overwrite + * mode and wraps, it must push the head page forward. + * + * Here lies the problem. + * + * The reader must be careful to replace only the head page, and + * not another one. As described at the top of the file in the + * ASCII art, the reader sets its old page to point to the next + * page after head. It then sets the page after head to point to + * the old reader page. But if the writer moves the head page + * during this operation, the reader could end up with the tail. + * + * We use cmpxchg to help prevent this race. We also do something + * special with the page before head. We set the LSB to 1. + * + * When the writer must push the page forward, it will clear the + * bit that points to the head page, move the head, and then set + * the bit that points to the new head page. + * + * We also don't want an interrupt coming in and moving the head + * page on another writer. Thus we use the second LSB to catch + * that too. 
Thus: + * + * head->list->prev->next bit 1 bit 0 + * ------- ------- + * Normal page 0 0 + * Points to head page 0 1 + * New head page 1 0 + * + * Note we can not trust the prev pointer of the head page, because: + * + * +----+ +-----+ +-----+ + * | |------>| T |---X--->| N | + * | |<------| | | | + * +----+ +-----+ +-----+ + * ^ ^ | + * | +-----+ | | + * +----------| R |----------+ | + * | |<-----------+ + * +-----+ + * + * Key: ---X--> HEAD flag set in pointer + * T Tail page + * R Reader page + * N Next page + * + * (see __rb_reserve_next() to see where this happens) + * + * What the above shows is that the reader just swapped out + * the reader page with a page in the buffer, but before it + * could make the new header point back to the new page added + * it was preempted by a writer. The writer moved forward onto + * the new page added by the reader and is about to move forward + * again. + * + * You can see, it is legitimate for the previous pointer of + * the head (or any page) not to point back to itself. But only + * temporarily. + */ + +#define RB_PAGE_NORMAL 0UL +#define RB_PAGE_HEAD 1UL +#define RB_PAGE_UPDATE 2UL + + +#define RB_FLAG_MASK 3UL + +/* PAGE_MOVED is not part of the mask */ +#define RB_PAGE_MOVED 4UL + +/* + * rb_list_head - remove any bit + */ +static struct list_head *rb_list_head(struct list_head *list) +{ + unsigned long val = (unsigned long)list; + + return (struct list_head *)(val & ~RB_FLAG_MASK); +} + +/* + * rb_is_head_page - test if the given page is the head page + * + * Because the reader may move the head_page pointer, we can + * not trust what the head page is (it may be pointing to + * the reader page). But if the next page is a header page, + * its flags will be non zero.
+ */ +static int inline +rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *page, struct list_head *list) +{ + unsigned long val; + + val = (unsigned long)list->next; + + if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) + return RB_PAGE_MOVED; + + return val & RB_FLAG_MASK; +} + +/* + * rb_is_reader_page + * + * The unique thing about the reader page, is that, if the + * writer is ever on it, the previous pointer never points + * back to the reader page. + */ +static int rb_is_reader_page(struct buffer_page *page) +{ + struct list_head *list = page->list.prev; + + return rb_list_head(list->next) != &page->list; +} + +/* + * rb_set_list_to_head - set a list_head to be pointing to head. + */ +static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + unsigned long *ptr; + + ptr = (unsigned long *)&list->next; + *ptr |= RB_PAGE_HEAD; + *ptr &= ~RB_PAGE_UPDATE; +} + +/* + * rb_head_page_activate - sets up head page + */ +static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + + head = cpu_buffer->head_page; + if (!head) + return; + + /* + * Set the previous list pointer to have the HEAD flag. + */ + rb_set_list_to_head(cpu_buffer, head->list.prev); +} + +static void rb_list_head_clear(struct list_head *list) +{ + unsigned long *ptr = (unsigned long *)&list->next; + + *ptr &= ~RB_FLAG_MASK; +} + +/* + * rb_head_page_deactivate - clears head page ptr (for free list) + */ +static void +rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *hd; + + /* Go through the whole list and clear any pointers found.
*/ + rb_list_head_clear(cpu_buffer->pages); + + list_for_each(hd, cpu_buffer->pages) + rb_list_head_clear(hd); +} + +static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag, int new_flag) +{ + struct list_head *list; + unsigned long val = (unsigned long)&head->list; + unsigned long ret; + + list = &prev->list; + + val &= ~RB_FLAG_MASK; + + ret = cmpxchg((unsigned long *)&list->next, + val | old_flag, val | new_flag); + + /* check if the reader took the page */ + if ((ret & ~RB_FLAG_MASK) != val) + return RB_PAGE_MOVED; + + return ret & RB_FLAG_MASK; +} + +static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_UPDATE); +} + +static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_HEAD); +} + +static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_NORMAL); +} + +static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.next); + + *bpage = list_entry(p, struct buffer_page, list); +} + +static struct buffer_page * +rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + struct buffer_page *page; + struct list_head *list; + int i; + + if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) + return NULL; + + /* sanity check */ + list = cpu_buffer->pages; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) + return NULL; + + page = head = cpu_buffer->head_page; + 
/* + * It is possible that the writer moves the header behind + * where we started, and we miss in one loop. + * A second loop should grab the header, but we'll do + * three loops just because I'm paranoid. + */ + for (i = 0; i < 3; i++) { + do { + if (rb_is_head_page(cpu_buffer, page, page->list.prev)) { + cpu_buffer->head_page = page; + return page; + } + rb_inc_page(cpu_buffer, &page); + } while (page != head); + } + + RB_WARN_ON(cpu_buffer, 1); + + return NULL; +} + +static int rb_head_page_replace(struct buffer_page *old, + struct buffer_page *new) +{ + unsigned long *ptr = (unsigned long *)&old->list.prev->next; + unsigned long val; + unsigned long ret; + + val = *ptr & ~RB_FLAG_MASK; + val |= RB_PAGE_HEAD; + + ret = cmpxchg(ptr, val, (unsigned long)&new->list); + + return ret == val; +} + +/* + * rb_tail_page_update - move the tail page forward + * + * Returns 1 if moved tail page, 0 if someone else did. + */ +static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *old_tail; + unsigned long old_entries; + unsigned long old_write; + int ret = 0; + + /* + * The tail page now needs to be moved forward. + * + * We need to reset the tail page, but without messing + * with possible erasing of data brought in by interrupts + * that have moved the tail page and are currently on it. + * + * We add a counter to the write field to denote this. + */ + old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); + old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); + + /* + * Just make sure we have seen our old_write and synchronize + * with any interrupts that come in. + */ + barrier(); + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. 
+ */ + if (tail_page == cpu_buffer->tail_page) { + /* Zero the write counter */ + unsigned long val = old_write & ~RB_WRITE_MASK; + unsigned long eval = old_entries & ~RB_WRITE_MASK; + + /* + * This will only succeed if an interrupt did + * not come in and change it. In which case, we + * do not want to modify it. + * + * We add (void) to let the compiler know that we do not care + * about the return value of these functions. We use the + * cmpxchg to only update if an interrupt did not already + * do it for us. If the cmpxchg fails, we don't care. + */ + (void)local_cmpxchg(&next_page->write, old_write, val); + (void)local_cmpxchg(&next_page->entries, old_entries, eval); + + /* + * No need to worry about races with clearing out the commit. + * it only can increment when a commit takes place. But that + * only happens in the outer most nested commit. + */ + local_set(&next_page->page->commit, 0); + + old_tail = cmpxchg(&cpu_buffer->tail_page, + tail_page, next_page); + + if (old_tail == tail_page) + ret = 1; + } + + return ret; +} + +static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage) +{ + unsigned long val = (unsigned long)bpage; + + if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK)) + return 1; + + return 0; +} + +/** + * rb_check_list - make sure a pointer to a list has the last bits zero + */ +static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev)) + return 1; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next)) + return 1; + return 0; +} + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -498,14 +904,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = &cpu_buffer->pages; + struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, 
*tmp; + rb_head_page_deactivate(cpu_buffer); + if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) return -1; if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) return -1; + if (rb_check_list(cpu_buffer, head)) + return -1; + list_for_each_entry_safe(bpage, tmp, head, list) { if (RB_WARN_ON(cpu_buffer, bpage->list.next->prev != &bpage->list)) @@ -513,25 +924,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, bpage->list.prev->next != &bpage->list)) return -1; + if (rb_check_list(cpu_buffer, &bpage->list)) + return -1; } + rb_head_page_activate(cpu_buffer); + return 0; } static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { - struct list_head *head = &cpu_buffer->pages; struct buffer_page *bpage, *tmp; unsigned long addr; LIST_HEAD(pages); unsigned i; + WARN_ON(!nr_pages); + for (i = 0; i < nr_pages; i++) { bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; + + rb_check_bpage(cpu_buffer, bpage); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); @@ -541,7 +960,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_init_page(bpage->page); } - list_splice(&pages, head); + /* + * The ring buffer page list is a circular list that does not + * start and end with a list head. All page list items point to + * other pages. 
+ */ + cpu_buffer->pages = pages.next; + list_del(&pages); rb_check_pages(cpu_buffer); @@ -570,16 +995,17 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; - spin_lock_init(&cpu_buffer->reader_lock); + atomic_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - INIT_LIST_HEAD(&cpu_buffer->pages); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!bpage) goto fail_free_buffer; + rb_check_bpage(cpu_buffer, bpage); + cpu_buffer->reader_page = bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) @@ -594,9 +1020,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) goto fail_free_reader; cpu_buffer->head_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + rb_head_page_activate(cpu_buffer); + return cpu_buffer; fail_free_reader: @@ -609,15 +1037,22 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = &cpu_buffer->pages; + struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; free_buffer_page(cpu_buffer->reader_page); - list_for_each_entry_safe(bpage, tmp, head, list) { - list_del_init(&bpage->list); + rb_head_page_deactivate(cpu_buffer); + + if (head) { + list_for_each_entry_safe(bpage, tmp, head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + bpage = list_entry(head, struct buffer_page, list); free_buffer_page(bpage); } + kfree(cpu_buffer); } @@ -760,15 +1195,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + 
rb_head_page_deactivate(cpu_buffer); + for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; - p = cpu_buffer->pages.next; + p = cpu_buffer->pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); free_buffer_page(bpage); } - if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; rb_reset_cpu(cpu_buffer); @@ -790,15 +1227,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + atomic_spin_lock_irq(&cpu_buffer->reader_lock); + rb_head_page_deactivate(cpu_buffer); + for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(pages))) return; p = pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); - list_add_tail(&bpage->list, &cpu_buffer->pages); + list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); + atomic_spin_unlock_irq(&cpu_buffer->reader_lock); rb_check_pages(cpu_buffer); @@ -949,21 +1390,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) } static inline struct ring_buffer_event * -rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) -{ - return __rb_page_index(cpu_buffer->head_page, - cpu_buffer->head_page->read); -} - -static inline struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { return __rb_page_index(iter->head_page, iter->head); } -static inline unsigned rb_page_write(struct buffer_page *bpage) +static inline unsigned long rb_page_write(struct buffer_page *bpage) { - return local_read(&bpage->write); + return local_read(&bpage->write) & RB_WRITE_MASK; } static inline unsigned rb_page_commit(struct buffer_page *bpage) @@ -971,6 +1405,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage) return local_read(&bpage->page->commit); } +static 
inline unsigned long rb_page_entries(struct buffer_page *bpage) +{ + return local_read(&bpage->entries) & RB_WRITE_MASK; +} + /* Size is determined by what has been commited */ static inline unsigned rb_page_size(struct buffer_page *bpage) { @@ -983,22 +1422,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->commit_page); } -static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) -{ - return rb_page_commit(cpu_buffer->head_page); -} - -static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **bpage) -{ - struct list_head *p = (*bpage)->list.next; - - if (p == &cpu_buffer->pages) - p = p->next; - - *bpage = list_entry(p, struct buffer_page, list); -} - static inline unsigned rb_event_index(struct ring_buffer_event *event) { @@ -1024,6 +1447,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { + unsigned long max_count; + /* * We only race with interrupts and NMIs on this CPU. * If we own the commit event, then we can commit @@ -1033,9 +1458,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. 
*/ again: + max_count = cpu_buffer->buffer->pages * 100; + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); cpu_buffer->write_stamp = cpu_buffer->commit_page->page->time_stamp; @@ -1044,8 +1476,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); barrier(); } @@ -1078,7 +1514,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) * to the head page instead of next. */ if (iter->head_page == cpu_buffer->reader_page) - iter->head_page = cpu_buffer->head_page; + iter->head_page = rb_set_head_page(cpu_buffer); else rb_inc_page(cpu_buffer, &iter->head_page); @@ -1122,6 +1558,163 @@ rb_update_event(struct ring_buffer_event *event, } } +/* + * rb_handle_head_page - writer hit the head page + * + * Returns: +1 to retry page + * 0 to continue + * -1 on error + */ +static int +rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *new_head; + int entries; + int type; + int ret; + + entries = rb_page_entries(next_page); + + /* + * The hard part is here. We need to move the head + * forward, and protect against both readers on + * other CPUs and writers coming in via interrupts. 
+ */ + type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, + RB_PAGE_HEAD); + + /* + * type can be one of four: + * NORMAL - an interrupt already moved it for us + * HEAD - we are the first to get here. + * UPDATE - we are the interrupt interrupting + * a current move. + * MOVED - a reader on another CPU moved the next + * pointer to its reader page. Give up + * and try again. + */ + + switch (type) { + case RB_PAGE_HEAD: + /* + * We changed the head to UPDATE, thus + * it is our responsibility to update + * the counters. + */ + local_add(entries, &cpu_buffer->overrun); + + /* + * The entries will be zeroed out when we move the + * tail page. + */ + + /* still more to do */ + break; + + case RB_PAGE_UPDATE: + /* + * This is an interrupt that interrupted the + * previous update. Still more to do. + */ + break; + case RB_PAGE_NORMAL: + /* + * An interrupt came in before the update + * and processed this for us. + * Nothing left to do. + */ + return 1; + case RB_PAGE_MOVED: + /* + * The reader is on another CPU and just did + * a swap with our next_page. + * Try again. + */ + return 1; + default: + RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ + return -1; + } + + /* + * Now that we are here, the old head pointer is + * set to UPDATE. This will keep the reader from + * swapping the head page with the reader page. + * The reader (on another CPU) will spin till + * we are finished. + * + * We just need to protect against interrupts + * doing the job. We will set the next pointer + * to HEAD. After that, we set the old pointer + * to NORMAL, but only if it was HEAD before. + * otherwise we are an interrupt, and only + * want the outer most commit to reset it. + */ + new_head = next_page; + rb_inc_page(cpu_buffer, &new_head); + + ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, + RB_PAGE_NORMAL); + + /* + * Valid returns are: + * HEAD - an interrupt came in and already set it. + * NORMAL - One of two things: + * 1) We really set it.
+ * 2) A bunch of interrupts came in and moved + * the page forward again. + */ + switch (ret) { + case RB_PAGE_HEAD: + case RB_PAGE_NORMAL: + /* OK */ + break; + default: + RB_WARN_ON(cpu_buffer, 1); + return -1; + } + + /* + * It is possible that an interrupt came in, + * set the head up, then more interrupts came in + * and moved it again. When we get back here, + * the page would have been set to NORMAL but we + * just set it back to HEAD. + * + * How do you detect this? Well, if that happened + * the tail page would have moved. + */ + if (ret == RB_PAGE_NORMAL) { + /* + * If the tail had moved past next, then we need + * to reset the pointer. + */ + if (cpu_buffer->tail_page != tail_page && + cpu_buffer->tail_page != next_page) + rb_head_page_set_normal(cpu_buffer, new_head, + next_page, + RB_PAGE_HEAD); + } + + /* + * If this was the outer most commit (the one that + * changed the original pointer from HEAD to UPDATE), + * then it is up to us to reset it to NORMAL. + */ + if (type == RB_PAGE_HEAD) { + ret = rb_head_page_set_normal(cpu_buffer, next_page, + tail_page, + RB_PAGE_UPDATE); + if (RB_WARN_ON(cpu_buffer, + ret != RB_PAGE_UPDATE)) + return -1; + } + + return 0; +} + static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ @@ -1185,9 +1778,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; - /* Account for this as an entry */ - local_inc(&tail_page->entries); - local_inc(&cpu_buffer->entries); /* Set write to end of buffer */ length = (tail + length) - BUF_PAGE_SIZE; @@ -1200,96 +1790,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *commit_page, struct buffer_page *tail_page, u64 *ts) { - struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; - bool lock_taken = false; - unsigned long flags; + struct
buffer_page *next_page; + int ret; next_page = tail_page; - local_irq_save(flags); - /* - * Since the write to the buffer is still not - * fully lockless, we must be careful with NMIs. - * The locks in the writers are taken when a write - * crosses to a new page. The locks protect against - * races with the readers (this will soon be fixed - * with a lockless solution). - * - * Because we can not protect against NMIs, and we - * want to keep traces reentrant, we need to manage - * what happens when we are in an NMI. - * - * NMIs can happen after we take the lock. - * If we are in an NMI, only take the lock - * if it is not already taken. Otherwise - * simply fail. - */ - if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) { - cpu_buffer->nmi_dropped++; - goto out_reset; - } - } else - __raw_spin_lock(&cpu_buffer->lock); - - lock_taken = true; - rb_inc_page(cpu_buffer, &next_page); - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; - - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_reset; - /* * If for some reason, we had an interrupt storm that made * it all the way around the buffer, bail, and warn * about it. */ if (unlikely(next_page == commit_page)) { - cpu_buffer->commit_overrun++; + local_inc(&cpu_buffer->commit_overrun); goto out_reset; } - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; - - /* tail_page has not moved yet? */ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - cpu_buffer->overrun += - local_read(&head_page->entries); + /* + * This is where the fun begins! + * + * We are fighting against races between a reader that + * could be on another CPU trying to swap its reader + * page with the buffer head. + * + * We are also fighting against interrupts coming in and + * moving the head or tail on us as well. 
+ * + * If the next page is the head page then we have filled + * the buffer, unless the commit page is still on the + * reader page. + */ + if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) { - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; + /* + * If the commit is not on the reader page, then + * move the header page. + */ + if (!rb_is_reader_page(cpu_buffer->commit_page)) { + /* + * If we are not in overwrite mode, + * this is easy, just stop here. + */ + if (!(buffer->flags & RB_FL_OVERWRITE)) + goto out_reset; + + ret = rb_handle_head_page(cpu_buffer, + tail_page, + next_page); + if (ret < 0) + goto out_reset; + if (ret) + goto out_again; + } else { + /* + * We need to be careful here too. The + * commit page could still be on the reader + * page. We could have a small buffer, and + * have filled up the buffer with events + * from interrupts and such, and wrapped. + * + * Note, if the tail page is also on the + * reader_page, we let it move out. + */ + if (unlikely((cpu_buffer->commit_page != + cpu_buffer->tail_page) && + (cpu_buffer->commit_page == + cpu_buffer->reader_page))) { + local_inc(&cpu_buffer->commit_overrun); + goto out_reset; + } } } - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer.
- */ - if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->entries, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; - - /* reread the time stamp */ + ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); + if (ret) { + /* + * Nested commits always have zero deltas, so + * just reread the time stamp + */ *ts = rb_time_stamp(buffer, cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; + next_page->page->time_stamp = *ts; } - rb_reset_tail(cpu_buffer, tail_page, tail, length); + out_again: - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + rb_reset_tail(cpu_buffer, tail_page, tail, length); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -1298,9 +1885,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* reset write */ rb_reset_tail(cpu_buffer, tail_page, tail, length); - if (likely(lock_taken)) - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); return NULL; } @@ -1317,6 +1901,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); + + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; tail = write - length; /* See if we shot pass the end of this buffer page */ @@ -1361,12 +1948,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, bpage = cpu_buffer->tail_page; if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; /* * This is on the tail page. It is possible that * a write could come in and move the tail page * and write to the next page. That is fine * because we just shorten what is on this page. 
*/ + old_index += write_mask; + new_index += write_mask; index = local_cmpxchg(&bpage->write, old_index, new_index); if (index == old_index) return 1; @@ -1482,7 +2073,8 @@ static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) } static struct ring_buffer_event * -rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, +rb_reserve_next_event(struct ring_buffer *buffer, + struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { struct ring_buffer_event *event; @@ -1492,6 +2084,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, rb_start_commit(cpu_buffer); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + /* + * Due to the ability to swap a cpu buffer from a buffer + * it is possible it was swapped before we committed. + * (committing stops a swap). We check for it here and + * if it happened, we have to fail the write. + */ + barrier(); + if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + local_dec(&cpu_buffer->committing); + local_dec(&cpu_buffer->commits); + return NULL; + } +#endif + length = rb_calculate_event_length(length); again: /* @@ -1652,7 +2259,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_MAX_DATA_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, length); + event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out; @@ -1675,18 +2282,23 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - local_inc(&cpu_buffer->entries); - /* * The event first in the commit queue updates the * time stamp. 
*/ if (rb_event_is_commit(cpu_buffer, event)) cpu_buffer->write_stamp += event->time_delta; +} +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); rb_end_commit(cpu_buffer); } @@ -1733,32 +2345,57 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -/** - * ring_buffer_event_discard - discard any event in the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. +/* + * Decrement the entries to the page that an event is on. + * The event does not even need to exist, only the pointer + * to the page it is on. This may only be called before the commit + * takes place. */ -void ring_buffer_event_discard(struct ring_buffer_event *event) +static inline void +rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) { - rb_event_discard(event); + unsigned long addr = (unsigned long)event; + struct buffer_page *bpage = cpu_buffer->commit_page; + struct buffer_page *start; + + addr &= PAGE_MASK; + + /* Do the likely case first */ + if (likely(bpage->page == (void *)addr)) { + local_dec(&bpage->entries); + return; + } + + /* + * Because the commit page may be on the reader page we + * start with the next page and check the end loop there. + */ + rb_inc_page(cpu_buffer, &bpage); + start = bpage; + do { + if (bpage->page == (void *)addr) { + local_dec(&bpage->entries); + return; + } + rb_inc_page(cpu_buffer, &bpage); + } while (bpage != start); + + /* commit not part of this buffer?? 
*/ + RB_WARN_ON(cpu_buffer, 1); } -EXPORT_SYMBOL_GPL(ring_buffer_event_discard); /** * ring_buffer_commit_discard - discard an event that has not been committed * @buffer: the ring buffer * @event: non committed event to discard * - * This is similar to ring_buffer_event_discard but must only be - * performed on an event that has not been committed yet. The difference - * is that this will also try to free the event from the ring buffer + * Sometimes an event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * This function only works if it is called before the item has been + * committed. It will try to free the event from the ring buffer * if another event has not been added behind it. * * If another event has been added behind it, it will set the event @@ -1786,14 +2423,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); + rb_decrement_entry(cpu_buffer, event); if (rb_try_to_discard(cpu_buffer, event)) goto out; /* * The commit is still visible by the reader, so we - * must increment entries. + * must still update the timestamp. 
*/ - local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); out: rb_end_commit(cpu_buffer); @@ -1854,7 +2492,7 @@ int ring_buffer_write(struct ring_buffer *buffer, if (length > BUF_MAX_DATA_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, length); + event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out; @@ -1875,9 +2513,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write); static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; - struct buffer_page *head = cpu_buffer->head_page; + struct buffer_page *head = rb_set_head_page(cpu_buffer); struct buffer_page *commit = cpu_buffer->commit_page; + /* In case of error, head will be NULL */ + if (unlikely(!head)) + return 1; + return reader->read == rb_page_commit(reader) && (commit == reader || (commit == head && @@ -1968,7 +2610,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) + ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) - cpu_buffer->read; return ret; @@ -1989,33 +2631,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->overrun; + ret = local_read(&cpu_buffer->overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** - * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped - * @buffer: The ring buffer - * @cpu: The per CPU buffer to get the number of overruns from - */ -unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; - - cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->nmi_dropped; - - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); - 
-/** * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from @@ -2030,7 +2652,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->commit_overrun; + ret = local_read(&cpu_buffer->commit_overrun); return ret; } @@ -2053,7 +2675,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += (local_read(&cpu_buffer->entries) - - cpu_buffer->overrun) - cpu_buffer->read; + local_read(&cpu_buffer->overrun)) - cpu_buffer->read; } return entries; @@ -2076,7 +2698,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - overruns += cpu_buffer->overrun; + overruns += local_read(&cpu_buffer->overrun); } return overruns; @@ -2089,8 +2711,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) /* Iterator usage is expected to have record disabled */ if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = cpu_buffer->head_page; - iter->head = cpu_buffer->head_page->read; + iter->head_page = rb_set_head_page(cpu_buffer); + if (unlikely(!iter->head_page)) + return; + iter->head = iter->head_page->read; } else { iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; @@ -2118,9 +2742,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + atomic_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + atomic_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); @@ -2207,6 +2831,7 @@ 
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) struct buffer_page *reader = NULL; unsigned long flags; int nr_loops = 0; + int ret; local_irq_save(flags); __raw_spin_lock(&cpu_buffer->lock); @@ -2240,30 +2865,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto out; /* - * Splice the empty reader page into the list around the head. * Reset the reader page to size zero. */ + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); - reader = cpu_buffer->head_page; + spin: + /* + * Splice the empty reader page into the list around the head. + */ + reader = rb_set_head_page(cpu_buffer); cpu_buffer->reader_page->list.next = reader->list.next; cpu_buffer->reader_page->list.prev = reader->list.prev; - local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); + /* + * cpu_buffer->pages just needs to point to the buffer, it + * has no specific buffer page to point to. Let's move it out + * of our way so we don't accidentally swap it. + */ + cpu_buffer->pages = reader->list.prev; - /* Make the reader page now replace the head */ - reader->list.prev->next = &cpu_buffer->reader_page->list; - reader->list.next->prev = &cpu_buffer->reader_page->list; + /* The reader page will be pointing to the new head */ + rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); /* - * If the tail is on the reader, then we must set the head - * to the inserted page, otherwise we set it one before. + * Here's the tricky part. + * + * We need to move the pointer past the header page. + * But we can only do that if a writer is not currently + * moving it. The page before the header page has the + * flag bit '1' set if it is pointing to the page we want. + * but if the writer is in the process of moving it + * then it will be '2' or already moved '0'. 
*/ - cpu_buffer->head_page = cpu_buffer->reader_page; - if (cpu_buffer->commit_page != reader) - rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + ret = rb_head_page_replace(reader, cpu_buffer->reader_page); + + /* + * If we did not convert it, then we must try again. + */ + if (!ret) + goto spin; + + /* + * Yeah! We succeeded in replacing the page. + * + * Now make the new head point back to the reader page. + */ + reader->list.next->prev = &cpu_buffer->reader_page->list; + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; @@ -2292,8 +2943,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX - || rb_discarded_event(event)) + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX) cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); @@ -2347,15 +2997,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) } static struct ring_buffer_event * -rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; struct buffer_page *reader; int nr_loops = 0; - cpu_buffer = buffer->buffers[cpu]; - again: /* * We repeat when a timestamp is encountered. 
It is possible @@ -2399,7 +3046,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(buffer, + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } return event; @@ -2517,18 +3164,16 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) again: local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(buffer, cpu, ts); + atomic_spin_lock(&cpu_buffer->reader_lock); + event = rb_buffer_peek(cpu_buffer, ts); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + atomic_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } @@ -2549,14 +3194,12 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) unsigned long flags; again: - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + atomic_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + atomic_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } @@ -2589,23 +3232,21 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + atomic_spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(buffer, cpu, ts); + event = rb_buffer_peek(cpu_buffer, ts); if (event) rb_advance_reader(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + 
atomic_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); out: preempt_enable(); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } @@ -2644,11 +3285,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + atomic_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); __raw_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); __raw_spin_unlock(&cpu_buffer->lock); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + atomic_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return iter; } @@ -2685,20 +3326,18 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; + atomic_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); again: - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); if (!event) goto out; + if (event->type_len == RINGBUF_TYPE_PADDING) + goto again; + rb_advance_iter(iter); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); - goto again; - } + atomic_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return event; } @@ -2717,8 +3356,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size); static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { + rb_head_page_deactivate(cpu_buffer); + cpu_buffer->head_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + = list_entry(cpu_buffer->pages, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); @@ -2734,16 +3375,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 
local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; - cpu_buffer->nmi_dropped = 0; - cpu_buffer->commit_overrun = 0; - cpu_buffer->overrun = 0; - cpu_buffer->read = 0; + local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); + cpu_buffer->read = 0; cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; + + rb_head_page_activate(cpu_buffer); } /** @@ -2761,7 +3403,10 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + atomic_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) + goto out; __raw_spin_lock(&cpu_buffer->lock); @@ -2769,7 +3414,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + atomic_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); } @@ -2807,10 +3453,10 @@ int ring_buffer_empty(struct ring_buffer *buffer) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + atomic_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + atomic_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (!ret) @@ -2841,16 +3487,17 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + atomic_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + atomic_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); return 
ret; } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers * @buffer_a: One buffer to swap with @@ -2905,20 +3552,28 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, atomic_inc(&cpu_buffer_a->record_disabled); atomic_inc(&cpu_buffer_b->record_disabled); + ret = -EBUSY; + if (local_read(&cpu_buffer_a->committing)) + goto out_dec; + if (local_read(&cpu_buffer_b->committing)) + goto out_dec; + buffer_a->buffers[cpu] = cpu_buffer_b; buffer_b->buffers[cpu] = cpu_buffer_a; cpu_buffer_b->buffer = buffer_a; cpu_buffer_a->buffer = buffer_b; + ret = 0; + +out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); - - ret = 0; out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); +#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */ /** * ring_buffer_alloc_read_page - allocate a page to read from buffer @@ -3030,7 +3685,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (!bpage) goto out; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + atomic_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) @@ -3091,7 +3746,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, read = 0; } else { /* update the entry counter */ - cpu_buffer->read += local_read(&reader->entries); + cpu_buffer->read += rb_page_entries(reader); /* swap the pages */ rb_init_page(bpage); @@ -3105,7 +3760,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, ret = read; out_unlock: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + atomic_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: return ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8c35839..ebd2924 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -43,14 +43,11 @@ #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) -unsigned long __read_mostly tracing_max_latency; -unsigned long 
__read_mostly tracing_thresh; - /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. */ -static int ring_buffer_expanded; +int ring_buffer_expanded; /* * We need to change this state when a selftest is running. @@ -64,7 +61,7 @@ static bool __read_mostly tracing_selftest_running; /* * If a tracer is running, we do not want to run SELFTEST. */ -static bool __read_mostly tracing_selftest_disabled; +bool __read_mostly tracing_selftest_disabled; /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { @@ -89,7 +86,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) */ static int tracing_disabled = 1; -static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); +DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { @@ -172,10 +169,11 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); -int filter_current_check_discard(struct ftrace_event_call *call, void *rec, +int filter_current_check_discard(struct ring_buffer *buffer, + struct ftrace_event_call *call, void *rec, struct ring_buffer_event *event) { - return filter_check_discard(call, rec, global_trace.buffer, event); + return filter_check_discard(call, rec, buffer, event); } EXPORT_SYMBOL_GPL(filter_current_check_discard); @@ -266,6 +264,9 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME; +static int trace_stop_count; +static DEFINE_ATOMIC_SPINLOCK(tracing_start_lock); + /** * trace_wake_up - wake up tasks waiting for trace input * @@ -274,6 +275,10 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | */ void trace_wake_up(void) { +#ifdef CONFIG_PREEMPT_RT + if (in_atomic() || irqs_disabled()) + return; +#endif /* * The runqueue_is_locked() can fail, but 
this is the best we * have for now: @@ -323,49 +328,130 @@ static const char *trace_options[] = { "printk-msg-only", "context-info", "latency-format", - "global-clock", "sleep-time", "graph-time", NULL }; +static struct { + u64 (*func)(void); + const char *name; +} trace_clocks[] = { + { trace_clock_local, "local" }, + { trace_clock_global, "global" }, +}; + +int trace_clock_id; + /* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. But the action of the swap - * needs its own lock. - * - * This is defined as a raw_spinlock_t in order to help - * with performance when lockdep debugging is enabled. + * trace_parser_get_init - gets the buffer for trace parser */ -static raw_spinlock_t ftrace_max_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +int trace_parser_get_init(struct trace_parser *parser, int size) +{ + memset(parser, 0, sizeof(*parser)); + + parser->buffer = kmalloc(size, GFP_KERNEL); + if (!parser->buffer) + return 1; + + parser->size = size; + return 0; +} /* - * Copy the new maximum trace into the separate maximum-trace - * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + * trace_parser_put - frees the buffer for trace parser */ -static void -__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +void trace_parser_put(struct trace_parser *parser) { - struct trace_array_cpu *data = tr->data[cpu]; + kfree(parser->buffer); +} - max_tr.cpu = cpu; - max_tr.time_start = data->preempt_timestamp; +/* + * trace_get_user - reads the user input string separated by space + * (matched by isspace(ch)) + * + * For each string found the 'struct trace_parser' is updated, + * and the function returns. + * + * Returns number of bytes read, or a negative error: -EINVAL when a token + * does not fit in the parser buffer, otherwise the get_user() failure code. + * + * See kernel/trace/trace.h for 'struct trace_parser' details. 
+ */ +int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char ch; + size_t read = 0; + ssize_t ret; - data = max_tr.data[cpu]; - data->saved_latency = tracing_max_latency; + if (!*ppos) + trace_parser_clear(parser); - memcpy(data->comm, tsk->comm, TASK_COMM_LEN); - data->pid = tsk->pid; - data->uid = task_uid(tsk); - data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; - data->policy = tsk->policy; - data->rt_priority = tsk->rt_priority; + ret = get_user(ch, ubuf++); + if (ret) + goto out; - /* record this tasks comm */ - tracing_record_cmdline(tsk); + read++; + cnt--; + + /* + * The parser is not finished with the last write, + * continue reading the user input without skipping spaces. + */ + if (!parser->cont) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* only spaces were written */ + if (isspace(ch)) { + *ppos += read; + ret = read; + goto out; + } + + parser->idx = 0; + } + + /* read the non-space input */ + while (cnt && !isspace(ch)) { + /* always leave room for the terminating NUL */ + if (parser->idx < parser->size - 1) + parser->buffer[parser->idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* We either got finished input or we have to wait for another call. */ + if (isspace(ch)) { + parser->buffer[parser->idx] = 0; + parser->cont = false; + } else if (parser->idx < parser->size - 1) { + parser->cont = true; + parser->buffer[parser->idx++] = ch; + } else { + ret = -EINVAL; + goto out; + } + + *ppos += read; + ret = read; + +out: + return ret; } ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) @@ -411,6 +492,56 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } +/* + * ftrace_max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. 
+ * + * This is defined as a raw_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + * + * It is also used in other places outside the update_max_tr + * so it needs to be defined outside of the + * CONFIG_TRACER_MAX_TRACE. + */ +static raw_spinlock_t ftrace_max_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +#ifdef CONFIG_TRACER_MAX_TRACE +unsigned long __read_mostly tracing_max_latency; +unsigned long __read_mostly tracing_thresh; + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + */ +static void +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + struct trace_array_cpu *max_data = tr->data[cpu]; + + max_tr.cpu = cpu; + max_tr.time_start = data->preempt_timestamp; + + max_data = max_tr.data[cpu]; + max_data->saved_latency = tracing_max_latency; + max_data->critical_start = data->critical_start; + max_data->critical_end = data->critical_end; + + memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + max_data->pid = tsk->pid; + max_data->uid = task_uid(tsk); + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + max_data->policy = tsk->policy; + max_data->rt_priority = tsk->rt_priority; + + /* record this task's comm */ + tracing_record_cmdline(tsk); +} + /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer @@ -425,16 +556,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct ring_buffer *buf = tr->buffer; + if (trace_stop_count) + return; + WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; max_tr.buffer = buf; - ftrace_disable_cpu(); - ring_buffer_reset(tr->buffer); - ftrace_enable_cpu(); - __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); } @@ -452,21 +582,35 @@ 
update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { int ret; + if (trace_stop_count) + return; + WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); ftrace_disable_cpu(); - ring_buffer_reset(max_tr.buffer); ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + if (ret == -EBUSY) { + /* + * We failed to swap the buffer due to a commit taking + * place on this CPU. We fail to record, but we reset + * the max trace buffer (no one writes directly to it) + * and flag that it failed. + */ + trace_array_printk(&max_tr, _THIS_IP_, + "Failed to swap buffers due to commit in progress\n"); + } + ftrace_enable_cpu(); - WARN_ON_ONCE(ret && ret != -EAGAIN); + WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); } +#endif /* CONFIG_TRACER_MAX_TRACE */ /** * register_tracer - register a tracer with the ftrace system. @@ -523,7 +667,6 @@ __acquires(kernel_lock) if (type->selftest && !tracing_selftest_disabled) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; - int i; /* * Run a selftest on this tracer. @@ -532,8 +675,7 @@ __acquires(kernel_lock) * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. 
*/ - for_each_tracing_cpu(i) - tracing_reset(tr, i); + tracing_reset_online_cpus(tr); current_trace = type; /* the test is responsible for initializing and enabling */ @@ -546,8 +688,7 @@ __acquires(kernel_lock) goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ - for_each_tracing_cpu(i) - tracing_reset(tr, i); + tracing_reset_online_cpus(tr); printk(KERN_CONT "PASSED\n"); } @@ -622,21 +763,42 @@ void unregister_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); } -void tracing_reset(struct trace_array *tr, int cpu) +static void __tracing_reset(struct trace_array *tr, int cpu) { ftrace_disable_cpu(); ring_buffer_reset_cpu(tr->buffer, cpu); ftrace_enable_cpu(); } +void tracing_reset(struct trace_array *tr, int cpu) +{ + struct ring_buffer *buffer = tr->buffer; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_sched(); + __tracing_reset(tr, cpu); + + ring_buffer_record_enable(buffer); +} + void tracing_reset_online_cpus(struct trace_array *tr) { + struct ring_buffer *buffer = tr->buffer; int cpu; + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_sched(); + tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr, cpu); + __tracing_reset(tr, cpu); + + ring_buffer_record_enable(buffer); } void tracing_reset_current(int cpu) @@ -667,8 +829,10 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } -static int trace_stop_count; -static DEFINE_SPINLOCK(tracing_start_lock); +int is_tracing_stopped(void) +{ + return trace_stop_count; +} /** * ftrace_off_permanent - disable all ftrace code permanently @@ -699,7 +863,7 @@ void tracing_start(void) if (tracing_disabled) return; - spin_lock_irqsave(&tracing_start_lock, flags); + atomic_spin_lock_irqsave(&tracing_start_lock, flags); if (--trace_stop_count) { if (trace_stop_count < 0) { /* Someone screwed up their debugging */ @@ -720,7 +884,7 @@ void tracing_start(void) 
ftrace_start(); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + atomic_spin_unlock_irqrestore(&tracing_start_lock, flags); } /** @@ -735,7 +899,7 @@ void tracing_stop(void) unsigned long flags; ftrace_stop(); - spin_lock_irqsave(&tracing_start_lock, flags); + atomic_spin_lock_irqsave(&tracing_start_lock, flags); if (trace_stop_count++) goto out; @@ -748,7 +912,7 @@ void tracing_stop(void) ring_buffer_record_disable(buffer); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + atomic_spin_unlock_irqrestore(&tracing_start_lock, flags); } void trace_stop_cmdline_recording(void); @@ -837,7 +1001,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->tgid = (tsk) ? tsk->tgid : 0; + entry->lock_depth = (tsk) ? tsk->lock_depth : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -850,14 +1014,15 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); -struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - int type, - unsigned long len, - unsigned long flags, int pc) +struct ring_buffer_event * +trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, int pc) { struct ring_buffer_event *event; - event = ring_buffer_lock_reserve(tr->buffer, len); + event = ring_buffer_lock_reserve(buffer, len); if (event != NULL) { struct trace_entry *ent = ring_buffer_event_data(event); @@ -867,58 +1032,60 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, return event; } -static void ftrace_trace_stack(struct trace_array *tr, - unsigned long flags, int skip, int pc); -static void ftrace_trace_userstack(struct trace_array *tr, - unsigned long flags, int pc); -static inline void __trace_buffer_unlock_commit(struct trace_array 
*tr, - struct ring_buffer_event *event, - unsigned long flags, int pc, - int wake) +static inline void +__trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + int wake) { - ring_buffer_unlock_commit(tr->buffer, event); + ring_buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, flags, 6, pc); - ftrace_trace_userstack(tr, flags, pc); + ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_userstack(buffer, flags, pc); if (wake) trace_wake_up(); } -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer_event *event, - unsigned long flags, int pc) +void trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) { - __trace_buffer_unlock_commit(tr, event, flags, pc, 1); + __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); } struct ring_buffer_event * -trace_current_buffer_lock_reserve(int type, unsigned long len, +trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, + int type, unsigned long len, unsigned long flags, int pc) { - return trace_buffer_lock_reserve(&global_trace, + *current_rb = global_trace.buffer; + return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); -void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, +void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, unsigned long flags, int pc) { - __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); + __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); } EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); -void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc) +void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) { - 
__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); + __trace_buffer_unlock_commit(buffer, event, flags, pc, 0); } EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); -void trace_current_buffer_discard_commit(struct ring_buffer_event *event) +void trace_current_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) { - ring_buffer_discard_commit(global_trace.buffer, event); + ring_buffer_discard_commit(buffer, event); } EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); @@ -928,6 +1095,7 @@ trace_function(struct trace_array *tr, int pc) { struct ftrace_event_call *call = &event_function; + struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -935,7 +1103,7 @@ trace_function(struct trace_array *tr, if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), + event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), flags, pc); if (!event) return; @@ -943,58 +1111,10 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); -} - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int __trace_graph_entry(struct trace_array *tr, - struct ftrace_graph_ent *trace, - unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_funcgraph_entry; - struct ring_buffer_event *event; - struct ftrace_graph_ent_entry *entry; - - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return 0; - - event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, - sizeof(*entry), flags, pc); - if (!event) - return 0; - entry = ring_buffer_event_data(event); - entry->graph_ent = *trace; - if (!filter_current_check_discard(call, entry, event)) - ring_buffer_unlock_commit(global_trace.buffer, event); - - return 1; + if 
(!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); } -static void __trace_graph_return(struct trace_array *tr, - struct ftrace_graph_ret *trace, - unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_funcgraph_exit; - struct ring_buffer_event *event; - struct ftrace_graph_ret_entry *entry; - - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return; - - event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->ret = *trace; - if (!filter_current_check_discard(call, entry, event)) - ring_buffer_unlock_commit(global_trace.buffer, event); -} -#endif - void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -1004,17 +1124,17 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, ip, parent_ip, flags, pc); } -static void __ftrace_trace_stack(struct trace_array *tr, +#ifdef CONFIG_STACKTRACE +static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc) { -#ifdef CONFIG_STACKTRACE struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; - event = trace_buffer_lock_reserve(tr, TRACE_STACK, + event = trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry), flags, pc); if (!event) return; @@ -1027,32 +1147,28 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); -#endif + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); } -static void ftrace_trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc) +void 
ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(tr, flags, skip, pc); + __ftrace_trace_stack(buffer, flags, skip, pc); } -void __trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc) +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc) { - __ftrace_trace_stack(tr, flags, skip, pc); + __ftrace_trace_stack(tr->buffer, flags, skip, pc); } -static void ftrace_trace_userstack(struct trace_array *tr, - unsigned long flags, int pc) +void +ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { -#ifdef CONFIG_STACKTRACE struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; @@ -1061,12 +1177,13 @@ static void ftrace_trace_userstack(struct trace_array *tr, if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; - event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); + entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); trace.nr_entries = 0; @@ -1075,9 +1192,8 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); -#endif + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); } #ifdef UNUSED @@ -1087,16 +1203,20 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) } #endif /* UNUSED */ +#endif /* CONFIG_STACKTRACE */ + static void ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { + struct ftrace_event_call *call = &event_special; struct 
ring_buffer_event *event; struct trace_array *tr = __tr; + struct ring_buffer *buffer = tr->buffer; struct special_entry *entry; - event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, + event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL, sizeof(*entry), 0, pc); if (!event) return; @@ -1104,7 +1224,9 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - trace_buffer_unlock_commit(tr, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void @@ -1115,62 +1237,6 @@ __trace_special(void *__tr, void *__data, } void -tracing_sched_switch_trace(struct trace_array *tr, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(tr, TRACE_CTX, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = prev->pid; - entry->prev_prio = prev->prio; - entry->prev_state = prev->state; - entry->next_pid = next->pid; - entry->next_prio = next->prio; - entry->next_state = next->state; - entry->next_cpu = task_cpu(next); - - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, flags, pc); -} - -void -tracing_sched_wakeup_trace(struct trace_array *tr, - struct task_struct *wakee, - struct task_struct *curr, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_wakeup; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(tr, TRACE_WAKE, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = curr->pid; - entry->prev_prio = curr->prio; - entry->prev_state = curr->state; - entry->next_pid = wakee->pid; - entry->next_prio = wakee->prio; - 
entry->next_state = wakee->state; - entry->next_cpu = task_cpu(wakee); - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - ftrace_trace_stack(tr, flags, 6, pc); - ftrace_trace_userstack(tr, flags, pc); -} - -void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { struct trace_array *tr = &global_trace; @@ -1194,68 +1260,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) local_irq_restore(flags); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -int trace_graph_entry(struct ftrace_graph_ent *trace) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int ret; - int cpu; - int pc; - - if (!ftrace_trace_task(current)) - return 0; - - if (!ftrace_graph_addr(trace->func)) - return 0; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else { - ret = 0; - } - /* Only do the atomic if it is not already set */ - if (!test_tsk_trace_graph(current)) - set_tsk_trace_graph(current); - - atomic_dec(&data->disabled); - local_irq_restore(flags); - - return ret; -} - -void trace_graph_return(struct ftrace_graph_ret *trace) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - if (!trace->depth) - clear_tsk_trace_graph(current); - atomic_dec(&data->disabled); - local_irq_restore(flags); -} -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - - /** * trace_vbprintk - write binary msg to tracing buffer * @@ -1268,6 
+1272,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; + struct ring_buffer *buffer; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; struct bprint_entry *entry; @@ -1300,7 +1305,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) goto out_unlock; size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -1308,8 +1315,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); @@ -1324,14 +1331,30 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) 
+{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} + +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; - struct trace_array *tr = &global_trace; + struct ring_buffer *buffer; struct trace_array_cpu *data; int cpu, len = 0, size, pc; struct print_entry *entry; @@ -1359,7 +1382,9 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) trace_buf[len] = 0; size = sizeof(*entry) + len + 1; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + irq_flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -1367,8 +1392,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); @@ -1380,6 +1405,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) return len; } + +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +{ + return trace_array_vprintk(&global_trace, ip, fmt, args); +} EXPORT_SYMBOL_GPL(trace_vprintk); enum trace_file_type { @@ -1519,6 +1549,37 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) return ent; } +static void tracing_iter_reset(struct trace_iterator *iter, int cpu) +{ + struct trace_array *tr = iter->tr; + struct ring_buffer_event *event; + struct ring_buffer_iter *buf_iter;
+ unsigned long entries = 0; + u64 ts; + + tr->data[cpu]->skipped_entries = 0; + + if (!iter->buffer_iter[cpu]) + return; + + buf_iter = iter->buffer_iter[cpu]; + ring_buffer_iter_reset(buf_iter); + + /* + * We could have the case with the max latency tracers + * that a reset never took place on a cpu. This is evident + * by the timestamp being before the start of the buffer. + */ + while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { + if (ts >= iter->tr->time_start) + break; + entries++; + ring_buffer_read(buf_iter, NULL); + } + + tr->data[cpu]->skipped_entries = entries; +} + /* * No necessary locking here. The worst thing which can * happen is loosing events consumed at the same time @@ -1557,10 +1618,9 @@ static void *s_start(struct seq_file *m, loff_t *pos) if (cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) - ring_buffer_iter_reset(iter->buffer_iter[cpu]); + tracing_iter_reset(iter, cpu); } else - ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); - + tracing_iter_reset(iter, cpu_file); ftrace_enable_cpu(); @@ -1589,10 +1649,10 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / \n"); - seq_puts(m, "# ||||| delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# |||| /_--=> lock-depth \n"); + seq_puts(m, "# |||||/ delay \n"); + seq_puts(m, "# cmd pid |||||| time | caller \n"); + seq_puts(m, "# \\ / |||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) @@ -1609,16 +1669,32 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long total; - unsigned long entries; + unsigned long entries = 0; + unsigned long total = 0; + unsigned long count; 
const char *name = "preemption"; + int cpu; if (type) name = type->name; - entries = ring_buffer_entries(iter->tr->buffer); - total = entries + - ring_buffer_overruns(iter->tr->buffer); + + for_each_tracing_cpu(cpu) { + count = ring_buffer_entries_cpu(tr->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (tr->data[cpu]->skipped_entries) { + count -= tr->data[cpu]->skipped_entries; + /* total is the same as the entries */ + total += count; + } else + total += count + + ring_buffer_overrun_cpu(tr->buffer, cpu); + entries += count; + } seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -1660,7 +1736,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "\n# => ended at: "); seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); trace_print_seq(m, &iter->seq); - seq_puts(m, "#\n"); + seq_puts(m, "\n#\n"); } seq_puts(m, "#\n"); @@ -1679,6 +1755,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (cpumask_test_cpu(iter->cpu, iter->started)) return; + if (iter->tr->data[iter->cpu]->skipped_entries) + return; + cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ @@ -1941,19 +2020,23 @@ __tracing_open(struct inode *inode, struct file *file) if (ring_buffer_overruns(iter->tr->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; + /* stop the trace while dumping */ + tracing_stop(); + if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = ring_buffer_read_start(iter->tr->buffer, cpu); + tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = ring_buffer_read_start(iter->tr->buffer, cpu); + tracing_iter_reset(iter, cpu); } - /* TODO stop tracer */ ret = seq_open(file, &tracer_seq_ops); if (ret < 0) { fail_ret = ERR_PTR(ret); @@ -1963,9 +2046,6 @@ 
__tracing_open(struct inode *inode, struct file *file) m = file->private_data; m->private = iter; - /* stop the trace while dumping */ - tracing_stop(); - mutex_unlock(&trace_types_lock); return iter; @@ -1976,6 +2056,7 @@ __tracing_open(struct inode *inode, struct file *file) ring_buffer_read_finish(iter->buffer_iter[cpu]); } free_cpumask_var(iter->started); + tracing_start(); fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); @@ -2257,8 +2338,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, len += 3; /* "no" and newline */ } - /* +2 for \n and \0 */ - buf = kmalloc(len + 2, GFP_KERNEL); + /* +1 for \0 */ + buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) { mutex_unlock(&trace_types_lock); return -ENOMEM; @@ -2281,7 +2362,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, } mutex_unlock(&trace_types_lock); - WARN_ON(r >= len + 2); + WARN_ON(r >= len + 1); r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -2292,23 +2373,23 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, /* Try to assign a tracer specific option */ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) { - struct tracer_flags *trace_flags = trace->flags; + struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; int ret = 0, i = 0; int len; - for (i = 0; trace_flags->opts[i].name; i++) { - opts = &trace_flags->opts[i]; + for (i = 0; tracer_flags->opts[i].name; i++) { + opts = &tracer_flags->opts[i]; len = strlen(opts->name); if (strncmp(cmp, opts->name, len) == 0) { - ret = trace->set_flag(trace_flags->val, + ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); break; } } /* Not found */ - if (!trace_flags->opts[i].name) + if (!tracer_flags->opts[i].name) return -EINVAL; /* Refused to handle */ @@ -2316,9 +2397,9 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) return ret; if (neg) - trace_flags->val &= ~opts->bit; + tracer_flags->val &= ~opts->bit; else - 
trace_flags->val |= opts->bit; + tracer_flags->val |= opts->bit; return 0; } @@ -2333,22 +2414,6 @@ static void set_tracer_flags(unsigned int mask, int enabled) trace_flags |= mask; else trace_flags &= ~mask; - - if (mask == TRACE_ITER_GLOBAL_CLK) { - u64 (*func)(void); - - if (enabled) - func = trace_clock_global; - else - func = trace_clock_local; - - mutex_lock(&trace_types_lock); - ring_buffer_set_clock(global_trace.buffer, func); - - if (max_tr.buffer) - ring_buffer_set_clock(max_tr.buffer, func); - mutex_unlock(&trace_types_lock); - } } static ssize_t @@ -3316,6 +3381,62 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return cnt; } +static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int bufiter = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) + bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, + "%s%s%s%s", i ? " " : "", + i == trace_clock_id ? "[" : "", trace_clocks[i].name, + i == trace_clock_id ? 
"]" : ""); + bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter); +} + +static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + char buf[64]; + const char *clockstr; + int i; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + clockstr = strstrip(buf); + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { + if (strcmp(trace_clocks[i].name, clockstr) == 0) + break; + } + if (i == ARRAY_SIZE(trace_clocks)) + return -EINVAL; + + trace_clock_id = i; + + mutex_lock(&trace_types_lock); + + ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); + if (max_tr.buffer) + ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); + + mutex_unlock(&trace_types_lock); + + *fpos += cnt; + + return cnt; +} + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -3353,6 +3474,12 @@ static const struct file_operations tracing_mark_fops = { .write = tracing_mark_write, }; +static const struct file_operations trace_clock_fops = { + .open = tracing_open_generic, + .read = tracing_clock_read, + .write = tracing_clock_write, +}; + struct ftrace_buffer_info { struct trace_array *tr; void *spare; @@ -3633,9 +3760,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); - cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); - trace_seq_printf(s, "nmi dropped: %ld\n", cnt); - count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); @@ -4066,11 +4190,13 @@ static __init int tracer_init_debugfs(void) trace_create_file("current_tracer", 0644, d_tracer, &global_trace, &set_tracer_fops); +#ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, 
&tracing_max_latency, &tracing_max_lat_fops); trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); +#endif trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); @@ -4087,6 +4213,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("saved_cmdlines", 0444, d_tracer, NULL, &tracing_saved_cmdlines_fops); + trace_create_file("trace_clock", 0644, d_tracer, NULL, + &trace_clock_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); @@ -4265,7 +4394,6 @@ void ftrace_dump(void) __init static int tracer_alloc_buffers(void) { - struct trace_array_cpu *data; int ring_buf_size; int i; int ret = -ENOMEM; @@ -4315,7 +4443,7 @@ __init static int tracer_alloc_buffers(void) /* Allocate the first page for all buffers */ for_each_tracing_cpu(i) { - data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); + global_trace.data[i] = &per_cpu(global_trace_cpu, i); max_tr.data[i] = &per_cpu(max_data, i); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8b9f4f6..86bcff9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -7,6 +7,7 @@ #include <linux/clocksource.h> #include <linux/ring_buffer.h> #include <linux/mmiotrace.h> +#include <linux/tracepoint.h> #include <linux/ftrace.h> #include <trace/boot.h> #include <linux/kmemtrace.h> @@ -34,8 +35,6 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, - TRACE_SYSCALL_ENTER, - TRACE_SYSCALL_EXIT, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_POWER, @@ -44,157 +43,54 @@ enum trace_type { __TRACE_LAST_TYPE, }; -/* - * Function trace entry - function address and parent function addres: - */ -struct ftrace_entry { - struct trace_entry ent; - unsigned long ip; - unsigned long parent_ip; -}; - -/* Function call entry */ -struct ftrace_graph_ent_entry { - struct trace_entry ent; - struct ftrace_graph_ent graph_ent; +enum kmemtrace_type_id { + 
KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ }; -/* Function return entry */ -struct ftrace_graph_ret_entry { - struct trace_entry ent; - struct ftrace_graph_ret ret; -}; extern struct tracer boot_tracer; -/* - * Context switch trace entry - which task (and prio) we switched from/to: - */ -struct ctx_switch_entry { - struct trace_entry ent; - unsigned int prev_pid; - unsigned char prev_prio; - unsigned char prev_state; - unsigned int next_pid; - unsigned char next_prio; - unsigned char next_state; - unsigned int next_cpu; -}; +#undef __field +#define __field(type, item) type item; -/* - * Special (free-form) trace entry: - */ -struct special_entry { - struct trace_entry ent; - unsigned long arg1; - unsigned long arg2; - unsigned long arg3; -}; +#undef __field_struct +#define __field_struct(type, item) __field(type, item) -/* - * Stack-trace entry: - */ +#undef __field_desc +#define __field_desc(type, container, item) -#define FTRACE_STACK_ENTRIES 8 +#undef __array +#define __array(type, item, size) type item[size]; -struct stack_entry { - struct trace_entry ent; - unsigned long caller[FTRACE_STACK_ENTRIES]; -}; +#undef __array_desc +#define __array_desc(type, container, item, size) -struct userstack_entry { - struct trace_entry ent; - unsigned long caller[FTRACE_STACK_ENTRIES]; -}; +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; -/* - * trace_printk entry: - */ -struct bprint_entry { - struct trace_entry ent; - unsigned long ip; - const char *fmt; - u32 buf[]; -}; +#undef F_STRUCT +#define F_STRUCT(args...) 
args -struct print_entry { - struct trace_entry ent; - unsigned long ip; - char buf[]; -}; - -#define TRACE_OLD_SIZE 88 - -struct trace_field_cont { - unsigned char type; - /* Temporary till we get rid of this completely */ - char buf[TRACE_OLD_SIZE - 1]; -}; - -struct trace_mmiotrace_rw { - struct trace_entry ent; - struct mmiotrace_rw rw; -}; - -struct trace_mmiotrace_map { - struct trace_entry ent; - struct mmiotrace_map map; -}; - -struct trace_boot_call { - struct trace_entry ent; - struct boot_trace_call boot_call; -}; - -struct trace_boot_ret { - struct trace_entry ent; - struct boot_trace_ret boot_ret; -}; - -#define TRACE_FUNC_SIZE 30 -#define TRACE_FILE_SIZE 20 -struct trace_branch { - struct trace_entry ent; - unsigned line; - char func[TRACE_FUNC_SIZE+1]; - char file[TRACE_FILE_SIZE+1]; - char correct; -}; - -struct hw_branch_entry { - struct trace_entry ent; - u64 from; - u64 to; -}; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ + } -struct trace_power { - struct trace_entry ent; - struct power_trace state_data; -}; +#undef TP_ARGS +#define TP_ARGS(args...) args -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. 
*/ -}; - -struct kmemtrace_alloc_entry { - struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; - size_t bytes_req; - size_t bytes_alloc; - gfp_t gfp_flags; - int node; -}; +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) -struct kmemtrace_free_entry { - struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; -}; +#include "trace_entries.h" +/* + * syscalls are special, and need special handling, this is why + * they are not included in trace_entries.h + */ struct syscall_trace_enter { struct trace_entry ent; int nr; @@ -207,13 +103,12 @@ struct syscall_trace_exit { unsigned long ret; }; - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: * IRQS_OFF - interrupts were disabled * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags - * NEED_RESCED - reschedule is requested + * NEED_RESCHED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler */ @@ -236,9 +131,6 @@ struct trace_array_cpu { atomic_t disabled; void *buffer_page; /* ring buffer spare */ - /* these fields get copied into max-trace: */ - unsigned long trace_idx; - unsigned long overrun; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -246,6 +138,7 @@ struct trace_array_cpu { unsigned long nice; unsigned long policy; unsigned long rt_priority; + unsigned long skipped_entries; cycle_t preempt_timestamp; pid_t pid; uid_t uid; @@ -319,10 +212,6 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ - IF_ASSIGN(var, ent, struct syscall_trace_enter, \ - TRACE_SYSCALL_ENTER); \ - IF_ASSIGN(var, ent, struct syscall_trace_exit, \ - TRACE_SYSCALL_EXIT); \ __ftrace_bad_type(); \ } while (0) @@ -398,7 +287,6 @@ struct tracer { struct tracer *next; int 
print_max; struct tracer_flags *flags; - struct tracer_stat *stats; }; @@ -423,12 +311,13 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; -struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - int type, - unsigned long len, - unsigned long flags, - int pc); -void trace_buffer_unlock_commit(struct trace_array *tr, +struct ring_buffer_event * +trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, + int pc); +void trace_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc); @@ -467,6 +356,7 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); +void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); @@ -475,35 +365,46 @@ void tracing_stop_sched_switch_record(void); void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); +int is_tracing_stopped(void); extern unsigned long nsecs_to_usecs(unsigned long nsecs); +#ifdef CONFIG_TRACER_MAX_TRACE extern unsigned long tracing_max_latency; extern unsigned long tracing_thresh; void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); +#endif /* CONFIG_TRACER_MAX_TRACE */ -void __trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc); +#ifdef CONFIG_STACKTRACE +void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc); -extern cycle_t ftrace_now(int cpu); +void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, + int pc); -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -typedef void -(*tracer_switch_func_t)(void *private, - void *__rq, - 
struct task_struct *prev, - struct task_struct *next); - -struct tracer_switch_ops { - tracer_switch_func_t func; - void *private; - struct tracer_switch_ops *next; -}; -#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc); +#else /* !CONFIG_STACKTRACE: no-op stubs */ +static inline void ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, int skip, int pc) +{ +} + +static inline void ftrace_trace_userstack(struct ring_buffer *buffer, + unsigned long flags, int pc) +{ +} + +static inline void __trace_stack(struct trace_array *tr, unsigned long flags, + int skip, int pc) +{ +} +#endif /* CONFIG_STACKTRACE */ + +extern cycle_t ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); @@ -513,6 +414,10 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif +extern int ring_buffer_expanded; +extern bool tracing_selftest_disabled; +DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); + #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); @@ -544,9 +449,16 @@ extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...); extern unsigned long trace_flags; +extern int trace_clock_id; + /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern enum print_line_t print_graph_function(struct trace_iterator *iter); @@ -609,6 +521,41 @@ static inline int ftrace_trace_task(struct task_struct *task) #endif /* + * struct trace_parser - serves for reading the user input separated by spaces + * @cont: set if the input is not complete - no final space char was found + * @buffer:
holds the parsed user input + * @idx: user input length + * @size: buffer size + */ +struct trace_parser { + bool cont; + char *buffer; + unsigned idx; + unsigned size; +}; + +static inline bool trace_parser_loaded(struct trace_parser *parser) +{ + return (parser->idx != 0); +} + +static inline bool trace_parser_cont(struct trace_parser *parser) +{ + return parser->cont; +} + +static inline void trace_parser_clear(struct trace_parser *parser) +{ + parser->cont = false; + parser->idx = 0; +} + +extern int trace_parser_get_init(struct trace_parser *parser, int size); +extern void trace_parser_put(struct trace_parser *parser); +extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos); + +/* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. * @@ -635,9 +582,8 @@ enum trace_iterator_flags { TRACE_ITER_PRINTK_MSGONLY = 0x10000, TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ TRACE_ITER_LATENCY_FMT = 0x40000, - TRACE_ITER_GLOBAL_CLK = 0x80000, - TRACE_ITER_SLEEP_TIME = 0x100000, - TRACE_ITER_GRAPH_TIME = 0x200000, + TRACE_ITER_SLEEP_TIME = 0x80000, + TRACE_ITER_GRAPH_TIME = 0x100000, }; /* @@ -734,6 +680,7 @@ struct ftrace_event_field { struct list_head link; char *name; char *type; + int filter_type; int offset; int size; int is_signed; @@ -743,13 +690,15 @@ struct event_filter { int n_preds; struct filter_pred **preds; char *filter_string; + bool no_reset; }; struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; - void *filter; + struct event_filter *filter; + int nr_events; }; struct filter_pred; @@ -777,6 +726,7 @@ extern int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); +extern int filter_assign_type(const char *type); static inline int filter_check_discard(struct
ftrace_event_call *call, void *rec, @@ -791,58 +741,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -#define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - int match = 0; \ - \ - switch (pred->op) { \ - case OP_LT: \ - match = (*addr < val); \ - break; \ - case OP_LE: \ - match = (*addr <= val); \ - break; \ - case OP_GT: \ - match = (*addr > val); \ - break; \ - case OP_GE: \ - match = (*addr >= val); \ - break; \ - default: \ - break; \ - } \ - \ - return match; \ -} - -#define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ -{ \ - u##size *addr = (u##size *)(event + pred->offset); \ - u##size val = (u##size)pred->val; \ - int match; \ - \ - match = (val == *addr) ^ pred->not; \ - \ - return match; \ -} - extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ extern struct ftrace_event_call event_##call; -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) -#include "trace_event_types.h" +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#include "trace_entries.h" #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index a29ef23..c21d5f3 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -41,14 +41,12 @@ void disable_boot_trace(void) static int 
boot_trace_init(struct trace_array *tr) { - int cpu; boot_trace = tr; if (!tr) return 0; - for_each_cpu(cpu, cpu_possible_mask) - tracing_reset(tr, cpu); + tracing_reset_online_cpus(tr); tracing_sched_switch_assign_trace(tr); return 0; @@ -131,7 +129,9 @@ struct tracer boot_tracer __read_mostly = void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { + struct ftrace_event_call *call = &event_boot_call; struct ring_buffer_event *event; + struct ring_buffer *buffer; struct trace_boot_call *entry; struct trace_array *tr = boot_trace; @@ -144,20 +144,24 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL, sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); entry->boot_call = *bt; - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { + struct ftrace_event_call *call = &event_boot_ret; struct ring_buffer_event *event; + struct ring_buffer *buffer; struct trace_boot_ret *entry; struct trace_array *tr = boot_trace; @@ -167,13 +171,15 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET, sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); entry->boot_ret = *bt; - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } diff --git a/kernel/trace/trace_entries.h 
b/kernel/trace/trace_entries.h new file mode 100644 index 0000000..a431748 --- /dev/null +++ b/kernel/trace/trace_entries.h @@ -0,0 +1,383 @@ +/* + * This file defines the trace event structures that go into the ring + * buffer directly. They are created via macros so that changes for them + * appear in the format file. Using macros will automate this process. + * + * The macro used to create an ftrace data structure is: + * + * FTRACE_ENTRY( name, struct_name, id, structure, print ) + * + * @name: the name used as the event name, as well as the name of + * the directory that holds the format file. + * + * @struct_name: the name of the structure that is created. + * + * @id: The event identifier that is used to detect what event + * this is from the ring buffer. + * + * @structure: the structure layout + * + * - __field( type, item ) + * This is equivalent to declaring + * type item; + * in the structure. + * - __array( type, item, size ) + * This is equivalent to declaring + * type item[size]; + * in the structure. + * + * * for structures within structures, the format of the internal + * structure is laid out. This allows the internal structure + * to be deciphered for the format file. Although these macros + * may become out of sync with the internal structure, they + * will create a compile error if it happens. Since the + * internal structures are just tracing helpers, this is not + * an issue. + * + * When an internal structure is used, it should use: + * + * __field_struct( type, item ) + * + * instead of __field. This will prevent it from being shown in + * the output file. The fields in the structure should use: + * + * __field_desc( type, container, item ) + * __array_desc( type, container, item, len ) + * + * type, item and len are the same as __field and __array, but + * container is added. This is the name of the item in + * __field_struct that this is describing. + * + * + * @print: the print format shown to users in the format file.
+ */ + +/* + * Function trace entry - function address and parent function address: + */ +FTRACE_ENTRY(function, ftrace_entry, + + TRACE_FN, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + ), + + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) +); + +/* Function call entry */ +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, + + TRACE_GRAPH_ENT, + + F_STRUCT( + __field_struct( struct ftrace_graph_ent, graph_ent ) + __field_desc( unsigned long, graph_ent, func ) + __field_desc( int, graph_ent, depth ) + ), + + F_printk("--> %lx (%d)", __entry->func, __entry->depth) +); + +/* Function return entry */ +FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_desc( unsigned long, ret, func ) + __field_desc( unsigned long long, ret, calltime) + __field_desc( unsigned long long, ret, rettime ) + __field_desc( unsigned long, ret, overrun ) + __field_desc( int, ret, depth ) + ), + + F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %lu", + __entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->overrun) +); + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + * + * This is used for both wakeup and context switches. We only want + * to create one structure, but we need two outputs for it.
+ */ +#define FTRACE_CTX_FIELDS \ + __field( unsigned int, prev_pid ) \ + __field( unsigned char, prev_prio ) \ + __field( unsigned char, prev_state ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned char, next_prio ) \ + __field( unsigned char, next_state ) \ + __field( unsigned int, next_cpu ) + +FTRACE_ENTRY(context_switch, ctx_switch_entry, + + TRACE_CTX, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * FTRACE_ENTRY_DUP only creates the format file, it will not + * create another structure. + */ +FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, + + TRACE_WAKE, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * Special (free-form) trace entry: + */ +FTRACE_ENTRY(special, special_entry, + + TRACE_SPECIAL, + + F_STRUCT( + __field( unsigned long, arg1 ) + __field( unsigned long, arg2 ) + __field( unsigned long, arg3 ) + ), + + F_printk("(%08lx) (%08lx) (%08lx)", + __entry->arg1, __entry->arg2, __entry->arg3) +); + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 8 + +FTRACE_ENTRY(kernel_stack, stack_entry, + + TRACE_STACK, + + F_STRUCT( + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +FTRACE_ENTRY(user_stack, userstack_entry, + + TRACE_USER_STACK, + + F_STRUCT( + __field( unsigned int, tgid ) + __array( unsigned long, caller, FTRACE_STACK_ENTRIES 
) + ), + + F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +/* + * trace_printk entry: + */ +FTRACE_ENTRY(bprint, bprint_entry, + + TRACE_BPRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, fmt ) + __dynamic_array( u32, buf ) + ), + + F_printk("%08lx fmt:%p", + __entry->ip, __entry->fmt) +); + +FTRACE_ENTRY(print, print_entry, + + TRACE_PRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __dynamic_array( char, buf ) + ), + + F_printk("%08lx %s", + __entry->ip, __entry->buf) +); + +FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, + + TRACE_MMIO_RW, + + F_STRUCT( + __field_struct( struct mmiotrace_rw, rw ) + __field_desc( resource_size_t, rw, phys ) + __field_desc( unsigned long, rw, value ) + __field_desc( unsigned long, rw, pc ) + __field_desc( int, rw, map_id ) + __field_desc( unsigned char, rw, opcode ) + __field_desc( unsigned char, rw, width ) + ), + + F_printk("%lx %lx %lx %d %x %x", + (unsigned long)__entry->phys, __entry->value, __entry->pc, + __entry->map_id, __entry->opcode, __entry->width) +); + +FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, + + TRACE_MMIO_MAP, + + F_STRUCT( + __field_struct( struct mmiotrace_map, map ) + __field_desc( resource_size_t, map, phys ) + __field_desc( unsigned long, map, virt ) + __field_desc( unsigned long, map, len ) + __field_desc( int, map, map_id ) + __field_desc( unsigned char, map, opcode ) + ), + + F_printk("%lx %lx %lx %d %x", + (unsigned long)__entry->phys, __entry->virt, __entry->len, + __entry->map_id, __entry->opcode) +); + +FTRACE_ENTRY(boot_call, trace_boot_call, + + TRACE_BOOT_CALL, + + F_STRUCT( + __field_struct( struct boot_trace_call, boot_call ) + __field_desc( pid_t, boot_call, caller ) + __array_desc( char, boot_call, func, 
KSYM_SYMBOL_LEN) + ), + + F_printk("%d %s", __entry->caller, __entry->func) +); + +FTRACE_ENTRY(boot_ret, trace_boot_ret, + + TRACE_BOOT_RET, + + F_STRUCT( + __field_struct( struct boot_trace_ret, boot_ret ) + __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) + __field_desc( int, boot_ret, result ) + __field_desc( unsigned long, boot_ret, duration ) + ), + + F_printk("%s %d %lx", + __entry->func, __entry->result, __entry->duration) +); + +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 + +FTRACE_ENTRY(branch, trace_branch, + + TRACE_BRANCH, + + F_STRUCT( + __field( unsigned int, line ) + __array( char, func, TRACE_FUNC_SIZE+1 ) + __array( char, file, TRACE_FILE_SIZE+1 ) + __field( char, correct ) + ), + + F_printk("%u:%s:%s (%u)", + __entry->line, + __entry->func, __entry->file, __entry->correct) +); + +FTRACE_ENTRY(hw_branch, hw_branch_entry, + + TRACE_HW_BRANCHES, + + F_STRUCT( + __field( u64, from ) + __field( u64, to ) + ), + + F_printk("from: %llx to: %llx", __entry->from, __entry->to) +); + +FTRACE_ENTRY(power, trace_power, + + TRACE_POWER, + + F_STRUCT( + __field_struct( struct power_trace, state_data ) + __field_desc( s64, state_data, stamp ) + __field_desc( s64, state_data, end ) + __field_desc( int, state_data, type ) + __field_desc( int, state_data, state ) + ), + + F_printk("%llx->%llx type:%u state:%u", + __entry->stamp, __entry->end, + __entry->type, __entry->state) +); + +FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, + + TRACE_KMEM_ALLOC, + + F_STRUCT( + __field( enum kmemtrace_type_id, type_id ) + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" + " flags:%x node:%d", + __entry->type_id, __entry->call_site, __entry->ptr, + __entry->bytes_req, __entry->bytes_alloc, + __entry->gfp_flags, __entry->node) +); + +FTRACE_ENTRY(kmem_free, 
kmemtrace_free_entry, + + TRACE_KMEM_FREE, + + F_STRUCT( + __field( enum kmemtrace_type_id, type_id ) + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + F_printk("type:%u call_site:%lx ptr:%p", + __entry->type_id, __entry->call_site, __entry->ptr) +); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 11ba5bb..55a25c9 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -5,6 +5,7 @@ * */ +#include <linux/module.h> #include "trace.h" int ftrace_profile_enable(int event_id) @@ -14,7 +15,8 @@ int ftrace_profile_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->profile_enable) { + if (event->id == event_id && event->profile_enable && + try_module_get(event->mod)) { ret = event->profile_enable(event); break; } @@ -32,6 +34,7 @@ void ftrace_profile_disable(int event_id) list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) { event->profile_disable(event); + module_put(event->mod); break; } } diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h deleted file mode 100644 index 6db005e..0000000 --- a/kernel/trace/trace_event_types.h +++ /dev/null @@ -1,178 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM ftrace - -/* - * We cheat and use the proto type field as the ID - * and args as the entry type (minus 'struct') - */ -TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(unsigned long, parent_ip, parent_ip) - ), - TP_RAW_FMT(" %lx <-- %lx") -); - -TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, - ftrace_graph_ent_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, graph_ent.func, func) - TRACE_FIELD(int, graph_ent.depth, depth) - ), - TP_RAW_FMT("--> %lx (%d)") -); - -TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, - ftrace_graph_ret_entry, 
ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ret.func, func) - TRACE_FIELD(unsigned long long, ret.calltime, calltime) - TRACE_FIELD(unsigned long long, ret.rettime, rettime) - TRACE_FIELD(unsigned long, ret.overrun, overrun) - TRACE_FIELD(int, ret.depth, depth) - ), - TP_RAW_FMT("<-- %lx (%d)") -); - -TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, prev_pid, prev_pid) - TRACE_FIELD(unsigned char, prev_prio, prev_prio) - TRACE_FIELD(unsigned char, prev_state, prev_state) - TRACE_FIELD(unsigned int, next_pid, next_pid) - TRACE_FIELD(unsigned char, next_prio, next_prio) - TRACE_FIELD(unsigned char, next_state, next_state) - TRACE_FIELD(unsigned int, next_cpu, next_cpu) - ), - TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, prev_pid, prev_pid) - TRACE_FIELD(unsigned char, prev_prio, prev_prio) - TRACE_FIELD(unsigned char, prev_state, prev_state) - TRACE_FIELD(unsigned int, next_pid, next_pid) - TRACE_FIELD(unsigned char, next_prio, next_prio) - TRACE_FIELD(unsigned char, next_state, next_state) - TRACE_FIELD(unsigned int, next_cpu, next_cpu) - ), - TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, arg1, arg1) - TRACE_FIELD(unsigned long, arg2, arg2) - TRACE_FIELD(unsigned long, arg3, arg3) - ), - TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") -); - -/* - * Stack-trace entry: - */ - -/* #define FTRACE_STACK_ENTRIES 8 */ - -TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, caller[0], stack0) - TRACE_FIELD(unsigned long, caller[1], stack1) - TRACE_FIELD(unsigned long, caller[2], stack2) - TRACE_FIELD(unsigned long, caller[3], stack3) - TRACE_FIELD(unsigned long, caller[4], stack4) - TRACE_FIELD(unsigned long, 
caller[5], stack5) - TRACE_FIELD(unsigned long, caller[6], stack6) - TRACE_FIELD(unsigned long, caller[7], stack7) - ), - TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, caller[0], stack0) - TRACE_FIELD(unsigned long, caller[1], stack1) - TRACE_FIELD(unsigned long, caller[2], stack2) - TRACE_FIELD(unsigned long, caller[3], stack3) - TRACE_FIELD(unsigned long, caller[4], stack4) - TRACE_FIELD(unsigned long, caller[5], stack5) - TRACE_FIELD(unsigned long, caller[6], stack6) - TRACE_FIELD(unsigned long, caller[7], stack7) - ), - TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(char *, fmt, fmt) - TRACE_FIELD_ZERO_CHAR(buf) - ), - TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD_ZERO_CHAR(buf) - ), - TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, line, line) - TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, - TRACE_FUNC_SIZE+1, func) - TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, - TRACE_FUNC_SIZE+1, file) - TRACE_FIELD(char, correct, correct) - ), - TP_RAW_FMT("%u:%s:%s (%u)") -); - -TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(u64, from, from) - TRACE_FIELD(u64, to, to) - ), - TP_RAW_FMT("from: %llx to: %llx") -); - -TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, - TRACE_STRUCT( - TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) - 
TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) - TRACE_FIELD(int, state_data.type, type) - TRACE_FIELD(int, state_data.state, state) - ), - TP_RAW_FMT("%llx->%llx type:%u state:%u") -); - -TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) - TRACE_FIELD(unsigned long, call_site, call_site) - TRACE_FIELD(const void *, ptr, ptr) - TRACE_FIELD(size_t, bytes_req, bytes_req) - TRACE_FIELD(size_t, bytes_alloc, bytes_alloc) - TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) - TRACE_FIELD(int, node, node) - ), - TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" - " flags:%x node:%d") -); - -TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) - TRACE_FIELD(unsigned long, call_site, call_site) - TRACE_FIELD(const void *, ptr, ptr) - ), - TP_RAW_FMT("type:%u call_site:%lx ptr:%p") -); - -#undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e75276a..787f0fb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -17,16 +17,20 @@ #include <linux/ctype.h> #include <linux/delay.h> +#include <asm/setup.h> + #include "trace_output.h" +#undef TRACE_SYSTEM #define TRACE_SYSTEM "TRACE_SYSTEM" DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); -int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size, int is_signed) +int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type) { struct ftrace_event_field *field; @@ -42,9 +46,15 @@ int trace_define_field(struct ftrace_event_call *call, char *type, if (!field->type) goto err; + if (filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(type); + else + field->filter_type = filter_type; + field->offset = offset; field->size = 
size; field->is_signed = is_signed; + list_add(&field->link, &call->fields); return 0; @@ -60,6 +70,29 @@ err: } EXPORT_SYMBOL_GPL(trace_define_field); +#define __common_field(type, item) \ + ret = trace_define_field(call, #type, "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +int trace_define_common_fields(struct ftrace_event_call *call) +{ + int ret; + struct trace_entry ent; + + __common_field(unsigned short, type); + __common_field(unsigned char, flags); + __common_field(unsigned char, preempt_count); + __common_field(int, pid); + __common_field(int, lock_depth); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_define_common_fields); + #ifdef CONFIG_MODULES static void trace_destroy_fields(struct ftrace_event_call *call) @@ -84,14 +117,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, if (call->enabled) { call->enabled = 0; tracing_stop_cmdline_record(); - call->unregfunc(); + call->unregfunc(call->data); } break; case 1: if (!call->enabled) { call->enabled = 1; tracing_start_cmdline_record(); - call->regfunc(); + call->regfunc(call->data); } break; } @@ -198,11 +231,9 @@ static ssize_t ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_parser parser; size_t read = 0; - int i, set = 1; ssize_t ret; - char *buf; - char ch; if (!cnt || cnt < 0) return 0; @@ -211,60 +242,28 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (ret < 0) return ret; - ret = get_user(ch, ubuf++); - if (ret) - return ret; - read++; - cnt--; - - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - return ret; - read++; - cnt--; - } - - /* Only white space found? 
*/ - if (isspace(ch)) { - file->f_pos += read; - ret = read; - return ret; - } - - buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL); - if (!buf) + if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) return -ENOMEM; - if (cnt > EVENT_BUF_SIZE) - cnt = EVENT_BUF_SIZE; + read = trace_get_user(&parser, ubuf, cnt, ppos); + + if (trace_parser_loaded((&parser))) { + int set = 1; - i = 0; - while (cnt && !isspace(ch)) { - if (!i && ch == '!') + if (*parser.buffer == '!') set = 0; - else - buf[i++] = ch; - ret = get_user(ch, ubuf++); + parser.buffer[parser.idx] = 0; + + ret = ftrace_set_clr_event(parser.buffer + !set, set); if (ret) - goto out_free; - read++; - cnt--; + goto out_put; } - buf[i] = 0; - - file->f_pos += read; - - ret = ftrace_set_clr_event(buf, set); - if (ret) - goto out_free; ret = read; - out_free: - kfree(buf); + out_put: + trace_parser_put(&parser); return ret; } @@ -546,7 +545,7 @@ static int trace_write_header(struct trace_seq *s) FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), - FIELD(int, tgid)); + FIELD(int, lock_depth)); } static ssize_t @@ -574,7 +573,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_printf(s, "format:\n"); trace_write_header(s); - r = call->show_format(s); + r = call->show_format(call, s); if (!r) { /* * ug! The format output is bigger than a PAGE!! 
@@ -849,8 +848,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) + if (strcmp(system->name, name) == 0) { + system->nr_events++; return system->entry; + } } /* need to create new entry */ @@ -869,6 +870,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return d_events; } + system->nr_events = 1; system->name = kstrdup(name, GFP_KERNEL); if (!system->name) { debugfs_remove(system->entry); @@ -920,15 +922,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, if (strcmp(call->system, TRACE_SYSTEM) != 0) d_events = event_subsystem_dir(call->system, d_events); - if (call->raw_init) { - ret = call->raw_init(); - if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", call->name); - return ret; - } - } - call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { pr_warning("Could not create debugfs " @@ -945,7 +938,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, id); if (call->define_fields) { - ret = call->define_fields(); + ret = call->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); @@ -987,6 +980,32 @@ struct ftrace_module_file_ops { struct file_operations filter; }; +static void remove_subsystem_dir(const char *name) +{ + struct event_subsystem *system; + + if (strcmp(name, TRACE_SYSTEM) == 0) + return; + + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) { + if (!--system->nr_events) { + struct event_filter *filter = system->filter; + + debugfs_remove_recursive(system->entry); + list_del(&system->list); + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); + } + break; + } + } +} + static struct ftrace_module_file_ops * 
trace_create_file_ops(struct module *mod) { @@ -1027,6 +1046,7 @@ static void trace_module_add_events(struct module *mod) struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; struct dentry *d_events; + int ret; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1042,7 +1062,15 @@ static void trace_module_add_events(struct module *mod) /* The linker may leave blanks */ if (!call->name) continue; - + if (call->raw_init) { + ret = call->raw_init(); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace " + "point events/%s\n", call->name); + continue; + } + } /* * This module has events, create file ops for this module * if not already done. @@ -1077,6 +1105,7 @@ static void trace_module_remove_events(struct module *mod) list_del(&call->list); trace_destroy_fields(call); destroy_preds(call); + remove_subsystem_dir(call->system); } } @@ -1125,7 +1154,7 @@ static int trace_module_notify(struct notifier_block *self, } #endif /* CONFIG_MODULES */ -struct notifier_block trace_module_nb = { +static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, }; @@ -1133,6 +1162,18 @@ struct notifier_block trace_module_nb = { extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; +static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; + +static __init int setup_trace_event(char *str) +{ + strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = 1; + tracing_selftest_disabled = 1; + + return 1; +} +__setup("trace_event=", setup_trace_event); + static __init int event_trace_init(void) { struct ftrace_event_call *call; @@ -1140,6 +1181,8 @@ static __init int event_trace_init(void) struct dentry *entry; struct dentry *d_events; int ret; + char *buf = bootup_event_buf; + char *token; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -1179,12 +1222,34 @@ static __init int 
event_trace_init(void) /* The linker may leave blanks */ if (!call->name) continue; + if (call->raw_init) { + ret = call->raw_init(); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace " + "point events/%s\n", call->name); + continue; + } + } list_add(&call->list, &ftrace_events); event_create_dir(call, d_events, &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, &ftrace_event_format_fops); } + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + ret = ftrace_set_clr_event(token, 1); + if (ret) + pr_warning("Failed to enable trace event: %s\n", token); + } + ret = register_module_notifier(&trace_module_nb); if (ret) pr_warning("Failed to register trace events module notifier\n"); @@ -1261,6 +1326,18 @@ static __init void event_trace_self_tests(void) if (!call->regfunc) continue; +/* + * Testing syscall events here is pretty useless, but + * we still do it if configured. But this is time consuming. + * What we really need is a user thread to perform the + * syscalls as we test. 
+ */ +#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS + if (call->system && + strcmp(call->system, "syscalls") == 0) + continue; +#endif + pr_info("Testing event %s: ", call->name); /* @@ -1340,6 +1417,7 @@ static void function_test_events_call(unsigned long ip, unsigned long parent_ip) { struct ring_buffer_event *event; + struct ring_buffer *buffer; struct ftrace_entry *entry; unsigned long flags; long disabled; @@ -1357,7 +1435,8 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), + event = trace_current_buffer_lock_reserve(&buffer, + TRACE_FN, sizeof(*entry), flags, pc); if (!event) goto out; @@ -1365,7 +1444,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) entry->ip = ip; entry->parent_ip = parent_ip; - trace_nowake_buffer_unlock_commit(event, flags, pc); + trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); out: atomic_dec(&per_cpu(test_event_disable, cpu)); @@ -1392,10 +1471,10 @@ static __init void event_trace_self_test_with_function(void) static __init int event_trace_self_tests_init(void) { - - event_trace_self_tests(); - - event_trace_self_test_with_function(); + if (!tracing_selftest_disabled) { + event_trace_self_tests(); + event_trace_self_test_with_function(); + } return 0; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f32dc9d..2324578 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -121,6 +121,47 @@ struct filter_parse_state { } operand; }; +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case 
OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + DEFINE_COMPARISON_PRED(s64); DEFINE_COMPARISON_PRED(u64); DEFINE_COMPARISON_PRED(s32); @@ -163,6 +204,20 @@ static int filter_pred_string(struct filter_pred *pred, void *event, return match; } +/* Filter predicate for char * pointers */ +static int filter_pred_pchar(struct filter_pred *pred, void *event, + int val1, int val2) +{ + char **addr = (char **)(event + pred->offset); + int cmp, match; + + cmp = strncmp(*addr, pred->str_val, pred->str_len); + + match = (!cmp) ^ pred->not; + + return match; +} + /* * Filter predicate for dynamic sized arrays of characters. 
* These are implemented through a list of strings at the end @@ -176,11 +231,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event, static int filter_pred_strloc(struct filter_pred *pred, void *event, int val1, int val2) { - unsigned short str_loc = *(unsigned short *)(event + pred->offset); + u32 str_item = *(u32 *)(event + pred->offset); + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; char *addr = (char *)(event + str_loc); int cmp, match; - cmp = strncmp(addr, pred->str_val, pred->str_len); + cmp = strncmp(addr, pred->str_val, str_len); match = (!cmp) ^ pred->not; @@ -293,7 +350,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) struct event_filter *filter = call->filter; mutex_lock(&event_mutex); - if (filter->filter_string) + if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_printf(s, "none\n"); @@ -306,7 +363,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, struct event_filter *filter = system->filter; mutex_lock(&event_mutex); - if (filter->filter_string) + if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_printf(s, "none\n"); @@ -374,6 +431,9 @@ void destroy_preds(struct ftrace_event_call *call) struct event_filter *filter = call->filter; int i; + if (!filter) + return; + for (i = 0; i < MAX_FILTER_PRED; i++) { if (filter->preds[i]) filter_free_pred(filter->preds[i]); @@ -384,17 +444,19 @@ void destroy_preds(struct ftrace_event_call *call) call->filter = NULL; } -int init_preds(struct ftrace_event_call *call) +static int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; struct filter_pred *pred; int i; + if (call->filter) + return 0; + filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (!call->filter) return -ENOMEM; - call->filter_active = 0; filter->n_preds = 0; filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), 
GFP_KERNEL); @@ -416,30 +478,55 @@ oom: return -ENOMEM; } -EXPORT_SYMBOL_GPL(init_preds); -static void filter_free_subsystem_preds(struct event_subsystem *system) +static int init_subsystem_preds(struct event_subsystem *system) { - struct event_filter *filter = system->filter; struct ftrace_event_call *call; - int i; + int err; - if (filter->n_preds) { - for (i = 0; i < filter->n_preds; i++) - filter_free_pred(filter->preds[i]); - kfree(filter->preds); - filter->preds = NULL; - filter->n_preds = 0; + list_for_each_entry(call, &ftrace_events, list) { + if (!call->define_fields) + continue; + + if (strcmp(call->system, system->name) != 0) + continue; + + err = init_preds(call); + if (err) + return err; } + return 0; +} + +enum { + FILTER_DISABLE_ALL, + FILTER_INIT_NO_RESET, + FILTER_SKIP_NO_RESET, +}; + +static void filter_free_subsystem_preds(struct event_subsystem *system, + int flag) +{ + struct ftrace_event_call *call; + list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; - if (!strcmp(call->system, system->name)) { - filter_disable_preds(call); - remove_filter_string(call->filter); + if (strcmp(call->system, system->name) != 0) + continue; + + if (flag == FILTER_INIT_NO_RESET) { + call->filter->no_reset = false; + continue; } + + if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset) + continue; + + filter_disable_preds(call); + remove_filter_string(call->filter); } } @@ -468,12 +555,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return 0; } -enum { - FILTER_STATIC_STRING = 1, - FILTER_DYN_STRING -}; - -static int is_string_field(const char *type) +int filter_assign_type(const char *type) { if (strstr(type, "__data_loc") && strstr(type, "char")) return FILTER_DYN_STRING; @@ -481,12 +563,19 @@ static int is_string_field(const char *type) if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; - return 0; + return FILTER_OTHER; +} + +static bool is_string_field(struct ftrace_event_field 
*field) +{ + return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_STATIC_STRING || + field->filter_type == FILTER_PTR_STRING; } static int is_legal_op(struct ftrace_event_field *field, int op) { - if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) + if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) return 0; return 1; @@ -537,22 +626,24 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, - struct filter_pred *pred) + struct filter_pred *pred, + bool dry_run) { struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; - int string_type; int ret; pred->fn = filter_pred_none; if (pred->op == OP_AND) { pred->pop_n = 2; - return filter_add_pred_fn(ps, call, pred, filter_pred_and); + fn = filter_pred_and; + goto add_pred_fn; } else if (pred->op == OP_OR) { pred->pop_n = 2; - return filter_add_pred_fn(ps, call, pred, filter_pred_or); + fn = filter_pred_or; + goto add_pred_fn; } field = find_event_field(call, pred->field_name); @@ -568,16 +659,17 @@ static int filter_add_pred(struct filter_parse_state *ps, return -EINVAL; } - string_type = is_string_field(field->type); - if (string_type) { - if (string_type == FILTER_STATIC_STRING) + if (is_string_field(field)) { + pred->str_len = field->size; + + if (field->filter_type == FILTER_STATIC_STRING) fn = filter_pred_string; - else + else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; - pred->str_len = field->size; - if (pred->op == OP_NE) - pred->not = 1; - return filter_add_pred_fn(ps, call, pred, fn); + else { + fn = filter_pred_pchar; + pred->str_len = strlen(pred->str_val); + } } else { if (field->is_signed) ret = strict_strtoll(pred->str_val, 0, &val); @@ -588,41 +680,33 @@ static int filter_add_pred(struct filter_parse_state *ps, return -EINVAL; } pred->val = val; - } - fn = select_comparison_fn(pred->op, field->size, 
field->is_signed); - if (!fn) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; + fn = select_comparison_fn(pred->op, field->size, + field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + return -EINVAL; + } } if (pred->op == OP_NE) pred->not = 1; - return filter_add_pred_fn(ps, call, pred, fn); +add_pred_fn: + if (!dry_run) + return filter_add_pred_fn(ps, call, pred, fn); + return 0; } static int filter_add_subsystem_pred(struct filter_parse_state *ps, struct event_subsystem *system, struct filter_pred *pred, - char *filter_string) + char *filter_string, + bool dry_run) { - struct event_filter *filter = system->filter; struct ftrace_event_call *call; int err = 0; - - if (!filter->preds) { - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), - GFP_KERNEL); - - if (!filter->preds) - return -ENOMEM; - } - - if (filter->n_preds == MAX_FILTER_PRED) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } + bool fail = true; list_for_each_entry(call, &ftrace_events, list) { @@ -632,19 +716,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, if (strcmp(call->system, system->name)) continue; - err = filter_add_pred(ps, call, pred); - if (err) { - filter_free_subsystem_preds(system); - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - goto out; - } - replace_filter_string(call->filter, filter_string); + if (call->filter->no_reset) + continue; + + err = filter_add_pred(ps, call, pred, dry_run); + if (err) + call->filter->no_reset = true; + else + fail = false; + + if (!dry_run) + replace_filter_string(call->filter, filter_string); } - filter->preds[filter->n_preds] = pred; - filter->n_preds++; -out: - return err; + if (fail) { + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return err; + } + return 0; } static void parse_init(struct filter_parse_state *ps, @@ -1003,12 +1092,14 @@ static int check_preds(struct filter_parse_state *ps) static int replace_preds(struct event_subsystem *system, struct 
ftrace_event_call *call, struct filter_parse_state *ps, - char *filter_string) + char *filter_string, + bool dry_run) { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; struct postfix_elt *elt; int err; + int n_preds = 0; err = check_preds(ps); if (err) @@ -1027,24 +1118,14 @@ static int replace_preds(struct event_subsystem *system, continue; } + if (n_preds++ == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); - if (!pred) - return -ENOMEM; - if (call) { - err = filter_add_pred(ps, call, pred); - filter_free_pred(pred); - } else { - err = filter_add_subsystem_pred(ps, system, - pred, filter_string); - if (err) - filter_free_pred(pred); - } - if (err) - return err; - - operand1 = operand2 = NULL; - continue; + goto add_pred; } if (!operand1 || !operand2) { @@ -1053,17 +1134,15 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); +add_pred: if (!pred) return -ENOMEM; - if (call) { - err = filter_add_pred(ps, call, pred); - filter_free_pred(pred); - } else { + if (call) + err = filter_add_pred(ps, call, pred, false); + else err = filter_add_subsystem_pred(ps, system, pred, - filter_string); - if (err) - filter_free_pred(pred); - } + filter_string, dry_run); + filter_free_pred(pred); if (err) return err; @@ -1081,6 +1160,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) mutex_lock(&event_mutex); + err = init_preds(call); + if (err) + goto out_unlock; + if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); remove_filter_string(call->filter); @@ -1103,7 +1186,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } - err = replace_preds(NULL, call, ps, filter_string); + err = replace_preds(NULL, call, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); @@ -1126,8 
+1209,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, mutex_lock(&event_mutex); + err = init_subsystem_preds(system); + if (err) + goto out_unlock; + if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system); + filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); remove_filter_string(system->filter); mutex_unlock(&event_mutex); return 0; @@ -1138,7 +1225,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - filter_free_subsystem_preds(system); replace_filter_string(system->filter, filter_string); parse_init(ps, filter_ops, filter_string); @@ -1148,9 +1234,23 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out; } - err = replace_preds(system, NULL, ps, filter_string); - if (err) + filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); + + /* try to see which events the filter can be applied to */ + err = replace_preds(system, NULL, ps, filter_string, true); + if (err) { append_filter_err(ps, system->filter); + goto out; + } + + filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); + + /* really apply the filter to the events */ + err = replace_preds(system, NULL, ps, filter_string, false); + if (err) { + append_filter_err(ps, system->filter); + filter_free_subsystem_preds(system, 2); + } out: filter_opstack_clear(ps); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d06cf89..9753fcc 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -15,116 +15,209 @@ #include "trace_output.h" +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ftrace -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...)
args +/* not needed for this file */ +#undef __field_struct +#define __field_struct(type, item) -extern void __bad_type_size(void); +#undef __field +#define __field(type, item) type item; -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - if (sizeof(type) != sizeof(field.item)) \ - __bad_type_size(); \ +#undef __field_desc +#define __field_desc(type, container, item) type item; + +#undef __array +#define __array(type, item, size) type item[size]; + +#undef __array_desc +#define __array_desc(type, container, item, size) type item[size]; + +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...) args + +#undef F_printk +#define F_printk(fmt, args...) fmt, args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __used ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force compile-time check on F_printk() */ \ + printk(print); \ +} + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +#include "trace_entries.h" + + +#undef __field +#define __field(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ if (!ret) \ return 0; +#undef __field_desc +#define __field_desc(type, container, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), container.item), \ + sizeof(field.container.item)); \ + if (!ret) \ + return 0; + +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ +
"offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (!ret) \ + return 0; -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ +#undef __array_desc +#define __array_desc(type, container, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), container.item), \ + sizeof(field.container.item)); \ if (!ret) \ return 0; -#undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) \ - ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ - "offset:%u;\tsize:0;\n", \ - (unsigned int)offsetof(typeof(field), item)); \ +#undef __dynamic_array +#define __dynamic_array(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%zu;\tsize:0;\n", \ + offsetof(typeof(field), item)); \ if (!ret) \ return 0; -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) +#undef F_printk +#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) -#undef TP_RAW_FMT -#define TP_RAW_FMT(args...) 
args +#undef __entry +#define __entry REC -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ static int \ -ftrace_format_##call(struct trace_seq *s) \ +ftrace_format_##name(struct ftrace_event_call *unused, \ + struct trace_seq *s) \ { \ - struct args field; \ - int ret; \ + struct struct_name field __attribute__((unused)); \ + int ret = 0; \ \ tstruct; \ \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + trace_seq_printf(s, "\nprint fmt: " print); \ \ return ret; \ } -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ +#include "trace_entries.h" + + +#undef __field +#define __field(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __field_desc +#define __field_desc(type, container, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), 0, FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __array_desc +#define __array_desc(type, container, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), 0, \ + FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __dynamic_array +#define __dynamic_array(type, item) + +#undef 
FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +int \ +ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ - struct args field; \ + struct struct_name field; \ int ret; \ \ - tstruct; \ + ret = trace_define_common_fields(event_call); \ + if (ret) \ + return ret; \ \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + tstruct; \ \ return ret; \ } -#include "trace_event_types.h" - -#undef TRACE_ZERO_CHAR -#define TRACE_ZERO_CHAR(arg) +#include "trace_entries.h" -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; +#undef __field +#define __field(type, item) -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) +#undef __field_desc +#define __field_desc(type, container, item) -#undef TP_CMD -#define TP_CMD(cmd...) cmd +#undef __array +#define __array(type, item, len) -#undef TRACE_ENTRY -#define TRACE_ENTRY entry +#undef __array_desc +#define __array_desc(type, container, item, len) -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - cmd; +#undef __dynamic_array +#define __dynamic_array(type, item) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int ftrace_define_fields_##call(void); \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ static int ftrace_raw_init_event_##call(void); \ \ struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ - .id = proto, \ + .id = type, \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event_##call, \ .show_format = ftrace_format_##call, \ @@ -133,74 +226,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ static int ftrace_raw_init_event_##call(void) \ { \ 
INIT_LIST_HEAD(&event_##call.fields); \ - init_preds(&event_##call); \ return 0; \ } \ -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) \ - \ -struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .id = proto, \ - .system = __stringify(TRACE_SYSTEM), \ - .show_format = ftrace_format_##call, \ -}; - -#include "trace_event_types.h" - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (ret) \ - return ret; - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), 0); \ - if (ret) \ - return ret; - -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed); \ - if (ret) \ - return ret; - -#undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int \ -ftrace_define_fields_##call(void) \ -{ \ - struct ftrace_event_call *event_call = &event_##call; \ - struct args field; \ - int ret; \ - \ - __common_field(unsigned char, type, 0); \ - __common_field(unsigned char, flags, 0); \ - __common_field(unsigned char, preempt_count, 0); \ - __common_field(int, pid, 1); \ - __common_field(int, tgid, 1); \ - \ - tstruct; \ - \ - return ret; \ -} - -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) - -#include "trace_event_types.h" +#include "trace_entries.h" diff --git a/kernel/trace/trace_functions.c 
b/kernel/trace/trace_functions.c index 75ef000..5b01b94 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -288,11 +288,9 @@ static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - char str[KSYM_SYMBOL_LEN]; long count = (long)data; - kallsyms_lookup(ip, NULL, NULL, NULL, str); - seq_printf(m, "%s:", str); + seq_printf(m, "%pf:", (void *)ip); if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 420ec34..79ed27c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -52,7 +52,7 @@ static struct tracer_flags tracer_flags = { .opts = trace_opts }; -/* pid on the last trace processed */ +static struct trace_array *graph_array; /* Add a function return address to the trace stack on thread info.*/ @@ -166,10 +166,123 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +static int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer = tr->buffer; + struct ftrace_graph_ent_entry *entry; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return 0; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, + sizeof(*entry), flags, pc); + if (!event) + return 0; + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + if (!filter_current_check_discard(buffer, call, entry, event)) + ring_buffer_unlock_commit(buffer, event); + + return 1; +} + +int trace_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int ret; + int cpu; + int pc; + + if (unlikely(!tr)) + return 0; + + if 
(!ftrace_trace_task(current)) + return 0; + + if (!ftrace_graph_addr(trace->func)) + return 0; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else { + ret = 0; + } + /* Only do the atomic if it is not already set */ + if (!test_tsk_trace_graph(current)) + set_tsk_trace_graph(current); + + atomic_dec(&data->disabled); + local_irq_restore(flags); + + return ret; +} + +static void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_exit; + struct ring_buffer_event *event; + struct ring_buffer *buffer = tr->buffer; + struct ftrace_graph_ret_entry *entry; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->ret = *trace; + if (!filter_current_check_discard(buffer, call, entry, event)) + ring_buffer_unlock_commit(buffer, event); +} + +void trace_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); + } + if (!trace->depth) + clear_tsk_trace_graph(current); + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static int graph_trace_init(struct trace_array *tr) { - int ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); + int ret; + + graph_array = tr; + ret = 
register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); if (ret) return ret; tracing_start_cmdline_record(); @@ -177,49 +290,30 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +void set_graph_array(struct trace_array *tr) +{ + graph_array = tr; +} + static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); unregister_ftrace_graph(); } -static inline int log10_cpu(int nb) -{ - if (nb / 100) - return 3; - if (nb / 10) - return 2; - return 1; -} +static int max_bytes_for_cpu; static enum print_line_t print_graph_cpu(struct trace_seq *s, int cpu) { - int i; int ret; - int log10_this = log10_cpu(cpu); - int log10_all = log10_cpu(cpumask_weight(cpu_online_mask)); - /* * Start with a space character - to make it stand out * to the right a bit when trace output is pasted into * email: */ - ret = trace_seq_printf(s, " "); - - /* - * Tricky - we space the CPU field according to the max - * number of online CPUs. On a 2-cpu system it would take - * a maximum of 1 digit - on a 128 cpu system it would - * take up to 3 digits: - */ - for (i = 0; i < log10_all - log10_this; i++) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = trace_seq_printf(s, "%d) ", cpu); + ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -270,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid) } +static enum print_line_t +print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ + if (!trace_seq_putc(s, ' ')) + return 0; + + return trace_print_lat_fmt(s, entry); +} + /* If the pid changed since the last trace, output this event */ static enum print_line_t verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) @@ -427,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } + /* Proc */ if (tracer_flags.val & 
TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, pid); @@ -470,7 +574,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) int ret, len; int i; - sprintf(msecs_str, "%lu", (unsigned long) duration); + snprintf(msecs_str, sizeof(msecs_str), "%lu", (unsigned long) duration); /* Print msecs */ ret = trace_seq_printf(s, "%s", msecs_str); @@ -565,11 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = seq_print_ip_sym(s, call->func, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "();\n"); + ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -612,11 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = seq_print_ip_sym(s, call->func, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "() {\n"); + ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -672,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + /* Latency format */ + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + ret = print_graph_lat_fmt(s, ent); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + return 0; } @@ -866,28 +969,59 @@ print_graph_function(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static void print_lat_header(struct seq_file *s) +{ + static const char spaces[] = " " /* 16 spaces */ + " " /* 4 spaces */ + " "; /* 17 spaces */ + int size = 0; + + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + size += 16; + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + size += 4; + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + size += 17; + + seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); + seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); + seq_printf(s, "#%.*s| / _---=> 
hardirq/softirq \n", size, spaces); + seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); + seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); + seq_printf(s, "#%.*s|||| / \n", size, spaces); +} + static void print_graph_headers(struct seq_file *s) { + int lat = trace_flags & TRACE_ITER_LATENCY_FMT; + + if (lat) + print_lat_header(s); + /* 1st line */ - seq_printf(s, "# "); + seq_printf(s, "#"); if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " TIME "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "CPU"); + seq_printf(s, " CPU"); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " TASK/PID "); + seq_printf(s, " TASK/PID "); + if (lat) + seq_printf(s, "|||||"); if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); /* 2nd line */ - seq_printf(s, "# "); + seq_printf(s, "#"); if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "| "); + seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " | | "); + seq_printf(s, " | | "); + if (lat) + seq_printf(s, "|||||"); if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); @@ -934,6 +1068,8 @@ static struct tracer graph_trace __read_mostly = { static __init int init_graph_trace(void) { + max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + return register_tracer(&graph_trace); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b923d13..b7a60f9 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -17,13 +17,14 @@ #include <linux/fs.h> #include "trace.h" +#include <trace/events/hist.h> static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; static DEFINE_PER_CPU(int, tracing_cpu); -static DEFINE_SPINLOCK(max_trace_lock); +static 
DEFINE_ATOMIC_SPINLOCK(max_trace_lock); enum { TRACER_IRQS_OFF = (1 << 1), @@ -129,15 +130,10 @@ check_critical_timing(struct trace_array *tr, unsigned long parent_ip, int cpu) { - unsigned long latency, t0, t1; cycle_t T0, T1, delta; unsigned long flags; int pc; - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; @@ -149,7 +145,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out; - spin_lock_irqsave(&max_trace_lock, flags); + atomic_spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ if (!report_latency(delta)) @@ -157,28 +153,24 @@ check_critical_timing(struct trace_array *tr, trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); - latency = nsecs_to_usecs(delta); - if (data->critical_sequence != max_sequence) goto out_unlock; - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - data->critical_end = parent_ip; - update_max_tr_single(tr, current, cpu); + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr_single(tr, current, cpu); + } max_sequence++; out_unlock: - spin_unlock_irqrestore(&max_trace_lock, flags); + atomic_spin_unlock_irqrestore(&max_trace_lock, flags); out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); - tracing_reset(tr, cpu); trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } @@ -208,7 +200,6 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); data->critical_start = parent_ip ? 
: ip; - tracing_reset(tr, cpu); local_save_flags(flags); @@ -257,11 +248,13 @@ void start_critical_timings(void) { if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(TRACE_START, 1); } EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { + trace_preemptirqsoff_hist(TRACE_STOP, 0); if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -271,6 +264,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_PROVE_LOCKING void time_hardirqs_on(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } @@ -279,6 +273,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) { if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } #else /* !CONFIG_PROVE_LOCKING */ @@ -312,6 +307,7 @@ inline void print_irqtrace_events(struct task_struct *curr) */ void trace_hardirqs_on(void) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -321,11 +317,13 @@ void trace_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off); void trace_hardirqs_on_caller(unsigned long caller_addr) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } @@ -335,6 +333,7 @@ void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -344,12 +343,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long 
a1) { + trace_preemptirqsoff_hist(PREEMPT_ON, 0); if (preempt_trace()) stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(PREEMPT_OFF, 1); if (preempt_trace()) start_critical_timing(a0, a1); } @@ -379,6 +380,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); + tracing_reset_online_cpus(tr); start_irqsoff_tracer(tr); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index d53b45e..0acd834 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -307,11 +307,13 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { + struct ftrace_event_call *call = &event_mmiotrace_rw; + struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; int pc = preempt_count(); - event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW, sizeof(*entry), 0, pc); if (!event) { atomic_inc(&dropped_count); @@ -319,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->rw = *rw; - trace_buffer_unlock_commit(tr, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -333,11 +337,13 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { + struct ftrace_event_call *call = &event_mmiotrace_map; + struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; int pc = preempt_count(); - event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP, sizeof(*entry), 0, pc); if (!event) { 
atomic_inc(&dropped_count); @@ -345,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->map = *map; - trace_buffer_unlock_commit(tr, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e0c2545..f572f44 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, * since individual threads might have already quit! */ rcu_read_lock(); - task = find_task_by_vpid(entry->ent.tgid); + task = find_task_by_vpid(entry->tgid); if (task) mm = get_task_mm(task); rcu_read_unlock(); @@ -460,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } -static int -lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +/** + * trace_print_lat_fmt - print the irq, preempt and lockdep fields + * @s: trace seq struct to write to + * @entry: The trace entry field from the ring buffer + * + * Prints the generic fields of irqs off, in hard or softirq, preempt + * count and lock depth. + */ +int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { int hardirq, softirq; - char comm[TASK_COMM_LEN]; + int ret; - trace_find_cmdline(entry->pid, comm); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", - comm, entry->pid, cpu, + if (!trace_seq_printf(s, "%c%c%c", (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', @@ -481,9 +486,30 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) hardirq ? 'h' : softirq ? 
's' : '.')) return 0; + if (entry->lock_depth < 0) + ret = trace_seq_putc(s, '.'); + else + ret = trace_seq_printf(s, "%d", entry->lock_depth); + if (!ret) + return 0; + if (entry->preempt_count) return trace_seq_printf(s, "%x", entry->preempt_count); - return trace_seq_puts(s, "."); + return trace_seq_putc(s, '.'); +} + +static int +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + if (!trace_seq_printf(s, "%8.8s-%-5d %3d", + comm, entry->pid, cpu)) + return 0; + + return trace_print_lat_fmt(s, entry); } static unsigned long preempt_mark_thresh = 100; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index d38bec4..9d91c72 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); +extern int +trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); /* used by module unregistering */ extern int __unregister_ftrace_event(struct trace_event *event); diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 8a30d98..fe1a00f 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -38,6 +38,7 @@ static void probe_power_end(struct power_trace *it) { struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; + struct ring_buffer *buffer; struct trace_power *entry; struct trace_array_cpu *data; struct trace_array *tr = power_trace; @@ -45,18 +46,20 @@ static void probe_power_end(struct power_trace *it) if (!trace_power_enabled) return; + buffer = tr->buffer; + preempt_disable(); it->end = ktime_get(); data = tr->data[smp_processor_id()]; - event = trace_buffer_lock_reserve(tr, TRACE_POWER, + event = trace_buffer_lock_reserve(buffer, TRACE_POWER, sizeof(*entry), 0, 0); if (!event) goto out; entry = 
ring_buffer_event_data(event); entry->state_data = *it; - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } @@ -66,6 +69,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, { struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; + struct ring_buffer *buffer; struct trace_power *entry; struct trace_array_cpu *data; struct trace_array *tr = power_trace; @@ -73,6 +77,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, if (!trace_power_enabled) return; + buffer = tr->buffer; + memset(it, 0, sizeof(struct power_trace)); it->state = level; it->type = type; @@ -81,14 +87,14 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, it->end = it->stamp; data = tr->data[smp_processor_id()]; - event = trace_buffer_lock_reserve(tr, TRACE_POWER, + event = trace_buffer_lock_reserve(buffer, TRACE_POWER, sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } @@ -144,14 +150,12 @@ static void power_trace_reset(struct trace_array *tr) static int power_trace_init(struct trace_array *tr) { - int cpu; power_trace = tr; trace_power_enabled = 1; tracing_power_register(); - for_each_cpu(cpu, cpu_possible_mask) - tracing_reset(tr, cpu); + tracing_reset_online_cpus(tr); return 0; } diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index a98106d..5fca0f5 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -20,6 +20,35 @@ static int sched_ref; static 
DEFINE_MUTEX(sched_register_mutex); static int sched_stopped; + +void +tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_context_switch; + struct ring_buffer *buffer = tr->buffer; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_CTX, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); +} + static void probe_sched_switch(struct rq *__rq, struct task_struct *prev, struct task_struct *next) @@ -49,6 +78,36 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, local_irq_restore(flags); } +void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + struct ring_buffer *buffer = tr->buffer; + + event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); + + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr->buffer, flags, 6, pc); + ftrace_trace_userstack(tr->buffer, flags, pc); 
+} + static void probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) { diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index eacb272..26185d7 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled; static struct task_struct *wakeup_task; static int wakeup_cpu; +static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; @@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); + if (cpu != wakeup_current_cpu) + goto out_enable; + data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); if (unlikely(disabled != 1)) goto out; local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); - - if (unlikely(!wakeup_task)) - goto unlock; - - /* - * The task can't disappear because it needs to - * wake up first, and we have the wakeup_lock. 
- */ - if (task_cpu(wakeup_task) != cpu) - goto unlock; trace_function(tr, ip, parent_ip, flags, pc); - unlock: - __raw_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: atomic_dec(&data->disabled); - + out_enable: ftrace_preempt_enable(resched); } @@ -107,11 +98,18 @@ static int report_latency(cycle_t delta) return 1; } +static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) +{ + if (task != wakeup_task) + return; + + wakeup_current_cpu = cpu; +} + static void notrace probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - unsigned long latency = 0, t0 = 0, t1 = 0; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; @@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; @@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (!report_latency(delta)) goto out_unlock; - latency = nsecs_to_usecs(delta); - - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - - update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + } out_unlock: __wakeup_reset(wakeup_trace); @@ -186,11 +177,6 @@ out: static void __wakeup_reset(struct trace_array *tr) { - int cpu; - - for_each_possible_cpu(cpu) - tracing_reset(tr, cpu); - wakeup_cpu = -1; wakeup_prio = -1; @@ -204,6 +190,8 @@ static void wakeup_reset(struct trace_array *tr) { unsigned long flags; + tracing_reset_online_cpus(tr); + local_irq_save(flags); __raw_spin_lock(&wakeup_lock); __wakeup_reset(tr); @@ -247,6 +235,7 @@ 
probe_wakeup(struct rq *rq, struct task_struct *p, int success) __wakeup_reset(wakeup_trace); wakeup_cpu = task_cpu(p); + wakeup_current_cpu = wakeup_cpu; wakeup_prio = p->prio; wakeup_task = p; @@ -296,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr) goto fail_deprobe_wake_new; } + ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_migrate_task\n"); + return; + } + wakeup_reset(tr); /* @@ -328,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr) unregister_trace_sched_switch(probe_wakeup_sched_switch); unregister_trace_sched_wakeup_new(probe_wakeup); unregister_trace_sched_wakeup(probe_wakeup); + unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); } static int __wakeup_tracer_init(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 00dd648..d2cdbab 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, * to detect and recover from possible hangs */ tracing_reset_online_cpus(tr); + set_graph_array(tr); ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry_watchdog); if (ret) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 6a2a9d4..0f6facb 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -186,43 +186,33 @@ static const struct file_operations stack_max_size_fops = { }; static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +__next(struct seq_file *m, loff_t *pos) { - long i; + long n = *pos - 1; - (*pos)++; - - if (v == SEQ_START_TOKEN) - i = 0; - else { - i = *(long *)v; - i++; - } - - if (i >= max_stack_trace.nr_entries || - stack_dump_trace[i] == ULONG_MAX) + if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; - m->private = (void *)i; - + 
m->private = (void *)n; return &m->private; } -static void *t_start(struct seq_file *m, loff_t *pos) +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) { - void *t = SEQ_START_TOKEN; - loff_t l = 0; + (*pos)++; + return __next(m, pos); +} +static void *t_start(struct seq_file *m, loff_t *pos) +{ local_irq_disable(); __raw_spin_lock(&max_stack_lock); if (*pos == 0) return SEQ_START_TOKEN; - for (; t && l < *pos; t = t_next(m, t, &l)) - ; - - return t; + return __next(m, pos); } static void t_stop(struct seq_file *m, void *p) @@ -234,15 +224,8 @@ static void t_stop(struct seq_file *m, void *p) static int trace_lookup_stack(struct seq_file *m, long i) { unsigned long addr = stack_dump_trace[i]; -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - - sprint_symbol(str, addr); - return seq_printf(m, "%s\n", str); -#else - return seq_printf(m, "%p\n", (void*)addr); -#endif + return seq_printf(m, "%pF\n", (void *)addr); } static void print_disabled(struct seq_file *m) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index aea321c..a4bb239 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -49,7 +49,8 @@ static struct dentry *stat_dir; * but it will at least advance closer to the next one * to be released. 
*/ -static struct rb_node *release_next(struct rb_node *node) +static struct rb_node *release_next(struct tracer_stat *ts, + struct rb_node *node) { struct stat_node *snode; struct rb_node *parent = rb_parent(node); @@ -67,6 +68,8 @@ static struct rb_node *release_next(struct rb_node *node) parent->rb_right = NULL; snode = container_of(node, struct stat_node, node); + if (ts->stat_release) + ts->stat_release(snode->stat); kfree(snode); return parent; @@ -78,7 +81,7 @@ static void __reset_stat_session(struct stat_session *session) struct rb_node *node = session->stat_root.rb_node; while (node) - node = release_next(node); + node = release_next(session->ts, node); session->stat_root = RB_ROOT; } @@ -200,17 +203,21 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) { struct stat_session *session = s->private; struct rb_node *node; + int n = *pos; int i; /* Prevent from tracer switch or rbtree modification */ mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && session->ts->stat_headers) - return SEQ_START_TOKEN; + if (session->ts->stat_headers) { + if (n == 0) + return SEQ_START_TOKEN; + n--; + } node = rb_first(&session->stat_root); - for (i = 0; node && i < *pos; i++) + for (i = 0; node && i < n; i++) node = rb_next(node); return node; diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index f3546a2..8f03914 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -18,6 +18,8 @@ struct tracer_stat { int (*stat_cmp)(void *p1, void *p2); /* Print a stat entry */ int (*stat_show)(struct seq_file *s, void *p); + /* Release an entry */ + void (*stat_release)(void *stat); /* Print the headers of your stat entries */ int (*stat_headers)(struct seq_file *s); }; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e57964..8712ce3 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,30 +1,18 @@ #include 
<trace/syscall.h> +#include <trace/events/syscalls.h> #include <linux/kernel.h> +#include <linux/ftrace.h> +#include <linux/perf_counter.h> #include <asm/syscall.h> #include "trace_output.h" #include "trace.h" -/* Keep a counter of the syscall tracing users */ -static int refcount; - -/* Prevent from races on thread flags toggling */ static DEFINE_MUTEX(syscall_trace_lock); - -/* Option to display the parameters types */ -enum { - TRACE_SYSCALLS_OPT_TYPES = 0x1, -}; - -static struct tracer_opt syscalls_opts[] = { - { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) }, - { } -}; - -static struct tracer_flags syscalls_flags = { - .val = 0, /* By default: no parameters types */ - .opts = syscalls_opts -}; +static int sys_refcount_enter; +static int sys_refcount_exit; +static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) @@ -35,35 +23,46 @@ print_syscall_enter(struct trace_iterator *iter, int flags) struct syscall_metadata *entry; int i, ret, syscall; - trace_assign_type(trace, ent); - + trace = (typeof(trace))ent; syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); + if (!entry) goto end; + if (entry->enter_id != ent->type) { + WARN_ON_ONCE(1); + goto end; + } + ret = trace_seq_printf(s, "%s(", entry->name); if (!ret) return TRACE_TYPE_PARTIAL_LINE; for (i = 0; i < entry->nb_args; i++) { /* parameter types */ - if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { + if (trace_flags & TRACE_ITER_VERBOSE) { ret = trace_seq_printf(s, "%s ", entry->types[i]); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* parameter values */ - ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], + ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], trace->args[i], - i == entry->nb_args - 1 ? ")" : ","); + i == entry->nb_args - 1 ? 
"" : ", "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + ret = trace_seq_putc(s, ')'); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + end: - trace_seq_printf(s, "\n"); + ret = trace_seq_putc(s, '\n'); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } @@ -77,16 +76,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags) struct syscall_metadata *entry; int ret; - trace_assign_type(trace, ent); - + trace = (typeof(trace))ent; syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); + if (!entry) { trace_seq_printf(s, "\n"); return TRACE_TYPE_HANDLED; } + if (entry->exit_id != ent->type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, trace->ret); if (!ret) @@ -95,62 +98,140 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -void start_ftrace_syscalls(void) +extern char *__bad_type_size(void); + +#define SYSCALL_FIELD(type, name) \ + sizeof(type) != sizeof(trace.name) ? 
\ + __bad_type_size() : \ + #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) + +int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) { - unsigned long flags; - struct task_struct *g, *t; + int i; + int nr; + int ret; + struct syscall_metadata *entry; + struct syscall_trace_enter trace; + int offset = offsetof(struct syscall_trace_enter, args); - mutex_lock(&syscall_trace_lock); + nr = syscall_name_to_nr(call->data); + entry = syscall_nr_to_meta(nr); - /* Don't enable the flag on the tasks twice */ - if (++refcount != 1) - goto unlock; + if (!entry) + return 0; - arch_init_ftrace_syscalls(); - read_lock_irqsave(&tasklist_lock, flags); + ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + SYSCALL_FIELD(int, nr)); + if (!ret) + return 0; - do_each_thread(g, t) { - set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); - } while_each_thread(g, t); + for (i = 0; i < entry->nb_args; i++) { + ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], + entry->args[i]); + if (!ret) + return 0; + ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, + sizeof(unsigned long)); + if (!ret) + return 0; + offset += sizeof(unsigned long); + } - read_unlock_irqrestore(&tasklist_lock, flags); + trace_seq_puts(s, "\nprint fmt: \""); + for (i = 0; i < entry->nb_args; i++) { + ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], + sizeof(unsigned long), + i == entry->nb_args - 1 ? 
"" : ", "); + if (!ret) + return 0; + } + trace_seq_putc(s, '"'); -unlock: - mutex_unlock(&syscall_trace_lock); + for (i = 0; i < entry->nb_args; i++) { + ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))", + entry->args[i]); + if (!ret) + return 0; + } + + return trace_seq_putc(s, '\n'); } -void stop_ftrace_syscalls(void) +int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) { - unsigned long flags; - struct task_struct *g, *t; + int ret; + struct syscall_trace_exit trace; - mutex_lock(&syscall_trace_lock); + ret = trace_seq_printf(s, + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + SYSCALL_FIELD(int, nr), + SYSCALL_FIELD(unsigned long, ret)); + if (!ret) + return 0; - /* There are perhaps still some users */ - if (--refcount) - goto unlock; + return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); +} - read_lock_irqsave(&tasklist_lock, flags); +int syscall_enter_define_fields(struct ftrace_event_call *call) +{ + struct syscall_trace_enter trace; + struct syscall_metadata *meta; + int ret; + int nr; + int i; + int offset = offsetof(typeof(trace), args); + + nr = syscall_name_to_nr(call->data); + meta = syscall_nr_to_meta(nr); + + if (!meta) + return 0; + + ret = trace_define_common_fields(call); + if (ret) + return ret; + + for (i = 0; i < meta->nb_args; i++) { + ret = trace_define_field(call, meta->types[i], + meta->args[i], offset, + sizeof(unsigned long), 0, + FILTER_OTHER); + offset += sizeof(unsigned long); + } - do_each_thread(g, t) { - clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); - } while_each_thread(g, t); + return ret; +} - read_unlock_irqrestore(&tasklist_lock, flags); +int syscall_exit_define_fields(struct ftrace_event_call *call) +{ + struct syscall_trace_exit trace; + int ret; -unlock: - mutex_unlock(&syscall_trace_lock); + ret = trace_define_common_fields(call); + if (ret) + return ret; + + ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, + 
FILTER_OTHER); + + return ret; } -void ftrace_syscall_enter(struct pt_regs *regs) +void ftrace_syscall_enter(struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; + struct ring_buffer *buffer; int size; int syscall_nr; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; + if (!test_bit(syscall_nr, enabled_enter_syscalls)) + return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) @@ -158,8 +239,8 @@ void ftrace_syscall_enter(struct pt_regs *regs) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, - 0, 0); + event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, + size, 0, 0); if (!event) return; @@ -167,24 +248,30 @@ void ftrace_syscall_enter(struct pt_regs *regs) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - trace_current_buffer_unlock_commit(event, 0, 0); - trace_wake_up(); + if (!filter_current_check_discard(buffer, sys_data->enter_event, + entry, event)) + trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -void ftrace_syscall_exit(struct pt_regs *regs) +void ftrace_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; + struct ring_buffer *buffer; int syscall_nr; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; + if (!test_bit(syscall_nr, enabled_exit_syscalls)) + return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) return; - event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, + event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, sizeof(*entry), 0, 0); if (!event) return; @@ -193,58 +280,244 @@ void ftrace_syscall_exit(struct pt_regs *regs) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - 
trace_current_buffer_unlock_commit(event, 0, 0); - trace_wake_up(); + if (!filter_current_check_discard(buffer, sys_data->exit_event, + entry, event)) + trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -static int init_syscall_tracer(struct trace_array *tr) +int reg_event_syscall_enter(void *ptr) { - start_ftrace_syscalls(); + int ret = 0; + int num; + char *name; + + name = (char *)ptr; + num = syscall_name_to_nr(name); + if (num < 0 || num >= NR_syscalls) + return -ENOSYS; + mutex_lock(&syscall_trace_lock); + if (!sys_refcount_enter) + ret = register_trace_sys_enter(ftrace_syscall_enter); + if (ret) { + pr_info("event trace: Could not activate" + "syscall entry trace point"); + } else { + set_bit(num, enabled_enter_syscalls); + sys_refcount_enter++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +void unreg_event_syscall_enter(void *ptr) +{ + int num; + char *name; - return 0; + name = (char *)ptr; + num = syscall_name_to_nr(name); + if (num < 0 || num >= NR_syscalls) + return; + mutex_lock(&syscall_trace_lock); + sys_refcount_enter--; + clear_bit(num, enabled_enter_syscalls); + if (!sys_refcount_enter) + unregister_trace_sys_enter(ftrace_syscall_enter); + mutex_unlock(&syscall_trace_lock); } -static void reset_syscall_tracer(struct trace_array *tr) +int reg_event_syscall_exit(void *ptr) { - stop_ftrace_syscalls(); - tracing_reset_online_cpus(tr); + int ret = 0; + int num; + char *name; + + name = (char *)ptr; + num = syscall_name_to_nr(name); + if (num < 0 || num >= NR_syscalls) + return -ENOSYS; + mutex_lock(&syscall_trace_lock); + if (!sys_refcount_exit) + ret = register_trace_sys_exit(ftrace_syscall_exit); + if (ret) { + pr_info("event trace: Could not activate" + "syscall exit trace point"); + } else { + set_bit(num, enabled_exit_syscalls); + sys_refcount_exit++; + } + mutex_unlock(&syscall_trace_lock); + return ret; } -static struct trace_event syscall_enter_event = { - .type = TRACE_SYSCALL_ENTER, - .trace = print_syscall_enter, -}; 
+void unreg_event_syscall_exit(void *ptr) +{ + int num; + char *name; + + name = (char *)ptr; + num = syscall_name_to_nr(name); + if (num < 0 || num >= NR_syscalls) + return; + mutex_lock(&syscall_trace_lock); + sys_refcount_exit--; + clear_bit(num, enabled_exit_syscalls); + if (!sys_refcount_exit) + unregister_trace_sys_exit(ftrace_syscall_exit); + mutex_unlock(&syscall_trace_lock); +} -static struct trace_event syscall_exit_event = { - .type = TRACE_SYSCALL_EXIT, - .trace = print_syscall_exit, +struct trace_event event_syscall_enter = { + .trace = print_syscall_enter, }; -static struct tracer syscall_tracer __read_mostly = { - .name = "syscall", - .init = init_syscall_tracer, - .reset = reset_syscall_tracer, - .flags = &syscalls_flags, +struct trace_event event_syscall_exit = { + .trace = print_syscall_exit, }; -__init int register_ftrace_syscalls(void) +#ifdef CONFIG_EVENT_PROFILE + +static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); +static int sys_prof_refcount_enter; +static int sys_prof_refcount_exit; + +static void prof_syscall_enter(struct pt_regs *regs, long id) { - int ret; + struct syscall_trace_enter *rec; + struct syscall_metadata *sys_data; + int syscall_nr; + int size; - ret = register_ftrace_event(&syscall_enter_event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - syscall_enter_event.type); - WARN_ON_ONCE(1); + syscall_nr = syscall_get_nr(current, regs); + if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + /* get the size after alignment with the u32 buffer size field */ + size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); + size = ALIGN(size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + do { + char raw_data[size]; + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 
+ + rec = (struct syscall_trace_enter *) raw_data; + tracing_generic_entry_update(&rec->ent, 0, 0); + rec->ent.type = sys_data->enter_id; + rec->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, + (unsigned long *)&rec->args); + perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); + } while(0); +} + +int reg_prof_syscall_enter(char *name) +{ + int ret = 0; + int num; + + num = syscall_name_to_nr(name); + if (num < 0 || num >= NR_syscalls) + return -ENOSYS; + + mutex_lock(&syscall_trace_lock); + if (!sys_prof_refcount_enter) + ret = register_trace_sys_enter(prof_syscall_enter); + if (ret) { + pr_info("event trace: Could not activate" + "syscall entry trace point"); + } else { + set_bit(num, enabled_prof_enter_syscalls); + sys_prof_refcount_enter++; } + mutex_unlock(&syscall_trace_lock); + return ret; +} - ret = register_ftrace_event(&syscall_exit_event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - syscall_exit_event.type); - WARN_ON_ONCE(1); +void unreg_prof_syscall_enter(char *name) +{ + int num; + + num = syscall_name_to_nr(name); + if (num < 0 || num >= NR_syscalls) + return; + + mutex_lock(&syscall_trace_lock); + sys_prof_refcount_enter--; + clear_bit(num, enabled_prof_enter_syscalls); + if (!sys_prof_refcount_enter) + unregister_trace_sys_enter(prof_syscall_enter); + mutex_unlock(&syscall_trace_lock); +} + +static void prof_syscall_exit(struct pt_regs *regs, long ret) +{ + struct syscall_metadata *sys_data; + struct syscall_trace_exit rec; + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + tracing_generic_entry_update(&rec.ent, 0, 0); + rec.ent.type = sys_data->exit_id; + rec.nr = syscall_nr; + rec.ret = syscall_get_return_value(current, regs); + + perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); +} + +int 
reg_prof_syscall_exit(char *name) +{ + int ret = 0; + int num; + + num = syscall_name_to_nr(name); + if (num < 0 || num >= NR_syscalls) + return -ENOSYS; + + mutex_lock(&syscall_trace_lock); + if (!sys_prof_refcount_exit) + ret = register_trace_sys_exit(prof_syscall_exit); + if (ret) { + pr_info("event trace: Could not activate" + "syscall entry trace point"); + } else { + set_bit(num, enabled_prof_exit_syscalls); + sys_prof_refcount_exit++; } + mutex_unlock(&syscall_trace_lock); + return ret; +} - return register_tracer(&syscall_tracer); +void unreg_prof_syscall_exit(char *name) +{ + int num; + + num = syscall_name_to_nr(name); + if (num < 0 || num >= NR_syscalls) + return; + + mutex_lock(&syscall_trace_lock); + sys_prof_refcount_exit--; + clear_bit(num, enabled_prof_exit_syscalls); + if (!sys_prof_refcount_exit) + unregister_trace_sys_exit(prof_syscall_exit); + mutex_unlock(&syscall_trace_lock); } -device_initcall(register_ftrace_syscalls); + +#endif + + diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 97fcea4..40cafb0 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -9,6 +9,7 @@ #include <trace/events/workqueue.h> #include <linux/list.h> #include <linux/percpu.h> +#include <linux/kref.h> #include "trace_stat.h" #include "trace.h" @@ -16,6 +17,7 @@ /* A cpu workqueue thread */ struct cpu_workqueue_stats { struct list_head list; + struct kref kref; int cpu; pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ @@ -39,6 +41,11 @@ struct workqueue_global_stats { static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) +static void cpu_workqueue_stat_free(struct kref *kref) +{ + kfree(container_of(kref, struct cpu_workqueue_stats, kref)); +} + /* Insertion of a work */ static void probe_workqueue_insertion(struct task_struct *wq_thread, @@ -96,8 +103,8 @@ static void 
probe_workqueue_creation(struct task_struct *wq_thread, int cpu) return; } INIT_LIST_HEAD(&cws->list); + kref_init(&cws->kref); cws->cpu = cpu; - cws->pid = wq_thread->pid; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); @@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread) list) { if (node->pid == wq_thread->pid) { list_del(&node->list); - kfree(node); + kref_put(&node->kref, cpu_workqueue_stat_free); goto found; } } @@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (!list_empty(&workqueue_cpu_stat(cpu)->list)) + if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { ret = list_entry(workqueue_cpu_stat(cpu)->list.next, struct cpu_workqueue_stats, list); + kref_get(&ret->kref); + } spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); @@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace) static void *workqueue_stat_next(void *prev, int idx) { struct cpu_workqueue_stats *prev_cws = prev; + struct cpu_workqueue_stats *ret; int cpu = prev_cws->cpu; unsigned long flags; - void *ret = NULL; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { @@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx) return NULL; } while (!(ret = workqueue_stat_start_cpu(cpu))); return ret; + } else { + ret = list_entry(prev_cws->list.next, + struct cpu_workqueue_stats, list); + kref_get(&ret->kref); } spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, - list); + return ret; } static int workqueue_stat_show(struct seq_file *s, void *p) @@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p) return 0; } +static void workqueue_stat_release(void *stat) +{ + struct cpu_workqueue_stats *node = stat; + + 
kref_put(&node->kref, cpu_workqueue_stat_free); +} + static int workqueue_stat_headers(struct seq_file *s) { seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); @@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = { .stat_start = workqueue_stat_start, .stat_next = workqueue_stat_next, .stat_show = workqueue_stat_show, + .stat_release = workqueue_stat_release, .stat_headers = workqueue_stat_headers }; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1ef5d3a..9489a0a 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -24,6 +24,7 @@ #include <linux/tracepoint.h> #include <linux/err.h> #include <linux/slab.h> +#include <linux/sched.h> extern struct tracepoint __start___tracepoints[]; extern struct tracepoint __stop___tracepoints[]; @@ -242,6 +243,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); + if (elem->regfunc && !elem->state && active) + elem->regfunc(); + else if (elem->unregfunc && elem->state && !active) + elem->unregfunc(); + /* * rcu_assign_pointer has a smp_wmb() which makes sure that the new * probe callbacks array is consistent before setting a pointer to it. 
@@ -261,6 +267,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { + if (elem->unregfunc && elem->state) + elem->unregfunc(); + elem->state = 0; rcu_assign_pointer(elem->funcs, NULL); } @@ -554,9 +563,6 @@ int tracepoint_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: - tracepoint_update_probe_range(mod->tracepoints, - mod->tracepoints + mod->num_tracepoints); - break; case MODULE_STATE_GOING: tracepoint_update_probe_range(mod->tracepoints, mod->tracepoints + mod->num_tracepoints); @@ -577,3 +583,41 @@ static int init_tracepoints(void) __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ + +#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS + +/* NB: reg/unreg are called while guarded with the tracepoints_mutex */ +static int sys_tracepoint_refcount; + +void syscall_regfunc(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + if (!sys_tracepoint_refcount) { + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + /* Skip kernel threads. 
*/ + if (t->mm) + set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + } while_each_thread(g, t); + read_unlock_irqrestore(&tasklist_lock, flags); + } + sys_tracepoint_refcount++; +} + +void syscall_unregfunc(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + sys_tracepoint_refcount--; + if (!sys_tracepoint_refcount) { + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + } while_each_thread(g, t); + read_unlock_irqrestore(&tasklist_lock, flags); + } +} +#endif diff --git a/kernel/user.c b/kernel/user.c index 2c000e7..2d0519d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -416,11 +416,11 @@ void free_uid(struct user_struct *up) if (!up) return; - local_irq_save(flags); + local_irq_save_nort(flags); if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) free_user(up, flags); else - local_irq_restore(flags); + local_irq_restore_nort(flags); } struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0668795..0a98bef 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/cpu.h> #include <linux/notifier.h> +#include <linux/syscalls.h> #include <linux/kthread.h> #include <linux/hardirq.h> #include <linux/mempolicy.h> @@ -36,6 +37,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> +#include <asm/uaccess.h> + /* * The per-CPU workqueue (if single thread, we always use the first * possible cpu). @@ -159,13 +162,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, * * We queue the work to the CPU on which it was submitted, but if the CPU dies * it can be processed by another CPU. + * + * Especially no such guarantee on PREEMPT_RT. 
*/ int queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret; + int ret = 0, cpu = raw_smp_processor_id(); - ret = queue_work_on(get_cpu(), wq, work); - put_cpu(); + ret = queue_work_on(cpu, wq, work); return ret; } @@ -202,7 +206,7 @@ static void delayed_work_timer_fn(unsigned long __data) struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); struct workqueue_struct *wq = cwq->wq; - __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); + __queue_work(wq_per_cpu(wq, raw_smp_processor_id()), &dwork->work); } /** @@ -883,6 +887,49 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) cwq->thread = NULL; } +void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu, + int policy, int rt_priority, int nice) +{ + struct sched_param param = { .sched_priority = rt_priority }; + struct cpu_workqueue_struct *cwq; + mm_segment_t oldfs = get_fs(); + struct task_struct *p; + unsigned long flags; + int ret; + + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + spin_lock_irqsave(&cwq->lock, flags); + p = cwq->thread; + spin_unlock_irqrestore(&cwq->lock, flags); + + set_user_nice(p, nice); + + set_fs(KERNEL_DS); + ret = sys_sched_setscheduler(p->pid, policy, &param); + set_fs(oldfs); + + WARN_ON(ret); +} + +void set_workqueue_prio(struct workqueue_struct *wq, int policy, + int rt_priority, int nice) +{ + int cpu; + + /* We don't need the distraction of CPUs appearing and vanishing.
*/ + get_online_cpus(); + spin_lock(&workqueue_lock); + if (is_wq_single_threaded(wq)) + set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice); + else { + for_each_online_cpu(cpu) + set_workqueue_thread_prio(wq, cpu, policy, + rt_priority, nice); + } + spin_unlock(&workqueue_lock); + put_online_cpus(); +} + /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue @@ -1015,4 +1062,5 @@ void __init init_workqueues(void) hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); + set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20); } diff --git a/lib/Kconfig b/lib/Kconfig index bb1326d..faefb80 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -179,6 +179,7 @@ config HAVE_LMB config CPUMASK_OFFSTACK bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS + depends on !PREEMPT_RT && BROKEN help Use dynamic allocation for cpumask_var_t, instead of putting them on the stack. This is a bit more expensive, but avoids diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 12327b2..8a2ddd3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -397,6 +397,8 @@ config DEBUG_RT_MUTEXES help This allows rt mutex semantics violations and rt mutex related deadlocks (lockups) to be detected and reported automatically. + When realtime preemption is enabled this includes spinlocks, + rwlocks, mutexes and (rw)semaphores config DEBUG_PI_LIST bool @@ -420,7 +422,7 @@ config DEBUG_SPINLOCK config DEBUG_MUTEXES bool "Mutex debugging: basic checks" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !PREEMPT_RT help This feature allows mutex semantics violations to be detected and reported. 
diff --git a/lib/Makefile b/lib/Makefile index 2e78277..ceeef24 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -34,7 +34,8 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o +obj-$(CONFIG_PREEMPT_RT) += plist.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2755a3b..b8e69ee 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -25,14 +25,14 @@ struct debug_bucket { struct hlist_head list; - spinlock_t lock; + atomic_spinlock_t lock; }; static struct debug_bucket obj_hash[ODEBUG_HASH_SIZE]; static struct debug_obj obj_static_pool[ODEBUG_POOL_SIZE] __initdata; -static DEFINE_SPINLOCK(pool_lock); +static DEFINE_ATOMIC_SPINLOCK(pool_lock); static HLIST_HEAD(obj_pool); @@ -95,10 +95,10 @@ static int fill_pool(void) if (!new) return obj_pool_free; - spin_lock_irqsave(&pool_lock, flags); + atomic_spin_lock_irqsave(&pool_lock, flags); hlist_add_head(&new->node, &obj_pool); obj_pool_free++; - spin_unlock_irqrestore(&pool_lock, flags); + atomic_spin_unlock_irqrestore(&pool_lock, flags); } return obj_pool_free; } @@ -132,7 +132,7 @@ alloc_object(void *addr, struct debug_bucket *b, struct debug_obj_descr *descr) { struct debug_obj *obj = NULL; - spin_lock(&pool_lock); + atomic_spin_lock(&pool_lock); if (obj_pool.first) { obj = hlist_entry(obj_pool.first, typeof(*obj), node); @@ -151,7 +151,7 @@ alloc_object(void *addr, struct debug_bucket *b, struct debug_obj_descr *descr) if (obj_pool_free < obj_pool_min_free) obj_pool_min_free = obj_pool_free; } - spin_unlock(&pool_lock); + atomic_spin_unlock(&pool_lock); return obj; } @@ 
-164,7 +164,7 @@ static void free_obj_work(struct work_struct *work) struct debug_obj *obj; unsigned long flags; - spin_lock_irqsave(&pool_lock, flags); + atomic_spin_lock_irqsave(&pool_lock, flags); while (obj_pool_free > ODEBUG_POOL_SIZE) { obj = hlist_entry(obj_pool.first, typeof(*obj), node); hlist_del(&obj->node); @@ -173,11 +173,11 @@ static void free_obj_work(struct work_struct *work) * We release pool_lock across kmem_cache_free() to * avoid contention on pool_lock. */ - spin_unlock_irqrestore(&pool_lock, flags); + atomic_spin_unlock_irqrestore(&pool_lock, flags); kmem_cache_free(obj_cache, obj); - spin_lock_irqsave(&pool_lock, flags); + atomic_spin_lock_irqsave(&pool_lock, flags); } - spin_unlock_irqrestore(&pool_lock, flags); + atomic_spin_unlock_irqrestore(&pool_lock, flags); } /* @@ -189,7 +189,7 @@ static void free_object(struct debug_obj *obj) unsigned long flags; int sched = 0; - spin_lock_irqsave(&pool_lock, flags); + atomic_spin_lock_irqsave(&pool_lock, flags); /* * schedule work when the pool is filled and the cache is * initialized: @@ -199,7 +199,7 @@ static void free_object(struct debug_obj *obj) hlist_add_head(&obj->node, &obj_pool); obj_pool_free++; obj_pool_used--; - spin_unlock_irqrestore(&pool_lock, flags); + atomic_spin_unlock_irqrestore(&pool_lock, flags); if (sched) schedule_work(&debug_obj_work); } @@ -220,9 +220,9 @@ static void debug_objects_oom(void) printk(KERN_WARNING "ODEBUG: Out of memory. 
ODEBUG disabled\n"); for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { - spin_lock_irqsave(&db->lock, flags); + atomic_spin_lock_irqsave(&db->lock, flags); hlist_move_list(&db->list, &freelist); - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); /* Now free them */ hlist_for_each_entry_safe(obj, node, tmp, &freelist, node) { @@ -302,14 +302,14 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack) db = get_bucket((unsigned long) addr); - spin_lock_irqsave(&db->lock, flags); + atomic_spin_lock_irqsave(&db->lock, flags); obj = lookup_object(addr, db); if (!obj) { obj = alloc_object(addr, db, descr); if (!obj) { debug_objects_enabled = 0; - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); debug_objects_oom(); return; } @@ -326,7 +326,7 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack) case ODEBUG_STATE_ACTIVE: debug_print_object(obj, "init"); state = obj->state; - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); debug_object_fixup(descr->fixup_init, addr, state); return; @@ -337,7 +337,7 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack) break; } - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); } /** @@ -384,7 +384,7 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr) db = get_bucket((unsigned long) addr); - spin_lock_irqsave(&db->lock, flags); + atomic_spin_lock_irqsave(&db->lock, flags); obj = lookup_object(addr, db); if (obj) { @@ -397,7 +397,7 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr) case ODEBUG_STATE_ACTIVE: debug_print_object(obj, "activate"); state = obj->state; - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); debug_object_fixup(descr->fixup_activate, addr, state); return; @@ -407,11 +407,11 @@ void 
debug_object_activate(void *addr, struct debug_obj_descr *descr) default: break; } - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); return; } - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); /* * This happens when a static object is activated. We * let the type specific code decide whether this is @@ -437,7 +437,7 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr) db = get_bucket((unsigned long) addr); - spin_lock_irqsave(&db->lock, flags); + atomic_spin_lock_irqsave(&db->lock, flags); obj = lookup_object(addr, db); if (obj) { @@ -462,7 +462,7 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr) debug_print_object(&o, "deactivate"); } - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); } /** @@ -482,7 +482,7 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr) db = get_bucket((unsigned long) addr); - spin_lock_irqsave(&db->lock, flags); + atomic_spin_lock_irqsave(&db->lock, flags); obj = lookup_object(addr, db); if (!obj) @@ -497,7 +497,7 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr) case ODEBUG_STATE_ACTIVE: debug_print_object(obj, "destroy"); state = obj->state; - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); debug_object_fixup(descr->fixup_destroy, addr, state); return; @@ -508,7 +508,7 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr) break; } out_unlock: - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); } /** @@ -528,7 +528,7 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr) db = get_bucket((unsigned long) addr); - spin_lock_irqsave(&db->lock, flags); + atomic_spin_lock_irqsave(&db->lock, flags); obj = lookup_object(addr, db); if (!obj) @@ -538,17 +538,17 @@ void debug_object_free(void *addr, struct 
debug_obj_descr *descr) case ODEBUG_STATE_ACTIVE: debug_print_object(obj, "free"); state = obj->state; - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); debug_object_fixup(descr->fixup_free, addr, state); return; default: hlist_del(&obj->node); - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); free_object(obj); return; } out_unlock: - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); } #ifdef CONFIG_DEBUG_OBJECTS_FREE @@ -574,7 +574,7 @@ static void __debug_check_no_obj_freed(const void *address, unsigned long size) repeat: cnt = 0; - spin_lock_irqsave(&db->lock, flags); + atomic_spin_lock_irqsave(&db->lock, flags); hlist_for_each_entry_safe(obj, node, tmp, &db->list, node) { cnt++; oaddr = (unsigned long) obj->object; @@ -586,7 +586,7 @@ repeat: debug_print_object(obj, "free"); descr = obj->descr; state = obj->state; - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); debug_object_fixup(descr->fixup_free, (void *) oaddr, state); goto repeat; @@ -596,7 +596,7 @@ repeat: break; } } - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); /* Now free them */ hlist_for_each_entry_safe(obj, node, tmp, &freelist, node) { @@ -782,7 +782,7 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings) db = get_bucket((unsigned long) addr); - spin_lock_irqsave(&db->lock, flags); + atomic_spin_lock_irqsave(&db->lock, flags); obj = lookup_object(addr, db); if (!obj && state != ODEBUG_STATE_NONE) { @@ -806,7 +806,7 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings) } res = 0; out: - spin_unlock_irqrestore(&db->lock, flags); + atomic_spin_unlock_irqrestore(&db->lock, flags); if (res) debug_objects_enabled = 0; return res; @@ -906,7 +906,7 @@ void __init debug_objects_early_init(void) int i; for (i = 0; i < 
ODEBUG_HASH_SIZE; i++) - spin_lock_init(&obj_hash[i].lock); + atomic_spin_lock_init(&obj_hash[i].lock); for (i = 0; i < ODEBUG_POOL_SIZE; i++) hlist_add_head(&obj_static_pool[i].node, &obj_pool); diff --git a/lib/dec_and_lock.c b/lib/dec_and_lock.c index e73822a..6a4ec2b 100644 --- a/lib/dec_and_lock.c +++ b/lib/dec_and_lock.c @@ -17,18 +17,18 @@ * because the spin-lock and the decrement must be * "atomic". */ -int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +int _atomic_dec_and_atomic_lock(atomic_t *atomic, atomic_spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ - spin_lock(lock); + atomic_spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; - spin_unlock(lock); + atomic_spin_unlock(lock); return 0; } -EXPORT_SYMBOL(_atomic_dec_and_lock); +EXPORT_SYMBOL(_atomic_dec_and_atomic_lock); diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c index 39f1029..709c432 100644 --- a/lib/kernel_lock.c +++ b/lib/kernel_lock.c @@ -11,121 +11,89 @@ #include <linux/semaphore.h> /* - * The 'big kernel lock' + * The 'big kernel semaphore' * - * This spinlock is taken and released recursively by lock_kernel() + * This mutex is taken and released recursively by lock_kernel() * and unlock_kernel(). It is transparently dropped and reacquired * over schedule(). It is used to protect legacy code that hasn't * been migrated to a proper locking design yet. * + * Note: code locked by this semaphore will only be serialized against + * other code using the same locking facility. The code guarantees that + * the task remains on the same CPU. + * * Don't use in new code. */ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag); - +DEFINE_SEMAPHORE(kernel_sem); /* - * Acquire/release the underlying lock from the scheduler. + * Re-acquire the kernel semaphore. 
* - * This is called with preemption disabled, and should - * return an error value if it cannot get the lock and - * TIF_NEED_RESCHED gets set. + * This function is called with preemption off. * - * If it successfully gets the lock, it should increment - * the preemption count like any spinlock does. + * We are executing in schedule() so the code must be extremely careful + * about recursion, both due to the down() and due to the enabling of + * preemption. schedule() will re-check the preemption flag after + * reacquiring the semaphore. * - * (This works on UP too - _raw_spin_trylock will never - * return false in that case) + * Called with interrupts disabled; returns with them disabled again. */ int __lockfunc __reacquire_kernel_lock(void) { - while (!_raw_spin_trylock(&kernel_flag)) { - if (need_resched()) - return -EAGAIN; - cpu_relax(); - } - preempt_disable(); + struct task_struct *task = current; + int saved_lock_depth = task->lock_depth; + + local_irq_enable(); + BUG_ON(saved_lock_depth < 0); + + task->lock_depth = -1; + + down(&kernel_sem); + + task->lock_depth = saved_lock_depth; + local_irq_disable(); + return 0; } void __lockfunc __release_kernel_lock(void) { - _raw_spin_unlock(&kernel_flag); - preempt_enable_no_resched(); + up(&kernel_sem); } /* - * These are the BKL spinlocks - we try to be polite about preemption. - * If SMP is not on (ie UP preemption), this all goes away because the - * _raw_spin_trylock() will always succeed. + * Getting the big kernel semaphore. */ -#ifdef CONFIG_PREEMPT -static inline void __lock_kernel(void) +void __lockfunc lock_kernel(void) { - preempt_disable(); - if (unlikely(!_raw_spin_trylock(&kernel_flag))) { - /* - * If preemption was disabled even before this - * was called, there's nothing we can be polite - * about - just spin. 
- */ - if (preempt_count() > 1) { - _raw_spin_lock(&kernel_flag); - return; - } + struct task_struct *task = current; + int depth = task->lock_depth + 1; + if (likely(!depth)) { /* - * Otherwise, let's wait for the kernel lock - * with preemption enabled.. + * No recursion worries - we set up lock_depth _after_ */ - do { - preempt_enable(); - while (spin_is_locked(&kernel_flag)) - cpu_relax(); - preempt_disable(); - } while (!_raw_spin_trylock(&kernel_flag)); + down(&kernel_sem); +#ifdef CONFIG_DEBUG_RT_MUTEXES + current->last_kernel_lock = __builtin_return_address(0); +#endif } -} - -#else -/* - * Non-preemption case - just get the spinlock - */ -static inline void __lock_kernel(void) -{ - _raw_spin_lock(&kernel_flag); + task->lock_depth = depth; } -#endif -static inline void __unlock_kernel(void) +void __lockfunc unlock_kernel(void) { - /* - * the BKL is not covered by lockdep, so we open-code the - * unlocking sequence (and thus avoid the dep-chain ops): - */ - _raw_spin_unlock(&kernel_flag); - preempt_enable(); -} + struct task_struct *task = current; -/* - * Getting the big kernel lock. - * - * This cannot happen asynchronously, so we only need to - * worry about other CPU's. 
- */ -void __lockfunc lock_kernel(void) -{ - int depth = current->lock_depth+1; - if (likely(!depth)) - __lock_kernel(); - current->lock_depth = depth; -} + BUG_ON(task->lock_depth < 0); -void __lockfunc unlock_kernel(void) -{ - BUG_ON(current->lock_depth < 0); - if (likely(--current->lock_depth < 0)) - __unlock_kernel(); + if (likely(--task->lock_depth < 0)) { +#ifdef CONFIG_DEBUG_RT_MUTEXES + current->last_kernel_lock = NULL; +#endif + up(&kernel_sem); + } } EXPORT_SYMBOL(lock_kernel); diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 619313e..65e7eab 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -158,7 +158,7 @@ static void init_shared_classes(void) local_bh_disable(); \ local_irq_disable(); \ lockdep_softirq_enter(); \ - WARN_ON(!in_softirq()); + /* FIXME: preemptible softirqs. WARN_ON(!in_softirq()); */ #define SOFTIRQ_EXIT() \ lockdep_softirq_exit(); \ @@ -550,6 +550,11 @@ GENERATE_TESTCASE(init_held_rsem) #undef E /* + * FIXME: turns these into raw-spinlock tests on -rt + */ +#ifndef CONFIG_PREEMPT_RT + +/* * locking an irq-safe lock with irqs enabled: */ #define E1() \ @@ -890,6 +895,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) #include "locking-selftest-softirq.h" // GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft) +#endif /* !CONFIG_PREEMPT_RT */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) # define I_RWLOCK(x) lockdep_reset_lock(&rwlock_##x.dep_map) @@ -998,7 +1005,7 @@ static inline void print_testname(const char *testname) #define DO_TESTCASE_1(desc, name, nr) \ print_testname(desc"/"#nr); \ - dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); #define DO_TESTCASE_1B(desc, name, nr) \ @@ -1006,17 +1013,17 @@ static inline void print_testname(const char *testname) dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \ printk("\n"); -#define DO_TESTCASE_3(desc, name, nr) \ - 
print_testname(desc"/"#nr); \ - dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ - dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ +#define DO_TESTCASE_3(desc, name, nr) \ + print_testname(desc"/"#nr); \ + dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); -#define DO_TESTCASE_3RW(desc, name, nr) \ - print_testname(desc"/"#nr); \ +#define DO_TESTCASE_3RW(desc, name, nr) \ + print_testname(desc"/"#nr); \ dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\ - dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); @@ -1047,7 +1054,7 @@ static inline void print_testname(const char *testname) print_testname(desc); \ dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \ dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \ - dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ @@ -1179,6 +1186,7 @@ void locking_selftest(void) /* * irq-context testcases: */ +#ifndef CONFIG_PREEMPT_RT DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1); DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A); DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B); @@ -1188,6 +1196,7 @@ void locking_selftest(void) DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2); +#endif if (unexpected_testcase_failures) { printk("-----------------------------------------------------------------\n"); diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index aeaa6d7..e63af9a 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -16,13 +16,13 @@ void percpu_counter_set(struct 
percpu_counter *fbc, s64 amount) { int cpu; - spin_lock(&fbc->lock); + atomic_spin_lock(&fbc->lock); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; - spin_unlock(&fbc->lock); + atomic_spin_unlock(&fbc->lock); } EXPORT_SYMBOL(percpu_counter_set); @@ -35,10 +35,10 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) pcount = per_cpu_ptr(fbc->counters, cpu); count = *pcount + amount; if (count >= batch || count <= -batch) { - spin_lock(&fbc->lock); + atomic_spin_lock(&fbc->lock); fbc->count += count; *pcount = 0; - spin_unlock(&fbc->lock); + atomic_spin_unlock(&fbc->lock); } else { *pcount = count; } @@ -55,13 +55,13 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) s64 ret; int cpu; - spin_lock(&fbc->lock); + atomic_spin_lock(&fbc->lock); ret = fbc->count; for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } - spin_unlock(&fbc->lock); + atomic_spin_unlock(&fbc->lock); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); @@ -69,7 +69,7 @@ EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, struct lock_class_key *key) { - spin_lock_init(&fbc->lock); + atomic_spin_lock_init(&fbc->lock); lockdep_set_class(&fbc->lock, key); fbc->count = amount; fbc->counters = alloc_percpu(s32); @@ -126,11 +126,11 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb, s32 *pcount; unsigned long flags; - spin_lock_irqsave(&fbc->lock, flags); + atomic_spin_lock_irqsave(&fbc->lock, flags); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; - spin_unlock_irqrestore(&fbc->lock, flags); + atomic_spin_unlock_irqrestore(&fbc->lock, flags); } mutex_unlock(&percpu_counters_lock); #endif diff --git a/lib/plist.c b/lib/plist.c index d6c64a8..beff294 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -54,9 +54,11 @@ static void plist_check_list(struct list_head *top) 
static void plist_check_head(struct plist_head *head) { - WARN_ON(!head->lock); - if (head->lock) - WARN_ON_SMP(!spin_is_locked(head->lock)); + WARN_ON(!head->alock && !head->slock); + if (head->alock) + WARN_ON_SMP(!atomic_spin_is_locked(head->alock)); + if (head->slock) + WARN_ON_SMP(!spin_is_locked(head->slock)); plist_check_list(&head->prio_list); plist_check_list(&head->node_list); } diff --git a/lib/proportions.c b/lib/proportions.c index d50746a..ff347dc 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -190,7 +190,7 @@ prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) int prop_local_init_percpu(struct prop_local_percpu *pl) { - spin_lock_init(&pl->lock); + atomic_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; return percpu_counter_init(&pl->events, 0); @@ -226,7 +226,7 @@ void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl) if (pl->period == global_period) return; - spin_lock_irqsave(&pl->lock, flags); + atomic_spin_lock_irqsave(&pl->lock, flags); prop_adjust_shift(&pl->shift, &pl->period, pg->shift); /* @@ -247,7 +247,7 @@ void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl) percpu_counter_set(&pl->events, 0); pl->period = global_period; - spin_unlock_irqrestore(&pl->lock, flags); + atomic_spin_unlock_irqrestore(&pl->lock, flags); } /* @@ -324,7 +324,7 @@ void prop_fraction_percpu(struct prop_descriptor *pd, int prop_local_init_single(struct prop_local_single *pl) { - spin_lock_init(&pl->lock); + atomic_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; pl->events = 0; @@ -356,7 +356,7 @@ void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl) if (pl->period == global_period) return; - spin_lock_irqsave(&pl->lock, flags); + atomic_spin_lock_irqsave(&pl->lock, flags); prop_adjust_shift(&pl->shift, &pl->period, pg->shift); /* * For each missed period, we half the local counter. 
@@ -367,7 +367,7 @@ void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl) else pl->events = 0; pl->period = global_period; - spin_unlock_irqrestore(&pl->lock, flags); + atomic_spin_unlock_irqrestore(&pl->lock, flags); } /* diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 23abbd9..e209012 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -157,12 +157,14 @@ radix_tree_node_alloc(struct radix_tree_root *root) * succeed in getting a node here (and never reach * kmem_cache_alloc) */ + rtp = &get_cpu_var(radix_tree_preloads); rtp = &__get_cpu_var(radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; rtp->nodes[rtp->nr - 1] = NULL; rtp->nr--; } + put_cpu_var(radix_tree_preloads); } if (ret == NULL) ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); @@ -195,6 +197,8 @@ radix_tree_node_free(struct radix_tree_node *node) call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } +#ifndef CONFIG_PREEMPT_RT + /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On @@ -227,6 +231,8 @@ out: } EXPORT_SYMBOL(radix_tree_preload); +#endif + /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. 
diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 26187ed..5488990 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -14,7 +14,7 @@ #include <linux/jiffies.h> #include <linux/module.h> -static DEFINE_SPINLOCK(ratelimit_lock); +static DEFINE_ATOMIC_SPINLOCK(ratelimit_lock); /* * __ratelimit - rate limiting @@ -30,7 +30,7 @@ int __ratelimit(struct ratelimit_state *rs) if (!rs->interval) return 1; - spin_lock_irqsave(&ratelimit_lock, flags); + atomic_spin_lock_irqsave(&ratelimit_lock, flags); if (!rs->begin) rs->begin = jiffies; @@ -46,12 +46,12 @@ int __ratelimit(struct ratelimit_state *rs) goto print; rs->missed++; - spin_unlock_irqrestore(&ratelimit_lock, flags); + atomic_spin_unlock_irqrestore(&ratelimit_lock, flags); return 0; print: rs->printed++; - spin_unlock_irqrestore(&ratelimit_lock, flags); + atomic_spin_unlock_irqrestore(&ratelimit_lock, flags); return 1; } EXPORT_SYMBOL(__ratelimit); diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 9df3ca5..018dd5d 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -20,8 +20,8 @@ struct rwsem_waiter { /* * initialise the semaphore */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) +void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -44,8 +44,8 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if wakewrite is non-zero */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +static inline struct rw_anon_semaphore * +__rwsem_do_wake(struct rw_anon_semaphore *sem, int wakewrite) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -103,8 +103,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) /* * wake a single writer */ -static inline struct rw_semaphore * 
-__rwsem_wake_one_writer(struct rw_semaphore *sem) +static inline struct rw_anon_semaphore * +__rwsem_wake_one_writer(struct rw_anon_semaphore *sem) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -125,7 +125,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem) /* * get a read lock on the semaphore */ -void __sched __down_read(struct rw_semaphore *sem) +void __sched __down_read(struct rw_anon_semaphore *sem) { struct rwsem_waiter waiter; struct task_struct *tsk; @@ -168,7 +168,7 @@ void __sched __down_read(struct rw_semaphore *sem) /* * trylock for reading -- returns 1 if successful, 0 if contention */ -int __down_read_trylock(struct rw_semaphore *sem) +int __down_read_trylock(struct rw_anon_semaphore *sem) { unsigned long flags; int ret = 0; @@ -191,7 +191,7 @@ int __down_read_trylock(struct rw_semaphore *sem) * get a write lock on the semaphore * - we increment the waiting count anyway to indicate an exclusive lock */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +void __sched __down_write_nested(struct rw_anon_semaphore *sem, int subclass) { struct rwsem_waiter waiter; struct task_struct *tsk; @@ -231,7 +231,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) ; } -void __sched __down_write(struct rw_semaphore *sem) +void __sched __down_write(struct rw_anon_semaphore *sem) { __down_write_nested(sem, 0); } @@ -239,7 +239,7 @@ void __sched __down_write(struct rw_semaphore *sem) /* * trylock for writing -- returns 1 if successful, 0 if contention */ -int __down_write_trylock(struct rw_semaphore *sem) +int __down_write_trylock(struct rw_anon_semaphore *sem) { unsigned long flags; int ret = 0; @@ -260,7 +260,7 @@ int __down_write_trylock(struct rw_semaphore *sem) /* * release a read lock on the semaphore */ -void __up_read(struct rw_semaphore *sem) +void __up_read(struct rw_anon_semaphore *sem) { unsigned long flags; @@ -275,7 +275,7 @@ void __up_read(struct rw_semaphore *sem) /* * release a write lock 
on the semaphore */ -void __up_write(struct rw_semaphore *sem) +void __up_write(struct rw_anon_semaphore *sem) { unsigned long flags; @@ -292,7 +292,7 @@ void __up_write(struct rw_semaphore *sem) * downgrade a write lock into a read lock * - just wake up any readers at the front of the queue */ -void __downgrade_write(struct rw_semaphore *sem) +void __downgrade_write(struct rw_anon_semaphore *sem) { unsigned long flags; @@ -305,7 +305,7 @@ void __downgrade_write(struct rw_semaphore *sem) spin_unlock_irqrestore(&sem->wait_lock, flags); } -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__init_anon_rwsem); EXPORT_SYMBOL(__down_read); EXPORT_SYMBOL(__down_read_trylock); EXPORT_SYMBOL(__down_write_nested); diff --git a/lib/rwsem.c b/lib/rwsem.c index 3e3365e..72eaba5 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -11,8 +11,8 @@ /* * Initialize an rwsem: */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) +void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -25,8 +25,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } - -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__init_anon_rwsem); struct rwsem_waiter { struct list_head list; @@ -46,8 +45,8 @@ struct rwsem_waiter { * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if downgrading is false */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int downgrading) +static inline struct rw_anon_semaphore * +__rwsem_do_wake(struct rw_anon_semaphore *sem, int downgrading) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -146,9 +145,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading) /* * wait for a lock to be granted */ -static struct rw_semaphore __sched * -rwsem_down_failed_common(struct rw_semaphore *sem, - struct 
rwsem_waiter *waiter, signed long adjustment) +static struct rw_anon_semaphore __sched * +rwsem_down_failed_common(struct rw_anon_semaphore *sem, + struct rwsem_waiter *waiter, signed long adjustment) { struct task_struct *tsk = current; signed long count; @@ -187,8 +186,8 @@ rwsem_down_failed_common(struct rw_semaphore *sem, /* * wait for the read lock to be granted */ -asmregparm struct rw_semaphore __sched * -rwsem_down_read_failed(struct rw_semaphore *sem) +asmregparm struct rw_anon_semaphore __sched * +rwsem_down_read_failed(struct rw_anon_semaphore *sem) { struct rwsem_waiter waiter; @@ -201,8 +200,8 @@ rwsem_down_read_failed(struct rw_semaphore *sem) /* * wait for the write lock to be granted */ -asmregparm struct rw_semaphore __sched * -rwsem_down_write_failed(struct rw_semaphore *sem) +asmregparm struct rw_anon_semaphore __sched * +rwsem_down_write_failed(struct rw_anon_semaphore *sem) { struct rwsem_waiter waiter; @@ -216,7 +215,7 @@ rwsem_down_write_failed(struct rw_semaphore *sem) * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +asmregparm struct rw_anon_semaphore *rwsem_wake(struct rw_anon_semaphore *sem) { unsigned long flags; @@ -236,7 +235,8 @@ asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) * - caller incremented waiting part of count and discovered it still negative * - just wake up any readers at the front of the queue */ -asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +asmregparm struct rw_anon_semaphore * +rwsem_downgrade_wake(struct rw_anon_semaphore *sem) { unsigned long flags; diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 0d475d8..e6dcd3b 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/highmem.h> +#include <linux/interrupt.h> /** * sg_next 
- return the next scatterlist entry in a list @@ -399,7 +400,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) flush_kernel_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); kunmap_atomic(miter->addr, KM_BIO_SRC_IRQ); } else kunmap(miter->page); @@ -439,7 +440,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, sg_miter_start(&miter, sgl, nents, sg_flags); - local_irq_save(flags); + local_irq_save_nort(flags); while (sg_miter_next(&miter) && offset < buflen) { unsigned int len; @@ -456,7 +457,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, sg_miter_stop(&miter); - local_irq_restore(flags); + local_irq_restore_nort(flags); return offset; } diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c index 9c4b025..70b5c1c 100644 --- a/lib/spinlock_debug.c +++ b/lib/spinlock_debug.c @@ -13,8 +13,8 @@ #include <linux/delay.h> #include <linux/module.h> -void __spin_lock_init(spinlock_t *lock, const char *name, - struct lock_class_key *key) +void __atomic_spin_lock_init(atomic_spinlock_t *lock, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -29,8 +29,9 @@ void __spin_lock_init(spinlock_t *lock, const char *name, lock->owner_cpu = -1; } -EXPORT_SYMBOL(__spin_lock_init); +EXPORT_SYMBOL(__atomic_spin_lock_init); +#ifndef CONFIG_PREEMPT_RT void __rwlock_init(rwlock_t *lock, const char *name, struct lock_class_key *key) { @@ -46,10 +47,10 @@ void __rwlock_init(rwlock_t *lock, const char *name, lock->owner = SPINLOCK_OWNER_INIT; lock->owner_cpu = -1; } - EXPORT_SYMBOL(__rwlock_init); +#endif -static void spin_bug(spinlock_t *lock, const char *msg) +static void spin_bug(atomic_spinlock_t *lock, const char *msg) { struct task_struct *owner = NULL; @@ -73,7 +74,7 @@ static void spin_bug(spinlock_t *lock, const char *msg) #define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) static 
inline void -debug_spin_lock_before(spinlock_t *lock) +debug_spin_lock_before(atomic_spinlock_t *lock) { SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); SPIN_BUG_ON(lock->owner == current, lock, "recursion"); @@ -81,16 +82,16 @@ debug_spin_lock_before(spinlock_t *lock) lock, "cpu recursion"); } -static inline void debug_spin_lock_after(spinlock_t *lock) +static inline void debug_spin_lock_after(atomic_spinlock_t *lock) { lock->owner_cpu = raw_smp_processor_id(); lock->owner = current; } -static inline void debug_spin_unlock(spinlock_t *lock) +static inline void debug_spin_unlock(atomic_spinlock_t *lock) { SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(!spin_is_locked(lock), lock, "already unlocked"); + SPIN_BUG_ON(!atomic_spin_is_locked(lock), lock, "already unlocked"); SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), lock, "wrong CPU"); @@ -98,7 +99,7 @@ static inline void debug_spin_unlock(spinlock_t *lock) lock->owner_cpu = -1; } -static void __spin_lock_debug(spinlock_t *lock) +static void __spin_lock_debug(atomic_spinlock_t *lock) { u64 i; u64 loops = loops_per_jiffy * HZ; @@ -125,7 +126,7 @@ static void __spin_lock_debug(spinlock_t *lock) } } -void _raw_spin_lock(spinlock_t *lock) +void _raw_spin_lock(atomic_spinlock_t *lock) { debug_spin_lock_before(lock); if (unlikely(!__raw_spin_trylock(&lock->raw_lock))) @@ -133,7 +134,7 @@ void _raw_spin_lock(spinlock_t *lock) debug_spin_lock_after(lock); } -int _raw_spin_trylock(spinlock_t *lock) +int _raw_spin_trylock(atomic_spinlock_t *lock) { int ret = __raw_spin_trylock(&lock->raw_lock); @@ -148,12 +149,14 @@ int _raw_spin_trylock(spinlock_t *lock) return ret; } -void _raw_spin_unlock(spinlock_t *lock) +void _raw_spin_unlock(atomic_spinlock_t *lock) { debug_spin_unlock(lock); __raw_spin_unlock(&lock->raw_lock); } +#ifndef CONFIG_PREEMPT_RT + static void rwlock_bug(rwlock_t *lock, const char *msg) { if 
(!debug_locks_off()) @@ -295,3 +298,4 @@ void _raw_write_unlock(rwlock_t *lock) debug_write_unlock(lock); __raw_write_unlock(&lock->raw_lock); } +#endif diff --git a/mm/bounce.c b/mm/bounce.c index a2b76a5..4a91eed 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> +#include <linux/interrupt.h> #include <asm/tlbflush.h> #include <trace/events/block.h> @@ -49,11 +50,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) unsigned long flags; unsigned char *vto; - local_irq_save(flags); + local_irq_save_nort(flags); vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); memcpy(vto + to->bv_offset, vfrom, to->bv_len); kunmap_atomic(vto, KM_BOUNCE_READ); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #else /* CONFIG_HIGHMEM */ diff --git a/mm/filemap.c b/mm/filemap.c index ccea3b6..769d389 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1890,7 +1890,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, char *kaddr; size_t copied; - BUG_ON(!in_atomic()); +// BUG_ON(!in_atomic()); kaddr = kmap_atomic(page, KM_USER0); if (likely(i->nr_segs == 1)) { int left; diff --git a/mm/highmem.c b/mm/highmem.c index 25878cc..66e915a 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -14,6 +14,11 @@ * based on Linus' idea. * * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + * + * Largely rewritten to get rid of all global locks + * + * Copyright (C) 2006 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * */ #include <linux/mm.h> @@ -26,18 +31,15 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> +#include <linux/hardirq.h> + #include <asm/tlbflush.h> +#include <asm/pgtable.h> -/* - * Virtual_count is not a pure "count". - * 0 means that it is not mapped, and has not been mapped - * since a TLB flush - it is usable. - * 1 means that there are no users, but it has been mapped - * since the last TLB flush - so we can't use it. 
- * n means that there are (n-1) current users of it. - */ #ifdef CONFIG_HIGHMEM +static int __set_page_address(struct page *page, void *virtual, int pos); + unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); @@ -58,13 +60,21 @@ unsigned int nr_free_highpages (void) return pages; } -static int pkmap_count[LAST_PKMAP]; -static unsigned int last_pkmap_nr; -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); +/* + * count is not a pure "count". + * 0 means its owned exclusively by someone + * 1 means its free for use - either mapped or not. + * n means that there are (n-1) current users of it. + */ +static atomic_t pkmap_count[LAST_PKMAP]; +static atomic_t pkmap_hand; +static atomic_t pkmap_free; +static atomic_t pkmap_users; pte_t * pkmap_page_table; -static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); +static DECLARE_WAIT_QUEUE_HEAD(pkmap_wait); + /* * Most architectures have no use for kmap_high_get(), so let's abstract @@ -85,131 +95,261 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); do { spin_unlock(&kmap_lock); (void)(flags); } while (0) #endif -static void flush_all_zero_pkmaps(void) +/* + * Try to free a given kmap slot. 
+ * + * Returns: + * -1 - in use + * 0 - free, no TLB flush needed + * 1 - free, needs TLB flush + */ +static int pkmap_try_free(int pos) { - int i; - int need_flush = 0; + if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1) + return -1; + atomic_dec(&pkmap_free); + /* + * TODO: add a young bit to make it CLOCK + */ + if (!pte_none(pkmap_page_table[pos])) { + struct page *page = pte_page(pkmap_page_table[pos]); + unsigned long addr = PKMAP_ADDR(pos); + pte_t *ptep = &pkmap_page_table[pos]; + + VM_BUG_ON(addr != (unsigned long)page_address(page)); - flush_cache_kmaps(); + if (!__set_page_address(page, NULL, pos)) + BUG(); + flush_kernel_dcache_page(page); + pte_clear(&init_mm, addr, ptep); + + return 1; + } + + return 0; +} + +static inline void pkmap_put(atomic_t *counter) +{ + switch (atomic_dec_return(counter)) { + case 0: + BUG(); + + case 1: + atomic_inc(&pkmap_free); + wake_up(&pkmap_wait); + } +} + +#define TLB_BATCH 32 + +static int pkmap_get_free(void) +{ + int i, pos, flush; + +restart: for (i = 0; i < LAST_PKMAP; i++) { - struct page *page; + pos = atomic_inc_return(&pkmap_hand) & LAST_PKMAP_MASK; + flush = pkmap_try_free(pos); + if (flush >= 0) + goto got_one; + } + + atomic_dec(&pkmap_free); + /* + * wait for somebody else to unmap their entries + */ + if (likely(!in_interrupt())) + wait_event(pkmap_wait, atomic_read(&pkmap_free) != 0); + + goto restart; + +got_one: + if (flush) { +#if 0 + flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1)); +#else + int pos2 = (pos + 1) & LAST_PKMAP_MASK; + int nr; + int entries[TLB_BATCH]; /* - * zero means we don't have anything to do, - * >1 means that it is still in use. Only - * a count of 1 means that it is free but - * needs to be unmapped + * For those architectures that cannot help but flush the + * whole TLB, flush some more entries to make it worthwhile. + * Scan ahead of the hand to minimise search distances. 
*/ - if (pkmap_count[i] != 1) - continue; - pkmap_count[i] = 0; + for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH; + i++, pos2 = (pos2 + 1) & LAST_PKMAP_MASK) { + + flush = pkmap_try_free(pos2); + if (flush < 0) + continue; + + if (!flush) { + atomic_t *counter = &pkmap_count[pos2]; + VM_BUG_ON(atomic_read(counter) != 0); + atomic_set(counter, 2); + pkmap_put(counter); + } else + entries[nr++] = pos2; + } + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); - /* sanity check */ - BUG_ON(pte_none(pkmap_page_table[i])); + for (i = 0; i < nr; i++) { + atomic_t *counter = &pkmap_count[entries[i]]; + VM_BUG_ON(atomic_read(counter) != 0); + atomic_set(counter, 2); + pkmap_put(counter); + } +#endif + } + return pos; +} + +static unsigned long pkmap_insert(struct page *page) +{ + int pos = pkmap_get_free(); + unsigned long vaddr = PKMAP_ADDR(pos); + pte_t *ptep = &pkmap_page_table[pos]; + pte_t entry = mk_pte(page, kmap_prot); + atomic_t *counter = &pkmap_count[pos]; + VM_BUG_ON(atomic_read(counter) != 0); + + set_pte_at(&init_mm, vaddr, ptep, entry); + if (unlikely(!__set_page_address(page, (void *)vaddr, pos))) { /* - * Don't need an atomic fetch-and-clear op here; - * no-one has the page mapped, and cannot get at - * its virtual address (and hence PTE) without first - * getting the kmap_lock (which is held here). - * So no dangers, even with speculative execution. + * concurrent pkmap_inserts for this page - + * the other won the race, release this entry. + * + * we can still clear the pte without a tlb flush since + * it couldn't have been used yet. 
*/ - page = pte_page(pkmap_page_table[i]); - pte_clear(&init_mm, (unsigned long)page_address(page), - &pkmap_page_table[i]); + pte_clear(&init_mm, vaddr, ptep); + VM_BUG_ON(atomic_read(counter) != 0); + atomic_set(counter, 2); + pkmap_put(counter); + vaddr = 0; + } else + atomic_set(counter, 2); - set_page_address(page, NULL); - need_flush = 1; - } - if (need_flush) - flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); + return vaddr; } -/** - * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings +/* + * Flush all unused kmap mappings in order to remove stray mappings. */ void kmap_flush_unused(void) { - lock_kmap(); - flush_all_zero_pkmaps(); - unlock_kmap(); + WARN_ON_ONCE(1); } -static inline unsigned long map_new_virtual(struct page *page) +/* + * Avoid starvation deadlock by limiting the number of tasks that can obtain a + * kmap to (LAST_PKMAP - KM_TYPE_NR*NR_CPUS)/2. + */ +static void kmap_account(void) { - unsigned long vaddr; - int count; - -start: - count = LAST_PKMAP; - /* Find an empty entry */ - for (;;) { - last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; - if (!last_pkmap_nr) { - flush_all_zero_pkmaps(); - count = LAST_PKMAP; - } - if (!pkmap_count[last_pkmap_nr]) - break; /* Found a usable entry */ - if (--count) - continue; + int weight; +#ifndef CONFIG_PREEMPT_RT + if (in_interrupt()) { + /* irqs can always get them */ + weight = -1; + } else +#endif + if (current->flags & PF_KMAP) { + current->flags &= ~PF_KMAP; + /* we already accounted the second */ + weight = 0; + } else { + /* mark 1, account 2 */ + current->flags |= PF_KMAP; + weight = 2; + } + + if (weight > 0) { /* - * Sleep for somebody else to unmap their entries + * reserve KM_TYPE_NR maps per CPU for interrupt context */ - { - DECLARE_WAITQUEUE(wait, current); - - __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&pkmap_map_wait, &wait); - unlock_kmap(); - schedule(); - remove_wait_queue(&pkmap_map_wait, &wait); - lock_kmap(); 
- - /* Somebody else might have mapped it while we slept */ - if (page_address(page)) - return (unsigned long)page_address(page); - - /* Re-start */ - goto start; + const int target = LAST_PKMAP +#ifndef CONFIG_PREEMPT_RT + - KM_TYPE_NR*NR_CPUS +#endif + ; + +again: + wait_event(pkmap_wait, + atomic_read(&pkmap_users) + weight <= target); + + if (atomic_add_return(weight, &pkmap_users) > target) { + atomic_sub(weight, &pkmap_users); + goto again; } } - vaddr = PKMAP_ADDR(last_pkmap_nr); - set_pte_at(&init_mm, vaddr, - &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); +} - pkmap_count[last_pkmap_nr] = 1; - set_page_address(page, (void *)vaddr); +static void kunmap_account(void) +{ + int weight; - return vaddr; +#ifndef CONFIG_PREEMPT_RT + if (in_interrupt()) { /* must match kmap_account(): kmaps taken in any interrupt context (hard or soft) are never accounted */ + weight = -1; + } else +#endif + if (current->flags & PF_KMAP) { + /* there was only 1 kmap, un-account both */ + current->flags &= ~PF_KMAP; + weight = 2; + } else { + /* there were two kmaps, un-account per kunmap */ + weight = 1; + } + + if (weight > 0) + atomic_sub(weight, &pkmap_users); + wake_up(&pkmap_wait); } -/** - * kmap_high - map a highmem page into memory - * @page: &struct page to map - * - * Returns the page's virtual memory address. - * - * We cannot call this from interrupts, as it may block. - */ void *kmap_high(struct page *page) { unsigned long vaddr; - /* - * For highmem pages, we can't trust "virtual" until - * after we have the lock. - */ - lock_kmap(); + + kmap_account(); +again: vaddr = (unsigned long)page_address(page); + if (vaddr) { + atomic_t *counter = &pkmap_count[PKMAP_NR(vaddr)]; + if (atomic_inc_not_zero(counter)) { + /* + * atomic_inc_not_zero implies a (memory) barrier on success + * so page address will be reloaded. + */ + unsigned long vaddr2 = (unsigned long)page_address(page); + if (likely(vaddr == vaddr2)) + return (void *)vaddr; + + /* + * Oops, we got someone else. 
+ * + * This can happen if we get preempted after + * page_address() and before atomic_inc_not_zero() + * and during that preemption this slot is freed and + * reused. + */ + pkmap_put(counter); + goto again; + } + } + + vaddr = pkmap_insert(page); if (!vaddr) - vaddr = map_new_virtual(page); - pkmap_count[PKMAP_NR(vaddr)]++; - BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); - unlock_kmap(); - return (void*) vaddr; + goto again; + + return (void *)vaddr; } EXPORT_SYMBOL(kmap_high); @@ -240,51 +380,12 @@ void *kmap_high_get(struct page *page) } #endif -/** - * kunmap_high - map a highmem page into memory - * @page: &struct page to unmap - * - * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called - * only from user context. - */ -void kunmap_high(struct page *page) + void kunmap_high(struct page *page) { - unsigned long vaddr; - unsigned long nr; - unsigned long flags; - int need_wakeup; - - lock_kmap_any(flags); - vaddr = (unsigned long)page_address(page); + unsigned long vaddr = (unsigned long)page_address(page); BUG_ON(!vaddr); - nr = PKMAP_NR(vaddr); - - /* - * A count must never go down to zero - * without a TLB flush! - */ - need_wakeup = 0; - switch (--pkmap_count[nr]) { - case 0: - BUG(); - case 1: - /* - * Avoid an unnecessary wake_up() function call. - * The common case is pkmap_count[] == 1, but - * no waiters. - * The tasks queued in the wait-queue are guarded - * by both the lock in the wait-queue-head and by - * the kmap_lock. As the kmap_lock is held here, - * no need for the wait-queue-head's lock. Simply - * test if the queue is empty. 
- */ - need_wakeup = waitqueue_active(&pkmap_map_wait); - } - unlock_kmap_any(flags); - - /* do wake-up, if needed, race-free outside of the spin lock */ - if (need_wakeup) - wake_up(&pkmap_map_wait); + pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]); + kunmap_account(); } EXPORT_SYMBOL(kunmap_high); @@ -295,19 +396,13 @@ EXPORT_SYMBOL(kunmap_high); #define PA_HASH_ORDER 7 /* - * Describes one page->virtual association + * Describes one page->virtual address association. */ -struct page_address_map { +static struct page_address_map { struct page *page; void *virtual; struct list_head list; -}; - -/* - * page_address_map freelist, allocated from page_address_maps. - */ -static struct list_head page_address_pool; /* freelist */ -static spinlock_t pool_lock; /* protects page_address_pool */ +} page_address_maps[LAST_PKMAP]; /* * Hash table bucket @@ -328,29 +423,37 @@ static struct page_address_slot *page_slot(struct page *page) * * Returns the page's virtual address. */ -void *page_address(struct page *page) -{ - unsigned long flags; - void *ret; - struct page_address_slot *pas; - if (!PageHighMem(page)) - return lowmem_page_address(page); +static void *__page_address(struct page_address_slot *pas, struct page *page) +{ + void *ret = NULL; - pas = page_slot(page); - ret = NULL; - spin_lock_irqsave(&pas->lock, flags); if (!list_empty(&pas->lh)) { struct page_address_map *pam; list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { ret = pam->virtual; - goto done; + break; } } } -done: + + return ret; +} + +void *page_address(struct page *page) +{ + unsigned long flags; + void *ret; + struct page_address_slot *pas; + + if (!PageHighMem(page)) + return lowmem_page_address(page); + + pas = page_slot(page); + spin_lock_irqsave(&pas->lock, flags); + ret = __page_address(pas, page); spin_unlock_irqrestore(&pas->lock, flags); return ret; } @@ -362,62 +465,90 @@ EXPORT_SYMBOL(page_address); * @page: &struct page to set * @virtual: virtual address to use */ -void 
set_page_address(struct page *page, void *virtual) +static int __set_page_address(struct page *page, void *virtual, int pos) { + int ret = 0; unsigned long flags; struct page_address_slot *pas; struct page_address_map *pam; - BUG_ON(!PageHighMem(page)); + VM_BUG_ON(!PageHighMem(page)); + VM_BUG_ON(atomic_read(&pkmap_count[pos]) != 0); + VM_BUG_ON(pos < 0 || pos >= LAST_PKMAP); pas = page_slot(page); - if (virtual) { /* Add */ - BUG_ON(list_empty(&page_address_pool)); - - spin_lock_irqsave(&pool_lock, flags); - pam = list_entry(page_address_pool.next, - struct page_address_map, list); - list_del(&pam->list); - spin_unlock_irqrestore(&pool_lock, flags); - - pam->page = page; - pam->virtual = virtual; - - spin_lock_irqsave(&pas->lock, flags); - list_add_tail(&pam->list, &pas->lh); - spin_unlock_irqrestore(&pas->lock, flags); - } else { /* Remove */ - spin_lock_irqsave(&pas->lock, flags); - list_for_each_entry(pam, &pas->lh, list) { - if (pam->page == page) { - list_del(&pam->list); - spin_unlock_irqrestore(&pas->lock, flags); - spin_lock_irqsave(&pool_lock, flags); - list_add_tail(&pam->list, &page_address_pool); - spin_unlock_irqrestore(&pool_lock, flags); - goto done; - } + pam = &page_address_maps[pos]; + + spin_lock_irqsave(&pas->lock, flags); + if (virtual) { /* add */ + VM_BUG_ON(!list_empty(&pam->list)); + + if (!__page_address(pas, page)) { + pam->page = page; + pam->virtual = virtual; + list_add_tail(&pam->list, &pas->lh); + ret = 1; + } + } else { /* remove */ + if (!list_empty(&pam->list)) { + list_del_init(&pam->list); + ret = 1; } - spin_unlock_irqrestore(&pas->lock, flags); } -done: - return; + spin_unlock_irqrestore(&pas->lock, flags); + + return ret; } -static struct page_address_map page_address_maps[LAST_PKMAP]; +int set_page_address(struct page *page, void *virtual) +{ + /* + * set_page_address is not supposed to be called when using + * hashed virtual addresses. 
+ */ + BUG(); + return 0; +} -void __init page_address_init(void) +void __init __page_address_init(void) { int i; - INIT_LIST_HEAD(&page_address_pool); for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) - list_add(&page_address_maps[i].list, &page_address_pool); + INIT_LIST_HEAD(&page_address_maps[i].list); + for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { INIT_LIST_HEAD(&page_address_htable[i].lh); spin_lock_init(&page_address_htable[i].lock); } - spin_lock_init(&pool_lock); +} + +#elif defined (CONFIG_HIGHMEM) /* HASHED_PAGE_VIRTUAL */ + +static int __set_page_address(struct page *page, void *virtual, int pos) +{ + return set_page_address(page, virtual); +} + +#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ + +#if defined(CONFIG_HIGHMEM) || defined(HASHED_PAGE_VIRTUAL) + +void __init page_address_init(void) +{ +#ifdef CONFIG_HIGHMEM + int i; + + for (i = 0; i < ARRAY_SIZE(pkmap_count); i++) + atomic_set(&pkmap_count[i], 1); + atomic_set(&pkmap_hand, 0); + atomic_set(&pkmap_free, LAST_PKMAP); + atomic_set(&pkmap_users, 0); +#endif + +#ifdef HASHED_PAGE_VIRTUAL + __page_address_init(); +#endif } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fd4529d..34643f4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -948,13 +948,14 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val) goto done; /* - * Preemption is already disabled, we don't need get_cpu() + * Preemption is already disabled, we don't need get_cpu(), but + * that's not true for RT :) */ - cpu = smp_processor_id(); + cpu = get_cpu(); stat = &mem->stat; cpustat = &stat->cpustat[cpu]; - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); + put_cpu(); done: unlock_page_cgroup(pc); } diff --git a/mm/memory.c b/mm/memory.c index aede2ce..c393969 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -923,10 +923,13 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, 
return addr; } -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT) # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) #else -/* No preempt: go for improved straight-line efficiency */ +/* + * No preempt: go for improved straight-line efficiency + * on PREEMPT_RT this is not a critical latency-path. + */ # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) #endif @@ -956,17 +959,14 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -unsigned long unmap_vmas(struct mmu_gather **tlbp, +unsigned long unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) { long zap_work = ZAP_BLOCK_SIZE; - unsigned long tlb_start = 0; /* For tlb_finish_mmu */ - int tlb_start_valid = 0; unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; - int fullmm = (*tlbp)->fullmm; struct mm_struct *mm = vma->vm_mm; mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); @@ -987,11 +987,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, untrack_pfn_vma(vma, 0, 0); while (start != end) { - if (!tlb_start_valid) { - tlb_start = start; - tlb_start_valid = 1; - } - if (unlikely(is_vm_hugetlb_page(vma))) { /* * It is undesirable to test vma->vm_file as it @@ -1012,7 +1007,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, start = end; } else - start = unmap_page_range(*tlbp, vma, + start = unmap_page_range(tlb, vma, start, end, &zap_work, details); if (zap_work > 0) { @@ -1020,19 +1015,13 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, break; } - tlb_finish_mmu(*tlbp, tlb_start, start); - if (need_resched() || (i_mmap_lock && spin_needbreak(i_mmap_lock))) { - if (i_mmap_lock) { - *tlbp = NULL; + if (i_mmap_lock) goto out; - } cond_resched(); } - *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); - 
tlb_start_valid = 0; zap_work = ZAP_BLOCK_SIZE; } } @@ -1052,16 +1041,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; - struct mmu_gather *tlb; + struct mmu_gather tlb; unsigned long end = address + size; unsigned long nr_accounted = 0; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); + tlb_gather_mmu(&tlb, mm, 0); update_hiwater_rss(mm); end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); - if (tlb) - tlb_finish_mmu(tlb, address, end); + tlb_finish_mmu(&tlb, address, end); return end; } @@ -2480,12 +2468,12 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) return -ENOSYS; mutex_lock(&inode->i_mutex); - down_write(&inode->i_alloc_sem); + anon_down_write(&inode->i_alloc_sem); unmap_mapping_range(mapping, offset, (end - offset), 1); truncate_inode_pages_range(mapping, offset, end); unmap_mapping_range(mapping, offset, (end - offset), 1); inode->i_op->truncate_range(inode, offset, end); - up_write(&inode->i_alloc_sem); + anon_up_write(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); return 0; @@ -2956,6 +2944,28 @@ unlock: return 0; } +void pagefault_disable(void) +{ + current->pagefault_disabled++; + /* + * make sure to have issued the store before a pagefault + * can hit. + */ + barrier(); +} +EXPORT_SYMBOL(pagefault_disable); + +void pagefault_enable(void) +{ + /* + * make sure to issue those last loads/stores before enabling + * the pagefault handler again. + */ + barrier(); + current->pagefault_disabled--; +} +EXPORT_SYMBOL(pagefault_enable); + /* * By the time we get here, we already hold the mm semaphore */ diff --git a/mm/mmap.c b/mm/mmap.c index 8101de4..1a0f652 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1772,17 +1772,17 @@ static void unmap_region(struct mm_struct *mm, unsigned long start, unsigned long end) { struct vm_area_struct *next = prev? 
prev->vm_next: mm->mmap; - struct mmu_gather *tlb; + struct mmu_gather tlb; unsigned long nr_accounted = 0; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); + tlb_gather_mmu(&tlb, mm, 0); update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, + free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); - tlb_finish_mmu(tlb, start, end); + tlb_finish_mmu(&tlb, start, end); } /* @@ -1964,10 +1964,16 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_VM - if (unlikely(down_read_trylock(&mm->mmap_sem))) { +# ifdef CONFIG_PREEMPT_RT + if (unlikely(!rwsem_is_locked(&mm->mmap_sem))) { WARN_ON(1); - up_read(&mm->mmap_sem); } +# else + if (unlikely(down_read_trylock(&mm->mmap_sem))) { + WARN_ON(1); + up_read(&mm->mmap_sem); + } +# endif #endif } @@ -2081,7 +2087,7 @@ EXPORT_SYMBOL(do_brk); /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { - struct mmu_gather *tlb; + struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; unsigned long end; @@ -2106,13 +2112,13 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); - tlb = tlb_gather_mmu(mm, 1); + tlb_gather_mmu(&tlb, mm, 1); /* update_hiwater_rss(mm) here? 
but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); - tlb_finish_mmu(tlb, 0, end); + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); + tlb_finish_mmu(&tlb, 0, end); /* * Walk the list again, actually closing and freeing it, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a0de15f..b3ebbd9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -161,6 +161,53 @@ static unsigned long __meminitdata dma_reserve; EXPORT_SYMBOL(movable_zone); #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#ifdef CONFIG_PREEMPT_RT +static DEFINE_PER_CPU_LOCKED(int, pcp_locks); +#endif + +static inline void __lock_cpu_pcp(unsigned long *flags, int cpu) +{ +#ifdef CONFIG_PREEMPT_RT + spin_lock(&__get_cpu_lock(pcp_locks, cpu)); + *flags = 0; /* flags is unused on RT; zero the caller's copy, not the local pointer */ +#else + local_irq_save(*flags); +#endif +} + +static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu) +{ +#ifdef CONFIG_PREEMPT_RT + (void)get_cpu_var_locked(pcp_locks, this_cpu); + *flags = 0; /* unused on RT, but keep the caller's flags initialized */ +#else + local_irq_save(*flags); + *this_cpu = smp_processor_id(); +#endif +} + +static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu) +{ +#ifdef CONFIG_PREEMPT_RT + put_cpu_var_locked(pcp_locks, this_cpu); +#else + local_irq_restore(flags); +#endif +} + +static struct per_cpu_pageset * +get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu) +{ + lock_cpu_pcp(flags, this_cpu); + return zone_pcp(zone, *this_cpu); +} + +static void +put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu) +{ + unlock_cpu_pcp(flags, this_cpu); +} + #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; int nr_online_nodes __read_mostly = 1; @@ -523,7 +570,9 @@ static inline int free_pages_check(struct page *page) static void free_pages_bulk(struct zone *zone, int count, struct list_head *list, int order) { - spin_lock(&zone->lock); + unsigned 
long flags; + + spin_lock_irqsave(&zone->lock, flags); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; @@ -536,27 +585,31 @@ static void free_pages_bulk(struct zone *zone, int count, /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); __free_one_page(page, zone, order, page_private(page)); +#ifdef CONFIG_PREEMPT_RT + cond_resched_lock(&zone->lock); +#endif } - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void free_one_page(struct zone *zone, struct page *page, int order, int migratetype) { - spin_lock(&zone->lock); + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); __free_one_page(page, zone, order, migratetype); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; - int i; - int bad = 0; + int i, this_cpu, bad = 0; int wasMlocked = TestClearPageMlocked(page); kmemcheck_free_shadow(page, order); @@ -574,13 +627,13 @@ static void __free_pages_ok(struct page *page, unsigned int order) arch_free_page(page, order); kernel_map_pages(page, 1 << order, 0); - local_irq_save(flags); + lock_cpu_pcp(&flags, &this_cpu); if (unlikely(wasMlocked)) free_page_mlock(page); - __count_vm_events(PGFREE, 1 << order); + count_vm_events(PGFREE, 1 << order); + unlock_cpu_pcp(flags, this_cpu); free_one_page(page_zone(page), page, order, - get_pageblock_migratetype(page)); - local_irq_restore(flags); + get_pageblock_migratetype(page)); } /* @@ -915,6 +968,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, return i; } +static void +isolate_pcp_pages(int count, struct list_head *src, struct list_head *dst) +{ + while (count--) { + struct page *page = list_last_entry(src, struct page, lru); + list_move(&page->lru, dst); + } +} + + 
#ifdef CONFIG_NUMA /* * Called from the vmstat counter updater to drain pagesets of this @@ -926,17 +989,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { + LIST_HEAD(free_list); unsigned long flags; int to_drain; + int this_cpu; - local_irq_save(flags); + lock_cpu_pcp(&flags, &this_cpu); if (pcp->count >= pcp->batch) to_drain = pcp->batch; else to_drain = pcp->count; - free_pages_bulk(zone, to_drain, &pcp->list, 0); + isolate_pcp_pages(to_drain, &pcp->list, &free_list); pcp->count -= to_drain; - local_irq_restore(flags); + unlock_cpu_pcp(flags, this_cpu); + free_pages_bulk(zone, to_drain, &free_list, 0); } #endif @@ -955,14 +1021,22 @@ static void drain_pages(unsigned int cpu) for_each_populated_zone(zone) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; + LIST_HEAD(free_list); + int count; + __lock_cpu_pcp(&flags, cpu); pset = zone_pcp(zone, cpu); - + if (!pset) { + unlock_cpu_pcp(flags, cpu); + WARN_ON(1); + continue; + } pcp = &pset->pcp; - local_irq_save(flags); - free_pages_bulk(zone, pcp->count, &pcp->list, 0); + isolate_pcp_pages(pcp->count, &pcp->list, &free_list); + count = pcp->count; pcp->count = 0; - local_irq_restore(flags); + unlock_cpu_pcp(flags, cpu); + free_pages_bulk(zone, count, &free_list, 0); } } @@ -974,12 +1048,52 @@ void drain_local_pages(void *arg) drain_pages(smp_processor_id()); } +#ifdef CONFIG_PREEMPT_RT +static void drain_local_pages_work(struct work_struct *wrk) +{ + drain_pages(smp_processor_id()); +} +#endif + /* * Spill all the per-cpu pages from all CPUs back into the buddy allocator */ void drain_all_pages(void) { +#ifdef CONFIG_PREEMPT_RT + /* + * HACK!!!!! + * For RT we can't use IPIs to run drain_local_pages, since + * that code will call spin_locks that will now sleep. + * But, schedule_on_each_cpu will call kzalloc, which will + * call page_alloc which was what calls this. 
+ * + * Luckily, there's a condition to get here, and that is if + * the order passed in to alloc_pages is greater than 0 + * (alloced more than a page size). The slabs only allocate + * what is needed, and the allocation made by schedule_on_each_cpu + * does an alloc of "sizeof(void *)*nr_cpu_ids". + * + * So we can safely call schedule_on_each_cpu if that number + * is less than a page. Otherwise don't bother. At least warn of + * this issue. + * + * And yes, this is one big hack. Please fix ;-) + */ + if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE) + schedule_on_each_cpu(drain_local_pages_work); + else { + static int once; + if (!once) { + printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n"); + once = 1; + } + drain_local_pages(NULL); + } + +#else on_each_cpu(drain_local_pages, NULL, 1); +#endif } #ifdef CONFIG_HIBERNATION @@ -1024,9 +1138,10 @@ void mark_free_pages(struct zone *zone) static void free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); + struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; unsigned long flags; - int wasMlocked = TestClearPageMlocked(page); + int count, this_cpu, wasMlocked = TestClearPageMlocked(page); kmemcheck_free_shadow(page, 0); @@ -1042,12 +1157,12 @@ static void free_hot_cold_page(struct page *page, int cold) arch_free_page(page, 0); kernel_map_pages(page, 1, 0); - pcp = &zone_pcp(zone, get_cpu())->pcp; + pset = get_zone_pcp(zone, &flags, &this_cpu); + pcp = &pset->pcp; set_page_private(page, get_pageblock_migratetype(page)); - local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); - __count_vm_event(PGFREE); + count_vm_event(PGFREE); if (cold) list_add_tail(&page->lru, &pcp->list); @@ -1055,11 +1170,15 @@ static void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->list); pcp->count++; if (pcp->count >= pcp->high) { - free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + LIST_HEAD(free_list); + + isolate_pcp_pages(pcp->batch, &pcp->list, 
&free_list); pcp->count -= pcp->batch; - } - local_irq_restore(flags); - put_cpu(); + count = pcp->batch; + put_zone_pcp(zone, flags, this_cpu); + free_pages_bulk(zone, count, &free_list, 0); + } else + put_zone_pcp(zone, flags, this_cpu); } void free_hot_page(struct page *page) @@ -1113,15 +1232,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); - int cpu; + struct per_cpu_pageset *pset; + int this_cpu; again: - cpu = get_cpu(); + pset = get_zone_pcp(zone, &flags, &this_cpu); + if (likely(order == 0)) { - struct per_cpu_pages *pcp; + struct per_cpu_pages *pcp = &pset->pcp; - pcp = &zone_pcp(zone, cpu)->pcp; - local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, pcp->batch, &pcp->list, @@ -1165,7 +1284,7 @@ again: */ WARN_ON_ONCE(order > 1); } - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); page = __rmqueue(zone, order, migratetype); __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); spin_unlock(&zone->lock); @@ -1175,8 +1294,7 @@ again: __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone); - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1184,8 +1302,7 @@ again: return page; failed: - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); return NULL; } @@ -3051,7 +3168,23 @@ static inline void free_zone_pagesets(int cpu) struct zone *zone; for_each_zone(zone) { - struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + unsigned long flags; + struct per_cpu_pageset *pset; + + /* + * On PREEMPT_RT the allocator is preemptible, therefore + * kstopmachine can preempt a process in the middle of an + * allocation, freeing the pset underneath such a process + * isn't a good idea. 
+ * + * Take the per-cpu pcp lock to allow the task to complete + * before we free it. New tasks will be held off by the + * cpu_online() check in get_cpu_var_locked(). + */ + __lock_cpu_pcp(&flags, cpu); + pset = zone_pcp(zone, cpu); + zone_pcp(zone, cpu) = NULL; + unlock_cpu_pcp(flags, cpu); /* Free per_cpu_pageset if it is slab allocated */ if (pset != &boot_pageset[cpu]) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index f22b4eb..835674e 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -14,6 +14,7 @@ static void __meminit __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) { pc->flags = 0; + spin_lock_init(&pc->lock); pc->mem_cgroup = NULL; pc->page = pfn_to_page(pfn); INIT_LIST_HEAD(&pc->lru); diff --git a/mm/quicklist.c b/mm/quicklist.c index e66d07d..03341b0 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -19,7 +19,7 @@ #include <linux/module.h> #include <linux/quicklist.h> -DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; +DEFINE_PER_CPU_LOCKED(struct quicklist, quicklist)[CONFIG_NR_QUICK]; #define FRACTION_OF_NODE_MEM 16 @@ -66,17 +66,14 @@ void quicklist_trim(int nr, void (*dtor)(void *), { long pages_to_free; struct quicklist *q; + int cpu; - q = &get_cpu_var(quicklist)[nr]; + q = &get_cpu_var_locked(quicklist, &cpu)[nr]; if (q->nr_pages > min_pages) { pages_to_free = min_pages_to_free(q, min_pages, max_free); while (pages_to_free > 0) { - /* - * We pass a gfp_t of 0 to quicklist_alloc here - * because we will never call into the page allocator. 
- */ - void *p = quicklist_alloc(nr, 0, NULL); + void *p = __quicklist_alloc(q); if (dtor) dtor(p); @@ -84,7 +81,7 @@ void quicklist_trim(int nr, void (*dtor)(void *), pages_to_free--; } } - put_cpu_var(quicklist); + put_cpu_var_locked(quicklist, cpu); } unsigned long quicklist_total_size(void) @@ -94,7 +91,7 @@ unsigned long quicklist_total_size(void) struct quicklist *ql, *q; for_each_online_cpu(cpu) { - ql = per_cpu(quicklist, cpu); + ql = per_cpu_var_locked(quicklist, cpu); for (q = ql; q < ql + CONFIG_NR_QUICK; q++) count += q->nr_pages; } diff --git a/mm/slab.c b/mm/slab.c index 7b5d4de..a4bd906 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -121,6 +121,138 @@ #include <asm/page.h> /* + * On !PREEMPT_RT, raw irq flags are used as a per-CPU locking + * mechanism. + * + * On PREEMPT_RT, we use per-CPU locks for this. That's why the + * calling convention is changed slightly: a new 'flags' argument + * is passed to 'irq disable/enable' - the PREEMPT_RT code stores + * the CPU number of the lock there. + */ +#ifndef CONFIG_PREEMPT_RT + +# define slab_irq_disable(cpu) \ + do { local_irq_disable(); (cpu) = smp_processor_id(); } while (0) +# define slab_irq_enable(cpu) local_irq_enable() + +static inline void slab_irq_disable_this_rt(int cpu) +{ +} + +static inline void slab_irq_enable_rt(int cpu) +{ +} + +# define slab_irq_save(flags, cpu) \ + do { local_irq_save(flags); (cpu) = smp_processor_id(); } while (0) +# define slab_irq_restore(flags, cpu) local_irq_restore(flags) + +/* + * In the __GFP_WAIT case we enable/disable interrupts on !PREEMPT_RT, + * which has no per-CPU locking effect since we are holding the cache + * lock in that case already. 
+ */ +static void slab_irq_enable_GFP_WAIT(gfp_t flags, int *cpu) +{ + if (flags & __GFP_WAIT) + local_irq_enable(); +} + +static void slab_irq_disable_GFP_WAIT(gfp_t flags, int *cpu) +{ + if (flags & __GFP_WAIT) + local_irq_disable(); +} + +# define slab_spin_lock_irq(lock, cpu) \ + do { spin_lock_irq(lock); (cpu) = smp_processor_id(); } while (0) +# define slab_spin_unlock_irq(lock, cpu) spin_unlock_irq(lock) + +# define slab_spin_lock_irqsave(lock, flags, cpu) \ + do { spin_lock_irqsave(lock, flags); (cpu) = smp_processor_id(); } while (0) +# define slab_spin_unlock_irqrestore(lock, flags, cpu) \ + do { spin_unlock_irqrestore(lock, flags); } while (0) + +#else /* CONFIG_PREEMPT_RT */ + +/* + * Instead of serializing the per-cpu state by disabling interrupts we do so + * by a lock. This keeps the code preemptable - albeit at the cost of remote + * memory access when the task does get migrated away. + */ +DEFINE_PER_CPU_LOCKED(struct list_head, slab) = { 0, }; + +static void _slab_irq_disable(int *cpu) +{ + (void)get_cpu_var_locked(slab, cpu); +} + +#define slab_irq_disable(cpu) _slab_irq_disable(&(cpu)) + +static inline void slab_irq_enable(int cpu) +{ + LIST_HEAD(list); + + list_splice_init(&__get_cpu_var_locked(slab, cpu), &list); + put_cpu_var_locked(slab, cpu); + + while (!list_empty(&list)) { + struct page *page = list_first_entry(&list, struct page, lru); + list_del(&page->lru); + __free_pages(page, page->index); + } +} + +static inline void slab_irq_disable_this_rt(int cpu) +{ + spin_lock(&__get_cpu_lock(slab, cpu)); +} + +static inline void slab_irq_enable_rt(int cpu) +{ + LIST_HEAD(list); + + list_splice_init(&__get_cpu_var_locked(slab, cpu), &list); + spin_unlock(&__get_cpu_lock(slab, cpu)); + + while (!list_empty(&list)) { + struct page *page = list_first_entry(&list, struct page, lru); + list_del(&page->lru); + __free_pages(page, page->index); + } +} + +# define slab_irq_save(flags, cpu) \ + do { slab_irq_disable(cpu); (void) (flags); } while (0) +# 
define slab_irq_restore(flags, cpu) \ + do { slab_irq_enable(cpu); (void) (flags); } while (0) + +/* + * On PREEMPT_RT we have to drop the locks unconditionally to avoid lock + * recursion on the cache_grow()->alloc_slabmgmt() path. + */ +static void slab_irq_enable_GFP_WAIT(gfp_t flags, int *cpu) +{ + slab_irq_enable(*cpu); +} + +static void slab_irq_disable_GFP_WAIT(gfp_t flags, int *cpu) +{ + slab_irq_disable(*cpu); +} + +# define slab_spin_lock_irq(lock, cpu) \ + do { slab_irq_disable(cpu); spin_lock(lock); } while (0) +# define slab_spin_unlock_irq(lock, cpu) \ + do { spin_unlock(lock); slab_irq_enable(cpu); } while (0) +# define slab_spin_lock_irqsave(lock, flags, cpu) \ + do { slab_irq_disable(cpu); spin_lock_irqsave(lock, flags); } while (0) +# define slab_spin_unlock_irqrestore(lock, flags, cpu) \ + do { spin_unlock_irqrestore(lock, flags); slab_irq_enable(cpu); } while (0) + +#endif /* CONFIG_PREEMPT_RT */ + +/* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). 
* @@ -316,7 +448,7 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node); + int node, int *this_cpu); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); @@ -687,9 +819,10 @@ int slab_is_available(void) static DEFINE_PER_CPU(struct delayed_work, reap_work); -static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) +static inline struct array_cache * +cpu_cache_get(struct kmem_cache *cachep, int this_cpu) { - return cachep->array[smp_processor_id()]; + return cachep->array[this_cpu]; } static inline struct kmem_cache *__find_general_cachep(size_t size, @@ -930,7 +1063,7 @@ static int transfer_objects(struct array_cache *to, #ifndef CONFIG_NUMA #define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, l3) do { } while (0) +#define reap_alien(cachep, l3, this_cpu) 0 static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { @@ -941,27 +1074,28 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) { } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +static inline int +cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu) { return 0; } static inline void *alternate_node_alloc(struct kmem_cache *cachep, - gfp_t flags) + gfp_t flags, int *this_cpu) { return NULL; } static inline void *____cache_alloc_node(struct kmem_cache *cachep, - gfp_t flags, int nodeid) + gfp_t flags, int nodeid, int *this_cpu) { return NULL; } #else /* CONFIG_NUMA */ -static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); -static void *alternate_node_alloc(struct kmem_cache *, gfp_t); +static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int, int *); +static void *alternate_node_alloc(struct kmem_cache 
*, gfp_t, int *); static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { @@ -1002,7 +1136,8 @@ static void free_alien_cache(struct array_cache **ac_ptr) } static void __drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node) + struct array_cache *ac, int node, + int *this_cpu) { struct kmem_list3 *rl3 = cachep->nodelists[node]; @@ -1016,7 +1151,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, if (rl3->shared) transfer_objects(rl3->shared, ac, ac->limit); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, this_cpu); ac->avail = 0; spin_unlock(&rl3->list_lock); } @@ -1025,38 +1160,42 @@ static void __drain_alien_cache(struct kmem_cache *cachep, /* * Called from cache_reap() to regularly drain alien caches round robin. */ -static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) +static int +reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3, int *this_cpu) { - int node = __get_cpu_var(reap_node); + int node = per_cpu(reap_node, *this_cpu); if (l3->alien) { struct array_cache *ac = l3->alien[node]; if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { - __drain_alien_cache(cachep, ac, node); + __drain_alien_cache(cachep, ac, node, this_cpu); spin_unlock_irq(&ac->lock); + return 1; } } + return 0; } static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) { - int i = 0; + int i = 0, this_cpu; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { ac = alien[i]; if (ac) { - spin_lock_irqsave(&ac->lock, flags); - __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + slab_spin_lock_irqsave(&ac->lock, flags, this_cpu); + __drain_alien_cache(cachep, ac, i, &this_cpu); + slab_spin_unlock_irqrestore(&ac->lock, flags, this_cpu); } } } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +static inline int +cache_free_alien(struct kmem_cache *cachep, 
void *objp, int *this_cpu) { struct slab *slabp = virt_to_slab(objp); int nodeid = slabp->nodeid; @@ -1064,7 +1203,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) struct array_cache *alien = NULL; int node; - node = numa_node_id(); + node = cpu_to_node(*this_cpu); /* * Make sure we are not freeing a object from another node to the array @@ -1080,20 +1219,20 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) spin_lock(&alien->lock); if (unlikely(alien->avail == alien->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, alien, nodeid); + __drain_alien_cache(cachep, alien, nodeid, this_cpu); } alien->entry[alien->avail++] = objp; spin_unlock(&alien->lock); } else { spin_lock(&(cachep->nodelists[nodeid])->list_lock); - free_block(cachep, &objp, 1, nodeid); + free_block(cachep, &objp, 1, nodeid, this_cpu); spin_unlock(&(cachep->nodelists[nodeid])->list_lock); } return 1; } #endif -static void __cpuinit cpuup_canceled(long cpu) +static void __cpuinit cpuup_canceled(int cpu) { struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; @@ -1104,6 +1243,7 @@ static void __cpuinit cpuup_canceled(long cpu) struct array_cache *nc; struct array_cache *shared; struct array_cache **alien; + int orig_cpu = cpu; /* cpu is dead; no one can alloc from it. 
*/ nc = cachep->array[cpu]; @@ -1118,7 +1258,8 @@ static void __cpuinit cpuup_canceled(long cpu) /* Free limit for this kmem_list3 */ l3->free_limit -= cachep->batchcount; if (nc) - free_block(cachep, nc->entry, nc->avail, node); + free_block(cachep, nc->entry, nc->avail, node, + &cpu); if (!cpus_empty(*mask)) { spin_unlock_irq(&l3->list_lock); @@ -1128,7 +1269,7 @@ static void __cpuinit cpuup_canceled(long cpu) shared = l3->shared; if (shared) { free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &cpu); l3->shared = NULL; } @@ -1144,6 +1285,7 @@ static void __cpuinit cpuup_canceled(long cpu) } free_array_cache: kfree(nc); + BUG_ON(cpu != orig_cpu); } /* * In the previous loop, all the objects were freed to @@ -1158,7 +1300,7 @@ free_array_cache: } } -static int __cpuinit cpuup_prepare(long cpu) +static int __cpuinit cpuup_prepare(int cpu) { struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; @@ -1266,10 +1408,19 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, long cpu = (long)hcpu; int err = 0; + switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&cache_chain_mutex); + /* + * lock/unlock cycle to push any holders away -- no new ones + * can come in due to the cpu still being offline. + * + * XXX -- weird case anyway, can it happen? + */ + slab_irq_disable_this_rt(cpu); + slab_irq_enable_rt(cpu); err = cpuup_prepare(cpu); mutex_unlock(&cache_chain_mutex); break; @@ -1309,10 +1460,14 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: mutex_lock(&cache_chain_mutex); + slab_irq_disable_this_rt(cpu); cpuup_canceled(cpu); + slab_irq_enable_rt(cpu); mutex_unlock(&cache_chain_mutex); break; } + + return err ? 
NOTIFY_BAD : NOTIFY_OK; } @@ -1370,6 +1525,12 @@ void __init kmem_cache_init(void) int order; int node; +#ifdef CONFIG_PREEMPT_RT + for_each_possible_cpu(i) { + INIT_LIST_HEAD(&__get_cpu_var_locked(slab, i)); + } +#endif + if (num_possible_nodes() == 1) use_alien_caches = 0; @@ -1499,32 +1660,34 @@ void __init kmem_cache_init(void) /* 4) Replace the bootstrap head arrays */ { struct array_cache *ptr; + int cpu = smp_processor_id(); ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, cpu_cache_get(&cache_cache), + BUG_ON(cpu_cache_get(&cache_cache, cpu) != + &initarray_cache.cache); + memcpy(ptr, cpu_cache_get(&cache_cache, cpu), sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - cache_cache.array[smp_processor_id()] = ptr; + cache_cache.array[cpu] = ptr; ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) + BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, cpu) != &initarray_generic.cache); - memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), + memcpy(ptr, + cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, cpu), sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = - ptr; + malloc_sizes[INDEX_AC].cs_cachep->array[cpu] = ptr; } /* 5) Replace the bootstrap kmem_list3's */ { @@ -1642,12 +1805,14 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) /* * Interface to system's page release. 
*/ -static void kmem_freepages(struct kmem_cache *cachep, void *addr) +static void kmem_freepages(struct kmem_cache *cachep, void *addr, int cpu) { unsigned long i = (1 << cachep->gfporder); - struct page *page = virt_to_page(addr); + struct page *page, *basepage = virt_to_page(addr); const unsigned long nr_freed = i; + page = basepage; + kmemcheck_free_shadow(page, cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) @@ -1656,6 +1821,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) else sub_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_freed); + while (i--) { BUG_ON(!PageSlab(page)); __ClearPageSlab(page); @@ -1663,6 +1829,13 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) } if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; + +#ifdef CONFIG_PREEMPT_RT + if (cpu >= 0) { + basepage->index = cachep->gfporder; + list_add(&basepage->lru, &__get_cpu_var_locked(slab, cpu)); + } else +#endif free_pages((unsigned long)addr, cachep->gfporder); } @@ -1671,7 +1844,7 @@ static void kmem_rcu_free(struct rcu_head *head) struct slab_rcu *slab_rcu = (struct slab_rcu *)head; struct kmem_cache *cachep = slab_rcu->cachep; - kmem_freepages(cachep, slab_rcu->addr); + kmem_freepages(cachep, slab_rcu->addr, -1); if (OFF_SLAB(cachep)) kmem_cache_free(cachep->slabp_cache, slab_rcu); } @@ -1691,7 +1864,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, *addr++ = 0x12345678; *addr++ = caller; - *addr++ = smp_processor_id(); + *addr++ = raw_smp_processor_id(); size -= 3 * sizeof(unsigned long); { unsigned long *sptr = &caller; @@ -1881,6 +2054,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab } #endif +static void +__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu); + + /** * slab_destroy - destroy and release all objects in a slab * @cachep: cache pointer being destroyed @@ -1890,7 +2067,8 @@ static void 
slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. */ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +static void +slab_destroy(struct kmem_cache *cachep, struct slab *slabp, int *this_cpu) { void *addr = slabp->s_mem - slabp->colouroff; @@ -1903,9 +2081,13 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) slab_rcu->addr = addr; call_rcu(&slab_rcu->head, kmem_rcu_free); } else { - kmem_freepages(cachep, addr); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slabp); + kmem_freepages(cachep, addr, this_cpu ? *this_cpu : -1); + if (OFF_SLAB(cachep)) { + if (this_cpu) + __cache_free(cachep->slabp_cache, slabp, this_cpu); + else + kmem_cache_free(cachep->slabp_cache, slabp); + } } } @@ -2002,6 +2184,8 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) { + int this_cpu; + if (g_cpucache_up == FULL) return enable_cpucache(cachep, gfp); @@ -2045,10 +2229,12 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - cpu_cache_get(cachep)->avail = 0; - cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - cpu_cache_get(cachep)->batchcount = 1; - cpu_cache_get(cachep)->touched = 0; + this_cpu = raw_smp_processor_id(); + + cpu_cache_get(cachep, this_cpu)->avail = 0; + cpu_cache_get(cachep, this_cpu)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep, this_cpu)->batchcount = 1; + cpu_cache_get(cachep, this_cpu)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; return 0; @@ -2358,19 +2544,19 @@ EXPORT_SYMBOL(kmem_cache_create); #if DEBUG static void check_irq_off(void) { +/* + * On PREEMPT_RT we use locks to protect the per-CPU lists, + * and keep interrupts enabled.
+ */ +#ifndef CONFIG_PREEMPT_RT BUG_ON(!irqs_disabled()); +#endif } static void check_irq_on(void) { +#ifndef CONFIG_PREEMPT_RT BUG_ON(irqs_disabled()); -} - -static void check_spinlock_acquired(struct kmem_cache *cachep) -{ -#ifdef CONFIG_SMP - check_irq_off(); - assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); #endif } @@ -2385,34 +2571,67 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #else #define check_irq_off() do { } while(0) #define check_irq_on() do { } while(0) -#define check_spinlock_acquired(x) do { } while(0) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node); -static void do_drain(void *arg) +static void __do_drain(void *arg, int this_cpu) { struct kmem_cache *cachep = arg; + int node = cpu_to_node(this_cpu); struct array_cache *ac; - int node = numa_node_id(); check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, this_cpu); spin_lock(&cachep->nodelists[node]->list_lock); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, &this_cpu); spin_unlock(&cachep->nodelists[node]->list_lock); ac->avail = 0; } +#ifdef CONFIG_PREEMPT_RT +static void do_drain(void *arg, int this_cpu) +{ + __do_drain(arg, this_cpu); +} +#else +static void do_drain(void *arg) +{ + __do_drain(arg, smp_processor_id()); +} +#endif + +#ifdef CONFIG_PREEMPT_RT +/* + * execute func() for all CPUs. On PREEMPT_RT we dont actually have + * to run on the remote CPUs - we only have to take their CPU-locks. + * (This is a rare operation, so cacheline bouncing is not an issue.) 
+ */ +static void +slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg) +{ + unsigned int i; + + check_irq_on(); + for_each_online_cpu(i) { + spin_lock(&__get_cpu_lock(slab, i)); + func(arg, i); + spin_unlock(&__get_cpu_lock(slab, i)); + } +} +#else +# define slab_on_each_cpu(func, cachep) on_each_cpu(func, cachep, 1) +#endif + static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1); + slab_on_each_cpu(do_drain, cachep); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -2437,16 +2656,16 @@ static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree) { struct list_head *p; - int nr_freed; + int nr_freed, this_cpu; struct slab *slabp; nr_freed = 0; while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); p = l3->slabs_free.prev; if (p == &l3->slabs_free) { - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); goto out; } @@ -2455,13 +2674,9 @@ static int drain_freelist(struct kmem_cache *cache, BUG_ON(slabp->inuse); #endif list_del(&slabp->list); - /* - * Safe to drop the lock. The slab is no longer linked - * to the cache. - */ l3->free_objects -= cache->num; - spin_unlock_irq(&l3->list_lock); - slab_destroy(cache, slabp); + slab_destroy(cache, slabp, &this_cpu); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); nr_freed++; } out: @@ -2725,8 +2940,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. 
*/ -static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, void *objp) +static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid, + void *objp, int *this_cpu) { struct slab *slabp; size_t offset; @@ -2754,8 +2969,7 @@ static int cache_grow(struct kmem_cache *cachep, offset *= cachep->colour_off; - if (local_flags & __GFP_WAIT) - local_irq_enable(); + slab_irq_enable_GFP_WAIT(local_flags, this_cpu); /* * The test for missing atomic flag is performed here, rather than @@ -2784,8 +2998,8 @@ static int cache_grow(struct kmem_cache *cachep, cache_init_objs(cachep, slabp); - if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_GFP_WAIT(local_flags, this_cpu); + check_irq_off(); spin_lock(&l3->list_lock); @@ -2796,10 +3010,9 @@ static int cache_grow(struct kmem_cache *cachep, spin_unlock(&l3->list_lock); return 1; opps1: - kmem_freepages(cachep, objp); + kmem_freepages(cachep, objp, -1); failed: - if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_GFP_WAIT(local_flags, this_cpu); return 0; } @@ -2921,7 +3134,8 @@ bad: #define check_slabp(x,y) do { } while(0) #endif -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) +static void * +cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { int batchcount; struct kmem_list3 *l3; @@ -2931,7 +3145,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) retry: check_irq_off(); node = numa_node_id(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { /* @@ -2941,7 +3155,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - l3 = cachep->nodelists[node]; + l3 = cachep->nodelists[cpu_to_node(*this_cpu)]; BUG_ON(ac->avail > 0 || !l3); spin_lock(&l3->list_lock); @@ -2964,7 +3178,7 @@ retry: slabp = list_entry(entry, struct slab, list); check_slabp(cachep, slabp); - check_spinlock_acquired(cachep); + 
check_spinlock_acquired_node(cachep, cpu_to_node(*this_cpu)); /* * The slab was either on partial or free list so @@ -2978,8 +3192,9 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, - node); + ac->entry[ac->avail++] = + slab_get_obj(cachep, slabp, + cpu_to_node(*this_cpu)); } check_slabp(cachep, slabp); @@ -2998,10 +3213,10 @@ alloc_done: if (unlikely(!ac->avail)) { int x; - x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); + x = cache_grow(cachep, flags | GFP_THISNODE, cpu_to_node(*this_cpu), NULL, this_cpu); /* cache_grow can reenable interrupts, then ac could change. */ - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); if (!x && ac->avail == 0) /* no objects in sight? abort */ return NULL; @@ -3088,21 +3303,22 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) return should_failslab(obj_size(cachep), flags); } -static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) +static inline void * +____cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { void *objp; struct array_cache *ac; check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); ac->touched = 1; objp = ac->entry[--ac->avail]; } else { STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags); + objp = cache_alloc_refill(cachep, flags, this_cpu); } /* * To avoid a false negative, if an object that is in one of the @@ -3120,7 +3336,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. 
*/ -static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags, + int *this_cpu) { int nid_alloc, nid_here; @@ -3132,7 +3349,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); if (nid_alloc != nid_here) - return ____cache_alloc_node(cachep, flags, nid_alloc); + return ____cache_alloc_node(cachep, flags, nid_alloc, this_cpu); return NULL; } @@ -3144,7 +3361,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) * allocator to do its reclaim / fallback magic. We then insert the * slab into the proper nodelist and then allocate from it. */ -static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) +static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu) { struct zonelist *zonelist; gfp_t local_flags; @@ -3172,7 +3389,8 @@ retry: cache->nodelists[nid] && cache->nodelists[nid]->free_objects) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + flags | GFP_THISNODE, nid, + this_cpu); if (obj) break; } @@ -3185,20 +3403,21 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. 
*/ - if (local_flags & __GFP_WAIT) - local_irq_enable(); + slab_irq_enable_GFP_WAIT(local_flags, this_cpu); + kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, numa_node_id()); - if (local_flags & __GFP_WAIT) - local_irq_disable(); + obj = kmem_getpages(cache, local_flags, cpu_to_node(*this_cpu)); + + slab_irq_disable_GFP_WAIT(local_flags, this_cpu); + if (obj) { /* * Insert into the appropriate per node queues */ nid = page_to_nid(virt_to_page(obj)); - if (cache_grow(cache, flags, nid, obj)) { + if (cache_grow(cache, flags, nid, obj, this_cpu)) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + flags | GFP_THISNODE, nid, this_cpu); if (!obj) /* * Another processor may allocate the @@ -3219,7 +3438,7 @@ retry: * A interface to enable slab creation on nodeid */ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, - int nodeid) + int nodeid, int *this_cpu) { struct list_head *entry; struct slab *slabp; @@ -3267,11 +3486,11 @@ retry: must_grow: spin_unlock(&l3->list_lock); - x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); + x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL, this_cpu); if (x) goto retry; - return fallback_alloc(cachep, flags); + return fallback_alloc(cachep, flags, this_cpu); done: return obj; @@ -3294,6 +3513,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, void *caller) { unsigned long save_flags; + int this_cpu, this_node; void *ptr; flags &= gfp_allowed_mask; @@ -3304,32 +3524,34 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, return NULL; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); + slab_irq_save(save_flags, this_cpu); + + this_node = cpu_to_node(this_cpu); if (unlikely(nodeid == -1)) - nodeid = numa_node_id(); + nodeid = this_node; if (unlikely(!cachep->nodelists[nodeid])) { /* Node not bootstrapped yet */ - ptr = fallback_alloc(cachep, flags); + ptr = fallback_alloc(cachep, flags, 
&this_cpu); goto out; } - if (nodeid == numa_node_id()) { + if (nodeid == this_node) { /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback * to other nodes. It may fail while we still have * objects on other nodes available. */ - ptr = ____cache_alloc(cachep, flags); + ptr = ____cache_alloc(cachep, flags, &this_cpu); if (ptr) goto out; } /* ___cache_alloc_node can fall back to other nodes */ - ptr = ____cache_alloc_node(cachep, flags, nodeid); + ptr = ____cache_alloc_node(cachep, flags, nodeid, &this_cpu); out: - local_irq_restore(save_flags); + slab_irq_restore(save_flags, this_cpu); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, flags); @@ -3344,33 +3566,33 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, } static __always_inline void * -__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu) { void *objp; if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { - objp = alternate_node_alloc(cache, flags); + objp = alternate_node_alloc(cache, flags, this_cpu); if (objp) goto out; } - objp = ____cache_alloc(cache, flags); + objp = ____cache_alloc(cache, flags, this_cpu); /* * We may just have run out of memory on the local node. 
* ____cache_alloc_node() knows how to locate memory on other nodes */ - if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_node_id()); - + if (!objp) + objp = ____cache_alloc_node(cache, flags, + cpu_to_node(*this_cpu), this_cpu); out: return objp; } #else static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { - return ____cache_alloc(cachep, flags); + return ____cache_alloc(cachep, flags, this_cpu); } #endif /* CONFIG_NUMA */ @@ -3379,6 +3601,7 @@ static __always_inline void * __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) { unsigned long save_flags; + int this_cpu; void *objp; flags &= gfp_allowed_mask; @@ -3389,9 +3612,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) return NULL; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags); - local_irq_restore(save_flags); + slab_irq_save(save_flags, this_cpu); + objp = __do_cache_alloc(cachep, flags, &this_cpu); + slab_irq_restore(save_flags, this_cpu); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, flags); @@ -3410,7 +3633,7 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) * Caller needs to acquire correct kmem_list's list_lock */ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, - int node) + int node, int *this_cpu) { int i; struct kmem_list3 *l3; @@ -3439,7 +3662,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, * a different cache, refer to comments before * alloc_slabmgmt. 
*/ - slab_destroy(cachep, slabp); + slab_destroy(cachep, slabp, this_cpu); } else { list_add(&slabp->list, &l3->slabs_free); } @@ -3453,11 +3676,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, } } -static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) +static void +cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac, int *this_cpu) { int batchcount; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = cpu_to_node(*this_cpu); batchcount = ac->batchcount; #if DEBUG @@ -3479,7 +3703,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) } } - free_block(cachep, ac->entry, batchcount, node); + free_block(cachep, ac->entry, batchcount, node, this_cpu); free_done: #if STATS { @@ -3508,9 +3732,10 @@ free_done: * Release an obj back to its cache. If the obj has a constructed state, it must * be in this state _before_ it is released. Called with disabled ints. */ -static inline void __cache_free(struct kmem_cache *cachep, void *objp) +static inline void +__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu) { - struct array_cache *ac = cpu_cache_get(cachep); + struct array_cache *ac = cpu_cache_get(cachep, *this_cpu); check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); @@ -3525,7 +3750,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) * variable to skip the call, which is mostly likely to be present in * the cache. 
*/ - if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) + if (nr_online_nodes > 1 && cache_free_alien(cachep, objp, this_cpu)) return; if (likely(ac->avail < ac->limit)) { @@ -3534,7 +3759,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) return; } else { STATS_INC_FREEMISS(cachep); - cache_flusharray(cachep, ac); + cache_flusharray(cachep, ac, this_cpu); ac->entry[ac->avail++] = objp; } } @@ -3733,13 +3958,14 @@ EXPORT_SYMBOL(__kmalloc); void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + int this_cpu; - local_irq_save(flags); + slab_irq_save(flags, this_cpu); debug_check_no_locks_freed(objp, obj_size(cachep)); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(objp, obj_size(cachep)); - __cache_free(cachep, objp); - local_irq_restore(flags); + __cache_free(cachep, objp, &this_cpu); + slab_irq_restore(flags, this_cpu); trace_kmem_cache_free(_RET_IP_, objp); } @@ -3758,18 +3984,19 @@ void kfree(const void *objp) { struct kmem_cache *c; unsigned long flags; + int this_cpu; trace_kfree(_RET_IP_, objp); if (unlikely(ZERO_OR_NULL_PTR(objp))) return; - local_irq_save(flags); + slab_irq_save(flags, this_cpu); kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, obj_size(c)); debug_check_no_obj_freed(objp, obj_size(c)); - __cache_free(c, (void *)objp); - local_irq_restore(flags); + __cache_free(c, (void *)objp, &this_cpu); + slab_irq_restore(flags, this_cpu); } EXPORT_SYMBOL(kfree); @@ -3790,7 +4017,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); */ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) { - int node; + int node, this_cpu; struct kmem_list3 *l3; struct array_cache *new_shared; struct array_cache **new_alien = NULL; @@ -3818,11 +4045,11 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) if (l3) { struct array_cache *shared = l3->shared; - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (shared) 
free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &this_cpu); l3->shared = new_shared; if (!l3->alien) { @@ -3831,7 +4058,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); kfree(shared); free_alien_cache(new_alien); continue; @@ -3878,24 +4105,36 @@ struct ccupdate_struct { struct array_cache *new[NR_CPUS]; }; -static void do_ccupdate_local(void *info) +static void __do_ccupdate_local(void *info, int this_cpu) { struct ccupdate_struct *new = info; struct array_cache *old; check_irq_off(); - old = cpu_cache_get(new->cachep); + old = cpu_cache_get(new->cachep, this_cpu); - new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; - new->new[smp_processor_id()] = old; + new->cachep->array[this_cpu] = new->new[this_cpu]; + new->new[this_cpu] = old; } +#ifdef CONFIG_PREEMPT_RT +static void do_ccupdate_local(void *arg, int this_cpu) +{ + __do_ccupdate_local(arg, this_cpu); +} +#else +static void do_ccupdate_local(void *arg) +{ + __do_ccupdate_local(arg, smp_processor_id()); +} +#endif + /* Always called with the cache_chain_mutex held */ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { struct ccupdate_struct *new; - int i; + int i, this_cpu; new = kzalloc(sizeof(*new), gfp); if (!new) @@ -3913,7 +4152,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, } new->cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1); + slab_on_each_cpu(do_ccupdate_local, (void *)new); check_irq_on(); cachep->batchcount = batchcount; @@ -3924,9 +4163,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, struct array_cache *ccold = new->new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); - free_block(cachep, ccold->entry, 
ccold->avail, cpu_to_node(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); + slab_spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, + this_cpu); + free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i), + &this_cpu); + slab_spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, + this_cpu); kfree(ccold); } kfree(new); @@ -3991,29 +4233,31 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) * Drain an array if it contains any elements taking the l3 lock only if * necessary. Note that the l3 listlock also protects the array_cache * if drain_array() is used on the shared array. + * returns non-zero if some work is done */ -void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, - struct array_cache *ac, int force, int node) +int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, int force, int node) { - int tofree; + int tofree, this_cpu; if (!ac || !ac->avail) - return; + return 0; if (ac->touched && !force) { ac->touched = 0; } else { - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (ac->avail) { tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node); + free_block(cachep, ac->entry, tofree, node, &this_cpu); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } + return 1; } /** @@ -4030,10 +4274,11 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, */ static void cache_reap(struct work_struct *w) { + int this_cpu = raw_smp_processor_id(), node = cpu_to_node(this_cpu); struct kmem_cache *searchp; struct kmem_list3 *l3; - int node = numa_node_id(); struct delayed_work *work = to_delayed_work(w); + int work_done = 0; if (!mutex_trylock(&cache_chain_mutex)) /* Give up. 
Setup the next iteration. */ @@ -4049,9 +4294,12 @@ static void cache_reap(struct work_struct *w) */ l3 = searchp->nodelists[node]; - reap_alien(searchp, l3); + work_done += reap_alien(searchp, l3, &this_cpu); + + node = cpu_to_node(this_cpu); - drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); + work_done += drain_array(searchp, l3, + cpu_cache_get(searchp, this_cpu), 0, node); /* * These are racy checks but it does not matter @@ -4062,7 +4310,7 @@ static void cache_reap(struct work_struct *w) l3->next_reap = jiffies + REAPTIMEOUT_LIST3; - drain_array(searchp, l3, l3->shared, 0, node); + work_done += drain_array(searchp, l3, l3->shared, 0, node); if (l3->free_touched) l3->free_touched = 0; @@ -4081,7 +4329,8 @@ next: next_reap_node(); out: /* Set up the next iteration */ - schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); + schedule_delayed_work(work, + round_jiffies_relative((1+!work_done) * REAPTIMEOUT_CPUC)); } #ifdef CONFIG_SLABINFO @@ -4140,7 +4389,7 @@ static int s_show(struct seq_file *m, void *p) unsigned long num_slabs, free_objects = 0, shared_avail = 0; const char *name; char *error = NULL; - int node; + int this_cpu, node; struct kmem_list3 *l3; active_objs = 0; @@ -4151,7 +4400,7 @@ static int s_show(struct seq_file *m, void *p) continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); list_for_each_entry(slabp, &l3->slabs_full, list) { if (slabp->inuse != cachep->num && !error) @@ -4176,7 +4425,7 @@ static int s_show(struct seq_file *m, void *p) if (l3->shared) shared_avail += l3->shared->avail; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -4386,7 +4635,7 @@ static int leaks_show(struct seq_file *m, void *p) struct kmem_list3 *l3; const char *name; unsigned long *n = m->private; - int node; + int node, this_cpu; int i; if (!(cachep->flags & SLAB_STORE_USER)) @@ 
-4404,13 +4653,13 @@ static int leaks_show(struct seq_file *m, void *p) continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); list_for_each_entry(slabp, &l3->slabs_full, list) handle_slab(n, cachep, slabp); list_for_each_entry(slabp, &l3->slabs_partial, list) handle_slab(n, cachep, slabp); - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } name = cachep->name; if (n[0] == n[1]) { diff --git a/mm/swap.c b/mm/swap.c index cb29ae5..c2dfe5f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -30,15 +30,93 @@ #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> +#include <linux/interrupt.h> #include "internal.h" /* How many pages do we try to swap or page in/out together? */ int page_cluster; +#ifdef CONFIG_PREEMPT_RT +/* + * On PREEMPT_RT we don't want to disable preemption for cpu variables. + * We grab a cpu and then use that cpu to lock the variables accordingly. + * + * (On !PREEMPT_RT this turns into normal preempt-off sections, as before.) 
+ */ +static DEFINE_PER_CPU_LOCKED(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); +static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_rotate_pvecs); + +#define swap_get_cpu_var_irq_save(var, flags, cpu) \ + ({ \ + (void)flags; \ + &get_cpu_var_locked(var, &cpu); \ + }) + +#define swap_put_cpu_var_irq_restore(var, flags, cpu) \ + put_cpu_var_locked(var, cpu) + +#define swap_get_cpu_var(var, cpu) \ + &get_cpu_var_locked(var, &cpu) + +#define swap_put_cpu_var(var, cpu) \ + put_cpu_var_locked(var, cpu) + +#define swap_per_cpu_lock(var, cpu) \ + ({ \ + spin_lock(&__get_cpu_lock(var, cpu)); \ + &__get_cpu_var_locked(var, cpu); \ + }) + +#define swap_per_cpu_unlock(var, cpu) \ + spin_unlock(&__get_cpu_lock(var, cpu)) + +#define swap_get_cpu() raw_smp_processor_id() + +#define swap_put_cpu() do { } while (0) + +#define swap_irq_save(flags) do { (void)flags; } while (0) + +#define swap_irq_restore(flags) do { (void)flags; } while (0) + +#else + static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); +#define swap_get_cpu_var_irq_save(var, flags, cpu) \ + ({ \ + (void)cpu; \ + local_irq_save(flags); \ + &__get_cpu_var(var); \ + }) + +#define swap_put_cpu_var_irq_restore(var, flags, cpu) \ + local_irq_restore(flags) + +#define swap_get_cpu_var(var, cpu) \ + &get_cpu_var(var) + +#define swap_put_cpu_var(var, cpu) \ + ({ \ + (void)cpu; \ + put_cpu_var(var); \ + }) + +#define swap_per_cpu_lock(var, cpu) &per_cpu(var, cpu) + +#define swap_per_cpu_unlock(var, cpu) do { } while (0) + +#define swap_get_cpu() get_cpu() + +#define swap_put_cpu() put_cpu() + +#define swap_irq_save(flags) local_irq_save(flags) + +#define swap_irq_restore(flags) local_irq_restore(flags) + +#endif + /* * This path almost never happens for VM activity - pages are normally * freed via pagevecs. But it gets used by networking.
@@ -141,13 +219,13 @@ void rotate_reclaimable_page(struct page *page) !PageUnevictable(page) && PageLRU(page)) { struct pagevec *pvec; unsigned long flags; + int cpu; page_cache_get(page); - local_irq_save(flags); - pvec = &__get_cpu_var(lru_rotate_pvecs); + pvec = swap_get_cpu_var_irq_save(lru_rotate_pvecs, flags, cpu); if (!pagevec_add(pvec, page)) pagevec_move_tail(pvec); - local_irq_restore(flags); + swap_put_cpu_var_irq_restore(lru_rotate_pvecs, flags, cpu); } } @@ -216,12 +294,14 @@ EXPORT_SYMBOL(mark_page_accessed); void __lru_cache_add(struct page *page, enum lru_list lru) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; + struct pagevec *pvec; + int cpu; + pvec = swap_get_cpu_var(lru_add_pvecs, cpu)[lru]; page_cache_get(page); if (!pagevec_add(pvec, page)) ____pagevec_lru_add(pvec, lru); - put_cpu_var(lru_add_pvecs); + swap_put_cpu_var(lru_add_pvecs, cpu); } /** @@ -271,31 +351,33 @@ void add_page_to_unevictable_list(struct page *page) */ static void drain_cpu_pagevecs(int cpu) { - struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); - struct pagevec *pvec; + struct pagevec *pvecs, *pvec; int lru; + pvecs = swap_per_cpu_lock(lru_add_pvecs, cpu)[0]; for_each_lru(lru) { pvec = &pvecs[lru - LRU_BASE]; if (pagevec_count(pvec)) ____pagevec_lru_add(pvec, lru); } + swap_per_cpu_unlock(lru_add_pvecs, cpu); - pvec = &per_cpu(lru_rotate_pvecs, cpu); + pvec = swap_per_cpu_lock(lru_rotate_pvecs, cpu); if (pagevec_count(pvec)) { unsigned long flags; /* No harm done if a racing interrupt already did this */ - local_irq_save(flags); + swap_irq_save(flags); pagevec_move_tail(pvec); - local_irq_restore(flags); + swap_irq_restore(flags); } + swap_per_cpu_unlock(lru_rotate_pvecs, cpu); } void lru_add_drain(void) { - drain_cpu_pagevecs(get_cpu()); - put_cpu(); + drain_cpu_pagevecs(swap_get_cpu()); + swap_put_cpu(); } static void lru_add_drain_per_cpu(struct work_struct *dummy) @@ -369,7 +451,7 @@ void release_pages(struct page **pages, int nr, int cold) } 
__pagevec_free(&pages_to_free); pagevec_reinit(&pages_to_free); - } + } } if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); diff --git a/mm/vmscan.c b/mm/vmscan.c index 94e86dd..9360c3e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -23,6 +23,7 @@ #include <linux/file.h> #include <linux/writeback.h> #include <linux/blkdev.h> +#include <linux/interrupt.h> #include <linux/buffer_head.h> /* for try_to_release_page(), buffer_heads_over_limit */ #include <linux/mm_inline.h> @@ -1123,7 +1124,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, } nr_reclaimed += nr_freed; - local_irq_disable(); + local_irq_disable_nort(); if (current_is_kswapd()) { __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); __count_vm_events(KSWAPD_STEAL, nr_freed); @@ -1164,9 +1165,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, } } } while (nr_scanned < max_scan); + /* + * Non-PREEMPT_RT relies on IRQs-off protecting the page_states + * per-CPU data. PREEMPT_RT has that data protected even in + * __mod_page_state(), so no need to keep IRQs disabled. 
+ */ spin_unlock(&zone->lru_lock); done: - local_irq_enable(); + local_irq_enable_nort(); pagevec_release(&pvec); return nr_reclaimed; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 138bed5..9f7c001 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -149,17 +149,16 @@ static void refresh_zone_stat_thresholds(void) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu()); s8 *p = pcp->vm_stat_diff + item; - long x; - - x = delta + *p; + long x = delta + *p; if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { zone_page_state_add(x, zone, item); x = 0; } *p = x; + put_cpu(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -202,7 +201,7 @@ EXPORT_SYMBOL(mod_zone_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu()); s8 *p = pcp->vm_stat_diff + item; (*p)++; @@ -213,17 +212,28 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(*p + overstep, zone, item); *p = -overstep; } + put_cpu(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) { +#ifdef CONFIG_PREEMPT_RT + unsigned long flags; + struct zone *zone; + + zone = page_zone(page); + local_irq_save(flags); + __inc_zone_state(zone, item); + local_irq_restore(flags); +#else __inc_zone_state(page_zone(page), item); +#endif } EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu()); s8 *p = pcp->vm_stat_diff + item; (*p)--; @@ -234,6 +244,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(*p - overstep, zone, item); *p = overstep; } + 
put_cpu(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 406ad07..e1da8f6 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -911,7 +911,7 @@ int hci_register_dev(struct hci_dev *hdev) hdev->reassembly[i] = NULL; init_waitqueue_head(&hdev->req_wait_q); - init_MUTEX(&hdev->req_lock); + mutex_init(&hdev->req_lock); inquiry_cache_init(hdev); diff --git a/net/core/dev.c b/net/core/dev.c index 6a94475..24cfd44 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1887,42 +1887,52 @@ gso: Check this and shot the lock. It is not prone from deadlocks. Either shot noqueue qdisc, it is even simpler 8) */ - if (dev->flags & IFF_UP) { - int cpu = smp_processor_id(); /* ok because BHs are off */ + if (!(dev->flags & IFF_UP)) + goto err; - if (txq->xmit_lock_owner != cpu) { + /* Recursion is detected! It is possible, unfortunately: */ + if (netif_tx_lock_recursion(txq)) + goto err_recursion; - HARD_TX_LOCK(dev, txq, cpu); + HARD_TX_LOCK(dev, txq); - if (!netif_tx_queue_stopped(txq)) { - rc = 0; - if (!dev_hard_start_xmit(skb, dev, txq)) { - HARD_TX_UNLOCK(dev, txq); - goto out; - } - } - HARD_TX_UNLOCK(dev, txq); - if (net_ratelimit()) - printk(KERN_CRIT "Virtual device %s asks to " - "queue packet!\n", dev->name); - } else { - /* Recursion is detected! 
It is possible, - * unfortunately */ - if (net_ratelimit()) - printk(KERN_CRIT "Dead loop on virtual device " - "%s, fix it urgently!\n", dev->name); - } + if (netif_tx_queue_stopped(txq)) + goto err_tx_unlock; + + if (dev_hard_start_xmit(skb, dev, txq)) + goto err_tx_unlock; + + rc = 0; + HARD_TX_UNLOCK(dev, txq); + +out: + rcu_read_unlock_bh(); + return rc; + +err_recursion: + if (net_ratelimit()) { + printk(KERN_CRIT + "Dead loop on virtual device %s, fix it urgently!\n", + dev->name); + } + goto err; + +err_tx_unlock: + HARD_TX_UNLOCK(dev, txq); + + if (net_ratelimit()) { + printk(KERN_CRIT "Virtual device %s asks to queue packet!\n", + dev->name); } + /* Fall through: */ +err: rc = -ENETDOWN; rcu_read_unlock_bh(); out_kfree_skb: kfree_skb(skb); return rc; -out: - rcu_read_unlock_bh(); - return rc; } @@ -1995,8 +2005,8 @@ int netif_rx_ni(struct sk_buff *skb) { int err; - preempt_disable(); err = netif_rx(skb); + preempt_disable(); if (local_softirq_pending()) do_softirq(); preempt_enable(); @@ -2008,7 +2018,8 @@ EXPORT_SYMBOL(netif_rx_ni); static void net_tx_action(struct softirq_action *h) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = &per_cpu(softnet_data, + raw_smp_processor_id()); if (sd->completion_queue) { struct sk_buff *clist; @@ -2024,6 +2035,11 @@ static void net_tx_action(struct softirq_action *h) WARN_ON(atomic_read(&skb->users)); __kfree_skb(skb); + /* + * Safe to reschedule - the list is private + * at this point. + */ + cond_resched_softirq_context(); } } @@ -2042,6 +2058,22 @@ static void net_tx_action(struct softirq_action *h) head = head->next_sched; root_lock = qdisc_lock(q); + /* + * We are executing in softirq context here, and + * if softirqs are preemptible, we must avoid + * infinite reactivation of the softirq by + * either the tx handler, or by netif_schedule(). + * (it would result in an infinitely looping + * softirq context) + * So we take the spinlock unconditionally. 
+ */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + spin_lock(root_lock); + smp_mb__before_clear_bit(); + clear_bit(__QDISC_STATE_SCHED, &q->state); + qdisc_run(q); + spin_unlock(root_lock); +#else if (spin_trylock(root_lock)) { smp_mb__before_clear_bit(); clear_bit(__QDISC_STATE_SCHED, @@ -2058,6 +2090,7 @@ static void net_tx_action(struct softirq_action *h) &q->state); } } +#endif } } } @@ -2270,7 +2303,7 @@ int netif_receive_skb(struct sk_buff *skb) skb->dev = orig_dev->master; } - __get_cpu_var(netdev_rx_stat).total++; + per_cpu(netdev_rx_stat, raw_smp_processor_id()).total++; skb_reset_network_header(skb); skb_reset_transport_header(skb); @@ -2660,9 +2693,10 @@ EXPORT_SYMBOL(napi_gro_frags); static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *queue; unsigned long start_time = jiffies; + queue = &per_cpu(softnet_data, raw_smp_processor_id()); napi->weight = weight_p; do { struct sk_buff *skb; @@ -2694,7 +2728,7 @@ void __napi_schedule(struct napi_struct *n) local_irq_save(flags); list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); @@ -2848,7 +2882,7 @@ out: softnet_break: __get_cpu_var(netdev_rx_stat).time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } @@ -4644,7 +4678,7 @@ static void __netdev_init_queue_locks_one(struct net_device *dev, { spin_lock_init(&dev_queue->_xmit_lock); netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); - dev_queue->xmit_lock_owner = -1; + dev_queue->xmit_lock_owner = (void *)-1; } static void netdev_init_queue_locks(struct net_device *dev) diff --git a/net/core/flow.c b/net/core/flow.c index 9601587..f032d1c 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -39,9 +39,10 @@ atomic_t flow_cache_genid = 
ATOMIC_INIT(0); static u32 flow_hash_shift; #define flow_hash_size (1 << flow_hash_shift) -static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; -#define flow_table(cpu) (per_cpu(flow_tables, cpu)) +static DEFINE_PER_CPU_LOCKED(struct flow_cache_entry **, flow_tables); + +#define flow_table(cpu) (per_cpu_var_locked(flow_tables, cpu)) static struct kmem_cache *flow_cachep __read_mostly; @@ -168,24 +169,24 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver) { - struct flow_cache_entry *fle, **head; + struct flow_cache_entry **table, *fle, **head; unsigned int hash; int cpu; local_bh_disable(); - cpu = smp_processor_id(); + table = get_cpu_var_locked(flow_tables, &cpu); fle = NULL; /* Packet really early in init? Making flow_cache_init a * pre-smp initcall would solve this. --RR */ - if (!flow_table(cpu)) + if (!table) goto nocache; if (flow_hash_rnd_recalc(cpu)) flow_new_hash_rnd(cpu); hash = flow_hash_code(key, cpu); - head = &flow_table(cpu)[hash]; + head = &table[hash]; for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && @@ -195,6 +196,7 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, if (ret) atomic_inc(fle->object_ref); + put_cpu_var_locked(flow_tables, cpu); local_bh_enable(); return ret; @@ -220,6 +222,8 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, } nocache: + put_cpu_var_locked(flow_tables, cpu); + { int err; void *obj; @@ -249,14 +253,15 @@ nocache: static void flow_cache_flush_tasklet(unsigned long data) { struct flow_flush_info *info = (void *)data; + struct flow_cache_entry **table; int i; int cpu; - cpu = smp_processor_id(); + table = get_cpu_var_locked(flow_tables, &cpu); for (i = 0; i < flow_hash_size; i++) { struct flow_cache_entry *fle; - fle = flow_table(cpu)[i]; + fle = table[i]; for (; fle; fle 
= fle->next) { unsigned genid = atomic_read(&flow_cache_genid); @@ -267,6 +272,7 @@ static void flow_cache_flush_tasklet(unsigned long data) atomic_dec(fle->object_ref); } } + put_cpu_var_locked(flow_tables, cpu); if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 1b76eb1..950000c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -69,20 +69,20 @@ static void queue_process(struct work_struct *work) txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - local_irq_save(flags); - __netif_tx_lock(txq, smp_processor_id()); + local_irq_save_nort(flags); + __netif_tx_lock(txq); if (netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq) || ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { skb_queue_head(&npinfo->txq, skb); __netif_tx_unlock(txq); - local_irq_restore(flags); + local_irq_restore_nort(flags); schedule_delayed_work(&npinfo->tx_work, HZ/10); return; } __netif_tx_unlock(txq); - local_irq_restore(flags); + local_irq_restore_nort(flags); } } @@ -153,7 +153,7 @@ static void poll_napi(struct net_device *dev) int budget = 16; list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner != smp_processor_id() && + if (napi->poll_owner != raw_smp_processor_id() && spin_trylock(&napi->poll_lock)) { budget = poll_one_napi(dev->npinfo, napi, budget); spin_unlock(&napi->poll_lock); @@ -214,30 +214,35 @@ static void refill_skbs(void) static void zap_completion_queue(void) { - unsigned long flags; struct softnet_data *sd = &get_cpu_var(softnet_data); + struct sk_buff *clist = NULL; + unsigned long flags; if (sd->completion_queue) { - struct sk_buff *clist; local_irq_save(flags); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_restore(flags); - - while (clist != NULL) { - struct sk_buff *skb = clist; - clist = clist->next; - if (skb->destructor) { - atomic_inc(&skb->users); - dev_kfree_skb_any(skb); /* put this one back */ - } else { - 
__kfree_skb(skb); - } - } } + + /* + * Took the list private, can drop our softnet + * reference: + */ put_cpu_var(softnet_data); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + if (skb->destructor) { + atomic_inc(&skb->users); + dev_kfree_skb_any(skb); /* put this one back */ + } else { + __kfree_skb(skb); + } + } } static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) @@ -245,13 +250,26 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) int count = 0; struct sk_buff *skb; +#ifdef CONFIG_PREEMPT_RT + /* + * On -rt skb_pool.lock is schedulable, so if we are + * in an atomic context we just try to dequeue from the + * pool and fail if we cannot get one. + */ + if (in_atomic() || irqs_disabled()) + goto pick_atomic; +#endif zap_completion_queue(); refill_skbs(); repeat: skb = alloc_skb(len, GFP_ATOMIC); - if (!skb) + if (!skb) { +#ifdef CONFIG_PREEMPT_RT +pick_atomic: +#endif skb = skb_dequeue(&skb_pool); + } if (!skb) { if (++count < 10) { @@ -271,7 +289,7 @@ static int netpoll_owner_active(struct net_device *dev) struct napi_struct *napi; list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner == smp_processor_id()) + if (napi->poll_owner == raw_smp_processor_id()) return 1; } return 0; @@ -297,7 +315,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - local_irq_save(flags); + local_irq_save_nort(flags); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { @@ -319,12 +337,10 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) udelay(USEC_PER_POLL); } - WARN_ONCE(!irqs_disabled(), "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n", dev->name, ops->ndo_start_xmit); - - local_irq_restore(flags); + local_irq_restore_nort(flags); } if (status != NETDEV_TX_OK) { diff --git a/net/core/skbuff.c 
b/net/core/skbuff.c index 9e0597d..27d2eb2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -395,7 +395,7 @@ static void skb_release_head_state(struct sk_buff *skb) secpath_put(skb->sp); #endif if (skb->destructor) { - WARN_ON(in_irq()); +// WARN_ON(in_irq()); skb->destructor(skb); } #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) diff --git a/net/core/sock.c b/net/core/sock.c index 7633422..f34cbbb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2080,8 +2080,9 @@ static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); #ifdef CONFIG_NET_NS void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { - int cpu = smp_processor_id(); + int cpu = get_cpu(); per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val; + put_cpu(); } EXPORT_SYMBOL_GPL(sock_prot_inuse_add); @@ -2127,7 +2128,9 @@ static DEFINE_PER_CPU(struct prot_inuse, prot_inuse); void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { - __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val; + int cpu = get_cpu(); + per_cpu(prot_inuse, cpu).val[prot->inuse_idx] += val; + put_cpu(); } EXPORT_SYMBOL_GPL(sock_prot_inuse_add); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 97c410e..c883e29 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -201,7 +201,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; */ static struct sock *icmp_sk(struct net *net) { - return net->ipv4.icmp_sk[smp_processor_id()]; + /* + * Should be safe on PREEMPT_SOFTIRQS/HARDIRQS to use raw-smp-processor-id: + */ + return net->ipv4.icmp_sk[raw_smp_processor_id()]; } static inline struct sock *icmp_xmit_lock(struct net *net) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 7505dff..40b5120 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -252,6 +252,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, void *table_base; const struct xt_table_info *private; struct 
xt_target_param tgpar; + int cpu; if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) return NF_DROP; @@ -259,9 +260,9 @@ unsigned int arpt_do_table(struct sk_buff *skb, indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; - xt_info_rdlock_bh(); + cpu = xt_info_rdlock_bh(); private = table->private; - table_base = private->entries[smp_processor_id()]; + table_base = private->entries[cpu]; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -332,7 +333,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!hotdrop); - xt_info_rdunlock_bh(); + xt_info_rdunlock_bh(cpu); if (hotdrop) return NF_DROP; @@ -709,7 +710,7 @@ static void get_counters(const struct xt_table_info *t, { unsigned int cpu; unsigned int i; - unsigned int curcpu; + unsigned int curcpu = NR_CPUS; /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters @@ -719,6 +720,7 @@ static void get_counters(const struct xt_table_info *t, * if new softirq were to run and call ipt_do_table */ local_bh_disable(); +#ifndef CONFIG_PREEMPT_RT curcpu = smp_processor_id(); i = 0; @@ -727,7 +729,7 @@ static void get_counters(const struct xt_table_info *t, set_entry_to_counter, counters, &i); - +#endif for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; @@ -1183,7 +1185,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, i = 0; /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); + curcpu = raw_smp_processor_id(); loc_cpu_entry = private->entries[curcpu]; xt_info_wrlock(curcpu); ARPT_ENTRY_ITERATE(loc_cpu_entry, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index fdefae6..4bce8ec 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -325,6 +325,7 @@ ipt_do_table(struct sk_buff *skb, struct xt_table_info *private; struct xt_match_param mtpar; 
struct xt_target_param tgpar; + int cpu; /* Initialization */ ip = ip_hdr(skb); @@ -346,9 +347,9 @@ ipt_do_table(struct sk_buff *skb, mtpar.hooknum = tgpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - xt_info_rdlock_bh(); + cpu = xt_info_rdlock_bh(); private = table->private; - table_base = private->entries[smp_processor_id()]; + table_base = private->entries[cpu]; e = get_entry(table_base, private->hook_entry[hook]); @@ -435,7 +436,7 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!hotdrop); - xt_info_rdunlock_bh(); + xt_info_rdunlock_bh(cpu); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -892,7 +893,7 @@ get_counters(const struct xt_table_info *t, { unsigned int cpu; unsigned int i; - unsigned int curcpu; + unsigned int curcpu = NR_CPUS; /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters @@ -902,6 +903,7 @@ get_counters(const struct xt_table_info *t, * if new softirq were to run and call ipt_do_table */ local_bh_disable(); +#ifndef CONFIG_PREEMPT_RT curcpu = smp_processor_id(); i = 0; @@ -910,7 +912,7 @@ get_counters(const struct xt_table_info *t, set_entry_to_counter, counters, &i); - +#endif for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; @@ -1391,7 +1393,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat i = 0; /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); + curcpu = raw_smp_processor_id(); loc_cpu_entry = private->entries[curcpu]; xt_info_wrlock(curcpu); IPT_ENTRY_ITERATE(loc_cpu_entry, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 278f46f..2cfa9cb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -204,13 +204,13 @@ struct rt_hash_bucket { }; #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) + defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_PREEMPT_RT) /* * Instead of using one spinlock for each rt_hash_bucket, we use a table of 
spinlocks * The size of this table is a power of two and depends on the number of CPUS. * (on lockdep we have a quite big spinlock_t, so keep the size down there) */ -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) # define RT_HASH_LOCK_SZ 256 #else # if NR_CPUS >= 32 @@ -242,7 +242,7 @@ static __init void rt_hash_lock_init(void) spin_lock_init(&rt_hash_locks[i]); } #else -# define rt_hash_lock_addr(slot) NULL +# define rt_hash_lock_addr(slot) ((spinlock_t *)NULL) static inline void rt_hash_lock_init(void) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9114524..ccf3323 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1365,11 +1365,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && dma_find_channel(DMA_MEMCPY)) { - preempt_enable_no_resched(); + preempt_enable(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); } else { - preempt_enable_no_resched(); + preempt_enable(); } } #endif diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index ced1f2c..ca367b7 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -355,6 +355,7 @@ ip6t_do_table(struct sk_buff *skb, struct xt_table_info *private; struct xt_match_param mtpar; struct xt_target_param tgpar; + int cpu; /* Initialization */ indev = in ? 
in->name : nulldevname; @@ -373,9 +374,9 @@ ip6t_do_table(struct sk_buff *skb, IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - xt_info_rdlock_bh(); + cpu = xt_info_rdlock_bh(); private = table->private; - table_base = private->entries[smp_processor_id()]; + table_base = private->entries[cpu]; e = get_entry(table_base, private->hook_entry[hook]); @@ -464,7 +465,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG tb_comefrom = NETFILTER_LINK_POISON; #endif - xt_info_rdunlock_bh(); + xt_info_rdunlock_bh(cpu); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -921,7 +922,7 @@ get_counters(const struct xt_table_info *t, { unsigned int cpu; unsigned int i; - unsigned int curcpu; + unsigned int curcpu = NR_CPUS; /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters @@ -931,6 +932,8 @@ get_counters(const struct xt_table_info *t, * if new softirq were to run and call ipt_do_table */ local_bh_disable(); + +#ifndef CONFIG_PREEMPT_RT curcpu = smp_processor_id(); i = 0; @@ -939,7 +942,7 @@ get_counters(const struct xt_table_info *t, set_entry_to_counter, counters, &i); - +#endif for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; @@ -960,12 +963,13 @@ static struct xt_counters *alloc_counters(struct xt_table *table) unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; + int node = cpu_to_node(raw_smp_processor_id()); /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). 
*/ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc_node(countersize, numa_node_id()); + counters = vmalloc_node(countersize, node); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -1423,7 +1427,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, i = 0; /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); + curcpu = raw_smp_processor_id(); xt_info_wrlock(curcpu); loc_cpu_entry = private->entries[curcpu]; IP6T_ENTRY_ITERATE(loc_cpu_entry, diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 5bb3473..9fce0a4 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -233,7 +233,7 @@ EXPORT_SYMBOL(nf_ct_attach); void (*nf_ct_destroy)(struct nf_conntrack *); EXPORT_SYMBOL(nf_ct_destroy); -void nf_conntrack_destroy(struct nf_conntrack *nfct) +static void __nf_conntrack_destroy(struct nf_conntrack *nfct) { void (*destroy)(struct nf_conntrack *); @@ -243,6 +243,28 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) destroy(nfct); rcu_read_unlock(); } + +#ifdef CONFIG_PREEMPT_RT +/* + * nf_contrack_destroy is called with preemption disabled + * and will call functions that might schedule in PREEMPT_RT. + * For PREEMPT_RT we use a rcu callback instead to handle + * the destroying. 
+ */ +static void nf_conntrack_destroy_rcu(struct rcu_head *rhp) +{ + __nf_conntrack_destroy(container_of(rhp, struct nf_conntrack, rcu)); +} +void nf_conntrack_destroy(struct nf_conntrack *nfct) +{ + call_rcu(&nfct->rcu, nf_conntrack_destroy_rcu); +} +#else /* !PREEMPT_RT */ +void nf_conntrack_destroy(struct nf_conntrack *nfct) +{ + __nf_conntrack_destroy(nfct); +} +#endif /* PREEMPT_RT */ EXPORT_SYMBOL(nf_conntrack_destroy); #endif /* CONFIG_NF_CONNTRACK */ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2936fa3..f4e94fb 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1061,7 +1061,7 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, return -ENOBUFS; if (info.delivered) { - if (info.congested && (allocation & __GFP_WAIT)) + if (info.congested && (allocation & __GFP_WAIT) && !rt_task(current)) yield(); return 0; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 27d0381..98d22ca 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -12,6 +12,7 @@ */ #include <linux/bitops.h> +#include <linux/kallsyms.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -24,6 +25,7 @@ #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/list.h> +#include <linux/delay.h> #include <net/pkt_sched.h> /* Main transmission queue. */ @@ -78,7 +80,7 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, { int ret; - if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) { + if (unlikely(netif_tx_lock_recursion(dev_queue))) { /* * Same CPU holding the lock. It may be a transient * configuration error, when hard_start_xmit() recurses. We @@ -95,7 +97,9 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, * Another cpu is holding lock, requeue & delay xmits for * some time. 
*/ + preempt_disable(); /* FIXME: we need an _rt version of this */ __get_cpu_var(netdev_rx_stat).cpu_collision++; + preempt_enable(); ret = dev_requeue_skb(skb, q); } @@ -141,7 +145,7 @@ static inline int qdisc_restart(struct Qdisc *q) dev = qdisc_dev(q); txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - HARD_TX_LOCK(dev, txq, smp_processor_id()); + HARD_TX_LOCK(dev, txq); if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq)) ret = dev_hard_start_xmit(skb, dev, txq); @@ -713,9 +717,12 @@ void dev_deactivate(struct net_device *dev) /* Wait for outstanding qdisc-less dev_queue_xmit calls. */ synchronize_rcu(); - /* Wait for outstanding qdisc_run calls. */ + /* + * Wait for outstanding qdisc_run calls. + * TODO: shouldnt this be wakeup-based, instead of polling it? + */ while (some_qdisc_is_busy(dev)) - yield(); + msleep(1); } static void dev_init_scheduler_queue(struct net_device *dev, diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index c29be8f..55a2571 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -98,8 +98,9 @@ as-option = $(call try-run,\ # as-instr # Usage: cflags-y += $(call as-instr,instr,option1,option2) -as-instr = $(call try-run,\ - echo -e "$(1)" | $(CC) $(KBUILD_AFLAGS) -c -xassembler -o "$$TMP" -,$(2),$(3)) +as-instr = $(call try-run, \ + echo -e "$(1)" > "$$TMP"; \ + $(CC) $(KBUILD_AFLAGS) -c -xassembler -o /dev/null "$$TMP",$(2),$(3)) # cc-option # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2d5ece7..7c7c26e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2512,14 +2512,11 @@ sub process { WARN("__func__ should be used instead of gcc specific __FUNCTION__\n" . $herecurr); } -# check for semaphores used as mutexes - if ($line =~ /^.\s*(DECLARE_MUTEX|init_MUTEX)\s*\(/) { - WARN("mutexes are preferred for single holder semaphores\n" . 
$herecurr); - } -# check for semaphores used as mutexes - if ($line =~ /^.\s*init_MUTEX_LOCKED\s*\(/) { +# check for semaphores initialized locked + if ($line =~ /^.\s*semaphore_init_locked\s*\(/) { WARN("consider using a completion\n" . $herecurr); } + # recommend strict_strto* over simple_strto* if ($line =~ /\bsimple_(strto.*?)\s*\(/) { WARN("consider using strict_$1 in preference to simple_$1\n" . $herecurr); diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h index 6a12dd9..b987a5f 100755 --- a/scripts/mkcompile_h +++ b/scripts/mkcompile_h @@ -2,7 +2,8 @@ TARGET=$1 ARCH=$2 SMP=$3 PREEMPT=$4 -CC=$5 +PREEMPT_RT=$5 +CC=$6 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; } @@ -45,6 +46,7 @@ UTS_VERSION="#$VERSION" CONFIG_FLAGS="" if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi +if [ -n "$PREEMPT_RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" # Truncate to maximum length diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 911ba7f..090d300 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -57,7 +57,6 @@ # call mcount (offset: 0x5) # [...] # ret -# .globl my_func # other_func: # [...] 
# call mcount (offset: 0x1b) diff --git a/sound/drivers/pcsp/pcsp.h b/sound/drivers/pcsp/pcsp.h index 174dd2f..be7228c 100644 --- a/sound/drivers/pcsp/pcsp.h +++ b/sound/drivers/pcsp/pcsp.h @@ -16,7 +16,7 @@ #include <asm/i8253.h> #else #include <asm/8253pit.h> -static DEFINE_SPINLOCK(i8253_lock); +static DEFINE_ATOMIC_SPINLOCK(i8253_lock); #endif #define PCSP_SOUND_VERSION 0x400 /* read 4.00 */ diff --git a/sound/drivers/pcsp/pcsp_input.c b/sound/drivers/pcsp/pcsp_input.c index 0444cde..9dfa285 100644 --- a/sound/drivers/pcsp/pcsp_input.c +++ b/sound/drivers/pcsp/pcsp_input.c @@ -21,7 +21,7 @@ static void pcspkr_do_sound(unsigned int count) { unsigned long flags; - spin_lock_irqsave(&i8253_lock, flags); + atomic_spin_lock_irqsave(&i8253_lock, flags); if (count) { /* set command for counter 2, 2 byte write */ @@ -36,7 +36,7 @@ static void pcspkr_do_sound(unsigned int count) outb(inb_p(0x61) & 0xFC, 0x61); } - spin_unlock_irqrestore(&i8253_lock, flags); + atomic_spin_unlock_irqrestore(&i8253_lock, flags); } void pcspkr_stop_sound(void) diff --git a/sound/drivers/pcsp/pcsp_lib.c b/sound/drivers/pcsp/pcsp_lib.c index 84cc265..88f7388 100644 --- a/sound/drivers/pcsp/pcsp_lib.c +++ b/sound/drivers/pcsp/pcsp_lib.c @@ -70,7 +70,7 @@ static unsigned long pcsp_timer_update(struct hrtimer *handle) timer_cnt = val * CUR_DIV() / 256; if (timer_cnt && chip->enable) { - spin_lock_irqsave(&i8253_lock, flags); + atomic_spin_lock_irqsave(&i8253_lock, flags); if (!nforce_wa) { outb_p(chip->val61, 0x61); outb_p(timer_cnt, 0x42); @@ -79,7 +79,7 @@ static unsigned long pcsp_timer_update(struct hrtimer *handle) outb(chip->val61 ^ 2, 0x61); chip->thalf = 1; } - spin_unlock_irqrestore(&i8253_lock, flags); + atomic_spin_unlock_irqrestore(&i8253_lock, flags); } chip->ns_rem = PCSP_PERIOD_NS(); @@ -152,10 +152,10 @@ static int pcsp_start_playing(struct snd_pcsp *chip) return -EIO; } - spin_lock(&i8253_lock); + atomic_spin_lock(&i8253_lock); chip->val61 = inb(0x61) | 0x03; outb_p(0x92, 0x43); 
/* binary, mode 1, LSB only, ch 2 */ - spin_unlock(&i8253_lock); + atomic_spin_unlock(&i8253_lock); atomic_set(&chip->timer_active, 1); chip->thalf = 0; @@ -176,11 +176,11 @@ static void pcsp_stop_playing(struct snd_pcsp *chip) return; atomic_set(&chip->timer_active, 0); - spin_lock(&i8253_lock); + atomic_spin_lock(&i8253_lock); /* restore the timer */ outb_p(0xb6, 0x43); /* binary, mode 3, LSB/MSB, ch 2 */ outb(chip->val61 & 0xFC, 0x61); - spin_unlock(&i8253_lock); + atomic_spin_unlock(&i8253_lock); } /* diff --git a/sound/soc/s3c24xx/s3c2443-ac97.c b/sound/soc/s3c24xx/s3c2443-ac97.c index 3f03d5d..bf16f20 100644 --- a/sound/soc/s3c24xx/s3c2443-ac97.c +++ b/sound/soc/s3c24xx/s3c2443-ac97.c @@ -47,7 +47,7 @@ static struct s3c24xx_ac97_info s3c24xx_ac97; static DECLARE_COMPLETION(ac97_completion); static u32 codec_ready; -static DECLARE_MUTEX(ac97_mutex); +static DEFINE_MUTEX(ac97_mutex); static unsigned short s3c2443_ac97_read(struct snd_ac97 *ac97, unsigned short reg) @@ -56,7 +56,7 @@ static unsigned short s3c2443_ac97_read(struct snd_ac97 *ac97, u32 ac_codec_cmd; u32 stat, addr, data; - down(&ac97_mutex); + mutex_lock(&ac97_mutex); codec_ready = S3C_AC97_GLBSTAT_CODECREADY; ac_codec_cmd = readl(s3c24xx_ac97.regs + S3C_AC97_CODEC_CMD); @@ -79,7 +79,7 @@ static unsigned short s3c2443_ac97_read(struct snd_ac97 *ac97, printk(KERN_ERR "s3c24xx-ac97: req addr = %02x," " rep addr = %02x\n", reg, addr); - up(&ac97_mutex); + mutex_unlock(&ac97_mutex); return (unsigned short)data; } @@ -90,7 +90,7 @@ static void s3c2443_ac97_write(struct snd_ac97 *ac97, unsigned short reg, u32 ac_glbctrl; u32 ac_codec_cmd; - down(&ac97_mutex); + mutex_lock(&ac97_mutex); codec_ready = S3C_AC97_GLBSTAT_CODECREADY; ac_codec_cmd = readl(s3c24xx_ac97.regs + S3C_AC97_CODEC_CMD); @@ -109,7 +109,7 @@ static void s3c2443_ac97_write(struct snd_ac97 *ac97, unsigned short reg, ac_codec_cmd |= S3C_AC97_CODEC_CMD_READ; writel(ac_codec_cmd, s3c24xx_ac97.regs + S3C_AC97_CODEC_CMD); - up(&ac97_mutex); + 
mutex_unlock(&ac97_mutex); } diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 0441784..4bab278 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -616,7 +616,7 @@ static void print_tracepoint_events(void) evt_path, st) { snprintf(evt_path, MAXPATHLEN, "%s:%s", sys_dirent.d_name, evt_dirent.d_name); - fprintf(stderr, " %-40s [%s]\n", evt_path, + fprintf(stderr, " %-42s [%s]\n", evt_path, event_type_descriptors[PERF_TYPE_TRACEPOINT+1]); } closedir(evt_dir); @@ -650,7 +650,7 @@ void print_events(void) sprintf(name, "%s OR %s", syms->symbol, syms->alias); else strcpy(name, syms->symbol); - fprintf(stderr, " %-40s [%s]\n", name, + fprintf(stderr, " %-42s [%s]\n", name, event_type_descriptors[type]); prev_type = type; @@ -664,7 +664,7 @@ void print_events(void) continue; for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { - fprintf(stderr, " %-40s [%s]\n", + fprintf(stderr, " %-42s [%s]\n", event_cache_name(type, op, i), event_type_descriptors[4]); } @@ -672,7 +672,7 @@ void print_events(void) } fprintf(stderr, "\n"); - fprintf(stderr, " %-40s [raw hardware event descriptor]\n", + fprintf(stderr, " %-42s [raw hardware event descriptor]\n", "rNNN"); fprintf(stderr, "\n"); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2884baf..b24e96d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -745,8 +745,8 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) cpumask_clear(cpus); - me = get_cpu(); spin_lock(&kvm->requests_lock); + me = get_cpu(); for (i = 0; i < KVM_MAX_VCPUS; ++i) { vcpu = kvm->vcpus[i]; if (!vcpu) @@ -763,8 +763,8 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) smp_call_function_many(cpus, ack_flush, NULL, 1); else called = false; - spin_unlock(&kvm->requests_lock); put_cpu(); + spin_unlock(&kvm->requests_lock); free_cpumask_var(cpus); return called; }